Diffstat (limited to 'contrib/llvm/lib/Target/X86')
-rw-r--r-- contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp | 1077
-rw-r--r-- contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.h | 68
-rw-r--r-- contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp | 2951
-rw-r--r-- contrib/llvm/lib/Target/X86/AsmParser/X86AsmParserCommon.h | 39
-rw-r--r-- contrib/llvm/lib/Target/X86/AsmParser/X86Operand.h | 543
-rw-r--r-- contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp | 1009
-rw-r--r-- contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.h | 112
-rw-r--r-- contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp | 1909
-rw-r--r-- contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h | 675
-rw-r--r-- contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h | 503
-rw-r--r-- contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp | 289
-rw-r--r-- contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h | 142
-rw-r--r-- contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp | 820
-rw-r--r-- contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.h | 25
-rw-r--r-- contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp | 257
-rw-r--r-- contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h | 162
-rw-r--r-- contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp | 855
-rw-r--r-- contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h | 779
-rw-r--r-- contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp | 262
-rw-r--r-- contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp | 141
-rw-r--r-- contrib/llvm/lib/Target/X86/MCTargetDesc/X86FixupKinds.h | 34
-rw-r--r-- contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp | 172
-rw-r--r-- contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h | 61
-rw-r--r-- contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp | 1506
-rw-r--r-- contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp | 451
-rw-r--r-- contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h | 129
-rw-r--r-- contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp | 119
-rw-r--r-- contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp | 605
-rw-r--r-- contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp | 97
-rw-r--r-- contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp | 60
-rw-r--r-- contrib/llvm/lib/Target/X86/TargetInfo/X86TargetInfo.cpp | 22
-rw-r--r-- contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp | 464
-rw-r--r-- contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h | 122
-rw-r--r-- contrib/llvm/lib/Target/X86/X86.h | 76
-rw-r--r-- contrib/llvm/lib/Target/X86/X86.td | 787
-rw-r--r-- contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp | 706
-rw-r--r-- contrib/llvm/lib/Target/X86/X86AsmPrinter.h | 131
-rw-r--r-- contrib/llvm/lib/Target/X86/X86CallFrameOptimization.cpp | 558
-rw-r--r-- contrib/llvm/lib/Target/X86/X86CallingConv.h | 107
-rw-r--r-- contrib/llvm/lib/Target/X86/X86CallingConv.td | 881
-rw-r--r-- contrib/llvm/lib/Target/X86/X86ExpandPseudo.cpp | 198
-rw-r--r-- contrib/llvm/lib/Target/X86/X86FastISel.cpp | 3607
-rw-r--r-- contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp | 410
-rw-r--r-- contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp | 1651
-rw-r--r-- contrib/llvm/lib/Target/X86/X86FrameLowering.cpp | 2698
-rw-r--r-- contrib/llvm/lib/Target/X86/X86FrameLowering.h | 203
-rw-r--r-- contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp | 3012
-rw-r--r-- contrib/llvm/lib/Target/X86/X86ISelLowering.cpp | 28765
-rw-r--r-- contrib/llvm/lib/Target/X86/X86ISelLowering.h | 1157
-rw-r--r-- contrib/llvm/lib/Target/X86/X86Instr3DNow.td | 103
-rw-r--r-- contrib/llvm/lib/Target/X86/X86InstrAVX512.td | 7519
-rw-r--r-- contrib/llvm/lib/Target/X86/X86InstrArithmetic.td | 1375
-rw-r--r-- contrib/llvm/lib/Target/X86/X86InstrBuilder.h | 183
-rw-r--r-- contrib/llvm/lib/Target/X86/X86InstrCMovSetCC.td | 112
-rw-r--r-- contrib/llvm/lib/Target/X86/X86InstrCompiler.td | 1864
-rw-r--r-- contrib/llvm/lib/Target/X86/X86InstrControl.td | 329
-rw-r--r-- contrib/llvm/lib/Target/X86/X86InstrExtension.td | 182
-rw-r--r-- contrib/llvm/lib/Target/X86/X86InstrFMA.td | 441
-rw-r--r-- contrib/llvm/lib/Target/X86/X86InstrFPStack.td | 729
-rw-r--r-- contrib/llvm/lib/Target/X86/X86InstrFormats.td | 948
-rw-r--r-- contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td | 1037
-rw-r--r-- contrib/llvm/lib/Target/X86/X86InstrInfo.cpp | 7330
-rw-r--r-- contrib/llvm/lib/Target/X86/X86InstrInfo.h | 571
-rw-r--r-- contrib/llvm/lib/Target/X86/X86InstrInfo.td | 3085
-rw-r--r-- contrib/llvm/lib/Target/X86/X86InstrMMX.td | 674
-rw-r--r-- contrib/llvm/lib/Target/X86/X86InstrMPX.td | 70
-rw-r--r-- contrib/llvm/lib/Target/X86/X86InstrSGX.td | 24
-rw-r--r-- contrib/llvm/lib/Target/X86/X86InstrSSE.td | 8944
-rw-r--r-- contrib/llvm/lib/Target/X86/X86InstrSVM.td | 62
-rw-r--r-- contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td | 969
-rw-r--r-- contrib/llvm/lib/Target/X86/X86InstrSystem.td | 615
-rw-r--r-- contrib/llvm/lib/Target/X86/X86InstrTSX.td | 50
-rw-r--r-- contrib/llvm/lib/Target/X86/X86InstrVMX.td | 66
-rw-r--r-- contrib/llvm/lib/Target/X86/X86InstrXOP.td | 344
-rw-r--r-- contrib/llvm/lib/Target/X86/X86IntrinsicsInfo.h | 2042
-rw-r--r-- contrib/llvm/lib/Target/X86/X86MCInstLower.cpp | 1459
-rw-r--r-- contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp | 33
-rw-r--r-- contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h | 167
-rw-r--r-- contrib/llvm/lib/Target/X86/X86OptimizeLEAs.cpp | 326
-rw-r--r-- contrib/llvm/lib/Target/X86/X86PadShortFunction.cpp | 213
-rw-r--r-- contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp | 639
-rw-r--r-- contrib/llvm/lib/Target/X86/X86RegisterInfo.h | 143
-rw-r--r-- contrib/llvm/lib/Target/X86/X86RegisterInfo.td | 497
-rw-r--r-- contrib/llvm/lib/Target/X86/X86SchedHaswell.td | 2147
-rw-r--r-- contrib/llvm/lib/Target/X86/X86SchedSandyBridge.td | 250
-rw-r--r-- contrib/llvm/lib/Target/X86/X86Schedule.td | 650
-rw-r--r-- contrib/llvm/lib/Target/X86/X86ScheduleAtom.td | 549
-rw-r--r-- contrib/llvm/lib/Target/X86/X86ScheduleBtVer2.td | 341
-rw-r--r-- contrib/llvm/lib/Target/X86/X86ScheduleSLM.td | 233
-rw-r--r-- contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp | 284
-rw-r--r-- contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.h | 52
-rw-r--r-- contrib/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp | 190
-rw-r--r-- contrib/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.h | 45
-rw-r--r-- contrib/llvm/lib/Target/X86/X86Subtarget.cpp | 343
-rw-r--r-- contrib/llvm/lib/Target/X86/X86Subtarget.h | 546
-rw-r--r-- contrib/llvm/lib/Target/X86/X86TargetMachine.cpp | 280
-rw-r--r-- contrib/llvm/lib/Target/X86/X86TargetMachine.h | 49
-rw-r--r-- contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp | 175
-rw-r--r-- contrib/llvm/lib/Target/X86/X86TargetObjectFile.h | 67
-rw-r--r-- contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp | 1487
-rw-r--r-- contrib/llvm/lib/Target/X86/X86TargetTransformInfo.h | 109
-rw-r--r-- contrib/llvm/lib/Target/X86/X86VZeroUpper.cpp | 320
-rw-r--r-- contrib/llvm/lib/Target/X86/X86WinEHState.cpp | 456
103 files changed, 113125 insertions, 0 deletions
diff --git a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp
new file mode 100644
index 0000000..09cc53a
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp
@@ -0,0 +1,1077 @@
+//===-- X86AsmInstrumentation.cpp - Instrument X86 inline assembly C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/X86BaseInfo.h"
+#include "X86AsmInstrumentation.h"
+#include "X86Operand.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstBuilder.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCTargetAsmParser.h"
+#include "llvm/MC/MCTargetOptions.h"
+#include "llvm/Support/CommandLine.h"
+#include <algorithm>
+#include <cassert>
+#include <vector>
+
+// The following comment describes how assembly instrumentation works.
+// Currently we have only AddressSanitizer instrumentation, but we're
+// planning to implement MemorySanitizer for inline assembly too. If
+// you're not familiar with the AddressSanitizer algorithm, please read
+// https://code.google.com/p/address-sanitizer/wiki/AddressSanitizerAlgorithm.
+//
+// When inline assembly is parsed by an instance of X86AsmParser, all
+// instructions are emitted via the EmitInstruction method. That's the
+// place where X86AsmInstrumentation analyzes an instruction and
+// decides whether the instruction should be emitted as is or whether
+// instrumentation is required. The latter case happens when an
+// instruction reads from or writes to memory. Currently the instruction
+// opcode is explicitly checked, and if an instruction has a memory
+// operand (for instance, movq (%rsi, %rcx, 8), %rax) it should be
+// instrumented. There also exist instructions that modify memory but
+// don't have an explicit memory operand, for instance, movs.
+//
+// Let's first consider 8-byte memory accesses when an instruction has
+// an explicit memory operand. In this case we need two registers -
+// AddressReg to compute the address of the memory cells which are
+// accessed and ShadowReg to compute the corresponding shadow address.
+// So, we need to spill both registers before the instrumentation code
+// and restore them after it. Thus, in general, instrumentation code
+// will look like this:
+// PUSHF # Store flags, otherwise they will be overwritten
+// PUSH AddressReg # spill AddressReg
+// PUSH ShadowReg # spill ShadowReg
+// LEA MemOp, AddressReg # compute address of the memory operand
+// MOV AddressReg, ShadowReg
+// SHR ShadowReg, 3
+// # ShadowOffset(AddressReg >> 3) contains the address of the shadow
+// # value corresponding to MemOp.
+// CMP ShadowOffset(ShadowReg), 0 # test shadow value
+// JZ .Done # when the shadow equals zero, everything is fine
+// MOV AddressReg, RDI
+// # Call __asan_report function with AddressReg as an argument
+// CALL __asan_report
+// .Done:
+// POP ShadowReg # Restore ShadowReg
+// POP AddressReg # Restore AddressReg
+// POPF # Restore flags
+//
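+// As a sketch (assuming the standard ASan 1:8 shadow mapping, with the
+// ShadowOffset constants defined by the classes below), the sequence
+// above is equivalent to the following C-like pseudocode for an 8-byte
+// access at address Addr:
+//
+//   char Shadow = *(char *)((Addr >> 3) + ShadowOffset);
+//   if (Shadow != 0)
+//     __asan_report_load8(Addr); // or __asan_report_store8 for writes
+//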
+// Memory accesses of different sizes (1-, 2-, 4- and 16-byte) are
+// handled in a similar manner, but small memory accesses (less than 8
+// bytes) require an additional ScratchReg, which is used for the shadow
+// value.
+//
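+// For such accesses the shadow byte encodes how many leading bytes of
+// the corresponding 8-byte granule are addressable, so the emitted
+// check is equivalent to this sketch (for AccessSize < 8):
+//
+//   char Shadow = *(char *)((Addr >> 3) + ShadowOffset);
+//   if (Shadow != 0 && (char)((Addr & 7) + AccessSize - 1) >= Shadow)
+//     __asan_report_loadN(Addr); // N stands for AccessSize
+//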
+// If, say, we're instrumenting an instruction like movs, only the
+// contents of RDI, RDI + AccessSize * RCX, RSI and RSI + AccessSize *
+// RCX are checked. In this case there's no need to spill and restore
+// AddressReg, ShadowReg or the flags four times; they're saved on the
+// stack just once, before the instrumentation of these four addresses,
+// and restored at the end of it.
+//
+// There are several things which complicate this simple algorithm.
+// * The instrumented memory operand can have RSP as a base or an index
+// register. So we need to add a constant offset before the computation
+// of the memory address, since flags, AddressReg, ShadowReg, etc. were
+// already stored on the stack and RSP was modified.
+// * Debug info (usually DWARF) should be adjusted, because sometimes
+// RSP is used as a frame register. So, we need to select some register
+// as a frame register and temporarily override the current CFA
+// register.
+
+namespace llvm {
+namespace {
+
+static cl::opt<bool> ClAsanInstrumentAssembly(
+ "asan-instrument-assembly",
+ cl::desc("instrument assembly with AddressSanitizer checks"), cl::Hidden,
+ cl::init(false));
+
+const int64_t MinAllowedDisplacement = std::numeric_limits<int32_t>::min();
+const int64_t MaxAllowedDisplacement = std::numeric_limits<int32_t>::max();
+
+int64_t ApplyDisplacementBounds(int64_t Displacement) {
+ return std::max(std::min(MaxAllowedDisplacement, Displacement),
+ MinAllowedDisplacement);
+}
+
+void CheckDisplacementBounds(int64_t Displacement) {
+ assert(Displacement >= MinAllowedDisplacement &&
+ Displacement <= MaxAllowedDisplacement);
+}
+
+bool IsStackReg(unsigned Reg) { return Reg == X86::RSP || Reg == X86::ESP; }
+
+bool IsSmallMemAccess(unsigned AccessSize) { return AccessSize < 8; }
+
+class X86AddressSanitizer : public X86AsmInstrumentation {
+public:
+ struct RegisterContext {
+ private:
+ enum RegOffset {
+ REG_OFFSET_ADDRESS = 0,
+ REG_OFFSET_SHADOW,
+ REG_OFFSET_SCRATCH
+ };
+
+ public:
+ RegisterContext(unsigned AddressReg, unsigned ShadowReg,
+ unsigned ScratchReg) {
+ BusyRegs.push_back(convReg(AddressReg, 64));
+ BusyRegs.push_back(convReg(ShadowReg, 64));
+ BusyRegs.push_back(convReg(ScratchReg, 64));
+ }
+
+ unsigned AddressReg(unsigned Size) const {
+ return convReg(BusyRegs[REG_OFFSET_ADDRESS], Size);
+ }
+
+ unsigned ShadowReg(unsigned Size) const {
+ return convReg(BusyRegs[REG_OFFSET_SHADOW], Size);
+ }
+
+ unsigned ScratchReg(unsigned Size) const {
+ return convReg(BusyRegs[REG_OFFSET_SCRATCH], Size);
+ }
+
+ void AddBusyReg(unsigned Reg) {
+ if (Reg != X86::NoRegister)
+ BusyRegs.push_back(convReg(Reg, 64));
+ }
+
+ void AddBusyRegs(const X86Operand &Op) {
+ AddBusyReg(Op.getMemBaseReg());
+ AddBusyReg(Op.getMemIndexReg());
+ }
+
+ unsigned ChooseFrameReg(unsigned Size) const {
+ static const MCPhysReg Candidates[] = { X86::RBP, X86::RAX, X86::RBX,
+ X86::RCX, X86::RDX, X86::RDI,
+ X86::RSI };
+ for (unsigned Reg : Candidates) {
+ if (!std::count(BusyRegs.begin(), BusyRegs.end(), Reg))
+ return convReg(Reg, Size);
+ }
+ return X86::NoRegister;
+ }
+
+ private:
+ unsigned convReg(unsigned Reg, unsigned Size) const {
+ return Reg == X86::NoRegister ? Reg : getX86SubSuperRegister(Reg, Size);
+ }
+
+ std::vector<unsigned> BusyRegs;
+ };
+
+ X86AddressSanitizer(const MCSubtargetInfo *&STI)
+ : X86AsmInstrumentation(STI), RepPrefix(false), OrigSPOffset(0) {}
+
+ ~X86AddressSanitizer() override {}
+
+ // X86AsmInstrumentation implementation:
+ void InstrumentAndEmitInstruction(const MCInst &Inst,
+ OperandVector &Operands,
+ MCContext &Ctx,
+ const MCInstrInfo &MII,
+ MCStreamer &Out) override {
+ InstrumentMOVS(Inst, Operands, Ctx, MII, Out);
+ if (RepPrefix)
+ EmitInstruction(Out, MCInstBuilder(X86::REP_PREFIX));
+
+ InstrumentMOV(Inst, Operands, Ctx, MII, Out);
+
+ RepPrefix = (Inst.getOpcode() == X86::REP_PREFIX);
+ if (!RepPrefix)
+ EmitInstruction(Out, Inst);
+ }
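+
+ // Note on the REP handling above: "rep movsb" reaches this method as
+ // two separate MCInsts, REP_PREFIX followed by MOVSB. The prefix is
+ // therefore not emitted immediately; it is remembered in RepPrefix and
+ // re-emitted just before the movs instruction, after InstrumentMOVS has
+ // inserted its checks.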
+
+ // Adjusts the stack and saves all registers used in instrumentation.
+ virtual void InstrumentMemOperandPrologue(const RegisterContext &RegCtx,
+ MCContext &Ctx,
+ MCStreamer &Out) = 0;
+
+ // Restores all registers used in instrumentation and adjusts the stack.
+ virtual void InstrumentMemOperandEpilogue(const RegisterContext &RegCtx,
+ MCContext &Ctx,
+ MCStreamer &Out) = 0;
+
+ virtual void InstrumentMemOperandSmall(X86Operand &Op, unsigned AccessSize,
+ bool IsWrite,
+ const RegisterContext &RegCtx,
+ MCContext &Ctx, MCStreamer &Out) = 0;
+ virtual void InstrumentMemOperandLarge(X86Operand &Op, unsigned AccessSize,
+ bool IsWrite,
+ const RegisterContext &RegCtx,
+ MCContext &Ctx, MCStreamer &Out) = 0;
+
+ virtual void InstrumentMOVSImpl(unsigned AccessSize, MCContext &Ctx,
+ MCStreamer &Out) = 0;
+
+ void InstrumentMemOperand(X86Operand &Op, unsigned AccessSize, bool IsWrite,
+ const RegisterContext &RegCtx, MCContext &Ctx,
+ MCStreamer &Out);
+ void InstrumentMOVSBase(unsigned DstReg, unsigned SrcReg, unsigned CntReg,
+ unsigned AccessSize, MCContext &Ctx, MCStreamer &Out);
+
+ void InstrumentMOVS(const MCInst &Inst, OperandVector &Operands,
+ MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out);
+ void InstrumentMOV(const MCInst &Inst, OperandVector &Operands,
+ MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out);
+
+protected:
+ void EmitLabel(MCStreamer &Out, MCSymbol *Label) { Out.EmitLabel(Label); }
+
+ void EmitLEA(X86Operand &Op, unsigned Size, unsigned Reg, MCStreamer &Out) {
+ assert(Size == 32 || Size == 64);
+ MCInst Inst;
+ Inst.setOpcode(Size == 32 ? X86::LEA32r : X86::LEA64r);
+ Inst.addOperand(MCOperand::createReg(getX86SubSuperRegister(Reg, Size)));
+ Op.addMemOperands(Inst, 5);
+ EmitInstruction(Out, Inst);
+ }
+
+ void ComputeMemOperandAddress(X86Operand &Op, unsigned Size,
+ unsigned Reg, MCContext &Ctx, MCStreamer &Out);
+
+ // Creates a new memory operand with Displacement added to the original
+ // displacement. Residue will contain the remainder that arises when the
+ // total displacement exceeds the 32-bit limit.
+ std::unique_ptr<X86Operand> AddDisplacement(X86Operand &Op,
+ int64_t Displacement,
+ MCContext &Ctx, int64_t *Residue);
+
+ bool is64BitMode() const {
+ return STI->getFeatureBits()[X86::Mode64Bit];
+ }
+ bool is32BitMode() const {
+ return STI->getFeatureBits()[X86::Mode32Bit];
+ }
+ bool is16BitMode() const {
+ return STI->getFeatureBits()[X86::Mode16Bit];
+ }
+
+ unsigned getPointerWidth() {
+ if (is16BitMode()) return 16;
+ if (is32BitMode()) return 32;
+ if (is64BitMode()) return 64;
+ llvm_unreachable("invalid mode");
+ }
+
+ // True when the previous instruction was actually a REP prefix.
+ bool RepPrefix;
+
+ // Offset from the original SP register.
+ int64_t OrigSPOffset;
+};
+
+void X86AddressSanitizer::InstrumentMemOperand(
+ X86Operand &Op, unsigned AccessSize, bool IsWrite,
+ const RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) {
+ assert(Op.isMem() && "Op should be a memory operand.");
+ assert((AccessSize & (AccessSize - 1)) == 0 && AccessSize <= 16 &&
+ "AccessSize should be a power of two, less than or equal to 16.");
+ // FIXME: take into account load/store alignment.
+ if (IsSmallMemAccess(AccessSize))
+ InstrumentMemOperandSmall(Op, AccessSize, IsWrite, RegCtx, Ctx, Out);
+ else
+ InstrumentMemOperandLarge(Op, AccessSize, IsWrite, RegCtx, Ctx, Out);
+}
+
+void X86AddressSanitizer::InstrumentMOVSBase(unsigned DstReg, unsigned SrcReg,
+ unsigned CntReg,
+ unsigned AccessSize,
+ MCContext &Ctx, MCStreamer &Out) {
+ // FIXME: check whole ranges [DstReg .. DstReg + AccessSize * (CntReg - 1)]
+ // and [SrcReg .. SrcReg + AccessSize * (CntReg - 1)].
+ RegisterContext RegCtx(X86::RDX /* AddressReg */, X86::RAX /* ShadowReg */,
+ IsSmallMemAccess(AccessSize)
+ ? X86::RBX
+ : X86::NoRegister /* ScratchReg */);
+ RegCtx.AddBusyReg(DstReg);
+ RegCtx.AddBusyReg(SrcReg);
+ RegCtx.AddBusyReg(CntReg);
+
+ InstrumentMemOperandPrologue(RegCtx, Ctx, Out);
+
+ // Test (%SrcReg)
+ {
+ const MCExpr *Disp = MCConstantExpr::create(0, Ctx);
+ std::unique_ptr<X86Operand> Op(X86Operand::CreateMem(
+ getPointerWidth(), 0, Disp, SrcReg, 0, AccessSize, SMLoc(), SMLoc()));
+ InstrumentMemOperand(*Op, AccessSize, false /* IsWrite */, RegCtx, Ctx,
+ Out);
+ }
+
+ // Test -1(%SrcReg, %CntReg, AccessSize)
+ {
+ const MCExpr *Disp = MCConstantExpr::create(-1, Ctx);
+ std::unique_ptr<X86Operand> Op(X86Operand::CreateMem(
+ getPointerWidth(), 0, Disp, SrcReg, CntReg, AccessSize, SMLoc(),
+ SMLoc()));
+ InstrumentMemOperand(*Op, AccessSize, false /* IsWrite */, RegCtx, Ctx,
+ Out);
+ }
+
+ // Test (%DstReg)
+ {
+ const MCExpr *Disp = MCConstantExpr::create(0, Ctx);
+ std::unique_ptr<X86Operand> Op(X86Operand::CreateMem(
+ getPointerWidth(), 0, Disp, DstReg, 0, AccessSize, SMLoc(), SMLoc()));
+ InstrumentMemOperand(*Op, AccessSize, true /* IsWrite */, RegCtx, Ctx, Out);
+ }
+
+ // Test -1(%DstReg, %CntReg, AccessSize)
+ {
+ const MCExpr *Disp = MCConstantExpr::create(-1, Ctx);
+ std::unique_ptr<X86Operand> Op(X86Operand::CreateMem(
+ getPointerWidth(), 0, Disp, DstReg, CntReg, AccessSize, SMLoc(),
+ SMLoc()));
+ InstrumentMemOperand(*Op, AccessSize, true /* IsWrite */, RegCtx, Ctx, Out);
+ }
+
+ InstrumentMemOperandEpilogue(RegCtx, Ctx, Out);
+}
+
+void X86AddressSanitizer::InstrumentMOVS(const MCInst &Inst,
+ OperandVector &Operands,
+ MCContext &Ctx, const MCInstrInfo &MII,
+ MCStreamer &Out) {
+ // Access size in bytes.
+ unsigned AccessSize = 0;
+
+ switch (Inst.getOpcode()) {
+ case X86::MOVSB:
+ AccessSize = 1;
+ break;
+ case X86::MOVSW:
+ AccessSize = 2;
+ break;
+ case X86::MOVSL:
+ AccessSize = 4;
+ break;
+ case X86::MOVSQ:
+ AccessSize = 8;
+ break;
+ default:
+ return;
+ }
+
+ InstrumentMOVSImpl(AccessSize, Ctx, Out);
+}
+
+void X86AddressSanitizer::InstrumentMOV(const MCInst &Inst,
+ OperandVector &Operands, MCContext &Ctx,
+ const MCInstrInfo &MII,
+ MCStreamer &Out) {
+ // Access size in bytes.
+ unsigned AccessSize = 0;
+
+ switch (Inst.getOpcode()) {
+ case X86::MOV8mi:
+ case X86::MOV8mr:
+ case X86::MOV8rm:
+ AccessSize = 1;
+ break;
+ case X86::MOV16mi:
+ case X86::MOV16mr:
+ case X86::MOV16rm:
+ AccessSize = 2;
+ break;
+ case X86::MOV32mi:
+ case X86::MOV32mr:
+ case X86::MOV32rm:
+ AccessSize = 4;
+ break;
+ case X86::MOV64mi32:
+ case X86::MOV64mr:
+ case X86::MOV64rm:
+ AccessSize = 8;
+ break;
+ case X86::MOVAPDmr:
+ case X86::MOVAPSmr:
+ case X86::MOVAPDrm:
+ case X86::MOVAPSrm:
+ AccessSize = 16;
+ break;
+ default:
+ return;
+ }
+
+ const bool IsWrite = MII.get(Inst.getOpcode()).mayStore();
+
+ for (unsigned Ix = 0; Ix < Operands.size(); ++Ix) {
+ assert(Operands[Ix]);
+ MCParsedAsmOperand &Op = *Operands[Ix];
+ if (Op.isMem()) {
+ X86Operand &MemOp = static_cast<X86Operand &>(Op);
+ RegisterContext RegCtx(
+ X86::RDI /* AddressReg */, X86::RAX /* ShadowReg */,
+ IsSmallMemAccess(AccessSize) ? X86::RCX
+ : X86::NoRegister /* ScratchReg */);
+ RegCtx.AddBusyRegs(MemOp);
+ InstrumentMemOperandPrologue(RegCtx, Ctx, Out);
+ InstrumentMemOperand(MemOp, AccessSize, IsWrite, RegCtx, Ctx, Out);
+ InstrumentMemOperandEpilogue(RegCtx, Ctx, Out);
+ }
+ }
+}
+
+void X86AddressSanitizer::ComputeMemOperandAddress(X86Operand &Op,
+ unsigned Size,
+ unsigned Reg, MCContext &Ctx,
+ MCStreamer &Out) {
+ int64_t Displacement = 0;
+ if (IsStackReg(Op.getMemBaseReg()))
+ Displacement -= OrigSPOffset;
+ if (IsStackReg(Op.getMemIndexReg()))
+ Displacement -= OrigSPOffset * Op.getMemScale();
+
+ assert(Displacement >= 0);
+
+ // Emit Op as is.
+ if (Displacement == 0) {
+ EmitLEA(Op, Size, Reg, Out);
+ return;
+ }
+
+ int64_t Residue;
+ std::unique_ptr<X86Operand> NewOp =
+ AddDisplacement(Op, Displacement, Ctx, &Residue);
+ EmitLEA(*NewOp, Size, Reg, Out);
+
+ while (Residue != 0) {
+ const MCConstantExpr *Disp =
+ MCConstantExpr::create(ApplyDisplacementBounds(Residue), Ctx);
+ std::unique_ptr<X86Operand> DispOp =
+ X86Operand::CreateMem(getPointerWidth(), 0, Disp, Reg, 0, 1, SMLoc(),
+ SMLoc());
+ EmitLEA(*DispOp, Size, Reg, Out);
+ Residue -= Disp->getValue();
+ }
+}
+
+std::unique_ptr<X86Operand>
+X86AddressSanitizer::AddDisplacement(X86Operand &Op, int64_t Displacement,
+ MCContext &Ctx, int64_t *Residue) {
+ assert(Displacement >= 0);
+
+ if (Displacement == 0 ||
+ (Op.getMemDisp() && Op.getMemDisp()->getKind() != MCExpr::Constant)) {
+ *Residue = Displacement;
+ return X86Operand::CreateMem(Op.getMemModeSize(), Op.getMemSegReg(),
+ Op.getMemDisp(), Op.getMemBaseReg(),
+ Op.getMemIndexReg(), Op.getMemScale(),
+ SMLoc(), SMLoc());
+ }
+
+ int64_t OrigDisplacement =
+ static_cast<const MCConstantExpr *>(Op.getMemDisp())->getValue();
+ CheckDisplacementBounds(OrigDisplacement);
+ Displacement += OrigDisplacement;
+
+ int64_t NewDisplacement = ApplyDisplacementBounds(Displacement);
+ CheckDisplacementBounds(NewDisplacement);
+
+ *Residue = Displacement - NewDisplacement;
+ const MCExpr *Disp = MCConstantExpr::create(NewDisplacement, Ctx);
+ return X86Operand::CreateMem(Op.getMemModeSize(), Op.getMemSegReg(), Disp,
+ Op.getMemBaseReg(), Op.getMemIndexReg(),
+ Op.getMemScale(), SMLoc(), SMLoc());
+}
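+
+// Worked example for the two routines above: if Op is 0x7ffffff0(%rsp)
+// and the extra Displacement is 0x100, the total 0x800000f0 no longer
+// fits into the 32-bit displacement field. AddDisplacement clamps the
+// new operand's displacement to 0x7fffffff and returns Residue = 0xf1,
+// which ComputeMemOperandAddress then folds into the already-computed
+// address with additional LEA instructions.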
+
+class X86AddressSanitizer32 : public X86AddressSanitizer {
+public:
+ static const long kShadowOffset = 0x20000000;
+
+ X86AddressSanitizer32(const MCSubtargetInfo *&STI)
+ : X86AddressSanitizer(STI) {}
+
+ ~X86AddressSanitizer32() override {}
+
+ unsigned GetFrameReg(const MCContext &Ctx, MCStreamer &Out) {
+ unsigned FrameReg = GetFrameRegGeneric(Ctx, Out);
+ if (FrameReg == X86::NoRegister)
+ return FrameReg;
+ return getX86SubSuperRegister(FrameReg, 32);
+ }
+
+ void SpillReg(MCStreamer &Out, unsigned Reg) {
+ EmitInstruction(Out, MCInstBuilder(X86::PUSH32r).addReg(Reg));
+ OrigSPOffset -= 4;
+ }
+
+ void RestoreReg(MCStreamer &Out, unsigned Reg) {
+ EmitInstruction(Out, MCInstBuilder(X86::POP32r).addReg(Reg));
+ OrigSPOffset += 4;
+ }
+
+ void StoreFlags(MCStreamer &Out) {
+ EmitInstruction(Out, MCInstBuilder(X86::PUSHF32));
+ OrigSPOffset -= 4;
+ }
+
+ void RestoreFlags(MCStreamer &Out) {
+ EmitInstruction(Out, MCInstBuilder(X86::POPF32));
+ OrigSPOffset += 4;
+ }
+
+ void InstrumentMemOperandPrologue(const RegisterContext &RegCtx,
+ MCContext &Ctx,
+ MCStreamer &Out) override {
+ unsigned LocalFrameReg = RegCtx.ChooseFrameReg(32);
+ assert(LocalFrameReg != X86::NoRegister);
+
+ const MCRegisterInfo *MRI = Ctx.getRegisterInfo();
+ unsigned FrameReg = GetFrameReg(Ctx, Out);
+ if (MRI && FrameReg != X86::NoRegister) {
+ SpillReg(Out, LocalFrameReg);
+ if (FrameReg == X86::ESP) {
+ Out.EmitCFIAdjustCfaOffset(4 /* byte size of the LocalFrameReg */);
+ Out.EmitCFIRelOffset(
+ MRI->getDwarfRegNum(LocalFrameReg, true /* IsEH */), 0);
+ }
+ EmitInstruction(
+ Out,
+ MCInstBuilder(X86::MOV32rr).addReg(LocalFrameReg).addReg(FrameReg));
+ Out.EmitCFIRememberState();
+ Out.EmitCFIDefCfaRegister(
+ MRI->getDwarfRegNum(LocalFrameReg, true /* IsEH */));
+ }
+
+ SpillReg(Out, RegCtx.AddressReg(32));
+ SpillReg(Out, RegCtx.ShadowReg(32));
+ if (RegCtx.ScratchReg(32) != X86::NoRegister)
+ SpillReg(Out, RegCtx.ScratchReg(32));
+ StoreFlags(Out);
+ }
+
+ void InstrumentMemOperandEpilogue(const RegisterContext &RegCtx,
+ MCContext &Ctx,
+ MCStreamer &Out) override {
+ unsigned LocalFrameReg = RegCtx.ChooseFrameReg(32);
+ assert(LocalFrameReg != X86::NoRegister);
+
+ RestoreFlags(Out);
+ if (RegCtx.ScratchReg(32) != X86::NoRegister)
+ RestoreReg(Out, RegCtx.ScratchReg(32));
+ RestoreReg(Out, RegCtx.ShadowReg(32));
+ RestoreReg(Out, RegCtx.AddressReg(32));
+
+ unsigned FrameReg = GetFrameReg(Ctx, Out);
+ if (Ctx.getRegisterInfo() && FrameReg != X86::NoRegister) {
+ RestoreReg(Out, LocalFrameReg);
+ Out.EmitCFIRestoreState();
+ if (FrameReg == X86::ESP)
+ Out.EmitCFIAdjustCfaOffset(-4 /* byte size of the LocalFrameReg */);
+ }
+ }
+
+ void InstrumentMemOperandSmall(X86Operand &Op, unsigned AccessSize,
+ bool IsWrite,
+ const RegisterContext &RegCtx,
+ MCContext &Ctx,
+ MCStreamer &Out) override;
+ void InstrumentMemOperandLarge(X86Operand &Op, unsigned AccessSize,
+ bool IsWrite,
+ const RegisterContext &RegCtx,
+ MCContext &Ctx,
+ MCStreamer &Out) override;
+ void InstrumentMOVSImpl(unsigned AccessSize, MCContext &Ctx,
+ MCStreamer &Out) override;
+
+private:
+ void EmitCallAsanReport(unsigned AccessSize, bool IsWrite, MCContext &Ctx,
+ MCStreamer &Out, const RegisterContext &RegCtx) {
+ EmitInstruction(Out, MCInstBuilder(X86::CLD));
+ EmitInstruction(Out, MCInstBuilder(X86::MMX_EMMS));
+
+ EmitInstruction(Out, MCInstBuilder(X86::AND64ri8)
+ .addReg(X86::ESP)
+ .addReg(X86::ESP)
+ .addImm(-16));
+ EmitInstruction(
+ Out, MCInstBuilder(X86::PUSH32r).addReg(RegCtx.AddressReg(32)));
+
+ MCSymbol *FnSym = Ctx.getOrCreateSymbol(llvm::Twine("__asan_report_") +
+ (IsWrite ? "store" : "load") +
+ llvm::Twine(AccessSize));
+ const MCSymbolRefExpr *FnExpr =
+ MCSymbolRefExpr::create(FnSym, MCSymbolRefExpr::VK_PLT, Ctx);
+ EmitInstruction(Out, MCInstBuilder(X86::CALLpcrel32).addExpr(FnExpr));
+ }
+};
+
+void X86AddressSanitizer32::InstrumentMemOperandSmall(
+ X86Operand &Op, unsigned AccessSize, bool IsWrite,
+ const RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) {
+ unsigned AddressRegI32 = RegCtx.AddressReg(32);
+ unsigned ShadowRegI32 = RegCtx.ShadowReg(32);
+ unsigned ShadowRegI8 = RegCtx.ShadowReg(8);
+
+ assert(RegCtx.ScratchReg(32) != X86::NoRegister);
+ unsigned ScratchRegI32 = RegCtx.ScratchReg(32);
+
+ ComputeMemOperandAddress(Op, 32, AddressRegI32, Ctx, Out);
+
+ EmitInstruction(Out, MCInstBuilder(X86::MOV32rr).addReg(ShadowRegI32).addReg(
+ AddressRegI32));
+ EmitInstruction(Out, MCInstBuilder(X86::SHR32ri)
+ .addReg(ShadowRegI32)
+ .addReg(ShadowRegI32)
+ .addImm(3));
+
+ {
+ MCInst Inst;
+ Inst.setOpcode(X86::MOV8rm);
+ Inst.addOperand(MCOperand::createReg(ShadowRegI8));
+ const MCExpr *Disp = MCConstantExpr::create(kShadowOffset, Ctx);
+ std::unique_ptr<X86Operand> Op(
+ X86Operand::CreateMem(getPointerWidth(), 0, Disp, ShadowRegI32, 0, 1,
+ SMLoc(), SMLoc()));
+ Op->addMemOperands(Inst, 5);
+ EmitInstruction(Out, Inst);
+ }
+
+ EmitInstruction(
+ Out, MCInstBuilder(X86::TEST8rr).addReg(ShadowRegI8).addReg(ShadowRegI8));
+ MCSymbol *DoneSym = Ctx.createTempSymbol();
+ const MCExpr *DoneExpr = MCSymbolRefExpr::create(DoneSym, Ctx);
+ EmitInstruction(Out, MCInstBuilder(X86::JE_1).addExpr(DoneExpr));
+
+ EmitInstruction(Out, MCInstBuilder(X86::MOV32rr).addReg(ScratchRegI32).addReg(
+ AddressRegI32));
+ EmitInstruction(Out, MCInstBuilder(X86::AND32ri)
+ .addReg(ScratchRegI32)
+ .addReg(ScratchRegI32)
+ .addImm(7));
+
+ switch (AccessSize) {
+ default: llvm_unreachable("Incorrect access size");
+ case 1:
+ break;
+ case 2: {
+ const MCExpr *Disp = MCConstantExpr::create(1, Ctx);
+ std::unique_ptr<X86Operand> Op(
+ X86Operand::CreateMem(getPointerWidth(), 0, Disp, ScratchRegI32, 0, 1,
+ SMLoc(), SMLoc()));
+ EmitLEA(*Op, 32, ScratchRegI32, Out);
+ break;
+ }
+ case 4:
+ EmitInstruction(Out, MCInstBuilder(X86::ADD32ri8)
+ .addReg(ScratchRegI32)
+ .addReg(ScratchRegI32)
+ .addImm(3));
+ break;
+ }
+
+ EmitInstruction(
+ Out,
+ MCInstBuilder(X86::MOVSX32rr8).addReg(ShadowRegI32).addReg(ShadowRegI8));
+ EmitInstruction(Out, MCInstBuilder(X86::CMP32rr).addReg(ScratchRegI32).addReg(
+ ShadowRegI32));
+ EmitInstruction(Out, MCInstBuilder(X86::JL_1).addExpr(DoneExpr));
+
+ EmitCallAsanReport(AccessSize, IsWrite, Ctx, Out, RegCtx);
+ EmitLabel(Out, DoneSym);
+}
+
+void X86AddressSanitizer32::InstrumentMemOperandLarge(
+ X86Operand &Op, unsigned AccessSize, bool IsWrite,
+ const RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) {
+ unsigned AddressRegI32 = RegCtx.AddressReg(32);
+ unsigned ShadowRegI32 = RegCtx.ShadowReg(32);
+
+ ComputeMemOperandAddress(Op, 32, AddressRegI32, Ctx, Out);
+
+ EmitInstruction(Out, MCInstBuilder(X86::MOV32rr).addReg(ShadowRegI32).addReg(
+ AddressRegI32));
+ EmitInstruction(Out, MCInstBuilder(X86::SHR32ri)
+ .addReg(ShadowRegI32)
+ .addReg(ShadowRegI32)
+ .addImm(3));
+ {
+ MCInst Inst;
+ switch (AccessSize) {
+ default: llvm_unreachable("Incorrect access size");
+ case 8:
+ Inst.setOpcode(X86::CMP8mi);
+ break;
+ case 16:
+ Inst.setOpcode(X86::CMP16mi);
+ break;
+ }
+ const MCExpr *Disp = MCConstantExpr::create(kShadowOffset, Ctx);
+ std::unique_ptr<X86Operand> Op(
+ X86Operand::CreateMem(getPointerWidth(), 0, Disp, ShadowRegI32, 0, 1,
+ SMLoc(), SMLoc()));
+ Op->addMemOperands(Inst, 5);
+ Inst.addOperand(MCOperand::createImm(0));
+ EmitInstruction(Out, Inst);
+ }
+ MCSymbol *DoneSym = Ctx.createTempSymbol();
+ const MCExpr *DoneExpr = MCSymbolRefExpr::create(DoneSym, Ctx);
+ EmitInstruction(Out, MCInstBuilder(X86::JE_1).addExpr(DoneExpr));
+
+ EmitCallAsanReport(AccessSize, IsWrite, Ctx, Out, RegCtx);
+ EmitLabel(Out, DoneSym);
+}
+
+void X86AddressSanitizer32::InstrumentMOVSImpl(unsigned AccessSize,
+ MCContext &Ctx,
+ MCStreamer &Out) {
+ StoreFlags(Out);
+
+ // No need to test when ECX is equal to zero.
+ MCSymbol *DoneSym = Ctx.createTempSymbol();
+ const MCExpr *DoneExpr = MCSymbolRefExpr::create(DoneSym, Ctx);
+ EmitInstruction(
+ Out, MCInstBuilder(X86::TEST32rr).addReg(X86::ECX).addReg(X86::ECX));
+ EmitInstruction(Out, MCInstBuilder(X86::JE_1).addExpr(DoneExpr));
+
+ // Instrument first and last elements in src and dst range.
+ InstrumentMOVSBase(X86::EDI /* DstReg */, X86::ESI /* SrcReg */,
+ X86::ECX /* CntReg */, AccessSize, Ctx, Out);
+
+ EmitLabel(Out, DoneSym);
+ RestoreFlags(Out);
+}
+
+class X86AddressSanitizer64 : public X86AddressSanitizer {
+public:
+ static const long kShadowOffset = 0x7fff8000;
+
+ X86AddressSanitizer64(const MCSubtargetInfo *&STI)
+ : X86AddressSanitizer(STI) {}
+
+ ~X86AddressSanitizer64() override {}
+
+ unsigned GetFrameReg(const MCContext &Ctx, MCStreamer &Out) {
+ unsigned FrameReg = GetFrameRegGeneric(Ctx, Out);
+ if (FrameReg == X86::NoRegister)
+ return FrameReg;
+ return getX86SubSuperRegister(FrameReg, 64);
+ }
+
+ void SpillReg(MCStreamer &Out, unsigned Reg) {
+ EmitInstruction(Out, MCInstBuilder(X86::PUSH64r).addReg(Reg));
+ OrigSPOffset -= 8;
+ }
+
+ void RestoreReg(MCStreamer &Out, unsigned Reg) {
+ EmitInstruction(Out, MCInstBuilder(X86::POP64r).addReg(Reg));
+ OrigSPOffset += 8;
+ }
+
+ void StoreFlags(MCStreamer &Out) {
+ EmitInstruction(Out, MCInstBuilder(X86::PUSHF64));
+ OrigSPOffset -= 8;
+ }
+
+ void RestoreFlags(MCStreamer &Out) {
+ EmitInstruction(Out, MCInstBuilder(X86::POPF64));
+ OrigSPOffset += 8;
+ }
+
+ void InstrumentMemOperandPrologue(const RegisterContext &RegCtx,
+ MCContext &Ctx,
+ MCStreamer &Out) override {
+ unsigned LocalFrameReg = RegCtx.ChooseFrameReg(64);
+ assert(LocalFrameReg != X86::NoRegister);
+
+ const MCRegisterInfo *MRI = Ctx.getRegisterInfo();
+ unsigned FrameReg = GetFrameReg(Ctx, Out);
+ if (MRI && FrameReg != X86::NoRegister) {
+ SpillReg(Out, LocalFrameReg);
+ if (FrameReg == X86::RSP) {
+ Out.EmitCFIAdjustCfaOffset(8 /* byte size of the LocalFrameReg */);
+ Out.EmitCFIRelOffset(
+ MRI->getDwarfRegNum(LocalFrameReg, true /* IsEH */), 0);
+ }
+ EmitInstruction(
+ Out,
+ MCInstBuilder(X86::MOV64rr).addReg(LocalFrameReg).addReg(FrameReg));
+ Out.EmitCFIRememberState();
+ Out.EmitCFIDefCfaRegister(
+ MRI->getDwarfRegNum(LocalFrameReg, true /* IsEH */));
+ }
+
+ EmitAdjustRSP(Ctx, Out, -128);
+ SpillReg(Out, RegCtx.ShadowReg(64));
+ SpillReg(Out, RegCtx.AddressReg(64));
+ if (RegCtx.ScratchReg(64) != X86::NoRegister)
+ SpillReg(Out, RegCtx.ScratchReg(64));
+ StoreFlags(Out);
+ }
+
+ void InstrumentMemOperandEpilogue(const RegisterContext &RegCtx,
+ MCContext &Ctx,
+ MCStreamer &Out) override {
+ unsigned LocalFrameReg = RegCtx.ChooseFrameReg(64);
+ assert(LocalFrameReg != X86::NoRegister);
+
+ RestoreFlags(Out);
+ if (RegCtx.ScratchReg(64) != X86::NoRegister)
+ RestoreReg(Out, RegCtx.ScratchReg(64));
+ RestoreReg(Out, RegCtx.AddressReg(64));
+ RestoreReg(Out, RegCtx.ShadowReg(64));
+ EmitAdjustRSP(Ctx, Out, 128);
+
+ unsigned FrameReg = GetFrameReg(Ctx, Out);
+ if (Ctx.getRegisterInfo() && FrameReg != X86::NoRegister) {
+ RestoreReg(Out, LocalFrameReg);
+ Out.EmitCFIRestoreState();
+ if (FrameReg == X86::RSP)
+ Out.EmitCFIAdjustCfaOffset(-8 /* byte size of the LocalFrameReg */);
+ }
+ }
+
+ void InstrumentMemOperandSmall(X86Operand &Op, unsigned AccessSize,
+ bool IsWrite,
+ const RegisterContext &RegCtx,
+ MCContext &Ctx,
+ MCStreamer &Out) override;
+ void InstrumentMemOperandLarge(X86Operand &Op, unsigned AccessSize,
+ bool IsWrite,
+ const RegisterContext &RegCtx,
+ MCContext &Ctx,
+ MCStreamer &Out) override;
+ void InstrumentMOVSImpl(unsigned AccessSize, MCContext &Ctx,
+ MCStreamer &Out) override;
+
+private:
+ void EmitAdjustRSP(MCContext &Ctx, MCStreamer &Out, long Offset) {
+ const MCExpr *Disp = MCConstantExpr::create(Offset, Ctx);
+ std::unique_ptr<X86Operand> Op(
+ X86Operand::CreateMem(getPointerWidth(), 0, Disp, X86::RSP, 0, 1,
+ SMLoc(), SMLoc()));
+ EmitLEA(*Op, 64, X86::RSP, Out);
+ OrigSPOffset += Offset;
+ }
+
+ void EmitCallAsanReport(unsigned AccessSize, bool IsWrite, MCContext &Ctx,
+ MCStreamer &Out, const RegisterContext &RegCtx) {
+ EmitInstruction(Out, MCInstBuilder(X86::CLD));
+ EmitInstruction(Out, MCInstBuilder(X86::MMX_EMMS));
+
+ EmitInstruction(Out, MCInstBuilder(X86::AND64ri8)
+ .addReg(X86::RSP)
+ .addReg(X86::RSP)
+ .addImm(-16));
+
+ if (RegCtx.AddressReg(64) != X86::RDI) {
+ EmitInstruction(Out, MCInstBuilder(X86::MOV64rr).addReg(X86::RDI).addReg(
+ RegCtx.AddressReg(64)));
+ }
+ MCSymbol *FnSym = Ctx.getOrCreateSymbol(llvm::Twine("__asan_report_") +
+ (IsWrite ? "store" : "load") +
+ llvm::Twine(AccessSize));
+ const MCSymbolRefExpr *FnExpr =
+ MCSymbolRefExpr::create(FnSym, MCSymbolRefExpr::VK_PLT, Ctx);
+ EmitInstruction(Out, MCInstBuilder(X86::CALL64pcrel32).addExpr(FnExpr));
+ }
+};
+
+void X86AddressSanitizer64::InstrumentMemOperandSmall(
+ X86Operand &Op, unsigned AccessSize, bool IsWrite,
+ const RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) {
+ unsigned AddressRegI64 = RegCtx.AddressReg(64);
+ unsigned AddressRegI32 = RegCtx.AddressReg(32);
+ unsigned ShadowRegI64 = RegCtx.ShadowReg(64);
+ unsigned ShadowRegI32 = RegCtx.ShadowReg(32);
+ unsigned ShadowRegI8 = RegCtx.ShadowReg(8);
+
+ assert(RegCtx.ScratchReg(32) != X86::NoRegister);
+ unsigned ScratchRegI32 = RegCtx.ScratchReg(32);
+
+ ComputeMemOperandAddress(Op, 64, AddressRegI64, Ctx, Out);
+
+ EmitInstruction(Out, MCInstBuilder(X86::MOV64rr).addReg(ShadowRegI64).addReg(
+ AddressRegI64));
+ EmitInstruction(Out, MCInstBuilder(X86::SHR64ri)
+ .addReg(ShadowRegI64)
+ .addReg(ShadowRegI64)
+ .addImm(3));
+ {
+ MCInst Inst;
+ Inst.setOpcode(X86::MOV8rm);
+ Inst.addOperand(MCOperand::createReg(ShadowRegI8));
+ const MCExpr *Disp = MCConstantExpr::create(kShadowOffset, Ctx);
+ std::unique_ptr<X86Operand> Op(
+ X86Operand::CreateMem(getPointerWidth(), 0, Disp, ShadowRegI64, 0, 1,
+ SMLoc(), SMLoc()));
+ Op->addMemOperands(Inst, 5);
+ EmitInstruction(Out, Inst);
+ }
+
+ EmitInstruction(
+ Out, MCInstBuilder(X86::TEST8rr).addReg(ShadowRegI8).addReg(ShadowRegI8));
+ MCSymbol *DoneSym = Ctx.createTempSymbol();
+ const MCExpr *DoneExpr = MCSymbolRefExpr::create(DoneSym, Ctx);
+ EmitInstruction(Out, MCInstBuilder(X86::JE_1).addExpr(DoneExpr));
+
+ EmitInstruction(Out, MCInstBuilder(X86::MOV32rr).addReg(ScratchRegI32).addReg(
+ AddressRegI32));
+ EmitInstruction(Out, MCInstBuilder(X86::AND32ri)
+ .addReg(ScratchRegI32)
+ .addReg(ScratchRegI32)
+ .addImm(7));
+
+ switch (AccessSize) {
+ default: llvm_unreachable("Incorrect access size");
+ case 1:
+ break;
+ case 2: {
+ const MCExpr *Disp = MCConstantExpr::create(1, Ctx);
+ std::unique_ptr<X86Operand> Op(
+ X86Operand::CreateMem(getPointerWidth(), 0, Disp, ScratchRegI32, 0, 1,
+ SMLoc(), SMLoc()));
+ EmitLEA(*Op, 32, ScratchRegI32, Out);
+ break;
+ }
+ case 4:
+ EmitInstruction(Out, MCInstBuilder(X86::ADD32ri8)
+ .addReg(ScratchRegI32)
+ .addReg(ScratchRegI32)
+ .addImm(3));
+ break;
+ }
+
+ EmitInstruction(
+ Out,
+ MCInstBuilder(X86::MOVSX32rr8).addReg(ShadowRegI32).addReg(ShadowRegI8));
+ EmitInstruction(Out, MCInstBuilder(X86::CMP32rr).addReg(ScratchRegI32).addReg(
+ ShadowRegI32));
+ EmitInstruction(Out, MCInstBuilder(X86::JL_1).addExpr(DoneExpr));
+
+ EmitCallAsanReport(AccessSize, IsWrite, Ctx, Out, RegCtx);
+ EmitLabel(Out, DoneSym);
+}
+
+void X86AddressSanitizer64::InstrumentMemOperandLarge(
+ X86Operand &Op, unsigned AccessSize, bool IsWrite,
+ const RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) {
+ unsigned AddressRegI64 = RegCtx.AddressReg(64);
+ unsigned ShadowRegI64 = RegCtx.ShadowReg(64);
+
+ ComputeMemOperandAddress(Op, 64, AddressRegI64, Ctx, Out);
+
+ EmitInstruction(Out, MCInstBuilder(X86::MOV64rr).addReg(ShadowRegI64).addReg(
+ AddressRegI64));
+ EmitInstruction(Out, MCInstBuilder(X86::SHR64ri)
+ .addReg(ShadowRegI64)
+ .addReg(ShadowRegI64)
+ .addImm(3));
+ {
+ MCInst Inst;
+ switch (AccessSize) {
+ default: llvm_unreachable("Incorrect access size");
+ case 8:
+ Inst.setOpcode(X86::CMP8mi);
+ break;
+ case 16:
+ Inst.setOpcode(X86::CMP16mi);
+ break;
+ }
+ const MCExpr *Disp = MCConstantExpr::create(kShadowOffset, Ctx);
+ std::unique_ptr<X86Operand> Op(
+ X86Operand::CreateMem(getPointerWidth(), 0, Disp, ShadowRegI64, 0, 1,
+ SMLoc(), SMLoc()));
+ Op->addMemOperands(Inst, 5);
+ Inst.addOperand(MCOperand::createImm(0));
+ EmitInstruction(Out, Inst);
+ }
+
+ MCSymbol *DoneSym = Ctx.createTempSymbol();
+ const MCExpr *DoneExpr = MCSymbolRefExpr::create(DoneSym, Ctx);
+ EmitInstruction(Out, MCInstBuilder(X86::JE_1).addExpr(DoneExpr));
+
+ EmitCallAsanReport(AccessSize, IsWrite, Ctx, Out, RegCtx);
+ EmitLabel(Out, DoneSym);
+}
+
+void X86AddressSanitizer64::InstrumentMOVSImpl(unsigned AccessSize,
+ MCContext &Ctx,
+ MCStreamer &Out) {
+ StoreFlags(Out);
+
+ // No need to test when RCX is equal to zero.
+ MCSymbol *DoneSym = Ctx.createTempSymbol();
+ const MCExpr *DoneExpr = MCSymbolRefExpr::create(DoneSym, Ctx);
+ EmitInstruction(
+ Out, MCInstBuilder(X86::TEST64rr).addReg(X86::RCX).addReg(X86::RCX));
+ EmitInstruction(Out, MCInstBuilder(X86::JE_1).addExpr(DoneExpr));
+
+ // Instrument first and last elements in src and dst range.
+ InstrumentMOVSBase(X86::RDI /* DstReg */, X86::RSI /* SrcReg */,
+ X86::RCX /* CntReg */, AccessSize, Ctx, Out);
+
+ EmitLabel(Out, DoneSym);
+ RestoreFlags(Out);
+}
+
+} // End anonymous namespace
+
+X86AsmInstrumentation::X86AsmInstrumentation(const MCSubtargetInfo *&STI)
+ : STI(STI), InitialFrameReg(0) {}
+
+X86AsmInstrumentation::~X86AsmInstrumentation() {}
+
+void X86AsmInstrumentation::InstrumentAndEmitInstruction(
+ const MCInst &Inst, OperandVector &Operands, MCContext &Ctx,
+ const MCInstrInfo &MII, MCStreamer &Out) {
+ EmitInstruction(Out, Inst);
+}
+
+void X86AsmInstrumentation::EmitInstruction(MCStreamer &Out,
+ const MCInst &Inst) {
+ Out.EmitInstruction(Inst, *STI);
+}
+
+unsigned X86AsmInstrumentation::GetFrameRegGeneric(const MCContext &Ctx,
+ MCStreamer &Out) {
+ if (!Out.getNumFrameInfos()) // No active dwarf frame
+ return X86::NoRegister;
+ const MCDwarfFrameInfo &Frame = Out.getDwarfFrameInfos().back();
+ if (Frame.End) // Active dwarf frame is closed
+ return X86::NoRegister;
+ const MCRegisterInfo *MRI = Ctx.getRegisterInfo();
+ if (!MRI) // No register info
+ return X86::NoRegister;
+
+ if (InitialFrameReg) {
+ // FrameReg is set explicitly, we're instrumenting a MachineFunction.
+ return InitialFrameReg;
+ }
+
+ return MRI->getLLVMRegNum(Frame.CurrentCfaRegister, true /* IsEH */);
+}
+
+X86AsmInstrumentation *
+CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions,
+ const MCContext &Ctx, const MCSubtargetInfo *&STI) {
+ Triple T(STI->getTargetTriple());
+ const bool hasCompilerRTSupport = T.isOSLinux();
+ if (ClAsanInstrumentAssembly && hasCompilerRTSupport &&
+ MCOptions.SanitizeAddress) {
+ if (STI->getFeatureBits()[X86::Mode32Bit] != 0)
+ return new X86AddressSanitizer32(STI);
+ if (STI->getFeatureBits()[X86::Mode64Bit] != 0)
+ return new X86AddressSanitizer64(STI);
+ }
+ return new X86AsmInstrumentation(STI);
+}
+
+} // end llvm namespace
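
The intended usage of this translation unit, as a minimal sketch (the names
MCOptions, Ctx, STI, MII, Out, Inst and Operands stand for the parser's own
MC objects and are assumed to be in scope; the factory and emit hook are the
ones defined above): instrumentation is only active when the
-asan-instrument-assembly flag and MCOptions.SanitizeAddress are both set on
a Linux target, otherwise the base class simply forwards every instruction.

    std::unique_ptr<X86AsmInstrumentation> Instrumentation(
        CreateX86AsmInstrumentation(MCOptions, Ctx, STI));
    // For each parsed instruction, instead of calling
    // Out.EmitInstruction(Inst, *STI) directly:
    Instrumentation->InstrumentAndEmitInstruction(Inst, Operands, Ctx,
                                                  MII, Out);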
diff --git a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.h b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.h
new file mode 100644
index 0000000..470cead
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.h
@@ -0,0 +1,68 @@
+//===- X86AsmInstrumentation.h - Instrument X86 inline assembly *- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_ASMPARSER_X86ASMINSTRUMENTATION_H
+#define LLVM_LIB_TARGET_X86_ASMPARSER_X86ASMINSTRUMENTATION_H
+
+#include "llvm/ADT/SmallVector.h"
+
+#include <memory>
+
+namespace llvm {
+
+class MCContext;
+class MCInst;
+class MCInstrInfo;
+class MCParsedAsmOperand;
+class MCStreamer;
+class MCSubtargetInfo;
+class MCTargetOptions;
+
+class X86AsmInstrumentation;
+
+X86AsmInstrumentation *
+CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions,
+ const MCContext &Ctx,
+ const MCSubtargetInfo *&STI);
+
+class X86AsmInstrumentation {
+public:
+ virtual ~X86AsmInstrumentation();
+
+ // Sets the frame register corresponding to the current frame.
+ void SetInitialFrameRegister(unsigned RegNo) {
+ InitialFrameReg = RegNo;
+ }
+
+ // Tries to instrument and emit an instruction.
+ virtual void InstrumentAndEmitInstruction(
+ const MCInst &Inst,
+ SmallVectorImpl<std::unique_ptr<MCParsedAsmOperand> > &Operands,
+ MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out);
+
+protected:
+ friend X86AsmInstrumentation *
+ CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions,
+ const MCContext &Ctx,
+ const MCSubtargetInfo *&STI);
+
+ X86AsmInstrumentation(const MCSubtargetInfo *&STI);
+
+ unsigned GetFrameRegGeneric(const MCContext &Ctx, MCStreamer &Out);
+
+ void EmitInstruction(MCStreamer &Out, const MCInst &Inst);
+
+ const MCSubtargetInfo *&STI;
+
+ unsigned InitialFrameReg;
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
new file mode 100644
index 0000000..4d8ffac
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -0,0 +1,2951 @@
+//===-- X86AsmParser.cpp - Parse X86 assembly to MCInst instructions ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/X86BaseInfo.h"
+#include "X86AsmInstrumentation.h"
+#include "X86AsmParserCommon.h"
+#include "X86Operand.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCParser/MCAsmParser.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCTargetAsmParser.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <memory>
+
+using namespace llvm;
+
+namespace {
+
+static const char OpPrecedence[] = {
+ 0, // IC_OR
+ 1, // IC_XOR
+ 2, // IC_AND
+ 3, // IC_LSHIFT
+ 3, // IC_RSHIFT
+ 4, // IC_PLUS
+ 4, // IC_MINUS
+ 5, // IC_MULTIPLY
+ 5, // IC_DIVIDE
+ 6, // IC_RPAREN
+ 7, // IC_LPAREN
+ 0, // IC_IMM
+ 0 // IC_REGISTER
+};
+
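+// Example: given these precedences, the InfixCalculator defined below
+// converts the infix expression "1 + 2 * 3" into the postfix form
+// "1 2 3 * +" (multiplication binds tighter than addition), which
+// execute() then evaluates to 7.
+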
+class X86AsmParser : public MCTargetAsmParser {
+ const MCInstrInfo &MII;
+ ParseInstructionInfo *InstInfo;
+ std::unique_ptr<X86AsmInstrumentation> Instrumentation;
+
+private:
+ SMLoc consumeToken() {
+ MCAsmParser &Parser = getParser();
+ SMLoc Result = Parser.getTok().getLoc();
+ Parser.Lex();
+ return Result;
+ }
+
+ enum InfixCalculatorTok {
+ IC_OR = 0,
+ IC_XOR,
+ IC_AND,
+ IC_LSHIFT,
+ IC_RSHIFT,
+ IC_PLUS,
+ IC_MINUS,
+ IC_MULTIPLY,
+ IC_DIVIDE,
+ IC_RPAREN,
+ IC_LPAREN,
+ IC_IMM,
+ IC_REGISTER
+ };
+
+ class InfixCalculator {
+ typedef std::pair< InfixCalculatorTok, int64_t > ICToken;
+ SmallVector<InfixCalculatorTok, 4> InfixOperatorStack;
+ SmallVector<ICToken, 4> PostfixStack;
+
+ public:
+ int64_t popOperand() {
+ assert (!PostfixStack.empty() && "Popped an empty stack!");
+ ICToken Op = PostfixStack.pop_back_val();
+ assert ((Op.first == IC_IMM || Op.first == IC_REGISTER)
+ && "Expected an immediate or register!");
+ return Op.second;
+ }
+ void pushOperand(InfixCalculatorTok Op, int64_t Val = 0) {
+ assert ((Op == IC_IMM || Op == IC_REGISTER) &&
+ "Unexpected operand!");
+ PostfixStack.push_back(std::make_pair(Op, Val));
+ }
+
+ void popOperator() { InfixOperatorStack.pop_back(); }
+ void pushOperator(InfixCalculatorTok Op) {
+ // Push the new operator if the stack is empty.
+ if (InfixOperatorStack.empty()) {
+ InfixOperatorStack.push_back(Op);
+ return;
+ }
+
+ // Push the new operator if it has a higher precedence than the operator
+ // on the top of the stack or if the operator on the top of the stack is
+ // a left parenthesis.
+ unsigned Idx = InfixOperatorStack.size() - 1;
+ InfixCalculatorTok StackOp = InfixOperatorStack[Idx];
+ if (OpPrecedence[Op] > OpPrecedence[StackOp] || StackOp == IC_LPAREN) {
+ InfixOperatorStack.push_back(Op);
+ return;
+ }
+
+ // The operator on the top of the stack has higher precedence than the
+ // new operator.
+ unsigned ParenCount = 0;
+ while (1) {
+ // Nothing to process.
+ if (InfixOperatorStack.empty())
+ break;
+
+ Idx = InfixOperatorStack.size() - 1;
+ StackOp = InfixOperatorStack[Idx];
+ if (!(OpPrecedence[StackOp] >= OpPrecedence[Op] || ParenCount))
+ break;
+
+ // If the parenthesis count is zero and we see a left parenthesis,
+ // then stop processing.
+ if (!ParenCount && StackOp == IC_LPAREN)
+ break;
+
+ if (StackOp == IC_RPAREN) {
+ ++ParenCount;
+ InfixOperatorStack.pop_back();
+ } else if (StackOp == IC_LPAREN) {
+ --ParenCount;
+ InfixOperatorStack.pop_back();
+ } else {
+ InfixOperatorStack.pop_back();
+ PostfixStack.push_back(std::make_pair(StackOp, 0));
+ }
+ }
+ // Push the new operator.
+ InfixOperatorStack.push_back(Op);
+ }
+
+ int64_t execute() {
+ // Push any remaining operators onto the postfix stack.
+ while (!InfixOperatorStack.empty()) {
+ InfixCalculatorTok StackOp = InfixOperatorStack.pop_back_val();
+ if (StackOp != IC_LPAREN && StackOp != IC_RPAREN)
+ PostfixStack.push_back(std::make_pair(StackOp, 0));
+ }
+
+ if (PostfixStack.empty())
+ return 0;
+
+ SmallVector<ICToken, 16> OperandStack;
+ for (unsigned i = 0, e = PostfixStack.size(); i != e; ++i) {
+ ICToken Op = PostfixStack[i];
+ if (Op.first == IC_IMM || Op.first == IC_REGISTER) {
+ OperandStack.push_back(Op);
+ } else {
+ assert (OperandStack.size() > 1 && "Too few operands.");
+ int64_t Val;
+ ICToken Op2 = OperandStack.pop_back_val();
+ ICToken Op1 = OperandStack.pop_back_val();
+ switch (Op.first) {
+ default:
+ report_fatal_error("Unexpected operator!");
+ break;
+ case IC_PLUS:
+ Val = Op1.second + Op2.second;
+ OperandStack.push_back(std::make_pair(IC_IMM, Val));
+ break;
+ case IC_MINUS:
+ Val = Op1.second - Op2.second;
+ OperandStack.push_back(std::make_pair(IC_IMM, Val));
+ break;
+ case IC_MULTIPLY:
+ assert (Op1.first == IC_IMM && Op2.first == IC_IMM &&
+ "Multiply operation with an immediate and a register!");
+ Val = Op1.second * Op2.second;
+ OperandStack.push_back(std::make_pair(IC_IMM, Val));
+ break;
+ case IC_DIVIDE:
+ assert (Op1.first == IC_IMM && Op2.first == IC_IMM &&
+ "Divide operation with an immediate and a register!");
+ assert (Op2.second != 0 && "Division by zero!");
+ Val = Op1.second / Op2.second;
+ OperandStack.push_back(std::make_pair(IC_IMM, Val));
+ break;
+ case IC_OR:
+ assert (Op1.first == IC_IMM && Op2.first == IC_IMM &&
+ "Or operation with an immediate and a register!");
+ Val = Op1.second | Op2.second;
+ OperandStack.push_back(std::make_pair(IC_IMM, Val));
+ break;
+ case IC_XOR:
+ assert(Op1.first == IC_IMM && Op2.first == IC_IMM &&
+ "Xor operation with an immediate and a register!");
+ Val = Op1.second ^ Op2.second;
+ OperandStack.push_back(std::make_pair(IC_IMM, Val));
+ break;
+ case IC_AND:
+ assert (Op1.first == IC_IMM && Op2.first == IC_IMM &&
+ "And operation with an immediate and a register!");
+ Val = Op1.second & Op2.second;
+ OperandStack.push_back(std::make_pair(IC_IMM, Val));
+ break;
+ case IC_LSHIFT:
+ assert (Op1.first == IC_IMM && Op2.first == IC_IMM &&
+ "Left shift operation with an immediate and a register!");
+ Val = Op1.second << Op2.second;
+ OperandStack.push_back(std::make_pair(IC_IMM, Val));
+ break;
+ case IC_RSHIFT:
+ assert (Op1.first == IC_IMM && Op2.first == IC_IMM &&
+ "Right shift operation with an immediate and a register!");
+ Val = Op1.second >> Op2.second;
+ OperandStack.push_back(std::make_pair(IC_IMM, Val));
+ break;
+ }
+ }
+ }
+ assert (OperandStack.size() == 1 && "Expected a single result.");
+ return OperandStack.pop_back_val().second;
+ }
+ };
+
+ enum IntelExprState {
+ IES_OR,
+ IES_XOR,
+ IES_AND,
+ IES_LSHIFT,
+ IES_RSHIFT,
+ IES_PLUS,
+ IES_MINUS,
+ IES_NOT,
+ IES_MULTIPLY,
+ IES_DIVIDE,
+ IES_LBRAC,
+ IES_RBRAC,
+ IES_LPAREN,
+ IES_RPAREN,
+ IES_REGISTER,
+ IES_INTEGER,
+ IES_IDENTIFIER,
+ IES_ERROR
+ };
+
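+ // Example: while parsing the Intel-syntax operand "eax + ebx*4 + 16"
+ // the parser drives this machine as onRegister(EAX), onPlus() (which
+ // records BaseReg = EAX), onRegister(EBX), onStar(), onInteger(4)
+ // (which records IndexReg = EBX and Scale = 4), onPlus() and
+ // onInteger(16); getImm() then yields the remaining displacement, 16.
+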
+ class IntelExprStateMachine {
+ IntelExprState State, PrevState;
+ unsigned BaseReg, IndexReg, TmpReg, Scale;
+ int64_t Imm;
+ const MCExpr *Sym;
+ StringRef SymName;
+ bool StopOnLBrac, AddImmPrefix;
+ InfixCalculator IC;
+ InlineAsmIdentifierInfo Info;
+
+ public:
+ IntelExprStateMachine(int64_t imm, bool stoponlbrac, bool addimmprefix) :
+ State(IES_PLUS), PrevState(IES_ERROR), BaseReg(0), IndexReg(0), TmpReg(0),
+ Scale(1), Imm(imm), Sym(nullptr), StopOnLBrac(stoponlbrac),
+ AddImmPrefix(addimmprefix) { Info.clear(); }
+
+ unsigned getBaseReg() { return BaseReg; }
+ unsigned getIndexReg() { return IndexReg; }
+ unsigned getScale() { return Scale; }
+ const MCExpr *getSym() { return Sym; }
+ StringRef getSymName() { return SymName; }
+ int64_t getImm() { return Imm + IC.execute(); }
+ bool isValidEndState() {
+ return State == IES_RBRAC || State == IES_INTEGER;
+ }
+ bool getStopOnLBrac() { return StopOnLBrac; }
+ bool getAddImmPrefix() { return AddImmPrefix; }
+ bool hadError() { return State == IES_ERROR; }
+
+ InlineAsmIdentifierInfo &getIdentifierInfo() {
+ return Info;
+ }
+
+ void onOr() {
+ IntelExprState CurrState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_INTEGER:
+ case IES_RPAREN:
+ case IES_REGISTER:
+ State = IES_OR;
+ IC.pushOperator(IC_OR);
+ break;
+ }
+ PrevState = CurrState;
+ }
+ void onXor() {
+ IntelExprState CurrState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_INTEGER:
+ case IES_RPAREN:
+ case IES_REGISTER:
+ State = IES_XOR;
+ IC.pushOperator(IC_XOR);
+ break;
+ }
+ PrevState = CurrState;
+ }
+ void onAnd() {
+ IntelExprState CurrState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_INTEGER:
+ case IES_RPAREN:
+ case IES_REGISTER:
+ State = IES_AND;
+ IC.pushOperator(IC_AND);
+ break;
+ }
+ PrevState = CurrState;
+ }
+ void onLShift() {
+ IntelExprState CurrState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_INTEGER:
+ case IES_RPAREN:
+ case IES_REGISTER:
+ State = IES_LSHIFT;
+ IC.pushOperator(IC_LSHIFT);
+ break;
+ }
+ PrevState = CurrState;
+ }
+ void onRShift() {
+ IntelExprState CurrState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_INTEGER:
+ case IES_RPAREN:
+ case IES_REGISTER:
+ State = IES_RSHIFT;
+ IC.pushOperator(IC_RSHIFT);
+ break;
+ }
+ PrevState = CurrState;
+ }
+ void onPlus() {
+ IntelExprState CurrState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_INTEGER:
+ case IES_RPAREN:
+ case IES_REGISTER:
+ State = IES_PLUS;
+ IC.pushOperator(IC_PLUS);
+ if (CurrState == IES_REGISTER && PrevState != IES_MULTIPLY) {
+ // If we already have a BaseReg, then assume this is the IndexReg with
+ // a scale of 1.
+ if (!BaseReg) {
+ BaseReg = TmpReg;
+ } else {
+ assert (!IndexReg && "BaseReg/IndexReg already set!");
+ IndexReg = TmpReg;
+ Scale = 1;
+ }
+ }
+ break;
+ }
+ PrevState = CurrState;
+ }
+ void onMinus() {
+ IntelExprState CurrState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_PLUS:
+ case IES_NOT:
+ case IES_MULTIPLY:
+ case IES_DIVIDE:
+ case IES_LPAREN:
+ case IES_RPAREN:
+ case IES_LBRAC:
+ case IES_RBRAC:
+ case IES_INTEGER:
+ case IES_REGISTER:
+ State = IES_MINUS;
+ // Only push the minus operator if it is not a unary operator.
+ if (!(CurrState == IES_PLUS || CurrState == IES_MINUS ||
+ CurrState == IES_MULTIPLY || CurrState == IES_DIVIDE ||
+ CurrState == IES_LPAREN || CurrState == IES_LBRAC))
+ IC.pushOperator(IC_MINUS);
+ if (CurrState == IES_REGISTER && PrevState != IES_MULTIPLY) {
+ // If we already have a BaseReg, then assume this is the IndexReg with
+ // a scale of 1.
+ if (!BaseReg) {
+ BaseReg = TmpReg;
+ } else {
+ assert (!IndexReg && "BaseReg/IndexReg already set!");
+ IndexReg = TmpReg;
+ Scale = 1;
+ }
+ }
+ break;
+ }
+ PrevState = CurrState;
+ }
+ void onNot() {
+ IntelExprState CurrState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_PLUS:
+ case IES_NOT:
+ State = IES_NOT;
+ break;
+ }
+ PrevState = CurrState;
+ }
+ void onRegister(unsigned Reg) {
+ IntelExprState CurrState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_PLUS:
+ case IES_LPAREN:
+ State = IES_REGISTER;
+ TmpReg = Reg;
+ IC.pushOperand(IC_REGISTER);
+ break;
+ case IES_MULTIPLY:
+ // Index Register - Scale * Register
+ if (PrevState == IES_INTEGER) {
+ assert (!IndexReg && "IndexReg already set!");
+ State = IES_REGISTER;
+ IndexReg = Reg;
+ // Get the scale and replace the 'Scale * Register' with '0'.
+ Scale = IC.popOperand();
+ IC.pushOperand(IC_IMM);
+ IC.popOperator();
+ } else {
+ State = IES_ERROR;
+ }
+ break;
+ }
+ PrevState = CurrState;
+ }
+ void onIdentifierExpr(const MCExpr *SymRef, StringRef SymRefName) {
+ PrevState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_PLUS:
+ case IES_MINUS:
+ case IES_NOT:
+ State = IES_INTEGER;
+ Sym = SymRef;
+ SymName = SymRefName;
+ IC.pushOperand(IC_IMM);
+ break;
+ }
+ }
+ bool onInteger(int64_t TmpInt, StringRef &ErrMsg) {
+ IntelExprState CurrState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_PLUS:
+ case IES_MINUS:
+ case IES_NOT:
+ case IES_OR:
+ case IES_XOR:
+ case IES_AND:
+ case IES_LSHIFT:
+ case IES_RSHIFT:
+ case IES_DIVIDE:
+ case IES_MULTIPLY:
+ case IES_LPAREN:
+ State = IES_INTEGER;
+ if (PrevState == IES_REGISTER && CurrState == IES_MULTIPLY) {
+ // Index Register - Register * Scale
+ assert(!IndexReg && "IndexReg already set!");
+ IndexReg = TmpReg;
+ Scale = TmpInt;
+ if (Scale != 1 && Scale != 2 && Scale != 4 && Scale != 8) {
+ ErrMsg = "scale factor in address must be 1, 2, 4 or 8";
+ return true;
+ }
+ // Get the scale and replace the 'Register * Scale' with '0'.
+ IC.popOperator();
+ } else if ((PrevState == IES_PLUS || PrevState == IES_MINUS ||
+ PrevState == IES_OR || PrevState == IES_AND ||
+ PrevState == IES_LSHIFT || PrevState == IES_RSHIFT ||
+ PrevState == IES_MULTIPLY || PrevState == IES_DIVIDE ||
+ PrevState == IES_LPAREN || PrevState == IES_LBRAC ||
+ PrevState == IES_NOT || PrevState == IES_XOR) &&
+ CurrState == IES_MINUS) {
+ // Unary minus. No need to pop the minus operand because it was never
+ // pushed.
+ IC.pushOperand(IC_IMM, -TmpInt); // Push -Imm.
+ } else if ((PrevState == IES_PLUS || PrevState == IES_MINUS ||
+ PrevState == IES_OR || PrevState == IES_AND ||
+ PrevState == IES_LSHIFT || PrevState == IES_RSHIFT ||
+ PrevState == IES_MULTIPLY || PrevState == IES_DIVIDE ||
+ PrevState == IES_LPAREN || PrevState == IES_LBRAC ||
+ PrevState == IES_NOT || PrevState == IES_XOR) &&
+ CurrState == IES_NOT) {
+ // Unary not. No need to pop the not operand because it was never
+ // pushed.
+ IC.pushOperand(IC_IMM, ~TmpInt); // Push ~Imm.
+ } else {
+ IC.pushOperand(IC_IMM, TmpInt);
+ }
+ break;
+ }
+ PrevState = CurrState;
+ return false;
+ }
+ void onStar() {
+ PrevState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_INTEGER:
+ case IES_REGISTER:
+ case IES_RPAREN:
+ State = IES_MULTIPLY;
+ IC.pushOperator(IC_MULTIPLY);
+ break;
+ }
+ }
+ void onDivide() {
+ PrevState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_INTEGER:
+ case IES_RPAREN:
+ State = IES_DIVIDE;
+ IC.pushOperator(IC_DIVIDE);
+ break;
+ }
+ }
+ void onLBrac() {
+ PrevState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_RBRAC:
+ State = IES_PLUS;
+ IC.pushOperator(IC_PLUS);
+ break;
+ }
+ }
+ void onRBrac() {
+ IntelExprState CurrState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_INTEGER:
+ case IES_REGISTER:
+ case IES_RPAREN:
+ State = IES_RBRAC;
+ if (CurrState == IES_REGISTER && PrevState != IES_MULTIPLY) {
+ // If we already have a BaseReg, then assume this is the IndexReg with
+ // a scale of 1.
+ if (!BaseReg) {
+ BaseReg = TmpReg;
+ } else {
+ assert(!IndexReg && "BaseReg/IndexReg already set!");
+ IndexReg = TmpReg;
+ Scale = 1;
+ }
+ }
+ break;
+ }
+ PrevState = CurrState;
+ }
+ void onLParen() {
+ IntelExprState CurrState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_PLUS:
+ case IES_MINUS:
+ case IES_NOT:
+ case IES_OR:
+ case IES_XOR:
+ case IES_AND:
+ case IES_LSHIFT:
+ case IES_RSHIFT:
+ case IES_MULTIPLY:
+ case IES_DIVIDE:
+ case IES_LPAREN:
+ // FIXME: We don't handle this type of unary minus or not, yet.
+ if ((PrevState == IES_PLUS || PrevState == IES_MINUS ||
+ PrevState == IES_OR || PrevState == IES_AND ||
+ PrevState == IES_LSHIFT || PrevState == IES_RSHIFT ||
+ PrevState == IES_MULTIPLY || PrevState == IES_DIVIDE ||
+ PrevState == IES_LPAREN || PrevState == IES_LBRAC ||
+ PrevState == IES_NOT || PrevState == IES_XOR) &&
+ (CurrState == IES_MINUS || CurrState == IES_NOT)) {
+ State = IES_ERROR;
+ break;
+ }
+ State = IES_LPAREN;
+ IC.pushOperator(IC_LPAREN);
+ break;
+ }
+ PrevState = CurrState;
+ }
+ void onRParen() {
+ PrevState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_INTEGER:
+ case IES_REGISTER:
+ case IES_RPAREN:
+ State = IES_RPAREN;
+ IC.pushOperator(IC_RPAREN);
+ break;
+ }
+ }
+ };
+
+ bool Error(SMLoc L, const Twine &Msg,
+ ArrayRef<SMRange> Ranges = None,
+ bool MatchingInlineAsm = false) {
+ MCAsmParser &Parser = getParser();
+ if (MatchingInlineAsm) return true;
+ return Parser.Error(L, Msg, Ranges);
+ }
+
+ bool ErrorAndEatStatement(SMLoc L, const Twine &Msg,
+ ArrayRef<SMRange> Ranges = None,
+ bool MatchingInlineAsm = false) {
+ MCAsmParser &Parser = getParser();
+ Parser.eatToEndOfStatement();
+ return Error(L, Msg, Ranges, MatchingInlineAsm);
+ }
+
+ std::nullptr_t ErrorOperand(SMLoc Loc, StringRef Msg) {
+ Error(Loc, Msg);
+ return nullptr;
+ }
+
+ std::unique_ptr<X86Operand> DefaultMemSIOperand(SMLoc Loc);
+ std::unique_ptr<X86Operand> DefaultMemDIOperand(SMLoc Loc);
+ void AddDefaultSrcDestOperands(
+ OperandVector& Operands, std::unique_ptr<llvm::MCParsedAsmOperand> &&Src,
+ std::unique_ptr<llvm::MCParsedAsmOperand> &&Dst);
+ std::unique_ptr<X86Operand> ParseOperand();
+ std::unique_ptr<X86Operand> ParseATTOperand();
+ std::unique_ptr<X86Operand> ParseIntelOperand();
+ std::unique_ptr<X86Operand> ParseIntelOffsetOfOperator();
+ bool ParseIntelDotOperator(const MCExpr *Disp, const MCExpr *&NewDisp);
+ std::unique_ptr<X86Operand> ParseIntelOperator(unsigned OpKind);
+ std::unique_ptr<X86Operand>
+ ParseIntelSegmentOverride(unsigned SegReg, SMLoc Start, unsigned Size);
+ std::unique_ptr<X86Operand>
+ ParseIntelMemOperand(int64_t ImmDisp, SMLoc StartLoc, unsigned Size);
+ std::unique_ptr<X86Operand> ParseRoundingModeOp(SMLoc Start, SMLoc End);
+ bool ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End);
+ std::unique_ptr<X86Operand> ParseIntelBracExpression(unsigned SegReg,
+ SMLoc Start,
+ int64_t ImmDisp,
+ unsigned Size);
+ bool ParseIntelIdentifier(const MCExpr *&Val, StringRef &Identifier,
+ InlineAsmIdentifierInfo &Info,
+ bool IsUnevaluatedOperand, SMLoc &End);
+
+ std::unique_ptr<X86Operand> ParseMemOperand(unsigned SegReg, SMLoc StartLoc);
+
+ std::unique_ptr<X86Operand>
+ CreateMemForInlineAsm(unsigned SegReg, const MCExpr *Disp, unsigned BaseReg,
+ unsigned IndexReg, unsigned Scale, SMLoc Start,
+ SMLoc End, unsigned Size, StringRef Identifier,
+ InlineAsmIdentifierInfo &Info);
+
+ bool parseDirectiveEven(SMLoc L);
+ bool ParseDirectiveWord(unsigned Size, SMLoc L);
+ bool ParseDirectiveCode(StringRef IDVal, SMLoc L);
+
+ bool processInstruction(MCInst &Inst, const OperandVector &Ops);
+
+ /// Wrapper around MCStreamer::EmitInstruction(). Possibly adds
+ /// instrumentation around Inst.
+ void EmitInstruction(MCInst &Inst, OperandVector &Operands, MCStreamer &Out);
+
+ bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+ OperandVector &Operands, MCStreamer &Out,
+ uint64_t &ErrorInfo,
+ bool MatchingInlineAsm) override;
+
+ void MatchFPUWaitAlias(SMLoc IDLoc, X86Operand &Op, OperandVector &Operands,
+ MCStreamer &Out, bool MatchingInlineAsm);
+
+ bool ErrorMissingFeature(SMLoc IDLoc, uint64_t ErrorInfo,
+ bool MatchingInlineAsm);
+
+ bool MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode,
+ OperandVector &Operands, MCStreamer &Out,
+ uint64_t &ErrorInfo,
+ bool MatchingInlineAsm);
+
+ bool MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode,
+ OperandVector &Operands, MCStreamer &Out,
+ uint64_t &ErrorInfo,
+ bool MatchingInlineAsm);
+
+ bool OmitRegisterFromClobberLists(unsigned RegNo) override;
+
+ /// doSrcDstMatch - Returns true if operands are matching in their
+ /// word size (%si and %di, %esi and %edi, etc.). Order depends on
+ /// the parsing mode (Intel vs. AT&T).
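+ /// For example, a (%si) source with a (%di) destination matches, while a
+ /// (%si) source with an (%edi) destination does not.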
+ bool doSrcDstMatch(X86Operand &Op1, X86Operand &Op2);
+
+ /// Parses AVX512-specific operand primitives: masked registers ({%k<NUM>},
+ /// {z}) and memory broadcasting ({1to<NUM>}), updating the Operands vector
+ /// if required.
+ /// \return \c true if no parsing errors occurred, \c false otherwise.
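+ /// e.g. "vaddps (%rax){1to8}, %ymm1, %ymm2" uses memory broadcasting, while
+ /// "vaddps %zmm1, %zmm2, %zmm3 {%k1} {z}" uses masking with zeroing.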
+ bool HandleAVX512Operand(OperandVector &Operands,
+ const MCParsedAsmOperand &Op);
+
+ bool is64BitMode() const {
+ // FIXME: Can tablegen auto-generate this?
+ return getSTI().getFeatureBits()[X86::Mode64Bit];
+ }
+ bool is32BitMode() const {
+ // FIXME: Can tablegen auto-generate this?
+ return getSTI().getFeatureBits()[X86::Mode32Bit];
+ }
+ bool is16BitMode() const {
+ // FIXME: Can tablegen auto-generate this?
+ return getSTI().getFeatureBits()[X86::Mode16Bit];
+ }
+ void SwitchMode(unsigned mode) {
+ MCSubtargetInfo &STI = copySTI();
+ FeatureBitset AllModes({X86::Mode64Bit, X86::Mode32Bit, X86::Mode16Bit});
+ FeatureBitset OldMode = STI.getFeatureBits() & AllModes;
+ unsigned FB = ComputeAvailableFeatures(
+ STI.ToggleFeature(OldMode.flip(mode)));
+ setAvailableFeatures(FB);
+
+ assert(FeatureBitset({mode}) == (STI.getFeatureBits() & AllModes));
+ }
+
+ unsigned getPointerWidth() {
+ if (is16BitMode()) return 16;
+ if (is32BitMode()) return 32;
+ if (is64BitMode()) return 64;
+ llvm_unreachable("invalid mode");
+ }
+
+ bool isParsingIntelSyntax() {
+ return getParser().getAssemblerDialect();
+ }
+
+ /// @name Auto-generated Matcher Functions
+ /// {
+
+#define GET_ASSEMBLER_HEADER
+#include "X86GenAsmMatcher.inc"
+
+ /// }
+
+public:
+ X86AsmParser(const MCSubtargetInfo &sti, MCAsmParser &Parser,
+ const MCInstrInfo &mii, const MCTargetOptions &Options)
+ : MCTargetAsmParser(Options, sti), MII(mii), InstInfo(nullptr) {
+
+ // Initialize the set of available features.
+ setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits()));
+ Instrumentation.reset(
+ CreateX86AsmInstrumentation(Options, Parser.getContext(), STI));
+ }
+
+ bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
+
+ void SetFrameRegister(unsigned RegNo) override;
+
+ bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+ SMLoc NameLoc, OperandVector &Operands) override;
+
+ bool ParseDirective(AsmToken DirectiveID) override;
+};
+} // end anonymous namespace
+
+/// @name Auto-generated Match Functions
+/// {
+
+static unsigned MatchRegisterName(StringRef Name);
+
+/// }
+
+static bool CheckBaseRegAndIndexReg(unsigned BaseReg, unsigned IndexReg,
+ StringRef &ErrMsg) {
+ // If we have both a base register and an index register make sure they are
+ // both 64-bit or 32-bit registers.
+ // To support VSIB, IndexReg can be 128-bit or 256-bit registers.
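+ // e.g. (%rax,%ecx) is rejected, while a VSIB form such as (%rax,%xmm1,4)
+ // is accepted.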
+ if (BaseReg != 0 && IndexReg != 0) {
+ if (X86MCRegisterClasses[X86::GR64RegClassID].contains(BaseReg) &&
+ (X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg) ||
+ X86MCRegisterClasses[X86::GR32RegClassID].contains(IndexReg)) &&
+ IndexReg != X86::RIZ) {
+ ErrMsg = "base register is 64-bit, but index register is not";
+ return true;
+ }
+ if (X86MCRegisterClasses[X86::GR32RegClassID].contains(BaseReg) &&
+ (X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg) ||
+ X86MCRegisterClasses[X86::GR64RegClassID].contains(IndexReg)) &&
+ IndexReg != X86::EIZ){
+ ErrMsg = "base register is 32-bit, but index register is not";
+ return true;
+ }
+ if (X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg)) {
+ if (X86MCRegisterClasses[X86::GR32RegClassID].contains(IndexReg) ||
+ X86MCRegisterClasses[X86::GR64RegClassID].contains(IndexReg)) {
+ ErrMsg = "base register is 16-bit, but index register is not";
+ return true;
+ }
+ if (((BaseReg == X86::BX || BaseReg == X86::BP) &&
+ IndexReg != X86::SI && IndexReg != X86::DI) ||
+ ((BaseReg == X86::SI || BaseReg == X86::DI) &&
+ IndexReg != X86::BX && IndexReg != X86::BP)) {
+ ErrMsg = "invalid 16-bit base/index register combination";
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
+bool X86AsmParser::doSrcDstMatch(X86Operand &Op1, X86Operand &Op2) {
+ // Return true and let a normal complaint about bogus operands happen.
+ if (!Op1.isMem() || !Op2.isMem())
+ return true;
+
+ // Actually these might be the other way round if Intel syntax is
+ // being used. It doesn't matter.
+ unsigned diReg = Op1.Mem.BaseReg;
+ unsigned siReg = Op2.Mem.BaseReg;
+
+ if (X86MCRegisterClasses[X86::GR16RegClassID].contains(siReg))
+ return X86MCRegisterClasses[X86::GR16RegClassID].contains(diReg);
+ if (X86MCRegisterClasses[X86::GR32RegClassID].contains(siReg))
+ return X86MCRegisterClasses[X86::GR32RegClassID].contains(diReg);
+ if (X86MCRegisterClasses[X86::GR64RegClassID].contains(siReg))
+ return X86MCRegisterClasses[X86::GR64RegClassID].contains(diReg);
+ // Again, return true and let another error happen.
+ return true;
+}
+
+bool X86AsmParser::ParseRegister(unsigned &RegNo,
+ SMLoc &StartLoc, SMLoc &EndLoc) {
+ MCAsmParser &Parser = getParser();
+ RegNo = 0;
+ const AsmToken &PercentTok = Parser.getTok();
+ StartLoc = PercentTok.getLoc();
+
+ // If we encounter a %, ignore it. This code handles registers with and
+ // without the prefix; unprefixed registers can occur in cfi directives.
+ if (!isParsingIntelSyntax() && PercentTok.is(AsmToken::Percent))
+ Parser.Lex(); // Eat percent token.
+
+ const AsmToken &Tok = Parser.getTok();
+ EndLoc = Tok.getEndLoc();
+
+ if (Tok.isNot(AsmToken::Identifier)) {
+ if (isParsingIntelSyntax()) return true;
+ return Error(StartLoc, "invalid register name",
+ SMRange(StartLoc, EndLoc));
+ }
+
+ RegNo = MatchRegisterName(Tok.getString());
+
+ // If the match failed, try the register name as lowercase.
+ if (RegNo == 0)
+ RegNo = MatchRegisterName(Tok.getString().lower());
+
+ // The "flags" register cannot be referenced directly.
+ // Treat it as an identifier instead.
+ if (isParsingInlineAsm() && isParsingIntelSyntax() && RegNo == X86::EFLAGS)
+ RegNo = 0;
+
+ if (!is64BitMode()) {
+ // FIXME: This should be done using Requires<Not64BitMode> and
+ // Requires<In64BitMode> so "eiz" usage in 64-bit instructions can be also
+ // checked.
+ // FIXME: Check AH, CH, DH, BH cannot be used in an instruction requiring a
+ // REX prefix.
+ if (RegNo == X86::RIZ ||
+ X86MCRegisterClasses[X86::GR64RegClassID].contains(RegNo) ||
+ X86II::isX86_64NonExtLowByteReg(RegNo) ||
+ X86II::isX86_64ExtendedReg(RegNo))
+ return Error(StartLoc, "register %"
+ + Tok.getString() + " is only available in 64-bit mode",
+ SMRange(StartLoc, EndLoc));
+ }
+
+ // Parse "%st" as "%st(0)" and "%st(1)", which is multiple tokens.
+ if (RegNo == 0 && (Tok.getString() == "st" || Tok.getString() == "ST")) {
+ RegNo = X86::ST0;
+ Parser.Lex(); // Eat 'st'
+
+ // Check to see if we have '(4)' after %st.
+ if (getLexer().isNot(AsmToken::LParen))
+ return false;
+ // Lex the paren.
+ getParser().Lex();
+
+ const AsmToken &IntTok = Parser.getTok();
+ if (IntTok.isNot(AsmToken::Integer))
+ return Error(IntTok.getLoc(), "expected stack index");
+ switch (IntTok.getIntVal()) {
+ case 0: RegNo = X86::ST0; break;
+ case 1: RegNo = X86::ST1; break;
+ case 2: RegNo = X86::ST2; break;
+ case 3: RegNo = X86::ST3; break;
+ case 4: RegNo = X86::ST4; break;
+ case 5: RegNo = X86::ST5; break;
+ case 6: RegNo = X86::ST6; break;
+ case 7: RegNo = X86::ST7; break;
+ default: return Error(IntTok.getLoc(), "invalid stack index");
+ }
+
+ if (getParser().Lex().isNot(AsmToken::RParen))
+ return Error(Parser.getTok().getLoc(), "expected ')'");
+
+ EndLoc = Parser.getTok().getEndLoc();
+ Parser.Lex(); // Eat ')'
+ return false;
+ }
+
+ EndLoc = Parser.getTok().getEndLoc();
+
+ // If this is "db[0-7]", match it as an alias
+ // for dr[0-7].
+ if (RegNo == 0 && Tok.getString().size() == 3 &&
+ Tok.getString().startswith("db")) {
+ switch (Tok.getString()[2]) {
+ case '0': RegNo = X86::DR0; break;
+ case '1': RegNo = X86::DR1; break;
+ case '2': RegNo = X86::DR2; break;
+ case '3': RegNo = X86::DR3; break;
+ case '4': RegNo = X86::DR4; break;
+ case '5': RegNo = X86::DR5; break;
+ case '6': RegNo = X86::DR6; break;
+ case '7': RegNo = X86::DR7; break;
+ }
+
+ if (RegNo != 0) {
+ EndLoc = Parser.getTok().getEndLoc();
+ Parser.Lex(); // Eat it.
+ return false;
+ }
+ }
+
+ if (RegNo == 0) {
+ if (isParsingIntelSyntax()) return true;
+ return Error(StartLoc, "invalid register name",
+ SMRange(StartLoc, EndLoc));
+ }
+
+ Parser.Lex(); // Eat identifier token.
+ return false;
+}
+
+void X86AsmParser::SetFrameRegister(unsigned RegNo) {
+ Instrumentation->SetInitialFrameRegister(RegNo);
+}
+
+std::unique_ptr<X86Operand> X86AsmParser::DefaultMemSIOperand(SMLoc Loc) {
+ unsigned basereg =
+ is64BitMode() ? X86::RSI : (is32BitMode() ? X86::ESI : X86::SI);
+ const MCExpr *Disp = MCConstantExpr::create(0, getContext());
+ return X86Operand::CreateMem(getPointerWidth(), /*SegReg=*/0, Disp,
+ /*BaseReg=*/basereg, /*IndexReg=*/0, /*Scale=*/1,
+ Loc, Loc, 0);
+}
+
+std::unique_ptr<X86Operand> X86AsmParser::DefaultMemDIOperand(SMLoc Loc) {
+ unsigned basereg =
+ is64BitMode() ? X86::RDI : (is32BitMode() ? X86::EDI : X86::DI);
+ const MCExpr *Disp = MCConstantExpr::create(0, getContext());
+ return X86Operand::CreateMem(getPointerWidth(), /*SegReg=*/0, Disp,
+ /*BaseReg=*/basereg, /*IndexReg=*/0, /*Scale=*/1,
+ Loc, Loc, 0);
+}
+
+void X86AsmParser::AddDefaultSrcDestOperands(
+ OperandVector& Operands, std::unique_ptr<llvm::MCParsedAsmOperand> &&Src,
+ std::unique_ptr<llvm::MCParsedAsmOperand> &&Dst) {
+ if (isParsingIntelSyntax()) {
+ Operands.push_back(std::move(Dst));
+ Operands.push_back(std::move(Src));
+ } else {
+ Operands.push_back(std::move(Src));
+ Operands.push_back(std::move(Dst));
+ }
+}
+
+std::unique_ptr<X86Operand> X86AsmParser::ParseOperand() {
+ if (isParsingIntelSyntax())
+ return ParseIntelOperand();
+ return ParseATTOperand();
+}
+
+/// getIntelMemOperandSize - Return the Intel memory operand size in bits.
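+/// e.g. "dword" yields 32 and "xmmword" yields 128; unrecognized strings
+/// yield 0.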
+static unsigned getIntelMemOperandSize(StringRef OpStr) {
+ unsigned Size = StringSwitch<unsigned>(OpStr)
+ .Cases("BYTE", "byte", 8)
+ .Cases("WORD", "word", 16)
+ .Cases("DWORD", "dword", 32)
+ .Cases("FWORD", "fword", 48)
+ .Cases("QWORD", "qword", 64)
+ .Cases("MMWORD","mmword", 64)
+ .Cases("XWORD", "xword", 80)
+ .Cases("TBYTE", "tbyte", 80)
+ .Cases("XMMWORD", "xmmword", 128)
+ .Cases("YMMWORD", "ymmword", 256)
+ .Cases("ZMMWORD", "zmmword", 512)
+ .Cases("OPAQUE", "opaque", -1U) // needs to be non-zero, but doesn't matter
+ .Default(0);
+ return Size;
+}
+
+std::unique_ptr<X86Operand> X86AsmParser::CreateMemForInlineAsm(
+ unsigned SegReg, const MCExpr *Disp, unsigned BaseReg, unsigned IndexReg,
+ unsigned Scale, SMLoc Start, SMLoc End, unsigned Size, StringRef Identifier,
+ InlineAsmIdentifierInfo &Info) {
+ // If we found a decl other than a VarDecl, then assume it is a FuncDecl or
+ // some other label reference.
+ if (isa<MCSymbolRefExpr>(Disp) && Info.OpDecl && !Info.IsVarDecl) {
+ // Insert an explicit size if the user didn't have one.
+ if (!Size) {
+ Size = getPointerWidth();
+ InstInfo->AsmRewrites->emplace_back(AOK_SizeDirective, Start,
+ /*Len=*/0, Size);
+ }
+
+ // Create an absolute memory reference in order to match against
+ // instructions taking a PC relative operand.
+ return X86Operand::CreateMem(getPointerWidth(), Disp, Start, End, Size,
+ Identifier, Info.OpDecl);
+ }
+
+ // We either have a direct symbol reference, or an offset from a symbol. The
+ // parser always puts the symbol on the LHS, so look there for size
+ // calculation purposes.
+ const MCBinaryExpr *BinOp = dyn_cast<MCBinaryExpr>(Disp);
+ bool IsSymRef =
+ isa<MCSymbolRefExpr>(BinOp ? BinOp->getLHS() : Disp);
+ if (IsSymRef) {
+ if (!Size) {
+ Size = Info.Type * 8; // Size is in terms of bits in this context.
+ if (Size)
+ InstInfo->AsmRewrites->emplace_back(AOK_SizeDirective, Start,
+ /*Len=*/0, Size);
+ }
+ }
+
+ // When parsing inline assembly we set the base register to a non-zero value
+ // if we don't know the actual value at this time. This is necessary to
+ // get the matching correct in some cases.
+ BaseReg = BaseReg ? BaseReg : 1;
+ return X86Operand::CreateMem(getPointerWidth(), SegReg, Disp, BaseReg,
+ IndexReg, Scale, Start, End, Size, Identifier,
+ Info.OpDecl);
+}
+
+static void
+RewriteIntelBracExpression(SmallVectorImpl<AsmRewrite> &AsmRewrites,
+ StringRef SymName, int64_t ImmDisp,
+ int64_t FinalImmDisp, SMLoc &BracLoc,
+ SMLoc &StartInBrac, SMLoc &End) {
+ // Remove the '[' and ']' from the IR string.
+ AsmRewrites.emplace_back(AOK_Skip, BracLoc, 1);
+ AsmRewrites.emplace_back(AOK_Skip, End, 1);
+
+ // If ImmDisp is non-zero, then we parsed a displacement before the
+ // bracketed expression (i.e., ImmDisp [ BaseReg + Scale*IndexReg + Disp ]).
+ // If ImmDisp doesn't match the displacement computed by the state machine,
+ // then we have an additional displacement in the bracketed expression.
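+ // e.g. for "8[eax+4]", ImmDisp is 8 but the state machine computes a final
+ // displacement of 12, so the leading "8" must be rewritten to 12.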
+ if (ImmDisp != FinalImmDisp) {
+ if (ImmDisp) {
+ // We have an immediate displacement before the bracketed expression.
+ // Adjust this to match the final immediate displacement.
+ bool Found = false;
+ for (AsmRewrite &AR : AsmRewrites) {
+ if (AR.Loc.getPointer() > BracLoc.getPointer())
+ continue;
+ if (AR.Kind == AOK_ImmPrefix || AR.Kind == AOK_Imm) {
+ assert(!Found && "ImmDisp already rewritten.");
+ AR.Kind = AOK_Imm;
+ AR.Len = BracLoc.getPointer() - AR.Loc.getPointer();
+ AR.Val = FinalImmDisp;
+ Found = true;
+ break;
+ }
+ }
+ assert(Found && "Unable to rewrite ImmDisp.");
+ (void)Found;
+ } else {
+ // We have a symbolic and an immediate displacement, but no displacement
+ // before the bracketed expression. Put the immediate displacement
+ // before the bracketed expression.
+ AsmRewrites.emplace_back(AOK_Imm, BracLoc, 0, FinalImmDisp);
+ }
+ }
+ // Remove all the ImmPrefix rewrites within the brackets.
+ for (AsmRewrite &AR : AsmRewrites) {
+ if (AR.Loc.getPointer() < StartInBrac.getPointer())
+ continue;
+ if (AR.Kind == AOK_ImmPrefix)
+ AR.Kind = AOK_Delete;
+ }
+ const char *SymLocPtr = SymName.data();
+ // Skip everything before the symbol.
+ if (unsigned Len = SymLocPtr - StartInBrac.getPointer()) {
+ assert(Len > 0 && "Expected a positive length.");
+ AsmRewrites.emplace_back(AOK_Skip, StartInBrac, Len);
+ }
+ // Skip everything after the symbol.
+ if (unsigned Len = End.getPointer() - (SymLocPtr + SymName.size())) {
+ SMLoc Loc = SMLoc::getFromPointer(SymLocPtr + SymName.size());
+ assert(Len > 0 && "Expected a positive length.");
+ AsmRewrites.emplace_back(AOK_Skip, Loc, Len);
+ }
+}
+
+bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
+ MCAsmParser &Parser = getParser();
+ const AsmToken &Tok = Parser.getTok();
+
+ AsmToken::TokenKind PrevTK = AsmToken::Error;
+ bool Done = false;
+ while (!Done) {
+ bool UpdateLocLex = true;
+
+ // The period in the dot operator (e.g., [ebx].foo.bar) is parsed as an
+ // identifier. Don't try to parse it as a register.
+ if (Tok.getString().startswith("."))
+ break;
+
+ // If we're parsing an immediate expression, we don't expect a '['.
+ if (SM.getStopOnLBrac() && getLexer().getKind() == AsmToken::LBrac)
+ break;
+
+ AsmToken::TokenKind TK = getLexer().getKind();
+ switch (TK) {
+ default: {
+ if (SM.isValidEndState()) {
+ Done = true;
+ break;
+ }
+ return Error(Tok.getLoc(), "unknown token in expression");
+ }
+ case AsmToken::EndOfStatement: {
+ Done = true;
+ break;
+ }
+ case AsmToken::String:
+ case AsmToken::Identifier: {
+ // This could be a register or a symbolic displacement.
+ unsigned TmpReg;
+ const MCExpr *Val;
+ SMLoc IdentLoc = Tok.getLoc();
+ StringRef Identifier = Tok.getString();
+ if (TK != AsmToken::String && !ParseRegister(TmpReg, IdentLoc, End)) {
+ SM.onRegister(TmpReg);
+ UpdateLocLex = false;
+ break;
+ } else {
+ if (!isParsingInlineAsm()) {
+ if (getParser().parsePrimaryExpr(Val, End))
+ return Error(Tok.getLoc(), "Unexpected identifier!");
+ } else {
+ // An identifier containing a '.' right after ']' is the dot operator,
+ // not an adjacent identifier.
+ if (Identifier.find('.') != StringRef::npos &&
+ PrevTK == AsmToken::RBrac) {
+ return false;
+ } else {
+ InlineAsmIdentifierInfo &Info = SM.getIdentifierInfo();
+ if (ParseIntelIdentifier(Val, Identifier, Info,
+ /*Unevaluated=*/false, End))
+ return true;
+ }
+ }
+ SM.onIdentifierExpr(Val, Identifier);
+ UpdateLocLex = false;
+ break;
+ }
+ return Error(Tok.getLoc(), "Unexpected identifier!");
+ }
+ case AsmToken::Integer: {
+ StringRef ErrMsg;
+ if (isParsingInlineAsm() && SM.getAddImmPrefix())
+ InstInfo->AsmRewrites->emplace_back(AOK_ImmPrefix, Tok.getLoc());
+ // Look for 'b' or 'f' following an Integer as a directional label
+ SMLoc Loc = getTok().getLoc();
+ int64_t IntVal = getTok().getIntVal();
+ End = consumeToken();
+ UpdateLocLex = false;
+ if (getLexer().getKind() == AsmToken::Identifier) {
+ StringRef IDVal = getTok().getString();
+ if (IDVal == "f" || IDVal == "b") {
+ MCSymbol *Sym =
+ getContext().getDirectionalLocalSymbol(IntVal, IDVal == "b");
+ MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None;
+ const MCExpr *Val =
+ MCSymbolRefExpr::create(Sym, Variant, getContext());
+ if (IDVal == "b" && Sym->isUndefined())
+ return Error(Loc, "invalid reference to undefined symbol");
+ StringRef Identifier = Sym->getName();
+ SM.onIdentifierExpr(Val, Identifier);
+ End = consumeToken();
+ } else {
+ if (SM.onInteger(IntVal, ErrMsg))
+ return Error(Loc, ErrMsg);
+ }
+ } else {
+ if (SM.onInteger(IntVal, ErrMsg))
+ return Error(Loc, ErrMsg);
+ }
+ break;
+ }
+ case AsmToken::Plus: SM.onPlus(); break;
+ case AsmToken::Minus: SM.onMinus(); break;
+ case AsmToken::Tilde: SM.onNot(); break;
+ case AsmToken::Star: SM.onStar(); break;
+ case AsmToken::Slash: SM.onDivide(); break;
+ case AsmToken::Pipe: SM.onOr(); break;
+ case AsmToken::Caret: SM.onXor(); break;
+ case AsmToken::Amp: SM.onAnd(); break;
+ case AsmToken::LessLess:
+ SM.onLShift(); break;
+ case AsmToken::GreaterGreater:
+ SM.onRShift(); break;
+ case AsmToken::LBrac: SM.onLBrac(); break;
+ case AsmToken::RBrac: SM.onRBrac(); break;
+ case AsmToken::LParen: SM.onLParen(); break;
+ case AsmToken::RParen: SM.onRParen(); break;
+ }
+ if (SM.hadError())
+ return Error(Tok.getLoc(), "unknown token in expression");
+
+ if (!Done && UpdateLocLex)
+ End = consumeToken();
+
+ PrevTK = TK;
+ }
+ return false;
+}
+
+std::unique_ptr<X86Operand>
+X86AsmParser::ParseIntelBracExpression(unsigned SegReg, SMLoc Start,
+ int64_t ImmDisp, unsigned Size) {
+ MCAsmParser &Parser = getParser();
+ const AsmToken &Tok = Parser.getTok();
+ SMLoc BracLoc = Tok.getLoc(), End = Tok.getEndLoc();
+ if (getLexer().isNot(AsmToken::LBrac))
+ return ErrorOperand(BracLoc, "Expected '[' token!");
+ Parser.Lex(); // Eat '['
+
+ SMLoc StartInBrac = Tok.getLoc();
+ // Parse [ Symbol + ImmDisp ] and [ BaseReg + Scale*IndexReg + ImmDisp ]. We
+ // may have already parsed an immediate displacement before the bracketed
+ // expression.
+ IntelExprStateMachine SM(ImmDisp, /*StopOnLBrac=*/false, /*AddImmPrefix=*/true);
+ if (ParseIntelExpression(SM, End))
+ return nullptr;
+
+ const MCExpr *Disp = nullptr;
+ if (const MCExpr *Sym = SM.getSym()) {
+ // A symbolic displacement.
+ Disp = Sym;
+ if (isParsingInlineAsm())
+ RewriteIntelBracExpression(*InstInfo->AsmRewrites, SM.getSymName(),
+ ImmDisp, SM.getImm(), BracLoc, StartInBrac,
+ End);
+ }
+
+ if (SM.getImm() || !Disp) {
+ const MCExpr *Imm = MCConstantExpr::create(SM.getImm(), getContext());
+ if (Disp)
+ Disp = MCBinaryExpr::createAdd(Disp, Imm, getContext());
+ else
+ Disp = Imm; // An immediate displacement only.
+ }
+
+ // Parse struct field access. Intel requires a dot, but MSVC doesn't. MSVC
+ // will in fact do a global lookup of the field name inside all global
+ // typedefs, but we don't emulate that.
+ if (Tok.getString().find('.') != StringRef::npos) {
+ const MCExpr *NewDisp;
+ if (ParseIntelDotOperator(Disp, NewDisp))
+ return nullptr;
+
+ End = Tok.getEndLoc();
+ Parser.Lex(); // Eat the field.
+ Disp = NewDisp;
+ }
+
+ int BaseReg = SM.getBaseReg();
+ int IndexReg = SM.getIndexReg();
+ int Scale = SM.getScale();
+ if (!isParsingInlineAsm()) {
+ // handle [-42]
+ if (!BaseReg && !IndexReg) {
+ if (!SegReg)
+ return X86Operand::CreateMem(getPointerWidth(), Disp, Start, End, Size);
+ return X86Operand::CreateMem(getPointerWidth(), SegReg, Disp, 0, 0, 1,
+ Start, End, Size);
+ }
+ StringRef ErrMsg;
+ if (CheckBaseRegAndIndexReg(BaseReg, IndexReg, ErrMsg)) {
+ Error(StartInBrac, ErrMsg);
+ return nullptr;
+ }
+ return X86Operand::CreateMem(getPointerWidth(), SegReg, Disp, BaseReg,
+ IndexReg, Scale, Start, End, Size);
+ }
+
+ InlineAsmIdentifierInfo &Info = SM.getIdentifierInfo();
+ return CreateMemForInlineAsm(SegReg, Disp, BaseReg, IndexReg, Scale, Start,
+ End, Size, SM.getSymName(), Info);
+}
+
+// Inline assembly may use variable names with namespace alias qualifiers.
+bool X86AsmParser::ParseIntelIdentifier(const MCExpr *&Val,
+ StringRef &Identifier,
+ InlineAsmIdentifierInfo &Info,
+ bool IsUnevaluatedOperand, SMLoc &End) {
+ MCAsmParser &Parser = getParser();
+ assert(isParsingInlineAsm() && "Expected to be parsing inline assembly.");
+ Val = nullptr;
+
+ StringRef LineBuf(Identifier.data());
+ void *Result =
+ SemaCallback->LookupInlineAsmIdentifier(LineBuf, Info, IsUnevaluatedOperand);
+
+ const AsmToken &Tok = Parser.getTok();
+ SMLoc Loc = Tok.getLoc();
+
+ // Advance the token stream until the end of the current token is
+ // after the end of what the frontend claimed.
+ const char *EndPtr = Tok.getLoc().getPointer() + LineBuf.size();
+ do {
+ End = Tok.getEndLoc();
+ getLexer().Lex();
+ } while (End.getPointer() < EndPtr);
+ Identifier = LineBuf;
+
+ // The frontend should end parsing on an assembler token boundary, unless it
+ // failed parsing.
+ assert((End.getPointer() == EndPtr || !Result) &&
+ "frontend claimed part of a token?");
+
+ // If the identifier lookup was unsuccessful, assume that we are dealing with
+ // a label.
+ if (!Result) {
+ StringRef InternalName =
+ SemaCallback->LookupInlineAsmLabel(Identifier, getSourceManager(),
+ Loc, false);
+ assert(InternalName.size() && "We should have an internal name here.");
+ // Push a rewrite for replacing the identifier name with the internal name.
+ InstInfo->AsmRewrites->emplace_back(AOK_Label, Loc, Identifier.size(),
+ InternalName);
+ }
+
+ // Create the symbol reference.
+ MCSymbol *Sym = getContext().getOrCreateSymbol(Identifier);
+ MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None;
+ Val = MCSymbolRefExpr::create(Sym, Variant, getParser().getContext());
+ return false;
+}
+
+/// \brief Parse an Intel-style segment override.
+std::unique_ptr<X86Operand>
+X86AsmParser::ParseIntelSegmentOverride(unsigned SegReg, SMLoc Start,
+ unsigned Size) {
+ MCAsmParser &Parser = getParser();
+ assert(SegReg != 0 && "Tried to parse a segment override without a segment!");
+ const AsmToken &Tok = Parser.getTok();
+ if (Tok.isNot(AsmToken::Colon))
+ return ErrorOperand(Tok.getLoc(), "Expected ':' token!");
+ Parser.Lex(); // Eat ':'
+
+ int64_t ImmDisp = 0;
+ if (getLexer().is(AsmToken::Integer)) {
+ ImmDisp = Tok.getIntVal();
+ AsmToken ImmDispToken = Parser.Lex(); // Eat the integer.
+
+ if (isParsingInlineAsm())
+ InstInfo->AsmRewrites->emplace_back(AOK_ImmPrefix, ImmDispToken.getLoc());
+
+ if (getLexer().isNot(AsmToken::LBrac)) {
+ // An immediate following a 'segment register', 'colon' token sequence
+ // can be followed by a bracketed expression. If it isn't, we know we
+ // have our final segment override.
+ const MCExpr *Disp = MCConstantExpr::create(ImmDisp, getContext());
+ return X86Operand::CreateMem(getPointerWidth(), SegReg, Disp,
+ /*BaseReg=*/0, /*IndexReg=*/0, /*Scale=*/1,
+ Start, ImmDispToken.getEndLoc(), Size);
+ }
+ }
+
+ if (getLexer().is(AsmToken::LBrac))
+ return ParseIntelBracExpression(SegReg, Start, ImmDisp, Size);
+
+ const MCExpr *Val;
+ SMLoc End;
+ if (!isParsingInlineAsm()) {
+ if (getParser().parsePrimaryExpr(Val, End))
+ return ErrorOperand(Tok.getLoc(), "unknown token in expression");
+
+ return X86Operand::CreateMem(getPointerWidth(), Val, Start, End, Size);
+ }
+
+ InlineAsmIdentifierInfo Info;
+ StringRef Identifier = Tok.getString();
+ if (ParseIntelIdentifier(Val, Identifier, Info,
+ /*Unevaluated=*/false, End))
+ return nullptr;
+ return CreateMemForInlineAsm(/*SegReg=*/0, Val, /*BaseReg=*/0,/*IndexReg=*/0,
+ /*Scale=*/1, Start, End, Size, Identifier, Info);
+}
+
+
+/// ParseRoundingModeOp - Parse an AVX-512 rounding mode operand.
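+/// e.g. "{rn-sae}" selects round-to-nearest with exceptions suppressed, while
+/// "{sae}" only suppresses exceptions.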
+std::unique_ptr<X86Operand>
+X86AsmParser::ParseRoundingModeOp(SMLoc Start, SMLoc End) {
+ MCAsmParser &Parser = getParser();
+ const AsmToken &Tok = Parser.getTok();
+ // Eat "{" and mark the current place.
+ const SMLoc consumedToken = consumeToken();
+ if (Tok.getIdentifier().startswith("r")) {
+ int rndMode = StringSwitch<int>(Tok.getIdentifier())
+ .Case("rn", X86::STATIC_ROUNDING::TO_NEAREST_INT)
+ .Case("rd", X86::STATIC_ROUNDING::TO_NEG_INF)
+ .Case("ru", X86::STATIC_ROUNDING::TO_POS_INF)
+ .Case("rz", X86::STATIC_ROUNDING::TO_ZERO)
+ .Default(-1);
+ if (-1 == rndMode)
+ return ErrorOperand(Tok.getLoc(), "Invalid rounding mode.");
+ Parser.Lex(); // Eat "r*" of r*-sae
+ if (!getLexer().is(AsmToken::Minus))
+ return ErrorOperand(Tok.getLoc(), "Expected - at this point");
+ Parser.Lex(); // Eat "-"
+ Parser.Lex(); // Eat the sae
+ if (!getLexer().is(AsmToken::RCurly))
+ return ErrorOperand(Tok.getLoc(), "Expected } at this point");
+ Parser.Lex(); // Eat "}"
+ const MCExpr *RndModeOp =
+ MCConstantExpr::create(rndMode, Parser.getContext());
+ return X86Operand::CreateImm(RndModeOp, Start, End);
+ }
+ if (Tok.getIdentifier().equals("sae")) {
+ Parser.Lex(); // Eat the sae
+ if (!getLexer().is(AsmToken::RCurly))
+ return ErrorOperand(Tok.getLoc(), "Expected } at this point");
+ Parser.Lex(); // Eat "}"
+ return X86Operand::CreateToken("{sae}", consumedToken);
+ }
+ return ErrorOperand(Tok.getLoc(), "unknown token in expression");
+}
+
+/// ParseIntelMemOperand - Parse an Intel-style memory operand.
+std::unique_ptr<X86Operand> X86AsmParser::ParseIntelMemOperand(int64_t ImmDisp,
+ SMLoc Start,
+ unsigned Size) {
+ MCAsmParser &Parser = getParser();
+ const AsmToken &Tok = Parser.getTok();
+ SMLoc End;
+
+ // Parse ImmDisp [ BaseReg + Scale*IndexReg + Disp ].
+ if (getLexer().is(AsmToken::LBrac))
+ return ParseIntelBracExpression(/*SegReg=*/0, Start, ImmDisp, Size);
+ assert(ImmDisp == 0);
+
+ const MCExpr *Val;
+ if (!isParsingInlineAsm()) {
+ if (getParser().parsePrimaryExpr(Val, End))
+ return ErrorOperand(Tok.getLoc(), "unknown token in expression");
+
+ return X86Operand::CreateMem(getPointerWidth(), Val, Start, End, Size);
+ }
+
+ InlineAsmIdentifierInfo Info;
+ StringRef Identifier = Tok.getString();
+ if (ParseIntelIdentifier(Val, Identifier, Info,
+ /*Unevaluated=*/false, End))
+ return nullptr;
+
+ if (!getLexer().is(AsmToken::LBrac))
+ return CreateMemForInlineAsm(/*SegReg=*/0, Val, /*BaseReg=*/0, /*IndexReg=*/0,
+ /*Scale=*/1, Start, End, Size, Identifier, Info);
+
+ Parser.Lex(); // Eat '['
+
+ // Parse Identifier [ ImmDisp ]
+ IntelExprStateMachine SM(/*ImmDisp=*/0, /*StopOnLBrac=*/true,
+ /*AddImmPrefix=*/false);
+ if (ParseIntelExpression(SM, End))
+ return nullptr;
+
+ if (SM.getSym()) {
+ Error(Start, "cannot use more than one symbol in memory operand");
+ return nullptr;
+ }
+ if (SM.getBaseReg()) {
+ Error(Start, "cannot use base register with variable reference");
+ return nullptr;
+ }
+ if (SM.getIndexReg()) {
+ Error(Start, "cannot use index register with variable reference");
+ return nullptr;
+ }
+
+ const MCExpr *Disp = MCConstantExpr::create(SM.getImm(), getContext());
+ // BaseReg is non-zero to avoid assertions. In the context of inline asm,
+ // we're pointing to a local variable in memory, so the base register is
+ // really the frame or stack pointer.
+ return X86Operand::CreateMem(getPointerWidth(), /*SegReg=*/0, Disp,
+ /*BaseReg=*/1, /*IndexReg=*/0, /*Scale=*/1,
+ Start, End, Size, Identifier, Info.OpDecl);
+}
+
+/// Parse the '.' operator.
+bool X86AsmParser::ParseIntelDotOperator(const MCExpr *Disp,
+ const MCExpr *&NewDisp) {
+ MCAsmParser &Parser = getParser();
+ const AsmToken &Tok = Parser.getTok();
+ int64_t OrigDispVal, DotDispVal;
+
+ // FIXME: Handle non-constant expressions.
+ if (const MCConstantExpr *OrigDisp = dyn_cast<MCConstantExpr>(Disp))
+ OrigDispVal = OrigDisp->getValue();
+ else
+ return Error(Tok.getLoc(), "Non-constant offsets are not supported!");
+
+ // Drop the optional '.'.
+ StringRef DotDispStr = Tok.getString();
+ if (DotDispStr.startswith("."))
+ DotDispStr = DotDispStr.drop_front(1);
+
+ // .Imm gets lexed as a real.
+ if (Tok.is(AsmToken::Real)) {
+ APInt DotDisp;
+ DotDispStr.getAsInteger(10, DotDisp);
+ DotDispVal = DotDisp.getZExtValue();
+ } else if (isParsingInlineAsm() && Tok.is(AsmToken::Identifier)) {
+ unsigned DotDisp;
+ std::pair<StringRef, StringRef> BaseMember = DotDispStr.split('.');
+ if (SemaCallback->LookupInlineAsmField(BaseMember.first, BaseMember.second,
+ DotDisp))
+ return Error(Tok.getLoc(), "Unable to lookup field reference!");
+ DotDispVal = DotDisp;
+ } else
+ return Error(Tok.getLoc(), "Unexpected token type!");
+
+ if (isParsingInlineAsm() && Tok.is(AsmToken::Identifier)) {
+ SMLoc Loc = SMLoc::getFromPointer(DotDispStr.data());
+ unsigned Len = DotDispStr.size();
+ unsigned Val = OrigDispVal + DotDispVal;
+ InstInfo->AsmRewrites->emplace_back(AOK_DotOperator, Loc, Len, Val);
+ }
+
+ NewDisp = MCConstantExpr::create(OrigDispVal + DotDispVal, getContext());
+ return false;
+}
+
+/// Parse the 'offset' operator. This operator is used to specify the
+/// location rather than the content of a variable.
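+/// e.g. "mov eax, offset Var" loads the address of Var rather than its value.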
+std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOffsetOfOperator() {
+ MCAsmParser &Parser = getParser();
+ const AsmToken &Tok = Parser.getTok();
+ SMLoc OffsetOfLoc = Tok.getLoc();
+ Parser.Lex(); // Eat offset.
+
+ const MCExpr *Val;
+ InlineAsmIdentifierInfo Info;
+ SMLoc Start = Tok.getLoc(), End;
+ StringRef Identifier = Tok.getString();
+ if (ParseIntelIdentifier(Val, Identifier, Info,
+ /*Unevaluated=*/false, End))
+ return nullptr;
+
+ // Don't emit the offset operator.
+ InstInfo->AsmRewrites->emplace_back(AOK_Skip, OffsetOfLoc, 7);
+
+ // The offset operator will have an 'r' constraint, thus we need to create
+ // a register operand to ensure proper matching. Just pick a GPR based on
+ // the size of a pointer.
+ unsigned RegNo =
+ is64BitMode() ? X86::RBX : (is32BitMode() ? X86::EBX : X86::BX);
+ return X86Operand::CreateReg(RegNo, Start, End, /*GetAddress=*/true,
+ OffsetOfLoc, Identifier, Info.OpDecl);
+}
+
+enum IntelOperatorKind {
+ IOK_LENGTH,
+ IOK_SIZE,
+ IOK_TYPE
+};
+
+/// Parse the 'LENGTH', 'TYPE' and 'SIZE' operators. The LENGTH operator
+/// returns the number of elements in an array. It returns the value 1 for
+/// non-array variables. The SIZE operator returns the size of a C or C++
+/// variable. A variable's size is the product of its LENGTH and TYPE. The
+/// TYPE operator returns the size of a C or C++ type or variable. If the
+/// variable is an array, TYPE returns the size of a single element.
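+/// For example, given "int a[10]" with 4-byte ints, LENGTH is 10, TYPE is 4,
+/// and SIZE is 40.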
+std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperator(unsigned OpKind) {
+ MCAsmParser &Parser = getParser();
+ const AsmToken &Tok = Parser.getTok();
+ SMLoc TypeLoc = Tok.getLoc();
+ Parser.Lex(); // Eat operator.
+
+ const MCExpr *Val = nullptr;
+ InlineAsmIdentifierInfo Info;
+ SMLoc Start = Tok.getLoc(), End;
+ StringRef Identifier = Tok.getString();
+ if (ParseIntelIdentifier(Val, Identifier, Info,
+ /*Unevaluated=*/true, End))
+ return nullptr;
+
+ if (!Info.OpDecl)
+ return ErrorOperand(Start, "unable to lookup expression");
+
+ unsigned CVal = 0;
+ switch(OpKind) {
+ default: llvm_unreachable("Unexpected operand kind!");
+ case IOK_LENGTH: CVal = Info.Length; break;
+ case IOK_SIZE: CVal = Info.Size; break;
+ case IOK_TYPE: CVal = Info.Type; break;
+ }
+
+ // Rewrite the type operator and the C or C++ type or variable in terms of an
+ // immediate. E.g. TYPE foo -> $$4
+ unsigned Len = End.getPointer() - TypeLoc.getPointer();
+ InstInfo->AsmRewrites->emplace_back(AOK_Imm, TypeLoc, Len, CVal);
+
+ const MCExpr *Imm = MCConstantExpr::create(CVal, getContext());
+ return X86Operand::CreateImm(Imm, Start, End);
+}
+
+std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() {
+ MCAsmParser &Parser = getParser();
+ const AsmToken &Tok = Parser.getTok();
+ SMLoc Start, End;
+
+ // Offset, length, type and size operators.
+ if (isParsingInlineAsm()) {
+ StringRef AsmTokStr = Tok.getString();
+ if (AsmTokStr == "offset" || AsmTokStr == "OFFSET")
+ return ParseIntelOffsetOfOperator();
+ if (AsmTokStr == "length" || AsmTokStr == "LENGTH")
+ return ParseIntelOperator(IOK_LENGTH);
+ if (AsmTokStr == "size" || AsmTokStr == "SIZE")
+ return ParseIntelOperator(IOK_SIZE);
+ if (AsmTokStr == "type" || AsmTokStr == "TYPE")
+ return ParseIntelOperator(IOK_TYPE);
+ }
+
+ bool PtrInOperand = false;
+ unsigned Size = getIntelMemOperandSize(Tok.getString());
+ if (Size) {
+ Parser.Lex(); // Eat operand size (e.g., byte, word).
+ if (Tok.getString() != "PTR" && Tok.getString() != "ptr")
+ return ErrorOperand(Tok.getLoc(), "Expected 'PTR' or 'ptr' token!");
+ Parser.Lex(); // Eat ptr.
+ PtrInOperand = true;
+ }
+ Start = Tok.getLoc();
+
+ // Immediate.
+ if (getLexer().is(AsmToken::Integer) || getLexer().is(AsmToken::Minus) ||
+ getLexer().is(AsmToken::Tilde) || getLexer().is(AsmToken::LParen)) {
+ AsmToken StartTok = Tok;
+ IntelExprStateMachine SM(/*Imm=*/0, /*StopOnLBrac=*/true,
+ /*AddImmPrefix=*/false);
+ if (ParseIntelExpression(SM, End))
+ return nullptr;
+
+ int64_t Imm = SM.getImm();
+ if (isParsingInlineAsm()) {
+ unsigned Len = Tok.getLoc().getPointer() - Start.getPointer();
+ if (StartTok.getString().size() == Len)
+ // Just add a prefix if this wasn't a complex immediate expression.
+ InstInfo->AsmRewrites->emplace_back(AOK_ImmPrefix, Start);
+ else
+ // Otherwise, rewrite the complex expression as a single immediate.
+ InstInfo->AsmRewrites->emplace_back(AOK_Imm, Start, Len, Imm);
+ }
+
+ if (getLexer().isNot(AsmToken::LBrac)) {
+ // If a directional label (i.e., 1f or 2b) was parsed above by
+ // ParseIntelExpression(), then SM.getSym() was set to a pointer to
+ // the MCExpr with the directional local symbol, and this is a
+ // memory operand, not an immediate operand.
+ if (SM.getSym())
+ return X86Operand::CreateMem(getPointerWidth(), SM.getSym(), Start, End,
+ Size);
+
+ const MCExpr *ImmExpr = MCConstantExpr::create(Imm, getContext());
+ return X86Operand::CreateImm(ImmExpr, Start, End);
+ }
+
+ // Only non-negative immediates are valid.
+ if (Imm < 0)
+ return ErrorOperand(Start, "expected a positive immediate displacement "
+ "before bracketed expr.");
+
+ // Parse ImmDisp [ BaseReg + Scale*IndexReg + Disp ].
+ return ParseIntelMemOperand(Imm, Start, Size);
+ }
+
+ // Rounding mode token.
+ if (getSTI().getFeatureBits()[X86::FeatureAVX512] &&
+ getLexer().is(AsmToken::LCurly))
+ return ParseRoundingModeOp(Start, End);
+
+ // Register.
+ unsigned RegNo = 0;
+ if (!ParseRegister(RegNo, Start, End)) {
+ // If this is a segment register followed by a ':', then this is the start
+ // of a segment override; otherwise this is a normal register reference.
+ // If it is a normal register and the operand contains 'ptr', this is an
+ // error.
+ if (getLexer().isNot(AsmToken::Colon)){
+ if (PtrInOperand){
+ return ErrorOperand(Start, "expected memory operand after "
+ "'ptr', found register operand instead");
+ }
+ return X86Operand::CreateReg(RegNo, Start, End);
+ }
+
+ return ParseIntelSegmentOverride(/*SegReg=*/RegNo, Start, Size);
+ }
+
+ // Memory operand.
+ return ParseIntelMemOperand(/*Disp=*/0, Start, Size);
+}
+
+std::unique_ptr<X86Operand> X86AsmParser::ParseATTOperand() {
+ MCAsmParser &Parser = getParser();
+ switch (getLexer().getKind()) {
+ default:
+ // Parse a memory operand with no segment register.
+ return ParseMemOperand(0, Parser.getTok().getLoc());
+ case AsmToken::Percent: {
+ // Read the register.
+ unsigned RegNo;
+ SMLoc Start, End;
+ if (ParseRegister(RegNo, Start, End)) return nullptr;
+ if (RegNo == X86::EIZ || RegNo == X86::RIZ) {
+ Error(Start, "%eiz and %riz can only be used as index registers",
+ SMRange(Start, End));
+ return nullptr;
+ }
+
+ // If this is a segment register followed by a ':', then this is the start
+ // of a memory reference, otherwise this is a normal register reference.
+ if (getLexer().isNot(AsmToken::Colon))
+ return X86Operand::CreateReg(RegNo, Start, End);
+
+ if (!X86MCRegisterClasses[X86::SEGMENT_REGRegClassID].contains(RegNo))
+ return ErrorOperand(Start, "invalid segment register");
+
+ getParser().Lex(); // Eat the colon.
+ return ParseMemOperand(RegNo, Start);
+ }
+ case AsmToken::Dollar: {
+ // $42 -> immediate.
+ SMLoc Start = Parser.getTok().getLoc(), End;
+ Parser.Lex();
+ const MCExpr *Val;
+ if (getParser().parseExpression(Val, End))
+ return nullptr;
+ return X86Operand::CreateImm(Val, Start, End);
+ }
+ case AsmToken::LCurly:{
+ SMLoc Start = Parser.getTok().getLoc(), End;
+ if (getSTI().getFeatureBits()[X86::FeatureAVX512])
+ return ParseRoundingModeOp(Start, End);
+ return ErrorOperand(Start, "unknown token in expression");
+ }
+ }
+}
+
+bool X86AsmParser::HandleAVX512Operand(OperandVector &Operands,
+ const MCParsedAsmOperand &Op) {
+ MCAsmParser &Parser = getParser();
+ if (getSTI().getFeatureBits()[X86::FeatureAVX512]) {
+ if (getLexer().is(AsmToken::LCurly)) {
+ // Eat "{" and mark the current place.
+ const SMLoc consumedToken = consumeToken();
+ // Distinguish {1to<NUM>} from {%k<NUM>}.
+ if (getLexer().is(AsmToken::Integer)) {
+ // Parse memory broadcasting ({1to<NUM>}).
+ if (getLexer().getTok().getIntVal() != 1)
+ return !ErrorAndEatStatement(getLexer().getLoc(),
+ "Expected 1to<NUM> at this point");
+ Parser.Lex(); // Eat "1" of 1to8
+ if (!getLexer().is(AsmToken::Identifier) ||
+ !getLexer().getTok().getIdentifier().startswith("to"))
+ return !ErrorAndEatStatement(getLexer().getLoc(),
+ "Expected 1to<NUM> at this point");
+ // Recognize only reasonable suffixes.
+ const char *BroadcastPrimitive =
+ StringSwitch<const char*>(getLexer().getTok().getIdentifier())
+ .Case("to2", "{1to2}")
+ .Case("to4", "{1to4}")
+ .Case("to8", "{1to8}")
+ .Case("to16", "{1to16}")
+ .Default(nullptr);
+ if (!BroadcastPrimitive)
+ return !ErrorAndEatStatement(getLexer().getLoc(),
+ "Invalid memory broadcast primitive.");
+ Parser.Lex(); // Eat "toN" of 1toN
+ if (!getLexer().is(AsmToken::RCurly))
+ return !ErrorAndEatStatement(getLexer().getLoc(),
+ "Expected } at this point");
+ Parser.Lex(); // Eat "}"
+ Operands.push_back(X86Operand::CreateToken(BroadcastPrimitive,
+ consumedToken));
+ // No AVX512-specific primitives can follow a memory broadcasting
+ // primitive, so return.
+ return true;
+ } else {
+ // Parse mask register {%k1}
+ Operands.push_back(X86Operand::CreateToken("{", consumedToken));
+ if (std::unique_ptr<X86Operand> Op = ParseOperand()) {
+ Operands.push_back(std::move(Op));
+ if (!getLexer().is(AsmToken::RCurly))
+ return !ErrorAndEatStatement(getLexer().getLoc(),
+ "Expected } at this point");
+ Operands.push_back(X86Operand::CreateToken("}", consumeToken()));
+
+ // Parse "zeroing non-masked" semantic {z}
+ if (getLexer().is(AsmToken::LCurly)) {
+ Operands.push_back(X86Operand::CreateToken("{z}", consumeToken()));
+ if (!getLexer().is(AsmToken::Identifier) ||
+ getLexer().getTok().getIdentifier() != "z")
+ return !ErrorAndEatStatement(getLexer().getLoc(),
+ "Expected z at this point");
+ Parser.Lex(); // Eat the z
+ if (!getLexer().is(AsmToken::RCurly))
+ return !ErrorAndEatStatement(getLexer().getLoc(),
+ "Expected } at this point");
+ Parser.Lex(); // Eat the }
+ }
+ }
+ }
+ }
+ }
+ return true;
+}
+
+/// ParseMemOperand: segment: disp(basereg, indexreg, scale). The '%ds:' prefix
+/// has already been parsed if present.
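+/// e.g. "%gs:8(%ebx,%esi,4)" or "-4(%rbp)".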
+std::unique_ptr<X86Operand> X86AsmParser::ParseMemOperand(unsigned SegReg,
+ SMLoc MemStart) {
+
+ MCAsmParser &Parser = getParser();
+ // We have to disambiguate a parenthesized expression "(4+5)" from the start
+ // of a memory operand with a missing displacement "(%ebx)" or "(,%eax)". The
+ // only way to do this without lookahead is to eat the '(' and see what is
+ // after it.
+ const MCExpr *Disp = MCConstantExpr::create(0, getParser().getContext());
+ if (getLexer().isNot(AsmToken::LParen)) {
+ SMLoc ExprEnd;
+ if (getParser().parseExpression(Disp, ExprEnd)) return nullptr;
+
+ // After parsing the base expression we could either have a parenthesized
+ // memory address or not. If not, return now. If so, eat the (.
+ if (getLexer().isNot(AsmToken::LParen)) {
+ // Unless we have a segment register, treat this as an immediate.
+ if (SegReg == 0)
+ return X86Operand::CreateMem(getPointerWidth(), Disp, MemStart, ExprEnd);
+ return X86Operand::CreateMem(getPointerWidth(), SegReg, Disp, 0, 0, 1,
+ MemStart, ExprEnd);
+ }
+
+ // Eat the '('.
+ Parser.Lex();
+ } else {
+ // Okay, we have a '('. We don't know if this is an expression or not, so
+ // we have to eat the '(' to see beyond it.
+ SMLoc LParenLoc = Parser.getTok().getLoc();
+ Parser.Lex(); // Eat the '('.
+
+ if (getLexer().is(AsmToken::Percent) || getLexer().is(AsmToken::Comma)) {
+ // Nothing to do here, fall into the code below with the '(' part of the
+ // memory operand consumed.
+ } else {
+ SMLoc ExprEnd;
+
+ // It must be a parenthesized expression; parse it now.
+ if (getParser().parseParenExpression(Disp, ExprEnd))
+ return nullptr;
+
+ // After parsing the base expression we could either have a parenthesized
+ // memory address or not. If not, return now. If so, eat the (.
+ if (getLexer().isNot(AsmToken::LParen)) {
+ // Unless we have a segment register, treat this as an immediate.
+ if (SegReg == 0)
+ return X86Operand::CreateMem(getPointerWidth(), Disp, LParenLoc,
+ ExprEnd);
+ return X86Operand::CreateMem(getPointerWidth(), SegReg, Disp, 0, 0, 1,
+ MemStart, ExprEnd);
+ }
+
+ // Eat the '('.
+ Parser.Lex();
+ }
+ }
+
+ // If we reached here, then we just ate the ( of the memory operand. Process
+ // the rest of the memory operand.
+ unsigned BaseReg = 0, IndexReg = 0, Scale = 1;
+ SMLoc IndexLoc, BaseLoc;
+
+ if (getLexer().is(AsmToken::Percent)) {
+ SMLoc StartLoc, EndLoc;
+ BaseLoc = Parser.getTok().getLoc();
+ if (ParseRegister(BaseReg, StartLoc, EndLoc)) return nullptr;
+ if (BaseReg == X86::EIZ || BaseReg == X86::RIZ) {
+ Error(StartLoc, "eiz and riz can only be used as index registers",
+ SMRange(StartLoc, EndLoc));
+ return nullptr;
+ }
+ }
+
+ if (getLexer().is(AsmToken::Comma)) {
+ Parser.Lex(); // Eat the comma.
+ IndexLoc = Parser.getTok().getLoc();
+
+ // Following the comma we should have either an index register or a scale
+ // value. We don't support the latter form, but we want to parse it
+ // correctly.
+ //
+ // Note that even though it would be completely consistent to support syntax
+ // like "1(%eax,,1)", the assembler doesn't. Use "eiz" or "riz" for this.
+ if (getLexer().is(AsmToken::Percent)) {
+ SMLoc L;
+ if (ParseRegister(IndexReg, L, L)) return nullptr;
+
+ if (getLexer().isNot(AsmToken::RParen)) {
+ // Parse the scale amount:
+ // ::= ',' [scale-expression]
+ if (getLexer().isNot(AsmToken::Comma)) {
+ Error(Parser.getTok().getLoc(),
+ "expected comma in scale expression");
+ return nullptr;
+ }
+ Parser.Lex(); // Eat the comma.
+
+ if (getLexer().isNot(AsmToken::RParen)) {
+ SMLoc Loc = Parser.getTok().getLoc();
+
+ int64_t ScaleVal;
+ if (getParser().parseAbsoluteExpression(ScaleVal)){
+ Error(Loc, "expected scale expression");
+ return nullptr;
+ }
+
+ // Validate the scale amount.
+ if (X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg) &&
+ ScaleVal != 1) {
+ Error(Loc, "scale factor in 16-bit address must be 1");
+ return nullptr;
+ }
+ if (ScaleVal != 1 && ScaleVal != 2 && ScaleVal != 4 &&
+ ScaleVal != 8) {
+ Error(Loc, "scale factor in address must be 1, 2, 4 or 8");
+ return nullptr;
+ }
+ Scale = (unsigned)ScaleVal;
+ }
+ }
+ } else if (getLexer().isNot(AsmToken::RParen)) {
+ // A scale amount without an index register is ignored.
+ SMLoc Loc = Parser.getTok().getLoc();
+
+ int64_t Value;
+ if (getParser().parseAbsoluteExpression(Value))
+ return nullptr;
+
+ if (Value != 1)
+ Warning(Loc, "scale factor without index register is ignored");
+ Scale = 1;
+ }
+ }
+
+ // Ok, we've eaten the memory operand, verify we have a ')' and eat it too.
+ if (getLexer().isNot(AsmToken::RParen)) {
+ Error(Parser.getTok().getLoc(), "unexpected token in memory operand");
+ return nullptr;
+ }
+ SMLoc MemEnd = Parser.getTok().getEndLoc();
+ Parser.Lex(); // Eat the ')'.
+
+ // Check for use of invalid 16-bit registers. Only BX/BP/SI/DI are allowed,
+ // and then only in non-64-bit modes. Except for DX, which is a special case
+ // because an unofficial form of in/out instructions uses it.
+ if (X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg) &&
+ (is64BitMode() || (BaseReg != X86::BX && BaseReg != X86::BP &&
+ BaseReg != X86::SI && BaseReg != X86::DI)) &&
+ BaseReg != X86::DX) {
+ Error(BaseLoc, "invalid 16-bit base register");
+ return nullptr;
+ }
+ if (BaseReg == 0 &&
+ X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg)) {
+ Error(IndexLoc, "16-bit memory operand may not include only index register");
+ return nullptr;
+ }
+
+ StringRef ErrMsg;
+ if (CheckBaseRegAndIndexReg(BaseReg, IndexReg, ErrMsg)) {
+ Error(BaseLoc, ErrMsg);
+ return nullptr;
+ }
+
+ if (SegReg || BaseReg || IndexReg)
+ return X86Operand::CreateMem(getPointerWidth(), SegReg, Disp, BaseReg,
+ IndexReg, Scale, MemStart, MemEnd);
+ return X86Operand::CreateMem(getPointerWidth(), Disp, MemStart, MemEnd);
+}
+
+bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+ SMLoc NameLoc, OperandVector &Operands) {
+ MCAsmParser &Parser = getParser();
+ InstInfo = &Info;
+ StringRef PatchedName = Name;
+
+ // FIXME: Hack to recognize setneb as setne.
+ if (PatchedName.startswith("set") && PatchedName.endswith("b") &&
+ PatchedName != "setb" && PatchedName != "setnb")
+ PatchedName = PatchedName.substr(0, Name.size()-1);
+
+ // FIXME: Hack to recognize cmp<comparison code>{ss,sd,ps,pd}.
+ if ((PatchedName.startswith("cmp") || PatchedName.startswith("vcmp")) &&
+ (PatchedName.endswith("ss") || PatchedName.endswith("sd") ||
+ PatchedName.endswith("ps") || PatchedName.endswith("pd"))) {
+ bool IsVCMP = PatchedName[0] == 'v';
+ unsigned CCIdx = IsVCMP ? 4 : 3;
+ unsigned ComparisonCode = StringSwitch<unsigned>(
+ PatchedName.slice(CCIdx, PatchedName.size() - 2))
+ .Case("eq", 0x00)
+ .Case("lt", 0x01)
+ .Case("le", 0x02)
+ .Case("unord", 0x03)
+ .Case("neq", 0x04)
+ .Case("nlt", 0x05)
+ .Case("nle", 0x06)
+ .Case("ord", 0x07)
+ /* AVX only from here */
+ .Case("eq_uq", 0x08)
+ .Case("nge", 0x09)
+ .Case("ngt", 0x0A)
+ .Case("false", 0x0B)
+ .Case("neq_oq", 0x0C)
+ .Case("ge", 0x0D)
+ .Case("gt", 0x0E)
+ .Case("true", 0x0F)
+ .Case("eq_os", 0x10)
+ .Case("lt_oq", 0x11)
+ .Case("le_oq", 0x12)
+ .Case("unord_s", 0x13)
+ .Case("neq_us", 0x14)
+ .Case("nlt_uq", 0x15)
+ .Case("nle_uq", 0x16)
+ .Case("ord_s", 0x17)
+ .Case("eq_us", 0x18)
+ .Case("nge_uq", 0x19)
+ .Case("ngt_uq", 0x1A)
+ .Case("false_os", 0x1B)
+ .Case("neq_os", 0x1C)
+ .Case("ge_oq", 0x1D)
+ .Case("gt_oq", 0x1E)
+ .Case("true_us", 0x1F)
+ .Default(~0U);
+ if (ComparisonCode != ~0U && (IsVCMP || ComparisonCode < 8)) {
+
+ Operands.push_back(X86Operand::CreateToken(PatchedName.slice(0, CCIdx),
+ NameLoc));
+
+ const MCExpr *ImmOp = MCConstantExpr::create(ComparisonCode,
+ getParser().getContext());
+ Operands.push_back(X86Operand::CreateImm(ImmOp, NameLoc, NameLoc));
+
+ PatchedName = PatchedName.substr(PatchedName.size() - 2);
+ }
+ }
+
+ // FIXME: Hack to recognize vpcmp<comparison code>{ub,uw,ud,uq,b,w,d,q}.
+ if (PatchedName.startswith("vpcmp") &&
+ (PatchedName.endswith("b") || PatchedName.endswith("w") ||
+ PatchedName.endswith("d") || PatchedName.endswith("q"))) {
+ unsigned CCIdx = PatchedName.drop_back().back() == 'u' ? 2 : 1;
+ unsigned ComparisonCode = StringSwitch<unsigned>(
+ PatchedName.slice(5, PatchedName.size() - CCIdx))
+ .Case("eq", 0x0) // Only allowed on unsigned. Checked below.
+ .Case("lt", 0x1)
+ .Case("le", 0x2)
+ //.Case("false", 0x3) // Not a documented alias.
+ .Case("neq", 0x4)
+ .Case("nlt", 0x5)
+ .Case("nle", 0x6)
+ //.Case("true", 0x7) // Not a documented alias.
+ .Default(~0U);
+ if (ComparisonCode != ~0U && (ComparisonCode != 0 || CCIdx == 2)) {
+ Operands.push_back(X86Operand::CreateToken("vpcmp", NameLoc));
+
+ const MCExpr *ImmOp = MCConstantExpr::create(ComparisonCode,
+ getParser().getContext());
+ Operands.push_back(X86Operand::CreateImm(ImmOp, NameLoc, NameLoc));
+
+ PatchedName = PatchedName.substr(PatchedName.size() - CCIdx);
+ }
+ }
+
+ // FIXME: Hack to recognize vpcom<comparison code>{ub,uw,ud,uq,b,w,d,q}.
+ if (PatchedName.startswith("vpcom") &&
+ (PatchedName.endswith("b") || PatchedName.endswith("w") ||
+ PatchedName.endswith("d") || PatchedName.endswith("q"))) {
+ unsigned CCIdx = PatchedName.drop_back().back() == 'u' ? 2 : 1;
+ unsigned ComparisonCode = StringSwitch<unsigned>(
+ PatchedName.slice(5, PatchedName.size() - CCIdx))
+ .Case("lt", 0x0)
+ .Case("le", 0x1)
+ .Case("gt", 0x2)
+ .Case("ge", 0x3)
+ .Case("eq", 0x4)
+ .Case("neq", 0x5)
+ .Case("false", 0x6)
+ .Case("true", 0x7)
+ .Default(~0U);
+ if (ComparisonCode != ~0U) {
+ Operands.push_back(X86Operand::CreateToken("vpcom", NameLoc));
+
+ const MCExpr *ImmOp = MCConstantExpr::create(ComparisonCode,
+ getParser().getContext());
+ Operands.push_back(X86Operand::CreateImm(ImmOp, NameLoc, NameLoc));
+
+ PatchedName = PatchedName.substr(PatchedName.size() - CCIdx);
+ }
+ }
+
+ Operands.push_back(X86Operand::CreateToken(PatchedName, NameLoc));
+
+ // Determine whether this is an instruction prefix.
+ bool isPrefix =
+ Name == "lock" || Name == "rep" ||
+ Name == "repe" || Name == "repz" ||
+ Name == "repne" || Name == "repnz" ||
+ Name == "rex64" || Name == "data16";
+
+ // This does the actual operand parsing. Don't parse any more if we have a
+ // prefix juxtaposed with an operation like "lock incl 4(%rax)", because we
+ // just want to parse the "lock" as the first instruction and the "incl" as
+ // the next one.
+ if (getLexer().isNot(AsmToken::EndOfStatement) && !isPrefix) {
+
+ // Parse '*' modifier.
+ if (getLexer().is(AsmToken::Star))
+ Operands.push_back(X86Operand::CreateToken("*", consumeToken()));
+
+ // Read the operands.
+    while (true) {
+ if (std::unique_ptr<X86Operand> Op = ParseOperand()) {
+ Operands.push_back(std::move(Op));
+ if (!HandleAVX512Operand(Operands, *Operands.back()))
+ return true;
+ } else {
+ Parser.eatToEndOfStatement();
+ return true;
+ }
+      // Check for a comma and eat it.
+ if (getLexer().is(AsmToken::Comma))
+ Parser.Lex();
+ else
+ break;
+ }
+
+ if (getLexer().isNot(AsmToken::EndOfStatement))
+ return ErrorAndEatStatement(getLexer().getLoc(),
+ "unexpected token in argument list");
+ }
+
+ // Consume the EndOfStatement or the prefix separator Slash
+ if (getLexer().is(AsmToken::EndOfStatement) ||
+ (isPrefix && getLexer().is(AsmToken::Slash)))
+ Parser.Lex();
+
+  // This is for gas compatibility and cannot be done in td. Add a "p" suffix
+  // to certain floating-point mnemonics when they are given no argument.
+  // For example: fsub --> fsubp
+ bool IsFp =
+ Name == "fsub" || Name == "fdiv" || Name == "fsubr" || Name == "fdivr";
+ if (IsFp && Operands.size() == 1) {
+ const char *Repl = StringSwitch<const char *>(Name)
+ .Case("fsub", "fsubp")
+ .Case("fdiv", "fdivp")
+ .Case("fsubr", "fsubrp")
+ .Case("fdivr", "fdivrp");
+ static_cast<X86Operand &>(*Operands[0]).setTokenValue(Repl);
+ }
+
+ // This is a terrible hack to handle "out[bwl]? %al, (%dx)" ->
+ // "outb %al, %dx". Out doesn't take a memory form, but this is a widely
+ // documented form in various unofficial manuals, so a lot of code uses it.
+ if ((Name == "outb" || Name == "outw" || Name == "outl" || Name == "out") &&
+ Operands.size() == 3) {
+ X86Operand &Op = (X86Operand &)*Operands.back();
+ if (Op.isMem() && Op.Mem.SegReg == 0 &&
+ isa<MCConstantExpr>(Op.Mem.Disp) &&
+ cast<MCConstantExpr>(Op.Mem.Disp)->getValue() == 0 &&
+ Op.Mem.BaseReg == MatchRegisterName("dx") && Op.Mem.IndexReg == 0) {
+ SMLoc Loc = Op.getEndLoc();
+ Operands.back() = X86Operand::CreateReg(Op.Mem.BaseReg, Loc, Loc);
+ }
+ }
+ // Same hack for "in[bwl]? (%dx), %al" -> "inb %dx, %al".
+ if ((Name == "inb" || Name == "inw" || Name == "inl" || Name == "in") &&
+ Operands.size() == 3) {
+ X86Operand &Op = (X86Operand &)*Operands[1];
+ if (Op.isMem() && Op.Mem.SegReg == 0 &&
+ isa<MCConstantExpr>(Op.Mem.Disp) &&
+ cast<MCConstantExpr>(Op.Mem.Disp)->getValue() == 0 &&
+ Op.Mem.BaseReg == MatchRegisterName("dx") && Op.Mem.IndexReg == 0) {
+ SMLoc Loc = Op.getEndLoc();
+ Operands[1] = X86Operand::CreateReg(Op.Mem.BaseReg, Loc, Loc);
+ }
+ }
+
+ // Append default arguments to "ins[bwld]"
+ if (Name.startswith("ins") && Operands.size() == 1 &&
+ (Name == "insb" || Name == "insw" || Name == "insl" || Name == "insd")) {
+ AddDefaultSrcDestOperands(Operands,
+ X86Operand::CreateReg(X86::DX, NameLoc, NameLoc),
+ DefaultMemDIOperand(NameLoc));
+ }
+
+ // Append default arguments to "outs[bwld]"
+ if (Name.startswith("outs") && Operands.size() == 1 &&
+ (Name == "outsb" || Name == "outsw" || Name == "outsl" ||
+ Name == "outsd" )) {
+ AddDefaultSrcDestOperands(Operands,
+ DefaultMemSIOperand(NameLoc),
+ X86Operand::CreateReg(X86::DX, NameLoc, NameLoc));
+ }
+
+ // Transform "lods[bwlq]" into "lods[bwlq] ($SIREG)" for appropriate
+ // values of $SIREG according to the mode. It would be nice if this
+ // could be achieved with InstAlias in the tables.
+ if (Name.startswith("lods") && Operands.size() == 1 &&
+ (Name == "lods" || Name == "lodsb" || Name == "lodsw" ||
+ Name == "lodsl" || Name == "lodsd" || Name == "lodsq"))
+ Operands.push_back(DefaultMemSIOperand(NameLoc));
+
+ // Transform "stos[bwlq]" into "stos[bwlq] ($DIREG)" for appropriate
+ // values of $DIREG according to the mode. It would be nice if this
+ // could be achieved with InstAlias in the tables.
+ if (Name.startswith("stos") && Operands.size() == 1 &&
+ (Name == "stos" || Name == "stosb" || Name == "stosw" ||
+ Name == "stosl" || Name == "stosd" || Name == "stosq"))
+ Operands.push_back(DefaultMemDIOperand(NameLoc));
+
+ // Transform "scas[bwlq]" into "scas[bwlq] ($DIREG)" for appropriate
+ // values of $DIREG according to the mode. It would be nice if this
+ // could be achieved with InstAlias in the tables.
+ if (Name.startswith("scas") && Operands.size() == 1 &&
+ (Name == "scas" || Name == "scasb" || Name == "scasw" ||
+ Name == "scasl" || Name == "scasd" || Name == "scasq"))
+ Operands.push_back(DefaultMemDIOperand(NameLoc));
+
+ // Add default SI and DI operands to "cmps[bwlq]".
+ if (Name.startswith("cmps") &&
+ (Name == "cmps" || Name == "cmpsb" || Name == "cmpsw" ||
+ Name == "cmpsl" || Name == "cmpsd" || Name == "cmpsq")) {
+ if (Operands.size() == 1) {
+ AddDefaultSrcDestOperands(Operands,
+ DefaultMemDIOperand(NameLoc),
+ DefaultMemSIOperand(NameLoc));
+ } else if (Operands.size() == 3) {
+ X86Operand &Op = (X86Operand &)*Operands[1];
+ X86Operand &Op2 = (X86Operand &)*Operands[2];
+ if (!doSrcDstMatch(Op, Op2))
+ return Error(Op.getStartLoc(),
+ "mismatching source and destination index registers");
+ }
+ }
+
+ // Add default SI and DI operands to "movs[bwlq]".
+ if ((Name.startswith("movs") &&
+ (Name == "movs" || Name == "movsb" || Name == "movsw" ||
+ Name == "movsl" || Name == "movsd" || Name == "movsq")) ||
+ (Name.startswith("smov") &&
+ (Name == "smov" || Name == "smovb" || Name == "smovw" ||
+ Name == "smovl" || Name == "smovd" || Name == "smovq"))) {
+ if (Operands.size() == 1) {
+ if (Name == "movsd")
+ Operands.back() = X86Operand::CreateToken("movsl", NameLoc);
+ AddDefaultSrcDestOperands(Operands,
+ DefaultMemSIOperand(NameLoc),
+ DefaultMemDIOperand(NameLoc));
+ } else if (Operands.size() == 3) {
+ X86Operand &Op = (X86Operand &)*Operands[1];
+ X86Operand &Op2 = (X86Operand &)*Operands[2];
+ if (!doSrcDstMatch(Op, Op2))
+ return Error(Op.getStartLoc(),
+ "mismatching source and destination index registers");
+ }
+ }
+
+  // FIXME: Hack to recognize s{hr,ar,hl} $1, <op>. Canonicalize to
+  // "shift <op>".
+ if ((Name.startswith("shr") || Name.startswith("sar") ||
+ Name.startswith("shl") || Name.startswith("sal") ||
+ Name.startswith("rcl") || Name.startswith("rcr") ||
+ Name.startswith("rol") || Name.startswith("ror")) &&
+ Operands.size() == 3) {
+ if (isParsingIntelSyntax()) {
+ // Intel syntax
+ X86Operand &Op1 = static_cast<X86Operand &>(*Operands[2]);
+ if (Op1.isImm() && isa<MCConstantExpr>(Op1.getImm()) &&
+ cast<MCConstantExpr>(Op1.getImm())->getValue() == 1)
+ Operands.pop_back();
+ } else {
+ X86Operand &Op1 = static_cast<X86Operand &>(*Operands[1]);
+ if (Op1.isImm() && isa<MCConstantExpr>(Op1.getImm()) &&
+ cast<MCConstantExpr>(Op1.getImm())->getValue() == 1)
+ Operands.erase(Operands.begin() + 1);
+ }
+ }
+
+ // Transforms "int $3" into "int3" as a size optimization. We can't write an
+ // instalias with an immediate operand yet.
+ if (Name == "int" && Operands.size() == 2) {
+ X86Operand &Op1 = static_cast<X86Operand &>(*Operands[1]);
+ if (Op1.isImm())
+ if (auto *CE = dyn_cast<MCConstantExpr>(Op1.getImm()))
+ if (CE->getValue() == 3) {
+ Operands.erase(Operands.begin() + 1);
+ static_cast<X86Operand &>(*Operands[0]).setTokenValue("int3");
+ }
+ }
+
+ return false;
+}
+
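+// Canonicalize certain all-register moves. When only the source register
+// needs an extension bit, switching to the reversed (_REV) encoding moves
+// the extended register into the modrm reg field, which lets the encoder
+// use the shorter 2-byte VEX prefix.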
+bool X86AsmParser::processInstruction(MCInst &Inst, const OperandVector &Ops) {
+ switch (Inst.getOpcode()) {
+ default: return false;
+ case X86::VMOVZPQILo2PQIrr:
+ case X86::VMOVAPDrr:
+ case X86::VMOVAPDYrr:
+ case X86::VMOVAPSrr:
+ case X86::VMOVAPSYrr:
+ case X86::VMOVDQArr:
+ case X86::VMOVDQAYrr:
+ case X86::VMOVDQUrr:
+ case X86::VMOVDQUYrr:
+ case X86::VMOVUPDrr:
+ case X86::VMOVUPDYrr:
+ case X86::VMOVUPSrr:
+ case X86::VMOVUPSYrr: {
+ if (X86II::isX86_64ExtendedReg(Inst.getOperand(0).getReg()) ||
+ !X86II::isX86_64ExtendedReg(Inst.getOperand(1).getReg()))
+ return false;
+
+ unsigned NewOpc;
+ switch (Inst.getOpcode()) {
+ default: llvm_unreachable("Invalid opcode");
+ case X86::VMOVZPQILo2PQIrr: NewOpc = X86::VMOVPQI2QIrr; break;
+ case X86::VMOVAPDrr: NewOpc = X86::VMOVAPDrr_REV; break;
+ case X86::VMOVAPDYrr: NewOpc = X86::VMOVAPDYrr_REV; break;
+ case X86::VMOVAPSrr: NewOpc = X86::VMOVAPSrr_REV; break;
+ case X86::VMOVAPSYrr: NewOpc = X86::VMOVAPSYrr_REV; break;
+ case X86::VMOVDQArr: NewOpc = X86::VMOVDQArr_REV; break;
+ case X86::VMOVDQAYrr: NewOpc = X86::VMOVDQAYrr_REV; break;
+ case X86::VMOVDQUrr: NewOpc = X86::VMOVDQUrr_REV; break;
+ case X86::VMOVDQUYrr: NewOpc = X86::VMOVDQUYrr_REV; break;
+ case X86::VMOVUPDrr: NewOpc = X86::VMOVUPDrr_REV; break;
+ case X86::VMOVUPDYrr: NewOpc = X86::VMOVUPDYrr_REV; break;
+ case X86::VMOVUPSrr: NewOpc = X86::VMOVUPSrr_REV; break;
+ case X86::VMOVUPSYrr: NewOpc = X86::VMOVUPSYrr_REV; break;
+ }
+ Inst.setOpcode(NewOpc);
+ return true;
+ }
+ case X86::VMOVSDrr:
+ case X86::VMOVSSrr: {
+ if (X86II::isX86_64ExtendedReg(Inst.getOperand(0).getReg()) ||
+ !X86II::isX86_64ExtendedReg(Inst.getOperand(2).getReg()))
+ return false;
+ unsigned NewOpc;
+ switch (Inst.getOpcode()) {
+ default: llvm_unreachable("Invalid opcode");
+ case X86::VMOVSDrr: NewOpc = X86::VMOVSDrr_REV; break;
+ case X86::VMOVSSrr: NewOpc = X86::VMOVSSrr_REV; break;
+ }
+ Inst.setOpcode(NewOpc);
+ return true;
+ }
+ }
+}
+
+static const char *getSubtargetFeatureName(uint64_t Val);
+
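+// Emission is routed through the asm instrumentation layer, which may emit
+// extra checking code (e.g. for AddressSanitizer) before the instruction
+// itself.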
+void X86AsmParser::EmitInstruction(MCInst &Inst, OperandVector &Operands,
+ MCStreamer &Out) {
+ Instrumentation->InstrumentAndEmitInstruction(Inst, Operands, getContext(),
+ MII, Out);
+}
+
+bool X86AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+ OperandVector &Operands,
+ MCStreamer &Out, uint64_t &ErrorInfo,
+ bool MatchingInlineAsm) {
+ if (isParsingIntelSyntax())
+ return MatchAndEmitIntelInstruction(IDLoc, Opcode, Operands, Out, ErrorInfo,
+ MatchingInlineAsm);
+ return MatchAndEmitATTInstruction(IDLoc, Opcode, Operands, Out, ErrorInfo,
+ MatchingInlineAsm);
+}
+
+void X86AsmParser::MatchFPUWaitAlias(SMLoc IDLoc, X86Operand &Op,
+ OperandVector &Operands, MCStreamer &Out,
+ bool MatchingInlineAsm) {
+ // FIXME: This should be replaced with a real .td file alias mechanism.
+ // Also, MatchInstructionImpl should actually *do* the EmitInstruction
+ // call.
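+  // For example, "finit" is assembled as a "wait" instruction followed by
+  // "fninit".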
+ const char *Repl = StringSwitch<const char *>(Op.getToken())
+ .Case("finit", "fninit")
+ .Case("fsave", "fnsave")
+ .Case("fstcw", "fnstcw")
+ .Case("fstcww", "fnstcw")
+ .Case("fstenv", "fnstenv")
+ .Case("fstsw", "fnstsw")
+ .Case("fstsww", "fnstsw")
+ .Case("fclex", "fnclex")
+ .Default(nullptr);
+ if (Repl) {
+ MCInst Inst;
+ Inst.setOpcode(X86::WAIT);
+ Inst.setLoc(IDLoc);
+ if (!MatchingInlineAsm)
+ EmitInstruction(Inst, Operands, Out);
+ Operands[0] = X86Operand::CreateToken(Repl, IDLoc);
+ }
+}
+
+bool X86AsmParser::ErrorMissingFeature(SMLoc IDLoc, uint64_t ErrorInfo,
+ bool MatchingInlineAsm) {
+ assert(ErrorInfo && "Unknown missing feature!");
+ ArrayRef<SMRange> EmptyRanges = None;
+ SmallString<126> Msg;
+ raw_svector_ostream OS(Msg);
+ OS << "instruction requires:";
+ uint64_t Mask = 1;
+ for (unsigned i = 0; i < (sizeof(ErrorInfo)*8-1); ++i) {
+ if (ErrorInfo & Mask)
+ OS << ' ' << getSubtargetFeatureName(ErrorInfo & Mask);
+ Mask <<= 1;
+ }
+ return Error(IDLoc, OS.str(), EmptyRanges, MatchingInlineAsm);
+}
+
+bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode,
+ OperandVector &Operands,
+ MCStreamer &Out,
+ uint64_t &ErrorInfo,
+ bool MatchingInlineAsm) {
+ assert(!Operands.empty() && "Unexpect empty operand list!");
+ X86Operand &Op = static_cast<X86Operand &>(*Operands[0]);
+ assert(Op.isToken() && "Leading operand should always be a mnemonic!");
+ ArrayRef<SMRange> EmptyRanges = None;
+
+ // First, handle aliases that expand to multiple instructions.
+ MatchFPUWaitAlias(IDLoc, Op, Operands, Out, MatchingInlineAsm);
+
+ bool WasOriginallyInvalidOperand = false;
+ MCInst Inst;
+
+ // First, try a direct match.
+ switch (MatchInstructionImpl(Operands, Inst,
+ ErrorInfo, MatchingInlineAsm,
+ isParsingIntelSyntax())) {
+ default: llvm_unreachable("Unexpected match result!");
+ case Match_Success:
+ // Some instructions need post-processing to, for example, tweak which
+ // encoding is selected. Loop on it while changes happen so the
+ // individual transformations can chain off each other.
+ if (!MatchingInlineAsm)
+ while (processInstruction(Inst, Operands))
+ ;
+
+ Inst.setLoc(IDLoc);
+ if (!MatchingInlineAsm)
+ EmitInstruction(Inst, Operands, Out);
+ Opcode = Inst.getOpcode();
+ return false;
+ case Match_MissingFeature:
+ return ErrorMissingFeature(IDLoc, ErrorInfo, MatchingInlineAsm);
+ case Match_InvalidOperand:
+ WasOriginallyInvalidOperand = true;
+ break;
+ case Match_MnemonicFail:
+ break;
+ }
+
+ // FIXME: Ideally, we would only attempt suffix matches for things which are
+ // valid prefixes, and we could just infer the right unambiguous
+ // type. However, that requires substantially more matcher support than the
+ // following hack.
+
+ // Change the operand to point to a temporary token.
+ StringRef Base = Op.getToken();
+ SmallString<16> Tmp;
+ Tmp += Base;
+ Tmp += ' ';
+ Op.setTokenValue(Tmp);
+
+ // If this instruction starts with an 'f', then it is a floating point stack
+ // instruction. These come in up to three forms for 32-bit, 64-bit, and
+ // 80-bit floating point, which use the suffixes s,l,t respectively.
+ //
+ // Otherwise, we assume that this may be an integer instruction, which comes
+ // in 8/16/32/64-bit forms using the b,w,l,q suffixes respectively.
+ const char *Suffixes = Base[0] != 'f' ? "bwlq" : "slt\0";
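+  // For example, a bare "add" is retried as "addb", "addw", "addl" and
+  // "addq"; a bare "fadd" is retried as "fadds", "faddl" and "faddt", with
+  // a NUL padding the suffix list out to four entries.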
+
+ // Check for the various suffix matches.
+ uint64_t ErrorInfoIgnore;
+ uint64_t ErrorInfoMissingFeature = 0; // Init suppresses compiler warnings.
+ unsigned Match[4];
+
+ for (unsigned I = 0, E = array_lengthof(Match); I != E; ++I) {
+ Tmp.back() = Suffixes[I];
+ Match[I] = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore,
+ MatchingInlineAsm, isParsingIntelSyntax());
+ // If this returned as a missing feature failure, remember that.
+ if (Match[I] == Match_MissingFeature)
+ ErrorInfoMissingFeature = ErrorInfoIgnore;
+ }
+
+ // Restore the old token.
+ Op.setTokenValue(Base);
+
+ // If exactly one matched, then we treat that as a successful match (and the
+ // instruction will already have been filled in correctly, since the failing
+ // matches won't have modified it).
+ unsigned NumSuccessfulMatches =
+ std::count(std::begin(Match), std::end(Match), Match_Success);
+ if (NumSuccessfulMatches == 1) {
+ Inst.setLoc(IDLoc);
+ if (!MatchingInlineAsm)
+ EmitInstruction(Inst, Operands, Out);
+ Opcode = Inst.getOpcode();
+ return false;
+ }
+
+ // Otherwise, the match failed, try to produce a decent error message.
+
+ // If we had multiple suffix matches, then identify this as an ambiguous
+ // match.
+ if (NumSuccessfulMatches > 1) {
+ char MatchChars[4];
+ unsigned NumMatches = 0;
+ for (unsigned I = 0, E = array_lengthof(Match); I != E; ++I)
+ if (Match[I] == Match_Success)
+ MatchChars[NumMatches++] = Suffixes[I];
+
+ SmallString<126> Msg;
+ raw_svector_ostream OS(Msg);
+ OS << "ambiguous instructions require an explicit suffix (could be ";
+ for (unsigned i = 0; i != NumMatches; ++i) {
+ if (i != 0)
+ OS << ", ";
+ if (i + 1 == NumMatches)
+ OS << "or ";
+ OS << "'" << Base << MatchChars[i] << "'";
+ }
+ OS << ")";
+ Error(IDLoc, OS.str(), EmptyRanges, MatchingInlineAsm);
+ return true;
+ }
+
+ // Okay, we know that none of the variants matched successfully.
+
+ // If all of the instructions reported an invalid mnemonic, then the original
+ // mnemonic was invalid.
+ if (std::count(std::begin(Match), std::end(Match), Match_MnemonicFail) == 4) {
+ if (!WasOriginallyInvalidOperand) {
+ ArrayRef<SMRange> Ranges =
+ MatchingInlineAsm ? EmptyRanges : Op.getLocRange();
+ return Error(IDLoc, "invalid instruction mnemonic '" + Base + "'",
+ Ranges, MatchingInlineAsm);
+ }
+
+ // Recover location info for the operand if we know which was the problem.
+ if (ErrorInfo != ~0ULL) {
+ if (ErrorInfo >= Operands.size())
+ return Error(IDLoc, "too few operands for instruction",
+ EmptyRanges, MatchingInlineAsm);
+
+ X86Operand &Operand = (X86Operand &)*Operands[ErrorInfo];
+ if (Operand.getStartLoc().isValid()) {
+ SMRange OperandRange = Operand.getLocRange();
+ return Error(Operand.getStartLoc(), "invalid operand for instruction",
+ OperandRange, MatchingInlineAsm);
+ }
+ }
+
+ return Error(IDLoc, "invalid operand for instruction", EmptyRanges,
+ MatchingInlineAsm);
+ }
+
+ // If one instruction matched with a missing feature, report this as a
+ // missing feature.
+ if (std::count(std::begin(Match), std::end(Match),
+ Match_MissingFeature) == 1) {
+ ErrorInfo = ErrorInfoMissingFeature;
+ return ErrorMissingFeature(IDLoc, ErrorInfoMissingFeature,
+ MatchingInlineAsm);
+ }
+
+ // If one instruction matched with an invalid operand, report this as an
+ // operand failure.
+ if (std::count(std::begin(Match), std::end(Match),
+ Match_InvalidOperand) == 1) {
+ return Error(IDLoc, "invalid operand for instruction", EmptyRanges,
+ MatchingInlineAsm);
+ }
+
+ // If all of these were an outright failure, report it in a useless way.
+ Error(IDLoc, "unknown use of instruction mnemonic without a size suffix",
+ EmptyRanges, MatchingInlineAsm);
+ return true;
+}
+
+bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode,
+ OperandVector &Operands,
+ MCStreamer &Out,
+ uint64_t &ErrorInfo,
+ bool MatchingInlineAsm) {
+ assert(!Operands.empty() && "Unexpect empty operand list!");
+ X86Operand &Op = static_cast<X86Operand &>(*Operands[0]);
+ assert(Op.isToken() && "Leading operand should always be a mnemonic!");
+ StringRef Mnemonic = Op.getToken();
+ ArrayRef<SMRange> EmptyRanges = None;
+
+ // First, handle aliases that expand to multiple instructions.
+ MatchFPUWaitAlias(IDLoc, Op, Operands, Out, MatchingInlineAsm);
+
+ MCInst Inst;
+
+ // Find one unsized memory operand, if present.
+ X86Operand *UnsizedMemOp = nullptr;
+ for (const auto &Op : Operands) {
+ X86Operand *X86Op = static_cast<X86Operand *>(Op.get());
+ if (X86Op->isMemUnsized())
+ UnsizedMemOp = X86Op;
+ }
+
+ // Allow some instructions to have implicitly pointer-sized operands. This is
+ // compatible with gas.
+ if (UnsizedMemOp) {
+ static const char *const PtrSizedInstrs[] = {"call", "jmp", "push"};
+ for (const char *Instr : PtrSizedInstrs) {
+ if (Mnemonic == Instr) {
+ UnsizedMemOp->Mem.Size = getPointerWidth();
+ break;
+ }
+ }
+ }
+
+ // If an unsized memory operand is present, try to match with each memory
+ // operand size. In Intel assembly, the size is not part of the instruction
+ // mnemonic.
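+  // For example, "inc dword ptr [eax]" is fully sized at parse time, but a
+  // bare "inc [eax]" is retried here with each candidate operand size.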
+ SmallVector<unsigned, 8> Match;
+ uint64_t ErrorInfoMissingFeature = 0;
+ if (UnsizedMemOp && UnsizedMemOp->isMemUnsized()) {
+ static const unsigned MopSizes[] = {8, 16, 32, 64, 80, 128, 256, 512};
+ for (unsigned Size : MopSizes) {
+ UnsizedMemOp->Mem.Size = Size;
+ uint64_t ErrorInfoIgnore;
+ unsigned LastOpcode = Inst.getOpcode();
+ unsigned M =
+ MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore,
+ MatchingInlineAsm, isParsingIntelSyntax());
+ if (Match.empty() || LastOpcode != Inst.getOpcode())
+ Match.push_back(M);
+
+ // If this returned as a missing feature failure, remember that.
+ if (Match.back() == Match_MissingFeature)
+ ErrorInfoMissingFeature = ErrorInfoIgnore;
+ }
+
+ // Restore the size of the unsized memory operand if we modified it.
+ if (UnsizedMemOp)
+ UnsizedMemOp->Mem.Size = 0;
+ }
+
+ // If we haven't matched anything yet, this is not a basic integer or FPU
+ // operation. There shouldn't be any ambiguity in our mnemonic table, so try
+ // matching with the unsized operand.
+ if (Match.empty()) {
+ Match.push_back(MatchInstructionImpl(Operands, Inst, ErrorInfo,
+ MatchingInlineAsm,
+ isParsingIntelSyntax()));
+ // If this returned as a missing feature failure, remember that.
+ if (Match.back() == Match_MissingFeature)
+ ErrorInfoMissingFeature = ErrorInfo;
+ }
+
+ // Restore the size of the unsized memory operand if we modified it.
+ if (UnsizedMemOp)
+ UnsizedMemOp->Mem.Size = 0;
+
+ // If it's a bad mnemonic, all results will be the same.
+ if (Match.back() == Match_MnemonicFail) {
+ ArrayRef<SMRange> Ranges =
+ MatchingInlineAsm ? EmptyRanges : Op.getLocRange();
+ return Error(IDLoc, "invalid instruction mnemonic '" + Mnemonic + "'",
+ Ranges, MatchingInlineAsm);
+ }
+
+ // If exactly one matched, then we treat that as a successful match (and the
+ // instruction will already have been filled in correctly, since the failing
+ // matches won't have modified it).
+ unsigned NumSuccessfulMatches =
+ std::count(std::begin(Match), std::end(Match), Match_Success);
+ if (NumSuccessfulMatches == 1) {
+ // Some instructions need post-processing to, for example, tweak which
+ // encoding is selected. Loop on it while changes happen so the individual
+ // transformations can chain off each other.
+ if (!MatchingInlineAsm)
+ while (processInstruction(Inst, Operands))
+ ;
+ Inst.setLoc(IDLoc);
+ if (!MatchingInlineAsm)
+ EmitInstruction(Inst, Operands, Out);
+ Opcode = Inst.getOpcode();
+ return false;
+ } else if (NumSuccessfulMatches > 1) {
+ assert(UnsizedMemOp &&
+ "multiple matches only possible with unsized memory operands");
+ ArrayRef<SMRange> Ranges =
+ MatchingInlineAsm ? EmptyRanges : UnsizedMemOp->getLocRange();
+ return Error(UnsizedMemOp->getStartLoc(),
+ "ambiguous operand size for instruction '" + Mnemonic + "\'",
+ Ranges, MatchingInlineAsm);
+ }
+
+ // If one instruction matched with a missing feature, report this as a
+ // missing feature.
+ if (std::count(std::begin(Match), std::end(Match),
+ Match_MissingFeature) == 1) {
+ ErrorInfo = ErrorInfoMissingFeature;
+ return ErrorMissingFeature(IDLoc, ErrorInfoMissingFeature,
+ MatchingInlineAsm);
+ }
+
+ // If one instruction matched with an invalid operand, report this as an
+ // operand failure.
+ if (std::count(std::begin(Match), std::end(Match),
+ Match_InvalidOperand) == 1) {
+ return Error(IDLoc, "invalid operand for instruction", EmptyRanges,
+ MatchingInlineAsm);
+ }
+
+ // If all of these were an outright failure, report it in a useless way.
+ return Error(IDLoc, "unknown instruction mnemonic", EmptyRanges,
+ MatchingInlineAsm);
+}
+
+bool X86AsmParser::OmitRegisterFromClobberLists(unsigned RegNo) {
+ return X86MCRegisterClasses[X86::SEGMENT_REGRegClassID].contains(RegNo);
+}
+
+bool X86AsmParser::ParseDirective(AsmToken DirectiveID) {
+ MCAsmParser &Parser = getParser();
+ StringRef IDVal = DirectiveID.getIdentifier();
+ if (IDVal == ".word")
+ return ParseDirectiveWord(2, DirectiveID.getLoc());
+ else if (IDVal.startswith(".code"))
+ return ParseDirectiveCode(IDVal, DirectiveID.getLoc());
+ else if (IDVal.startswith(".att_syntax")) {
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ if (Parser.getTok().getString() == "prefix")
+ Parser.Lex();
+ else if (Parser.getTok().getString() == "noprefix")
+ return Error(DirectiveID.getLoc(), "'.att_syntax noprefix' is not "
+ "supported: registers must have a "
+ "'%' prefix in .att_syntax");
+ }
+ getParser().setAssemblerDialect(0);
+ return false;
+ } else if (IDVal.startswith(".intel_syntax")) {
+ getParser().setAssemblerDialect(1);
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ if (Parser.getTok().getString() == "noprefix")
+ Parser.Lex();
+ else if (Parser.getTok().getString() == "prefix")
+ return Error(DirectiveID.getLoc(), "'.intel_syntax prefix' is not "
+ "supported: registers must not have "
+ "a '%' prefix in .intel_syntax");
+ }
+ return false;
+ } else if (IDVal == ".even")
+ return parseDirectiveEven(DirectiveID.getLoc());
+ return true;
+}
+
+/// parseDirectiveEven
+/// ::= .even
+bool X86AsmParser::parseDirectiveEven(SMLoc L) {
+ const MCSection *Section = getStreamer().getCurrentSection().first;
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ TokError("unexpected token in directive");
+ return false;
+ }
+ if (!Section) {
+ getStreamer().InitSections(false);
+ Section = getStreamer().getCurrentSection().first;
+ }
+ if (Section->UseCodeAlign())
+ getStreamer().EmitCodeAlignment(2, 0);
+ else
+ getStreamer().EmitValueToAlignment(2, 0, 1, 0);
+ return false;
+}
+
+/// ParseDirectiveWord
+/// ::= .word [ expression (, expression)* ]
+bool X86AsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) {
+ MCAsmParser &Parser = getParser();
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ for (;;) {
+ const MCExpr *Value;
+ SMLoc ExprLoc = getLexer().getLoc();
+ if (getParser().parseExpression(Value))
+ return false;
+
+ if (const auto *MCE = dyn_cast<MCConstantExpr>(Value)) {
+ assert(Size <= 8 && "Invalid size");
+ uint64_t IntValue = MCE->getValue();
+ if (!isUIntN(8 * Size, IntValue) && !isIntN(8 * Size, IntValue))
+ return Error(ExprLoc, "literal value out of range for directive");
+ getStreamer().EmitIntValue(IntValue, Size);
+ } else {
+ getStreamer().EmitValue(Value, Size, ExprLoc);
+ }
+
+ if (getLexer().is(AsmToken::EndOfStatement))
+ break;
+
+ // FIXME: Improve diagnostic.
+ if (getLexer().isNot(AsmToken::Comma)) {
+ Error(L, "unexpected token in directive");
+ return false;
+ }
+ Parser.Lex();
+ }
+ }
+
+ Parser.Lex();
+ return false;
+}
+
+/// ParseDirectiveCode
+/// ::= .code16 | .code32 | .code64
+bool X86AsmParser::ParseDirectiveCode(StringRef IDVal, SMLoc L) {
+ MCAsmParser &Parser = getParser();
+ if (IDVal == ".code16") {
+ Parser.Lex();
+ if (!is16BitMode()) {
+ SwitchMode(X86::Mode16Bit);
+ getParser().getStreamer().EmitAssemblerFlag(MCAF_Code16);
+ }
+ } else if (IDVal == ".code32") {
+ Parser.Lex();
+ if (!is32BitMode()) {
+ SwitchMode(X86::Mode32Bit);
+ getParser().getStreamer().EmitAssemblerFlag(MCAF_Code32);
+ }
+ } else if (IDVal == ".code64") {
+ Parser.Lex();
+ if (!is64BitMode()) {
+ SwitchMode(X86::Mode64Bit);
+ getParser().getStreamer().EmitAssemblerFlag(MCAF_Code64);
+ }
+ } else {
+ Error(L, "unknown directive " + IDVal);
+ return false;
+ }
+
+ return false;
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeX86AsmParser() {
+ RegisterMCAsmParser<X86AsmParser> X(TheX86_32Target);
+ RegisterMCAsmParser<X86AsmParser> Y(TheX86_64Target);
+}
+
+#define GET_REGISTER_MATCHER
+#define GET_MATCHER_IMPLEMENTATION
+#define GET_SUBTARGET_FEATURE_NAME
+#include "X86GenAsmMatcher.inc"
diff --git a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParserCommon.h b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParserCommon.h
new file mode 100644
index 0000000..54538c8
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParserCommon.h
@@ -0,0 +1,39 @@
+//===-- X86AsmParserCommon.h - Common functions for X86AsmParser ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_ASMPARSER_X86ASMPARSERCOMMON_H
+#define LLVM_LIB_TARGET_X86_ASMPARSER_X86ASMPARSERCOMMON_H
+
+#include "llvm/Support/MathExtras.h"
+
+namespace llvm {
+
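+// Immediate range predicates shared by the AT&T and Intel operand checks.
+// Each isImmSExtiNi8Value helper accepts a value that either already fits
+// in a signed 8-bit immediate or is an unsigned N-bit value equal to the
+// sign extension of its own low byte.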
+inline bool isImmSExti16i8Value(uint64_t Value) {
+ return isInt<8>(Value) ||
+ (isUInt<16>(Value) && isInt<8>(static_cast<int16_t>(Value)));
+}
+
+inline bool isImmSExti32i8Value(uint64_t Value) {
+ return isInt<8>(Value) ||
+ (isUInt<32>(Value) && isInt<8>(static_cast<int32_t>(Value)));
+}
+
+inline bool isImmSExti64i8Value(uint64_t Value) {
+ return isInt<8>(Value);
+}
+
+inline bool isImmSExti64i32Value(uint64_t Value) {
+ return isInt<32>(Value);
+}
+
+inline bool isImmUnsignedi8Value(uint64_t Value) {
+ return isUInt<8>(Value) || isInt<8>(Value);
+}
+
+} // End of namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/AsmParser/X86Operand.h b/contrib/llvm/lib/Target/X86/AsmParser/X86Operand.h
new file mode 100644
index 0000000..7ec0240
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/AsmParser/X86Operand.h
@@ -0,0 +1,543 @@
+//===-- X86Operand.h - Parsed X86 machine instruction --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_ASMPARSER_X86OPERAND_H
+#define LLVM_LIB_TARGET_X86_ASMPARSER_X86OPERAND_H
+
+#include "X86AsmParserCommon.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/ADT/STLExtras.h"
+#include "MCTargetDesc/X86MCTargetDesc.h"
+
+namespace llvm {
+
+/// X86Operand - Instances of this class represent a parsed X86 machine
+/// instruction.
+struct X86Operand : public MCParsedAsmOperand {
+ enum KindTy {
+ Token,
+ Register,
+ Immediate,
+ Memory
+ } Kind;
+
+ SMLoc StartLoc, EndLoc;
+ SMLoc OffsetOfLoc;
+ StringRef SymName;
+ void *OpDecl;
+ bool AddressOf;
+
+ struct TokOp {
+ const char *Data;
+ unsigned Length;
+ };
+
+ struct RegOp {
+ unsigned RegNo;
+ };
+
+ struct ImmOp {
+ const MCExpr *Val;
+ };
+
+ struct MemOp {
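+    // A memory operand has the form SegReg:[BaseReg + Scale*IndexReg +
+    // Disp]. Size is the operand size in bits (0 when unknown); ModeSize is
+    // the bit width of the mode (16, 32 or 64) it was parsed in.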
+ unsigned SegReg;
+ const MCExpr *Disp;
+ unsigned BaseReg;
+ unsigned IndexReg;
+ unsigned Scale;
+ unsigned Size;
+ unsigned ModeSize;
+ };
+
+ union {
+ struct TokOp Tok;
+ struct RegOp Reg;
+ struct ImmOp Imm;
+ struct MemOp Mem;
+ };
+
+ X86Operand(KindTy K, SMLoc Start, SMLoc End)
+ : Kind(K), StartLoc(Start), EndLoc(End) {}
+
+ StringRef getSymName() override { return SymName; }
+ void *getOpDecl() override { return OpDecl; }
+
+ /// getStartLoc - Get the location of the first token of this operand.
+ SMLoc getStartLoc() const override { return StartLoc; }
+ /// getEndLoc - Get the location of the last token of this operand.
+ SMLoc getEndLoc() const override { return EndLoc; }
+ /// getLocRange - Get the range between the first and last token of this
+ /// operand.
+ SMRange getLocRange() const { return SMRange(StartLoc, EndLoc); }
+ /// getOffsetOfLoc - Get the location of the offset operator.
+ SMLoc getOffsetOfLoc() const override { return OffsetOfLoc; }
+
+ void print(raw_ostream &OS) const override {}
+
+ StringRef getToken() const {
+ assert(Kind == Token && "Invalid access!");
+ return StringRef(Tok.Data, Tok.Length);
+ }
+ void setTokenValue(StringRef Value) {
+ assert(Kind == Token && "Invalid access!");
+ Tok.Data = Value.data();
+ Tok.Length = Value.size();
+ }
+
+ unsigned getReg() const override {
+ assert(Kind == Register && "Invalid access!");
+ return Reg.RegNo;
+ }
+
+ const MCExpr *getImm() const {
+ assert(Kind == Immediate && "Invalid access!");
+ return Imm.Val;
+ }
+
+ const MCExpr *getMemDisp() const {
+ assert(Kind == Memory && "Invalid access!");
+ return Mem.Disp;
+ }
+ unsigned getMemSegReg() const {
+ assert(Kind == Memory && "Invalid access!");
+ return Mem.SegReg;
+ }
+ unsigned getMemBaseReg() const {
+ assert(Kind == Memory && "Invalid access!");
+ return Mem.BaseReg;
+ }
+ unsigned getMemIndexReg() const {
+ assert(Kind == Memory && "Invalid access!");
+ return Mem.IndexReg;
+ }
+ unsigned getMemScale() const {
+ assert(Kind == Memory && "Invalid access!");
+ return Mem.Scale;
+ }
+ unsigned getMemModeSize() const {
+ assert(Kind == Memory && "Invalid access!");
+ return Mem.ModeSize;
+ }
+
+  bool isToken() const override { return Kind == Token; }
+
+ bool isImm() const override { return Kind == Immediate; }
+
+ bool isImmSExti16i8() const {
+ if (!isImm())
+ return false;
+
+ // If this isn't a constant expr, just assume it fits and let relaxation
+ // handle it.
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ if (!CE)
+ return true;
+
+ // Otherwise, check the value is in a range that makes sense for this
+ // extension.
+ return isImmSExti16i8Value(CE->getValue());
+ }
+ bool isImmSExti32i8() const {
+ if (!isImm())
+ return false;
+
+ // If this isn't a constant expr, just assume it fits and let relaxation
+ // handle it.
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ if (!CE)
+ return true;
+
+ // Otherwise, check the value is in a range that makes sense for this
+ // extension.
+ return isImmSExti32i8Value(CE->getValue());
+ }
+ bool isImmSExti64i8() const {
+ if (!isImm())
+ return false;
+
+ // If this isn't a constant expr, just assume it fits and let relaxation
+ // handle it.
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ if (!CE)
+ return true;
+
+ // Otherwise, check the value is in a range that makes sense for this
+ // extension.
+ return isImmSExti64i8Value(CE->getValue());
+ }
+ bool isImmSExti64i32() const {
+ if (!isImm())
+ return false;
+
+ // If this isn't a constant expr, just assume it fits and let relaxation
+ // handle it.
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ if (!CE)
+ return true;
+
+ // Otherwise, check the value is in a range that makes sense for this
+ // extension.
+ return isImmSExti64i32Value(CE->getValue());
+ }
+
+ bool isImmUnsignedi8() const {
+ if (!isImm()) return false;
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ if (!CE) return false;
+ return isImmUnsignedi8Value(CE->getValue());
+ }
+
+ bool isOffsetOf() const override {
+ return OffsetOfLoc.getPointer();
+ }
+
+ bool needAddressOf() const override {
+ return AddressOf;
+ }
+
+ bool isMem() const override { return Kind == Memory; }
+ bool isMemUnsized() const {
+ return Kind == Memory && Mem.Size == 0;
+ }
+ bool isMem8() const {
+ return Kind == Memory && (!Mem.Size || Mem.Size == 8);
+ }
+ bool isMem16() const {
+ return Kind == Memory && (!Mem.Size || Mem.Size == 16);
+ }
+ bool isMem32() const {
+ return Kind == Memory && (!Mem.Size || Mem.Size == 32);
+ }
+ bool isMem64() const {
+ return Kind == Memory && (!Mem.Size || Mem.Size == 64);
+ }
+ bool isMem80() const {
+ return Kind == Memory && (!Mem.Size || Mem.Size == 80);
+ }
+ bool isMem128() const {
+ return Kind == Memory && (!Mem.Size || Mem.Size == 128);
+ }
+ bool isMem256() const {
+ return Kind == Memory && (!Mem.Size || Mem.Size == 256);
+ }
+ bool isMem512() const {
+ return Kind == Memory && (!Mem.Size || Mem.Size == 512);
+ }
+
+ bool isMemVX32() const {
+ return Kind == Memory && (!Mem.Size || Mem.Size == 32) &&
+ getMemIndexReg() >= X86::XMM0 && getMemIndexReg() <= X86::XMM15;
+ }
+ bool isMemVX32X() const {
+ return Kind == Memory && (!Mem.Size || Mem.Size == 32) &&
+ getMemIndexReg() >= X86::XMM0 && getMemIndexReg() <= X86::XMM31;
+ }
+ bool isMemVY32() const {
+ return Kind == Memory && (!Mem.Size || Mem.Size == 32) &&
+ getMemIndexReg() >= X86::YMM0 && getMemIndexReg() <= X86::YMM15;
+ }
+ bool isMemVY32X() const {
+ return Kind == Memory && (!Mem.Size || Mem.Size == 32) &&
+ getMemIndexReg() >= X86::YMM0 && getMemIndexReg() <= X86::YMM31;
+ }
+ bool isMemVX64() const {
+ return Kind == Memory && (!Mem.Size || Mem.Size == 64) &&
+ getMemIndexReg() >= X86::XMM0 && getMemIndexReg() <= X86::XMM15;
+ }
+ bool isMemVX64X() const {
+ return Kind == Memory && (!Mem.Size || Mem.Size == 64) &&
+ getMemIndexReg() >= X86::XMM0 && getMemIndexReg() <= X86::XMM31;
+ }
+ bool isMemVY64() const {
+ return Kind == Memory && (!Mem.Size || Mem.Size == 64) &&
+ getMemIndexReg() >= X86::YMM0 && getMemIndexReg() <= X86::YMM15;
+ }
+ bool isMemVY64X() const {
+ return Kind == Memory && (!Mem.Size || Mem.Size == 64) &&
+ getMemIndexReg() >= X86::YMM0 && getMemIndexReg() <= X86::YMM31;
+ }
+ bool isMemVZ32() const {
+ return Kind == Memory && (!Mem.Size || Mem.Size == 32) &&
+ getMemIndexReg() >= X86::ZMM0 && getMemIndexReg() <= X86::ZMM31;
+ }
+ bool isMemVZ64() const {
+ return Kind == Memory && (!Mem.Size || Mem.Size == 64) &&
+ getMemIndexReg() >= X86::ZMM0 && getMemIndexReg() <= X86::ZMM31;
+ }
+
+ bool isAbsMem() const {
+ return Kind == Memory && !getMemSegReg() && !getMemBaseReg() &&
+ !getMemIndexReg() && getMemScale() == 1;
+ }
+  bool isAVX512RC() const {
+ return isImm();
+ }
+
+ bool isAbsMem16() const {
+ return isAbsMem() && Mem.ModeSize == 16;
+ }
+
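+  // A string-instruction source operand: a bare (%si)/(%esi)/(%rsi) memory
+  // reference with no index register, unit scale and zero displacement.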
+ bool isSrcIdx() const {
+ return !getMemIndexReg() && getMemScale() == 1 &&
+ (getMemBaseReg() == X86::RSI || getMemBaseReg() == X86::ESI ||
+ getMemBaseReg() == X86::SI) && isa<MCConstantExpr>(getMemDisp()) &&
+ cast<MCConstantExpr>(getMemDisp())->getValue() == 0;
+ }
+ bool isSrcIdx8() const {
+ return isMem8() && isSrcIdx();
+ }
+ bool isSrcIdx16() const {
+ return isMem16() && isSrcIdx();
+ }
+ bool isSrcIdx32() const {
+ return isMem32() && isSrcIdx();
+ }
+ bool isSrcIdx64() const {
+ return isMem64() && isSrcIdx();
+ }
+
+ bool isDstIdx() const {
+ return !getMemIndexReg() && getMemScale() == 1 &&
+ (getMemSegReg() == 0 || getMemSegReg() == X86::ES) &&
+ (getMemBaseReg() == X86::RDI || getMemBaseReg() == X86::EDI ||
+ getMemBaseReg() == X86::DI) && isa<MCConstantExpr>(getMemDisp()) &&
+ cast<MCConstantExpr>(getMemDisp())->getValue() == 0;
+ }
+ bool isDstIdx8() const {
+ return isMem8() && isDstIdx();
+ }
+ bool isDstIdx16() const {
+ return isMem16() && isDstIdx();
+ }
+ bool isDstIdx32() const {
+ return isMem32() && isDstIdx();
+ }
+ bool isDstIdx64() const {
+ return isMem64() && isDstIdx();
+ }
+
+ bool isMemOffs() const {
+ return Kind == Memory && !getMemBaseReg() && !getMemIndexReg() &&
+ getMemScale() == 1;
+ }
+
+ bool isMemOffs16_8() const {
+ return isMemOffs() && Mem.ModeSize == 16 && (!Mem.Size || Mem.Size == 8);
+ }
+ bool isMemOffs16_16() const {
+ return isMemOffs() && Mem.ModeSize == 16 && (!Mem.Size || Mem.Size == 16);
+ }
+ bool isMemOffs16_32() const {
+ return isMemOffs() && Mem.ModeSize == 16 && (!Mem.Size || Mem.Size == 32);
+ }
+ bool isMemOffs32_8() const {
+ return isMemOffs() && Mem.ModeSize == 32 && (!Mem.Size || Mem.Size == 8);
+ }
+ bool isMemOffs32_16() const {
+ return isMemOffs() && Mem.ModeSize == 32 && (!Mem.Size || Mem.Size == 16);
+ }
+ bool isMemOffs32_32() const {
+ return isMemOffs() && Mem.ModeSize == 32 && (!Mem.Size || Mem.Size == 32);
+ }
+ bool isMemOffs32_64() const {
+ return isMemOffs() && Mem.ModeSize == 32 && (!Mem.Size || Mem.Size == 64);
+ }
+ bool isMemOffs64_8() const {
+ return isMemOffs() && Mem.ModeSize == 64 && (!Mem.Size || Mem.Size == 8);
+ }
+ bool isMemOffs64_16() const {
+ return isMemOffs() && Mem.ModeSize == 64 && (!Mem.Size || Mem.Size == 16);
+ }
+ bool isMemOffs64_32() const {
+ return isMemOffs() && Mem.ModeSize == 64 && (!Mem.Size || Mem.Size == 32);
+ }
+ bool isMemOffs64_64() const {
+ return isMemOffs() && Mem.ModeSize == 64 && (!Mem.Size || Mem.Size == 64);
+ }
+
+ bool isReg() const override { return Kind == Register; }
+
+ bool isGR32orGR64() const {
+ return Kind == Register &&
+ (X86MCRegisterClasses[X86::GR32RegClassID].contains(getReg()) ||
+ X86MCRegisterClasses[X86::GR64RegClassID].contains(getReg()));
+ }
+
+ void addExpr(MCInst &Inst, const MCExpr *Expr) const {
+ // Add as immediates when possible.
+ if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr))
+ Inst.addOperand(MCOperand::createImm(CE->getValue()));
+ else
+ Inst.addOperand(MCOperand::createExpr(Expr));
+ }
+
+ void addRegOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::createReg(getReg()));
+ }
+
+ static unsigned getGR32FromGR64(unsigned RegNo) {
+ switch (RegNo) {
+ default: llvm_unreachable("Unexpected register");
+ case X86::RAX: return X86::EAX;
+ case X86::RCX: return X86::ECX;
+ case X86::RDX: return X86::EDX;
+ case X86::RBX: return X86::EBX;
+ case X86::RBP: return X86::EBP;
+ case X86::RSP: return X86::ESP;
+ case X86::RSI: return X86::ESI;
+ case X86::RDI: return X86::EDI;
+ case X86::R8: return X86::R8D;
+ case X86::R9: return X86::R9D;
+ case X86::R10: return X86::R10D;
+ case X86::R11: return X86::R11D;
+ case X86::R12: return X86::R12D;
+ case X86::R13: return X86::R13D;
+ case X86::R14: return X86::R14D;
+ case X86::R15: return X86::R15D;
+ case X86::RIP: return X86::EIP;
+ }
+ }
+
+ void addGR32orGR64Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ unsigned RegNo = getReg();
+ if (X86MCRegisterClasses[X86::GR64RegClassID].contains(RegNo))
+ RegNo = getGR32FromGR64(RegNo);
+ Inst.addOperand(MCOperand::createReg(RegNo));
+ }
+ void addAVX512RCOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ addExpr(Inst, getImm());
+ }
+ void addImmOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ addExpr(Inst, getImm());
+ }
+
+ void addMemOperands(MCInst &Inst, unsigned N) const {
+ assert((N == 5) && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::createReg(getMemBaseReg()));
+ Inst.addOperand(MCOperand::createImm(getMemScale()));
+ Inst.addOperand(MCOperand::createReg(getMemIndexReg()));
+ addExpr(Inst, getMemDisp());
+ Inst.addOperand(MCOperand::createReg(getMemSegReg()));
+ }
+
+ void addAbsMemOperands(MCInst &Inst, unsigned N) const {
+ assert((N == 1) && "Invalid number of operands!");
+ // Add as immediates when possible.
+ if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getMemDisp()))
+ Inst.addOperand(MCOperand::createImm(CE->getValue()));
+ else
+ Inst.addOperand(MCOperand::createExpr(getMemDisp()));
+ }
+
+ void addSrcIdxOperands(MCInst &Inst, unsigned N) const {
+ assert((N == 2) && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::createReg(getMemBaseReg()));
+ Inst.addOperand(MCOperand::createReg(getMemSegReg()));
+ }
+ void addDstIdxOperands(MCInst &Inst, unsigned N) const {
+ assert((N == 1) && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::createReg(getMemBaseReg()));
+ }
+
+ void addMemOffsOperands(MCInst &Inst, unsigned N) const {
+ assert((N == 2) && "Invalid number of operands!");
+ // Add as immediates when possible.
+ if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getMemDisp()))
+ Inst.addOperand(MCOperand::createImm(CE->getValue()));
+ else
+ Inst.addOperand(MCOperand::createExpr(getMemDisp()));
+ Inst.addOperand(MCOperand::createReg(getMemSegReg()));
+ }
+
+ static std::unique_ptr<X86Operand> CreateToken(StringRef Str, SMLoc Loc) {
+ SMLoc EndLoc = SMLoc::getFromPointer(Loc.getPointer() + Str.size());
+ auto Res = llvm::make_unique<X86Operand>(Token, Loc, EndLoc);
+ Res->Tok.Data = Str.data();
+ Res->Tok.Length = Str.size();
+ return Res;
+ }
+
+ static std::unique_ptr<X86Operand>
+ CreateReg(unsigned RegNo, SMLoc StartLoc, SMLoc EndLoc,
+ bool AddressOf = false, SMLoc OffsetOfLoc = SMLoc(),
+ StringRef SymName = StringRef(), void *OpDecl = nullptr) {
+ auto Res = llvm::make_unique<X86Operand>(Register, StartLoc, EndLoc);
+ Res->Reg.RegNo = RegNo;
+ Res->AddressOf = AddressOf;
+ Res->OffsetOfLoc = OffsetOfLoc;
+ Res->SymName = SymName;
+ Res->OpDecl = OpDecl;
+ return Res;
+ }
+
+ static std::unique_ptr<X86Operand> CreateImm(const MCExpr *Val,
+ SMLoc StartLoc, SMLoc EndLoc) {
+ auto Res = llvm::make_unique<X86Operand>(Immediate, StartLoc, EndLoc);
+ Res->Imm.Val = Val;
+ return Res;
+ }
+
+ /// Create an absolute memory operand.
+ static std::unique_ptr<X86Operand>
+ CreateMem(unsigned ModeSize, const MCExpr *Disp, SMLoc StartLoc, SMLoc EndLoc,
+ unsigned Size = 0, StringRef SymName = StringRef(),
+ void *OpDecl = nullptr) {
+ auto Res = llvm::make_unique<X86Operand>(Memory, StartLoc, EndLoc);
+ Res->Mem.SegReg = 0;
+ Res->Mem.Disp = Disp;
+ Res->Mem.BaseReg = 0;
+ Res->Mem.IndexReg = 0;
+ Res->Mem.Scale = 1;
+ Res->Mem.Size = Size;
+ Res->Mem.ModeSize = ModeSize;
+ Res->SymName = SymName;
+ Res->OpDecl = OpDecl;
+ Res->AddressOf = false;
+ return Res;
+ }
+
+ /// Create a generalized memory operand.
+ static std::unique_ptr<X86Operand>
+ CreateMem(unsigned ModeSize, unsigned SegReg, const MCExpr *Disp,
+ unsigned BaseReg, unsigned IndexReg, unsigned Scale, SMLoc StartLoc,
+ SMLoc EndLoc, unsigned Size = 0, StringRef SymName = StringRef(),
+ void *OpDecl = nullptr) {
+ // We should never just have a displacement, that should be parsed as an
+ // absolute memory operand.
+ assert((SegReg || BaseReg || IndexReg) && "Invalid memory operand!");
+
+ // The scale should always be one of {1,2,4,8}.
+ assert(((Scale == 1 || Scale == 2 || Scale == 4 || Scale == 8)) &&
+ "Invalid scale!");
+ auto Res = llvm::make_unique<X86Operand>(Memory, StartLoc, EndLoc);
+ Res->Mem.SegReg = SegReg;
+ Res->Mem.Disp = Disp;
+ Res->Mem.BaseReg = BaseReg;
+ Res->Mem.IndexReg = IndexReg;
+ Res->Mem.Scale = Scale;
+ Res->Mem.Size = Size;
+ Res->Mem.ModeSize = ModeSize;
+ Res->SymName = SymName;
+ Res->OpDecl = OpDecl;
+ Res->AddressOf = false;
+ return Res;
+ }
+};
+
+} // End of namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp b/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp
new file mode 100644
index 0000000..ce8fcf1
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp
@@ -0,0 +1,1009 @@
+//===-- X86Disassembler.cpp - Disassembler for x86 and x86_64 -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is part of the X86 Disassembler.
+// It contains code to translate the data produced by the decoder into
+// MCInsts.
+// Documentation for the disassembler can be found in X86Disassembler.h.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86Disassembler.h"
+#include "X86DisassemblerDecoder.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDisassembler.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+using namespace llvm::X86Disassembler;
+
+#define DEBUG_TYPE "x86-disassembler"
+
+#define GET_REGINFO_ENUM
+#include "X86GenRegisterInfo.inc"
+#define GET_INSTRINFO_ENUM
+#include "X86GenInstrInfo.inc"
+#define GET_SUBTARGETINFO_ENUM
+#include "X86GenSubtargetInfo.inc"
+
+void llvm::X86Disassembler::Debug(const char *file, unsigned line,
+ const char *s) {
+ dbgs() << file << ":" << line << ": " << s;
+}
+
+const char *llvm::X86Disassembler::GetInstrName(unsigned Opcode,
+ const void *mii) {
+ const MCInstrInfo *MII = static_cast<const MCInstrInfo *>(mii);
+ return MII->getName(Opcode);
+}
+
+#define debug(s) DEBUG(Debug(__FILE__, __LINE__, s));
+
+namespace llvm {
+
+// Fill-ins to make the compiler happy. These constants are never actually
+// assigned; they are just filler to make an automatically-generated switch
+// statement work.
+namespace X86 {
+ enum {
+ BX_SI = 500,
+ BX_DI = 501,
+ BP_SI = 502,
+ BP_DI = 503,
+ sib = 504,
+ sib64 = 505
+ };
+}
+
+extern Target TheX86_32Target, TheX86_64Target;
+
+}
+
+static bool translateInstruction(MCInst &target,
+ InternalInstruction &source,
+ const MCDisassembler *Dis);
+
+X86GenericDisassembler::X86GenericDisassembler(
+ const MCSubtargetInfo &STI,
+ MCContext &Ctx,
+ std::unique_ptr<const MCInstrInfo> MII)
+ : MCDisassembler(STI, Ctx), MII(std::move(MII)) {
+ const FeatureBitset &FB = STI.getFeatureBits();
+ if (FB[X86::Mode16Bit]) {
+ fMode = MODE_16BIT;
+ return;
+ } else if (FB[X86::Mode32Bit]) {
+ fMode = MODE_32BIT;
+ return;
+ } else if (FB[X86::Mode64Bit]) {
+ fMode = MODE_64BIT;
+ return;
+ }
+
+ llvm_unreachable("Invalid CPU mode");
+}
+
+namespace {
+struct Region {
+ ArrayRef<uint8_t> Bytes;
+ uint64_t Base;
+ Region(ArrayRef<uint8_t> Bytes, uint64_t Base) : Bytes(Bytes), Base(Base) {}
+};
+} // end anonymous namespace
+
+/// A callback function that wraps the readByte method from Region.
+///
+/// @param Arg - The generic callback parameter. In this case, this should
+/// be a pointer to a Region.
+/// @param Byte - A pointer to the byte to be read.
+/// @param Address - The address to be read.
+static int regionReader(const void *Arg, uint8_t *Byte, uint64_t Address) {
+ auto *R = static_cast<const Region *>(Arg);
+ ArrayRef<uint8_t> Bytes = R->Bytes;
+ unsigned Index = Address - R->Base;
+ if (Bytes.size() <= Index)
+ return -1;
+ *Byte = Bytes[Index];
+ return 0;
+}
+
+/// logger - A callback function that wraps the operator<< method from
+/// raw_ostream.
+///
+/// @param arg - The generic callback parameter. This should be a pointer
+///              to a raw_ostream.
+/// @param log - A string to be logged. logger() adds a newline.
+static void logger(void *arg, const char *log) {
+ if (!arg)
+ return;
+
+ raw_ostream &vStream = *(static_cast<raw_ostream*>(arg));
+ vStream << log << "\n";
+}
+
+//
+// Public interface for the disassembler
+//
+
+MCDisassembler::DecodeStatus X86GenericDisassembler::getInstruction(
+ MCInst &Instr, uint64_t &Size, ArrayRef<uint8_t> Bytes, uint64_t Address,
+ raw_ostream &VStream, raw_ostream &CStream) const {
+ CommentStream = &CStream;
+
+ InternalInstruction InternalInstr;
+
+ dlog_t LoggerFn = logger;
+ if (&VStream == &nulls())
+ LoggerFn = nullptr; // Disable logging completely if it's going to nulls().
+
+ Region R(Bytes, Address);
+
+ int Ret = decodeInstruction(&InternalInstr, regionReader, (const void *)&R,
+ LoggerFn, (void *)&VStream,
+ (const void *)MII.get(), Address, fMode);
+
+ if (Ret) {
+ Size = InternalInstr.readerCursor - Address;
+ return Fail;
+ } else {
+ Size = InternalInstr.length;
+ return (!translateInstruction(Instr, InternalInstr, this)) ? Success : Fail;
+ }
+}
+
+//
+// Private code that translates from struct InternalInstructions to MCInsts.
+//
+
+/// translateRegister - Translates an internal register to the appropriate LLVM
+/// register, and appends it as an operand to an MCInst.
+///
+/// @param mcInst - The MCInst to append to.
+/// @param reg - The Reg to append.
+static void translateRegister(MCInst &mcInst, Reg reg) {
+#define ENTRY(x) X86::x,
+ uint8_t llvmRegnums[] = {
+ ALL_REGS
+ 0
+ };
+#undef ENTRY
+
+ uint8_t llvmRegnum = llvmRegnums[reg];
+ mcInst.addOperand(MCOperand::createReg(llvmRegnum));
+}
+
+/// tryAddingSymbolicOperand - tries to add a symbolic operand in place of the
+/// immediate Value in the MCInst.
+///
+/// @param Value - The immediate Value, has had any PC adjustment made by
+/// the caller.
+/// @param isBranch - If the instruction is a branch instruction
+/// @param Address - The starting address of the instruction
+/// @param Offset - The byte offset to this immediate in the instruction
+/// @param Width - The byte width of this immediate in the instruction
+///
+/// If the getOpInfo() function was set when setupForSymbolicDisassembly() was
+/// called then that function is called to get any symbolic information for the
+/// immediate in the instruction using the Address, Offset and Width. If that
+/// returns non-zero then the symbolic information it returns is used to create
+/// an MCExpr and that is added as an operand to the MCInst. If getOpInfo()
+/// returns zero and isBranch is true then a symbol look up for immediate Value
+/// is done and if a symbol is found an MCExpr is created with that, else
+/// an MCExpr with the immediate Value is created. This function returns true
+/// if it adds an operand to the MCInst and false otherwise.
+static bool tryAddingSymbolicOperand(int64_t Value, bool isBranch,
+ uint64_t Address, uint64_t Offset,
+ uint64_t Width, MCInst &MI,
+ const MCDisassembler *Dis) {
+ return Dis->tryAddingSymbolicOperand(MI, Value, Address, isBranch,
+ Offset, Width);
+}
+
+/// tryAddingPcLoadReferenceComment - tries to add a comment as to what is
+/// referenced by a load instruction with the base register that is the rip.
+/// These can often be addresses in a literal pool. The Address of the
+/// instruction and its immediate Value are used to determine the address
+/// being referenced in the literal pool entry. The SymbolLookUp call back will
+/// return a pointer to a literal 'C' string if the referenced address is an
+/// address into a section with 'C' string literals.
+static void tryAddingPcLoadReferenceComment(uint64_t Address, uint64_t Value,
+ const void *Decoder) {
+ const MCDisassembler *Dis = static_cast<const MCDisassembler*>(Decoder);
+ Dis->tryAddingPcLoadReferenceComment(Value, Address);
+}
+
+static const uint8_t segmentRegnums[SEG_OVERRIDE_max] = {
+ 0, // SEG_OVERRIDE_NONE
+ X86::CS,
+ X86::SS,
+ X86::DS,
+ X86::ES,
+ X86::FS,
+ X86::GS
+};
+
+/// translateSrcIndex - Appends a source index operand to an MCInst.
+///
+/// @param mcInst - The MCInst to append to.
+/// @param insn - The internal instruction.
+static bool translateSrcIndex(MCInst &mcInst, InternalInstruction &insn) {
+ unsigned baseRegNo;
+
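+  // The 0x67 address-size override prefix toggles between the mode's
+  // default source-index register and the alternate width.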
+ if (insn.mode == MODE_64BIT)
+ baseRegNo = insn.prefixPresent[0x67] ? X86::ESI : X86::RSI;
+ else if (insn.mode == MODE_32BIT)
+ baseRegNo = insn.prefixPresent[0x67] ? X86::SI : X86::ESI;
+ else {
+ assert(insn.mode == MODE_16BIT);
+ baseRegNo = insn.prefixPresent[0x67] ? X86::ESI : X86::SI;
+ }
+ MCOperand baseReg = MCOperand::createReg(baseRegNo);
+ mcInst.addOperand(baseReg);
+
+ MCOperand segmentReg;
+ segmentReg = MCOperand::createReg(segmentRegnums[insn.segmentOverride]);
+ mcInst.addOperand(segmentReg);
+ return false;
+}
+
+/// translateDstIndex - Appends a destination index operand to an MCInst.
+///
+/// @param mcInst - The MCInst to append to.
+/// @param insn - The internal instruction.
+
+static bool translateDstIndex(MCInst &mcInst, InternalInstruction &insn) {
+ unsigned baseRegNo;
+
+ if (insn.mode == MODE_64BIT)
+ baseRegNo = insn.prefixPresent[0x67] ? X86::EDI : X86::RDI;
+ else if (insn.mode == MODE_32BIT)
+ baseRegNo = insn.prefixPresent[0x67] ? X86::DI : X86::EDI;
+ else {
+ assert(insn.mode == MODE_16BIT);
+ baseRegNo = insn.prefixPresent[0x67] ? X86::EDI : X86::DI;
+ }
+ MCOperand baseReg = MCOperand::createReg(baseRegNo);
+ mcInst.addOperand(baseReg);
+ return false;
+}
+
+/// translateImmediate - Appends an immediate operand to an MCInst.
+///
+/// @param mcInst - The MCInst to append to.
+/// @param immediate - The immediate value to append.
+/// @param operand - The operand, as stored in the descriptor table.
+/// @param insn - The internal instruction.
+static void translateImmediate(MCInst &mcInst, uint64_t immediate,
+ const OperandSpecifier &operand,
+ InternalInstruction &insn,
+ const MCDisassembler *Dis) {
+ // Sign-extend the immediate if necessary.
+
+ OperandType type = (OperandType)operand.type;
+
+ bool isBranch = false;
+ uint64_t pcrel = 0;
+ if (type == TYPE_RELv) {
+ isBranch = true;
+ pcrel = insn.startLocation +
+ insn.immediateOffset + insn.immediateSize;
+ switch (insn.displacementSize) {
+ default:
+ break;
+    case 1:
+      if (immediate & 0x80)
+        immediate |= ~(0xffull);
+      break;
+    case 2:
+      if (immediate & 0x8000)
+        immediate |= ~(0xffffull);
+      break;
+    case 4:
+      if (immediate & 0x80000000)
+        immediate |= ~(0xffffffffull);
+      break;
+ case 8:
+ break;
+ }
+ }
+ // By default sign-extend all X86 immediates based on their encoding.
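+  // For example, an ENCODING_IB immediate of 0xFF is widened to
+  // 0xFFFFFFFFFFFFFFFF so that it prints as -1.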
+ else if (type == TYPE_IMM8 || type == TYPE_IMM16 || type == TYPE_IMM32 ||
+ type == TYPE_IMM64 || type == TYPE_IMMv) {
+ switch (operand.encoding) {
+ default:
+ break;
+    case ENCODING_IB:
+      if (immediate & 0x80)
+        immediate |= ~(0xffull);
+      break;
+    case ENCODING_IW:
+      if (immediate & 0x8000)
+        immediate |= ~(0xffffull);
+      break;
+    case ENCODING_ID:
+      if (immediate & 0x80000000)
+        immediate |= ~(0xffffffffull);
+      break;
+ case ENCODING_IO:
+ break;
+ }
+ } else if (type == TYPE_IMM3) {
+ // Check for immediates that printSSECC can't handle.
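+    // The cmpss/cmpsd/cmpps/cmppd pseudo mnemonics only cover comparison
+    // codes 0-7; larger codes must print the raw immediate, so switch to
+    // the _alt opcodes.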
+ if (immediate >= 8) {
+ unsigned NewOpc;
+ switch (mcInst.getOpcode()) {
+ default: llvm_unreachable("unexpected opcode");
+ case X86::CMPPDrmi: NewOpc = X86::CMPPDrmi_alt; break;
+ case X86::CMPPDrri: NewOpc = X86::CMPPDrri_alt; break;
+ case X86::CMPPSrmi: NewOpc = X86::CMPPSrmi_alt; break;
+ case X86::CMPPSrri: NewOpc = X86::CMPPSrri_alt; break;
+ case X86::CMPSDrm: NewOpc = X86::CMPSDrm_alt; break;
+ case X86::CMPSDrr: NewOpc = X86::CMPSDrr_alt; break;
+ case X86::CMPSSrm: NewOpc = X86::CMPSSrm_alt; break;
+ case X86::CMPSSrr: NewOpc = X86::CMPSSrr_alt; break;
+ case X86::VPCOMBri: NewOpc = X86::VPCOMBri_alt; break;
+ case X86::VPCOMBmi: NewOpc = X86::VPCOMBmi_alt; break;
+ case X86::VPCOMWri: NewOpc = X86::VPCOMWri_alt; break;
+ case X86::VPCOMWmi: NewOpc = X86::VPCOMWmi_alt; break;
+ case X86::VPCOMDri: NewOpc = X86::VPCOMDri_alt; break;
+ case X86::VPCOMDmi: NewOpc = X86::VPCOMDmi_alt; break;
+ case X86::VPCOMQri: NewOpc = X86::VPCOMQri_alt; break;
+ case X86::VPCOMQmi: NewOpc = X86::VPCOMQmi_alt; break;
+ case X86::VPCOMUBri: NewOpc = X86::VPCOMUBri_alt; break;
+ case X86::VPCOMUBmi: NewOpc = X86::VPCOMUBmi_alt; break;
+ case X86::VPCOMUWri: NewOpc = X86::VPCOMUWri_alt; break;
+ case X86::VPCOMUWmi: NewOpc = X86::VPCOMUWmi_alt; break;
+ case X86::VPCOMUDri: NewOpc = X86::VPCOMUDri_alt; break;
+ case X86::VPCOMUDmi: NewOpc = X86::VPCOMUDmi_alt; break;
+ case X86::VPCOMUQri: NewOpc = X86::VPCOMUQri_alt; break;
+ case X86::VPCOMUQmi: NewOpc = X86::VPCOMUQmi_alt; break;
+ }
+ // Switch opcode to the one that doesn't get special printing.
+ mcInst.setOpcode(NewOpc);
+ }
+ } else if (type == TYPE_IMM5) {
+ // Check for immediates that printAVXCC can't handle.
+ if (immediate >= 32) {
+ unsigned NewOpc;
+ switch (mcInst.getOpcode()) {
+ default: llvm_unreachable("unexpected opcode");
+ case X86::VCMPPDrmi: NewOpc = X86::VCMPPDrmi_alt; break;
+ case X86::VCMPPDrri: NewOpc = X86::VCMPPDrri_alt; break;
+ case X86::VCMPPSrmi: NewOpc = X86::VCMPPSrmi_alt; break;
+ case X86::VCMPPSrri: NewOpc = X86::VCMPPSrri_alt; break;
+ case X86::VCMPSDrm: NewOpc = X86::VCMPSDrm_alt; break;
+ case X86::VCMPSDrr: NewOpc = X86::VCMPSDrr_alt; break;
+ case X86::VCMPSSrm: NewOpc = X86::VCMPSSrm_alt; break;
+ case X86::VCMPSSrr: NewOpc = X86::VCMPSSrr_alt; break;
+ case X86::VCMPPDYrmi: NewOpc = X86::VCMPPDYrmi_alt; break;
+ case X86::VCMPPDYrri: NewOpc = X86::VCMPPDYrri_alt; break;
+ case X86::VCMPPSYrmi: NewOpc = X86::VCMPPSYrmi_alt; break;
+ case X86::VCMPPSYrri: NewOpc = X86::VCMPPSYrri_alt; break;
+ case X86::VCMPPDZrmi: NewOpc = X86::VCMPPDZrmi_alt; break;
+ case X86::VCMPPDZrri: NewOpc = X86::VCMPPDZrri_alt; break;
+ case X86::VCMPPDZrrib: NewOpc = X86::VCMPPDZrrib_alt; break;
+ case X86::VCMPPSZrmi: NewOpc = X86::VCMPPSZrmi_alt; break;
+ case X86::VCMPPSZrri: NewOpc = X86::VCMPPSZrri_alt; break;
+ case X86::VCMPPSZrrib: NewOpc = X86::VCMPPSZrrib_alt; break;
+ case X86::VCMPSDZrm: NewOpc = X86::VCMPSDZrmi_alt; break;
+ case X86::VCMPSDZrr: NewOpc = X86::VCMPSDZrri_alt; break;
+ case X86::VCMPSSZrm: NewOpc = X86::VCMPSSZrmi_alt; break;
+ case X86::VCMPSSZrr: NewOpc = X86::VCMPSSZrri_alt; break;
+ }
+ // Switch opcode to the one that doesn't get special printing.
+ mcInst.setOpcode(NewOpc);
+ }
+ } else if (type == TYPE_AVX512ICC) {
+ if (immediate >= 8 || ((immediate & 0x3) == 3)) {
+ unsigned NewOpc;
+ switch (mcInst.getOpcode()) {
+ default: llvm_unreachable("unexpected opcode");
+ case X86::VPCMPBZ128rmi: NewOpc = X86::VPCMPBZ128rmi_alt; break;
+ case X86::VPCMPBZ128rmik: NewOpc = X86::VPCMPBZ128rmik_alt; break;
+ case X86::VPCMPBZ128rri: NewOpc = X86::VPCMPBZ128rri_alt; break;
+ case X86::VPCMPBZ128rrik: NewOpc = X86::VPCMPBZ128rrik_alt; break;
+ case X86::VPCMPBZ256rmi: NewOpc = X86::VPCMPBZ256rmi_alt; break;
+ case X86::VPCMPBZ256rmik: NewOpc = X86::VPCMPBZ256rmik_alt; break;
+ case X86::VPCMPBZ256rri: NewOpc = X86::VPCMPBZ256rri_alt; break;
+ case X86::VPCMPBZ256rrik: NewOpc = X86::VPCMPBZ256rrik_alt; break;
+ case X86::VPCMPBZrmi: NewOpc = X86::VPCMPBZrmi_alt; break;
+ case X86::VPCMPBZrmik: NewOpc = X86::VPCMPBZrmik_alt; break;
+ case X86::VPCMPBZrri: NewOpc = X86::VPCMPBZrri_alt; break;
+ case X86::VPCMPBZrrik: NewOpc = X86::VPCMPBZrrik_alt; break;
+ case X86::VPCMPDZ128rmi: NewOpc = X86::VPCMPDZ128rmi_alt; break;
+ case X86::VPCMPDZ128rmib: NewOpc = X86::VPCMPDZ128rmib_alt; break;
+ case X86::VPCMPDZ128rmibk: NewOpc = X86::VPCMPDZ128rmibk_alt; break;
+ case X86::VPCMPDZ128rmik: NewOpc = X86::VPCMPDZ128rmik_alt; break;
+ case X86::VPCMPDZ128rri: NewOpc = X86::VPCMPDZ128rri_alt; break;
+ case X86::VPCMPDZ128rrik: NewOpc = X86::VPCMPDZ128rrik_alt; break;
+ case X86::VPCMPDZ256rmi: NewOpc = X86::VPCMPDZ256rmi_alt; break;
+ case X86::VPCMPDZ256rmib: NewOpc = X86::VPCMPDZ256rmib_alt; break;
+ case X86::VPCMPDZ256rmibk: NewOpc = X86::VPCMPDZ256rmibk_alt; break;
+ case X86::VPCMPDZ256rmik: NewOpc = X86::VPCMPDZ256rmik_alt; break;
+ case X86::VPCMPDZ256rri: NewOpc = X86::VPCMPDZ256rri_alt; break;
+ case X86::VPCMPDZ256rrik: NewOpc = X86::VPCMPDZ256rrik_alt; break;
+ case X86::VPCMPDZrmi: NewOpc = X86::VPCMPDZrmi_alt; break;
+ case X86::VPCMPDZrmib: NewOpc = X86::VPCMPDZrmib_alt; break;
+ case X86::VPCMPDZrmibk: NewOpc = X86::VPCMPDZrmibk_alt; break;
+ case X86::VPCMPDZrmik: NewOpc = X86::VPCMPDZrmik_alt; break;
+ case X86::VPCMPDZrri: NewOpc = X86::VPCMPDZrri_alt; break;
+ case X86::VPCMPDZrrik: NewOpc = X86::VPCMPDZrrik_alt; break;
+ case X86::VPCMPQZ128rmi: NewOpc = X86::VPCMPQZ128rmi_alt; break;
+ case X86::VPCMPQZ128rmib: NewOpc = X86::VPCMPQZ128rmib_alt; break;
+ case X86::VPCMPQZ128rmibk: NewOpc = X86::VPCMPQZ128rmibk_alt; break;
+ case X86::VPCMPQZ128rmik: NewOpc = X86::VPCMPQZ128rmik_alt; break;
+ case X86::VPCMPQZ128rri: NewOpc = X86::VPCMPQZ128rri_alt; break;
+ case X86::VPCMPQZ128rrik: NewOpc = X86::VPCMPQZ128rrik_alt; break;
+ case X86::VPCMPQZ256rmi: NewOpc = X86::VPCMPQZ256rmi_alt; break;
+ case X86::VPCMPQZ256rmib: NewOpc = X86::VPCMPQZ256rmib_alt; break;
+ case X86::VPCMPQZ256rmibk: NewOpc = X86::VPCMPQZ256rmibk_alt; break;
+ case X86::VPCMPQZ256rmik: NewOpc = X86::VPCMPQZ256rmik_alt; break;
+ case X86::VPCMPQZ256rri: NewOpc = X86::VPCMPQZ256rri_alt; break;
+ case X86::VPCMPQZ256rrik: NewOpc = X86::VPCMPQZ256rrik_alt; break;
+ case X86::VPCMPQZrmi: NewOpc = X86::VPCMPQZrmi_alt; break;
+ case X86::VPCMPQZrmib: NewOpc = X86::VPCMPQZrmib_alt; break;
+ case X86::VPCMPQZrmibk: NewOpc = X86::VPCMPQZrmibk_alt; break;
+ case X86::VPCMPQZrmik: NewOpc = X86::VPCMPQZrmik_alt; break;
+ case X86::VPCMPQZrri: NewOpc = X86::VPCMPQZrri_alt; break;
+ case X86::VPCMPQZrrik: NewOpc = X86::VPCMPQZrrik_alt; break;
+ case X86::VPCMPUBZ128rmi: NewOpc = X86::VPCMPUBZ128rmi_alt; break;
+ case X86::VPCMPUBZ128rmik: NewOpc = X86::VPCMPUBZ128rmik_alt; break;
+ case X86::VPCMPUBZ128rri: NewOpc = X86::VPCMPUBZ128rri_alt; break;
+ case X86::VPCMPUBZ128rrik: NewOpc = X86::VPCMPUBZ128rrik_alt; break;
+ case X86::VPCMPUBZ256rmi: NewOpc = X86::VPCMPUBZ256rmi_alt; break;
+ case X86::VPCMPUBZ256rmik: NewOpc = X86::VPCMPUBZ256rmik_alt; break;
+ case X86::VPCMPUBZ256rri: NewOpc = X86::VPCMPUBZ256rri_alt; break;
+ case X86::VPCMPUBZ256rrik: NewOpc = X86::VPCMPUBZ256rrik_alt; break;
+ case X86::VPCMPUBZrmi: NewOpc = X86::VPCMPUBZrmi_alt; break;
+ case X86::VPCMPUBZrmik: NewOpc = X86::VPCMPUBZrmik_alt; break;
+ case X86::VPCMPUBZrri: NewOpc = X86::VPCMPUBZrri_alt; break;
+ case X86::VPCMPUBZrrik: NewOpc = X86::VPCMPUBZrrik_alt; break;
+ case X86::VPCMPUDZ128rmi: NewOpc = X86::VPCMPUDZ128rmi_alt; break;
+ case X86::VPCMPUDZ128rmib: NewOpc = X86::VPCMPUDZ128rmib_alt; break;
+ case X86::VPCMPUDZ128rmibk: NewOpc = X86::VPCMPUDZ128rmibk_alt; break;
+ case X86::VPCMPUDZ128rmik: NewOpc = X86::VPCMPUDZ128rmik_alt; break;
+ case X86::VPCMPUDZ128rri: NewOpc = X86::VPCMPUDZ128rri_alt; break;
+ case X86::VPCMPUDZ128rrik: NewOpc = X86::VPCMPUDZ128rrik_alt; break;
+ case X86::VPCMPUDZ256rmi: NewOpc = X86::VPCMPUDZ256rmi_alt; break;
+ case X86::VPCMPUDZ256rmib: NewOpc = X86::VPCMPUDZ256rmib_alt; break;
+ case X86::VPCMPUDZ256rmibk: NewOpc = X86::VPCMPUDZ256rmibk_alt; break;
+ case X86::VPCMPUDZ256rmik: NewOpc = X86::VPCMPUDZ256rmik_alt; break;
+ case X86::VPCMPUDZ256rri: NewOpc = X86::VPCMPUDZ256rri_alt; break;
+ case X86::VPCMPUDZ256rrik: NewOpc = X86::VPCMPUDZ256rrik_alt; break;
+ case X86::VPCMPUDZrmi: NewOpc = X86::VPCMPUDZrmi_alt; break;
+ case X86::VPCMPUDZrmib: NewOpc = X86::VPCMPUDZrmib_alt; break;
+ case X86::VPCMPUDZrmibk: NewOpc = X86::VPCMPUDZrmibk_alt; break;
+ case X86::VPCMPUDZrmik: NewOpc = X86::VPCMPUDZrmik_alt; break;
+ case X86::VPCMPUDZrri: NewOpc = X86::VPCMPUDZrri_alt; break;
+ case X86::VPCMPUDZrrik: NewOpc = X86::VPCMPUDZrrik_alt; break;
+ case X86::VPCMPUQZ128rmi: NewOpc = X86::VPCMPUQZ128rmi_alt; break;
+ case X86::VPCMPUQZ128rmib: NewOpc = X86::VPCMPUQZ128rmib_alt; break;
+ case X86::VPCMPUQZ128rmibk: NewOpc = X86::VPCMPUQZ128rmibk_alt; break;
+ case X86::VPCMPUQZ128rmik: NewOpc = X86::VPCMPUQZ128rmik_alt; break;
+ case X86::VPCMPUQZ128rri: NewOpc = X86::VPCMPUQZ128rri_alt; break;
+ case X86::VPCMPUQZ128rrik: NewOpc = X86::VPCMPUQZ128rrik_alt; break;
+ case X86::VPCMPUQZ256rmi: NewOpc = X86::VPCMPUQZ256rmi_alt; break;
+ case X86::VPCMPUQZ256rmib: NewOpc = X86::VPCMPUQZ256rmib_alt; break;
+ case X86::VPCMPUQZ256rmibk: NewOpc = X86::VPCMPUQZ256rmibk_alt; break;
+ case X86::VPCMPUQZ256rmik: NewOpc = X86::VPCMPUQZ256rmik_alt; break;
+ case X86::VPCMPUQZ256rri: NewOpc = X86::VPCMPUQZ256rri_alt; break;
+ case X86::VPCMPUQZ256rrik: NewOpc = X86::VPCMPUQZ256rrik_alt; break;
+ case X86::VPCMPUQZrmi: NewOpc = X86::VPCMPUQZrmi_alt; break;
+ case X86::VPCMPUQZrmib: NewOpc = X86::VPCMPUQZrmib_alt; break;
+ case X86::VPCMPUQZrmibk: NewOpc = X86::VPCMPUQZrmibk_alt; break;
+ case X86::VPCMPUQZrmik: NewOpc = X86::VPCMPUQZrmik_alt; break;
+ case X86::VPCMPUQZrri: NewOpc = X86::VPCMPUQZrri_alt; break;
+ case X86::VPCMPUQZrrik: NewOpc = X86::VPCMPUQZrrik_alt; break;
+ case X86::VPCMPUWZ128rmi: NewOpc = X86::VPCMPUWZ128rmi_alt; break;
+ case X86::VPCMPUWZ128rmik: NewOpc = X86::VPCMPUWZ128rmik_alt; break;
+ case X86::VPCMPUWZ128rri: NewOpc = X86::VPCMPUWZ128rri_alt; break;
+ case X86::VPCMPUWZ128rrik: NewOpc = X86::VPCMPUWZ128rrik_alt; break;
+ case X86::VPCMPUWZ256rmi: NewOpc = X86::VPCMPUWZ256rmi_alt; break;
+ case X86::VPCMPUWZ256rmik: NewOpc = X86::VPCMPUWZ256rmik_alt; break;
+ case X86::VPCMPUWZ256rri: NewOpc = X86::VPCMPUWZ256rri_alt; break;
+ case X86::VPCMPUWZ256rrik: NewOpc = X86::VPCMPUWZ256rrik_alt; break;
+ case X86::VPCMPUWZrmi: NewOpc = X86::VPCMPUWZrmi_alt; break;
+ case X86::VPCMPUWZrmik: NewOpc = X86::VPCMPUWZrmik_alt; break;
+ case X86::VPCMPUWZrri: NewOpc = X86::VPCMPUWZrri_alt; break;
+ case X86::VPCMPUWZrrik: NewOpc = X86::VPCMPUWZrrik_alt; break;
+ case X86::VPCMPWZ128rmi: NewOpc = X86::VPCMPWZ128rmi_alt; break;
+ case X86::VPCMPWZ128rmik: NewOpc = X86::VPCMPWZ128rmik_alt; break;
+ case X86::VPCMPWZ128rri: NewOpc = X86::VPCMPWZ128rri_alt; break;
+ case X86::VPCMPWZ128rrik: NewOpc = X86::VPCMPWZ128rrik_alt; break;
+ case X86::VPCMPWZ256rmi: NewOpc = X86::VPCMPWZ256rmi_alt; break;
+ case X86::VPCMPWZ256rmik: NewOpc = X86::VPCMPWZ256rmik_alt; break;
+ case X86::VPCMPWZ256rri: NewOpc = X86::VPCMPWZ256rri_alt; break;
+ case X86::VPCMPWZ256rrik: NewOpc = X86::VPCMPWZ256rrik_alt; break;
+ case X86::VPCMPWZrmi: NewOpc = X86::VPCMPWZrmi_alt; break;
+ case X86::VPCMPWZrmik: NewOpc = X86::VPCMPWZrmik_alt; break;
+ case X86::VPCMPWZrri: NewOpc = X86::VPCMPWZrri_alt; break;
+ case X86::VPCMPWZrrik: NewOpc = X86::VPCMPWZrrik_alt; break;
+ }
+ // Switch opcode to the one that doesn't get special printing.
+ mcInst.setOpcode(NewOpc);
+ }
+ }
+
+ switch (type) {
+ case TYPE_XMM32:
+ case TYPE_XMM64:
+ case TYPE_XMM128:
+ mcInst.addOperand(MCOperand::createReg(X86::XMM0 + (immediate >> 4)));
+ return;
+ case TYPE_XMM256:
+ mcInst.addOperand(MCOperand::createReg(X86::YMM0 + (immediate >> 4)));
+ return;
+ case TYPE_XMM512:
+ mcInst.addOperand(MCOperand::createReg(X86::ZMM0 + (immediate >> 4)));
+ return;
+  case TYPE_BNDR:
+    mcInst.addOperand(MCOperand::createReg(X86::BND0 + (immediate >> 4)));
+    return;
+ case TYPE_REL8:
+ isBranch = true;
+ pcrel = insn.startLocation + insn.immediateOffset + insn.immediateSize;
+ if (immediate & 0x80)
+ immediate |= ~(0xffull);
+ break;
+ case TYPE_REL16:
+ isBranch = true;
+ pcrel = insn.startLocation + insn.immediateOffset + insn.immediateSize;
+ if (immediate & 0x8000)
+ immediate |= ~(0xffffull);
+ break;
+ case TYPE_REL32:
+ case TYPE_REL64:
+ isBranch = true;
+ pcrel = insn.startLocation + insn.immediateOffset + insn.immediateSize;
+    if (immediate & 0x80000000)
+ immediate |= ~(0xffffffffull);
+ break;
+ default:
+ // operand is 64 bits wide. Do nothing.
+ break;
+ }
+
+  if (!tryAddingSymbolicOperand(immediate + pcrel, isBranch, insn.startLocation,
+                                insn.immediateOffset, insn.immediateSize,
+                                mcInst, Dis))
+ mcInst.addOperand(MCOperand::createImm(immediate));
+
+ if (type == TYPE_MOFFS8 || type == TYPE_MOFFS16 ||
+ type == TYPE_MOFFS32 || type == TYPE_MOFFS64) {
+ MCOperand segmentReg;
+ segmentReg = MCOperand::createReg(segmentRegnums[insn.segmentOverride]);
+ mcInst.addOperand(segmentReg);
+ }
+}
+
+/// translateRMRegister - Translates a register stored in the R/M field of the
+/// ModR/M byte to its LLVM equivalent and appends it to an MCInst.
+/// @param mcInst - The MCInst to append to.
+/// @param insn - The internal instruction to extract the R/M field
+/// from.
+/// @return - false on success; true otherwise.
+static bool translateRMRegister(MCInst &mcInst,
+ InternalInstruction &insn) {
+ if (insn.eaBase == EA_BASE_sib || insn.eaBase == EA_BASE_sib64) {
+ debug("A R/M register operand may not have a SIB byte");
+ return true;
+ }
+
+ switch (insn.eaBase) {
+ default:
+ debug("Unexpected EA base register");
+ return true;
+ case EA_BASE_NONE:
+ debug("EA_BASE_NONE for ModR/M base");
+ return true;
+#define ENTRY(x) case EA_BASE_##x:
+ ALL_EA_BASES
+#undef ENTRY
+ debug("A R/M register operand may not have a base; "
+ "the operand must be a register.");
+ return true;
+#define ENTRY(x) \
+ case EA_REG_##x: \
+ mcInst.addOperand(MCOperand::createReg(X86::x)); break;
+ ALL_REGS
+#undef ENTRY
+ }
+
+ return false;
+}
+
+/// translateRMMemory - Translates a memory operand stored in the Mod and R/M
+/// fields of an internal instruction (and possibly its SIB byte) to a memory
+/// operand in LLVM's format, and appends it to an MCInst.
+///
+/// @param mcInst - The MCInst to append to.
+/// @param insn - The instruction to extract Mod, R/M, and SIB fields
+/// from.
+/// @return - false on success; true otherwise.
+static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn,
+ const MCDisassembler *Dis) {
+ // Addresses in an MCInst are represented as five operands:
+ // 1. basereg (register) The R/M base, or (if there is a SIB) the
+ // SIB base
+ // 2. scaleamount (immediate) 1, or (if there is a SIB) the specified
+ // scale amount
+ // 3. indexreg (register) x86_registerNONE, or (if there is a SIB)
+ // the index (which is multiplied by the
+ // scale amount)
+ // 4. displacement (immediate) 0, or the displacement if there is one
+ // 5. segmentreg (register) x86_registerNONE for now, but could be set
+ // if we have segment overrides
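+  //
+  // For example, [rbx + 8*rcx + 16] in 64-bit mode becomes basereg=RBX,
+  // scaleamount=8, indexreg=RCX, displacement=16, and segmentreg=0 (no
+  // segment override).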
+
+ MCOperand baseReg;
+ MCOperand scaleAmount;
+ MCOperand indexReg;
+ MCOperand displacement;
+ MCOperand segmentReg;
+ uint64_t pcrel = 0;
+
+ if (insn.eaBase == EA_BASE_sib || insn.eaBase == EA_BASE_sib64) {
+ if (insn.sibBase != SIB_BASE_NONE) {
+ switch (insn.sibBase) {
+ default:
+ debug("Unexpected sibBase");
+ return true;
+#define ENTRY(x) \
+ case SIB_BASE_##x: \
+ baseReg = MCOperand::createReg(X86::x); break;
+ ALL_SIB_BASES
+#undef ENTRY
+ }
+ } else {
+ baseReg = MCOperand::createReg(0);
+ }
+
+ // Check whether we are handling VSIB addressing mode for GATHER.
+ // If sibIndex was set to SIB_INDEX_NONE, index offset is 4 and
+ // we should use SIB_INDEX_XMM4|YMM4 for VSIB.
+ // I don't see a way to get the correct IndexReg in readSIB:
+ // We can tell whether it is VSIB or SIB after instruction ID is decoded,
+ // but instruction ID may not be decoded yet when calling readSIB.
+ uint32_t Opcode = mcInst.getOpcode();
+ bool IndexIs128 = (Opcode == X86::VGATHERDPDrm ||
+ Opcode == X86::VGATHERDPDYrm ||
+ Opcode == X86::VGATHERQPDrm ||
+ Opcode == X86::VGATHERDPSrm ||
+ Opcode == X86::VGATHERQPSrm ||
+ Opcode == X86::VPGATHERDQrm ||
+ Opcode == X86::VPGATHERDQYrm ||
+ Opcode == X86::VPGATHERQQrm ||
+ Opcode == X86::VPGATHERDDrm ||
+ Opcode == X86::VPGATHERQDrm);
+ bool IndexIs256 = (Opcode == X86::VGATHERQPDYrm ||
+ Opcode == X86::VGATHERDPSYrm ||
+ Opcode == X86::VGATHERQPSYrm ||
+ Opcode == X86::VGATHERDPDZrm ||
+ Opcode == X86::VPGATHERDQZrm ||
+ Opcode == X86::VPGATHERQQYrm ||
+ Opcode == X86::VPGATHERDDYrm ||
+ Opcode == X86::VPGATHERQDYrm);
+ bool IndexIs512 = (Opcode == X86::VGATHERQPDZrm ||
+ Opcode == X86::VGATHERDPSZrm ||
+ Opcode == X86::VGATHERQPSZrm ||
+ Opcode == X86::VPGATHERQQZrm ||
+ Opcode == X86::VPGATHERDDZrm ||
+ Opcode == X86::VPGATHERQDZrm);
+ if (IndexIs128 || IndexIs256 || IndexIs512) {
+ unsigned IndexOffset = insn.sibIndex -
+ (insn.addressSize == 8 ? SIB_INDEX_RAX:SIB_INDEX_EAX);
+ SIBIndex IndexBase = IndexIs512 ? SIB_INDEX_ZMM0 :
+ IndexIs256 ? SIB_INDEX_YMM0 : SIB_INDEX_XMM0;
+ insn.sibIndex = (SIBIndex)(IndexBase +
+ (insn.sibIndex == SIB_INDEX_NONE ? 4 : IndexOffset));
+ }
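+    // e.g., a 128-bit gather whose SIB index field decoded as SIB_INDEX_NONE
+    // (bits 100 with no REX.X) is rewritten to use SIB_INDEX_XMM4, since VSIB
+    // has no "no index" encoding.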
+
+ if (insn.sibIndex != SIB_INDEX_NONE) {
+ switch (insn.sibIndex) {
+ default:
+ debug("Unexpected sibIndex");
+ return true;
+#define ENTRY(x) \
+ case SIB_INDEX_##x: \
+ indexReg = MCOperand::createReg(X86::x); break;
+ EA_BASES_32BIT
+ EA_BASES_64BIT
+ REGS_XMM
+ REGS_YMM
+ REGS_ZMM
+#undef ENTRY
+ }
+ } else {
+ indexReg = MCOperand::createReg(0);
+ }
+
+ scaleAmount = MCOperand::createImm(insn.sibScale);
+ } else {
+ switch (insn.eaBase) {
+ case EA_BASE_NONE:
+ if (insn.eaDisplacement == EA_DISP_NONE) {
+ debug("EA_BASE_NONE and EA_DISP_NONE for ModR/M base");
+ return true;
+ }
+      if (insn.mode == MODE_64BIT) {
+ pcrel = insn.startLocation +
+ insn.displacementOffset + insn.displacementSize;
+ tryAddingPcLoadReferenceComment(insn.startLocation +
+ insn.displacementOffset,
+ insn.displacement + pcrel, Dis);
+ baseReg = MCOperand::createReg(X86::RIP); // Section 2.2.1.6
+ }
+ else
+ baseReg = MCOperand::createReg(0);
+
+ indexReg = MCOperand::createReg(0);
+ break;
+ case EA_BASE_BX_SI:
+ baseReg = MCOperand::createReg(X86::BX);
+ indexReg = MCOperand::createReg(X86::SI);
+ break;
+ case EA_BASE_BX_DI:
+ baseReg = MCOperand::createReg(X86::BX);
+ indexReg = MCOperand::createReg(X86::DI);
+ break;
+ case EA_BASE_BP_SI:
+ baseReg = MCOperand::createReg(X86::BP);
+ indexReg = MCOperand::createReg(X86::SI);
+ break;
+ case EA_BASE_BP_DI:
+ baseReg = MCOperand::createReg(X86::BP);
+ indexReg = MCOperand::createReg(X86::DI);
+ break;
+ default:
+ indexReg = MCOperand::createReg(0);
+ switch (insn.eaBase) {
+ default:
+ debug("Unexpected eaBase");
+ return true;
+ // Here, we will use the fill-ins defined above. However,
+ // BX_SI, BX_DI, BP_SI, and BP_DI are all handled above and
+ // sib and sib64 were handled in the top-level if, so they're only
+ // placeholders to keep the compiler happy.
+#define ENTRY(x) \
+ case EA_BASE_##x: \
+ baseReg = MCOperand::createReg(X86::x); break;
+ ALL_EA_BASES
+#undef ENTRY
+#define ENTRY(x) case EA_REG_##x:
+ ALL_REGS
+#undef ENTRY
+ debug("A R/M memory operand may not be a register; "
+ "the base field must be a base.");
+ return true;
+ }
+ }
+
+ scaleAmount = MCOperand::createImm(1);
+ }
+
+ displacement = MCOperand::createImm(insn.displacement);
+
+ segmentReg = MCOperand::createReg(segmentRegnums[insn.segmentOverride]);
+
+ mcInst.addOperand(baseReg);
+ mcInst.addOperand(scaleAmount);
+ mcInst.addOperand(indexReg);
+  if (!tryAddingSymbolicOperand(insn.displacement + pcrel, false,
+                                insn.startLocation, insn.displacementOffset,
+                                insn.displacementSize, mcInst, Dis))
+ mcInst.addOperand(displacement);
+ mcInst.addOperand(segmentReg);
+ return false;
+}
+
+/// translateRM - Translates an operand stored in the R/M (and possibly SIB)
+/// byte of an instruction to LLVM form, and appends it to an MCInst.
+///
+/// @param mcInst - The MCInst to append to.
+/// @param operand - The operand, as stored in the descriptor table.
+/// @param insn - The instruction to extract Mod, R/M, and SIB fields
+/// from.
+/// @return - false on success; true otherwise.
+static bool translateRM(MCInst &mcInst, const OperandSpecifier &operand,
+ InternalInstruction &insn, const MCDisassembler *Dis) {
+ switch (operand.type) {
+ default:
+ debug("Unexpected type for a R/M operand");
+ return true;
+ case TYPE_R8:
+ case TYPE_R16:
+ case TYPE_R32:
+ case TYPE_R64:
+ case TYPE_Rv:
+ case TYPE_MM64:
+ case TYPE_XMM:
+ case TYPE_XMM32:
+ case TYPE_XMM64:
+ case TYPE_XMM128:
+ case TYPE_XMM256:
+ case TYPE_XMM512:
+ case TYPE_VK1:
+ case TYPE_VK2:
+ case TYPE_VK4:
+ case TYPE_VK8:
+ case TYPE_VK16:
+ case TYPE_VK32:
+ case TYPE_VK64:
+ case TYPE_DEBUGREG:
+ case TYPE_CONTROLREG:
+ case TYPE_BNDR:
+ return translateRMRegister(mcInst, insn);
+ case TYPE_M:
+ case TYPE_M8:
+ case TYPE_M16:
+ case TYPE_M32:
+ case TYPE_M64:
+ case TYPE_M128:
+ case TYPE_M256:
+ case TYPE_M512:
+ case TYPE_Mv:
+ case TYPE_M32FP:
+ case TYPE_M64FP:
+ case TYPE_M80FP:
+ case TYPE_M1616:
+ case TYPE_M1632:
+ case TYPE_M1664:
+ case TYPE_LEA:
+ return translateRMMemory(mcInst, insn, Dis);
+ }
+}
+
+/// translateFPRegister - Translates a stack position on the FPU stack to its
+/// LLVM form, and appends it to an MCInst.
+///
+/// @param mcInst - The MCInst to append to.
+/// @param stackPos - The stack position to translate.
+static void translateFPRegister(MCInst &mcInst,
+ uint8_t stackPos) {
+ mcInst.addOperand(MCOperand::createReg(X86::ST0 + stackPos));
+}
+
+/// translateMaskRegister - Translates a 3-bit mask register number to
+/// LLVM form, and appends it to an MCInst.
+///
+/// @param mcInst - The MCInst to append to.
+/// @param maskRegNum - Number of mask register from 0 to 7.
+/// @return - false on success; true otherwise.
+static bool translateMaskRegister(MCInst &mcInst,
+ uint8_t maskRegNum) {
+ if (maskRegNum >= 8) {
+ debug("Invalid mask register number");
+ return true;
+ }
+
+ mcInst.addOperand(MCOperand::createReg(X86::K0 + maskRegNum));
+ return false;
+}
+
+/// translateOperand - Translates an operand stored in an internal instruction
+/// to LLVM's format and appends it to an MCInst.
+///
+/// @param mcInst - The MCInst to append to.
+/// @param operand - The operand, as stored in the descriptor table.
+/// @param insn - The internal instruction.
+/// @return - false on success; true otherwise.
+static bool translateOperand(MCInst &mcInst, const OperandSpecifier &operand,
+ InternalInstruction &insn,
+ const MCDisassembler *Dis) {
+ switch (operand.encoding) {
+ default:
+ debug("Unhandled operand encoding during translation");
+ return true;
+ case ENCODING_REG:
+ translateRegister(mcInst, insn.reg);
+ return false;
+ case ENCODING_WRITEMASK:
+ return translateMaskRegister(mcInst, insn.writemask);
+ CASE_ENCODING_RM:
+ return translateRM(mcInst, operand, insn, Dis);
+ case ENCODING_CB:
+ case ENCODING_CW:
+ case ENCODING_CD:
+ case ENCODING_CP:
+ case ENCODING_CO:
+ case ENCODING_CT:
+ debug("Translation of code offsets isn't supported.");
+ return true;
+ case ENCODING_IB:
+ case ENCODING_IW:
+ case ENCODING_ID:
+ case ENCODING_IO:
+ case ENCODING_Iv:
+ case ENCODING_Ia:
+ translateImmediate(mcInst,
+ insn.immediates[insn.numImmediatesTranslated++],
+ operand,
+ insn,
+ Dis);
+ return false;
+ case ENCODING_SI:
+ return translateSrcIndex(mcInst, insn);
+ case ENCODING_DI:
+ return translateDstIndex(mcInst, insn);
+ case ENCODING_RB:
+ case ENCODING_RW:
+ case ENCODING_RD:
+ case ENCODING_RO:
+ case ENCODING_Rv:
+ translateRegister(mcInst, insn.opcodeRegister);
+ return false;
+ case ENCODING_FP:
+ translateFPRegister(mcInst, insn.modRM & 7);
+ return false;
+ case ENCODING_VVVV:
+ translateRegister(mcInst, insn.vvvv);
+ return false;
+ case ENCODING_DUP:
+ return translateOperand(mcInst, insn.operands[operand.type - TYPE_DUP0],
+ insn, Dis);
+ }
+}
+
+/// translateInstruction - Translates an internal instruction and all its
+/// operands to an MCInst.
+///
+/// @param mcInst - The MCInst to populate with the instruction's data.
+/// @param insn - The internal instruction.
+/// @return - false on success; true otherwise.
+static bool translateInstruction(MCInst &mcInst,
+ InternalInstruction &insn,
+ const MCDisassembler *Dis) {
+ if (!insn.spec) {
+ debug("Instruction has no specification");
+ return true;
+ }
+
+ mcInst.clear();
+ mcInst.setOpcode(insn.instructionID);
+  // If, while reading the prefix bytes, we determined that the overlapping
+  // 0xf2 or 0xf3 prefix byte should be disassembled as xacquire or xrelease,
+  // then use those opcodes instead of the rep and repne ones.
+ if (insn.xAcquireRelease) {
+    if (mcInst.getOpcode() == X86::REP_PREFIX)
+      mcInst.setOpcode(X86::XRELEASE_PREFIX);
+    else if (mcInst.getOpcode() == X86::REPNE_PREFIX)
+      mcInst.setOpcode(X86::XACQUIRE_PREFIX);
+ }
+
+ insn.numImmediatesTranslated = 0;
+
+ for (const auto &Op : insn.operands) {
+ if (Op.encoding != ENCODING_NONE) {
+ if (translateOperand(mcInst, Op, insn, Dis)) {
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
+
+static MCDisassembler *createX86Disassembler(const Target &T,
+ const MCSubtargetInfo &STI,
+ MCContext &Ctx) {
+ std::unique_ptr<const MCInstrInfo> MII(T.createMCInstrInfo());
+ return new X86Disassembler::X86GenericDisassembler(STI, Ctx, std::move(MII));
+}
+
+extern "C" void LLVMInitializeX86Disassembler() {
+ // Register the disassembler.
+ TargetRegistry::RegisterMCDisassembler(TheX86_32Target,
+ createX86Disassembler);
+ TargetRegistry::RegisterMCDisassembler(TheX86_64Target,
+ createX86Disassembler);
+}
diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.h b/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.h
new file mode 100644
index 0000000..d7f426b
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.h
@@ -0,0 +1,112 @@
+//===-- X86Disassembler.h - Disassembler for x86 and x86_64 -----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// The X86 disassembler is a table-driven disassembler for the 16-, 32-, and
+// 64-bit X86 instruction sets. The main decode sequence for an assembly
+// instruction in this disassembler is:
+//
+// 1. Read the prefix bytes and determine the attributes of the instruction.
+// These attributes, recorded in enum attributeBits
+// (X86DisassemblerDecoderCommon.h), form a bitmask. The table CONTEXTS_SYM
+// provides a mapping from bitmasks to contexts, which are represented by
+// enum InstructionContext (ibid.).
+//
+// 2. Read the opcode, and determine what kind of opcode it is. The
+// disassembler distinguishes four kinds of opcodes, which are enumerated in
+// OpcodeType (X86DisassemblerDecoderCommon.h): one-byte (0xnn), two-byte
+// (0x0f 0xnn), three-byte-38 (0x0f 0x38 0xnn), or three-byte-3a
+// (0x0f 0x3a 0xnn). Mandatory prefixes are treated as part of the context.
+//
+// 3. Depending on the opcode type, look in one of four ClassDecision structures
+//    (X86DisassemblerDecoderCommon.h). Use the opcode class to determine which
+//    OpcodeDecision (ibid.) to look the opcode up in. Looking up the opcode
+//    yields a ModRMDecision (ibid.).
+//
+// 4. Some instructions, such as escape opcodes or extended opcodes, or even
+// instructions that have ModRM*Reg / ModRM*Mem forms in LLVM, need the
+// ModR/M byte to complete decode. The ModRMDecision's type is an entry from
+// ModRMDecisionType (X86DisassemblerDecoderCommon.h) that indicates if the
+// ModR/M byte is required and how to interpret it.
+//
+// 5. After resolving the ModRMDecision, the disassembler has a unique ID
+// of type InstrUID (X86DisassemblerDecoderCommon.h). Looking this ID up in
+// INSTRUCTIONS_SYM yields the name of the instruction and the encodings and
+// meanings of its operands.
+//
+// 6. For each operand, its encoding is an entry from OperandEncoding
+// (X86DisassemblerDecoderCommon.h) and its type is an entry from
+// OperandType (ibid.). The encoding indicates how to read it from the
+// instruction; the type indicates how to interpret the value once it has
+// been read. For example, a register operand could be stored in the R/M
+// field of the ModR/M byte, the REG field of the ModR/M byte, or added to
+//    the main opcode. This is orthogonal to its meaning (a GPR or an XMM
+// register, for instance). Given this information, the operands can be
+// extracted and interpreted.
+//
+// 7. As the last step, the disassembler translates the instruction information
+// and operands into a format understandable by the client - in this case, an
+// MCInst for use by the MC infrastructure.
+//
+// The disassembler is broken broadly into two parts: the table emitter that
+// emits the instruction decode tables discussed above during compilation, and
+// the disassembler itself. The table emitter is documented in more detail in
+// utils/TableGen/X86DisassemblerEmitter.h.
+//
+// X86Disassembler.h contains the public interface for the disassembler,
+// adhering to the MCDisassembler interface.
+// X86Disassembler.cpp contains the code responsible for step 7, and for
+// invoking the decoder to execute steps 1-6.
+// X86DisassemblerDecoderCommon.h contains the definitions needed by both the
+// table emitter and the disassembler.
+// X86DisassemblerDecoder.h contains the public interface of the decoder,
+// factored out into C for possible use by other projects.
+// X86DisassemblerDecoder.cpp contains the source code of the decoder, which is
+// responsible for steps 1-6.
+//
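+// As an informal illustration of steps 1-6: the byte sequence 0f 28 c1 has no
+// prefixes, a two-byte opcode (0x0f 0x28), and a ModR/M byte of 0xc1
+// (mod=0b11, reg=0, rm=1), which resolves to MOVAPSrr with XMM0 as the REG
+// operand and XMM1 as the R/M operand.
+//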
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_DISASSEMBLER_X86DISASSEMBLER_H
+#define LLVM_LIB_TARGET_X86_DISASSEMBLER_X86DISASSEMBLER_H
+
+#include "X86DisassemblerDecoderCommon.h"
+#include "llvm/MC/MCDisassembler.h"
+
+namespace llvm {
+
+class MCInst;
+class MCInstrInfo;
+class MCSubtargetInfo;
+class MemoryObject;
+class raw_ostream;
+
+namespace X86Disassembler {
+
+/// Generic disassembler for all X86 platforms. All a platform class should
+/// have to do is subclass the constructor, and provide a different
+/// disassemblerMode value.
+class X86GenericDisassembler : public MCDisassembler {
+ std::unique_ptr<const MCInstrInfo> MII;
+public:
+ X86GenericDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx,
+ std::unique_ptr<const MCInstrInfo> MII);
+ DecodeStatus getInstruction(MCInst &instr, uint64_t &size,
+ ArrayRef<uint8_t> Bytes, uint64_t Address,
+ raw_ostream &vStream,
+ raw_ostream &cStream) const override;
+
+private:
+ DisassemblerMode fMode;
+};
+
+} // namespace X86Disassembler
+
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp
new file mode 100644
index 0000000..040143b
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp
@@ -0,0 +1,1909 @@
+//===-- X86DisassemblerDecoder.cpp - Disassembler decoder -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is part of the X86 Disassembler.
+// It contains the implementation of the instruction decoder.
+// Documentation for the disassembler can be found in X86Disassembler.h.
+//
+//===----------------------------------------------------------------------===//
+
+#include <cstdarg> /* for va_*() */
+#include <cstdio> /* for vsnprintf() */
+#include <cstdlib> /* for exit() */
+#include <cstring> /* for memset() */
+
+#include "X86DisassemblerDecoder.h"
+
+using namespace llvm::X86Disassembler;
+
+/// Specifies whether a ModR/M byte is needed and (if so) which
+/// instruction each possible value of the ModR/M byte corresponds to. Once
+/// this information is known, we have narrowed down to a single instruction.
+struct ModRMDecision {
+ uint8_t modrm_type;
+ uint16_t instructionIDs;
+};
+
+/// Specifies which set of ModR/M->instruction tables to look at
+/// given a particular opcode.
+struct OpcodeDecision {
+ ModRMDecision modRMDecisions[256];
+};
+
+/// Specifies which opcode->instruction tables to look at given
+/// a particular context (set of attributes). Since there are many possible
+/// contexts, the decoder first uses CONTEXTS_SYM to determine which context
+/// applies given a specific set of attributes. Hence there are only IC_max
+/// entries in this table, rather than 2^(ATTR_max).
+struct ContextDecision {
+ OpcodeDecision opcodeDecisions[IC_max];
+};
+
+#include "X86GenDisassemblerTables.inc"
+
+#ifndef NDEBUG
+#define debug(s) do { Debug(__FILE__, __LINE__, s); } while (0)
+#else
+#define debug(s) do { } while (0)
+#endif
+
+
+/*
+ * contextForAttrs - Client for the instruction context table. Takes a set of
+ * attributes and returns the appropriate decode context.
+ *
+ * @param attrMask - Attributes, from the enumeration attributeBits.
+ * @return - The InstructionContext to use when looking up
+ * an instruction with these attributes.
+ */
+static InstructionContext contextForAttrs(uint16_t attrMask) {
+ return static_cast<InstructionContext>(CONTEXTS_SYM[attrMask]);
+}
+
+/*
+ * modRMRequired - Reads the appropriate instruction table to determine whether
+ * the ModR/M byte is required to decode a particular instruction.
+ *
+ * @param type - The opcode type (i.e., how many bytes it has).
+ * @param insnContext - The context for the instruction, as returned by
+ * contextForAttrs.
+ * @param opcode - The last byte of the instruction's opcode, not counting
+ * ModR/M extensions and escapes.
+ * @return - true if the ModR/M byte is required, false otherwise.
+ */
+static int modRMRequired(OpcodeType type,
+ InstructionContext insnContext,
+ uint16_t opcode) {
+ const struct ContextDecision* decision = nullptr;
+
+ switch (type) {
+ case ONEBYTE:
+ decision = &ONEBYTE_SYM;
+ break;
+ case TWOBYTE:
+ decision = &TWOBYTE_SYM;
+ break;
+ case THREEBYTE_38:
+ decision = &THREEBYTE38_SYM;
+ break;
+ case THREEBYTE_3A:
+ decision = &THREEBYTE3A_SYM;
+ break;
+ case XOP8_MAP:
+ decision = &XOP8_MAP_SYM;
+ break;
+ case XOP9_MAP:
+ decision = &XOP9_MAP_SYM;
+ break;
+ case XOPA_MAP:
+ decision = &XOPA_MAP_SYM;
+ break;
+ }
+
+ return decision->opcodeDecisions[insnContext].modRMDecisions[opcode].
+ modrm_type != MODRM_ONEENTRY;
+}
+
+/*
+ * decode - Reads the appropriate instruction table to obtain the unique ID of
+ * an instruction.
+ *
+ * @param type - See modRMRequired().
+ * @param insnContext - See modRMRequired().
+ * @param opcode - See modRMRequired().
+ * @param modRM - The ModR/M byte if required, or any value if not.
+ * @return - The UID of the instruction, or 0 on failure.
+ */
+static InstrUID decode(OpcodeType type,
+ InstructionContext insnContext,
+ uint8_t opcode,
+ uint8_t modRM) {
+ const struct ModRMDecision* dec = nullptr;
+
+ switch (type) {
+ case ONEBYTE:
+ dec = &ONEBYTE_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
+ break;
+ case TWOBYTE:
+ dec = &TWOBYTE_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
+ break;
+ case THREEBYTE_38:
+ dec = &THREEBYTE38_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
+ break;
+ case THREEBYTE_3A:
+ dec = &THREEBYTE3A_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
+ break;
+ case XOP8_MAP:
+ dec = &XOP8_MAP_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
+ break;
+ case XOP9_MAP:
+ dec = &XOP9_MAP_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
+ break;
+ case XOPA_MAP:
+ dec = &XOPA_MAP_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
+ break;
+ }
+
+ switch (dec->modrm_type) {
+ default:
+ debug("Corrupt table! Unknown modrm_type");
+ return 0;
+ case MODRM_ONEENTRY:
+ return modRMTable[dec->instructionIDs];
+ case MODRM_SPLITRM:
+ if (modFromModRM(modRM) == 0x3)
+ return modRMTable[dec->instructionIDs+1];
+ return modRMTable[dec->instructionIDs];
+ case MODRM_SPLITREG:
+ if (modFromModRM(modRM) == 0x3)
+ return modRMTable[dec->instructionIDs+((modRM & 0x38) >> 3)+8];
+ return modRMTable[dec->instructionIDs+((modRM & 0x38) >> 3)];
+ case MODRM_SPLITMISC:
+ if (modFromModRM(modRM) == 0x3)
+ return modRMTable[dec->instructionIDs+(modRM & 0x3f)+8];
+ return modRMTable[dec->instructionIDs+((modRM & 0x38) >> 3)];
+ case MODRM_FULL:
+ return modRMTable[dec->instructionIDs+modRM];
+ }
+}
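+
+/*
+ * For instance, under MODRM_SPLITREG a ModR/M byte of 0xd1 (mod=0b11, reg=2)
+ * selects modRMTable[dec->instructionIDs + 2 + 8], while 0x11 (mod=0b00,
+ * reg=2) selects modRMTable[dec->instructionIDs + 2].
+ */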
+
+/*
+ * specifierForUID - Given a UID, returns the name and operand specification for
+ * that instruction.
+ *
+ * @param uid - The unique ID for the instruction. This should be returned by
+ * decode(); specifierForUID will not check bounds.
+ * @return - A pointer to the specification for that instruction.
+ */
+static const struct InstructionSpecifier *specifierForUID(InstrUID uid) {
+ return &INSTRUCTIONS_SYM[uid];
+}
+
+/*
+ * consumeByte - Uses the reader function provided by the user to consume one
+ * byte from the instruction's memory and advance the cursor.
+ *
+ * @param insn - The instruction with the reader function to use. The cursor
+ * for this instruction is advanced.
+ * @param byte - A pointer to a pre-allocated memory buffer to be populated
+ * with the data read.
+ * @return - 0 if the read was successful; nonzero otherwise.
+ */
+static int consumeByte(struct InternalInstruction* insn, uint8_t* byte) {
+ int ret = insn->reader(insn->readerArg, byte, insn->readerCursor);
+
+ if (!ret)
+ ++(insn->readerCursor);
+
+ return ret;
+}
+
+/*
+ * lookAtByte - Like consumeByte, but does not advance the cursor.
+ *
+ * @param insn - See consumeByte().
+ * @param byte - See consumeByte().
+ * @return - See consumeByte().
+ */
+static int lookAtByte(struct InternalInstruction* insn, uint8_t* byte) {
+ return insn->reader(insn->readerArg, byte, insn->readerCursor);
+}
+
+static void unconsumeByte(struct InternalInstruction* insn) {
+ insn->readerCursor--;
+}
+
+#define CONSUME_FUNC(name, type) \
+ static int name(struct InternalInstruction* insn, type* ptr) { \
+ type combined = 0; \
+ unsigned offset; \
+ for (offset = 0; offset < sizeof(type); ++offset) { \
+ uint8_t byte; \
+ int ret = insn->reader(insn->readerArg, \
+ &byte, \
+ insn->readerCursor + offset); \
+ if (ret) \
+ return ret; \
+ combined = combined | ((uint64_t)byte << (offset * 8)); \
+ } \
+ *ptr = combined; \
+ insn->readerCursor += sizeof(type); \
+ return 0; \
+ }
+
+/*
+ * consume* - Use the reader function provided by the user to consume data
+ * values of various sizes from the instruction's memory and advance the
+ * cursor appropriately. These readers perform endian conversion.
+ *
+ * @param insn - See consumeByte().
+ * @param ptr - A pointer to a pre-allocated memory of appropriate size to
+ * be populated with the data read.
+ * @return - See consumeByte().
+ */
+CONSUME_FUNC(consumeInt8, int8_t)
+CONSUME_FUNC(consumeInt16, int16_t)
+CONSUME_FUNC(consumeInt32, int32_t)
+CONSUME_FUNC(consumeUInt16, uint16_t)
+CONSUME_FUNC(consumeUInt32, uint32_t)
+CONSUME_FUNC(consumeUInt64, uint64_t)
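+
+/*
+ * For example, consumeUInt16 over the byte sequence { 0x34, 0x12 } yields
+ * 0x1234, matching x86's little-endian immediate and displacement encoding.
+ */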
+
+/*
+ * dbgprintf - Uses the logging function provided by the user to log a single
+ * message, typically without a carriage-return.
+ *
+ * @param insn - The instruction containing the logging function.
+ * @param format - See printf().
+ * @param ... - See printf().
+ */
+static void dbgprintf(struct InternalInstruction* insn,
+ const char* format,
+ ...) {
+ char buffer[256];
+ va_list ap;
+
+ if (!insn->dlog)
+ return;
+
+ va_start(ap, format);
+ (void)vsnprintf(buffer, sizeof(buffer), format, ap);
+ va_end(ap);
+
+ insn->dlog(insn->dlogArg, buffer);
+}
+
+/*
+ * setPrefixPresent - Marks that a particular prefix is present at a particular
+ * location.
+ *
+ * @param insn - The instruction to be marked as having the prefix.
+ * @param prefix - The prefix that is present.
+ * @param location - The location where the prefix is located (in the address
+ * space of the instruction's reader).
+ */
+static void setPrefixPresent(struct InternalInstruction* insn,
+ uint8_t prefix,
+ uint64_t location)
+{
+ insn->prefixPresent[prefix] = 1;
+ insn->prefixLocations[prefix] = location;
+}
+
+/*
+ * isPrefixAtLocation - Queries an instruction to determine whether a prefix is
+ * present at a given location.
+ *
+ * @param insn - The instruction to be queried.
+ * @param prefix - The prefix.
+ * @param location - The location to query.
+ * @return - Whether the prefix is at that location.
+ */
+static bool isPrefixAtLocation(struct InternalInstruction* insn,
+ uint8_t prefix,
+ uint64_t location)
+{
+ return insn->prefixPresent[prefix] == 1 &&
+ insn->prefixLocations[prefix] == location;
+}
+
+/*
+ * readPrefixes - Consumes all of an instruction's prefix bytes, and marks the
+ * instruction as having them. Also sets the instruction's default operand,
+ * address, and other relevant data sizes to report operands correctly.
+ *
+ * @param insn - The instruction whose prefixes are to be read.
+ * @return - 0 if the instruction could be read until the end of the prefix
+ * bytes, and no prefixes conflicted; nonzero otherwise.
+ */
+static int readPrefixes(struct InternalInstruction* insn) {
+ bool isPrefix = true;
+ bool prefixGroups[4] = { false };
+ uint64_t prefixLocation;
+ uint8_t byte = 0;
+ uint8_t nextByte;
+
+ bool hasAdSize = false;
+ bool hasOpSize = false;
+
+ dbgprintf(insn, "readPrefixes()");
+
+ while (isPrefix) {
+ prefixLocation = insn->readerCursor;
+
+    /* If we fail reading prefixes, just stop here and let the opcode reader
+       deal with it */
+ if (consumeByte(insn, &byte))
+ break;
+
+ /*
+ * If the byte is a LOCK/REP/REPNE prefix and not a part of the opcode, then
+ * break and let it be disassembled as a normal "instruction".
+ */
+ if (insn->readerCursor - 1 == insn->startLocation && byte == 0xf0)
+ break;
+
+ if (insn->readerCursor - 1 == insn->startLocation
+ && (byte == 0xf2 || byte == 0xf3)
+ && !lookAtByte(insn, &nextByte))
+ {
+ /*
+ * If the byte is 0xf2 or 0xf3, and any of the following conditions are
+ * met:
+ * - it is followed by a LOCK (0xf0) prefix
+ * - it is followed by an xchg instruction
+       * then it should be disassembled as an xacquire/xrelease, not repne/rep.
+ */
+ if ((byte == 0xf2 || byte == 0xf3) &&
+ ((nextByte == 0xf0) ||
+ ((nextByte & 0xfe) == 0x86 || (nextByte & 0xf8) == 0x90)))
+ insn->xAcquireRelease = true;
+ /*
+ * Also if the byte is 0xf3, and the following condition is met:
+ * - it is followed by a "mov mem, reg" (opcode 0x88/0x89) or
+ * "mov mem, imm" (opcode 0xc6/0xc7) instructions.
+ * then it should be disassembled as an xrelease not rep.
+ */
+ if (byte == 0xf3 &&
+ (nextByte == 0x88 || nextByte == 0x89 ||
+ nextByte == 0xc6 || nextByte == 0xc7))
+ insn->xAcquireRelease = true;
+ if (insn->mode == MODE_64BIT && (nextByte & 0xf0) == 0x40) {
+ if (consumeByte(insn, &nextByte))
+ return -1;
+ if (lookAtByte(insn, &nextByte))
+ return -1;
+ unconsumeByte(insn);
+ }
+ if (nextByte != 0x0f && nextByte != 0x90)
+ break;
+ }
+
+ switch (byte) {
+ case 0xf0: /* LOCK */
+ case 0xf2: /* REPNE/REPNZ */
+ case 0xf3: /* REP or REPE/REPZ */
+ if (prefixGroups[0])
+ dbgprintf(insn, "Redundant Group 1 prefix");
+ prefixGroups[0] = true;
+ setPrefixPresent(insn, byte, prefixLocation);
+ break;
+ case 0x2e: /* CS segment override -OR- Branch not taken */
+ case 0x36: /* SS segment override -OR- Branch taken */
+ case 0x3e: /* DS segment override */
+ case 0x26: /* ES segment override */
+ case 0x64: /* FS segment override */
+ case 0x65: /* GS segment override */
+ switch (byte) {
+ case 0x2e:
+ insn->segmentOverride = SEG_OVERRIDE_CS;
+ break;
+ case 0x36:
+ insn->segmentOverride = SEG_OVERRIDE_SS;
+ break;
+ case 0x3e:
+ insn->segmentOverride = SEG_OVERRIDE_DS;
+ break;
+ case 0x26:
+ insn->segmentOverride = SEG_OVERRIDE_ES;
+ break;
+ case 0x64:
+ insn->segmentOverride = SEG_OVERRIDE_FS;
+ break;
+ case 0x65:
+ insn->segmentOverride = SEG_OVERRIDE_GS;
+ break;
+ default:
+ debug("Unhandled override");
+ return -1;
+ }
+ if (prefixGroups[1])
+ dbgprintf(insn, "Redundant Group 2 prefix");
+ prefixGroups[1] = true;
+ setPrefixPresent(insn, byte, prefixLocation);
+ break;
+ case 0x66: /* Operand-size override */
+ if (prefixGroups[2])
+ dbgprintf(insn, "Redundant Group 3 prefix");
+ prefixGroups[2] = true;
+ hasOpSize = true;
+ setPrefixPresent(insn, byte, prefixLocation);
+ break;
+ case 0x67: /* Address-size override */
+ if (prefixGroups[3])
+ dbgprintf(insn, "Redundant Group 4 prefix");
+ prefixGroups[3] = true;
+ hasAdSize = true;
+ setPrefixPresent(insn, byte, prefixLocation);
+ break;
+ default: /* Not a prefix byte */
+ isPrefix = false;
+ break;
+ }
+
+ if (isPrefix)
+ dbgprintf(insn, "Found prefix 0x%hhx", byte);
+ }
+
+ insn->vectorExtensionType = TYPE_NO_VEX_XOP;
+
+ if (byte == 0x62) {
+ uint8_t byte1, byte2;
+
+ if (consumeByte(insn, &byte1)) {
+ dbgprintf(insn, "Couldn't read second byte of EVEX prefix");
+ return -1;
+ }
+
+ if (lookAtByte(insn, &byte2)) {
+ dbgprintf(insn, "Couldn't read third byte of EVEX prefix");
+ return -1;
+ }
+
+ if ((insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0) &&
+ ((~byte1 & 0xc) == 0xc) && ((byte2 & 0x4) == 0x4)) {
+ insn->vectorExtensionType = TYPE_EVEX;
+ } else {
+ unconsumeByte(insn); /* unconsume byte1 */
+ unconsumeByte(insn); /* unconsume byte */
+ insn->necessaryPrefixLocation = insn->readerCursor - 2;
+ }
+
+ if (insn->vectorExtensionType == TYPE_EVEX) {
+ insn->vectorExtensionPrefix[0] = byte;
+ insn->vectorExtensionPrefix[1] = byte1;
+ if (consumeByte(insn, &insn->vectorExtensionPrefix[2])) {
+ dbgprintf(insn, "Couldn't read third byte of EVEX prefix");
+ return -1;
+ }
+ if (consumeByte(insn, &insn->vectorExtensionPrefix[3])) {
+ dbgprintf(insn, "Couldn't read fourth byte of EVEX prefix");
+ return -1;
+ }
+
+ /* We simulate the REX prefix for simplicity's sake */
+ if (insn->mode == MODE_64BIT) {
+ insn->rexPrefix = 0x40
+ | (wFromEVEX3of4(insn->vectorExtensionPrefix[2]) << 3)
+ | (rFromEVEX2of4(insn->vectorExtensionPrefix[1]) << 2)
+ | (xFromEVEX2of4(insn->vectorExtensionPrefix[1]) << 1)
+ | (bFromEVEX2of4(insn->vectorExtensionPrefix[1]) << 0);
+ }
+
+ dbgprintf(insn, "Found EVEX prefix 0x%hhx 0x%hhx 0x%hhx 0x%hhx",
+ insn->vectorExtensionPrefix[0], insn->vectorExtensionPrefix[1],
+ insn->vectorExtensionPrefix[2], insn->vectorExtensionPrefix[3]);
+ }
+ } else if (byte == 0xc4) {
+ uint8_t byte1;
+
+ if (lookAtByte(insn, &byte1)) {
+ dbgprintf(insn, "Couldn't read second byte of VEX");
+ return -1;
+ }
+
+ if (insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0) {
+ insn->vectorExtensionType = TYPE_VEX_3B;
+ insn->necessaryPrefixLocation = insn->readerCursor - 1;
+ } else {
+ unconsumeByte(insn);
+ insn->necessaryPrefixLocation = insn->readerCursor - 1;
+ }
+
+ if (insn->vectorExtensionType == TYPE_VEX_3B) {
+ insn->vectorExtensionPrefix[0] = byte;
+ consumeByte(insn, &insn->vectorExtensionPrefix[1]);
+ consumeByte(insn, &insn->vectorExtensionPrefix[2]);
+
+ /* We simulate the REX prefix for simplicity's sake */
+
+ if (insn->mode == MODE_64BIT) {
+ insn->rexPrefix = 0x40
+ | (wFromVEX3of3(insn->vectorExtensionPrefix[2]) << 3)
+ | (rFromVEX2of3(insn->vectorExtensionPrefix[1]) << 2)
+ | (xFromVEX2of3(insn->vectorExtensionPrefix[1]) << 1)
+ | (bFromVEX2of3(insn->vectorExtensionPrefix[1]) << 0);
+ }
+
+ dbgprintf(insn, "Found VEX prefix 0x%hhx 0x%hhx 0x%hhx",
+ insn->vectorExtensionPrefix[0], insn->vectorExtensionPrefix[1],
+ insn->vectorExtensionPrefix[2]);
+ }
+ } else if (byte == 0xc5) {
+ uint8_t byte1;
+
+ if (lookAtByte(insn, &byte1)) {
+ dbgprintf(insn, "Couldn't read second byte of VEX");
+ return -1;
+ }
+
+ if (insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0) {
+ insn->vectorExtensionType = TYPE_VEX_2B;
+ } else {
+ unconsumeByte(insn);
+ }
+
+ if (insn->vectorExtensionType == TYPE_VEX_2B) {
+ insn->vectorExtensionPrefix[0] = byte;
+ consumeByte(insn, &insn->vectorExtensionPrefix[1]);
+
+ if (insn->mode == MODE_64BIT) {
+ insn->rexPrefix = 0x40
+ | (rFromVEX2of2(insn->vectorExtensionPrefix[1]) << 2);
+ }
+
+ switch (ppFromVEX2of2(insn->vectorExtensionPrefix[1])) {
+ default:
+ break;
+ case VEX_PREFIX_66:
+ hasOpSize = true;
+ break;
+ }
+
+ dbgprintf(insn, "Found VEX prefix 0x%hhx 0x%hhx",
+ insn->vectorExtensionPrefix[0],
+ insn->vectorExtensionPrefix[1]);
+ }
+ } else if (byte == 0x8f) {
+ uint8_t byte1;
+
+ if (lookAtByte(insn, &byte1)) {
+ dbgprintf(insn, "Couldn't read second byte of XOP");
+ return -1;
+ }
+
+ if ((byte1 & 0x38) != 0x0) { /* 0 in these 3 bits is a POP instruction. */
+ insn->vectorExtensionType = TYPE_XOP;
+ insn->necessaryPrefixLocation = insn->readerCursor - 1;
+ } else {
+ unconsumeByte(insn);
+ insn->necessaryPrefixLocation = insn->readerCursor - 1;
+ }
+
+ if (insn->vectorExtensionType == TYPE_XOP) {
+ insn->vectorExtensionPrefix[0] = byte;
+ consumeByte(insn, &insn->vectorExtensionPrefix[1]);
+ consumeByte(insn, &insn->vectorExtensionPrefix[2]);
+
+ /* We simulate the REX prefix for simplicity's sake */
+
+ if (insn->mode == MODE_64BIT) {
+ insn->rexPrefix = 0x40
+ | (wFromXOP3of3(insn->vectorExtensionPrefix[2]) << 3)
+ | (rFromXOP2of3(insn->vectorExtensionPrefix[1]) << 2)
+ | (xFromXOP2of3(insn->vectorExtensionPrefix[1]) << 1)
+ | (bFromXOP2of3(insn->vectorExtensionPrefix[1]) << 0);
+ }
+
+ switch (ppFromXOP3of3(insn->vectorExtensionPrefix[2])) {
+ default:
+ break;
+ case VEX_PREFIX_66:
+ hasOpSize = true;
+ break;
+ }
+
+ dbgprintf(insn, "Found XOP prefix 0x%hhx 0x%hhx 0x%hhx",
+ insn->vectorExtensionPrefix[0], insn->vectorExtensionPrefix[1],
+ insn->vectorExtensionPrefix[2]);
+ }
+ } else {
+ if (insn->mode == MODE_64BIT) {
+ if ((byte & 0xf0) == 0x40) {
+ uint8_t opcodeByte;
+
+ if (lookAtByte(insn, &opcodeByte) || ((opcodeByte & 0xf0) == 0x40)) {
+ dbgprintf(insn, "Redundant REX prefix");
+ return -1;
+ }
+
+ insn->rexPrefix = byte;
+ insn->necessaryPrefixLocation = insn->readerCursor - 2;
+
+ dbgprintf(insn, "Found REX prefix 0x%hhx", byte);
+ } else {
+ unconsumeByte(insn);
+ insn->necessaryPrefixLocation = insn->readerCursor - 1;
+ }
+ } else {
+ unconsumeByte(insn);
+ insn->necessaryPrefixLocation = insn->readerCursor - 1;
+ }
+ }
+
+ if (insn->mode == MODE_16BIT) {
+ insn->registerSize = (hasOpSize ? 4 : 2);
+ insn->addressSize = (hasAdSize ? 4 : 2);
+ insn->displacementSize = (hasAdSize ? 4 : 2);
+ insn->immediateSize = (hasOpSize ? 4 : 2);
+ } else if (insn->mode == MODE_32BIT) {
+ insn->registerSize = (hasOpSize ? 2 : 4);
+ insn->addressSize = (hasAdSize ? 2 : 4);
+ insn->displacementSize = (hasAdSize ? 2 : 4);
+ insn->immediateSize = (hasOpSize ? 2 : 4);
+ } else if (insn->mode == MODE_64BIT) {
+ if (insn->rexPrefix && wFromREX(insn->rexPrefix)) {
+ insn->registerSize = 8;
+ insn->addressSize = (hasAdSize ? 4 : 8);
+ insn->displacementSize = 4;
+ insn->immediateSize = 4;
+ } else if (insn->rexPrefix) {
+ insn->registerSize = (hasOpSize ? 2 : 4);
+ insn->addressSize = (hasAdSize ? 4 : 8);
+ insn->displacementSize = (hasOpSize ? 2 : 4);
+ insn->immediateSize = (hasOpSize ? 2 : 4);
+ } else {
+ insn->registerSize = (hasOpSize ? 2 : 4);
+ insn->addressSize = (hasAdSize ? 4 : 8);
+ insn->displacementSize = (hasOpSize ? 2 : 4);
+ insn->immediateSize = (hasOpSize ? 2 : 4);
+ }
+ }
+
+ return 0;
+}
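+
+/*
+ * As a concrete example of the size logic above: in 32-bit mode an 0x66
+ * prefix shrinks register and immediate sizes to 2 bytes, while in 64-bit
+ * mode a REX.W prefix widens registers to 8 bytes but leaves immediates at
+ * 4 bytes (the hardware sign-extends them).
+ */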
+
+/*
+ * readOpcode - Reads the opcode (excepting the ModR/M byte in the case of
+ * extended or escape opcodes).
+ *
+ * @param insn - The instruction whose opcode is to be read.
+ * @return - 0 if the opcode could be read successfully; nonzero otherwise.
+ */
+static int readOpcode(struct InternalInstruction* insn) {
+ /* Determine the length of the primary opcode */
+
+ uint8_t current;
+
+ dbgprintf(insn, "readOpcode()");
+
+ insn->opcodeType = ONEBYTE;
+
+ if (insn->vectorExtensionType == TYPE_EVEX) {
+ switch (mmFromEVEX2of4(insn->vectorExtensionPrefix[1])) {
+ default:
+ dbgprintf(insn, "Unhandled mm field for instruction (0x%hhx)",
+ mmFromEVEX2of4(insn->vectorExtensionPrefix[1]));
+ return -1;
+ case VEX_LOB_0F:
+ insn->opcodeType = TWOBYTE;
+ return consumeByte(insn, &insn->opcode);
+ case VEX_LOB_0F38:
+ insn->opcodeType = THREEBYTE_38;
+ return consumeByte(insn, &insn->opcode);
+ case VEX_LOB_0F3A:
+ insn->opcodeType = THREEBYTE_3A;
+ return consumeByte(insn, &insn->opcode);
+ }
+ } else if (insn->vectorExtensionType == TYPE_VEX_3B) {
+ switch (mmmmmFromVEX2of3(insn->vectorExtensionPrefix[1])) {
+ default:
+ dbgprintf(insn, "Unhandled m-mmmm field for instruction (0x%hhx)",
+ mmmmmFromVEX2of3(insn->vectorExtensionPrefix[1]));
+ return -1;
+ case VEX_LOB_0F:
+ insn->opcodeType = TWOBYTE;
+ return consumeByte(insn, &insn->opcode);
+ case VEX_LOB_0F38:
+ insn->opcodeType = THREEBYTE_38;
+ return consumeByte(insn, &insn->opcode);
+ case VEX_LOB_0F3A:
+ insn->opcodeType = THREEBYTE_3A;
+ return consumeByte(insn, &insn->opcode);
+ }
+ } else if (insn->vectorExtensionType == TYPE_VEX_2B) {
+ insn->opcodeType = TWOBYTE;
+ return consumeByte(insn, &insn->opcode);
+ } else if (insn->vectorExtensionType == TYPE_XOP) {
+ switch (mmmmmFromXOP2of3(insn->vectorExtensionPrefix[1])) {
+ default:
+ dbgprintf(insn, "Unhandled m-mmmm field for instruction (0x%hhx)",
+ mmmmmFromVEX2of3(insn->vectorExtensionPrefix[1]));
+ return -1;
+ case XOP_MAP_SELECT_8:
+ insn->opcodeType = XOP8_MAP;
+ return consumeByte(insn, &insn->opcode);
+ case XOP_MAP_SELECT_9:
+ insn->opcodeType = XOP9_MAP;
+ return consumeByte(insn, &insn->opcode);
+ case XOP_MAP_SELECT_A:
+ insn->opcodeType = XOPA_MAP;
+ return consumeByte(insn, &insn->opcode);
+ }
+ }
+
+ if (consumeByte(insn, &current))
+ return -1;
+
+ if (current == 0x0f) {
+ dbgprintf(insn, "Found a two-byte escape prefix (0x%hhx)", current);
+
+ if (consumeByte(insn, &current))
+ return -1;
+
+ if (current == 0x38) {
+ dbgprintf(insn, "Found a three-byte escape prefix (0x%hhx)", current);
+
+ if (consumeByte(insn, &current))
+ return -1;
+
+ insn->opcodeType = THREEBYTE_38;
+ } else if (current == 0x3a) {
+ dbgprintf(insn, "Found a three-byte escape prefix (0x%hhx)", current);
+
+ if (consumeByte(insn, &current))
+ return -1;
+
+ insn->opcodeType = THREEBYTE_3A;
+ } else {
+ dbgprintf(insn, "Didn't find a three-byte escape prefix");
+
+ insn->opcodeType = TWOBYTE;
+ }
+ }
+
+ /*
+ * At this point we have consumed the full opcode.
+ * Anything we consume from here on must be unconsumed.
+ */
+
+ insn->opcode = current;
+
+ return 0;
+}
+
+static int readModRM(struct InternalInstruction* insn);
+
+/*
+ * getIDWithAttrMask - Determines the ID of an instruction, consuming
+ * the ModR/M byte as appropriate for extended and escape opcodes,
+ * and using a supplied attribute mask.
+ *
+ * @param instructionID - A pointer whose target is filled in with the ID of the
+ * instruction.
+ * @param insn - The instruction whose ID is to be determined.
+ * @param attrMask - The attribute mask to search.
+ * @return - 0 if the ModR/M could be read when needed or was not
+ * needed; nonzero otherwise.
+ */
+static int getIDWithAttrMask(uint16_t* instructionID,
+ struct InternalInstruction* insn,
+ uint16_t attrMask) {
+ bool hasModRMExtension;
+
+ InstructionContext instructionClass = contextForAttrs(attrMask);
+
+ hasModRMExtension = modRMRequired(insn->opcodeType,
+ instructionClass,
+ insn->opcode);
+
+ if (hasModRMExtension) {
+ if (readModRM(insn))
+ return -1;
+
+ *instructionID = decode(insn->opcodeType,
+ instructionClass,
+ insn->opcode,
+ insn->modRM);
+ } else {
+ *instructionID = decode(insn->opcodeType,
+ instructionClass,
+ insn->opcode,
+ 0);
+ }
+
+ return 0;
+}
+
+/*
+ * is16BitEquivalent - Determines whether two instruction names refer to
+ * equivalent instructions but one is 16-bit whereas the other is not.
+ *
+ * @param orig  - The name of the instruction that is not 16-bit.
+ * @param equiv - The name of the instruction that is 16-bit.
+ * @return      - true if the names refer to equivalent instructions.
+ */
+static bool is16BitEquivalent(const char* orig, const char* equiv) {
+ off_t i;
+
+ for (i = 0;; i++) {
+ if (orig[i] == '\0' && equiv[i] == '\0')
+ return true;
+ if (orig[i] == '\0' || equiv[i] == '\0')
+ return false;
+ if (orig[i] != equiv[i]) {
+ if ((orig[i] == 'Q' || orig[i] == 'L') && equiv[i] == 'W')
+ continue;
+ if ((orig[i] == '6' || orig[i] == '3') && equiv[i] == '1')
+ continue;
+ if ((orig[i] == '4' || orig[i] == '2') && equiv[i] == '6')
+ continue;
+ return false;
+ }
+ }
+}
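+
+/*
+ * e.g., is16BitEquivalent("MOV32rr", "MOV16rr") is true: '3' pairs with '1'
+ * and '2' pairs with '6' under the substitutions above.
+ */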
+
+/*
+ * is64Bit - Determines whether this instruction is a 64-bit instruction.
+ *
+ * @param name - The name of the instruction to check.
+ */
+static bool is64Bit(const char* name) {
+ off_t i;
+
+ for (i = 0;; ++i) {
+ if (name[i] == '\0')
+ return false;
+ if (name[i] == '6' && name[i+1] == '4')
+ return true;
+ }
+}
+
+/*
+ * getID - Determines the ID of an instruction, consuming the ModR/M byte as
+ * appropriate for extended and escape opcodes. Determines the attributes and
+ * context for the instruction before doing so.
+ *
+ * @param insn - The instruction whose ID is to be determined.
+ * @return - 0 if the ModR/M could be read when needed or was not needed;
+ * nonzero otherwise.
+ */
+static int getID(struct InternalInstruction* insn, const void *miiArg) {
+ uint16_t attrMask;
+ uint16_t instructionID;
+
+ dbgprintf(insn, "getID()");
+
+ attrMask = ATTR_NONE;
+
+ if (insn->mode == MODE_64BIT)
+ attrMask |= ATTR_64BIT;
+
+ if (insn->vectorExtensionType != TYPE_NO_VEX_XOP) {
+ attrMask |= (insn->vectorExtensionType == TYPE_EVEX) ? ATTR_EVEX : ATTR_VEX;
+
+ if (insn->vectorExtensionType == TYPE_EVEX) {
+ switch (ppFromEVEX3of4(insn->vectorExtensionPrefix[2])) {
+ case VEX_PREFIX_66:
+ attrMask |= ATTR_OPSIZE;
+ break;
+ case VEX_PREFIX_F3:
+ attrMask |= ATTR_XS;
+ break;
+ case VEX_PREFIX_F2:
+ attrMask |= ATTR_XD;
+ break;
+ }
+
+ if (zFromEVEX4of4(insn->vectorExtensionPrefix[3]))
+ attrMask |= ATTR_EVEXKZ;
+ if (bFromEVEX4of4(insn->vectorExtensionPrefix[3]))
+ attrMask |= ATTR_EVEXB;
+ if (aaaFromEVEX4of4(insn->vectorExtensionPrefix[3]))
+ attrMask |= ATTR_EVEXK;
+ if (lFromEVEX4of4(insn->vectorExtensionPrefix[3]))
+ attrMask |= ATTR_EVEXL;
+ if (l2FromEVEX4of4(insn->vectorExtensionPrefix[3]))
+ attrMask |= ATTR_EVEXL2;
+ } else if (insn->vectorExtensionType == TYPE_VEX_3B) {
+ switch (ppFromVEX3of3(insn->vectorExtensionPrefix[2])) {
+ case VEX_PREFIX_66:
+ attrMask |= ATTR_OPSIZE;
+ break;
+ case VEX_PREFIX_F3:
+ attrMask |= ATTR_XS;
+ break;
+ case VEX_PREFIX_F2:
+ attrMask |= ATTR_XD;
+ break;
+ }
+
+ if (lFromVEX3of3(insn->vectorExtensionPrefix[2]))
+ attrMask |= ATTR_VEXL;
+ } else if (insn->vectorExtensionType == TYPE_VEX_2B) {
+ switch (ppFromVEX2of2(insn->vectorExtensionPrefix[1])) {
+ case VEX_PREFIX_66:
+ attrMask |= ATTR_OPSIZE;
+ break;
+ case VEX_PREFIX_F3:
+ attrMask |= ATTR_XS;
+ break;
+ case VEX_PREFIX_F2:
+ attrMask |= ATTR_XD;
+ break;
+ }
+
+ if (lFromVEX2of2(insn->vectorExtensionPrefix[1]))
+ attrMask |= ATTR_VEXL;
+ } else if (insn->vectorExtensionType == TYPE_XOP) {
+ switch (ppFromXOP3of3(insn->vectorExtensionPrefix[2])) {
+ case VEX_PREFIX_66:
+ attrMask |= ATTR_OPSIZE;
+ break;
+ case VEX_PREFIX_F3:
+ attrMask |= ATTR_XS;
+ break;
+ case VEX_PREFIX_F2:
+ attrMask |= ATTR_XD;
+ break;
+ }
+
+ if (lFromXOP3of3(insn->vectorExtensionPrefix[2]))
+ attrMask |= ATTR_VEXL;
+ } else {
+ return -1;
+ }
+ } else {
+ if (insn->mode != MODE_16BIT && isPrefixAtLocation(insn, 0x66, insn->necessaryPrefixLocation))
+ attrMask |= ATTR_OPSIZE;
+ else if (isPrefixAtLocation(insn, 0x67, insn->necessaryPrefixLocation))
+ attrMask |= ATTR_ADSIZE;
+ else if (isPrefixAtLocation(insn, 0xf3, insn->necessaryPrefixLocation))
+ attrMask |= ATTR_XS;
+ else if (isPrefixAtLocation(insn, 0xf2, insn->necessaryPrefixLocation))
+ attrMask |= ATTR_XD;
+ }
+
+ if (insn->rexPrefix & 0x08)
+ attrMask |= ATTR_REXW;
+
+ /*
+ * JCXZ/JECXZ need special handling for 16-bit mode because the meaning
+ * of the AdSize prefix is inverted w.r.t. 32-bit mode.
+ */
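+ // E.g., plain 0xE3 is JCXZ in 16-bit mode and JECXZ in 32-bit mode; a 0x67
+ // prefix flips each to the other.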
+ if (insn->mode == MODE_16BIT && insn->opcodeType == ONEBYTE &&
+ insn->opcode == 0xE3)
+ attrMask ^= ATTR_ADSIZE;
+
+ /*
+ * In 64-bit mode, all f64-superscripted opcodes ignore the operand-size
+ * prefix; CALL/JMP/Jcc instructions need to ignore 0x66 and consume 4 bytes.
+ */
+
+ if (insn->mode == MODE_64BIT &&
+ isPrefixAtLocation(insn, 0x66, insn->necessaryPrefixLocation)) {
+ switch (insn->opcode) {
+ case 0xE8:
+ case 0xE9:
+ // CALL/JMP rel32 (E8/E9). The ONEBYTE check keeps two-byte MMX ops such
+ // as psubsb (0F E8) unaffected.
+ if (insn->opcodeType == ONEBYTE) {
+ attrMask ^= ATTR_OPSIZE;
+ insn->immediateSize = 4;
+ insn->displacementSize = 4;
+ }
+ break;
+ case 0x82:
+ case 0x83:
+ case 0x84:
+ case 0x85:
+ case 0x86:
+ case 0x87:
+ case 0x88:
+ case 0x89:
+ case 0x8A:
+ case 0x8B:
+ case 0x8C:
+ case 0x8D:
+ case 0x8E:
+ case 0x8F:
+ // Jcc rel32 (0F 82..8F). The TWOBYTE check keeps one-byte ops such as
+ // lea (8D) and any three-byte ops unaffected.
+ if (insn->opcodeType == TWOBYTE) {
+ attrMask ^= ATTR_OPSIZE;
+ insn->immediateSize = 4;
+ insn->displacementSize = 4;
+ }
+ break;
+ }
+ }
+
+ if (getIDWithAttrMask(&instructionID, insn, attrMask))
+ return -1;
+
+ /* The following clauses compensate for limitations of the tables. */
+
+ if (insn->mode != MODE_64BIT &&
+ insn->vectorExtensionType != TYPE_NO_VEX_XOP) {
+ /*
+ * The tables can't distinguish between cases where the W-bit is used to
+ * select register size and cases where it's a required part of the opcode.
+ */
+ if ((insn->vectorExtensionType == TYPE_EVEX &&
+ wFromEVEX3of4(insn->vectorExtensionPrefix[2])) ||
+ (insn->vectorExtensionType == TYPE_VEX_3B &&
+ wFromVEX3of3(insn->vectorExtensionPrefix[2])) ||
+ (insn->vectorExtensionType == TYPE_XOP &&
+ wFromXOP3of3(insn->vectorExtensionPrefix[2]))) {
+
+ uint16_t instructionIDWithREXW;
+ if (getIDWithAttrMask(&instructionIDWithREXW,
+ insn, attrMask | ATTR_REXW)) {
+ insn->instructionID = instructionID;
+ insn->spec = specifierForUID(instructionID);
+ return 0;
+ }
+
+ const char *SpecName = GetInstrName(instructionIDWithREXW, miiArg);
+ // If the form with REX.W is not a 64-bit instruction, switch to it.
+ if (!is64Bit(SpecName)) {
+ insn->instructionID = instructionIDWithREXW;
+ insn->spec = specifierForUID(instructionIDWithREXW);
+ return 0;
+ }
+ }
+ }
+
+ /*
+ * Absolute moves need special handling.
+ * - In 16-bit mode the meanings of the AdSize and OpSize prefixes are
+ * inverted w.r.t. 32-bit mode.
+ * - In 32-bit mode we need to ensure the ADSIZE prefix is observed in
+ * any position.
+ */
+ if (insn->opcodeType == ONEBYTE && ((insn->opcode & 0xFC) == 0xA0)) {
+ /* Make sure we observed the prefixes in any position. */
+ if (insn->prefixPresent[0x67])
+ attrMask |= ATTR_ADSIZE;
+ if (insn->prefixPresent[0x66])
+ attrMask |= ATTR_OPSIZE;
+
+ /* In 16-bit, invert the attributes. */
+ if (insn->mode == MODE_16BIT)
+ attrMask ^= ATTR_ADSIZE | ATTR_OPSIZE;
+
+ if (getIDWithAttrMask(&instructionID, insn, attrMask))
+ return -1;
+
+ insn->instructionID = instructionID;
+ insn->spec = specifierForUID(instructionID);
+ return 0;
+ }
+
+ if ((insn->mode == MODE_16BIT || insn->prefixPresent[0x66]) &&
+ !(attrMask & ATTR_OPSIZE)) {
+ /*
+ * The instruction tables make no distinction between instructions that
+ * allow OpSize anywhere (i.e., 16-bit operations) and that need it in a
+ * particular spot (i.e., many MMX operations). In general we're
+ * conservative, but in the specific case where OpSize is present but not
+ * in the right place we check if there's a 16-bit operation.
+ */
+
+ const struct InstructionSpecifier *spec;
+ uint16_t instructionIDWithOpsize;
+ const char *specName, *specWithOpSizeName;
+
+ spec = specifierForUID(instructionID);
+
+ if (getIDWithAttrMask(&instructionIDWithOpsize,
+ insn,
+ attrMask | ATTR_OPSIZE)) {
+ /*
+ * ModRM required with OpSize but not present; give up and return the
+ * version without OpSize set
+ */
+
+ insn->instructionID = instructionID;
+ insn->spec = spec;
+ return 0;
+ }
+
+ specName = GetInstrName(instructionID, miiArg);
+ specWithOpSizeName = GetInstrName(instructionIDWithOpsize, miiArg);
+
+ if (is16BitEquivalent(specName, specWithOpSizeName) &&
+ (insn->mode == MODE_16BIT) ^ insn->prefixPresent[0x66]) {
+ insn->instructionID = instructionIDWithOpsize;
+ insn->spec = specifierForUID(instructionIDWithOpsize);
+ } else {
+ insn->instructionID = instructionID;
+ insn->spec = spec;
+ }
+ return 0;
+ }
+
+ if (insn->opcodeType == ONEBYTE && insn->opcode == 0x90 &&
+ insn->rexPrefix & 0x01) {
+ /*
+ * NOOP shouldn't decode as NOOP if REX.b is set. Instead
+ * it should decode as XCHG %r8d, %eax (or %r8, %rax with REX.W).
+ */
+
+ const struct InstructionSpecifier *spec;
+ uint16_t instructionIDWithNewOpcode;
+ const struct InstructionSpecifier *specWithNewOpcode;
+
+ spec = specifierForUID(instructionID);
+
+ /* Borrow opcode from one of the other XCHGar opcodes */
+ insn->opcode = 0x91;
+
+ if (getIDWithAttrMask(&instructionIDWithNewOpcode,
+ insn,
+ attrMask)) {
+ insn->opcode = 0x90;
+
+ insn->instructionID = instructionID;
+ insn->spec = spec;
+ return 0;
+ }
+
+ specWithNewOpcode = specifierForUID(instructionIDWithNewOpcode);
+
+ /* Change back */
+ insn->opcode = 0x90;
+
+ insn->instructionID = instructionIDWithNewOpcode;
+ insn->spec = specWithNewOpcode;
+
+ return 0;
+ }
+
+ insn->instructionID = instructionID;
+ insn->spec = specifierForUID(insn->instructionID);
+
+ return 0;
+}
+
+/*
+ * readSIB - Consumes the SIB byte to determine addressing information for an
+ * instruction.
+ *
+ * @param insn - The instruction whose SIB byte is to be read.
+ * @return - 0 if the SIB byte was successfully read; nonzero otherwise.
+ */
+static int readSIB(struct InternalInstruction* insn) {
+ SIBIndex sibIndexBase = SIB_INDEX_NONE;
+ SIBBase sibBaseBase = SIB_BASE_NONE;
+ uint8_t index, base;
+
+ dbgprintf(insn, "readSIB()");
+
+ if (insn->consumedSIB)
+ return 0;
+
+ insn->consumedSIB = true;
+
+ switch (insn->addressSize) {
+ case 2:
+ dbgprintf(insn, "SIB-based addressing doesn't work in 16-bit mode");
+ return -1;
+ case 4:
+ sibIndexBase = SIB_INDEX_EAX;
+ sibBaseBase = SIB_BASE_EAX;
+ break;
+ case 8:
+ sibIndexBase = SIB_INDEX_RAX;
+ sibBaseBase = SIB_BASE_RAX;
+ break;
+ }
+
+ if (consumeByte(insn, &insn->sib))
+ return -1;
+
+ index = indexFromSIB(insn->sib) | (xFromREX(insn->rexPrefix) << 3);
+
+ // FIXME: The fifth bit (bit index 4) is only to be used for instructions
+ // that understand VSIB indexing. ORing the bit in here is mildly dangerous
+ // because performing math on an 'enum SIBIndex' can produce garbage.
+ // Excluding the "none" value, it should cover 6 spaces of register names:
+ // - 16 possibilities for 16-bit GPR starting at SIB_INDEX_BX_SI
+ // - 16 possibilities for 32-bit GPR starting at SIB_INDEX_EAX
+ // - 16 possibilities for 64-bit GPR starting at SIB_INDEX_RAX
+ // - 32 possibilities for each of XMM, YMM, ZMM registers
+ // When sibIndexBase gets assigned SIB_INDEX_RAX as it does in 64-bit mode,
+ // summing in a fully decoded index between 0 and 31 can end up with a value
+ // that looks like something in the low half of the XMM range.
+ // translateRMMemory() tries to reverse the damage, with only partial success,
+ // as evidenced by known bugs in "test/MC/Disassembler/X86/x86-64.txt"
+ if (insn->vectorExtensionType == TYPE_EVEX)
+ index |= v2FromEVEX4of4(insn->vectorExtensionPrefix[3]) << 4;
+
+ if (index == 0x4) {
+ insn->sibIndex = SIB_INDEX_NONE;
+ } else {
+ insn->sibIndex = (SIBIndex)(sibIndexBase + index);
+ }
+
+ insn->sibScale = 1 << scaleFromSIB(insn->sib);
+
+ base = baseFromSIB(insn->sib) | (bFromREX(insn->rexPrefix) << 3);
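+ // E.g., SIB byte 0x98 with no REX in 32-bit mode: scale = 1 << 0b10 = 4,
+ // index = 0b011 (EBX), base = 0b000 (EAX), i.e., [eax + ebx*4].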
+
+ switch (base) {
+ case 0x5:
+ case 0xd:
+ switch (modFromModRM(insn->modRM)) {
+ case 0x0:
+ insn->eaDisplacement = EA_DISP_32;
+ insn->sibBase = SIB_BASE_NONE;
+ break;
+ case 0x1:
+ insn->eaDisplacement = EA_DISP_8;
+ insn->sibBase = (SIBBase)(sibBaseBase + base);
+ break;
+ case 0x2:
+ insn->eaDisplacement = EA_DISP_32;
+ insn->sibBase = (SIBBase)(sibBaseBase + base);
+ break;
+ case 0x3:
+ debug("Cannot have Mod = 0b11 and a SIB byte");
+ return -1;
+ }
+ break;
+ default:
+ insn->sibBase = (SIBBase)(sibBaseBase + base);
+ break;
+ }
+
+ return 0;
+}
+
+/*
+ * readDisplacement - Consumes the displacement of an instruction.
+ *
+ * @param insn - The instruction whose displacement is to be read.
+ * @return - 0 if the displacement was successfully read; nonzero
+ * otherwise.
+ */
+static int readDisplacement(struct InternalInstruction* insn) {
+ int8_t d8;
+ int16_t d16;
+ int32_t d32;
+
+ dbgprintf(insn, "readDisplacement()");
+
+ if (insn->consumedDisplacement)
+ return 0;
+
+ insn->consumedDisplacement = true;
+ insn->displacementOffset = insn->readerCursor - insn->startLocation;
+
+ switch (insn->eaDisplacement) {
+ case EA_DISP_NONE:
+ insn->consumedDisplacement = false;
+ break;
+ case EA_DISP_8:
+ if (consumeInt8(insn, &d8))
+ return -1;
+ insn->displacement = d8;
+ break;
+ case EA_DISP_16:
+ if (consumeInt16(insn, &d16))
+ return -1;
+ insn->displacement = d16;
+ break;
+ case EA_DISP_32:
+ if (consumeInt32(insn, &d32))
+ return -1;
+ insn->displacement = d32;
+ break;
+ }
+
+ insn->consumedDisplacement = true;
+ return 0;
+}
+
+/*
+ * readModRM - Consumes all addressing information (ModR/M byte, SIB byte, and
+ * displacement) for an instruction and interprets it.
+ *
+ * @param insn - The instruction whose addressing information is to be read.
+ * @return - 0 if the information was successfully read; nonzero otherwise.
+ */
+static int readModRM(struct InternalInstruction* insn) {
+ uint8_t mod, rm, reg;
+
+ dbgprintf(insn, "readModRM()");
+
+ if (insn->consumedModRM)
+ return 0;
+
+ if (consumeByte(insn, &insn->modRM))
+ return -1;
+ insn->consumedModRM = true;
+
+ mod = modFromModRM(insn->modRM);
+ rm = rmFromModRM(insn->modRM);
+ reg = regFromModRM(insn->modRM);
+
+ /*
+ * This goes by insn->registerSize to pick the correct register, which messes
+ * up if we're using (say) XMM or 8-bit register operands. That gets fixed in
+ * fixupReg().
+ */
+ switch (insn->registerSize) {
+ case 2:
+ insn->regBase = MODRM_REG_AX;
+ insn->eaRegBase = EA_REG_AX;
+ break;
+ case 4:
+ insn->regBase = MODRM_REG_EAX;
+ insn->eaRegBase = EA_REG_EAX;
+ break;
+ case 8:
+ insn->regBase = MODRM_REG_RAX;
+ insn->eaRegBase = EA_REG_RAX;
+ break;
+ }
+
+ reg |= rFromREX(insn->rexPrefix) << 3;
+ rm |= bFromREX(insn->rexPrefix) << 3;
+ if (insn->vectorExtensionType == TYPE_EVEX) {
+ reg |= r2FromEVEX2of4(insn->vectorExtensionPrefix[1]) << 4;
+ rm |= xFromEVEX2of4(insn->vectorExtensionPrefix[1]) << 4;
+ }
+
+ insn->reg = (Reg)(insn->regBase + reg);
+
+ switch (insn->addressSize) {
+ case 2:
+ insn->eaBaseBase = EA_BASE_BX_SI;
+
+ switch (mod) {
+ case 0x0:
+ if (rm == 0x6) {
+ insn->eaBase = EA_BASE_NONE;
+ insn->eaDisplacement = EA_DISP_16;
+ if (readDisplacement(insn))
+ return -1;
+ } else {
+ insn->eaBase = (EABase)(insn->eaBaseBase + rm);
+ insn->eaDisplacement = EA_DISP_NONE;
+ }
+ break;
+ case 0x1:
+ insn->eaBase = (EABase)(insn->eaBaseBase + rm);
+ insn->eaDisplacement = EA_DISP_8;
+ insn->displacementSize = 1;
+ if (readDisplacement(insn))
+ return -1;
+ break;
+ case 0x2:
+ insn->eaBase = (EABase)(insn->eaBaseBase + rm);
+ insn->eaDisplacement = EA_DISP_16;
+ if (readDisplacement(insn))
+ return -1;
+ break;
+ case 0x3:
+ insn->eaBase = (EABase)(insn->eaRegBase + rm);
+ if (readDisplacement(insn))
+ return -1;
+ break;
+ }
+ break;
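+ // E.g., in 16-bit mode ModRM 0x46 (mod = 01, rm = 110) selects [bp + disp8].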
+ case 4:
+ case 8:
+ insn->eaBaseBase = (insn->addressSize == 4 ? EA_BASE_EAX : EA_BASE_RAX);
+
+ switch (mod) {
+ case 0x0:
+ insn->eaDisplacement = EA_DISP_NONE; /* readSIB may override this */
+ // In determining whether RIP-relative mode is used (rm=5),
+ // or whether a SIB byte is present (rm=4),
+ // the extension bits (REX.b and EVEX.x) are ignored.
+ switch (rm & 7) {
+ case 0x4: // SIB byte is present
+ insn->eaBase = (insn->addressSize == 4 ?
+ EA_BASE_sib : EA_BASE_sib64);
+ if (readSIB(insn) || readDisplacement(insn))
+ return -1;
+ break;
+ case 0x5: // RIP-relative
+ insn->eaBase = EA_BASE_NONE;
+ insn->eaDisplacement = EA_DISP_32;
+ if (readDisplacement(insn))
+ return -1;
+ break;
+ default:
+ insn->eaBase = (EABase)(insn->eaBaseBase + rm);
+ break;
+ }
+ break;
+ case 0x1:
+ insn->displacementSize = 1;
+ /* FALLTHROUGH */
+ case 0x2:
+ insn->eaDisplacement = (mod == 0x1 ? EA_DISP_8 : EA_DISP_32);
+ switch (rm & 7) {
+ case 0x4: // SIB byte is present
+ insn->eaBase = EA_BASE_sib;
+ if (readSIB(insn) || readDisplacement(insn))
+ return -1;
+ break;
+ default:
+ insn->eaBase = (EABase)(insn->eaBaseBase + rm);
+ if (readDisplacement(insn))
+ return -1;
+ break;
+ }
+ break;
+ case 0x3:
+ insn->eaDisplacement = EA_DISP_NONE;
+ insn->eaBase = (EABase)(insn->eaRegBase + rm);
+ break;
+ }
+ break;
+ } /* switch (insn->addressSize) */
+
+ return 0;
+}
+
+#define GENERIC_FIXUP_FUNC(name, base, prefix) \
+ static uint8_t name(struct InternalInstruction *insn, \
+ OperandType type, \
+ uint8_t index, \
+ uint8_t *valid) { \
+ *valid = 1; \
+ switch (type) { \
+ default: \
+ debug("Unhandled register type"); \
+ *valid = 0; \
+ return 0; \
+ case TYPE_Rv: \
+ return base + index; \
+ case TYPE_R8: \
+ if (insn->rexPrefix && \
+ index >= 4 && index <= 7) { \
+ return prefix##_SPL + (index - 4); \
+ } else { \
+ return prefix##_AL + index; \
+ } \
+ case TYPE_R16: \
+ return prefix##_AX + index; \
+ case TYPE_R32: \
+ return prefix##_EAX + index; \
+ case TYPE_R64: \
+ return prefix##_RAX + index; \
+ case TYPE_XMM512: \
+ return prefix##_ZMM0 + index; \
+ case TYPE_XMM256: \
+ return prefix##_YMM0 + index; \
+ case TYPE_XMM128: \
+ case TYPE_XMM64: \
+ case TYPE_XMM32: \
+ case TYPE_XMM: \
+ return prefix##_XMM0 + index; \
+ case TYPE_VK1: \
+ case TYPE_VK2: \
+ case TYPE_VK4: \
+ case TYPE_VK8: \
+ case TYPE_VK16: \
+ case TYPE_VK32: \
+ case TYPE_VK64: \
+ if (index > 7) \
+ *valid = 0; \
+ return prefix##_K0 + index; \
+ case TYPE_MM64: \
+ return prefix##_MM0 + (index & 0x7); \
+ case TYPE_SEGMENTREG: \
+ if (index > 5) \
+ *valid = 0; \
+ return prefix##_ES + index; \
+ case TYPE_DEBUGREG: \
+ return prefix##_DR0 + index; \
+ case TYPE_CONTROLREG: \
+ return prefix##_CR0 + index; \
+ } \
+ }
+
+/*
+ * fixup*Value - Consults an operand type to determine the meaning of the
+ * reg or R/M field. If the operand is an XMM operand, for example, the
+ * register in question is XMM0 instead of AX, which is what readModRM()
+ * would otherwise report.
+ *
+ * @param insn - The instruction containing the operand.
+ * @param type - The operand type.
+ * @param index - The existing value of the field as reported by readModRM().
+ * @param valid - The address of a uint8_t. The target is set to 1 if the
+ * field is valid for the register class; 0 if not.
+ * @return - The proper value.
+ */
+GENERIC_FIXUP_FUNC(fixupRegValue, insn->regBase, MODRM_REG)
+GENERIC_FIXUP_FUNC(fixupRMValue, insn->eaRegBase, EA_REG)
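+// For example, for an operand of type TYPE_XMM128 with index 2,
+// fixupRegValue() returns MODRM_REG_XMM2 and fixupRMValue() returns
+// EA_REG_XMM2.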
+
+/*
+ * fixupReg - Consults an operand specifier to determine which of the
+ * fixup*Value functions to use in correcting readModRM()'s interpretation.
+ *
+ * @param insn - See fixup*Value().
+ * @param op - The operand specifier.
+ * @return - 0 if fixup was successful; -1 if the register returned was
+ * invalid for its class.
+ */
+static int fixupReg(struct InternalInstruction *insn,
+ const struct OperandSpecifier *op) {
+ uint8_t valid;
+
+ dbgprintf(insn, "fixupReg()");
+
+ switch ((OperandEncoding)op->encoding) {
+ default:
+ debug("Expected a REG or R/M encoding in fixupReg");
+ return -1;
+ case ENCODING_VVVV:
+ insn->vvvv = (Reg)fixupRegValue(insn,
+ (OperandType)op->type,
+ insn->vvvv,
+ &valid);
+ if (!valid)
+ return -1;
+ break;
+ case ENCODING_REG:
+ insn->reg = (Reg)fixupRegValue(insn,
+ (OperandType)op->type,
+ insn->reg - insn->regBase,
+ &valid);
+ if (!valid)
+ return -1;
+ break;
+ CASE_ENCODING_RM:
+ if (insn->eaBase >= insn->eaRegBase) {
+ insn->eaBase = (EABase)fixupRMValue(insn,
+ (OperandType)op->type,
+ insn->eaBase - insn->eaRegBase,
+ &valid);
+ if (!valid)
+ return -1;
+ }
+ break;
+ }
+
+ return 0;
+}
+
+/*
+ * readOpcodeRegister - Reads an operand from the opcode field of an
+ * instruction and interprets it appropriately given the operand width.
+ * Handles AddRegFrm instructions.
+ *
+ * @param insn - the instruction whose opcode field is to be read.
+ * @param size - The width (in bytes) of the register being specified.
+ * 1 means AL and friends, 2 means AX, 4 means EAX, and 8 means
+ * RAX.
+ * @return - 0 on success; nonzero otherwise.
+ */
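+// E.g., opcode 0xBB (0xB8+rd is MOV r32, imm32) with REX.B set and size 4:
+// (1 << 3) | (0xBB & 7) = 11, so opcodeRegister = MODRM_REG_EAX + 11 (R11D).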
+static int readOpcodeRegister(struct InternalInstruction* insn, uint8_t size) {
+ dbgprintf(insn, "readOpcodeRegister()");
+
+ if (size == 0)
+ size = insn->registerSize;
+
+ switch (size) {
+ case 1:
+ insn->opcodeRegister = (Reg)(MODRM_REG_AL + ((bFromREX(insn->rexPrefix) << 3)
+ | (insn->opcode & 7)));
+ if (insn->rexPrefix &&
+ insn->opcodeRegister >= MODRM_REG_AL + 0x4 &&
+ insn->opcodeRegister < MODRM_REG_AL + 0x8) {
+ insn->opcodeRegister = (Reg)(MODRM_REG_SPL
+ + (insn->opcodeRegister - MODRM_REG_AL - 4));
+ }
+
+ break;
+ case 2:
+ insn->opcodeRegister = (Reg)(MODRM_REG_AX
+ + ((bFromREX(insn->rexPrefix) << 3)
+ | (insn->opcode & 7)));
+ break;
+ case 4:
+ insn->opcodeRegister = (Reg)(MODRM_REG_EAX
+ + ((bFromREX(insn->rexPrefix) << 3)
+ | (insn->opcode & 7)));
+ break;
+ case 8:
+ insn->opcodeRegister = (Reg)(MODRM_REG_RAX
+ + ((bFromREX(insn->rexPrefix) << 3)
+ | (insn->opcode & 7)));
+ break;
+ }
+
+ return 0;
+}
+
+/*
+ * readImmediate - Consumes an immediate operand from an instruction, given the
+ * desired operand size.
+ *
+ * @param insn - The instruction whose operand is to be read.
+ * @param size - The width (in bytes) of the operand.
+ * @return - 0 if the immediate was successfully consumed; nonzero
+ * otherwise.
+ */
+static int readImmediate(struct InternalInstruction* insn, uint8_t size) {
+ uint8_t imm8;
+ uint16_t imm16;
+ uint32_t imm32;
+ uint64_t imm64;
+
+ dbgprintf(insn, "readImmediate()");
+
+ if (insn->numImmediatesConsumed == 2) {
+ debug("Already consumed two immediates");
+ return -1;
+ }
+
+ if (size == 0)
+ size = insn->immediateSize;
+ else
+ insn->immediateSize = size;
+ insn->immediateOffset = insn->readerCursor - insn->startLocation;
+
+ switch (size) {
+ case 1:
+ if (consumeByte(insn, &imm8))
+ return -1;
+ insn->immediates[insn->numImmediatesConsumed] = imm8;
+ break;
+ case 2:
+ if (consumeUInt16(insn, &imm16))
+ return -1;
+ insn->immediates[insn->numImmediatesConsumed] = imm16;
+ break;
+ case 4:
+ if (consumeUInt32(insn, &imm32))
+ return -1;
+ insn->immediates[insn->numImmediatesConsumed] = imm32;
+ break;
+ case 8:
+ if (consumeUInt64(insn, &imm64))
+ return -1;
+ insn->immediates[insn->numImmediatesConsumed] = imm64;
+ break;
+ }
+
+ insn->numImmediatesConsumed++;
+
+ return 0;
+}
+
+/*
+ * readVVVV - Consumes vvvv from an instruction if it has a VEX, EVEX, or
+ * XOP prefix.
+ *
+ * @param insn - The instruction whose operand is to be read.
+ * @return - 0 if the vvvv was successfully consumed; nonzero
+ * otherwise.
+ */
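+// Note: vvvv is stored inverted (one's complement) in the prefix itself; the
+// vvvvFrom* accessors already undo the inversion, and non-64-bit modes mask
+// the result to 3 bits below.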
+static int readVVVV(struct InternalInstruction* insn) {
+ dbgprintf(insn, "readVVVV()");
+
+ int vvvv;
+ if (insn->vectorExtensionType == TYPE_EVEX)
+ vvvv = (v2FromEVEX4of4(insn->vectorExtensionPrefix[3]) << 4 |
+ vvvvFromEVEX3of4(insn->vectorExtensionPrefix[2]));
+ else if (insn->vectorExtensionType == TYPE_VEX_3B)
+ vvvv = vvvvFromVEX3of3(insn->vectorExtensionPrefix[2]);
+ else if (insn->vectorExtensionType == TYPE_VEX_2B)
+ vvvv = vvvvFromVEX2of2(insn->vectorExtensionPrefix[1]);
+ else if (insn->vectorExtensionType == TYPE_XOP)
+ vvvv = vvvvFromXOP3of3(insn->vectorExtensionPrefix[2]);
+ else
+ return -1;
+
+ if (insn->mode != MODE_64BIT)
+ vvvv &= 0x7;
+
+ insn->vvvv = static_cast<Reg>(vvvv);
+ return 0;
+}
+
+/*
+ * readMaskRegister - Reads the AVX-512 write-mask register from the EVEX.aaa
+ * field of an instruction's EVEX prefix.
+ *
+ * @param insn - The instruction whose EVEX prefix is to be read.
+ * @return - 0 on success; nonzero otherwise.
+ */
+static int readMaskRegister(struct InternalInstruction* insn) {
+ dbgprintf(insn, "readMaskRegister()");
+
+ if (insn->vectorExtensionType != TYPE_EVEX)
+ return -1;
+
+ insn->writemask =
+ static_cast<Reg>(aaaFromEVEX4of4(insn->vectorExtensionPrefix[3]));
+ return 0;
+}
+
+/*
+ * readOperands - Consults the specifier for an instruction and consumes all
+ * operands for that instruction, interpreting them as it goes.
+ *
+ * @param insn - The instruction whose operands are to be read and interpreted.
+ * @return - 0 if all operands could be read; nonzero otherwise.
+ */
+static int readOperands(struct InternalInstruction* insn) {
+ int hasVVVV, needVVVV;
+ int sawRegImm = 0;
+
+ dbgprintf(insn, "readOperands()");
+
+ /* If a non-zero vvvv was specified, we need to make sure one of the
+ operands uses it. */
+ hasVVVV = !readVVVV(insn);
+ needVVVV = hasVVVV && (insn->vvvv != 0);
+
+ for (const auto &Op : x86OperandSets[insn->spec->operands]) {
+ switch (Op.encoding) {
+ case ENCODING_NONE:
+ case ENCODING_SI:
+ case ENCODING_DI:
+ break;
+ case ENCODING_REG:
+ CASE_ENCODING_RM:
+ if (readModRM(insn))
+ return -1;
+ if (fixupReg(insn, &Op))
+ return -1;
+ // Apply the AVX512 compressed displacement scaling factor.
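+ // (CASE_ENCODING_RM groups ENCODING_RM with its compressed-displacement
+ // variants ENCODING_RM_CD2, CD4, ...; each variant's offset from
+ // ENCODING_RM is log2 of the scale, so a CD4 operand scales disp8 by 4.)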
+ if (Op.encoding != ENCODING_REG && insn->eaDisplacement == EA_DISP_8)
+ insn->displacement *= 1 << (Op.encoding - ENCODING_RM);
+ break;
+ case ENCODING_CB:
+ case ENCODING_CW:
+ case ENCODING_CD:
+ case ENCODING_CP:
+ case ENCODING_CO:
+ case ENCODING_CT:
+ dbgprintf(insn, "We currently don't hande code-offset encodings");
+ return -1;
+ case ENCODING_IB:
+ if (sawRegImm) {
+ /* Saw a register immediate so don't read again and instead split the
+ previous immediate. FIXME: This is a hack. */
+ insn->immediates[insn->numImmediatesConsumed] =
+ insn->immediates[insn->numImmediatesConsumed - 1] & 0xf;
+ ++insn->numImmediatesConsumed;
+ break;
+ }
+ if (readImmediate(insn, 1))
+ return -1;
+ if (Op.type == TYPE_XMM128 ||
+ Op.type == TYPE_XMM256)
+ sawRegImm = 1;
+ break;
+ case ENCODING_IW:
+ if (readImmediate(insn, 2))
+ return -1;
+ break;
+ case ENCODING_ID:
+ if (readImmediate(insn, 4))
+ return -1;
+ break;
+ case ENCODING_IO:
+ if (readImmediate(insn, 8))
+ return -1;
+ break;
+ case ENCODING_Iv:
+ if (readImmediate(insn, insn->immediateSize))
+ return -1;
+ break;
+ case ENCODING_Ia:
+ if (readImmediate(insn, insn->addressSize))
+ return -1;
+ break;
+ case ENCODING_RB:
+ if (readOpcodeRegister(insn, 1))
+ return -1;
+ break;
+ case ENCODING_RW:
+ if (readOpcodeRegister(insn, 2))
+ return -1;
+ break;
+ case ENCODING_RD:
+ if (readOpcodeRegister(insn, 4))
+ return -1;
+ break;
+ case ENCODING_RO:
+ if (readOpcodeRegister(insn, 8))
+ return -1;
+ break;
+ case ENCODING_Rv:
+ if (readOpcodeRegister(insn, 0))
+ return -1;
+ break;
+ case ENCODING_FP:
+ break;
+ case ENCODING_VVVV:
+ needVVVV = 0; /* Mark that we have found a VVVV operand. */
+ if (!hasVVVV)
+ return -1;
+ if (fixupReg(insn, &Op))
+ return -1;
+ break;
+ case ENCODING_WRITEMASK:
+ if (readMaskRegister(insn))
+ return -1;
+ break;
+ case ENCODING_DUP:
+ break;
+ default:
+ dbgprintf(insn, "Encountered an operand with an unknown encoding.");
+ return -1;
+ }
+ }
+
+ /* If we didn't find ENCODING_VVVV operand, but non-zero vvvv present, fail */
+ if (needVVVV) return -1;
+
+ return 0;
+}
+
+/*
+ * decodeInstruction - Reads and interprets a full instruction provided by the
+ * user.
+ *
+ * @param insn - A pointer to the instruction to be populated. Must be
+ * pre-allocated.
+ * @param reader - The function to be used to read the instruction's bytes.
+ * @param readerArg - A generic argument to be passed to the reader to store
+ * any internal state.
+ * @param logger - If non-NULL, the function to be used to write log messages
+ * and warnings.
+ * @param loggerArg - A generic argument to be passed to the logger to store
+ * any internal state.
+ * @param startLoc - The address (in the reader's address space) of the first
+ * byte in the instruction.
+ * @param mode - The mode (real mode, IA-32e, or IA-32e in 64-bit mode) to
+ * decode the instruction in.
+ * @return - 0 if the instruction's memory could be read; nonzero if
+ * not.
+ */
+int llvm::X86Disassembler::decodeInstruction(
+ struct InternalInstruction *insn, byteReader_t reader,
+ const void *readerArg, dlog_t logger, void *loggerArg, const void *miiArg,
+ uint64_t startLoc, DisassemblerMode mode) {
+ memset(insn, 0, sizeof(struct InternalInstruction));
+
+ insn->reader = reader;
+ insn->readerArg = readerArg;
+ insn->dlog = logger;
+ insn->dlogArg = loggerArg;
+ insn->startLocation = startLoc;
+ insn->readerCursor = startLoc;
+ insn->mode = mode;
+ insn->numImmediatesConsumed = 0;
+
+ if (readPrefixes(insn) ||
+ readOpcode(insn) ||
+ getID(insn, miiArg) ||
+ insn->instructionID == 0 ||
+ readOperands(insn))
+ return -1;
+
+ insn->operands = x86OperandSets[insn->spec->operands];
+
+ insn->length = insn->readerCursor - insn->startLocation;
+
+ dbgprintf(insn, "Read from 0x%llx to 0x%llx: length %zu",
+ startLoc, insn->readerCursor, insn->length);
+
+ if (insn->length > 15)
+ dbgprintf(insn, "Instruction exceeds 15-byte limit");
+
+ return 0;
+}
diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
new file mode 100644
index 0000000..28a628e
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
@@ -0,0 +1,675 @@
+//===-- X86DisassemblerDecoder.h - Disassembler decoder ---------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is part of the X86 Disassembler.
+// It contains the public interface of the instruction decoder.
+// Documentation for the disassembler can be found in X86Disassembler.h.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_DISASSEMBLER_X86DISASSEMBLERDECODER_H
+#define LLVM_LIB_TARGET_X86_DISASSEMBLER_X86DISASSEMBLERDECODER_H
+
+#include "X86DisassemblerDecoderCommon.h"
+#include "llvm/ADT/ArrayRef.h"
+
+namespace llvm {
+namespace X86Disassembler {
+
+// Accessor functions for various fields of an Intel instruction
+#define modFromModRM(modRM) (((modRM) & 0xc0) >> 6)
+#define regFromModRM(modRM) (((modRM) & 0x38) >> 3)
+#define rmFromModRM(modRM) ((modRM) & 0x7)
+#define scaleFromSIB(sib) (((sib) & 0xc0) >> 6)
+#define indexFromSIB(sib) (((sib) & 0x38) >> 3)
+#define baseFromSIB(sib) ((sib) & 0x7)
+#define wFromREX(rex) (((rex) & 0x8) >> 3)
+#define rFromREX(rex) (((rex) & 0x4) >> 2)
+#define xFromREX(rex) (((rex) & 0x2) >> 1)
+#define bFromREX(rex) ((rex) & 0x1)
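+// E.g., for a ModRM byte of 0x5C: mod = 0b01, reg = 0b011 (3), rm = 0b100 (4).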
+
+#define rFromEVEX2of4(evex) (((~(evex)) & 0x80) >> 7)
+#define xFromEVEX2of4(evex) (((~(evex)) & 0x40) >> 6)
+#define bFromEVEX2of4(evex) (((~(evex)) & 0x20) >> 5)
+#define r2FromEVEX2of4(evex) (((~(evex)) & 0x10) >> 4)
+#define mmFromEVEX2of4(evex) ((evex) & 0x3)
+#define wFromEVEX3of4(evex) (((evex) & 0x80) >> 7)
+#define vvvvFromEVEX3of4(evex) (((~(evex)) & 0x78) >> 3)
+#define ppFromEVEX3of4(evex) ((evex) & 0x3)
+#define zFromEVEX4of4(evex) (((evex) & 0x80) >> 7)
+#define l2FromEVEX4of4(evex) (((evex) & 0x40) >> 6)
+#define lFromEVEX4of4(evex) (((evex) & 0x20) >> 5)
+#define bFromEVEX4of4(evex) (((evex) & 0x10) >> 4)
+#define v2FromEVEX4of4(evex) (((~evex) & 0x8) >> 3)
+#define aaaFromEVEX4of4(evex) ((evex) & 0x7)
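+// Note: the EVEX R, X, B, R' and V' bits (and vvvv, here and in the VEX/XOP
+// forms below) are stored inverted in the encoding; the ~ in these accessors
+// recovers the logical value.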
+
+#define rFromVEX2of3(vex) (((~(vex)) & 0x80) >> 7)
+#define xFromVEX2of3(vex) (((~(vex)) & 0x40) >> 6)
+#define bFromVEX2of3(vex) (((~(vex)) & 0x20) >> 5)
+#define mmmmmFromVEX2of3(vex) ((vex) & 0x1f)
+#define wFromVEX3of3(vex) (((vex) & 0x80) >> 7)
+#define vvvvFromVEX3of3(vex) (((~(vex)) & 0x78) >> 3)
+#define lFromVEX3of3(vex) (((vex) & 0x4) >> 2)
+#define ppFromVEX3of3(vex) ((vex) & 0x3)
+
+#define rFromVEX2of2(vex) (((~(vex)) & 0x80) >> 7)
+#define vvvvFromVEX2of2(vex) (((~(vex)) & 0x78) >> 3)
+#define lFromVEX2of2(vex) (((vex) & 0x4) >> 2)
+#define ppFromVEX2of2(vex) ((vex) & 0x3)
+
+#define rFromXOP2of3(xop) (((~(xop)) & 0x80) >> 7)
+#define xFromXOP2of3(xop) (((~(xop)) & 0x40) >> 6)
+#define bFromXOP2of3(xop) (((~(xop)) & 0x20) >> 5)
+#define mmmmmFromXOP2of3(xop) ((xop) & 0x1f)
+#define wFromXOP3of3(xop) (((xop) & 0x80) >> 7)
+#define vvvvFromXOP3of3(vex) (((~(vex)) & 0x78) >> 3)
+#define lFromXOP3of3(xop) (((xop) & 0x4) >> 2)
+#define ppFromXOP3of3(xop) ((xop) & 0x3)
+
+// These enums represent Intel registers for use by the decoder.
+#define REGS_8BIT \
+ ENTRY(AL) \
+ ENTRY(CL) \
+ ENTRY(DL) \
+ ENTRY(BL) \
+ ENTRY(AH) \
+ ENTRY(CH) \
+ ENTRY(DH) \
+ ENTRY(BH) \
+ ENTRY(R8B) \
+ ENTRY(R9B) \
+ ENTRY(R10B) \
+ ENTRY(R11B) \
+ ENTRY(R12B) \
+ ENTRY(R13B) \
+ ENTRY(R14B) \
+ ENTRY(R15B) \
+ ENTRY(SPL) \
+ ENTRY(BPL) \
+ ENTRY(SIL) \
+ ENTRY(DIL)
+
+#define EA_BASES_16BIT \
+ ENTRY(BX_SI) \
+ ENTRY(BX_DI) \
+ ENTRY(BP_SI) \
+ ENTRY(BP_DI) \
+ ENTRY(SI) \
+ ENTRY(DI) \
+ ENTRY(BP) \
+ ENTRY(BX) \
+ ENTRY(R8W) \
+ ENTRY(R9W) \
+ ENTRY(R10W) \
+ ENTRY(R11W) \
+ ENTRY(R12W) \
+ ENTRY(R13W) \
+ ENTRY(R14W) \
+ ENTRY(R15W)
+
+#define REGS_16BIT \
+ ENTRY(AX) \
+ ENTRY(CX) \
+ ENTRY(DX) \
+ ENTRY(BX) \
+ ENTRY(SP) \
+ ENTRY(BP) \
+ ENTRY(SI) \
+ ENTRY(DI) \
+ ENTRY(R8W) \
+ ENTRY(R9W) \
+ ENTRY(R10W) \
+ ENTRY(R11W) \
+ ENTRY(R12W) \
+ ENTRY(R13W) \
+ ENTRY(R14W) \
+ ENTRY(R15W)
+
+#define EA_BASES_32BIT \
+ ENTRY(EAX) \
+ ENTRY(ECX) \
+ ENTRY(EDX) \
+ ENTRY(EBX) \
+ ENTRY(sib) \
+ ENTRY(EBP) \
+ ENTRY(ESI) \
+ ENTRY(EDI) \
+ ENTRY(R8D) \
+ ENTRY(R9D) \
+ ENTRY(R10D) \
+ ENTRY(R11D) \
+ ENTRY(R12D) \
+ ENTRY(R13D) \
+ ENTRY(R14D) \
+ ENTRY(R15D)
+
+#define REGS_32BIT \
+ ENTRY(EAX) \
+ ENTRY(ECX) \
+ ENTRY(EDX) \
+ ENTRY(EBX) \
+ ENTRY(ESP) \
+ ENTRY(EBP) \
+ ENTRY(ESI) \
+ ENTRY(EDI) \
+ ENTRY(R8D) \
+ ENTRY(R9D) \
+ ENTRY(R10D) \
+ ENTRY(R11D) \
+ ENTRY(R12D) \
+ ENTRY(R13D) \
+ ENTRY(R14D) \
+ ENTRY(R15D)
+
+#define EA_BASES_64BIT \
+ ENTRY(RAX) \
+ ENTRY(RCX) \
+ ENTRY(RDX) \
+ ENTRY(RBX) \
+ ENTRY(sib64) \
+ ENTRY(RBP) \
+ ENTRY(RSI) \
+ ENTRY(RDI) \
+ ENTRY(R8) \
+ ENTRY(R9) \
+ ENTRY(R10) \
+ ENTRY(R11) \
+ ENTRY(R12) \
+ ENTRY(R13) \
+ ENTRY(R14) \
+ ENTRY(R15)
+
+#define REGS_64BIT \
+ ENTRY(RAX) \
+ ENTRY(RCX) \
+ ENTRY(RDX) \
+ ENTRY(RBX) \
+ ENTRY(RSP) \
+ ENTRY(RBP) \
+ ENTRY(RSI) \
+ ENTRY(RDI) \
+ ENTRY(R8) \
+ ENTRY(R9) \
+ ENTRY(R10) \
+ ENTRY(R11) \
+ ENTRY(R12) \
+ ENTRY(R13) \
+ ENTRY(R14) \
+ ENTRY(R15)
+
+#define REGS_MMX \
+ ENTRY(MM0) \
+ ENTRY(MM1) \
+ ENTRY(MM2) \
+ ENTRY(MM3) \
+ ENTRY(MM4) \
+ ENTRY(MM5) \
+ ENTRY(MM6) \
+ ENTRY(MM7)
+
+#define REGS_XMM \
+ ENTRY(XMM0) \
+ ENTRY(XMM1) \
+ ENTRY(XMM2) \
+ ENTRY(XMM3) \
+ ENTRY(XMM4) \
+ ENTRY(XMM5) \
+ ENTRY(XMM6) \
+ ENTRY(XMM7) \
+ ENTRY(XMM8) \
+ ENTRY(XMM9) \
+ ENTRY(XMM10) \
+ ENTRY(XMM11) \
+ ENTRY(XMM12) \
+ ENTRY(XMM13) \
+ ENTRY(XMM14) \
+ ENTRY(XMM15) \
+ ENTRY(XMM16) \
+ ENTRY(XMM17) \
+ ENTRY(XMM18) \
+ ENTRY(XMM19) \
+ ENTRY(XMM20) \
+ ENTRY(XMM21) \
+ ENTRY(XMM22) \
+ ENTRY(XMM23) \
+ ENTRY(XMM24) \
+ ENTRY(XMM25) \
+ ENTRY(XMM26) \
+ ENTRY(XMM27) \
+ ENTRY(XMM28) \
+ ENTRY(XMM29) \
+ ENTRY(XMM30) \
+ ENTRY(XMM31)
+
+#define REGS_YMM \
+ ENTRY(YMM0) \
+ ENTRY(YMM1) \
+ ENTRY(YMM2) \
+ ENTRY(YMM3) \
+ ENTRY(YMM4) \
+ ENTRY(YMM5) \
+ ENTRY(YMM6) \
+ ENTRY(YMM7) \
+ ENTRY(YMM8) \
+ ENTRY(YMM9) \
+ ENTRY(YMM10) \
+ ENTRY(YMM11) \
+ ENTRY(YMM12) \
+ ENTRY(YMM13) \
+ ENTRY(YMM14) \
+ ENTRY(YMM15) \
+ ENTRY(YMM16) \
+ ENTRY(YMM17) \
+ ENTRY(YMM18) \
+ ENTRY(YMM19) \
+ ENTRY(YMM20) \
+ ENTRY(YMM21) \
+ ENTRY(YMM22) \
+ ENTRY(YMM23) \
+ ENTRY(YMM24) \
+ ENTRY(YMM25) \
+ ENTRY(YMM26) \
+ ENTRY(YMM27) \
+ ENTRY(YMM28) \
+ ENTRY(YMM29) \
+ ENTRY(YMM30) \
+ ENTRY(YMM31)
+
+#define REGS_ZMM \
+ ENTRY(ZMM0) \
+ ENTRY(ZMM1) \
+ ENTRY(ZMM2) \
+ ENTRY(ZMM3) \
+ ENTRY(ZMM4) \
+ ENTRY(ZMM5) \
+ ENTRY(ZMM6) \
+ ENTRY(ZMM7) \
+ ENTRY(ZMM8) \
+ ENTRY(ZMM9) \
+ ENTRY(ZMM10) \
+ ENTRY(ZMM11) \
+ ENTRY(ZMM12) \
+ ENTRY(ZMM13) \
+ ENTRY(ZMM14) \
+ ENTRY(ZMM15) \
+ ENTRY(ZMM16) \
+ ENTRY(ZMM17) \
+ ENTRY(ZMM18) \
+ ENTRY(ZMM19) \
+ ENTRY(ZMM20) \
+ ENTRY(ZMM21) \
+ ENTRY(ZMM22) \
+ ENTRY(ZMM23) \
+ ENTRY(ZMM24) \
+ ENTRY(ZMM25) \
+ ENTRY(ZMM26) \
+ ENTRY(ZMM27) \
+ ENTRY(ZMM28) \
+ ENTRY(ZMM29) \
+ ENTRY(ZMM30) \
+ ENTRY(ZMM31)
+
+#define REGS_MASKS \
+ ENTRY(K0) \
+ ENTRY(K1) \
+ ENTRY(K2) \
+ ENTRY(K3) \
+ ENTRY(K4) \
+ ENTRY(K5) \
+ ENTRY(K6) \
+ ENTRY(K7)
+
+#define REGS_SEGMENT \
+ ENTRY(ES) \
+ ENTRY(CS) \
+ ENTRY(SS) \
+ ENTRY(DS) \
+ ENTRY(FS) \
+ ENTRY(GS)
+
+#define REGS_DEBUG \
+ ENTRY(DR0) \
+ ENTRY(DR1) \
+ ENTRY(DR2) \
+ ENTRY(DR3) \
+ ENTRY(DR4) \
+ ENTRY(DR5) \
+ ENTRY(DR6) \
+ ENTRY(DR7) \
+ ENTRY(DR8) \
+ ENTRY(DR9) \
+ ENTRY(DR10) \
+ ENTRY(DR11) \
+ ENTRY(DR12) \
+ ENTRY(DR13) \
+ ENTRY(DR14) \
+ ENTRY(DR15)
+
+#define REGS_CONTROL \
+ ENTRY(CR0) \
+ ENTRY(CR1) \
+ ENTRY(CR2) \
+ ENTRY(CR3) \
+ ENTRY(CR4) \
+ ENTRY(CR5) \
+ ENTRY(CR6) \
+ ENTRY(CR7) \
+ ENTRY(CR8) \
+ ENTRY(CR9) \
+ ENTRY(CR10) \
+ ENTRY(CR11) \
+ ENTRY(CR12) \
+ ENTRY(CR13) \
+ ENTRY(CR14) \
+ ENTRY(CR15)
+
+#define ALL_EA_BASES \
+ EA_BASES_16BIT \
+ EA_BASES_32BIT \
+ EA_BASES_64BIT
+
+#define ALL_SIB_BASES \
+ REGS_32BIT \
+ REGS_64BIT
+
+#define ALL_REGS \
+ REGS_8BIT \
+ REGS_16BIT \
+ REGS_32BIT \
+ REGS_64BIT \
+ REGS_MMX \
+ REGS_XMM \
+ REGS_YMM \
+ REGS_ZMM \
+ REGS_MASKS \
+ REGS_SEGMENT \
+ REGS_DEBUG \
+ REGS_CONTROL \
+ ENTRY(RIP)
+
+/// \brief All possible values of the base field for effective-address
+/// computations, a.k.a. the Mod and R/M fields of the ModR/M byte.
+/// We distinguish between bases (EA_BASE_*) and registers that just happen
+/// to be referred to when Mod == 0b11 (EA_REG_*).
+enum EABase {
+ EA_BASE_NONE,
+#define ENTRY(x) EA_BASE_##x,
+ ALL_EA_BASES
+#undef ENTRY
+#define ENTRY(x) EA_REG_##x,
+ ALL_REGS
+#undef ENTRY
+ EA_max
+};
+
+/// \brief All possible values of the SIB index field.
+/// Borrows entries from ALL_EA_BASES with the special case that
+/// sib is synonymous with NONE.
+/// Vector SIB: the index can be XMM, YMM, or ZMM.
+enum SIBIndex {
+ SIB_INDEX_NONE,
+#define ENTRY(x) SIB_INDEX_##x,
+ ALL_EA_BASES
+ REGS_XMM
+ REGS_YMM
+ REGS_ZMM
+#undef ENTRY
+ SIB_INDEX_max
+};
+
+/// \brief All possible values of the SIB base field.
+enum SIBBase {
+ SIB_BASE_NONE,
+#define ENTRY(x) SIB_BASE_##x,
+ ALL_SIB_BASES
+#undef ENTRY
+ SIB_BASE_max
+};
+
+/// \brief Possible displacement types for effective-address computations.
+typedef enum {
+ EA_DISP_NONE,
+ EA_DISP_8,
+ EA_DISP_16,
+ EA_DISP_32
+} EADisplacement;
+
+/// \brief All possible values of the reg field in the ModR/M byte.
+enum Reg {
+#define ENTRY(x) MODRM_REG_##x,
+ ALL_REGS
+#undef ENTRY
+ MODRM_REG_max
+};
+
+/// \brief All possible segment overrides.
+enum SegmentOverride {
+ SEG_OVERRIDE_NONE,
+ SEG_OVERRIDE_CS,
+ SEG_OVERRIDE_SS,
+ SEG_OVERRIDE_DS,
+ SEG_OVERRIDE_ES,
+ SEG_OVERRIDE_FS,
+ SEG_OVERRIDE_GS,
+ SEG_OVERRIDE_max
+};
+
+/// \brief Possible values for the VEX.m-mmmm field
+enum VEXLeadingOpcodeByte {
+ VEX_LOB_0F = 0x1,
+ VEX_LOB_0F38 = 0x2,
+ VEX_LOB_0F3A = 0x3
+};
+
+enum XOPMapSelect {
+ XOP_MAP_SELECT_8 = 0x8,
+ XOP_MAP_SELECT_9 = 0x9,
+ XOP_MAP_SELECT_A = 0xA
+};
+
+/// \brief Possible values for the VEX.pp/EVEX.pp field
+enum VEXPrefixCode {
+ VEX_PREFIX_NONE = 0x0,
+ VEX_PREFIX_66 = 0x1,
+ VEX_PREFIX_F3 = 0x2,
+ VEX_PREFIX_F2 = 0x3
+};
+
+enum VectorExtensionType {
+ TYPE_NO_VEX_XOP = 0x0,
+ TYPE_VEX_2B = 0x1,
+ TYPE_VEX_3B = 0x2,
+ TYPE_EVEX = 0x3,
+ TYPE_XOP = 0x4
+};
+
+/// \brief Type for the byte reader that the consumer must provide to
+/// the decoder. Reads a single byte from the instruction's address space.
+/// \param arg A baton that the consumer can associate with any internal
+/// state that it needs.
+/// \param byte A pointer to a single byte in memory that should be set to
+/// contain the value at address.
+/// \param address The address in the instruction's address space that should
+/// be read from.
+/// \return -1 if the byte cannot be read for any reason; 0 otherwise.
+typedef int (*byteReader_t)(const void *arg, uint8_t *byte, uint64_t address);
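+// A minimal sketch of a conforming reader that serves bytes from an in-memory
+// buffer (the names here are illustrative, not part of the API; it assumes
+// the buffer is mapped at address 0):
+//   static int bufferReader(const void *arg, uint8_t *byte, uint64_t address) {
+//     const auto *bytes = static_cast<const ArrayRef<uint8_t> *>(arg);
+//     if (address >= bytes->size())
+//       return -1;               // out of range; decoding fails
+//     *byte = (*bytes)[address];
+//     return 0;
+//   }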
+
+/// \brief Type for the logging function that the consumer can provide to
+/// get debugging output from the decoder.
+/// \param arg A baton that the consumer can associate with any internal
+/// state that it needs.
+/// \param log A string that contains the message. Will be reused after
+/// the logger returns.
+typedef void (*dlog_t)(void *arg, const char *log);
+
+/// The specification for how to extract and interpret a full instruction and
+/// its operands.
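+/// The operands field indexes the x86OperandSets table; readOperands()
+/// iterates over x86OperandSets[spec->operands].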
+struct InstructionSpecifier {
+ uint16_t operands;
+};
+
+/// The x86 internal instruction, which is produced by the decoder.
+struct InternalInstruction {
+ // Reader interface (C)
+ byteReader_t reader;
+ // Opaque value passed to the reader
+ const void* readerArg;
+ // The address of the next byte to read via the reader
+ uint64_t readerCursor;
+
+ // Logger interface (C)
+ dlog_t dlog;
+ // Opaque value passed to the logger
+ void* dlogArg;
+
+ // General instruction information
+
+ // The mode to disassemble for (64-bit, protected, real)
+ DisassemblerMode mode;
+ // The start of the instruction, usable with the reader
+ uint64_t startLocation;
+ // The length of the instruction, in bytes
+ size_t length;
+
+ // Prefix state
+
+ // 1 if the prefix byte corresponding to the entry is present; 0 if not
+ uint8_t prefixPresent[0x100];
+ // contains the location (for use with the reader) of the prefix byte
+ uint64_t prefixLocations[0x100];
+ // The value of the vector extension prefix (EVEX/VEX/XOP), if present
+ uint8_t vectorExtensionPrefix[4];
+ // The type of the vector extension prefix
+ VectorExtensionType vectorExtensionType;
+ // The value of the REX prefix, if present
+ uint8_t rexPrefix;
+ // The location where a mandatory prefix would have to be (i.e., right before
+ // the opcode, or right before the REX prefix if one is present).
+ uint64_t necessaryPrefixLocation;
+ // The segment override type
+ SegmentOverride segmentOverride;
+ // true if the prefix byte (0xf2 or 0xf3) is xacquire or xrelease
+ bool xAcquireRelease;
+
+ // Sizes of various critical pieces of data, in bytes
+ uint8_t registerSize;
+ uint8_t addressSize;
+ uint8_t displacementSize;
+ uint8_t immediateSize;
+
+ // Offsets from the start of the instruction to the pieces of data, which
+ // are needed to find relocation entries for adding symbolic operands.
+ uint8_t displacementOffset;
+ uint8_t immediateOffset;
+
+ // opcode state
+
+ // The last byte of the opcode, not counting any ModR/M extension
+ uint8_t opcode;
+
+ // decode state
+
+ // The type of opcode, used for indexing into the array of decode tables
+ OpcodeType opcodeType;
+ // The instruction ID, extracted from the decode table
+ uint16_t instructionID;
+ // The specifier for the instruction, from the instruction info table
+ const InstructionSpecifier *spec;
+
+ // state for additional bytes, consumed during operand decode. Pattern:
+ // consumed___ indicates that the byte was already consumed and does not
+ // need to be consumed again.
+
+ // The VEX.vvvv field, which contains a third register operand for some AVX
+ // instructions.
+ Reg vvvv;
+
+ // The writemask for AVX-512 instructions, which is contained in EVEX.aaa
+ Reg writemask;
+
+ // The ModR/M byte, which contains most register operands and some portion of
+ // all memory operands.
+ bool consumedModRM;
+ uint8_t modRM;
+
+ // The SIB byte, used for more complex 32- or 64-bit memory operands
+ bool consumedSIB;
+ uint8_t sib;
+
+ // The displacement, used for memory operands
+ bool consumedDisplacement;
+ int32_t displacement;
+
+ // Immediates. There can be two in some cases
+ uint8_t numImmediatesConsumed;
+ uint8_t numImmediatesTranslated;
+ uint64_t immediates[2];
+
+ // A register or immediate operand encoded into the opcode
+ Reg opcodeRegister;
+
+ // Portions of the ModR/M byte
+
+ // These fields determine the allowable values for the ModR/M fields, which
+ // depend on operand and address widths.
+ EABase eaBaseBase;
+ EABase eaRegBase;
+ Reg regBase;
+
+ // The Mod and R/M fields can encode a base for an effective address, or a
+ // register. These are separated into two fields here.
+ EABase eaBase;
+ EADisplacement eaDisplacement;
+ // The reg field always encodes a register
+ Reg reg;
+
+ // SIB state
+ SIBIndex sibIndex;
+ uint8_t sibScale;
+ SIBBase sibBase;
+
+ ArrayRef<OperandSpecifier> operands;
+};
+
+/// \brief Decode one instruction and store the decoding results in
+/// a buffer provided by the consumer.
+/// \param insn The buffer to store the instruction in. Allocated by the
+/// consumer.
+/// \param reader The byteReader_t for the bytes to be read.
+/// \param readerArg An argument to pass to the reader for storing context
+/// specific to the consumer. May be NULL.
+/// \param logger The dlog_t to be used in printing status messages from the
+/// disassembler. May be NULL.
+/// \param loggerArg An argument to pass to the logger for storing context
+/// specific to the logger. May be NULL.
+/// \param startLoc The address (in the reader's address space) of the first
+/// byte in the instruction.
+/// \param mode The mode (16-bit, 32-bit, 64-bit) to decode in.
+/// \return Nonzero if there was an error during decode, 0 otherwise.
+int decodeInstruction(InternalInstruction *insn,
+ byteReader_t reader,
+ const void *readerArg,
+ dlog_t logger,
+ void *loggerArg,
+ const void *miiArg,
+ uint64_t startLoc,
+ DisassemblerMode mode);
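+// Typical usage, sketched (bufferReader as in the byteReader_t example above;
+// miiArg is assumed to be the target's MCInstrInfo):
+//   InternalInstruction insn;
+//   if (decodeInstruction(&insn, bufferReader, &bytes, nullptr, nullptr,
+//                         mii, 0x1000, MODE_64BIT) == 0) {
+//     // insn.instructionID, insn.length, and insn.operands are now valid.
+//   }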
+
+/// \brief Print a message to debugs()
+/// \param file The name of the file printing the debug message.
+/// \param line The line number that printed the debug message.
+/// \param s The message to print.
+void Debug(const char *file, unsigned line, const char *s);
+
+const char *GetInstrName(unsigned Opcode, const void *mii);
+
+} // namespace X86Disassembler
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h
new file mode 100644
index 0000000..301db72
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h
@@ -0,0 +1,503 @@
+//===-- X86DisassemblerDecoderCommon.h - Disassembler decoder ---*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is part of the X86 Disassembler.
+// It contains common definitions used by both the disassembler and the table
+// generator.
+// Documentation for the disassembler can be found in X86Disassembler.h.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_DISASSEMBLER_X86DISASSEMBLERDECODERCOMMON_H
+#define LLVM_LIB_TARGET_X86_DISASSEMBLER_X86DISASSEMBLERDECODERCOMMON_H
+
+#include "llvm/Support/DataTypes.h"
+
+namespace llvm {
+namespace X86Disassembler {
+
+#define INSTRUCTIONS_SYM x86DisassemblerInstrSpecifiers
+#define CONTEXTS_SYM x86DisassemblerContexts
+#define ONEBYTE_SYM x86DisassemblerOneByteOpcodes
+#define TWOBYTE_SYM x86DisassemblerTwoByteOpcodes
+#define THREEBYTE38_SYM x86DisassemblerThreeByte38Opcodes
+#define THREEBYTE3A_SYM x86DisassemblerThreeByte3AOpcodes
+#define XOP8_MAP_SYM x86DisassemblerXOP8Opcodes
+#define XOP9_MAP_SYM x86DisassemblerXOP9Opcodes
+#define XOPA_MAP_SYM x86DisassemblerXOPAOpcodes
+
+#define INSTRUCTIONS_STR "x86DisassemblerInstrSpecifiers"
+#define CONTEXTS_STR "x86DisassemblerContexts"
+#define ONEBYTE_STR "x86DisassemblerOneByteOpcodes"
+#define TWOBYTE_STR "x86DisassemblerTwoByteOpcodes"
+#define THREEBYTE38_STR "x86DisassemblerThreeByte38Opcodes"
+#define THREEBYTE3A_STR "x86DisassemblerThreeByte3AOpcodes"
+#define XOP8_MAP_STR "x86DisassemblerXOP8Opcodes"
+#define XOP9_MAP_STR "x86DisassemblerXOP9Opcodes"
+#define XOPA_MAP_STR "x86DisassemblerXOPAOpcodes"
+
+// Attributes of an instruction that must be known before the opcode can be
+// processed correctly. Most of these indicate the presence of particular
+// prefixes, but ATTR_64BIT is simply an attribute of the decoding context.
+#define ATTRIBUTE_BITS \
+ ENUM_ENTRY(ATTR_NONE, 0x00) \
+ ENUM_ENTRY(ATTR_64BIT, (0x1 << 0)) \
+ ENUM_ENTRY(ATTR_XS, (0x1 << 1)) \
+ ENUM_ENTRY(ATTR_XD, (0x1 << 2)) \
+ ENUM_ENTRY(ATTR_REXW, (0x1 << 3)) \
+ ENUM_ENTRY(ATTR_OPSIZE, (0x1 << 4)) \
+ ENUM_ENTRY(ATTR_ADSIZE, (0x1 << 5)) \
+ ENUM_ENTRY(ATTR_VEX, (0x1 << 6)) \
+ ENUM_ENTRY(ATTR_VEXL, (0x1 << 7)) \
+ ENUM_ENTRY(ATTR_EVEX, (0x1 << 8)) \
+ ENUM_ENTRY(ATTR_EVEXL, (0x1 << 9)) \
+ ENUM_ENTRY(ATTR_EVEXL2, (0x1 << 10)) \
+ ENUM_ENTRY(ATTR_EVEXK, (0x1 << 11)) \
+ ENUM_ENTRY(ATTR_EVEXKZ, (0x1 << 12)) \
+ ENUM_ENTRY(ATTR_EVEXB, (0x1 << 13))
+
+#define ENUM_ENTRY(n, v) n = v,
+enum attributeBits {
+ ATTRIBUTE_BITS
+ ATTR_max
+};
+#undef ENUM_ENTRY
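+// E.g., an EVEX-encoded instruction with a 0x66 prefix and EVEX.L' set is
+// looked up with attrMask = ATTR_EVEX | ATTR_OPSIZE | ATTR_EVEXL2 = 0x510
+// (see getID() in X86DisassemblerDecoder.cpp).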
+
+// Combinations of the above attributes that are relevant to instruction
+// decode. Although other combinations are possible, they can be reduced to
+// these without affecting the ultimately decoded instruction.
+
+// Class name Rank Rationale for rank assignment
+#define INSTRUCTION_CONTEXTS \
+ ENUM_ENTRY(IC, 0, "says nothing about the instruction") \
+ ENUM_ENTRY(IC_64BIT, 1, "says the instruction applies in " \
+ "64-bit mode but no more") \
+ ENUM_ENTRY(IC_OPSIZE, 3, "requires an OPSIZE prefix, so " \
+ "operands change width") \
+ ENUM_ENTRY(IC_ADSIZE, 3, "requires an ADSIZE prefix, so " \
+ "operands change width") \
+ ENUM_ENTRY(IC_OPSIZE_ADSIZE, 4, "requires ADSIZE and OPSIZE prefixes") \
+ ENUM_ENTRY(IC_XD, 2, "may say something about the opcode " \
+ "but not the operands") \
+ ENUM_ENTRY(IC_XS, 2, "may say something about the opcode " \
+ "but not the operands") \
+ ENUM_ENTRY(IC_XD_OPSIZE, 3, "requires an OPSIZE prefix, so " \
+ "operands change width") \
+ ENUM_ENTRY(IC_XS_OPSIZE, 3, "requires an OPSIZE prefix, so " \
+ "operands change width") \
+ ENUM_ENTRY(IC_64BIT_REXW, 5, "requires a REX.W prefix, so operands "\
+ "change width; overrides IC_OPSIZE") \
+ ENUM_ENTRY(IC_64BIT_REXW_ADSIZE, 6, "requires a REX.W prefix and 0x67 " \
+ "prefix") \
+ ENUM_ENTRY(IC_64BIT_OPSIZE, 3, "Just as meaningful as IC_OPSIZE") \
+ ENUM_ENTRY(IC_64BIT_ADSIZE, 3, "Just as meaningful as IC_ADSIZE") \
+ ENUM_ENTRY(IC_64BIT_OPSIZE_ADSIZE, 4, "Just as meaningful as IC_OPSIZE/" \
+ "IC_ADSIZE") \
+ ENUM_ENTRY(IC_64BIT_XD, 6, "XD instructions are SSE; REX.W is " \
+ "secondary") \
+ ENUM_ENTRY(IC_64BIT_XS, 6, "Just as meaningful as IC_64BIT_XD") \
+ ENUM_ENTRY(IC_64BIT_XD_OPSIZE, 3, "Just as meaningful as IC_XD_OPSIZE") \
+ ENUM_ENTRY(IC_64BIT_XS_OPSIZE, 3, "Just as meaningful as IC_XS_OPSIZE") \
+ ENUM_ENTRY(IC_64BIT_REXW_XS, 7, "OPSIZE could mean a different " \
+ "opcode") \
+ ENUM_ENTRY(IC_64BIT_REXW_XD, 7, "Just as meaningful as " \
+ "IC_64BIT_REXW_XS") \
+ ENUM_ENTRY(IC_64BIT_REXW_OPSIZE, 8, "The Dynamic Duo! Prefer over all " \
+ "else because this changes most " \
+ "operands' meaning") \
+ ENUM_ENTRY(IC_VEX, 1, "requires a VEX prefix") \
+ ENUM_ENTRY(IC_VEX_XS, 2, "requires VEX and the XS prefix") \
+ ENUM_ENTRY(IC_VEX_XD, 2, "requires VEX and the XD prefix") \
+ ENUM_ENTRY(IC_VEX_OPSIZE, 2, "requires VEX and the OpSize prefix") \
+ ENUM_ENTRY(IC_VEX_W, 3, "requires VEX and the W prefix") \
+ ENUM_ENTRY(IC_VEX_W_XS, 4, "requires VEX, W, and XS prefix") \
+ ENUM_ENTRY(IC_VEX_W_XD, 4, "requires VEX, W, and XD prefix") \
+ ENUM_ENTRY(IC_VEX_W_OPSIZE, 4, "requires VEX, W, and OpSize") \
+ ENUM_ENTRY(IC_VEX_L, 3, "requires VEX and the L prefix") \
+ ENUM_ENTRY(IC_VEX_L_XS, 4, "requires VEX and the L and XS prefix")\
+ ENUM_ENTRY(IC_VEX_L_XD, 4, "requires VEX and the L and XD prefix")\
+ ENUM_ENTRY(IC_VEX_L_OPSIZE, 4, "requires VEX, L, and OpSize") \
+ ENUM_ENTRY(IC_VEX_L_W, 4, "requires VEX, L and W") \
+ ENUM_ENTRY(IC_VEX_L_W_XS, 5, "requires VEX, L, W and XS prefix") \
+ ENUM_ENTRY(IC_VEX_L_W_XD, 5, "requires VEX, L, W and XD prefix") \
+ ENUM_ENTRY(IC_VEX_L_W_OPSIZE, 5, "requires VEX, L, W and OpSize") \
+ ENUM_ENTRY(IC_EVEX, 1, "requires an EVEX prefix") \
+ ENUM_ENTRY(IC_EVEX_XS, 2, "requires EVEX and the XS prefix") \
+ ENUM_ENTRY(IC_EVEX_XD, 2, "requires EVEX and the XD prefix") \
+ ENUM_ENTRY(IC_EVEX_OPSIZE, 2, "requires EVEX and the OpSize prefix") \
+ ENUM_ENTRY(IC_EVEX_W, 3, "requires EVEX and the W prefix") \
+ ENUM_ENTRY(IC_EVEX_W_XS, 4, "requires EVEX, W, and XS prefix") \
+ ENUM_ENTRY(IC_EVEX_W_XD, 4, "requires EVEX, W, and XD prefix") \
+ ENUM_ENTRY(IC_EVEX_W_OPSIZE, 4, "requires EVEX, W, and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L, 3, "requires EVEX and the L prefix") \
+ ENUM_ENTRY(IC_EVEX_L_XS, 4, "requires EVEX and the L and XS prefix")\
+ ENUM_ENTRY(IC_EVEX_L_XD, 4, "requires EVEX and the L and XD prefix")\
+ ENUM_ENTRY(IC_EVEX_L_OPSIZE, 4, "requires EVEX, L, and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L_W, 3, "requires EVEX, L and W") \
+ ENUM_ENTRY(IC_EVEX_L_W_XS, 4, "requires EVEX, L, W and XS prefix") \
+ ENUM_ENTRY(IC_EVEX_L_W_XD, 4, "requires EVEX, L, W and XD prefix") \
+ ENUM_ENTRY(IC_EVEX_L_W_OPSIZE, 4, "requires EVEX, L, W and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L2, 3, "requires EVEX and the L2 prefix") \
+ ENUM_ENTRY(IC_EVEX_L2_XS, 4, "requires EVEX and the L2 and XS prefix")\
+ ENUM_ENTRY(IC_EVEX_L2_XD, 4, "requires EVEX and the L2 and XD prefix")\
+ ENUM_ENTRY(IC_EVEX_L2_OPSIZE, 4, "requires EVEX, L2, and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L2_W, 3, "requires EVEX, L2 and W") \
+ ENUM_ENTRY(IC_EVEX_L2_W_XS, 4, "requires EVEX, L2, W and XS prefix") \
+ ENUM_ENTRY(IC_EVEX_L2_W_XD, 4, "requires EVEX, L2, W and XD prefix") \
+ ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE, 4, "requires EVEX, L2, W and OpSize") \
+ ENUM_ENTRY(IC_EVEX_K, 1, "requires an EVEX_K prefix") \
+ ENUM_ENTRY(IC_EVEX_XS_K, 2, "requires EVEX_K and the XS prefix") \
+ ENUM_ENTRY(IC_EVEX_XD_K, 2, "requires EVEX_K and the XD prefix") \
+ ENUM_ENTRY(IC_EVEX_OPSIZE_K, 2, "requires EVEX_K and the OpSize prefix") \
+ ENUM_ENTRY(IC_EVEX_W_K, 3, "requires EVEX_K and the W prefix") \
+ ENUM_ENTRY(IC_EVEX_W_XS_K, 4, "requires EVEX_K, W, and XS prefix") \
+ ENUM_ENTRY(IC_EVEX_W_XD_K, 4, "requires EVEX_K, W, and XD prefix") \
+ ENUM_ENTRY(IC_EVEX_W_OPSIZE_K, 4, "requires EVEX_K, W, and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L_K, 3, "requires EVEX_K and the L prefix") \
+ ENUM_ENTRY(IC_EVEX_L_XS_K, 4, "requires EVEX_K and the L and XS prefix")\
+ ENUM_ENTRY(IC_EVEX_L_XD_K, 4, "requires EVEX_K and the L and XD prefix")\
+ ENUM_ENTRY(IC_EVEX_L_OPSIZE_K, 4, "requires EVEX_K, L, and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L_W_K, 3, "requires EVEX_K, L and W") \
+ ENUM_ENTRY(IC_EVEX_L_W_XS_K, 4, "requires EVEX_K, L, W and XS prefix") \
+ ENUM_ENTRY(IC_EVEX_L_W_XD_K, 4, "requires EVEX_K, L, W and XD prefix") \
+ ENUM_ENTRY(IC_EVEX_L_W_OPSIZE_K, 4, "requires EVEX_K, L, W and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L2_K, 3, "requires EVEX_K and the L2 prefix") \
+ ENUM_ENTRY(IC_EVEX_L2_XS_K, 4, "requires EVEX_K and the L2 and XS prefix")\
+ ENUM_ENTRY(IC_EVEX_L2_XD_K, 4, "requires EVEX_K and the L2 and XD prefix")\
+ ENUM_ENTRY(IC_EVEX_L2_OPSIZE_K, 4, "requires EVEX_K, L2, and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L2_W_K, 3, "requires EVEX_K, L2 and W") \
+ ENUM_ENTRY(IC_EVEX_L2_W_XS_K, 4, "requires EVEX_K, L2, W and XS prefix") \
+ ENUM_ENTRY(IC_EVEX_L2_W_XD_K, 4, "requires EVEX_K, L2, W and XD prefix") \
+ ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE_K, 4, "requires EVEX_K, L2, W and OpSize") \
+ ENUM_ENTRY(IC_EVEX_B, 1, "requires an EVEX_B prefix") \
+ ENUM_ENTRY(IC_EVEX_XS_B, 2, "requires EVEX_B and the XS prefix") \
+ ENUM_ENTRY(IC_EVEX_XD_B, 2, "requires EVEX_B and the XD prefix") \
+ ENUM_ENTRY(IC_EVEX_OPSIZE_B, 2, "requires EVEX_B and the OpSize prefix") \
+ ENUM_ENTRY(IC_EVEX_W_B, 3, "requires EVEX_B and the W prefix") \
+ ENUM_ENTRY(IC_EVEX_W_XS_B, 4, "requires EVEX_B, W, and XS prefix") \
+ ENUM_ENTRY(IC_EVEX_W_XD_B, 4, "requires EVEX_B, W, and XD prefix") \
+ ENUM_ENTRY(IC_EVEX_W_OPSIZE_B, 4, "requires EVEX_B, W, and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L_B, 3, "requires EVEX_B and the L prefix") \
+ ENUM_ENTRY(IC_EVEX_L_XS_B, 4, "requires EVEX_B and the L and XS prefix")\
+ ENUM_ENTRY(IC_EVEX_L_XD_B, 4, "requires EVEX_B and the L and XD prefix")\
+ ENUM_ENTRY(IC_EVEX_L_OPSIZE_B, 4, "requires EVEX_B, L, and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L_W_B, 3, "requires EVEX_B, L and W") \
+ ENUM_ENTRY(IC_EVEX_L_W_XS_B, 4, "requires EVEX_B, L, W and XS prefix") \
+ ENUM_ENTRY(IC_EVEX_L_W_XD_B, 4, "requires EVEX_B, L, W and XD prefix") \
+ ENUM_ENTRY(IC_EVEX_L_W_OPSIZE_B, 4, "requires EVEX_B, L, W and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L2_B, 3, "requires EVEX_B and the L2 prefix") \
+ ENUM_ENTRY(IC_EVEX_L2_XS_B, 4, "requires EVEX_B and the L2 and XS prefix")\
+ ENUM_ENTRY(IC_EVEX_L2_XD_B, 4, "requires EVEX_B and the L2 and XD prefix")\
+ ENUM_ENTRY(IC_EVEX_L2_OPSIZE_B, 4, "requires EVEX_B, L2, and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L2_W_B, 3, "requires EVEX_B, L2 and W") \
+ ENUM_ENTRY(IC_EVEX_L2_W_XS_B, 4, "requires EVEX_B, L2, W and XS prefix") \
+ ENUM_ENTRY(IC_EVEX_L2_W_XD_B, 4, "requires EVEX_B, L2, W and XD prefix") \
+ ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE_B, 4, "requires EVEX_B, L2, W and OpSize") \
+ ENUM_ENTRY(IC_EVEX_K_B, 1, "requires EVEX_B and EVEX_K prefix") \
+ ENUM_ENTRY(IC_EVEX_XS_K_B, 2, "requires EVEX_B, EVEX_K and the XS prefix") \
+ ENUM_ENTRY(IC_EVEX_XD_K_B, 2, "requires EVEX_B, EVEX_K and the XD prefix") \
+ ENUM_ENTRY(IC_EVEX_OPSIZE_K_B, 2, "requires EVEX_B, EVEX_K and the OpSize prefix") \
+ ENUM_ENTRY(IC_EVEX_W_K_B, 3, "requires EVEX_B, EVEX_K and the W prefix") \
+ ENUM_ENTRY(IC_EVEX_W_XS_K_B, 4, "requires EVEX_B, EVEX_K, W, and XS prefix") \
+ ENUM_ENTRY(IC_EVEX_W_XD_K_B, 4, "requires EVEX_B, EVEX_K, W, and XD prefix") \
+ ENUM_ENTRY(IC_EVEX_W_OPSIZE_K_B, 4, "requires EVEX_B, EVEX_K, W, and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L_K_B, 3, "requires EVEX_B, EVEX_K and the L prefix") \
+ ENUM_ENTRY(IC_EVEX_L_XS_K_B, 4, "requires EVEX_B, EVEX_K and the L and XS prefix")\
+ ENUM_ENTRY(IC_EVEX_L_XD_K_B, 4, "requires EVEX_B, EVEX_K and the L and XD prefix")\
+ ENUM_ENTRY(IC_EVEX_L_OPSIZE_K_B, 4, "requires EVEX_B, EVEX_K, L, and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L_W_K_B, 3, "requires EVEX_B, EVEX_K, L and W") \
+ ENUM_ENTRY(IC_EVEX_L_W_XS_K_B, 4, "requires EVEX_B, EVEX_K, L, W and XS prefix") \
+ ENUM_ENTRY(IC_EVEX_L_W_XD_K_B, 4, "requires EVEX_B, EVEX_K, L, W and XD prefix") \
+ ENUM_ENTRY(IC_EVEX_L_W_OPSIZE_K_B,4, "requires EVEX_B, EVEX_K, L, W and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L2_K_B, 3, "requires EVEX_B, EVEX_K and the L2 prefix") \
+ ENUM_ENTRY(IC_EVEX_L2_XS_K_B, 4, "requires EVEX_B, EVEX_K and the L2 and XS prefix")\
+ ENUM_ENTRY(IC_EVEX_L2_XD_K_B, 4, "requires EVEX_B, EVEX_K and the L2 and XD prefix")\
+ ENUM_ENTRY(IC_EVEX_L2_OPSIZE_K_B, 4, "requires EVEX_B, EVEX_K, L2, and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L2_W_K_B, 3, "requires EVEX_B, EVEX_K, L2 and W") \
+ ENUM_ENTRY(IC_EVEX_L2_W_XS_K_B, 4, "requires EVEX_B, EVEX_K, L2, W and XS prefix") \
+ ENUM_ENTRY(IC_EVEX_L2_W_XD_K_B, 4, "requires EVEX_B, EVEX_K, L2, W and XD prefix") \
+ ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE_K_B,4, "requires EVEX_B, EVEX_K, L2, W and OpSize") \
+ ENUM_ENTRY(IC_EVEX_KZ_B, 1, "requires EVEX_B and EVEX_KZ prefix") \
+ ENUM_ENTRY(IC_EVEX_XS_KZ_B, 2, "requires EVEX_B, EVEX_KZ and the XS prefix") \
+ ENUM_ENTRY(IC_EVEX_XD_KZ_B, 2, "requires EVEX_B, EVEX_KZ and the XD prefix") \
+ ENUM_ENTRY(IC_EVEX_OPSIZE_KZ_B, 2, "requires EVEX_B, EVEX_KZ and the OpSize prefix") \
+ ENUM_ENTRY(IC_EVEX_W_KZ_B, 3, "requires EVEX_B, EVEX_KZ and the W prefix") \
+ ENUM_ENTRY(IC_EVEX_W_XS_KZ_B, 4, "requires EVEX_B, EVEX_KZ, W, and XS prefix") \
+ ENUM_ENTRY(IC_EVEX_W_XD_KZ_B, 4, "requires EVEX_B, EVEX_KZ, W, and XD prefix") \
+ ENUM_ENTRY(IC_EVEX_W_OPSIZE_KZ_B, 4, "requires EVEX_B, EVEX_KZ, W, and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L_KZ_B, 3, "requires EVEX_B, EVEX_KZ and the L prefix") \
+ ENUM_ENTRY(IC_EVEX_L_XS_KZ_B, 4, "requires EVEX_B, EVEX_KZ and the L and XS prefix")\
+ ENUM_ENTRY(IC_EVEX_L_XD_KZ_B, 4, "requires EVEX_B, EVEX_KZ and the L and XD prefix")\
+ ENUM_ENTRY(IC_EVEX_L_OPSIZE_KZ_B, 4, "requires EVEX_B, EVEX_KZ, L, and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L_W_KZ_B, 3, "requires EVEX_B, EVEX_KZ, L and W") \
+ ENUM_ENTRY(IC_EVEX_L_W_XS_KZ_B, 4, "requires EVEX_B, EVEX_KZ, L, W and XS prefix") \
+ ENUM_ENTRY(IC_EVEX_L_W_XD_KZ_B, 4, "requires EVEX_B, EVEX_KZ, L, W and XD prefix") \
+ ENUM_ENTRY(IC_EVEX_L_W_OPSIZE_KZ_B, 4, "requires EVEX_B, EVEX_KZ, L, W and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L2_KZ_B, 3, "requires EVEX_B, EVEX_KZ and the L2 prefix") \
+ ENUM_ENTRY(IC_EVEX_L2_XS_KZ_B, 4, "requires EVEX_B, EVEX_KZ and the L2 and XS prefix")\
+ ENUM_ENTRY(IC_EVEX_L2_XD_KZ_B, 4, "requires EVEX_B, EVEX_KZ and the L2 and XD prefix")\
+ ENUM_ENTRY(IC_EVEX_L2_OPSIZE_KZ_B, 4, "requires EVEX_B, EVEX_KZ, L2, and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L2_W_KZ_B, 3, "requires EVEX_B, EVEX_KZ, L2 and W") \
+ ENUM_ENTRY(IC_EVEX_L2_W_XS_KZ_B, 4, "requires EVEX_B, EVEX_KZ, L2, W and XS prefix") \
+ ENUM_ENTRY(IC_EVEX_L2_W_XD_KZ_B, 4, "requires EVEX_B, EVEX_KZ, L2, W and XD prefix") \
+ ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE_KZ_B, 4, "requires EVEX_B, EVEX_KZ, L2, W and OpSize") \
+ ENUM_ENTRY(IC_EVEX_KZ, 1, "requires an EVEX_KZ prefix") \
+ ENUM_ENTRY(IC_EVEX_XS_KZ, 2, "requires EVEX_KZ and the XS prefix") \
+ ENUM_ENTRY(IC_EVEX_XD_KZ, 2, "requires EVEX_KZ and the XD prefix") \
+ ENUM_ENTRY(IC_EVEX_OPSIZE_KZ, 2, "requires EVEX_KZ and the OpSize prefix") \
+ ENUM_ENTRY(IC_EVEX_W_KZ, 3, "requires EVEX_KZ and the W prefix") \
+ ENUM_ENTRY(IC_EVEX_W_XS_KZ, 4, "requires EVEX_KZ, W, and XS prefix") \
+ ENUM_ENTRY(IC_EVEX_W_XD_KZ, 4, "requires EVEX_KZ, W, and XD prefix") \
+ ENUM_ENTRY(IC_EVEX_W_OPSIZE_KZ, 4, "requires EVEX_KZ, W, and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L_KZ, 3, "requires EVEX_KZ and the L prefix") \
+ ENUM_ENTRY(IC_EVEX_L_XS_KZ, 4, "requires EVEX_KZ and the L and XS prefix")\
+ ENUM_ENTRY(IC_EVEX_L_XD_KZ, 4, "requires EVEX_KZ and the L and XD prefix")\
+ ENUM_ENTRY(IC_EVEX_L_OPSIZE_KZ, 4, "requires EVEX_KZ, L, and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L_W_KZ, 3, "requires EVEX_KZ, L and W") \
+ ENUM_ENTRY(IC_EVEX_L_W_XS_KZ, 4, "requires EVEX_KZ, L, W and XS prefix") \
+ ENUM_ENTRY(IC_EVEX_L_W_XD_KZ, 4, "requires EVEX_KZ, L, W and XD prefix") \
+ ENUM_ENTRY(IC_EVEX_L_W_OPSIZE_KZ, 4, "requires EVEX_KZ, L, W and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L2_KZ, 3, "requires EVEX_KZ and the L2 prefix") \
+ ENUM_ENTRY(IC_EVEX_L2_XS_KZ, 4, "requires EVEX_KZ and the L2 and XS prefix")\
+ ENUM_ENTRY(IC_EVEX_L2_XD_KZ, 4, "requires EVEX_KZ and the L2 and XD prefix")\
+ ENUM_ENTRY(IC_EVEX_L2_OPSIZE_KZ, 4, "requires EVEX_KZ, L2, and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L2_W_KZ, 3, "requires EVEX_KZ, L2 and W") \
+ ENUM_ENTRY(IC_EVEX_L2_W_XS_KZ, 4, "requires EVEX_KZ, L2, W and XS prefix") \
+ ENUM_ENTRY(IC_EVEX_L2_W_XD_KZ, 4, "requires EVEX_KZ, L2, W and XD prefix") \
+ ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE_KZ, 4, "requires EVEX_KZ, L2, W and OpSize")
+
+#define ENUM_ENTRY(n, r, d) n,
+enum InstructionContext {
+ INSTRUCTION_CONTEXTS
+ IC_max
+};
+#undef ENUM_ENTRY
+
+// Opcode types, which determine which decode table to use, both in the Intel
+// manual and in the decoder.
+enum OpcodeType {
+ ONEBYTE = 0,
+ TWOBYTE = 1,
+ THREEBYTE_38 = 2,
+ THREEBYTE_3A = 3,
+ XOP8_MAP = 4,
+ XOP9_MAP = 5,
+ XOPA_MAP = 6
+};
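+// (As a point of reference, and assuming the usual x86 encoding conventions:
+// ONEBYTE/TWOBYTE/THREEBYTE_38/THREEBYTE_3A correspond to no escape, the 0F
+// escape, and the 0F 38 / 0F 3A escapes, while the XOP maps are selected by
+// XOP.mmmmm values 8, 9, and 0xA.)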
+
+// The following structs are used for the hierarchical decode table. After
+// determining the instruction's class (i.e., which IC_* constant applies to
+// it), the decoder reads the opcode. Some instructions require specific
+// values of the ModR/M byte, so the ModR/M byte indexes into the final table.
+//
+// If a ModR/M byte is not required, "required" is left unset, and the values
+// for each instructionID are identical.
+typedef uint16_t InstrUID;
+
+// ModRMDecisionType - describes the type of ModR/M decision, allowing the
+// consumer to determine the number of entries in it.
+//
+// MODRM_ONEENTRY - No matter what the value of the ModR/M byte is, the decoded
+// instruction is the same.
+// MODRM_SPLITRM - If the ModR/M byte is between 0x00 and 0xbf, the opcode
+// corresponds to one instruction; otherwise, it corresponds to
+// a different instruction.
+// MODRM_SPLITMISC- If the ModR/M byte is between 0x00 and 0xbf, the ModR/M
+//                  byte divided by 8 is used to select the instruction;
+//                  otherwise, each value of the ModR/M byte could correspond
+//                  to a different instruction.
+// MODRM_SPLITREG - The ModR/M byte divided by 8 is used to select the
+//                  instruction. This corresponds to instructions that use the
+//                  reg field as an opcode extension.
+// MODRM_FULL - Potentially, each value of the ModR/M byte could correspond
+// to a different instruction.
+#define MODRMTYPES \
+ ENUM_ENTRY(MODRM_ONEENTRY) \
+ ENUM_ENTRY(MODRM_SPLITRM) \
+ ENUM_ENTRY(MODRM_SPLITMISC) \
+ ENUM_ENTRY(MODRM_SPLITREG) \
+ ENUM_ENTRY(MODRM_FULL)
+
+#define ENUM_ENTRY(n) n,
+enum ModRMDecisionType {
+ MODRMTYPES
+ MODRM_max
+};
+#undef ENUM_ENTRY
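+
+// A minimal sketch of how a consumer might turn a ModRMDecisionType plus the
+// raw ModR/M byte into an index into a decision's InstrUID entries. The
+// helper name is hypothetical; the actual logic lives in
+// X86DisassemblerDecoder.cpp and reads TableGen-emitted tables.
+inline unsigned modRMIndexSketch(ModRMDecisionType Type, uint8_t ModRM) {
+  bool RegForm = (ModRM & 0xc0) == 0xc0; // mod == 0b11 selects register forms
+  switch (Type) {
+  case MODRM_ONEENTRY:  // one entry shared by every ModR/M value
+    return 0;
+  case MODRM_SPLITRM:   // two entries: memory form, then register form
+    return RegForm ? 1 : 0;
+  case MODRM_SPLITREG:  // reg field selects, in separate mem/reg banks of 8
+    return ((ModRM & 0x38) >> 3) + (RegForm ? 8 : 0);
+  case MODRM_SPLITMISC: // reg field for memory forms; full byte otherwise
+    return RegForm ? (ModRM & 0x3f) + 8 : ((ModRM & 0x38) >> 3);
+  case MODRM_FULL:      // one entry per possible ModR/M value
+    return ModRM;
+  default:
+    return 0;
+  }
+}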
+
+#define CASE_ENCODING_RM \
+ case ENCODING_RM: \
+ case ENCODING_RM_CD2: \
+ case ENCODING_RM_CD4: \
+ case ENCODING_RM_CD8: \
+ case ENCODING_RM_CD16: \
+ case ENCODING_RM_CD32: \
+ case ENCODING_RM_CD64
+
+// Physical encodings of instruction operands.
+#define ENCODINGS \
+ ENUM_ENTRY(ENCODING_NONE, "") \
+ ENUM_ENTRY(ENCODING_REG, "Register operand in ModR/M byte.") \
+ ENUM_ENTRY(ENCODING_RM, "R/M operand in ModR/M byte.") \
+ ENUM_ENTRY(ENCODING_RM_CD2, "R/M operand with CDisp scaling of 2") \
+ ENUM_ENTRY(ENCODING_RM_CD4, "R/M operand with CDisp scaling of 4") \
+ ENUM_ENTRY(ENCODING_RM_CD8, "R/M operand with CDisp scaling of 8") \
+ ENUM_ENTRY(ENCODING_RM_CD16,"R/M operand with CDisp scaling of 16") \
+ ENUM_ENTRY(ENCODING_RM_CD32,"R/M operand with CDisp scaling of 32") \
+ ENUM_ENTRY(ENCODING_RM_CD64,"R/M operand with CDisp scaling of 64") \
+ ENUM_ENTRY(ENCODING_VVVV, "Register operand in VEX.vvvv byte.") \
+ ENUM_ENTRY(ENCODING_WRITEMASK, "Register operand in EVEX.aaa byte.") \
+ ENUM_ENTRY(ENCODING_CB, "1-byte code offset (possible new CS value)") \
+ ENUM_ENTRY(ENCODING_CW, "2-byte") \
+ ENUM_ENTRY(ENCODING_CD, "4-byte") \
+ ENUM_ENTRY(ENCODING_CP, "6-byte") \
+ ENUM_ENTRY(ENCODING_CO, "8-byte") \
+ ENUM_ENTRY(ENCODING_CT, "10-byte") \
+ ENUM_ENTRY(ENCODING_IB, "1-byte immediate") \
+ ENUM_ENTRY(ENCODING_IW, "2-byte") \
+ ENUM_ENTRY(ENCODING_ID, "4-byte") \
+ ENUM_ENTRY(ENCODING_IO, "8-byte") \
+ ENUM_ENTRY(ENCODING_RB, "(AL..DIL, R8L..R15L) Register code added to " \
+ "the opcode byte") \
+ ENUM_ENTRY(ENCODING_RW, "(AX..DI, R8W..R15W)") \
+ ENUM_ENTRY(ENCODING_RD, "(EAX..EDI, R8D..R15D)") \
+ ENUM_ENTRY(ENCODING_RO, "(RAX..RDI, R8..R15)") \
+ ENUM_ENTRY(ENCODING_FP, "Position on floating-point stack in ModR/M " \
+ "byte.") \
+ \
+ ENUM_ENTRY(ENCODING_Iv, "Immediate of operand size") \
+ ENUM_ENTRY(ENCODING_Ia, "Immediate of address size") \
+ ENUM_ENTRY(ENCODING_Rv, "Register code of operand size added to the " \
+ "opcode byte") \
+ ENUM_ENTRY(ENCODING_DUP, "Duplicate of another operand; ID is encoded " \
+ "in type") \
+ ENUM_ENTRY(ENCODING_SI, "Source index; encoded in OpSize/Adsize prefix") \
+ ENUM_ENTRY(ENCODING_DI, "Destination index; encoded in prefixes")
+
+#define ENUM_ENTRY(n, d) n,
+enum OperandEncoding {
+ ENCODINGS
+ ENCODING_max
+};
+#undef ENUM_ENTRY
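+
+// A minimal sketch of the CDisp scaling above, which implements AVX-512's
+// compressed disp8*N addressing: the stored one-byte displacement is
+// multiplied by the scale factor before use, so under ENCODING_RM_CD8 a
+// stored disp8 of 2 denotes 16 bytes. The helper name is hypothetical.
+inline int64_t scaledDisplacementSketch(int8_t Disp8, unsigned CDScale) {
+  return static_cast<int64_t>(Disp8) * CDScale;
+}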
+
+// Semantic interpretations of instruction operands.
+#define TYPES \
+ ENUM_ENTRY(TYPE_NONE, "") \
+ ENUM_ENTRY(TYPE_REL8, "1-byte immediate address") \
+ ENUM_ENTRY(TYPE_REL16, "2-byte") \
+ ENUM_ENTRY(TYPE_REL32, "4-byte") \
+ ENUM_ENTRY(TYPE_REL64, "8-byte") \
+ ENUM_ENTRY(TYPE_PTR1616, "2+2-byte segment+offset address") \
+ ENUM_ENTRY(TYPE_PTR1632, "2+4-byte") \
+ ENUM_ENTRY(TYPE_PTR1664, "2+8-byte") \
+ ENUM_ENTRY(TYPE_R8, "1-byte register operand") \
+ ENUM_ENTRY(TYPE_R16, "2-byte") \
+ ENUM_ENTRY(TYPE_R32, "4-byte") \
+ ENUM_ENTRY(TYPE_R64, "8-byte") \
+ ENUM_ENTRY(TYPE_IMM8, "1-byte immediate operand") \
+ ENUM_ENTRY(TYPE_IMM16, "2-byte") \
+ ENUM_ENTRY(TYPE_IMM32, "4-byte") \
+ ENUM_ENTRY(TYPE_IMM64, "8-byte") \
+ ENUM_ENTRY(TYPE_IMM3, "1-byte immediate operand between 0 and 7") \
+ ENUM_ENTRY(TYPE_IMM5, "1-byte immediate operand between 0 and 31") \
+ ENUM_ENTRY(TYPE_AVX512ICC, "1-byte immediate operand for AVX512 icmp") \
+ ENUM_ENTRY(TYPE_UIMM8, "1-byte unsigned immediate operand") \
+ ENUM_ENTRY(TYPE_RM8, "1-byte register or memory operand") \
+ ENUM_ENTRY(TYPE_RM16, "2-byte") \
+ ENUM_ENTRY(TYPE_RM32, "4-byte") \
+ ENUM_ENTRY(TYPE_RM64, "8-byte") \
+ ENUM_ENTRY(TYPE_M, "Memory operand") \
+ ENUM_ENTRY(TYPE_M8, "1-byte") \
+ ENUM_ENTRY(TYPE_M16, "2-byte") \
+ ENUM_ENTRY(TYPE_M32, "4-byte") \
+ ENUM_ENTRY(TYPE_M64, "8-byte") \
+ ENUM_ENTRY(TYPE_LEA, "Effective address") \
+ ENUM_ENTRY(TYPE_M128, "16-byte (SSE/SSE2)") \
+  ENUM_ENTRY(TYPE_M256,               "32-byte (AVX)")                       \
+ ENUM_ENTRY(TYPE_M1616, "2+2-byte segment+offset address") \
+ ENUM_ENTRY(TYPE_M1632, "2+4-byte") \
+ ENUM_ENTRY(TYPE_M1664, "2+8-byte") \
+ ENUM_ENTRY(TYPE_SRCIDX8, "1-byte memory at source index") \
+ ENUM_ENTRY(TYPE_SRCIDX16, "2-byte memory at source index") \
+ ENUM_ENTRY(TYPE_SRCIDX32, "4-byte memory at source index") \
+ ENUM_ENTRY(TYPE_SRCIDX64, "8-byte memory at source index") \
+ ENUM_ENTRY(TYPE_DSTIDX8, "1-byte memory at destination index") \
+ ENUM_ENTRY(TYPE_DSTIDX16, "2-byte memory at destination index") \
+ ENUM_ENTRY(TYPE_DSTIDX32, "4-byte memory at destination index") \
+ ENUM_ENTRY(TYPE_DSTIDX64, "8-byte memory at destination index") \
+ ENUM_ENTRY(TYPE_MOFFS8, "1-byte memory offset (relative to segment " \
+ "base)") \
+ ENUM_ENTRY(TYPE_MOFFS16, "2-byte") \
+ ENUM_ENTRY(TYPE_MOFFS32, "4-byte") \
+ ENUM_ENTRY(TYPE_MOFFS64, "8-byte") \
+ ENUM_ENTRY(TYPE_SREG, "Byte with single bit set: 0 = ES, 1 = CS, " \
+ "2 = SS, 3 = DS, 4 = FS, 5 = GS") \
+  ENUM_ENTRY(TYPE_M32FP,              "32-bit IEEE 754 memory floating-point operand") \
+ ENUM_ENTRY(TYPE_M64FP, "64-bit") \
+ ENUM_ENTRY(TYPE_M80FP, "80-bit extended") \
+ ENUM_ENTRY(TYPE_ST, "Position on the floating-point stack") \
+ ENUM_ENTRY(TYPE_MM64, "8-byte MMX register") \
+ ENUM_ENTRY(TYPE_XMM, "XMM register operand") \
+ ENUM_ENTRY(TYPE_XMM32, "4-byte XMM register or memory operand") \
+ ENUM_ENTRY(TYPE_XMM64, "8-byte") \
+ ENUM_ENTRY(TYPE_XMM128, "16-byte") \
+ ENUM_ENTRY(TYPE_XMM256, "32-byte") \
+ ENUM_ENTRY(TYPE_XMM512, "64-byte") \
+ ENUM_ENTRY(TYPE_VK1, "1-bit") \
+ ENUM_ENTRY(TYPE_VK2, "2-bit") \
+ ENUM_ENTRY(TYPE_VK4, "4-bit") \
+ ENUM_ENTRY(TYPE_VK8, "8-bit") \
+ ENUM_ENTRY(TYPE_VK16, "16-bit") \
+ ENUM_ENTRY(TYPE_VK32, "32-bit") \
+ ENUM_ENTRY(TYPE_VK64, "64-bit") \
+ ENUM_ENTRY(TYPE_XMM0, "Implicit use of XMM0") \
+ ENUM_ENTRY(TYPE_SEGMENTREG, "Segment register operand") \
+ ENUM_ENTRY(TYPE_DEBUGREG, "Debug register operand") \
+ ENUM_ENTRY(TYPE_CONTROLREG, "Control register operand") \
+ ENUM_ENTRY(TYPE_BNDR, "MPX bounds register") \
+ \
+ ENUM_ENTRY(TYPE_Mv, "Memory operand of operand size") \
+ ENUM_ENTRY(TYPE_Rv, "Register operand of operand size") \
+ ENUM_ENTRY(TYPE_IMMv, "Immediate operand of operand size") \
+ ENUM_ENTRY(TYPE_RELv, "Immediate address of operand size") \
+ ENUM_ENTRY(TYPE_DUP0, "Duplicate of operand 0") \
+ ENUM_ENTRY(TYPE_DUP1, "operand 1") \
+ ENUM_ENTRY(TYPE_DUP2, "operand 2") \
+ ENUM_ENTRY(TYPE_DUP3, "operand 3") \
+ ENUM_ENTRY(TYPE_DUP4, "operand 4") \
+ ENUM_ENTRY(TYPE_M512, "512-bit FPU/MMX/XMM/MXCSR state")
+
+#define ENUM_ENTRY(n, d) n,
+enum OperandType {
+ TYPES
+ TYPE_max
+};
+#undef ENUM_ENTRY
+
+/// \brief The specification for how to extract and interpret one operand.
+struct OperandSpecifier {
+ uint8_t encoding;
+ uint8_t type;
+};
+
+static const unsigned X86_MAX_OPERANDS = 6;
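+
+// A minimal sketch of how a generated instruction specification might fill
+// these out (hypothetical table name and values, for illustration only):
+//   static const OperandSpecifier AddRMSpec[X86_MAX_OPERANDS] = {
+//     {ENCODING_REG,  TYPE_Rv},   {ENCODING_RM,   TYPE_Rv},
+//     {ENCODING_NONE, TYPE_NONE}, {ENCODING_NONE, TYPE_NONE},
+//     {ENCODING_NONE, TYPE_NONE}, {ENCODING_NONE, TYPE_NONE}};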
+
+/// Decoding mode for the Intel disassembler. 16-bit, 32-bit, and 64-bit modes
+/// are supported, and correspond to real mode, protected/compatibility mode,
+/// and 64-bit (long) mode, respectively.
+enum DisassemblerMode {
+ MODE_16BIT,
+ MODE_32BIT,
+ MODE_64BIT
+};
+
+} // namespace X86Disassembler
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
new file mode 100644
index 0000000..b4c0bc4
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
@@ -0,0 +1,289 @@
+//===-- X86ATTInstPrinter.cpp - AT&T assembly instruction printing --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file includes code for rendering MCInst instances as AT&T-style
+// assembly.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86ATTInstPrinter.h"
+#include "MCTargetDesc/X86BaseInfo.h"
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "X86InstComments.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/FormattedStream.h"
+#include <map>
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+// Include the auto-generated portion of the assembly writer.
+#define PRINT_ALIAS_INSTR
+#include "X86GenAsmWriter.inc"
+
+void X86ATTInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
+ OS << markup("<reg:") << '%' << getRegisterName(RegNo) << markup(">");
+}
+
+void X86ATTInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
+ StringRef Annot, const MCSubtargetInfo &STI) {
+ const MCInstrDesc &Desc = MII.get(MI->getOpcode());
+ uint64_t TSFlags = Desc.TSFlags;
+
+ // If verbose assembly is enabled, we can print some informative comments.
+ if (CommentStream)
+ HasCustomInstComment =
+ EmitAnyX86InstComments(MI, *CommentStream, getRegisterName);
+
+ if (TSFlags & X86II::LOCK)
+ OS << "\tlock\t";
+
+  // Output CALLpcrel32 as "callq" in 64-bit mode.
+  // In Intel syntax it is always emitted as "call".
+  //
+  // TODO: This hack should probably be redesigned via InstAlias in
+  // InstrInfo.td as soon as the Requires clause is supported properly
+  // for InstAlias.
+ if (MI->getOpcode() == X86::CALLpcrel32 &&
+ (STI.getFeatureBits()[X86::Mode64Bit])) {
+ OS << "\tcallq\t";
+ printPCRelImm(MI, 0, OS);
+ }
+ // Try to print any aliases first.
+ else if (!printAliasInstr(MI, OS))
+ printInstruction(MI, OS);
+
+ // Next always print the annotation.
+ printAnnotation(OS, Annot);
+}
+
+void X86ATTInstPrinter::printSSEAVXCC(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ int64_t Imm = MI->getOperand(Op).getImm();
+ switch (Imm) {
+ default: llvm_unreachable("Invalid ssecc/avxcc argument!");
+ case 0: O << "eq"; break;
+ case 1: O << "lt"; break;
+ case 2: O << "le"; break;
+ case 3: O << "unord"; break;
+ case 4: O << "neq"; break;
+ case 5: O << "nlt"; break;
+ case 6: O << "nle"; break;
+ case 7: O << "ord"; break;
+ case 8: O << "eq_uq"; break;
+ case 9: O << "nge"; break;
+ case 0xa: O << "ngt"; break;
+ case 0xb: O << "false"; break;
+ case 0xc: O << "neq_oq"; break;
+ case 0xd: O << "ge"; break;
+ case 0xe: O << "gt"; break;
+ case 0xf: O << "true"; break;
+ case 0x10: O << "eq_os"; break;
+ case 0x11: O << "lt_oq"; break;
+ case 0x12: O << "le_oq"; break;
+ case 0x13: O << "unord_s"; break;
+ case 0x14: O << "neq_us"; break;
+ case 0x15: O << "nlt_uq"; break;
+ case 0x16: O << "nle_uq"; break;
+ case 0x17: O << "ord_s"; break;
+ case 0x18: O << "eq_us"; break;
+ case 0x19: O << "nge_uq"; break;
+ case 0x1a: O << "ngt_uq"; break;
+ case 0x1b: O << "false_os"; break;
+ case 0x1c: O << "neq_os"; break;
+ case 0x1d: O << "ge_oq"; break;
+ case 0x1e: O << "gt_oq"; break;
+ case 0x1f: O << "true_us"; break;
+ }
+}
+
+void X86ATTInstPrinter::printXOPCC(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ int64_t Imm = MI->getOperand(Op).getImm();
+ switch (Imm) {
+ default: llvm_unreachable("Invalid xopcc argument!");
+ case 0: O << "lt"; break;
+ case 1: O << "le"; break;
+ case 2: O << "gt"; break;
+ case 3: O << "ge"; break;
+ case 4: O << "eq"; break;
+ case 5: O << "neq"; break;
+ case 6: O << "false"; break;
+ case 7: O << "true"; break;
+ }
+}
+
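+// Prints the AVX-512 embedded rounding-control annotation ({rn-sae},
+// {rd-sae}, {ru-sae}, or {rz-sae}) selected by the low two bits of the
+// immediate operand.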
+void X86ATTInstPrinter::printRoundingControl(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ int64_t Imm = MI->getOperand(Op).getImm() & 0x3;
+ switch (Imm) {
+ case 0: O << "{rn-sae}"; break;
+ case 1: O << "{rd-sae}"; break;
+ case 2: O << "{ru-sae}"; break;
+ case 3: O << "{rz-sae}"; break;
+ }
+}
+
+/// printPCRelImm - This is used to print an immediate value that ends up
+/// being encoded as a pc-relative value (e.g. for jumps and calls). These
+/// print slightly differently from normal immediates. For example, a $ is
+/// not emitted.
+void X86ATTInstPrinter::printPCRelImm(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ const MCOperand &Op = MI->getOperand(OpNo);
+ if (Op.isImm())
+ O << formatImm(Op.getImm());
+ else {
+ assert(Op.isExpr() && "unknown pcrel immediate operand");
+ // If a symbolic branch target was added as a constant expression then print
+ // that address in hex.
+ const MCConstantExpr *BranchTarget = dyn_cast<MCConstantExpr>(Op.getExpr());
+ int64_t Address;
+ if (BranchTarget && BranchTarget->evaluateAsAbsolute(Address)) {
+ O << formatHex((uint64_t)Address);
+ } else {
+ // Otherwise, just print the expression.
+ Op.getExpr()->print(O, &MAI);
+ }
+ }
+}
+
+void X86ATTInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ const MCOperand &Op = MI->getOperand(OpNo);
+ if (Op.isReg()) {
+ printRegName(O, Op.getReg());
+ } else if (Op.isImm()) {
+ // Print X86 immediates as signed values.
+ O << markup("<imm:") << '$' << formatImm((int64_t)Op.getImm())
+ << markup(">");
+
+ // If there are no instruction-specific comments, add a comment clarifying
+ // the hex value of the immediate operand when it isn't in the range
+ // [-256,255].
+ if (CommentStream && !HasCustomInstComment &&
+ (Op.getImm() > 255 || Op.getImm() < -256))
+ *CommentStream << format("imm = 0x%" PRIX64 "\n", (uint64_t)Op.getImm());
+
+ } else {
+ assert(Op.isExpr() && "unknown operand kind in printOperand");
+ O << markup("<imm:") << '$';
+ Op.getExpr()->print(O, &MAI);
+ O << markup(">");
+ }
+}
+
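+// AT&T memory references print as segment:disp(base,index,scale); a fully
+// populated operand looks like %gs:16(%rbx,%rcx,4).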
+void X86ATTInstPrinter::printMemReference(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ const MCOperand &BaseReg = MI->getOperand(Op + X86::AddrBaseReg);
+ const MCOperand &IndexReg = MI->getOperand(Op + X86::AddrIndexReg);
+ const MCOperand &DispSpec = MI->getOperand(Op + X86::AddrDisp);
+ const MCOperand &SegReg = MI->getOperand(Op + X86::AddrSegmentReg);
+
+ O << markup("<mem:");
+
+ // If this has a segment register, print it.
+ if (SegReg.getReg()) {
+ printOperand(MI, Op + X86::AddrSegmentReg, O);
+ O << ':';
+ }
+
+ if (DispSpec.isImm()) {
+ int64_t DispVal = DispSpec.getImm();
+ if (DispVal || (!IndexReg.getReg() && !BaseReg.getReg()))
+ O << formatImm(DispVal);
+ } else {
+ assert(DispSpec.isExpr() && "non-immediate displacement for LEA?");
+ DispSpec.getExpr()->print(O, &MAI);
+ }
+
+ if (IndexReg.getReg() || BaseReg.getReg()) {
+ O << '(';
+ if (BaseReg.getReg())
+ printOperand(MI, Op + X86::AddrBaseReg, O);
+
+ if (IndexReg.getReg()) {
+ O << ',';
+ printOperand(MI, Op + X86::AddrIndexReg, O);
+ unsigned ScaleVal = MI->getOperand(Op + X86::AddrScaleAmt).getImm();
+ if (ScaleVal != 1) {
+ O << ',' << markup("<imm:") << ScaleVal // never printed in hex.
+ << markup(">");
+ }
+ }
+ O << ')';
+ }
+
+ O << markup(">");
+}
+
+void X86ATTInstPrinter::printSrcIdx(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ const MCOperand &SegReg = MI->getOperand(Op + 1);
+
+ O << markup("<mem:");
+
+ // If this has a segment register, print it.
+ if (SegReg.getReg()) {
+ printOperand(MI, Op + 1, O);
+ O << ':';
+ }
+
+ O << "(";
+ printOperand(MI, Op, O);
+ O << ")";
+
+ O << markup(">");
+}
+
+void X86ATTInstPrinter::printDstIdx(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ O << markup("<mem:");
+
+ O << "%es:(";
+ printOperand(MI, Op, O);
+ O << ")";
+
+ O << markup(">");
+}
+
+void X86ATTInstPrinter::printMemOffset(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ const MCOperand &DispSpec = MI->getOperand(Op);
+ const MCOperand &SegReg = MI->getOperand(Op + 1);
+
+ O << markup("<mem:");
+
+ // If this has a segment register, print it.
+ if (SegReg.getReg()) {
+ printOperand(MI, Op + 1, O);
+ O << ':';
+ }
+
+ if (DispSpec.isImm()) {
+ O << formatImm(DispSpec.getImm());
+ } else {
+ assert(DispSpec.isExpr() && "non-immediate displacement?");
+ DispSpec.getExpr()->print(O, &MAI);
+ }
+
+ O << markup(">");
+}
+
+void X86ATTInstPrinter::printU8Imm(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ O << markup("<imm:") << '$' << formatImm(MI->getOperand(Op).getImm() & 0xff)
+ << markup(">");
+}
diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h
new file mode 100644
index 0000000..bbb3090
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h
@@ -0,0 +1,142 @@
+//==- X86ATTInstPrinter.h - Convert X86 MCInst to assembly syntax -*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an X86 MCInst to AT&T style .s file syntax.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_INSTPRINTER_X86ATTINSTPRINTER_H
+#define LLVM_LIB_TARGET_X86_INSTPRINTER_X86ATTINSTPRINTER_H
+
+#include "llvm/MC/MCInstPrinter.h"
+
+namespace llvm {
+
+class X86ATTInstPrinter final : public MCInstPrinter {
+public:
+ X86ATTInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+ const MCRegisterInfo &MRI)
+ : MCInstPrinter(MAI, MII, MRI) {}
+
+ void printRegName(raw_ostream &OS, unsigned RegNo) const override;
+ void printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot,
+ const MCSubtargetInfo &STI) override;
+
+ // Autogenerated by tblgen, returns true if we successfully printed an
+ // alias.
+ bool printAliasInstr(const MCInst *MI, raw_ostream &OS);
+ void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx,
+ unsigned PrintMethodIdx, raw_ostream &O);
+
+ // Autogenerated by tblgen.
+ void printInstruction(const MCInst *MI, raw_ostream &OS);
+ static const char *getRegisterName(unsigned RegNo);
+
+ void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
+ void printMemReference(const MCInst *MI, unsigned Op, raw_ostream &OS);
+ void printSSEAVXCC(const MCInst *MI, unsigned Op, raw_ostream &OS);
+ void printXOPCC(const MCInst *MI, unsigned Op, raw_ostream &OS);
+ void printPCRelImm(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
+ void printSrcIdx(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
+ void printDstIdx(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
+ void printMemOffset(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
+ void printRoundingControl(const MCInst *MI, unsigned Op, raw_ostream &OS);
+ void printU8Imm(const MCInst *MI, unsigned Op, raw_ostream &OS);
+
+ void printanymem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printMemReference(MI, OpNo, O);
+ }
+
+ void printopaquemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printMemReference(MI, OpNo, O);
+ }
+
+ void printi8mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printMemReference(MI, OpNo, O);
+ }
+ void printi16mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printMemReference(MI, OpNo, O);
+ }
+ void printi32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printMemReference(MI, OpNo, O);
+ }
+ void printi64mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printMemReference(MI, OpNo, O);
+ }
+ void printi128mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printMemReference(MI, OpNo, O);
+ }
+ void printi256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printMemReference(MI, OpNo, O);
+ }
+ void printi512mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printMemReference(MI, OpNo, O);
+ }
+ void printf32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printMemReference(MI, OpNo, O);
+ }
+ void printf64mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printMemReference(MI, OpNo, O);
+ }
+ void printf80mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printMemReference(MI, OpNo, O);
+ }
+ void printf128mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printMemReference(MI, OpNo, O);
+ }
+ void printf256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printMemReference(MI, OpNo, O);
+ }
+ void printf512mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printMemReference(MI, OpNo, O);
+ }
+
+ void printSrcIdx8(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printSrcIdx(MI, OpNo, O);
+ }
+ void printSrcIdx16(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printSrcIdx(MI, OpNo, O);
+ }
+ void printSrcIdx32(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printSrcIdx(MI, OpNo, O);
+ }
+ void printSrcIdx64(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printSrcIdx(MI, OpNo, O);
+ }
+ void printDstIdx8(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printDstIdx(MI, OpNo, O);
+ }
+ void printDstIdx16(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printDstIdx(MI, OpNo, O);
+ }
+ void printDstIdx32(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printDstIdx(MI, OpNo, O);
+ }
+ void printDstIdx64(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printDstIdx(MI, OpNo, O);
+ }
+ void printMemOffs8(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printMemOffset(MI, OpNo, O);
+ }
+ void printMemOffs16(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printMemOffset(MI, OpNo, O);
+ }
+ void printMemOffs32(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printMemOffset(MI, OpNo, O);
+ }
+ void printMemOffs64(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printMemOffset(MI, OpNo, O);
+ }
+
+private:
+ bool HasCustomInstComment;
+};
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp b/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp
new file mode 100644
index 0000000..73f654c
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp
@@ -0,0 +1,820 @@
+//===-- X86InstComments.cpp - Generate verbose-asm comments for instrs ----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This defines functionality used to emit comments about X86 instructions to
+// an output stream for -fverbose-asm.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86InstComments.h"
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "Utils/X86ShuffleDecode.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+static unsigned getVectorRegSize(unsigned RegNo) {
+ if (X86::ZMM0 <= RegNo && RegNo <= X86::ZMM31)
+ return 512;
+ if (X86::YMM0 <= RegNo && RegNo <= X86::YMM31)
+ return 256;
+ if (X86::XMM0 <= RegNo && RegNo <= X86::XMM31)
+ return 128;
+ if (X86::MM0 <= RegNo && RegNo <= X86::MM7)
+ return 64;
+
+ llvm_unreachable("Unknown vector reg!");
+}
+
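+// Maps a vector register operand to a vector MVT; for example, a ZMM
+// register operand with an MVT::f32 scalar yields MVT::v16f32
+// (512 bits / 32 bits = 16 elements).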
+static MVT getRegOperandVectorVT(const MCInst *MI, const MVT &ScalarVT,
+ unsigned OperandIndex) {
+ unsigned OpReg = MI->getOperand(OperandIndex).getReg();
+ return MVT::getVectorVT(ScalarVT,
+ getVectorRegSize(OpReg)/ScalarVT.getSizeInBits());
+}
+
+/// \brief Extracts the src/dst types for a given zero extension instruction.
+/// \note While the number of elements in the DstVT type is correct, the
+/// number in the SrcVT type is expanded to fill the src xmm register, and
+/// the upper elements may not be included in the dst xmm/ymm register.
+static void getZeroExtensionTypes(const MCInst *MI, MVT &SrcVT, MVT &DstVT) {
+ switch (MI->getOpcode()) {
+ default:
+ llvm_unreachable("Unknown zero extension instruction");
+ // i8 zero extension
+ case X86::PMOVZXBWrm:
+ case X86::PMOVZXBWrr:
+ case X86::VPMOVZXBWrm:
+ case X86::VPMOVZXBWrr:
+ SrcVT = MVT::v16i8;
+ DstVT = MVT::v8i16;
+ break;
+ case X86::VPMOVZXBWYrm:
+ case X86::VPMOVZXBWYrr:
+ SrcVT = MVT::v16i8;
+ DstVT = MVT::v16i16;
+ break;
+ case X86::PMOVZXBDrm:
+ case X86::PMOVZXBDrr:
+ case X86::VPMOVZXBDrm:
+ case X86::VPMOVZXBDrr:
+ SrcVT = MVT::v16i8;
+ DstVT = MVT::v4i32;
+ break;
+ case X86::VPMOVZXBDYrm:
+ case X86::VPMOVZXBDYrr:
+ SrcVT = MVT::v16i8;
+ DstVT = MVT::v8i32;
+ break;
+ case X86::PMOVZXBQrm:
+ case X86::PMOVZXBQrr:
+ case X86::VPMOVZXBQrm:
+ case X86::VPMOVZXBQrr:
+ SrcVT = MVT::v16i8;
+ DstVT = MVT::v2i64;
+ break;
+ case X86::VPMOVZXBQYrm:
+ case X86::VPMOVZXBQYrr:
+ SrcVT = MVT::v16i8;
+ DstVT = MVT::v4i64;
+ break;
+ // i16 zero extension
+ case X86::PMOVZXWDrm:
+ case X86::PMOVZXWDrr:
+ case X86::VPMOVZXWDrm:
+ case X86::VPMOVZXWDrr:
+ SrcVT = MVT::v8i16;
+ DstVT = MVT::v4i32;
+ break;
+ case X86::VPMOVZXWDYrm:
+ case X86::VPMOVZXWDYrr:
+ SrcVT = MVT::v8i16;
+ DstVT = MVT::v8i32;
+ break;
+ case X86::PMOVZXWQrm:
+ case X86::PMOVZXWQrr:
+ case X86::VPMOVZXWQrm:
+ case X86::VPMOVZXWQrr:
+ SrcVT = MVT::v8i16;
+ DstVT = MVT::v2i64;
+ break;
+ case X86::VPMOVZXWQYrm:
+ case X86::VPMOVZXWQYrr:
+ SrcVT = MVT::v8i16;
+ DstVT = MVT::v4i64;
+ break;
+ // i32 zero extension
+ case X86::PMOVZXDQrm:
+ case X86::PMOVZXDQrr:
+ case X86::VPMOVZXDQrm:
+ case X86::VPMOVZXDQrr:
+ SrcVT = MVT::v4i32;
+ DstVT = MVT::v2i64;
+ break;
+ case X86::VPMOVZXDQYrm:
+ case X86::VPMOVZXDQYrr:
+ SrcVT = MVT::v4i32;
+ DstVT = MVT::v4i64;
+ break;
+ }
+}
+
+#define CASE_MASK_INS_COMMON(Inst, Suffix, src) \
+ case X86::V##Inst##Suffix##src: \
+ case X86::V##Inst##Suffix##src##k: \
+ case X86::V##Inst##Suffix##src##kz:
+
+#define CASE_SSE_INS_COMMON(Inst, src) \
+ case X86::Inst##src:
+
+#define CASE_AVX_INS_COMMON(Inst, Suffix, src) \
+ case X86::V##Inst##Suffix##src:
+
+#define CASE_MOVDUP(Inst, src) \
+ CASE_MASK_INS_COMMON(Inst, Z, r##src) \
+ CASE_MASK_INS_COMMON(Inst, Z256, r##src) \
+ CASE_MASK_INS_COMMON(Inst, Z128, r##src) \
+ CASE_AVX_INS_COMMON(Inst, , r##src) \
+ CASE_AVX_INS_COMMON(Inst, Y, r##src) \
+ CASE_SSE_INS_COMMON(Inst, r##src) \
+
+#define CASE_UNPCK(Inst, src) \
+ CASE_MASK_INS_COMMON(Inst, Z, r##src) \
+ CASE_MASK_INS_COMMON(Inst, Z256, r##src) \
+ CASE_MASK_INS_COMMON(Inst, Z128, r##src) \
+ CASE_AVX_INS_COMMON(Inst, , r##src) \
+ CASE_AVX_INS_COMMON(Inst, Y, r##src) \
+ CASE_SSE_INS_COMMON(Inst, r##src) \
+
+#define CASE_SHUF(Inst, src) \
+ CASE_MASK_INS_COMMON(Inst, Z, r##src##i) \
+ CASE_MASK_INS_COMMON(Inst, Z256, r##src##i) \
+ CASE_MASK_INS_COMMON(Inst, Z128, r##src##i) \
+ CASE_AVX_INS_COMMON(Inst, , r##src##i) \
+ CASE_AVX_INS_COMMON(Inst, Y, r##src##i) \
+ CASE_SSE_INS_COMMON(Inst, r##src##i) \
+
+#define CASE_VPERM(Inst, src) \
+ CASE_MASK_INS_COMMON(Inst, Z, src##i) \
+ CASE_MASK_INS_COMMON(Inst, Z256, src##i) \
+ CASE_MASK_INS_COMMON(Inst, Z128, src##i) \
+ CASE_AVX_INS_COMMON(Inst, , src##i) \
+ CASE_AVX_INS_COMMON(Inst, Y, src##i) \
+
+#define CASE_VSHUF(Inst, src) \
+ CASE_MASK_INS_COMMON(SHUFF##Inst, Z, r##src##i) \
+ CASE_MASK_INS_COMMON(SHUFI##Inst, Z, r##src##i) \
+ CASE_MASK_INS_COMMON(SHUFF##Inst, Z256, r##src##i) \
+ CASE_MASK_INS_COMMON(SHUFI##Inst, Z256, r##src##i) \
+
+/// \brief Extracts the types, and whether there is a memory operand, for a
+/// given (SHUFF32x4/SHUFF64x2/SHUFI32x4/SHUFI64x2) instruction.
+static void getVSHUF64x2FamilyInfo(const MCInst *MI, MVT &VT, bool &HasMemOp) {
+ HasMemOp = false;
+ switch (MI->getOpcode()) {
+ default:
+    llvm_unreachable("Unknown VSHUF64x2 family instruction.");
+ break;
+ CASE_VSHUF(64X2, m)
+ HasMemOp = true; // FALL THROUGH.
+ CASE_VSHUF(64X2, r)
+ VT = getRegOperandVectorVT(MI, MVT::i64, 0);
+ break;
+ CASE_VSHUF(32X4, m)
+ HasMemOp = true; // FALL THROUGH.
+ CASE_VSHUF(32X4, r)
+ VT = getRegOperandVectorVT(MI, MVT::i32, 0);
+ break;
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// Top Level Entrypoint
+//===----------------------------------------------------------------------===//
+
+/// EmitAnyX86InstComments - This function decodes x86 instructions and prints
+/// newline-terminated strings into the specified stream if desired. This
+/// information is shown in disassembly dumps when verbose assembly is enabled.
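+/// For example, an AT&T-syntax "vunpcklps %xmm1, %xmm0, %xmm2" yields a
+/// comment along the lines of "xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]".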
+bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
+ const char *(*getRegName)(unsigned)) {
+ // If this is a shuffle operation, the switch should fill in this state.
+ SmallVector<int, 8> ShuffleMask;
+ const char *DestName = nullptr, *Src1Name = nullptr, *Src2Name = nullptr;
+
+ switch (MI->getOpcode()) {
+ default:
+ // Not an instruction for which we can decode comments.
+ return false;
+
+ case X86::BLENDPDrri:
+ case X86::VBLENDPDrri:
+ case X86::VBLENDPDYrri:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::BLENDPDrmi:
+ case X86::VBLENDPDrmi:
+ case X86::VBLENDPDYrmi:
+ if (MI->getOperand(MI->getNumOperands() - 1).isImm())
+ DecodeBLENDMask(getRegOperandVectorVT(MI, MVT::f64, 0),
+ MI->getOperand(MI->getNumOperands() - 1).getImm(),
+ ShuffleMask);
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+
+ case X86::BLENDPSrri:
+ case X86::VBLENDPSrri:
+ case X86::VBLENDPSYrri:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::BLENDPSrmi:
+ case X86::VBLENDPSrmi:
+ case X86::VBLENDPSYrmi:
+ if (MI->getOperand(MI->getNumOperands() - 1).isImm())
+ DecodeBLENDMask(getRegOperandVectorVT(MI, MVT::f32, 0),
+ MI->getOperand(MI->getNumOperands() - 1).getImm(),
+ ShuffleMask);
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+
+ case X86::PBLENDWrri:
+ case X86::VPBLENDWrri:
+ case X86::VPBLENDWYrri:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::PBLENDWrmi:
+ case X86::VPBLENDWrmi:
+ case X86::VPBLENDWYrmi:
+ if (MI->getOperand(MI->getNumOperands() - 1).isImm())
+ DecodeBLENDMask(getRegOperandVectorVT(MI, MVT::i16, 0),
+ MI->getOperand(MI->getNumOperands() - 1).getImm(),
+ ShuffleMask);
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+
+ case X86::VPBLENDDrri:
+ case X86::VPBLENDDYrri:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::VPBLENDDrmi:
+ case X86::VPBLENDDYrmi:
+ if (MI->getOperand(MI->getNumOperands() - 1).isImm())
+ DecodeBLENDMask(getRegOperandVectorVT(MI, MVT::i32, 0),
+ MI->getOperand(MI->getNumOperands() - 1).getImm(),
+ ShuffleMask);
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+
+ case X86::INSERTPSrr:
+ case X86::VINSERTPSrr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::INSERTPSrm:
+ case X86::VINSERTPSrm:
+ DestName = getRegName(MI->getOperand(0).getReg());
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ if (MI->getOperand(MI->getNumOperands() - 1).isImm())
+ DecodeINSERTPSMask(MI->getOperand(MI->getNumOperands() - 1).getImm(),
+ ShuffleMask);
+ break;
+
+ case X86::MOVLHPSrr:
+ case X86::VMOVLHPSrr:
+ case X86::VMOVLHPSZrr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodeMOVLHPSMask(2, ShuffleMask);
+ break;
+
+ case X86::MOVHLPSrr:
+ case X86::VMOVHLPSrr:
+ case X86::VMOVHLPSZrr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodeMOVHLPSMask(2, ShuffleMask);
+ break;
+
+ CASE_MOVDUP(MOVSLDUP, r)
+ Src1Name = getRegName(MI->getOperand(MI->getNumOperands() - 1).getReg());
+ // FALL THROUGH.
+ CASE_MOVDUP(MOVSLDUP, m)
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodeMOVSLDUPMask(getRegOperandVectorVT(MI, MVT::f32, 0), ShuffleMask);
+ break;
+
+ CASE_MOVDUP(MOVSHDUP, r)
+ Src1Name = getRegName(MI->getOperand(MI->getNumOperands() - 1).getReg());
+ // FALL THROUGH.
+ CASE_MOVDUP(MOVSHDUP, m)
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodeMOVSHDUPMask(getRegOperandVectorVT(MI, MVT::f32, 0), ShuffleMask);
+ break;
+
+ CASE_MOVDUP(MOVDDUP, r)
+ Src1Name = getRegName(MI->getOperand(MI->getNumOperands() - 1).getReg());
+ // FALL THROUGH.
+ CASE_MOVDUP(MOVDDUP, m)
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodeMOVDDUPMask(getRegOperandVectorVT(MI, MVT::f64, 0), ShuffleMask);
+ break;
+
+ case X86::PSLLDQri:
+ case X86::VPSLLDQri:
+ case X86::VPSLLDQYri:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ if (MI->getOperand(MI->getNumOperands() - 1).isImm())
+ DecodePSLLDQMask(getRegOperandVectorVT(MI, MVT::i8, 0),
+ MI->getOperand(MI->getNumOperands() - 1).getImm(),
+ ShuffleMask);
+ break;
+
+ case X86::PSRLDQri:
+ case X86::VPSRLDQri:
+ case X86::VPSRLDQYri:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ if (MI->getOperand(MI->getNumOperands() - 1).isImm())
+ DecodePSRLDQMask(getRegOperandVectorVT(MI, MVT::i8, 0),
+ MI->getOperand(MI->getNumOperands() - 1).getImm(),
+ ShuffleMask);
+ break;
+
+ case X86::PALIGNR128rr:
+ case X86::VPALIGNR128rr:
+ case X86::VPALIGNR256rr:
+ Src1Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::PALIGNR128rm:
+ case X86::VPALIGNR128rm:
+ case X86::VPALIGNR256rm:
+ Src2Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ if (MI->getOperand(MI->getNumOperands() - 1).isImm())
+ DecodePALIGNRMask(getRegOperandVectorVT(MI, MVT::i8, 0),
+ MI->getOperand(MI->getNumOperands() - 1).getImm(),
+ ShuffleMask);
+ break;
+
+ case X86::PSHUFDri:
+ case X86::VPSHUFDri:
+ case X86::VPSHUFDYri:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ // FALL THROUGH.
+ case X86::PSHUFDmi:
+ case X86::VPSHUFDmi:
+ case X86::VPSHUFDYmi:
+ DestName = getRegName(MI->getOperand(0).getReg());
+ if (MI->getOperand(MI->getNumOperands() - 1).isImm())
+ DecodePSHUFMask(getRegOperandVectorVT(MI, MVT::i32, 0),
+ MI->getOperand(MI->getNumOperands() - 1).getImm(),
+ ShuffleMask);
+ break;
+
+ case X86::PSHUFHWri:
+ case X86::VPSHUFHWri:
+ case X86::VPSHUFHWYri:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ // FALL THROUGH.
+ case X86::PSHUFHWmi:
+ case X86::VPSHUFHWmi:
+ case X86::VPSHUFHWYmi:
+ DestName = getRegName(MI->getOperand(0).getReg());
+ if (MI->getOperand(MI->getNumOperands() - 1).isImm())
+ DecodePSHUFHWMask(getRegOperandVectorVT(MI, MVT::i16, 0),
+ MI->getOperand(MI->getNumOperands() - 1).getImm(),
+ ShuffleMask);
+ break;
+
+ case X86::PSHUFLWri:
+ case X86::VPSHUFLWri:
+ case X86::VPSHUFLWYri:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ // FALL THROUGH.
+ case X86::PSHUFLWmi:
+ case X86::VPSHUFLWmi:
+ case X86::VPSHUFLWYmi:
+ DestName = getRegName(MI->getOperand(0).getReg());
+ if (MI->getOperand(MI->getNumOperands() - 1).isImm())
+ DecodePSHUFLWMask(getRegOperandVectorVT(MI, MVT::i16, 0),
+ MI->getOperand(MI->getNumOperands() - 1).getImm(),
+ ShuffleMask);
+ break;
+
+ case X86::MMX_PSHUFWri:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ // FALL THROUGH.
+ case X86::MMX_PSHUFWmi:
+ DestName = getRegName(MI->getOperand(0).getReg());
+ if (MI->getOperand(MI->getNumOperands() - 1).isImm())
+ DecodePSHUFMask(MVT::v4i16,
+ MI->getOperand(MI->getNumOperands() - 1).getImm(),
+ ShuffleMask);
+ break;
+
+ case X86::PSWAPDrr:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ // FALL THROUGH.
+ case X86::PSWAPDrm:
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodePSWAPMask(MVT::v2i32, ShuffleMask);
+ break;
+
+ CASE_UNPCK(PUNPCKHBW, r)
+ case X86::MMX_PUNPCKHBWirr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ CASE_UNPCK(PUNPCKHBW, m)
+ case X86::MMX_PUNPCKHBWirm:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodeUNPCKHMask(getRegOperandVectorVT(MI, MVT::i8, 0), ShuffleMask);
+ break;
+
+ CASE_UNPCK(PUNPCKHWD, r)
+ case X86::MMX_PUNPCKHWDirr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ CASE_UNPCK(PUNPCKHWD, m)
+ case X86::MMX_PUNPCKHWDirm:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodeUNPCKHMask(getRegOperandVectorVT(MI, MVT::i16, 0), ShuffleMask);
+ break;
+
+ CASE_UNPCK(PUNPCKHDQ, r)
+ case X86::MMX_PUNPCKHDQirr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ CASE_UNPCK(PUNPCKHDQ, m)
+ case X86::MMX_PUNPCKHDQirm:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodeUNPCKHMask(getRegOperandVectorVT(MI, MVT::i32, 0), ShuffleMask);
+ break;
+
+ CASE_UNPCK(PUNPCKHQDQ, r)
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ CASE_UNPCK(PUNPCKHQDQ, m)
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodeUNPCKHMask(getRegOperandVectorVT(MI, MVT::i64, 0), ShuffleMask);
+ break;
+
+ CASE_UNPCK(PUNPCKLBW, r)
+ case X86::MMX_PUNPCKLBWirr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ CASE_UNPCK(PUNPCKLBW, m)
+ case X86::MMX_PUNPCKLBWirm:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodeUNPCKLMask(getRegOperandVectorVT(MI, MVT::i8, 0), ShuffleMask);
+ break;
+
+ CASE_UNPCK(PUNPCKLWD, r)
+ case X86::MMX_PUNPCKLWDirr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ CASE_UNPCK(PUNPCKLWD, m)
+ case X86::MMX_PUNPCKLWDirm:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodeUNPCKLMask(getRegOperandVectorVT(MI, MVT::i16, 0), ShuffleMask);
+ break;
+
+ CASE_UNPCK(PUNPCKLDQ, r)
+ case X86::MMX_PUNPCKLDQirr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ CASE_UNPCK(PUNPCKLDQ, m)
+ case X86::MMX_PUNPCKLDQirm:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodeUNPCKLMask(getRegOperandVectorVT(MI, MVT::i32, 0), ShuffleMask);
+ break;
+
+ CASE_UNPCK(PUNPCKLQDQ, r)
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ CASE_UNPCK(PUNPCKLQDQ, m)
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodeUNPCKLMask(getRegOperandVectorVT(MI, MVT::i64, 0), ShuffleMask);
+ break;
+
+ CASE_SHUF(SHUFPD, r)
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ CASE_SHUF(SHUFPD, m)
+ if (MI->getOperand(MI->getNumOperands() - 1).isImm())
+ DecodeSHUFPMask(getRegOperandVectorVT(MI, MVT::f64, 0),
+ MI->getOperand(MI->getNumOperands() - 1).getImm(),
+ ShuffleMask);
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+
+ CASE_SHUF(SHUFPS, r)
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ CASE_SHUF(SHUFPS, m)
+ if (MI->getOperand(MI->getNumOperands() - 1).isImm())
+ DecodeSHUFPMask(getRegOperandVectorVT(MI, MVT::f32, 0),
+ MI->getOperand(MI->getNumOperands() - 1).getImm(),
+ ShuffleMask);
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+
+ CASE_VSHUF(64X2, r)
+ CASE_VSHUF(64X2, m)
+ CASE_VSHUF(32X4, r)
+ CASE_VSHUF(32X4, m) {
+ MVT VT;
+ bool HasMemOp;
+ unsigned NumOp = MI->getNumOperands();
+ getVSHUF64x2FamilyInfo(MI, VT, HasMemOp);
+ decodeVSHUF64x2FamilyMask(VT, MI->getOperand(NumOp - 1).getImm(),
+ ShuffleMask);
+ DestName = getRegName(MI->getOperand(0).getReg());
+ if (HasMemOp) {
+ assert((NumOp >= 8) && "Expected at least 8 operands!");
+ Src1Name = getRegName(MI->getOperand(NumOp - 7).getReg());
+ } else {
+ assert((NumOp >= 4) && "Expected at least 4 operands!");
+ Src2Name = getRegName(MI->getOperand(NumOp - 2).getReg());
+ Src1Name = getRegName(MI->getOperand(NumOp - 3).getReg());
+ }
+ break;
+ }
+
+ CASE_UNPCK(UNPCKLPD, r)
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ CASE_UNPCK(UNPCKLPD, m)
+ DecodeUNPCKLMask(getRegOperandVectorVT(MI, MVT::f64, 0), ShuffleMask);
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+
+ CASE_UNPCK(UNPCKLPS, r)
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ CASE_UNPCK(UNPCKLPS, m)
+ DecodeUNPCKLMask(getRegOperandVectorVT(MI, MVT::f32, 0), ShuffleMask);
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+
+ CASE_UNPCK(UNPCKHPD, r)
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ CASE_UNPCK(UNPCKHPD, m)
+ DecodeUNPCKHMask(getRegOperandVectorVT(MI, MVT::f64, 0), ShuffleMask);
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+
+ CASE_UNPCK(UNPCKHPS, r)
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ CASE_UNPCK(UNPCKHPS, m)
+ DecodeUNPCKHMask(getRegOperandVectorVT(MI, MVT::f32, 0), ShuffleMask);
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+
+ CASE_VPERM(PERMILPS, r)
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ // FALL THROUGH.
+ CASE_VPERM(PERMILPS, m)
+ if (MI->getOperand(MI->getNumOperands() - 1).isImm())
+ DecodePSHUFMask(getRegOperandVectorVT(MI, MVT::f32, 0),
+ MI->getOperand(MI->getNumOperands() - 1).getImm(),
+ ShuffleMask);
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+
+ CASE_VPERM(PERMILPD, r)
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ // FALL THROUGH.
+ CASE_VPERM(PERMILPD, m)
+ if (MI->getOperand(MI->getNumOperands() - 1).isImm())
+ DecodePSHUFMask(getRegOperandVectorVT(MI, MVT::f64, 0),
+ MI->getOperand(MI->getNumOperands() - 1).getImm(),
+ ShuffleMask);
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+
+ case X86::VPERM2F128rr:
+ case X86::VPERM2I128rr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::VPERM2F128rm:
+ case X86::VPERM2I128rm:
+    // For instruction comment purposes, assume the 256-bit vector is v4i64.
+ if (MI->getOperand(MI->getNumOperands() - 1).isImm())
+ DecodeVPERM2X128Mask(MVT::v4i64,
+ MI->getOperand(MI->getNumOperands() - 1).getImm(),
+ ShuffleMask);
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+
+ case X86::VPERMQYri:
+ case X86::VPERMPDYri:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ // FALL THROUGH.
+ case X86::VPERMQYmi:
+ case X86::VPERMPDYmi:
+ if (MI->getOperand(MI->getNumOperands() - 1).isImm())
+ DecodeVPERMMask(MI->getOperand(MI->getNumOperands() - 1).getImm(),
+ ShuffleMask);
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+
+ case X86::MOVSDrr:
+ case X86::VMOVSDrr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ // FALL THROUGH.
+ case X86::MOVSDrm:
+ case X86::VMOVSDrm:
+ DecodeScalarMoveMask(MVT::v2f64, nullptr == Src2Name, ShuffleMask);
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+
+ case X86::MOVSSrr:
+ case X86::VMOVSSrr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ // FALL THROUGH.
+ case X86::MOVSSrm:
+ case X86::VMOVSSrm:
+ DecodeScalarMoveMask(MVT::v4f32, nullptr == Src2Name, ShuffleMask);
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+
+ case X86::MOVPQI2QIrr:
+ case X86::MOVZPQILo2PQIrr:
+ case X86::VMOVPQI2QIrr:
+ case X86::VMOVZPQILo2PQIrr:
+ case X86::VMOVZPQILo2PQIZrr:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ // FALL THROUGH.
+ case X86::MOVQI2PQIrm:
+ case X86::MOVZQI2PQIrm:
+ case X86::MOVZPQILo2PQIrm:
+ case X86::VMOVQI2PQIrm:
+ case X86::VMOVZQI2PQIrm:
+ case X86::VMOVZPQILo2PQIrm:
+ case X86::VMOVZPQILo2PQIZrm:
+ DecodeZeroMoveLowMask(MVT::v2i64, ShuffleMask);
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+
+ case X86::MOVDI2PDIrm:
+ case X86::VMOVDI2PDIrm:
+ DecodeZeroMoveLowMask(MVT::v4i32, ShuffleMask);
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+
+ case X86::EXTRQI:
+ if (MI->getOperand(2).isImm() &&
+ MI->getOperand(3).isImm())
+ DecodeEXTRQIMask(MI->getOperand(2).getImm(),
+ MI->getOperand(3).getImm(),
+ ShuffleMask);
+
+ DestName = getRegName(MI->getOperand(0).getReg());
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ break;
+
+ case X86::INSERTQI:
+ if (MI->getOperand(3).isImm() &&
+ MI->getOperand(4).isImm())
+ DecodeINSERTQIMask(MI->getOperand(3).getImm(),
+ MI->getOperand(4).getImm(),
+ ShuffleMask);
+
+ DestName = getRegName(MI->getOperand(0).getReg());
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ break;
+
+ case X86::PMOVZXBWrr:
+ case X86::PMOVZXBDrr:
+ case X86::PMOVZXBQrr:
+ case X86::PMOVZXWDrr:
+ case X86::PMOVZXWQrr:
+ case X86::PMOVZXDQrr:
+ case X86::VPMOVZXBWrr:
+ case X86::VPMOVZXBDrr:
+ case X86::VPMOVZXBQrr:
+ case X86::VPMOVZXWDrr:
+ case X86::VPMOVZXWQrr:
+ case X86::VPMOVZXDQrr:
+ case X86::VPMOVZXBWYrr:
+ case X86::VPMOVZXBDYrr:
+ case X86::VPMOVZXBQYrr:
+ case X86::VPMOVZXWDYrr:
+ case X86::VPMOVZXWQYrr:
+ case X86::VPMOVZXDQYrr:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ // FALL THROUGH.
+ case X86::PMOVZXBWrm:
+ case X86::PMOVZXBDrm:
+ case X86::PMOVZXBQrm:
+ case X86::PMOVZXWDrm:
+ case X86::PMOVZXWQrm:
+ case X86::PMOVZXDQrm:
+ case X86::VPMOVZXBWrm:
+ case X86::VPMOVZXBDrm:
+ case X86::VPMOVZXBQrm:
+ case X86::VPMOVZXWDrm:
+ case X86::VPMOVZXWQrm:
+ case X86::VPMOVZXDQrm:
+ case X86::VPMOVZXBWYrm:
+ case X86::VPMOVZXBDYrm:
+ case X86::VPMOVZXBQYrm:
+ case X86::VPMOVZXWDYrm:
+ case X86::VPMOVZXWQYrm:
+ case X86::VPMOVZXDQYrm: {
+ MVT SrcVT, DstVT;
+ getZeroExtensionTypes(MI, SrcVT, DstVT);
+ DecodeZeroExtendMask(SrcVT, DstVT, ShuffleMask);
+ DestName = getRegName(MI->getOperand(0).getReg());
+ } break;
+ }
+
+ // The only comments we decode are shuffles, so give up if we were unable to
+ // decode a shuffle mask.
+ if (ShuffleMask.empty())
+ return false;
+
+ if (!DestName) DestName = Src1Name;
+ OS << (DestName ? DestName : "mem") << " = ";
+
+ // If the two sources are the same, canonicalize the input elements to be
+ // from the first src so that we get larger element spans.
+ if (Src1Name == Src2Name) {
+ for (unsigned i = 0, e = ShuffleMask.size(); i != e; ++i) {
+ if ((int)ShuffleMask[i] >= 0 && // Not sentinel.
+ ShuffleMask[i] >= (int)e) // From second mask.
+ ShuffleMask[i] -= e;
+ }
+ }
+
+ // The shuffle mask specifies which elements of the src1/src2 fill in the
+ // destination, with a few sentinel values. Loop through and print them
+ // out.
+ for (unsigned i = 0, e = ShuffleMask.size(); i != e; ++i) {
+ if (i != 0)
+ OS << ',';
+ if (ShuffleMask[i] == SM_SentinelZero) {
+ OS << "zero";
+ continue;
+ }
+
+ // Otherwise, it must come from src1 or src2. Print the span of elements
+ // that comes from this src.
+ bool isSrc1 = ShuffleMask[i] < (int)ShuffleMask.size();
+ const char *SrcName = isSrc1 ? Src1Name : Src2Name;
+ OS << (SrcName ? SrcName : "mem") << '[';
+ bool IsFirst = true;
+ while (i != e && (int)ShuffleMask[i] != SM_SentinelZero &&
+ (ShuffleMask[i] < (int)ShuffleMask.size()) == isSrc1) {
+ if (!IsFirst)
+ OS << ',';
+ else
+ IsFirst = false;
+ if (ShuffleMask[i] == SM_SentinelUndef)
+ OS << "u";
+ else
+ OS << ShuffleMask[i] % ShuffleMask.size();
+ ++i;
+ }
+ OS << ']';
+ --i; // For loop increments element #.
+ }
+ //MI->print(OS, 0);
+ OS << "\n";
+
+ // We successfully added a comment to this instruction.
+ return true;
+}
diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.h b/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.h
new file mode 100644
index 0000000..687581b
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.h
@@ -0,0 +1,25 @@
+//=- X86InstComments.h - Generate verbose-asm comments for instrs -*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This defines functionality used to emit comments about X86 instructions to
+// an output stream for -fverbose-asm.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_INSTPRINTER_X86INSTCOMMENTS_H
+#define LLVM_LIB_TARGET_X86_INSTPRINTER_X86INSTCOMMENTS_H
+
+namespace llvm {
+ class MCInst;
+ class raw_ostream;
+ bool EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
+ const char *(*getRegName)(unsigned));
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp b/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
new file mode 100644
index 0000000..879378f
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
@@ -0,0 +1,257 @@
+//===-- X86IntelInstPrinter.cpp - Intel assembly instruction printing -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file includes code for rendering MCInst instances as Intel-style
+// assembly.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86IntelInstPrinter.h"
+#include "MCTargetDesc/X86BaseInfo.h"
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "X86InstComments.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+#include <cctype>
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+#include "X86GenAsmWriter1.inc"
+
+void X86IntelInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
+ OS << getRegisterName(RegNo);
+}
+
+void X86IntelInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
+ StringRef Annot,
+ const MCSubtargetInfo &STI) {
+ const MCInstrDesc &Desc = MII.get(MI->getOpcode());
+ uint64_t TSFlags = Desc.TSFlags;
+
+ if (TSFlags & X86II::LOCK)
+ OS << "\tlock\n";
+
+ printInstruction(MI, OS);
+
+ // Next always print the annotation.
+ printAnnotation(OS, Annot);
+
+ // If verbose assembly is enabled, we can print some informative comments.
+ if (CommentStream)
+ EmitAnyX86InstComments(MI, *CommentStream, getRegisterName);
+}
+
+void X86IntelInstPrinter::printSSEAVXCC(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ int64_t Imm = MI->getOperand(Op).getImm();
+ switch (Imm) {
+ default: llvm_unreachable("Invalid avxcc argument!");
+ case 0: O << "eq"; break;
+ case 1: O << "lt"; break;
+ case 2: O << "le"; break;
+ case 3: O << "unord"; break;
+ case 4: O << "neq"; break;
+ case 5: O << "nlt"; break;
+ case 6: O << "nle"; break;
+ case 7: O << "ord"; break;
+ case 8: O << "eq_uq"; break;
+ case 9: O << "nge"; break;
+ case 0xa: O << "ngt"; break;
+ case 0xb: O << "false"; break;
+ case 0xc: O << "neq_oq"; break;
+ case 0xd: O << "ge"; break;
+ case 0xe: O << "gt"; break;
+ case 0xf: O << "true"; break;
+ case 0x10: O << "eq_os"; break;
+ case 0x11: O << "lt_oq"; break;
+ case 0x12: O << "le_oq"; break;
+ case 0x13: O << "unord_s"; break;
+ case 0x14: O << "neq_us"; break;
+ case 0x15: O << "nlt_uq"; break;
+ case 0x16: O << "nle_uq"; break;
+ case 0x17: O << "ord_s"; break;
+ case 0x18: O << "eq_us"; break;
+ case 0x19: O << "nge_uq"; break;
+ case 0x1a: O << "ngt_uq"; break;
+ case 0x1b: O << "false_os"; break;
+ case 0x1c: O << "neq_os"; break;
+ case 0x1d: O << "ge_oq"; break;
+ case 0x1e: O << "gt_oq"; break;
+ case 0x1f: O << "true_us"; break;
+ }
+}
+
+void X86IntelInstPrinter::printXOPCC(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ int64_t Imm = MI->getOperand(Op).getImm();
+ switch (Imm) {
+ default: llvm_unreachable("Invalid xopcc argument!");
+ case 0: O << "lt"; break;
+ case 1: O << "le"; break;
+ case 2: O << "gt"; break;
+ case 3: O << "ge"; break;
+ case 4: O << "eq"; break;
+ case 5: O << "neq"; break;
+ case 6: O << "false"; break;
+ case 7: O << "true"; break;
+ }
+}
+
+void X86IntelInstPrinter::printRoundingControl(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ int64_t Imm = MI->getOperand(Op).getImm() & 0x3;
+ switch (Imm) {
+ case 0: O << "{rn-sae}"; break;
+ case 1: O << "{rd-sae}"; break;
+ case 2: O << "{ru-sae}"; break;
+ case 3: O << "{rz-sae}"; break;
+ }
+}
+
+/// printPCRelImm - This is used to print an immediate value that ends up
+/// being encoded as a pc-relative value.
+void X86IntelInstPrinter::printPCRelImm(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ const MCOperand &Op = MI->getOperand(OpNo);
+ if (Op.isImm())
+ O << formatImm(Op.getImm());
+ else {
+ assert(Op.isExpr() && "unknown pcrel immediate operand");
+ // If a symbolic branch target was added as a constant expression then print
+ // that address in hex.
+ const MCConstantExpr *BranchTarget = dyn_cast<MCConstantExpr>(Op.getExpr());
+ int64_t Address;
+ if (BranchTarget && BranchTarget->evaluateAsAbsolute(Address)) {
+ O << formatHex((uint64_t)Address);
+ }
+ else {
+ // Otherwise, just print the expression.
+ Op.getExpr()->print(O, &MAI);
+ }
+ }
+}
+
+void X86IntelInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ const MCOperand &Op = MI->getOperand(OpNo);
+ if (Op.isReg()) {
+ printRegName(O, Op.getReg());
+ } else if (Op.isImm()) {
+ O << formatImm((int64_t)Op.getImm());
+ } else {
+ assert(Op.isExpr() && "unknown operand kind in printOperand");
+ Op.getExpr()->print(O, &MAI);
+ }
+}
+
+void X86IntelInstPrinter::printMemReference(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
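+  // Renders the reference Intel-style, e.g. "[esi + 4*edi + 16]" or
+  // "es:[di]" (illustrative registers). Size prefixes such as "dword ptr"
+  // are prepended by the printiNNmem wrappers in the header, not here.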
+ const MCOperand &BaseReg = MI->getOperand(Op+X86::AddrBaseReg);
+ unsigned ScaleVal = MI->getOperand(Op+X86::AddrScaleAmt).getImm();
+ const MCOperand &IndexReg = MI->getOperand(Op+X86::AddrIndexReg);
+ const MCOperand &DispSpec = MI->getOperand(Op+X86::AddrDisp);
+ const MCOperand &SegReg = MI->getOperand(Op+X86::AddrSegmentReg);
+
+ // If this has a segment register, print it.
+ if (SegReg.getReg()) {
+ printOperand(MI, Op+X86::AddrSegmentReg, O);
+ O << ':';
+ }
+
+ O << '[';
+
+ bool NeedPlus = false;
+ if (BaseReg.getReg()) {
+ printOperand(MI, Op+X86::AddrBaseReg, O);
+ NeedPlus = true;
+ }
+
+ if (IndexReg.getReg()) {
+ if (NeedPlus) O << " + ";
+ if (ScaleVal != 1)
+ O << ScaleVal << '*';
+ printOperand(MI, Op+X86::AddrIndexReg, O);
+ NeedPlus = true;
+ }
+
+ if (!DispSpec.isImm()) {
+ if (NeedPlus) O << " + ";
+ assert(DispSpec.isExpr() && "non-immediate displacement for LEA?");
+ DispSpec.getExpr()->print(O, &MAI);
+ } else {
+ int64_t DispVal = DispSpec.getImm();
+ if (DispVal || (!IndexReg.getReg() && !BaseReg.getReg())) {
+ if (NeedPlus) {
+ if (DispVal > 0)
+ O << " + ";
+ else {
+ O << " - ";
+ DispVal = -DispVal;
+ }
+ }
+ O << formatImm(DispVal);
+ }
+ }
+
+ O << ']';
+}
+
+void X86IntelInstPrinter::printSrcIdx(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ const MCOperand &SegReg = MI->getOperand(Op+1);
+
+ // If this has a segment register, print it.
+ if (SegReg.getReg()) {
+ printOperand(MI, Op+1, O);
+ O << ':';
+ }
+ O << '[';
+ printOperand(MI, Op, O);
+ O << ']';
+}
+
+void X86IntelInstPrinter::printDstIdx(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ // DI accesses are always ES-based.
+ O << "es:[";
+ printOperand(MI, Op, O);
+ O << ']';
+}
+
+void X86IntelInstPrinter::printMemOffset(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ const MCOperand &DispSpec = MI->getOperand(Op);
+ const MCOperand &SegReg = MI->getOperand(Op+1);
+
+ // If this has a segment register, print it.
+ if (SegReg.getReg()) {
+ printOperand(MI, Op+1, O);
+ O << ':';
+ }
+
+ O << '[';
+
+ if (DispSpec.isImm()) {
+ O << formatImm(DispSpec.getImm());
+ } else {
+ assert(DispSpec.isExpr() && "non-immediate displacement?");
+ DispSpec.getExpr()->print(O, &MAI);
+ }
+
+ O << ']';
+}
+
+void X86IntelInstPrinter::printU8Imm(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ O << formatImm(MI->getOperand(Op).getImm() & 0xff);
+}
diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h b/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h
new file mode 100644
index 0000000..20cd7ff
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h
@@ -0,0 +1,162 @@
+//= X86IntelInstPrinter.h - Convert X86 MCInst to assembly syntax -*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an X86 MCInst to Intel style .s file syntax.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_INSTPRINTER_X86INTELINSTPRINTER_H
+#define LLVM_LIB_TARGET_X86_INSTPRINTER_X86INTELINSTPRINTER_H
+
+#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace llvm {
+
+class X86IntelInstPrinter final : public MCInstPrinter {
+public:
+ X86IntelInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+ const MCRegisterInfo &MRI)
+ : MCInstPrinter(MAI, MII, MRI) {}
+
+ void printRegName(raw_ostream &OS, unsigned RegNo) const override;
+ void printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot,
+ const MCSubtargetInfo &STI) override;
+
+ // Autogenerated by tblgen.
+ void printInstruction(const MCInst *MI, raw_ostream &O);
+ static const char *getRegisterName(unsigned RegNo);
+
+ void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printMemReference(const MCInst *MI, unsigned Op, raw_ostream &O);
+ void printSSEAVXCC(const MCInst *MI, unsigned Op, raw_ostream &O);
+ void printXOPCC(const MCInst *MI, unsigned Op, raw_ostream &O);
+ void printPCRelImm(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printMemOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printSrcIdx(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printDstIdx(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printRoundingControl(const MCInst *MI, unsigned Op, raw_ostream &OS);
+ void printU8Imm(const MCInst *MI, unsigned Op, raw_ostream &O);
+
+ void printanymem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printMemReference(MI, OpNo, O);
+ }
+
+ void printopaquemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "opaque ptr ";
+ printMemReference(MI, OpNo, O);
+ }
+
+ void printi8mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "byte ptr ";
+ printMemReference(MI, OpNo, O);
+ }
+ void printi16mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "word ptr ";
+ printMemReference(MI, OpNo, O);
+ }
+ void printi32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "dword ptr ";
+ printMemReference(MI, OpNo, O);
+ }
+ void printi64mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "qword ptr ";
+ printMemReference(MI, OpNo, O);
+ }
+ void printi128mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "xmmword ptr ";
+ printMemReference(MI, OpNo, O);
+ }
+ void printi256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "ymmword ptr ";
+ printMemReference(MI, OpNo, O);
+ }
+ void printi512mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "zmmword ptr ";
+ printMemReference(MI, OpNo, O);
+ }
+ void printf32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "dword ptr ";
+ printMemReference(MI, OpNo, O);
+ }
+ void printf64mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "qword ptr ";
+ printMemReference(MI, OpNo, O);
+ }
+ void printf80mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "xword ptr ";
+ printMemReference(MI, OpNo, O);
+ }
+ void printf128mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "xmmword ptr ";
+ printMemReference(MI, OpNo, O);
+ }
+ void printf256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "ymmword ptr ";
+ printMemReference(MI, OpNo, O);
+ }
+ void printf512mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "zmmword ptr ";
+ printMemReference(MI, OpNo, O);
+ }
+
+ void printSrcIdx8(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "byte ptr ";
+ printSrcIdx(MI, OpNo, O);
+ }
+ void printSrcIdx16(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "word ptr ";
+ printSrcIdx(MI, OpNo, O);
+ }
+ void printSrcIdx32(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "dword ptr ";
+ printSrcIdx(MI, OpNo, O);
+ }
+ void printSrcIdx64(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "qword ptr ";
+ printSrcIdx(MI, OpNo, O);
+ }
+ void printDstIdx8(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "byte ptr ";
+ printDstIdx(MI, OpNo, O);
+ }
+ void printDstIdx16(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "word ptr ";
+ printDstIdx(MI, OpNo, O);
+ }
+ void printDstIdx32(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "dword ptr ";
+ printDstIdx(MI, OpNo, O);
+ }
+ void printDstIdx64(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "qword ptr ";
+ printDstIdx(MI, OpNo, O);
+ }
+ void printMemOffs8(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "byte ptr ";
+ printMemOffset(MI, OpNo, O);
+ }
+ void printMemOffs16(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "word ptr ";
+ printMemOffset(MI, OpNo, O);
+ }
+ void printMemOffs32(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "dword ptr ";
+ printMemOffset(MI, OpNo, O);
+ }
+ void printMemOffs64(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "qword ptr ";
+ printMemOffset(MI, OpNo, O);
+ }
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
new file mode 100644
index 0000000..133bd0e
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -0,0 +1,855 @@
+//===-- X86AsmBackend.cpp - X86 Assembler Backend -------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/X86BaseInfo.h"
+#include "MCTargetDesc/X86FixupKinds.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCMachObjectWriter.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSectionCOFF.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachO.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
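+// Maps a fixup kind to the log2 of its size in bytes; e.g. FK_Data_4 maps
+// to 2, so such a fixup patches 1 << 2 = 4 bytes.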
+static unsigned getFixupKindLog2Size(unsigned Kind) {
+ switch (Kind) {
+ default:
+ llvm_unreachable("invalid fixup kind!");
+ case FK_PCRel_1:
+ case FK_SecRel_1:
+ case FK_Data_1:
+ return 0;
+ case FK_PCRel_2:
+ case FK_SecRel_2:
+ case FK_Data_2:
+ return 1;
+ case FK_PCRel_4:
+ case X86::reloc_riprel_4byte:
+ case X86::reloc_riprel_4byte_movq_load:
+ case X86::reloc_signed_4byte:
+ case X86::reloc_global_offset_table:
+ case FK_SecRel_4:
+ case FK_Data_4:
+ return 2;
+ case FK_PCRel_8:
+ case FK_SecRel_8:
+ case FK_Data_8:
+ case X86::reloc_global_offset_table8:
+ return 3;
+ }
+}
+
+namespace {
+
+class X86ELFObjectWriter : public MCELFObjectTargetWriter {
+public:
+ X86ELFObjectWriter(bool is64Bit, uint8_t OSABI, uint16_t EMachine,
+ bool HasRelocationAddend, bool foobar)
+ : MCELFObjectTargetWriter(is64Bit, OSABI, EMachine, HasRelocationAddend) {}
+};
+
+class X86AsmBackend : public MCAsmBackend {
+ const StringRef CPU;
+ bool HasNopl;
+ uint64_t MaxNopLength;
+public:
+ X86AsmBackend(const Target &T, StringRef CPU) : MCAsmBackend(), CPU(CPU) {
+ HasNopl = CPU != "generic" && CPU != "i386" && CPU != "i486" &&
+ CPU != "i586" && CPU != "pentium" && CPU != "pentium-mmx" &&
+ CPU != "i686" && CPU != "k6" && CPU != "k6-2" && CPU != "k6-3" &&
+ CPU != "geode" && CPU != "winchip-c6" && CPU != "winchip2" &&
+ CPU != "c3" && CPU != "c3-2";
+    // The maximum length of a true long nop instruction is 15 bytes; the
+    // longest nop replacement sequence is 7 bytes. Silvermont executes
+    // shorter nops faster, so the maximum nop length is also capped at 7
+    // for it.
+ MaxNopLength = (!HasNopl || CPU == "slm") ? 7 : 15;
+ }
+
+ unsigned getNumFixupKinds() const override {
+ return X86::NumTargetFixupKinds;
+ }
+
+ const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override {
+ const static MCFixupKindInfo Infos[X86::NumTargetFixupKinds] = {
+ { "reloc_riprel_4byte", 0, 4 * 8, MCFixupKindInfo::FKF_IsPCRel },
+ { "reloc_riprel_4byte_movq_load", 0, 4 * 8, MCFixupKindInfo::FKF_IsPCRel},
+ { "reloc_signed_4byte", 0, 4 * 8, 0},
+ { "reloc_global_offset_table", 0, 4 * 8, 0}
+ };
+
+ if (Kind < FirstTargetFixupKind)
+ return MCAsmBackend::getFixupKindInfo(Kind);
+
+ assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
+ "Invalid kind!");
+ return Infos[Kind - FirstTargetFixupKind];
+ }
+
+ void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
+ uint64_t Value, bool IsPCRel) const override {
+ unsigned Size = 1 << getFixupKindLog2Size(Fixup.getKind());
+
+ assert(Fixup.getOffset() + Size <= DataSize &&
+ "Invalid fixup offset!");
+
+    // Check that the upper bits are either all zeros or all ones.
+ // Specifically ignore overflow/underflow as long as the leakage is
+ // limited to the lower bits. This is to remain compatible with
+ // other assemblers.
+ assert(isIntN(Size * 8 + 1, Value) &&
+ "Value does not fit in the Fixup field");
+
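+    // The value is written little-endian: e.g. a 4-byte fixup with Value
+    // 0x12345678 stores the bytes 0x78 0x56 0x34 0x12 at the fixup offset.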
+ for (unsigned i = 0; i != Size; ++i)
+ Data[Fixup.getOffset() + i] = uint8_t(Value >> (i * 8));
+ }
+
+ bool mayNeedRelaxation(const MCInst &Inst) const override;
+
+ bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
+ const MCRelaxableFragment *DF,
+ const MCAsmLayout &Layout) const override;
+
+ void relaxInstruction(const MCInst &Inst, MCInst &Res) const override;
+
+ bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
+};
+} // end anonymous namespace
+
+static unsigned getRelaxedOpcodeBranch(unsigned Op) {
+ switch (Op) {
+ default:
+ return Op;
+
+ case X86::JAE_1: return X86::JAE_4;
+ case X86::JA_1: return X86::JA_4;
+ case X86::JBE_1: return X86::JBE_4;
+ case X86::JB_1: return X86::JB_4;
+ case X86::JE_1: return X86::JE_4;
+ case X86::JGE_1: return X86::JGE_4;
+ case X86::JG_1: return X86::JG_4;
+ case X86::JLE_1: return X86::JLE_4;
+ case X86::JL_1: return X86::JL_4;
+ case X86::JMP_1: return X86::JMP_4;
+ case X86::JNE_1: return X86::JNE_4;
+ case X86::JNO_1: return X86::JNO_4;
+ case X86::JNP_1: return X86::JNP_4;
+ case X86::JNS_1: return X86::JNS_4;
+ case X86::JO_1: return X86::JO_4;
+ case X86::JP_1: return X86::JP_4;
+ case X86::JS_1: return X86::JS_4;
+ }
+}
+
+static unsigned getRelaxedOpcodeArith(unsigned Op) {
+ switch (Op) {
+ default:
+ return Op;
+
+ // IMUL
+ case X86::IMUL16rri8: return X86::IMUL16rri;
+ case X86::IMUL16rmi8: return X86::IMUL16rmi;
+ case X86::IMUL32rri8: return X86::IMUL32rri;
+ case X86::IMUL32rmi8: return X86::IMUL32rmi;
+ case X86::IMUL64rri8: return X86::IMUL64rri32;
+ case X86::IMUL64rmi8: return X86::IMUL64rmi32;
+
+ // AND
+ case X86::AND16ri8: return X86::AND16ri;
+ case X86::AND16mi8: return X86::AND16mi;
+ case X86::AND32ri8: return X86::AND32ri;
+ case X86::AND32mi8: return X86::AND32mi;
+ case X86::AND64ri8: return X86::AND64ri32;
+ case X86::AND64mi8: return X86::AND64mi32;
+
+ // OR
+ case X86::OR16ri8: return X86::OR16ri;
+ case X86::OR16mi8: return X86::OR16mi;
+ case X86::OR32ri8: return X86::OR32ri;
+ case X86::OR32mi8: return X86::OR32mi;
+ case X86::OR64ri8: return X86::OR64ri32;
+ case X86::OR64mi8: return X86::OR64mi32;
+
+ // XOR
+ case X86::XOR16ri8: return X86::XOR16ri;
+ case X86::XOR16mi8: return X86::XOR16mi;
+ case X86::XOR32ri8: return X86::XOR32ri;
+ case X86::XOR32mi8: return X86::XOR32mi;
+ case X86::XOR64ri8: return X86::XOR64ri32;
+ case X86::XOR64mi8: return X86::XOR64mi32;
+
+ // ADD
+ case X86::ADD16ri8: return X86::ADD16ri;
+ case X86::ADD16mi8: return X86::ADD16mi;
+ case X86::ADD32ri8: return X86::ADD32ri;
+ case X86::ADD32mi8: return X86::ADD32mi;
+ case X86::ADD64ri8: return X86::ADD64ri32;
+ case X86::ADD64mi8: return X86::ADD64mi32;
+
+ // ADC
+ case X86::ADC16ri8: return X86::ADC16ri;
+ case X86::ADC16mi8: return X86::ADC16mi;
+ case X86::ADC32ri8: return X86::ADC32ri;
+ case X86::ADC32mi8: return X86::ADC32mi;
+ case X86::ADC64ri8: return X86::ADC64ri32;
+ case X86::ADC64mi8: return X86::ADC64mi32;
+
+ // SUB
+ case X86::SUB16ri8: return X86::SUB16ri;
+ case X86::SUB16mi8: return X86::SUB16mi;
+ case X86::SUB32ri8: return X86::SUB32ri;
+ case X86::SUB32mi8: return X86::SUB32mi;
+ case X86::SUB64ri8: return X86::SUB64ri32;
+ case X86::SUB64mi8: return X86::SUB64mi32;
+
+ // SBB
+ case X86::SBB16ri8: return X86::SBB16ri;
+ case X86::SBB16mi8: return X86::SBB16mi;
+ case X86::SBB32ri8: return X86::SBB32ri;
+ case X86::SBB32mi8: return X86::SBB32mi;
+ case X86::SBB64ri8: return X86::SBB64ri32;
+ case X86::SBB64mi8: return X86::SBB64mi32;
+
+ // CMP
+ case X86::CMP16ri8: return X86::CMP16ri;
+ case X86::CMP16mi8: return X86::CMP16mi;
+ case X86::CMP32ri8: return X86::CMP32ri;
+ case X86::CMP32mi8: return X86::CMP32mi;
+ case X86::CMP64ri8: return X86::CMP64ri32;
+ case X86::CMP64mi8: return X86::CMP64mi32;
+
+ // PUSH
+ case X86::PUSH32i8: return X86::PUSHi32;
+ case X86::PUSH16i8: return X86::PUSHi16;
+ case X86::PUSH64i8: return X86::PUSH64i32;
+ }
+}
+
+static unsigned getRelaxedOpcode(unsigned Op) {
+ unsigned R = getRelaxedOpcodeArith(Op);
+ if (R != Op)
+ return R;
+ return getRelaxedOpcodeBranch(Op);
+}
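+
+// For example, an ADD64mi8 whose immediate is a not-yet-resolved expression
+// may be relaxed: if at layout time the value does not fit in a signed 8-bit
+// field, relaxInstruction rewrites the opcode to ADD64mi32.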
+
+bool X86AsmBackend::mayNeedRelaxation(const MCInst &Inst) const {
+ // Branches can always be relaxed.
+ if (getRelaxedOpcodeBranch(Inst.getOpcode()) != Inst.getOpcode())
+ return true;
+
+ // Check if this instruction is ever relaxable.
+ if (getRelaxedOpcodeArith(Inst.getOpcode()) == Inst.getOpcode())
+ return false;
+
+ // Check if the relaxable operand has an expression. For the current set of
+ // relaxable instructions, the relaxable operand is always the last operand.
+ unsigned RelaxableOp = Inst.getNumOperands() - 1;
+ if (Inst.getOperand(RelaxableOp).isExpr())
+ return true;
+
+ return false;
+}
+
+bool X86AsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup,
+ uint64_t Value,
+ const MCRelaxableFragment *DF,
+ const MCAsmLayout &Layout) const {
+ // Relax if the value is too big for a (signed) i8.
+ return int64_t(Value) != int64_t(int8_t(Value));
+}
+
+// FIXME: Can tblgen help at all here to verify there aren't other instructions
+// we can relax?
+void X86AsmBackend::relaxInstruction(const MCInst &Inst, MCInst &Res) const {
+  // The only relaxation X86 does is from a 1-byte pcrel to a 4-byte pcrel.
+ unsigned RelaxedOp = getRelaxedOpcode(Inst.getOpcode());
+
+ if (RelaxedOp == Inst.getOpcode()) {
+ SmallString<256> Tmp;
+ raw_svector_ostream OS(Tmp);
+ Inst.dump_pretty(OS);
+ OS << "\n";
+ report_fatal_error("unexpected instruction to relax: " + OS.str());
+ }
+
+ Res = Inst;
+ Res.setOpcode(RelaxedOp);
+}
+
+/// \brief Write a sequence of optimal nops to the output, covering \p Count
+/// bytes.
+/// \return - true on success, false on failure
+bool X86AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
+ static const uint8_t TrueNops[10][10] = {
+ // nop
+ {0x90},
+ // xchg %ax,%ax
+ {0x66, 0x90},
+ // nopl (%[re]ax)
+ {0x0f, 0x1f, 0x00},
+ // nopl 0(%[re]ax)
+ {0x0f, 0x1f, 0x40, 0x00},
+ // nopl 0(%[re]ax,%[re]ax,1)
+ {0x0f, 0x1f, 0x44, 0x00, 0x00},
+ // nopw 0(%[re]ax,%[re]ax,1)
+ {0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00},
+ // nopl 0L(%[re]ax)
+ {0x0f, 0x1f, 0x80, 0x00, 0x00, 0x00, 0x00},
+ // nopl 0L(%[re]ax,%[re]ax,1)
+ {0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
+ // nopw 0L(%[re]ax,%[re]ax,1)
+ {0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
+ // nopw %cs:0L(%[re]ax,%[re]ax,1)
+ {0x66, 0x2e, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
+ };
+
+ // Alternative nop instructions for CPUs which don't support long nops.
+ static const uint8_t AltNops[7][10] = {
+ // nop
+ {0x90},
+ // xchg %ax,%ax
+ {0x66, 0x90},
+ // lea 0x0(%esi),%esi
+ {0x8d, 0x76, 0x00},
+ // lea 0x0(%esi),%esi
+ {0x8d, 0x74, 0x26, 0x00},
+ // nop + lea 0x0(%esi),%esi
+ {0x90, 0x8d, 0x74, 0x26, 0x00},
+ // lea 0x0(%esi),%esi
+ {0x8d, 0xb6, 0x00, 0x00, 0x00, 0x00 },
+ // lea 0x0(%esi),%esi
+ {0x8d, 0xb4, 0x26, 0x00, 0x00, 0x00, 0x00},
+ };
+
+ // Select the right NOP table.
+ // FIXME: Can we get if CPU supports long nops from the subtarget somehow?
+ const uint8_t (*Nops)[10] = HasNopl ? TrueNops : AltNops;
+ assert(HasNopl || MaxNopLength <= 7);
+
+  // Emit as many maximum-length nops as needed, then one final nop covering
+  // the remaining length.
+ do {
+ const uint8_t ThisNopLength = (uint8_t) std::min(Count, MaxNopLength);
+ const uint8_t Prefixes = ThisNopLength <= 10 ? 0 : ThisNopLength - 10;
+ for (uint8_t i = 0; i < Prefixes; i++)
+ OW->write8(0x66);
+ const uint8_t Rest = ThisNopLength - Prefixes;
+ for (uint8_t i = 0; i < Rest; i++)
+ OW->write8(Nops[Rest - 1][i]);
+ Count -= ThisNopLength;
+ } while (Count != 0);
+
+ return true;
+}
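+
+// Example: with long nops available and MaxNopLength == 15, a 20-byte pad is
+// emitted as five 0x66 prefixes plus the 10-byte nop (15 bytes in total),
+// followed by the 5-byte nop.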
+
+/* *** */
+
+namespace {
+
+class ELFX86AsmBackend : public X86AsmBackend {
+public:
+ uint8_t OSABI;
+ ELFX86AsmBackend(const Target &T, uint8_t OSABI, StringRef CPU)
+ : X86AsmBackend(T, CPU), OSABI(OSABI) {}
+};
+
+class ELFX86_32AsmBackend : public ELFX86AsmBackend {
+public:
+ ELFX86_32AsmBackend(const Target &T, uint8_t OSABI, StringRef CPU)
+ : ELFX86AsmBackend(T, OSABI, CPU) {}
+
+ MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override {
+ return createX86ELFObjectWriter(OS, /*IsELF64*/ false, OSABI, ELF::EM_386);
+ }
+};
+
+class ELFX86_X32AsmBackend : public ELFX86AsmBackend {
+public:
+ ELFX86_X32AsmBackend(const Target &T, uint8_t OSABI, StringRef CPU)
+ : ELFX86AsmBackend(T, OSABI, CPU) {}
+
+ MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override {
+ return createX86ELFObjectWriter(OS, /*IsELF64*/ false, OSABI,
+ ELF::EM_X86_64);
+ }
+};
+
+class ELFX86_IAMCUAsmBackend : public ELFX86AsmBackend {
+public:
+ ELFX86_IAMCUAsmBackend(const Target &T, uint8_t OSABI, StringRef CPU)
+ : ELFX86AsmBackend(T, OSABI, CPU) {}
+
+ MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override {
+ return createX86ELFObjectWriter(OS, /*IsELF64*/ false, OSABI,
+ ELF::EM_IAMCU);
+ }
+};
+
+class ELFX86_64AsmBackend : public ELFX86AsmBackend {
+public:
+ ELFX86_64AsmBackend(const Target &T, uint8_t OSABI, StringRef CPU)
+ : ELFX86AsmBackend(T, OSABI, CPU) {}
+
+ MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override {
+ return createX86ELFObjectWriter(OS, /*IsELF64*/ true, OSABI, ELF::EM_X86_64);
+ }
+};
+
+class WindowsX86AsmBackend : public X86AsmBackend {
+ bool Is64Bit;
+
+public:
+ WindowsX86AsmBackend(const Target &T, bool is64Bit, StringRef CPU)
+ : X86AsmBackend(T, CPU)
+ , Is64Bit(is64Bit) {
+ }
+
+ MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override {
+ return createX86WinCOFFObjectWriter(OS, Is64Bit);
+ }
+};
+
+namespace CU {
+
+ /// Compact unwind encoding values.
+ enum CompactUnwindEncodings {
+    /// [RE]BP based frame where [RE]BP is pushed on the stack immediately after
+ /// the return address, then [RE]SP is moved to [RE]BP.
+ UNWIND_MODE_BP_FRAME = 0x01000000,
+
+ /// A frameless function with a small constant stack size.
+ UNWIND_MODE_STACK_IMMD = 0x02000000,
+
+ /// A frameless function with a large constant stack size.
+ UNWIND_MODE_STACK_IND = 0x03000000,
+
+ /// No compact unwind encoding is available.
+ UNWIND_MODE_DWARF = 0x04000000,
+
+ /// Mask for encoding the frame registers.
+ UNWIND_BP_FRAME_REGISTERS = 0x00007FFF,
+
+ /// Mask for encoding the frameless registers.
+ UNWIND_FRAMELESS_STACK_REG_PERMUTATION = 0x000003FF
+ };
+
+} // end CU namespace
+
+class DarwinX86AsmBackend : public X86AsmBackend {
+ const MCRegisterInfo &MRI;
+
+ /// \brief Number of registers that can be saved in a compact unwind encoding.
+ enum { CU_NUM_SAVED_REGS = 6 };
+
+ mutable unsigned SavedRegs[CU_NUM_SAVED_REGS];
+ bool Is64Bit;
+
+ unsigned OffsetSize; ///< Offset of a "push" instruction.
+ unsigned MoveInstrSize; ///< Size of a "move" instruction.
+ unsigned StackDivide; ///< Amount to adjust stack size by.
+protected:
+ /// \brief Size of a "push" instruction for the given register.
+ unsigned PushInstrSize(unsigned Reg) const {
+ switch (Reg) {
+ case X86::EBX:
+ case X86::ECX:
+ case X86::EDX:
+ case X86::EDI:
+ case X86::ESI:
+ case X86::EBP:
+ case X86::RBX:
+ case X86::RBP:
+ return 1;
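+    // Pushes of R12-R15 need a REX.B prefix, so they take 2 bytes.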
+ case X86::R12:
+ case X86::R13:
+ case X86::R14:
+ case X86::R15:
+ return 2;
+ }
+ return 1;
+ }
+
+ /// \brief Implementation of algorithm to generate the compact unwind encoding
+ /// for the CFI instructions.
+ uint32_t
+ generateCompactUnwindEncodingImpl(ArrayRef<MCCFIInstruction> Instrs) const {
+ if (Instrs.empty()) return 0;
+
+ // Reset the saved registers.
+ unsigned SavedRegIdx = 0;
+ memset(SavedRegs, 0, sizeof(SavedRegs));
+
+ bool HasFP = false;
+
+ // Encode that we are using EBP/RBP as the frame pointer.
+ uint32_t CompactUnwindEncoding = 0;
+
+ unsigned SubtractInstrIdx = Is64Bit ? 3 : 2;
+ unsigned InstrOffset = 0;
+ unsigned StackAdjust = 0;
+ unsigned StackSize = 0;
+ unsigned PrevStackSize = 0;
+ unsigned NumDefCFAOffsets = 0;
+
+ for (unsigned i = 0, e = Instrs.size(); i != e; ++i) {
+ const MCCFIInstruction &Inst = Instrs[i];
+
+ switch (Inst.getOperation()) {
+ default:
+ // Any other CFI directives indicate a frame that we aren't prepared
+ // to represent via compact unwind, so just bail out.
+ return 0;
+ case MCCFIInstruction::OpDefCfaRegister: {
+ // Defines a frame pointer. E.g.
+ //
+ // movq %rsp, %rbp
+ // L0:
+ // .cfi_def_cfa_register %rbp
+ //
+ HasFP = true;
+ assert(MRI.getLLVMRegNum(Inst.getRegister(), true) ==
+ (Is64Bit ? X86::RBP : X86::EBP) && "Invalid frame pointer!");
+
+ // Reset the counts.
+ memset(SavedRegs, 0, sizeof(SavedRegs));
+ StackAdjust = 0;
+ SavedRegIdx = 0;
+ InstrOffset += MoveInstrSize;
+ break;
+ }
+ case MCCFIInstruction::OpDefCfaOffset: {
+ // Defines a new offset for the CFA. E.g.
+ //
+ // With frame:
+ //
+ // pushq %rbp
+ // L0:
+ // .cfi_def_cfa_offset 16
+ //
+ // Without frame:
+ //
+ // subq $72, %rsp
+ // L0:
+ // .cfi_def_cfa_offset 80
+ //
+ PrevStackSize = StackSize;
+ StackSize = std::abs(Inst.getOffset()) / StackDivide;
+ ++NumDefCFAOffsets;
+ break;
+ }
+ case MCCFIInstruction::OpOffset: {
+ // Defines a "push" of a callee-saved register. E.g.
+ //
+ // pushq %r15
+ // pushq %r14
+ // pushq %rbx
+ // L0:
+ // subq $120, %rsp
+ // L1:
+ // .cfi_offset %rbx, -40
+ // .cfi_offset %r14, -32
+ // .cfi_offset %r15, -24
+ //
+ if (SavedRegIdx == CU_NUM_SAVED_REGS)
+ // If there are too many saved registers, we cannot use a compact
+ // unwind encoding.
+ return CU::UNWIND_MODE_DWARF;
+
+ unsigned Reg = MRI.getLLVMRegNum(Inst.getRegister(), true);
+ SavedRegs[SavedRegIdx++] = Reg;
+ StackAdjust += OffsetSize;
+ InstrOffset += PushInstrSize(Reg);
+ break;
+ }
+ }
+ }
+
+ StackAdjust /= StackDivide;
+
+ if (HasFP) {
+ if ((StackAdjust & 0xFF) != StackAdjust)
+ // Offset was too big for a compact unwind encoding.
+ return CU::UNWIND_MODE_DWARF;
+
+ // Get the encoding of the saved registers when we have a frame pointer.
+ uint32_t RegEnc = encodeCompactUnwindRegistersWithFrame();
+ if (RegEnc == ~0U) return CU::UNWIND_MODE_DWARF;
+
+ CompactUnwindEncoding |= CU::UNWIND_MODE_BP_FRAME;
+ CompactUnwindEncoding |= (StackAdjust & 0xFF) << 16;
+ CompactUnwindEncoding |= RegEnc & CU::UNWIND_BP_FRAME_REGISTERS;
+ } else {
+ // If the amount of the stack allocation is the size of a register, then
+ // we "push" the RAX/EAX register onto the stack instead of adjusting the
+ // stack pointer with a SUB instruction. We don't support the push of the
+ // RAX/EAX register with compact unwind. So we check for that situation
+ // here.
+ if ((NumDefCFAOffsets == SavedRegIdx + 1 &&
+ StackSize - PrevStackSize == 1) ||
+ (Instrs.size() == 1 && NumDefCFAOffsets == 1 && StackSize == 2))
+ return CU::UNWIND_MODE_DWARF;
+
+ SubtractInstrIdx += InstrOffset;
+ ++StackAdjust;
+
+ if ((StackSize & 0xFF) == StackSize) {
+ // Frameless stack with a small stack size.
+ CompactUnwindEncoding |= CU::UNWIND_MODE_STACK_IMMD;
+
+ // Encode the stack size.
+ CompactUnwindEncoding |= (StackSize & 0xFF) << 16;
+ } else {
+ if ((StackAdjust & 0x7) != StackAdjust)
+ // The extra stack adjustments are too big for us to handle.
+ return CU::UNWIND_MODE_DWARF;
+
+ // Frameless stack with an offset too large for us to encode compactly.
+ CompactUnwindEncoding |= CU::UNWIND_MODE_STACK_IND;
+
+ // Encode the offset to the nnnnnn value in the 'subl $nnnnnn, ESP'
+ // instruction.
+ CompactUnwindEncoding |= (SubtractInstrIdx & 0xFF) << 16;
+
+      // Encode any extra stack adjustments (done via push instructions).
+ CompactUnwindEncoding |= (StackAdjust & 0x7) << 13;
+ }
+
+ // Encode the number of registers saved. (Reverse the list first.)
+ std::reverse(&SavedRegs[0], &SavedRegs[SavedRegIdx]);
+ CompactUnwindEncoding |= (SavedRegIdx & 0x7) << 10;
+
+ // Get the encoding of the saved registers when we don't have a frame
+ // pointer.
+ uint32_t RegEnc = encodeCompactUnwindRegistersWithoutFrame(SavedRegIdx);
+ if (RegEnc == ~0U) return CU::UNWIND_MODE_DWARF;
+
+ // Encode the register encoding.
+ CompactUnwindEncoding |=
+ RegEnc & CU::UNWIND_FRAMELESS_STACK_REG_PERMUTATION;
+ }
+
+ return CompactUnwindEncoding;
+ }
+
+private:
+ /// \brief Get the compact unwind number for a given register. The number
+ /// corresponds to the enum lists in compact_unwind_encoding.h.
+ int getCompactUnwindRegNum(unsigned Reg) const {
+ static const MCPhysReg CU32BitRegs[7] = {
+ X86::EBX, X86::ECX, X86::EDX, X86::EDI, X86::ESI, X86::EBP, 0
+ };
+ static const MCPhysReg CU64BitRegs[] = {
+ X86::RBX, X86::R12, X86::R13, X86::R14, X86::R15, X86::RBP, 0
+ };
+ const MCPhysReg *CURegs = Is64Bit ? CU64BitRegs : CU32BitRegs;
+ for (int Idx = 1; *CURegs; ++CURegs, ++Idx)
+ if (*CURegs == Reg)
+ return Idx;
+
+ return -1;
+ }
+
+ /// \brief Return the registers encoded for a compact encoding with a frame
+ /// pointer.
+ uint32_t encodeCompactUnwindRegistersWithFrame() const {
+ // Encode the registers in the order they were saved --- 3-bits per
+ // register. The list of saved registers is assumed to be in reverse
+ // order. The registers are numbered from 1 to CU_NUM_SAVED_REGS.
+ uint32_t RegEnc = 0;
+ for (int i = 0, Idx = 0; i != CU_NUM_SAVED_REGS; ++i) {
+ unsigned Reg = SavedRegs[i];
+ if (Reg == 0) break;
+
+ int CURegNum = getCompactUnwindRegNum(Reg);
+ if (CURegNum == -1) return ~0U;
+
+ // Encode the 3-bit register number in order, skipping over 3-bits for
+ // each register.
+ RegEnc |= (CURegNum & 0x7) << (Idx++ * 3);
+ }
+
+ assert((RegEnc & 0x3FFFF) == RegEnc &&
+ "Invalid compact register encoding!");
+ return RegEnc;
+ }
+
+ /// \brief Create the permutation encoding used with frameless stacks. It is
+ /// passed the number of registers to be saved and an array of the registers
+ /// saved.
+ uint32_t encodeCompactUnwindRegistersWithoutFrame(unsigned RegCount) const {
+ // The saved registers are numbered from 1 to 6. In order to encode the
+ // order in which they were saved, we re-number them according to their
+ // place in the register order. The re-numbering is relative to the last
+ // re-numbered register. E.g., if we have registers {6, 2, 4, 5} saved in
+ // that order:
+ //
+ // Orig Re-Num
+ // ---- ------
+ // 6 6
+ // 2 2
+ // 4 3
+ // 5 3
+ //
+ for (unsigned i = 0; i < RegCount; ++i) {
+ int CUReg = getCompactUnwindRegNum(SavedRegs[i]);
+ if (CUReg == -1) return ~0U;
+ SavedRegs[i] = CUReg;
+ }
+
+ // Reverse the list.
+ std::reverse(&SavedRegs[0], &SavedRegs[CU_NUM_SAVED_REGS]);
+
+ uint32_t RenumRegs[CU_NUM_SAVED_REGS];
+ for (unsigned i = CU_NUM_SAVED_REGS - RegCount; i < CU_NUM_SAVED_REGS; ++i){
+ unsigned Countless = 0;
+ for (unsigned j = CU_NUM_SAVED_REGS - RegCount; j < i; ++j)
+ if (SavedRegs[j] < SavedRegs[i])
+ ++Countless;
+
+ RenumRegs[i] = SavedRegs[i] - Countless - 1;
+ }
+
+ // Take the renumbered values and encode them into a 10-bit number.
+ uint32_t permutationEncoding = 0;
+ switch (RegCount) {
+ case 6:
+ permutationEncoding |= 120 * RenumRegs[0] + 24 * RenumRegs[1]
+ + 6 * RenumRegs[2] + 2 * RenumRegs[3]
+ + RenumRegs[4];
+ break;
+ case 5:
+ permutationEncoding |= 120 * RenumRegs[1] + 24 * RenumRegs[2]
+ + 6 * RenumRegs[3] + 2 * RenumRegs[4]
+ + RenumRegs[5];
+ break;
+ case 4:
+ permutationEncoding |= 60 * RenumRegs[2] + 12 * RenumRegs[3]
+ + 3 * RenumRegs[4] + RenumRegs[5];
+ break;
+ case 3:
+ permutationEncoding |= 20 * RenumRegs[3] + 4 * RenumRegs[4]
+ + RenumRegs[5];
+ break;
+ case 2:
+ permutationEncoding |= 5 * RenumRegs[4] + RenumRegs[5];
+ break;
+ case 1:
+ permutationEncoding |= RenumRegs[5];
+ break;
+ }
+
+ assert((permutationEncoding & 0x3FF) == permutationEncoding &&
+ "Invalid compact register encoding!");
+ return permutationEncoding;
+ }
+
+public:
+ DarwinX86AsmBackend(const Target &T, const MCRegisterInfo &MRI, StringRef CPU,
+ bool Is64Bit)
+ : X86AsmBackend(T, CPU), MRI(MRI), Is64Bit(Is64Bit) {
+ memset(SavedRegs, 0, sizeof(SavedRegs));
+ OffsetSize = Is64Bit ? 8 : 4;
+ MoveInstrSize = Is64Bit ? 3 : 2;
+ StackDivide = Is64Bit ? 8 : 4;
+ }
+};
+
+class DarwinX86_32AsmBackend : public DarwinX86AsmBackend {
+public:
+ DarwinX86_32AsmBackend(const Target &T, const MCRegisterInfo &MRI,
+ StringRef CPU)
+ : DarwinX86AsmBackend(T, MRI, CPU, false) {}
+
+ MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override {
+ return createX86MachObjectWriter(OS, /*Is64Bit=*/false,
+ MachO::CPU_TYPE_I386,
+ MachO::CPU_SUBTYPE_I386_ALL);
+ }
+
+ /// \brief Generate the compact unwind encoding for the CFI instructions.
+ uint32_t generateCompactUnwindEncoding(
+ ArrayRef<MCCFIInstruction> Instrs) const override {
+ return generateCompactUnwindEncodingImpl(Instrs);
+ }
+};
+
+class DarwinX86_64AsmBackend : public DarwinX86AsmBackend {
+ const MachO::CPUSubTypeX86 Subtype;
+public:
+ DarwinX86_64AsmBackend(const Target &T, const MCRegisterInfo &MRI,
+ StringRef CPU, MachO::CPUSubTypeX86 st)
+ : DarwinX86AsmBackend(T, MRI, CPU, true), Subtype(st) {}
+
+ MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override {
+ return createX86MachObjectWriter(OS, /*Is64Bit=*/true,
+ MachO::CPU_TYPE_X86_64, Subtype);
+ }
+
+ /// \brief Generate the compact unwind encoding for the CFI instructions.
+ uint32_t generateCompactUnwindEncoding(
+ ArrayRef<MCCFIInstruction> Instrs) const override {
+ return generateCompactUnwindEncodingImpl(Instrs);
+ }
+};
+
+} // end anonymous namespace
+
+MCAsmBackend *llvm::createX86_32AsmBackend(const Target &T,
+ const MCRegisterInfo &MRI,
+ const Triple &TheTriple,
+ StringRef CPU) {
+ if (TheTriple.isOSBinFormatMachO())
+ return new DarwinX86_32AsmBackend(T, MRI, CPU);
+
+ if (TheTriple.isOSWindows() && !TheTriple.isOSBinFormatELF())
+ return new WindowsX86AsmBackend(T, false, CPU);
+
+ uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS());
+
+ if (TheTriple.isOSIAMCU())
+ return new ELFX86_IAMCUAsmBackend(T, OSABI, CPU);
+
+ return new ELFX86_32AsmBackend(T, OSABI, CPU);
+}
+
+MCAsmBackend *llvm::createX86_64AsmBackend(const Target &T,
+ const MCRegisterInfo &MRI,
+ const Triple &TheTriple,
+ StringRef CPU) {
+ if (TheTriple.isOSBinFormatMachO()) {
+ MachO::CPUSubTypeX86 CS =
+ StringSwitch<MachO::CPUSubTypeX86>(TheTriple.getArchName())
+ .Case("x86_64h", MachO::CPU_SUBTYPE_X86_64_H)
+ .Default(MachO::CPU_SUBTYPE_X86_64_ALL);
+ return new DarwinX86_64AsmBackend(T, MRI, CPU, CS);
+ }
+
+ if (TheTriple.isOSWindows() && !TheTriple.isOSBinFormatELF())
+ return new WindowsX86AsmBackend(T, true, CPU);
+
+ uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS());
+
+ if (TheTriple.getEnvironment() == Triple::GNUX32)
+ return new ELFX86_X32AsmBackend(T, OSABI, CPU);
+ return new ELFX86_64AsmBackend(T, OSABI, CPU);
+}
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
new file mode 100644
index 0000000..9ff85b9
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
@@ -0,0 +1,779 @@
+//===-- X86BaseInfo.h - Top level definitions for X86 -------- --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains small standalone helper functions and enum definitions for
+// the X86 target useful for the compiler back-end and the MC libraries.
+// As such, it deliberately does not include references to LLVM core
+// code gen types, passes, etc.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_MCTARGETDESC_X86BASEINFO_H
+#define LLVM_LIB_TARGET_X86_MCTARGETDESC_X86BASEINFO_H
+
+#include "X86MCTargetDesc.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/Support/DataTypes.h"
+#include "llvm/Support/ErrorHandling.h"
+
+namespace llvm {
+
+namespace X86 {
+ // Enums for memory operand decoding. Each memory operand is represented with
+ // a 5 operand sequence in the form:
+ // [BaseReg, ScaleAmt, IndexReg, Disp, Segment]
+ // These enums help decode this.
+ enum {
+ AddrBaseReg = 0,
+ AddrScaleAmt = 1,
+ AddrIndexReg = 2,
+ AddrDisp = 3,
+
+ /// AddrSegmentReg - The operand # of the segment in the memory operand.
+ AddrSegmentReg = 4,
+
+ /// AddrNumOperands - Total number of operands in a memory reference.
+ AddrNumOperands = 5
+ };
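+
+  // For example, the memory reference in "mov eax, dword ptr [esi + 4*edi +
+  // 16]" is modeled by the operand sequence ESI (AddrBaseReg), 4
+  // (AddrScaleAmt), EDI (AddrIndexReg), 16 (AddrDisp), 0 (AddrSegmentReg).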
+
+ /// AVX512 static rounding constants. These need to match the values in
+ /// avx512fintrin.h.
+ enum STATIC_ROUNDING {
+ TO_NEAREST_INT = 0,
+ TO_NEG_INF = 1,
+ TO_POS_INF = 2,
+ TO_ZERO = 3,
+ CUR_DIRECTION = 4
+ };
+} // end namespace X86
+
+/// X86II - This namespace holds all of the target specific flags that
+/// instruction info tracks.
+///
+namespace X86II {
+ /// Target Operand Flag enum.
+ enum TOF {
+ //===------------------------------------------------------------------===//
+ // X86 Specific MachineOperand flags.
+
+ MO_NO_FLAG,
+
+ /// MO_GOT_ABSOLUTE_ADDRESS - On a symbol operand, this represents a
+ /// relocation of:
+ /// SYMBOL_LABEL + [. - PICBASELABEL]
+ MO_GOT_ABSOLUTE_ADDRESS,
+
+ /// MO_PIC_BASE_OFFSET - On a symbol operand this indicates that the
+ /// immediate should get the value of the symbol minus the PIC base label:
+ /// SYMBOL_LABEL - PICBASELABEL
+ MO_PIC_BASE_OFFSET,
+
+ /// MO_GOT - On a symbol operand this indicates that the immediate is the
+ /// offset to the GOT entry for the symbol name from the base of the GOT.
+ ///
+ /// See the X86-64 ELF ABI supplement for more details.
+ /// SYMBOL_LABEL @GOT
+ MO_GOT,
+
+ /// MO_GOTOFF - On a symbol operand this indicates that the immediate is
+ /// the offset to the location of the symbol name from the base of the GOT.
+ ///
+ /// See the X86-64 ELF ABI supplement for more details.
+ /// SYMBOL_LABEL @GOTOFF
+ MO_GOTOFF,
+
+ /// MO_GOTPCREL - On a symbol operand this indicates that the immediate is
+ /// offset to the GOT entry for the symbol name from the current code
+ /// location.
+ ///
+ /// See the X86-64 ELF ABI supplement for more details.
+ /// SYMBOL_LABEL @GOTPCREL
+ MO_GOTPCREL,
+
+ /// MO_PLT - On a symbol operand this indicates that the immediate is
+ /// offset to the PLT entry of symbol name from the current code location.
+ ///
+ /// See the X86-64 ELF ABI supplement for more details.
+ /// SYMBOL_LABEL @PLT
+ MO_PLT,
+
+ /// MO_TLSGD - On a symbol operand this indicates that the immediate is
+ /// the offset of the GOT entry with the TLS index structure that contains
+ /// the module number and variable offset for the symbol. Used in the
+ /// general dynamic TLS access model.
+ ///
+ /// See 'ELF Handling for Thread-Local Storage' for more details.
+ /// SYMBOL_LABEL @TLSGD
+ MO_TLSGD,
+
+ /// MO_TLSLD - On a symbol operand this indicates that the immediate is
+ /// the offset of the GOT entry with the TLS index for the module that
+ /// contains the symbol. When this index is passed to a call to
+ /// __tls_get_addr, the function will return the base address of the TLS
+ /// block for the symbol. Used in the x86-64 local dynamic TLS access model.
+ ///
+ /// See 'ELF Handling for Thread-Local Storage' for more details.
+ /// SYMBOL_LABEL @TLSLD
+ MO_TLSLD,
+
+ /// MO_TLSLDM - On a symbol operand this indicates that the immediate is
+ /// the offset of the GOT entry with the TLS index for the module that
+ /// contains the symbol. When this index is passed to a call to
+ /// ___tls_get_addr, the function will return the base address of the TLS
+ /// block for the symbol. Used in the IA32 local dynamic TLS access model.
+ ///
+ /// See 'ELF Handling for Thread-Local Storage' for more details.
+ /// SYMBOL_LABEL @TLSLDM
+ MO_TLSLDM,
+
+ /// MO_GOTTPOFF - On a symbol operand this indicates that the immediate is
+ /// the offset of the GOT entry with the thread-pointer offset for the
+ /// symbol. Used in the x86-64 initial exec TLS access model.
+ ///
+ /// See 'ELF Handling for Thread-Local Storage' for more details.
+ /// SYMBOL_LABEL @GOTTPOFF
+ MO_GOTTPOFF,
+
+ /// MO_INDNTPOFF - On a symbol operand this indicates that the immediate is
+ /// the absolute address of the GOT entry with the negative thread-pointer
+ /// offset for the symbol. Used in the non-PIC IA32 initial exec TLS access
+ /// model.
+ ///
+ /// See 'ELF Handling for Thread-Local Storage' for more details.
+ /// SYMBOL_LABEL @INDNTPOFF
+ MO_INDNTPOFF,
+
+ /// MO_TPOFF - On a symbol operand this indicates that the immediate is
+ /// the thread-pointer offset for the symbol. Used in the x86-64 local
+ /// exec TLS access model.
+ ///
+ /// See 'ELF Handling for Thread-Local Storage' for more details.
+ /// SYMBOL_LABEL @TPOFF
+ MO_TPOFF,
+
+ /// MO_DTPOFF - On a symbol operand this indicates that the immediate is
+ /// the offset of the GOT entry with the TLS offset of the symbol. Used
+ /// in the local dynamic TLS access model.
+ ///
+ /// See 'ELF Handling for Thread-Local Storage' for more details.
+ /// SYMBOL_LABEL @DTPOFF
+ MO_DTPOFF,
+
+ /// MO_NTPOFF - On a symbol operand this indicates that the immediate is
+ /// the negative thread-pointer offset for the symbol. Used in the IA32
+ /// local exec TLS access model.
+ ///
+ /// See 'ELF Handling for Thread-Local Storage' for more details.
+ /// SYMBOL_LABEL @NTPOFF
+ MO_NTPOFF,
+
+ /// MO_GOTNTPOFF - On a symbol operand this indicates that the immediate is
+ /// the offset of the GOT entry with the negative thread-pointer offset for
+ /// the symbol. Used in the PIC IA32 initial exec TLS access model.
+ ///
+ /// See 'ELF Handling for Thread-Local Storage' for more details.
+ /// SYMBOL_LABEL @GOTNTPOFF
+ MO_GOTNTPOFF,
+
+ /// MO_DLLIMPORT - On a symbol operand "FOO", this indicates that the
+ /// reference is actually to the "__imp_FOO" symbol. This is used for
+ /// dllimport linkage on windows.
+ MO_DLLIMPORT,
+
+ /// MO_DARWIN_STUB - On a symbol operand "FOO", this indicates that the
+ /// reference is actually to the "FOO$stub" symbol. This is used for calls
+ /// and jumps to external functions on Tiger and earlier.
+ MO_DARWIN_STUB,
+
+ /// MO_DARWIN_NONLAZY - On a symbol operand "FOO", this indicates that the
+ /// reference is actually to the "FOO$non_lazy_ptr" symbol, which is a
+ /// non-PIC-base-relative reference to a non-hidden dyld lazy pointer stub.
+ MO_DARWIN_NONLAZY,
+
+ /// MO_DARWIN_NONLAZY_PIC_BASE - On a symbol operand "FOO", this indicates
+ /// that the reference is actually to "FOO$non_lazy_ptr - PICBASE", which is
+ /// a PIC-base-relative reference to a non-hidden dyld lazy pointer stub.
+ MO_DARWIN_NONLAZY_PIC_BASE,
+
+ /// MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE - On a symbol operand "FOO", this
+ /// indicates that the reference is actually to "FOO$non_lazy_ptr -PICBASE",
+ /// which is a PIC-base-relative reference to a hidden dyld lazy pointer
+ /// stub.
+ MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE,
+
+ /// MO_TLVP - On a symbol operand this indicates that the immediate is
+ /// some TLS offset.
+ ///
+ /// This is the TLS offset for the Darwin TLS mechanism.
+ MO_TLVP,
+
+ /// MO_TLVP_PIC_BASE - On a symbol operand this indicates that the immediate
+ /// is some TLS offset from the picbase.
+ ///
+ /// This is the 32-bit TLS offset for Darwin TLS in PIC mode.
+ MO_TLVP_PIC_BASE,
+
+ /// MO_SECREL - On a symbol operand this indicates that the immediate is
+ /// the offset from beginning of section.
+ ///
+ /// This is the TLS offset for the COFF/Windows TLS mechanism.
+ MO_SECREL
+ };
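+
+  // Illustrative example: in i386 PIC code, a load of global "x" such as
+  // "movl x@GOTOFF(%ebx), %eax" carries MO_GOTOFF on the symbol operand
+  // for "x".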
+
+ enum : uint64_t {
+ //===------------------------------------------------------------------===//
+ // Instruction encodings. These are the standard/most common forms for X86
+ // instructions.
+ //
+
+    // PseudoFrm - This represents an instruction that is a pseudo instruction
+    // or one that has not been implemented yet. It is illegal to generate
+    // code for it, but it is tolerated during intermediate implementation
+    // stages.
+ Pseudo = 0,
+
+ /// Raw - This form is for instructions that don't have any operands, so
+ /// they are just a fixed opcode value, like 'leave'.
+ RawFrm = 1,
+
+ /// AddRegFrm - This form is used for instructions like 'push r32' that have
+ /// their one register operand added to their opcode.
+ AddRegFrm = 2,
+
+ /// MRMDestReg - This form is used for instructions that use the Mod/RM byte
+ /// to specify a destination, which in this case is a register.
+ ///
+ MRMDestReg = 3,
+
+ /// MRMDestMem - This form is used for instructions that use the Mod/RM byte
+ /// to specify a destination, which in this case is memory.
+ ///
+ MRMDestMem = 4,
+
+ /// MRMSrcReg - This form is used for instructions that use the Mod/RM byte
+ /// to specify a source, which in this case is a register.
+ ///
+ MRMSrcReg = 5,
+
+ /// MRMSrcMem - This form is used for instructions that use the Mod/RM byte
+ /// to specify a source, which in this case is memory.
+ ///
+ MRMSrcMem = 6,
+
+ /// RawFrmMemOffs - This form is for instructions that store an absolute
+ /// memory offset as an immediate with a possible segment override.
+ RawFrmMemOffs = 7,
+
+ /// RawFrmSrc - This form is for instructions that use the source index
+ /// register SI/ESI/RSI with a possible segment override.
+ RawFrmSrc = 8,
+
+ /// RawFrmDst - This form is for instructions that use the destination index
+    /// register DI/EDI/RDI.
+ RawFrmDst = 9,
+
+    /// RawFrmDstSrc - This form is for instructions that use the source index
+    /// register SI/ESI/RSI with a possible segment override, and also the
+    /// destination index register DI/EDI/RDI.
+ RawFrmDstSrc = 10,
+
+ /// RawFrmImm8 - This is used for the ENTER instruction, which has two
+ /// immediates, the first of which is a 16-bit immediate (specified by
+ /// the imm encoding) and the second is a 8-bit fixed value.
+ RawFrmImm8 = 11,
+
+ /// RawFrmImm16 - This is used for CALL FAR instructions, which have two
+ /// immediates, the first of which is a 16 or 32-bit immediate (specified by
+ /// the imm encoding) and the second is a 16-bit fixed value. In the AMD
+ /// manual, this operand is described as pntr16:32 and pntr16:16
+ RawFrmImm16 = 12,
+
+ /// MRMX[rm] - The forms are used to represent instructions that use a
+ /// Mod/RM byte, and don't use the middle field for anything.
+ MRMXr = 14, MRMXm = 15,
+
+ /// MRM[0-7][rm] - These forms are used to represent instructions that use
+ /// a Mod/RM byte, and use the middle field to hold extended opcode
+ /// information. In the intel manual these are represented as /0, /1, ...
+ ///
+
+ // First, instructions that operate on a register r/m operand...
+ MRM0r = 16, MRM1r = 17, MRM2r = 18, MRM3r = 19, // Format /0 /1 /2 /3
+ MRM4r = 20, MRM5r = 21, MRM6r = 22, MRM7r = 23, // Format /4 /5 /6 /7
+
+ // Next, instructions that operate on a memory r/m operand...
+ MRM0m = 24, MRM1m = 25, MRM2m = 26, MRM3m = 27, // Format /0 /1 /2 /3
+ MRM4m = 28, MRM5m = 29, MRM6m = 30, MRM7m = 31, // Format /4 /5 /6 /7
+
+    // MRM_XX - A mod/rm byte of exactly 0xXX.
+ MRM_C0 = 32, MRM_C1 = 33, MRM_C2 = 34, MRM_C3 = 35,
+ MRM_C4 = 36, MRM_C5 = 37, MRM_C6 = 38, MRM_C7 = 39,
+ MRM_C8 = 40, MRM_C9 = 41, MRM_CA = 42, MRM_CB = 43,
+ MRM_CC = 44, MRM_CD = 45, MRM_CE = 46, MRM_CF = 47,
+ MRM_D0 = 48, MRM_D1 = 49, MRM_D2 = 50, MRM_D3 = 51,
+ MRM_D4 = 52, MRM_D5 = 53, MRM_D6 = 54, MRM_D7 = 55,
+ MRM_D8 = 56, MRM_D9 = 57, MRM_DA = 58, MRM_DB = 59,
+ MRM_DC = 60, MRM_DD = 61, MRM_DE = 62, MRM_DF = 63,
+ MRM_E0 = 64, MRM_E1 = 65, MRM_E2 = 66, MRM_E3 = 67,
+ MRM_E4 = 68, MRM_E5 = 69, MRM_E6 = 70, MRM_E7 = 71,
+ MRM_E8 = 72, MRM_E9 = 73, MRM_EA = 74, MRM_EB = 75,
+ MRM_EC = 76, MRM_ED = 77, MRM_EE = 78, MRM_EF = 79,
+ MRM_F0 = 80, MRM_F1 = 81, MRM_F2 = 82, MRM_F3 = 83,
+ MRM_F4 = 84, MRM_F5 = 85, MRM_F6 = 86, MRM_F7 = 87,
+ MRM_F8 = 88, MRM_F9 = 89, MRM_FA = 90, MRM_FB = 91,
+ MRM_FC = 92, MRM_FD = 93, MRM_FE = 94, MRM_FF = 95,
+
+ FormMask = 127,
+
+ //===------------------------------------------------------------------===//
+ // Actual flags...
+
+ // OpSize - OpSizeFixed implies instruction never needs a 0x66 prefix.
+ // OpSize16 means this is a 16-bit instruction and needs 0x66 prefix in
+ // 32-bit mode. OpSize32 means this is a 32-bit instruction needs a 0x66
+ // prefix in 16-bit mode.
+ OpSizeShift = 7,
+ OpSizeMask = 0x3 << OpSizeShift,
+
+ OpSizeFixed = 0 << OpSizeShift,
+ OpSize16 = 1 << OpSizeShift,
+ OpSize32 = 2 << OpSizeShift,
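+    // For example, "mov ax, bx" is an OpSize16 instruction: assembled in
+    // 32-bit mode it is emitted with a 0x66 operand-size prefix, while in
+    // 16-bit mode no prefix is needed.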
+
+    // AdSize - AdSizeX implies this instruction determines its need of 0x67
+ // prefix from a normal ModRM memory operand. The other types indicate that
+ // an operand is encoded with a specific width and a prefix is needed if
+ // it differs from the current mode.
+ AdSizeShift = OpSizeShift + 2,
+ AdSizeMask = 0x3 << AdSizeShift,
+
+ AdSizeX = 1 << AdSizeShift,
+ AdSize16 = 1 << AdSizeShift,
+ AdSize32 = 2 << AdSizeShift,
+ AdSize64 = 3 << AdSizeShift,
+
+ //===------------------------------------------------------------------===//
+ // OpPrefix - There are several prefix bytes that are used as opcode
+ // extensions. These are 0x66, 0xF3, and 0xF2. If this field is 0 there is
+ // no prefix.
+ //
+ OpPrefixShift = AdSizeShift + 2,
+ OpPrefixMask = 0x7 << OpPrefixShift,
+
+ // PS, PD - Prefix code for packed single and double precision vector
+ // floating point operations performed in the SSE registers.
+ PS = 1 << OpPrefixShift, PD = 2 << OpPrefixShift,
+
+ // XS, XD - These prefix codes are for single and double precision scalar
+ // floating point operations performed in the SSE registers.
+ XS = 3 << OpPrefixShift, XD = 4 << OpPrefixShift,
+
+ //===------------------------------------------------------------------===//
+ // OpMap - This field determines which opcode map this instruction
+    // belongs to, i.e. one-byte, two-byte, 0x0f 0x38, 0x0f 0x3a, etc.
+ //
+ OpMapShift = OpPrefixShift + 3,
+ OpMapMask = 0x7 << OpMapShift,
+
+ // OB - OneByte - Set if this instruction has a one byte opcode.
+ OB = 0 << OpMapShift,
+
+ // TB - TwoByte - Set if this instruction has a two byte opcode, which
+ // starts with a 0x0F byte before the real opcode.
+ TB = 1 << OpMapShift,
+
+ // T8, TA - Prefix after the 0x0F prefix.
+ T8 = 2 << OpMapShift, TA = 3 << OpMapShift,
+
+ // XOP8 - Prefix to include use of imm byte.
+ XOP8 = 4 << OpMapShift,
+
+ // XOP9 - Prefix to exclude use of imm byte.
+ XOP9 = 5 << OpMapShift,
+
+ // XOPA - Prefix to encode 0xA in VEX.MMMM of XOP instructions.
+ XOPA = 6 << OpMapShift,
+
+ //===------------------------------------------------------------------===//
+ // REX_W - REX prefixes are instruction prefixes used in 64-bit mode.
+ // They are used to specify GPRs and SSE registers, 64-bit operand size,
+    // etc. We only care about the REX.W and REX.R bits, and only the former
+    // is statically determined.
+ //
+ REXShift = OpMapShift + 3,
+ REX_W = 1 << REXShift,
+
+ //===------------------------------------------------------------------===//
+    // This four-bit field describes the size of an immediate operand. Zero is
+ // unused so that we can tell if we forgot to set a value.
+ ImmShift = REXShift + 1,
+ ImmMask = 15 << ImmShift,
+ Imm8 = 1 << ImmShift,
+ Imm8PCRel = 2 << ImmShift,
+ Imm16 = 3 << ImmShift,
+ Imm16PCRel = 4 << ImmShift,
+ Imm32 = 5 << ImmShift,
+ Imm32PCRel = 6 << ImmShift,
+ Imm32S = 7 << ImmShift,
+ Imm64 = 8 << ImmShift,
+
+ //===------------------------------------------------------------------===//
+ // FP Instruction Classification... Zero is non-fp instruction.
+
+ // FPTypeMask - Mask for all of the FP types...
+ FPTypeShift = ImmShift + 4,
+ FPTypeMask = 7 << FPTypeShift,
+
+ // NotFP - The default, set for instructions that do not use FP registers.
+ NotFP = 0 << FPTypeShift,
+
+ // ZeroArgFP - 0 arg FP instruction which implicitly pushes ST(0), f.e. fld0
+ ZeroArgFP = 1 << FPTypeShift,
+
+ // OneArgFP - 1 arg FP instructions which implicitly read ST(0), such as fst
+ OneArgFP = 2 << FPTypeShift,
+
+ // OneArgFPRW - 1 arg FP instruction which implicitly read ST(0) and write a
+ // result back to ST(0). For example, fcos, fsqrt, etc.
+ //
+ OneArgFPRW = 3 << FPTypeShift,
+
+ // TwoArgFP - 2 arg FP instructions which implicitly read ST(0), and an
+ // explicit argument, storing the result to either ST(0) or the implicit
+ // argument. For example: fadd, fsub, fmul, etc...
+ TwoArgFP = 4 << FPTypeShift,
+
+ // CompareFP - 2 arg FP instructions which implicitly read ST(0) and an
+ // explicit argument, but have no destination. Example: fucom, fucomi, ...
+ CompareFP = 5 << FPTypeShift,
+
+ // CondMovFP - "2 operand" floating point conditional move instructions.
+ CondMovFP = 6 << FPTypeShift,
+
+ // SpecialFP - Special instruction forms. Dispatch by opcode explicitly.
+ SpecialFP = 7 << FPTypeShift,
+
+ // Lock prefix
+ LOCKShift = FPTypeShift + 3,
+ LOCK = 1 << LOCKShift,
+
+ // REP prefix
+ REPShift = LOCKShift + 1,
+ REP = 1 << REPShift,
+
+ // Execution domain for SSE instructions.
+ // 0 means normal, non-SSE instruction.
+ SSEDomainShift = REPShift + 1,
+
+ // Encoding
+ EncodingShift = SSEDomainShift + 2,
+ EncodingMask = 0x3 << EncodingShift,
+
+ // VEX - encoding using 0xC4/0xC5
+ VEX = 1 << EncodingShift,
+
+ /// XOP - Opcode prefix used by XOP instructions.
+ XOP = 2 << EncodingShift,
+
+  // EVEX - Specifies that this instruction uses the EVEX form, which
+  // provides support for up to 32 512-bit register operands and up to 7
+  // 16-bit mask operands, as well as source operand data swizzling/memory
+  // operand conversion, eviction hint, and rounding mode.
+ EVEX = 3 << EncodingShift,
+
+ // Opcode
+ OpcodeShift = EncodingShift + 2,
+
+  /// VEX_W - Has opcode-specific functionality, but is used in the same
+  /// way as REX.W is for regular SSE instructions.
+ VEX_WShift = OpcodeShift + 8,
+ VEX_W = 1ULL << VEX_WShift,
+
+  /// VEX_4V - Used to specify an additional AVX/SSE register. Several
+  /// two-address instructions in SSE are represented as three-address ones
+  /// in AVX, and the additional register is encoded in the VEX.VVVV field.
+ VEX_4VShift = VEX_WShift + 1,
+ VEX_4V = 1ULL << VEX_4VShift,
+
+ /// VEX_4VOp3 - Similar to VEX_4V, but used on instructions that encode
+ /// operand 3 with VEX.vvvv.
+ VEX_4VOp3Shift = VEX_4VShift + 1,
+ VEX_4VOp3 = 1ULL << VEX_4VOp3Shift,
+
+  /// VEX_I8IMM - Specifies that the last register used in an AVX instruction
+  /// must be encoded in the i8 immediate field. This usually happens in
+ /// instructions with 4 operands.
+ VEX_I8IMMShift = VEX_4VOp3Shift + 1,
+ VEX_I8IMM = 1ULL << VEX_I8IMMShift,
+
+  /// VEX_L - Stands for a bit in the VEX opcode prefix meaning the current
+  /// instruction uses 256-bit wide registers. This is usually auto-detected
+  /// if a VR256 register is used, but some AVX instructions also have this
+  /// field marked when using a 256-bit memory reference.
+ VEX_LShift = VEX_I8IMMShift + 1,
+ VEX_L = 1ULL << VEX_LShift,
+
+ // VEX_LIG - Specifies that this instruction ignores the L-bit in the VEX
+ // prefix. Usually used for scalar instructions. Needed by disassembler.
+ VEX_LIGShift = VEX_LShift + 1,
+ VEX_LIG = 1ULL << VEX_LIGShift,
+
+  // TODO: we should combine VEX_L and VEX_LIG together to form a 2-bit field
+  // with the following encoding:
+  // - 00 V128
+  // - 01 V256
+  // - 10 V512
+  // - 11 LIG (but, in the instruction encoding, leave VEX.L and EVEX.L as
+  //   zeros). This would save one TSFlags bit.
+
+ // EVEX_K - Set if this instruction requires masking
+ EVEX_KShift = VEX_LIGShift + 1,
+ EVEX_K = 1ULL << EVEX_KShift,
+
+ // EVEX_Z - Set if this instruction has EVEX.Z field set.
+ EVEX_ZShift = EVEX_KShift + 1,
+ EVEX_Z = 1ULL << EVEX_ZShift,
+
+ // EVEX_L2 - Set if this instruction has EVEX.L' field set.
+ EVEX_L2Shift = EVEX_ZShift + 1,
+ EVEX_L2 = 1ULL << EVEX_L2Shift,
+
+ // EVEX_B - Set if this instruction has EVEX.B field set.
+ EVEX_BShift = EVEX_L2Shift + 1,
+ EVEX_B = 1ULL << EVEX_BShift,
+
+ // The scaling factor for the AVX512's 8-bit compressed displacement.
+ CD8_Scale_Shift = EVEX_BShift + 1,
+ CD8_Scale_Mask = 127ULL << CD8_Scale_Shift,
+
+ /// Has3DNow0F0FOpcode - This flag indicates that the instruction uses the
+ /// wacky 0x0F 0x0F prefix for 3DNow! instructions. The manual documents
+ /// this as having a 0x0F prefix with a 0x0F opcode, and each instruction
+ /// storing a classifier in the imm8 field. To simplify our implementation,
+  /// we handle this by storing the classifier in the opcode field and using
+ /// this flag to indicate that the encoder should do the wacky 3DNow! thing.
+ Has3DNow0F0FOpcodeShift = CD8_Scale_Shift + 7,
+ Has3DNow0F0FOpcode = 1ULL << Has3DNow0F0FOpcodeShift,
+
+ /// MemOp4 - Used to indicate swapping of operand 3 and 4 to be encoded in
+ /// ModRM or I8IMM. This is used for FMA4 and XOP instructions.
+ MemOp4Shift = Has3DNow0F0FOpcodeShift + 1,
+ MemOp4 = 1ULL << MemOp4Shift,
+
+ /// Explicitly specified rounding control
+ EVEX_RCShift = MemOp4Shift + 1,
+ EVEX_RC = 1ULL << EVEX_RCShift
+ };
+
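+  // A quick illustration of the layout above (an editorial sketch; the flag
+  // combination is hypothetical rather than taken from a real instruction):
+  //
+  //   uint64_t Flags = X86II::PD | X86II::TB | X86II::Imm8 | X86II::VEX;
+  //   (Flags & X86II::OpPrefixMask) == X86II::PD;    // 0x66 operand prefix
+  //   (Flags & X86II::OpMapMask)    == X86II::TB;    // 0x0F opcode map
+  //   (Flags & X86II::ImmMask)      == X86II::Imm8;  // one immediate byte
+  //   (Flags & X86II::EncodingMask) == X86II::VEX;   // VEX-encoded
+  //
+  // Every field occupies a disjoint bit range, so flags OR together and can
+  // be recovered independently with the corresponding *Mask constants.
+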
+ // getBaseOpcodeFor - This function returns the "base" X86 opcode for the
+ // specified machine instruction.
+ //
+ inline unsigned char getBaseOpcodeFor(uint64_t TSFlags) {
+ return TSFlags >> X86II::OpcodeShift;
+ }
+
+ inline bool hasImm(uint64_t TSFlags) {
+ return (TSFlags & X86II::ImmMask) != 0;
+ }
+
+ /// getSizeOfImm - Decode the "size of immediate" field from the TSFlags field
+ /// of the specified instruction.
+ inline unsigned getSizeOfImm(uint64_t TSFlags) {
+ switch (TSFlags & X86II::ImmMask) {
+ default: llvm_unreachable("Unknown immediate size");
+ case X86II::Imm8:
+ case X86II::Imm8PCRel: return 1;
+ case X86II::Imm16:
+ case X86II::Imm16PCRel: return 2;
+ case X86II::Imm32:
+ case X86II::Imm32S:
+ case X86II::Imm32PCRel: return 4;
+ case X86II::Imm64: return 8;
+ }
+ }
+
+ /// isImmPCRel - Return true if the immediate of the specified instruction's
+ /// TSFlags indicates that it is pc relative.
+ inline unsigned isImmPCRel(uint64_t TSFlags) {
+ switch (TSFlags & X86II::ImmMask) {
+ default: llvm_unreachable("Unknown immediate size");
+ case X86II::Imm8PCRel:
+ case X86II::Imm16PCRel:
+ case X86II::Imm32PCRel:
+ return true;
+ case X86II::Imm8:
+ case X86II::Imm16:
+ case X86II::Imm32:
+ case X86II::Imm32S:
+ case X86II::Imm64:
+ return false;
+ }
+ }
+
+ /// isImmSigned - Return true if the immediate of the specified instruction's
+ /// TSFlags indicates that it is signed.
+ inline unsigned isImmSigned(uint64_t TSFlags) {
+ switch (TSFlags & X86II::ImmMask) {
+ default: llvm_unreachable("Unknown immediate signedness");
+ case X86II::Imm32S:
+ return true;
+ case X86II::Imm8:
+ case X86II::Imm8PCRel:
+ case X86II::Imm16:
+ case X86II::Imm16PCRel:
+ case X86II::Imm32:
+ case X86II::Imm32PCRel:
+ case X86II::Imm64:
+ return false;
+ }
+ }
+
+  /// getOperandBias - Compute any additional adjustment needed to the offset
+  /// to the start of the memory operand in this instruction.
+  /// If this is a two-address instruction, skip one of the register operands.
+  /// FIXME: This should be handled during MCInst lowering.
+  inline int getOperandBias(const MCInstrDesc &Desc) {
+ unsigned NumOps = Desc.getNumOperands();
+ unsigned CurOp = 0;
+ if (NumOps > 1 && Desc.getOperandConstraint(1, MCOI::TIED_TO) == 0)
+ ++CurOp;
+ else if (NumOps > 3 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0 &&
+ Desc.getOperandConstraint(3, MCOI::TIED_TO) == 1)
+ // Special case for AVX-512 GATHER with 2 TIED_TO operands
+ // Skip the first 2 operands: dst, mask_wb
+ CurOp += 2;
+ else if (NumOps > 3 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0 &&
+ Desc.getOperandConstraint(NumOps - 1, MCOI::TIED_TO) == 1)
+ // Special case for GATHER with 2 TIED_TO operands
+ // Skip the first 2 operands: dst, mask_wb
+ CurOp += 2;
+ else if (NumOps > 2 && Desc.getOperandConstraint(NumOps - 2, MCOI::TIED_TO) == 0)
+ // SCATTER
+ ++CurOp;
+ return CurOp;
+ }
+
+ /// getMemoryOperandNo - The function returns the MCInst operand # for the
+ /// first field of the memory operand. If the instruction doesn't have a
+ /// memory operand, this returns -1.
+ ///
+ /// Note that this ignores tied operands. If there is a tied register which
+ /// is duplicated in the MCInst (e.g. "EAX = addl EAX, [mem]") it is only
+ /// counted as one operand.
+ ///
+ inline int getMemoryOperandNo(uint64_t TSFlags, unsigned Opcode) {
+ bool HasVEX_4V = TSFlags & X86II::VEX_4V;
+ bool HasMemOp4 = TSFlags & X86II::MemOp4;
+ bool HasEVEX_K = TSFlags & X86II::EVEX_K;
+
+ switch (TSFlags & X86II::FormMask) {
+ default: llvm_unreachable("Unknown FormMask value in getMemoryOperandNo!");
+ case X86II::Pseudo:
+ case X86II::RawFrm:
+ case X86II::AddRegFrm:
+ case X86II::MRMDestReg:
+ case X86II::MRMSrcReg:
+ case X86II::RawFrmImm8:
+ case X86II::RawFrmImm16:
+ case X86II::RawFrmMemOffs:
+ case X86II::RawFrmSrc:
+ case X86II::RawFrmDst:
+ case X86II::RawFrmDstSrc:
+ return -1;
+ case X86II::MRMDestMem:
+ return 0;
+ case X86II::MRMSrcMem:
+ // Start from 1, skip any registers encoded in VEX_VVVV or I8IMM, or a
+ // mask register.
+ return 1 + HasVEX_4V + HasMemOp4 + HasEVEX_K;
+ case X86II::MRMXr:
+ case X86II::MRM0r: case X86II::MRM1r:
+ case X86II::MRM2r: case X86II::MRM3r:
+ case X86II::MRM4r: case X86II::MRM5r:
+ case X86II::MRM6r: case X86II::MRM7r:
+ return -1;
+ case X86II::MRMXm:
+ case X86II::MRM0m: case X86II::MRM1m:
+ case X86II::MRM2m: case X86II::MRM3m:
+ case X86II::MRM4m: case X86II::MRM5m:
+ case X86II::MRM6m: case X86II::MRM7m:
+ // Start from 0, skip registers encoded in VEX_VVVV or a mask register.
+ return 0 + HasVEX_4V + HasEVEX_K;
+ case X86II::MRM_C0: case X86II::MRM_C1: case X86II::MRM_C2:
+ case X86II::MRM_C3: case X86II::MRM_C4: case X86II::MRM_C5:
+ case X86II::MRM_C6: case X86II::MRM_C7: case X86II::MRM_C8:
+ case X86II::MRM_C9: case X86II::MRM_CA: case X86II::MRM_CB:
+ case X86II::MRM_CC: case X86II::MRM_CD: case X86II::MRM_CE:
+ case X86II::MRM_CF: case X86II::MRM_D0: case X86II::MRM_D1:
+ case X86II::MRM_D2: case X86II::MRM_D3: case X86II::MRM_D4:
+ case X86II::MRM_D5: case X86II::MRM_D6: case X86II::MRM_D7:
+ case X86II::MRM_D8: case X86II::MRM_D9: case X86II::MRM_DA:
+ case X86II::MRM_DB: case X86II::MRM_DC: case X86II::MRM_DD:
+ case X86II::MRM_DE: case X86II::MRM_DF: case X86II::MRM_E0:
+ case X86II::MRM_E1: case X86II::MRM_E2: case X86II::MRM_E3:
+ case X86II::MRM_E4: case X86II::MRM_E5: case X86II::MRM_E6:
+ case X86II::MRM_E7: case X86II::MRM_E8: case X86II::MRM_E9:
+ case X86II::MRM_EA: case X86II::MRM_EB: case X86II::MRM_EC:
+ case X86II::MRM_ED: case X86II::MRM_EE: case X86II::MRM_EF:
+ case X86II::MRM_F0: case X86II::MRM_F1: case X86II::MRM_F2:
+ case X86II::MRM_F3: case X86II::MRM_F4: case X86II::MRM_F5:
+ case X86II::MRM_F6: case X86II::MRM_F7: case X86II::MRM_F8:
+ case X86II::MRM_F9: case X86II::MRM_FA: case X86II::MRM_FB:
+ case X86II::MRM_FC: case X86II::MRM_FD: case X86II::MRM_FE:
+ case X86II::MRM_FF:
+ return -1;
+ }
+ }
+
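+  // A sketch of how these two helpers are combined by a consumer such as the
+  // MC code emitter ('MI', 'Desc' and 'TSFlags' stand for values the caller
+  // already has in hand):
+  //
+  //   int MemoryOperand = X86II::getMemoryOperandNo(TSFlags, MI.getOpcode());
+  //   if (MemoryOperand != -1)
+  //     MemoryOperand += X86II::getOperandBias(Desc);
+  //
+  // getMemoryOperandNo yields the form-relative index, and getOperandBias
+  // shifts it past any tied-register duplicates at the front of the list.
+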
+  /// isX86_64ExtendedReg - Is the MachineOperand an x86-64 extended (r8 or
+  /// higher) register? e.g. r8, xmm8, xmm13, etc.
+ inline bool isX86_64ExtendedReg(unsigned RegNo) {
+ if ((RegNo > X86::XMM7 && RegNo <= X86::XMM15) ||
+ (RegNo > X86::XMM23 && RegNo <= X86::XMM31) ||
+ (RegNo > X86::YMM7 && RegNo <= X86::YMM15) ||
+ (RegNo > X86::YMM23 && RegNo <= X86::YMM31) ||
+ (RegNo > X86::ZMM7 && RegNo <= X86::ZMM15) ||
+ (RegNo > X86::ZMM23 && RegNo <= X86::ZMM31))
+ return true;
+
+ switch (RegNo) {
+ default: break;
+ case X86::R8: case X86::R9: case X86::R10: case X86::R11:
+ case X86::R12: case X86::R13: case X86::R14: case X86::R15:
+ case X86::R8D: case X86::R9D: case X86::R10D: case X86::R11D:
+ case X86::R12D: case X86::R13D: case X86::R14D: case X86::R15D:
+ case X86::R8W: case X86::R9W: case X86::R10W: case X86::R11W:
+ case X86::R12W: case X86::R13W: case X86::R14W: case X86::R15W:
+ case X86::R8B: case X86::R9B: case X86::R10B: case X86::R11B:
+ case X86::R12B: case X86::R13B: case X86::R14B: case X86::R15B:
+ case X86::CR8: case X86::CR9: case X86::CR10: case X86::CR11:
+ case X86::CR12: case X86::CR13: case X86::CR14: case X86::CR15:
+ return true;
+ }
+ return false;
+ }
+
+  /// is32ExtendedReg - Is the register one of the AVX-512 extended registers
+  /// (xmm16/ymm16/zmm16 or higher)? e.g. zmm21, etc.
+ static inline bool is32ExtendedReg(unsigned RegNo) {
+ return ((RegNo > X86::XMM15 && RegNo <= X86::XMM31) ||
+ (RegNo > X86::YMM15 && RegNo <= X86::YMM31) ||
+ (RegNo > X86::ZMM15 && RegNo <= X86::ZMM31));
+ }
+
+
+ inline bool isX86_64NonExtLowByteReg(unsigned reg) {
+ return (reg == X86::SPL || reg == X86::BPL ||
+ reg == X86::SIL || reg == X86::DIL);
+ }
+}
+
+} // end namespace llvm;
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
new file mode 100644
index 0000000..736c39d
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
@@ -0,0 +1,262 @@
+//===-- X86ELFObjectWriter.cpp - X86 ELF Writer ---------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/X86FixupKinds.h"
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+
+namespace {
+ class X86ELFObjectWriter : public MCELFObjectTargetWriter {
+ public:
+ X86ELFObjectWriter(bool IsELF64, uint8_t OSABI, uint16_t EMachine);
+
+ ~X86ELFObjectWriter() override;
+
+ protected:
+ unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup,
+ bool IsPCRel) const override;
+ };
+}
+
+X86ELFObjectWriter::X86ELFObjectWriter(bool IsELF64, uint8_t OSABI,
+ uint16_t EMachine)
+ : MCELFObjectTargetWriter(IsELF64, OSABI, EMachine,
+ // Only i386 and IAMCU use Rel instead of RelA.
+ /*HasRelocationAddend*/
+ (EMachine != ELF::EM_386) &&
+ (EMachine != ELF::EM_IAMCU)) {}
+
+X86ELFObjectWriter::~X86ELFObjectWriter()
+{}
+
+enum X86_64RelType { RT64_64, RT64_32, RT64_32S, RT64_16, RT64_8 };
+
+static X86_64RelType getType64(unsigned Kind,
+ MCSymbolRefExpr::VariantKind &Modifier,
+ bool &IsPCRel) {
+ switch (Kind) {
+ default:
+ llvm_unreachable("Unimplemented");
+ case X86::reloc_global_offset_table8:
+ Modifier = MCSymbolRefExpr::VK_GOT;
+ IsPCRel = true;
+ return RT64_64;
+ case FK_Data_8:
+ return RT64_64;
+ case X86::reloc_signed_4byte:
+ if (Modifier == MCSymbolRefExpr::VK_None && !IsPCRel)
+ return RT64_32S;
+ return RT64_32;
+ case X86::reloc_global_offset_table:
+ Modifier = MCSymbolRefExpr::VK_GOT;
+ IsPCRel = true;
+ return RT64_32;
+ case FK_Data_4:
+ case FK_PCRel_4:
+ case X86::reloc_riprel_4byte:
+ case X86::reloc_riprel_4byte_movq_load:
+ return RT64_32;
+ case FK_PCRel_2:
+ case FK_Data_2:
+ return RT64_16;
+ case FK_PCRel_1:
+ case FK_Data_1:
+ return RT64_8;
+ }
+}
+
+static unsigned getRelocType64(MCSymbolRefExpr::VariantKind Modifier,
+ X86_64RelType Type, bool IsPCRel) {
+ switch (Modifier) {
+ default:
+ llvm_unreachable("Unimplemented");
+ case MCSymbolRefExpr::VK_None:
+ switch (Type) {
+ case RT64_64:
+ return IsPCRel ? ELF::R_X86_64_PC64 : ELF::R_X86_64_64;
+ case RT64_32:
+ return IsPCRel ? ELF::R_X86_64_PC32 : ELF::R_X86_64_32;
+ case RT64_32S:
+ return ELF::R_X86_64_32S;
+ case RT64_16:
+ return IsPCRel ? ELF::R_X86_64_PC16 : ELF::R_X86_64_16;
+ case RT64_8:
+ return IsPCRel ? ELF::R_X86_64_PC8 : ELF::R_X86_64_8;
+ }
+ case MCSymbolRefExpr::VK_GOT:
+ switch (Type) {
+ case RT64_64:
+ return IsPCRel ? ELF::R_X86_64_GOTPC64 : ELF::R_X86_64_GOT64;
+ case RT64_32:
+ return IsPCRel ? ELF::R_X86_64_GOTPC32 : ELF::R_X86_64_GOT32;
+ case RT64_32S:
+ case RT64_16:
+ case RT64_8:
+ llvm_unreachable("Unimplemented");
+ }
+ case MCSymbolRefExpr::VK_GOTOFF:
+ assert(Type == RT64_64);
+ assert(!IsPCRel);
+ return ELF::R_X86_64_GOTOFF64;
+ case MCSymbolRefExpr::VK_TPOFF:
+ assert(!IsPCRel);
+ switch (Type) {
+ case RT64_64:
+ return ELF::R_X86_64_TPOFF64;
+ case RT64_32:
+ return ELF::R_X86_64_TPOFF32;
+ case RT64_32S:
+ case RT64_16:
+ case RT64_8:
+ llvm_unreachable("Unimplemented");
+ }
+ case MCSymbolRefExpr::VK_DTPOFF:
+ assert(!IsPCRel);
+ switch (Type) {
+ case RT64_64:
+ return ELF::R_X86_64_DTPOFF64;
+ case RT64_32:
+ return ELF::R_X86_64_DTPOFF32;
+ case RT64_32S:
+ case RT64_16:
+ case RT64_8:
+ llvm_unreachable("Unimplemented");
+ }
+ case MCSymbolRefExpr::VK_SIZE:
+ assert(!IsPCRel);
+ switch (Type) {
+ case RT64_64:
+ return ELF::R_X86_64_SIZE64;
+ case RT64_32:
+ return ELF::R_X86_64_SIZE32;
+ case RT64_32S:
+ case RT64_16:
+ case RT64_8:
+ llvm_unreachable("Unimplemented");
+ }
+ case MCSymbolRefExpr::VK_TLSGD:
+ assert(Type == RT64_32);
+ return ELF::R_X86_64_TLSGD;
+ case MCSymbolRefExpr::VK_GOTTPOFF:
+ assert(Type == RT64_32);
+ return ELF::R_X86_64_GOTTPOFF;
+ case MCSymbolRefExpr::VK_TLSLD:
+ assert(Type == RT64_32);
+ return ELF::R_X86_64_TLSLD;
+ case MCSymbolRefExpr::VK_PLT:
+ assert(Type == RT64_32);
+ return ELF::R_X86_64_PLT32;
+ case MCSymbolRefExpr::VK_GOTPCREL:
+ assert(Type == RT64_32);
+ return ELF::R_X86_64_GOTPCREL;
+ }
+}
+
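+// Worked example (an editorial illustration, not code from this file): a
+// "call foo@PLT" emits a 4-byte pc-relative fixup, which getType64 above
+// classifies as RT64_32 with IsPCRel set; the VK_PLT case then maps it to
+// ELF::R_X86_64_PLT32.
+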
+enum X86_32RelType { RT32_32, RT32_16, RT32_8 };
+
+static X86_32RelType getType32(X86_64RelType T) {
+ switch (T) {
+ case RT64_64:
+ llvm_unreachable("Unimplemented");
+ case RT64_32:
+ case RT64_32S:
+ return RT32_32;
+ case RT64_16:
+ return RT32_16;
+ case RT64_8:
+ return RT32_8;
+ }
+ llvm_unreachable("unexpected relocation type!");
+}
+
+static unsigned getRelocType32(MCSymbolRefExpr::VariantKind Modifier,
+ X86_32RelType Type, bool IsPCRel) {
+ switch (Modifier) {
+ default:
+ llvm_unreachable("Unimplemented");
+ case MCSymbolRefExpr::VK_None:
+ switch (Type) {
+ case RT32_32:
+ return IsPCRel ? ELF::R_386_PC32 : ELF::R_386_32;
+ case RT32_16:
+ return IsPCRel ? ELF::R_386_PC16 : ELF::R_386_16;
+ case RT32_8:
+ return IsPCRel ? ELF::R_386_PC8 : ELF::R_386_8;
+ }
+ case MCSymbolRefExpr::VK_GOT:
+ assert(Type == RT32_32);
+ return IsPCRel ? ELF::R_386_GOTPC : ELF::R_386_GOT32;
+ case MCSymbolRefExpr::VK_GOTOFF:
+ assert(Type == RT32_32);
+ assert(!IsPCRel);
+ return ELF::R_386_GOTOFF;
+ case MCSymbolRefExpr::VK_TPOFF:
+ assert(Type == RT32_32);
+ assert(!IsPCRel);
+ return ELF::R_386_TLS_LE_32;
+ case MCSymbolRefExpr::VK_DTPOFF:
+ assert(Type == RT32_32);
+ assert(!IsPCRel);
+ return ELF::R_386_TLS_LDO_32;
+ case MCSymbolRefExpr::VK_TLSGD:
+ assert(Type == RT32_32);
+ assert(!IsPCRel);
+ return ELF::R_386_TLS_GD;
+ case MCSymbolRefExpr::VK_GOTTPOFF:
+ assert(Type == RT32_32);
+ assert(!IsPCRel);
+ return ELF::R_386_TLS_IE_32;
+ case MCSymbolRefExpr::VK_PLT:
+ assert(Type == RT32_32);
+ return ELF::R_386_PLT32;
+ case MCSymbolRefExpr::VK_INDNTPOFF:
+ assert(Type == RT32_32);
+ assert(!IsPCRel);
+ return ELF::R_386_TLS_IE;
+ case MCSymbolRefExpr::VK_NTPOFF:
+ assert(Type == RT32_32);
+ assert(!IsPCRel);
+ return ELF::R_386_TLS_LE;
+ case MCSymbolRefExpr::VK_GOTNTPOFF:
+ assert(Type == RT32_32);
+ assert(!IsPCRel);
+ return ELF::R_386_TLS_GOTIE;
+ case MCSymbolRefExpr::VK_TLSLDM:
+ assert(Type == RT32_32);
+ assert(!IsPCRel);
+ return ELF::R_386_TLS_LDM;
+ }
+}
+
+unsigned X86ELFObjectWriter::GetRelocType(const MCValue &Target,
+ const MCFixup &Fixup,
+ bool IsPCRel) const {
+ MCSymbolRefExpr::VariantKind Modifier = Target.getAccessVariant();
+ X86_64RelType Type = getType64(Fixup.getKind(), Modifier, IsPCRel);
+ if (getEMachine() == ELF::EM_X86_64)
+ return getRelocType64(Modifier, Type, IsPCRel);
+
+ assert((getEMachine() == ELF::EM_386 || getEMachine() == ELF::EM_IAMCU) &&
+ "Unsupported ELF machine type.");
+ return getRelocType32(Modifier, getType32(Type), IsPCRel);
+}
+
+MCObjectWriter *llvm::createX86ELFObjectWriter(raw_pwrite_stream &OS,
+ bool IsELF64, uint8_t OSABI,
+ uint16_t EMachine) {
+ MCELFObjectTargetWriter *MOTW =
+ new X86ELFObjectWriter(IsELF64, OSABI, EMachine);
+ return createELFObjectWriter(MOTW, OS, /*IsLittleEndian=*/true);
+}
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp
new file mode 100644
index 0000000..ddb764f
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp
@@ -0,0 +1,141 @@
+//===-- X86ELFRelocationInfo.cpp ----------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCRelocationInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Object/ELFObjectFile.h"
+#include "llvm/Support/ELF.h"
+
+using namespace llvm;
+using namespace object;
+using namespace ELF;
+
+namespace {
+class X86_64ELFRelocationInfo : public MCRelocationInfo {
+public:
+ X86_64ELFRelocationInfo(MCContext &Ctx) : MCRelocationInfo(Ctx) {}
+
+ const MCExpr *createExprForRelocation(RelocationRef Rel) override {
+ uint64_t RelType = Rel.getType();
+ elf_symbol_iterator SymI = Rel.getSymbol();
+
+ ErrorOr<StringRef> SymNameOrErr = SymI->getName();
+ if (std::error_code EC = SymNameOrErr.getError())
+ report_fatal_error(EC.message());
+ StringRef SymName = *SymNameOrErr;
+
+ ErrorOr<uint64_t> SymAddr = SymI->getAddress();
+ if (std::error_code EC = SymAddr.getError())
+ report_fatal_error(EC.message());
+ uint64_t SymSize = SymI->getSize();
+ int64_t Addend = *ELFRelocationRef(Rel).getAddend();
+
+ MCSymbol *Sym = Ctx.getOrCreateSymbol(SymName);
+ // FIXME: check that the value is actually the same.
+ if (!Sym->isVariable())
+ Sym->setVariableValue(MCConstantExpr::create(*SymAddr, Ctx));
+
+ const MCExpr *Expr = nullptr;
+ // If hasAddend is true, then we need to add Addend (r_addend) to Expr.
+ bool hasAddend = false;
+
+ // The AMD64 SysV ABI says:
+ // A: the addend used to compute the value of the relocatable field.
+ // B: the base address at which a shared object has been loaded into memory
+ // during execution. Generally, a shared object is built with a 0 base
+ // virtual address, but the execution address will be different.
+ // G: the offset into the global offset table at which the relocation
+ // entry's symbol will reside during execution.
+ // GOT: the address of the global offset table.
+ // L: the place (section offset or address) of the Procedure Linkage Table
+ // entry for a symbol.
+ // P: the place (section offset or address) of the storage unit being
+ // relocated (computed using r_offset).
+ // S: the value of the symbol whose index resides in the relocation entry.
+ // Z: the size of the symbol whose index resides in the relocation entry.
+
+ switch(RelType) {
+ case R_X86_64_NONE:
+ case R_X86_64_COPY:
+ // none
+ break;
+ case R_X86_64_64:
+ case R_X86_64_16:
+ case R_X86_64_8:
+ // S + A
+ case R_X86_64_32:
+ case R_X86_64_32S:
+ // S + A (We don't care about the result not fitting in 32 bits.)
+ case R_X86_64_PC32:
+ case R_X86_64_PC16:
+ case R_X86_64_PC8:
+ case R_X86_64_PC64:
+ // S + A - P (P/pcrel is implicit)
+ hasAddend = true;
+ Expr = MCSymbolRefExpr::create(Sym, Ctx);
+ break;
+ case R_X86_64_GOT32:
+ case R_X86_64_GOT64:
+ case R_X86_64_GOTPC32:
+ case R_X86_64_GOTPC64:
+ case R_X86_64_GOTPLT64:
+ // G + A
+ hasAddend = true;
+ Expr = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_GOT, Ctx);
+ break;
+ case R_X86_64_PLT32:
+ // L + A - P -> S@PLT + A
+ hasAddend = true;
+ Expr = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_PLT, Ctx);
+ break;
+ case R_X86_64_GLOB_DAT:
+ case R_X86_64_JUMP_SLOT:
+ // S
+ Expr = MCSymbolRefExpr::create(Sym, Ctx);
+ break;
+ case R_X86_64_GOTPCREL:
+ case R_X86_64_GOTPCREL64:
+ // G + GOT + A - P -> S@GOTPCREL + A
+ hasAddend = true;
+ Expr = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_GOTPCREL, Ctx);
+ break;
+ case R_X86_64_GOTOFF64:
+ // S + A - GOT
+ Expr = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_GOTOFF, Ctx);
+ break;
+ case R_X86_64_PLTOFF64:
+ // L + A - GOT
+ break;
+ case R_X86_64_SIZE32:
+ case R_X86_64_SIZE64:
+ // Z + A
+ Expr = MCConstantExpr::create(SymSize, Ctx);
+ break;
+ default:
+ Expr = MCSymbolRefExpr::create(Sym, Ctx);
+ break;
+ }
+ if (Expr && hasAddend && Addend != 0)
+ Expr = MCBinaryExpr::createAdd(Expr,
+ MCConstantExpr::create(Addend, Ctx),
+ Ctx);
+ return Expr;
+ }
+};
+} // End unnamed namespace
+
+/// createX86_64ELFRelocationInfo - Construct an X86-64 ELF RelocationInfo.
+MCRelocationInfo *llvm::createX86_64ELFRelocationInfo(MCContext &Ctx) {
+ // We only handle x86-64 for now.
+ return new X86_64ELFRelocationInfo(Ctx);
+}
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86FixupKinds.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86FixupKinds.h
new file mode 100644
index 0000000..4899900
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86FixupKinds.h
@@ -0,0 +1,34 @@
+//===-- X86FixupKinds.h - X86 Specific Fixup Entries ------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_MCTARGETDESC_X86FIXUPKINDS_H
+#define LLVM_LIB_TARGET_X86_MCTARGETDESC_X86FIXUPKINDS_H
+
+#include "llvm/MC/MCFixup.h"
+
+namespace llvm {
+namespace X86 {
+enum Fixups {
+ reloc_riprel_4byte = FirstTargetFixupKind, // 32-bit rip-relative
+ reloc_riprel_4byte_movq_load, // 32-bit rip-relative in movq
+ reloc_signed_4byte, // 32-bit signed. Unlike FK_Data_4
+ // this will be sign extended at
+ // runtime.
+ reloc_global_offset_table, // 32-bit, relative to the start
+ // of the instruction. Used only
+ // for _GLOBAL_OFFSET_TABLE_.
+ reloc_global_offset_table8, // 64-bit variant.
+ // Marker
+ LastTargetFixupKind,
+ NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
+};
+}
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
new file mode 100644
index 0000000..fc0b0f8
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
@@ -0,0 +1,172 @@
+//===-- X86MCAsmInfo.cpp - X86 asm properties -----------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the X86MCAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86MCAsmInfo.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ELF.h"
+using namespace llvm;
+
+enum AsmWriterFlavorTy {
+ // Note: This numbering has to match the GCC assembler dialects for inline
+ // asm alternatives to work right.
+ ATT = 0, Intel = 1
+};
+
+static cl::opt<AsmWriterFlavorTy>
+AsmWriterFlavor("x86-asm-syntax", cl::init(ATT),
+ cl::desc("Choose style of code to emit from X86 backend:"),
+ cl::values(clEnumValN(ATT, "att", "Emit AT&T-style assembly"),
+ clEnumValN(Intel, "intel", "Emit Intel-style assembly"),
+ clEnumValEnd));
+
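+// This flavor is usually selected on the tool command line, e.g.
+// "llc -x86-asm-syntax=intel foo.ll"; AT&T syntax stays the default via
+// cl::init(ATT) above.
+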
+static cl::opt<bool>
+MarkedJTDataRegions("mark-data-regions", cl::init(false),
+ cl::desc("Mark code section jump table data regions."),
+ cl::Hidden);
+
+void X86MCAsmInfoDarwin::anchor() { }
+
+X86MCAsmInfoDarwin::X86MCAsmInfoDarwin(const Triple &T) {
+ bool is64Bit = T.getArch() == Triple::x86_64;
+ if (is64Bit)
+ PointerSize = CalleeSaveStackSlotSize = 8;
+
+ AssemblerDialect = AsmWriterFlavor;
+
+ TextAlignFillValue = 0x90;
+
+ if (!is64Bit)
+ Data64bitsDirective = nullptr; // we can't emit a 64-bit unit
+
+ // Use ## as a comment string so that .s files generated by llvm can go
+ // through the GCC preprocessor without causing an error. This is needed
+ // because "clang foo.s" runs the C preprocessor, which is usually reserved
+ // for .S files on other systems. Perhaps this is because the file system
+ // wasn't always case preserving or something.
+ CommentString = "##";
+
+ SupportsDebugInformation = true;
+ UseDataRegionDirectives = MarkedJTDataRegions;
+
+ // Exceptions handling
+ ExceptionsType = ExceptionHandling::DwarfCFI;
+
+ // old assembler lacks some directives
+ // FIXME: this should really be a check on the assembler characteristics
+ // rather than OS version
+ if (T.isMacOSX() && T.isMacOSXVersionLT(10, 6))
+ HasWeakDefCanBeHiddenDirective = false;
+
+ // Assume ld64 is new enough that the abs-ified FDE relocs may be used
+ // (actually, must, since otherwise the non-extern relocations we produce
+ // overwhelm ld64's tiny little mind and it fails).
+ DwarfFDESymbolsUseAbsDiff = true;
+
+ UseIntegratedAssembler = true;
+}
+
+X86_64MCAsmInfoDarwin::X86_64MCAsmInfoDarwin(const Triple &Triple)
+ : X86MCAsmInfoDarwin(Triple) {
+}
+
+void X86ELFMCAsmInfo::anchor() { }
+
+X86ELFMCAsmInfo::X86ELFMCAsmInfo(const Triple &T) {
+ bool is64Bit = T.getArch() == Triple::x86_64;
+ bool isX32 = T.getEnvironment() == Triple::GNUX32;
+
+ // For ELF, x86-64 pointer size depends on the ABI.
+ // For x86-64 without the x32 ABI, pointer size is 8. For x86 and for x86-64
+ // with the x32 ABI, pointer size remains the default 4.
+ PointerSize = (is64Bit && !isX32) ? 8 : 4;
+
+ // OTOH, stack slot size is always 8 for x86-64, even with the x32 ABI.
+ CalleeSaveStackSlotSize = is64Bit ? 8 : 4;
+
+ AssemblerDialect = AsmWriterFlavor;
+
+ TextAlignFillValue = 0x90;
+
+ // Debug Information
+ SupportsDebugInformation = true;
+
+ // Exceptions handling
+ ExceptionsType = ExceptionHandling::DwarfCFI;
+
+ // Always enable the integrated assembler by default.
+  // Clang also enables it when the OS is Solaris, but that is redundant here.
+ UseIntegratedAssembler = true;
+}
+
+const MCExpr *
+X86_64MCAsmInfoDarwin::getExprForPersonalitySymbol(const MCSymbol *Sym,
+ unsigned Encoding,
+ MCStreamer &Streamer) const {
+ MCContext &Context = Streamer.getContext();
+ const MCExpr *Res =
+ MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_GOTPCREL, Context);
+ const MCExpr *Four = MCConstantExpr::create(4, Context);
+ return MCBinaryExpr::createAdd(Res, Four, Context);
+}
+
+void X86MCAsmInfoMicrosoft::anchor() { }
+
+X86MCAsmInfoMicrosoft::X86MCAsmInfoMicrosoft(const Triple &Triple) {
+ if (Triple.getArch() == Triple::x86_64) {
+ PrivateGlobalPrefix = ".L";
+ PrivateLabelPrefix = ".L";
+ PointerSize = 8;
+ WinEHEncodingType = WinEH::EncodingType::Itanium;
+ } else {
+    // 32-bit X86 doesn't use CFI, so this isn't a real encoding type. It's
+    // just a placeholder that the Windows EHStreamer looks for to suppress
+ // output. In particular, usesWindowsCFI() returns false.
+ WinEHEncodingType = WinEH::EncodingType::X86;
+ }
+
+ ExceptionsType = ExceptionHandling::WinEH;
+
+ AssemblerDialect = AsmWriterFlavor;
+
+ TextAlignFillValue = 0x90;
+
+ AllowAtInName = true;
+
+ UseIntegratedAssembler = true;
+}
+
+void X86MCAsmInfoGNUCOFF::anchor() { }
+
+X86MCAsmInfoGNUCOFF::X86MCAsmInfoGNUCOFF(const Triple &Triple) {
+ assert(Triple.isOSWindows() && "Windows is the only supported COFF target");
+ if (Triple.getArch() == Triple::x86_64) {
+ PrivateGlobalPrefix = ".L";
+ PrivateLabelPrefix = ".L";
+ PointerSize = 8;
+ WinEHEncodingType = WinEH::EncodingType::Itanium;
+ ExceptionsType = ExceptionHandling::WinEH;
+ } else {
+ ExceptionsType = ExceptionHandling::DwarfCFI;
+ }
+
+ AssemblerDialect = AsmWriterFlavor;
+
+ TextAlignFillValue = 0x90;
+
+ UseIntegratedAssembler = true;
+}
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h
new file mode 100644
index 0000000..30d5c80
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h
@@ -0,0 +1,61 @@
+//===-- X86MCAsmInfo.h - X86 asm properties --------------------*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the X86MCAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_MCTARGETDESC_X86MCASMINFO_H
+#define LLVM_LIB_TARGET_X86_MCTARGETDESC_X86MCASMINFO_H
+
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCAsmInfoCOFF.h"
+#include "llvm/MC/MCAsmInfoDarwin.h"
+#include "llvm/MC/MCAsmInfoELF.h"
+
+namespace llvm {
+class Triple;
+
+class X86MCAsmInfoDarwin : public MCAsmInfoDarwin {
+ virtual void anchor();
+
+public:
+ explicit X86MCAsmInfoDarwin(const Triple &Triple);
+};
+
+struct X86_64MCAsmInfoDarwin : public X86MCAsmInfoDarwin {
+ explicit X86_64MCAsmInfoDarwin(const Triple &Triple);
+ const MCExpr *
+ getExprForPersonalitySymbol(const MCSymbol *Sym, unsigned Encoding,
+ MCStreamer &Streamer) const override;
+};
+
+class X86ELFMCAsmInfo : public MCAsmInfoELF {
+ void anchor() override;
+
+public:
+ explicit X86ELFMCAsmInfo(const Triple &Triple);
+};
+
+class X86MCAsmInfoMicrosoft : public MCAsmInfoMicrosoft {
+ void anchor() override;
+
+public:
+ explicit X86MCAsmInfoMicrosoft(const Triple &Triple);
+};
+
+class X86MCAsmInfoGNUCOFF : public MCAsmInfoGNUCOFF {
+ void anchor() override;
+
+public:
+ explicit X86MCAsmInfoGNUCOFF(const Triple &Triple);
+};
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
new file mode 100644
index 0000000..dfab6ec
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
@@ -0,0 +1,1506 @@
+//===-- X86MCCodeEmitter.cpp - Convert X86 code to machine code -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the X86MCCodeEmitter class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "MCTargetDesc/X86BaseInfo.h"
+#include "MCTargetDesc/X86FixupKinds.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mccodeemitter"
+
+namespace {
+class X86MCCodeEmitter : public MCCodeEmitter {
+ X86MCCodeEmitter(const X86MCCodeEmitter &) = delete;
+ void operator=(const X86MCCodeEmitter &) = delete;
+ const MCInstrInfo &MCII;
+ MCContext &Ctx;
+public:
+ X86MCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx)
+ : MCII(mcii), Ctx(ctx) {
+ }
+
+ ~X86MCCodeEmitter() override {}
+
+ bool is64BitMode(const MCSubtargetInfo &STI) const {
+ return STI.getFeatureBits()[X86::Mode64Bit];
+ }
+
+ bool is32BitMode(const MCSubtargetInfo &STI) const {
+ return STI.getFeatureBits()[X86::Mode32Bit];
+ }
+
+ bool is16BitMode(const MCSubtargetInfo &STI) const {
+ return STI.getFeatureBits()[X86::Mode16Bit];
+ }
+
+ /// Is16BitMemOperand - Return true if the specified instruction has
+  /// a 16-bit memory operand. Op specifies the operand # of the memory
+  /// operand.
+ bool Is16BitMemOperand(const MCInst &MI, unsigned Op,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &BaseReg = MI.getOperand(Op+X86::AddrBaseReg);
+ const MCOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg);
+ const MCOperand &Disp = MI.getOperand(Op+X86::AddrDisp);
+
+ if (is16BitMode(STI) && BaseReg.getReg() == 0 &&
+ Disp.isImm() && Disp.getImm() < 0x10000)
+ return true;
+ if ((BaseReg.getReg() != 0 &&
+ X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg.getReg())) ||
+ (IndexReg.getReg() != 0 &&
+ X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg.getReg())))
+ return true;
+ return false;
+ }
+
+ unsigned GetX86RegNum(const MCOperand &MO) const {
+ return Ctx.getRegisterInfo()->getEncodingValue(MO.getReg()) & 0x7;
+ }
+
+ // On regular x86, both XMM0-XMM7 and XMM8-XMM15 are encoded in the range
+ // 0-7 and the difference between the 2 groups is given by the REX prefix.
+  // In the VEX prefix, registers are seen sequentially from 0-15 and encoded
+ // in 1's complement form, example:
+ //
+ // ModRM field => XMM9 => 1
+ // VEX.VVVV => XMM9 => ~9
+ //
+ // See table 4-35 of Intel AVX Programming Reference for details.
+ unsigned char getVEXRegisterEncoding(const MCInst &MI,
+ unsigned OpNum) const {
+ unsigned SrcReg = MI.getOperand(OpNum).getReg();
+ unsigned SrcRegNum = GetX86RegNum(MI.getOperand(OpNum));
+ if (X86II::isX86_64ExtendedReg(SrcReg))
+ SrcRegNum |= 8;
+
+ // The registers represented through VEX_VVVV should
+ // be encoded in 1's complement form.
+ return (~SrcRegNum) & 0xf;
+ }
+
+ unsigned char getWriteMaskRegisterEncoding(const MCInst &MI,
+ unsigned OpNum) const {
+ assert(X86::K0 != MI.getOperand(OpNum).getReg() &&
+ "Invalid mask register as write-mask!");
+ unsigned MaskRegNum = GetX86RegNum(MI.getOperand(OpNum));
+ return MaskRegNum;
+ }
+
+ void EmitByte(unsigned char C, unsigned &CurByte, raw_ostream &OS) const {
+ OS << (char)C;
+ ++CurByte;
+ }
+
+ void EmitConstant(uint64_t Val, unsigned Size, unsigned &CurByte,
+ raw_ostream &OS) const {
+ // Output the constant in little endian byte order.
+ for (unsigned i = 0; i != Size; ++i) {
+ EmitByte(Val & 255, CurByte, OS);
+ Val >>= 8;
+ }
+ }
+
+ void EmitImmediate(const MCOperand &Disp, SMLoc Loc,
+ unsigned ImmSize, MCFixupKind FixupKind,
+ unsigned &CurByte, raw_ostream &OS,
+ SmallVectorImpl<MCFixup> &Fixups,
+ int ImmOffset = 0) const;
+
+ inline static unsigned char ModRMByte(unsigned Mod, unsigned RegOpcode,
+ unsigned RM) {
+ assert(Mod < 4 && RegOpcode < 8 && RM < 8 && "ModRM Fields out of range!");
+ return RM | (RegOpcode << 3) | (Mod << 6);
+ }
+
+ void EmitRegModRMByte(const MCOperand &ModRMReg, unsigned RegOpcodeFld,
+ unsigned &CurByte, raw_ostream &OS) const {
+ EmitByte(ModRMByte(3, RegOpcodeFld, GetX86RegNum(ModRMReg)), CurByte, OS);
+ }
+
+ void EmitSIBByte(unsigned SS, unsigned Index, unsigned Base,
+ unsigned &CurByte, raw_ostream &OS) const {
+ // SIB byte is in the same format as the ModRMByte.
+ EmitByte(ModRMByte(SS, Index, Base), CurByte, OS);
+ }
+
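+  // To illustrate the shared ModRM/SIB packing (operands here are made up
+  // for the example): ModRMByte(3, 2, 1) yields 0b11'010'001 = 0xD1, i.e.
+  // mod=3 (register form), reg/opcode=2, r/m=1. Likewise EmitSIBByte(1, 0, 4)
+  // emits 0x44: scale=2 (SS=1), index=EAX, base=ESP.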
+
+ void EmitMemModRMByte(const MCInst &MI, unsigned Op,
+ unsigned RegOpcodeField,
+ uint64_t TSFlags, unsigned &CurByte, raw_ostream &OS,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ void encodeInstruction(const MCInst &MI, raw_ostream &OS,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const override;
+
+ void EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, int MemOperand,
+ const MCInst &MI, const MCInstrDesc &Desc,
+ raw_ostream &OS) const;
+
+ void EmitSegmentOverridePrefix(unsigned &CurByte, unsigned SegOperand,
+ const MCInst &MI, raw_ostream &OS) const;
+
+ void EmitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, int MemOperand,
+ const MCInst &MI, const MCInstrDesc &Desc,
+ const MCSubtargetInfo &STI,
+ raw_ostream &OS) const;
+};
+
+} // end anonymous namespace
+
+MCCodeEmitter *llvm::createX86MCCodeEmitter(const MCInstrInfo &MCII,
+ const MCRegisterInfo &MRI,
+ MCContext &Ctx) {
+ return new X86MCCodeEmitter(MCII, Ctx);
+}
+
+/// isDisp8 - Return true if this signed displacement fits in an 8-bit
+/// sign-extended field.
+static bool isDisp8(int Value) {
+ return Value == (signed char)Value;
+}
+
+/// isCDisp8 - Return true if this signed displacement fits in an 8-bit
+/// compressed displacement field.
+static bool isCDisp8(uint64_t TSFlags, int Value, int& CValue) {
+ assert(((TSFlags & X86II::EncodingMask) == X86II::EVEX) &&
+ "Compressed 8-bit displacement is only valid for EVEX inst.");
+
+ unsigned CD8_Scale =
+ (TSFlags & X86II::CD8_Scale_Mask) >> X86II::CD8_Scale_Shift;
+ if (CD8_Scale == 0) {
+ CValue = Value;
+ return isDisp8(Value);
+ }
+
+ unsigned Mask = CD8_Scale - 1;
+ assert((CD8_Scale & Mask) == 0 && "Invalid memory object size.");
+ if (Value & Mask) // Unaligned offset
+ return false;
+ Value /= (int)CD8_Scale;
+ bool Ret = (Value == (signed char)Value);
+
+ if (Ret)
+ CValue = Value;
+ return Ret;
+}
+
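+// For example (values chosen purely for illustration): with CD8_Scale == 64,
+// as for a full 512-bit memory operand, a displacement of 128 compresses to
+// CValue = 2 and fits in a disp8, while a displacement of 96 is rejected
+// because it is not 64-byte aligned (96 & 63 != 0).
+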
+/// getImmFixupKind - Return the appropriate fixup kind to use for an immediate
+/// in an instruction with the specified TSFlags.
+static MCFixupKind getImmFixupKind(uint64_t TSFlags) {
+ unsigned Size = X86II::getSizeOfImm(TSFlags);
+ bool isPCRel = X86II::isImmPCRel(TSFlags);
+
+ if (X86II::isImmSigned(TSFlags)) {
+ switch (Size) {
+ default: llvm_unreachable("Unsupported signed fixup size!");
+ case 4: return MCFixupKind(X86::reloc_signed_4byte);
+ }
+ }
+ return MCFixup::getKindForSize(Size, isPCRel);
+}
+
+/// Is32BitMemOperand - Return true if the specified instruction has
+/// a 32-bit memory operand. Op specifies the operand # of the memoperand.
+static bool Is32BitMemOperand(const MCInst &MI, unsigned Op) {
+ const MCOperand &BaseReg = MI.getOperand(Op+X86::AddrBaseReg);
+ const MCOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg);
+
+ if ((BaseReg.getReg() != 0 &&
+ X86MCRegisterClasses[X86::GR32RegClassID].contains(BaseReg.getReg())) ||
+ (IndexReg.getReg() != 0 &&
+ X86MCRegisterClasses[X86::GR32RegClassID].contains(IndexReg.getReg())))
+ return true;
+ return false;
+}
+
+/// Is64BitMemOperand - Return true if the specified instruction has
+/// a 64-bit memory operand. Op specifies the operand # of the memoperand.
+#ifndef NDEBUG
+static bool Is64BitMemOperand(const MCInst &MI, unsigned Op) {
+ const MCOperand &BaseReg = MI.getOperand(Op+X86::AddrBaseReg);
+ const MCOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg);
+
+ if ((BaseReg.getReg() != 0 &&
+ X86MCRegisterClasses[X86::GR64RegClassID].contains(BaseReg.getReg())) ||
+ (IndexReg.getReg() != 0 &&
+ X86MCRegisterClasses[X86::GR64RegClassID].contains(IndexReg.getReg())))
+ return true;
+ return false;
+}
+#endif
+
+/// StartsWithGlobalOffsetTable - Check if this expression starts with
+/// _GLOBAL_OFFSET_TABLE_ and if it is of the form
+/// _GLOBAL_OFFSET_TABLE_-symbol. This is needed to support PIC on ELF
+/// i386 as _GLOBAL_OFFSET_TABLE_ is magical. We check only the simple cases
+/// that are known to be used: _GLOBAL_OFFSET_TABLE_ by itself or at the start
+/// of a binary expression.
+enum GlobalOffsetTableExprKind {
+ GOT_None,
+ GOT_Normal,
+ GOT_SymDiff
+};
+static GlobalOffsetTableExprKind
+StartsWithGlobalOffsetTable(const MCExpr *Expr) {
+ const MCExpr *RHS = nullptr;
+ if (Expr->getKind() == MCExpr::Binary) {
+ const MCBinaryExpr *BE = static_cast<const MCBinaryExpr *>(Expr);
+ Expr = BE->getLHS();
+ RHS = BE->getRHS();
+ }
+
+ if (Expr->getKind() != MCExpr::SymbolRef)
+ return GOT_None;
+
+ const MCSymbolRefExpr *Ref = static_cast<const MCSymbolRefExpr*>(Expr);
+ const MCSymbol &S = Ref->getSymbol();
+ if (S.getName() != "_GLOBAL_OFFSET_TABLE_")
+ return GOT_None;
+ if (RHS && RHS->getKind() == MCExpr::SymbolRef)
+ return GOT_SymDiff;
+ return GOT_Normal;
+}
+
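+// For instance, "_GLOBAL_OFFSET_TABLE_" on its own classifies as GOT_Normal,
+// while a difference such as "_GLOBAL_OFFSET_TABLE_-.Lsym" (the form i386 PIC
+// prologues use when materializing the GOT address; ".Lsym" is a placeholder
+// label) classifies as GOT_SymDiff.
+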
+static bool HasSecRelSymbolRef(const MCExpr *Expr) {
+ if (Expr->getKind() == MCExpr::SymbolRef) {
+ const MCSymbolRefExpr *Ref = static_cast<const MCSymbolRefExpr*>(Expr);
+ return Ref->getKind() == MCSymbolRefExpr::VK_SECREL;
+ }
+ return false;
+}
+
+void X86MCCodeEmitter::
+EmitImmediate(const MCOperand &DispOp, SMLoc Loc, unsigned Size,
+ MCFixupKind FixupKind, unsigned &CurByte, raw_ostream &OS,
+ SmallVectorImpl<MCFixup> &Fixups, int ImmOffset) const {
+ const MCExpr *Expr = nullptr;
+ if (DispOp.isImm()) {
+ // If this is a simple integer displacement that doesn't require a
+ // relocation, emit it now.
+ if (FixupKind != FK_PCRel_1 &&
+ FixupKind != FK_PCRel_2 &&
+ FixupKind != FK_PCRel_4) {
+ EmitConstant(DispOp.getImm()+ImmOffset, Size, CurByte, OS);
+ return;
+ }
+ Expr = MCConstantExpr::create(DispOp.getImm(), Ctx);
+ } else {
+ Expr = DispOp.getExpr();
+ }
+
+ // If we have an immoffset, add it to the expression.
+ if ((FixupKind == FK_Data_4 ||
+ FixupKind == FK_Data_8 ||
+ FixupKind == MCFixupKind(X86::reloc_signed_4byte))) {
+ GlobalOffsetTableExprKind Kind = StartsWithGlobalOffsetTable(Expr);
+ if (Kind != GOT_None) {
+ assert(ImmOffset == 0);
+
+ if (Size == 8) {
+ FixupKind = MCFixupKind(X86::reloc_global_offset_table8);
+ } else {
+ assert(Size == 4);
+ FixupKind = MCFixupKind(X86::reloc_global_offset_table);
+ }
+
+ if (Kind == GOT_Normal)
+ ImmOffset = CurByte;
+ } else if (Expr->getKind() == MCExpr::SymbolRef) {
+ if (HasSecRelSymbolRef(Expr)) {
+ FixupKind = MCFixupKind(FK_SecRel_4);
+ }
+ } else if (Expr->getKind() == MCExpr::Binary) {
+ const MCBinaryExpr *Bin = static_cast<const MCBinaryExpr*>(Expr);
+ if (HasSecRelSymbolRef(Bin->getLHS())
+ || HasSecRelSymbolRef(Bin->getRHS())) {
+ FixupKind = MCFixupKind(FK_SecRel_4);
+ }
+ }
+ }
+
+ // If the fixup is pc-relative, we need to bias the value to be relative to
+ // the start of the field, not the end of the field.
+ if (FixupKind == FK_PCRel_4 ||
+ FixupKind == MCFixupKind(X86::reloc_riprel_4byte) ||
+ FixupKind == MCFixupKind(X86::reloc_riprel_4byte_movq_load))
+ ImmOffset -= 4;
+ if (FixupKind == FK_PCRel_2)
+ ImmOffset -= 2;
+ if (FixupKind == FK_PCRel_1)
+ ImmOffset -= 1;
+
+ if (ImmOffset)
+ Expr = MCBinaryExpr::createAdd(Expr, MCConstantExpr::create(ImmOffset, Ctx),
+ Ctx);
+
+ // Emit a symbolic constant as a fixup and 4 zeros.
+ Fixups.push_back(MCFixup::create(CurByte, Expr, FixupKind, Loc));
+ EmitConstant(0, Size, CurByte, OS);
+}
+
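+// A note on the pc-relative bias in EmitImmediate: MC resolves a PCRel fixup
+// against the offset of the fixup field itself, while x86 displacements are
+// relative to the end of the instruction's immediate. Folding -Size into
+// ImmOffset (e.g. -4 for FK_PCRel_4) bakes that difference into the addend.
+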
+void X86MCCodeEmitter::EmitMemModRMByte(const MCInst &MI, unsigned Op,
+ unsigned RegOpcodeField,
+ uint64_t TSFlags, unsigned &CurByte,
+ raw_ostream &OS,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const{
+ const MCOperand &Disp = MI.getOperand(Op+X86::AddrDisp);
+ const MCOperand &Base = MI.getOperand(Op+X86::AddrBaseReg);
+ const MCOperand &Scale = MI.getOperand(Op+X86::AddrScaleAmt);
+ const MCOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg);
+ unsigned BaseReg = Base.getReg();
+ bool HasEVEX = (TSFlags & X86II::EncodingMask) == X86II::EVEX;
+
+ // Handle %rip relative addressing.
+ if (BaseReg == X86::RIP) { // [disp32+RIP] in X86-64 mode
+ assert(is64BitMode(STI) && "Rip-relative addressing requires 64-bit mode");
+ assert(IndexReg.getReg() == 0 && "Invalid rip-relative address");
+ EmitByte(ModRMByte(0, RegOpcodeField, 5), CurByte, OS);
+
+ unsigned FixupKind = X86::reloc_riprel_4byte;
+
+ // movq loads are handled with a special relocation form which allows the
+ // linker to eliminate some loads for GOT references which end up in the
+ // same linkage unit.
+ if (MI.getOpcode() == X86::MOV64rm)
+ FixupKind = X86::reloc_riprel_4byte_movq_load;
+
+ // rip-relative addressing is actually relative to the *next* instruction.
+ // Since an immediate can follow the mod/rm byte for an instruction, this
+ // means that we need to bias the immediate field of the instruction with
+ // the size of the immediate field. If we have this case, add it into the
+ // expression to emit.
+ int ImmSize = X86II::hasImm(TSFlags) ? X86II::getSizeOfImm(TSFlags) : 0;
+
+ EmitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(FixupKind),
+ CurByte, OS, Fixups, -ImmSize);
+ return;
+ }
+
+ unsigned BaseRegNo = BaseReg ? GetX86RegNum(Base) : -1U;
+
+ // 16-bit addressing forms of the ModR/M byte have a different encoding for
+ // the R/M field and are far more limited in which registers can be used.
+ if (Is16BitMemOperand(MI, Op, STI)) {
+ if (BaseReg) {
+ // For 32-bit addressing, the row and column values in Table 2-2 are
+ // basically the same. It's AX/CX/DX/BX/SP/BP/SI/DI in that order, with
+ // some special cases. And GetX86RegNum reflects that numbering.
+ // For 16-bit addressing it's more fun, as shown in the SDM Vol 2A,
+ // Table 2-1 "16-Bit Addressing Forms with the ModR/M byte". We can only
+ // use SI/DI/BP/BX, which have "row" values 4-7 in no particular order,
+ // while values 0-3 indicate the allowed combinations (base+index) of
+ // those: 0 for BX+SI, 1 for BX+DI, 2 for BP+SI, 3 for BP+DI.
+ //
+ // R16Table[] is a lookup from the normal RegNo, to the row values from
+ // Table 2-1 for 16-bit addressing modes. Where zero means disallowed.
+ static const unsigned R16Table[] = { 0, 0, 0, 7, 0, 6, 4, 5 };
+ unsigned RMfield = R16Table[BaseRegNo];
+
+ assert(RMfield && "invalid 16-bit base register");
+
+ if (IndexReg.getReg()) {
+ unsigned IndexReg16 = R16Table[GetX86RegNum(IndexReg)];
+
+ assert(IndexReg16 && "invalid 16-bit index register");
+ // We must have one of SI/DI (4,5), and one of BP/BX (6,7).
+ assert(((IndexReg16 ^ RMfield) & 2) &&
+ "invalid 16-bit base/index register combination");
+ assert(Scale.getImm() == 1 &&
+ "invalid scale for 16-bit memory reference");
+
+ // Allow base/index to appear in either order (although GAS doesn't).
+ if (IndexReg16 & 2)
+ RMfield = (RMfield & 1) | ((7 - IndexReg16) << 1);
+ else
+ RMfield = (IndexReg16 & 1) | ((7 - RMfield) << 1);
+ }
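+      // Worked example (hypothetical operands): for a [BX+SI] reference,
+      // BX gives RMfield = R16Table[3] = 7 and SI gives IndexReg16 =
+      // R16Table[6] = 4; the swap logic above folds them to RMfield = 0,
+      // matching the "BX+SI" row of SDM Table 2-1.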
+
+ if (Disp.isImm() && isDisp8(Disp.getImm())) {
+ if (Disp.getImm() == 0 && BaseRegNo != N86::EBP) {
+ // There is no displacement; just the register.
+ EmitByte(ModRMByte(0, RegOpcodeField, RMfield), CurByte, OS);
+ return;
+ }
+ // Use the [REG]+disp8 form, including for [BP] which cannot be encoded.
+ EmitByte(ModRMByte(1, RegOpcodeField, RMfield), CurByte, OS);
+ EmitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups);
+ return;
+ }
+ // This is the [REG]+disp16 case.
+ EmitByte(ModRMByte(2, RegOpcodeField, RMfield), CurByte, OS);
+ } else {
+ // There is no BaseReg; this is the plain [disp16] case.
+ EmitByte(ModRMByte(0, RegOpcodeField, 6), CurByte, OS);
+ }
+
+ // Emit 16-bit displacement for plain disp16 or [REG]+disp16 cases.
+ EmitImmediate(Disp, MI.getLoc(), 2, FK_Data_2, CurByte, OS, Fixups);
+ return;
+ }
+
+ // Determine whether a SIB byte is needed.
+ // If no BaseReg, issue a RIP relative instruction only if the MCE can
+ // resolve addresses on-the-fly, otherwise use SIB (Intel Manual 2A, table
+ // 2-7) and absolute references.
+
+ if (// The SIB byte must be used if there is an index register.
+ IndexReg.getReg() == 0 &&
+ // The SIB byte must be used if the base is ESP/RSP/R12, all of which
+ // encode to an R/M value of 4, which indicates that a SIB byte is
+ // present.
+ BaseRegNo != N86::ESP &&
+ // If there is no base register and we're in 64-bit mode, we need a SIB
+ // byte to emit an addr that is just 'disp32' (the non-RIP relative form).
+ (!is64BitMode(STI) || BaseReg != 0)) {
+
+ if (BaseReg == 0) { // [disp32] in X86-32 mode
+ EmitByte(ModRMByte(0, RegOpcodeField, 5), CurByte, OS);
+ EmitImmediate(Disp, MI.getLoc(), 4, FK_Data_4, CurByte, OS, Fixups);
+ return;
+ }
+
+ // If the base is not EBP/ESP and there is no displacement, use simple
+ // indirect register encoding, this handles addresses like [EAX]. The
+ // encoding for [EBP] with no displacement means [disp32] so we handle it
+ // by emitting a displacement of 0 below.
+ if (Disp.isImm() && Disp.getImm() == 0 && BaseRegNo != N86::EBP) {
+ EmitByte(ModRMByte(0, RegOpcodeField, BaseRegNo), CurByte, OS);
+ return;
+ }
+
+ // Otherwise, if the displacement fits in a byte, encode as [REG+disp8].
+ if (Disp.isImm()) {
+ if (!HasEVEX && isDisp8(Disp.getImm())) {
+ EmitByte(ModRMByte(1, RegOpcodeField, BaseRegNo), CurByte, OS);
+ EmitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups);
+ return;
+ }
+ // Try EVEX compressed 8-bit displacement first; if failed, fall back to
+ // 32-bit displacement.
+ int CDisp8 = 0;
+ if (HasEVEX && isCDisp8(TSFlags, Disp.getImm(), CDisp8)) {
+ EmitByte(ModRMByte(1, RegOpcodeField, BaseRegNo), CurByte, OS);
+ EmitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups,
+ CDisp8 - Disp.getImm());
+ return;
+ }
+ }
+
+ // Otherwise, emit the most general non-SIB encoding: [REG+disp32]
+ EmitByte(ModRMByte(2, RegOpcodeField, BaseRegNo), CurByte, OS);
+ EmitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(X86::reloc_signed_4byte),
+ CurByte, OS, Fixups);
+ return;
+ }
+
+ // We need a SIB byte, so start by outputting the ModR/M byte first
+ assert(IndexReg.getReg() != X86::ESP &&
+ IndexReg.getReg() != X86::RSP && "Cannot use ESP as index reg!");
+
+ bool ForceDisp32 = false;
+ bool ForceDisp8 = false;
+ int CDisp8 = 0;
+ int ImmOffset = 0;
+ if (BaseReg == 0) {
+ // If there is no base register, we emit the special case SIB byte with
+ // MOD=0, BASE=5, to JUST get the index, scale, and displacement.
+ EmitByte(ModRMByte(0, RegOpcodeField, 4), CurByte, OS);
+ ForceDisp32 = true;
+ } else if (!Disp.isImm()) {
+ // Emit the normal disp32 encoding.
+ EmitByte(ModRMByte(2, RegOpcodeField, 4), CurByte, OS);
+ ForceDisp32 = true;
+ } else if (Disp.getImm() == 0 &&
+ // Base reg can't be anything that ends up with '5' as the base
+ // reg, it is the magic [*] nomenclature that indicates no base.
+ BaseRegNo != N86::EBP) {
+ // Emit no displacement ModR/M byte
+ EmitByte(ModRMByte(0, RegOpcodeField, 4), CurByte, OS);
+ } else if (!HasEVEX && isDisp8(Disp.getImm())) {
+ // Emit the disp8 encoding.
+ EmitByte(ModRMByte(1, RegOpcodeField, 4), CurByte, OS);
+ ForceDisp8 = true; // Make sure to force 8 bit disp if Base=EBP
+ } else if (HasEVEX && isCDisp8(TSFlags, Disp.getImm(), CDisp8)) {
+ // Emit the disp8 encoding.
+ EmitByte(ModRMByte(1, RegOpcodeField, 4), CurByte, OS);
+ ForceDisp8 = true; // Make sure to force 8 bit disp if Base=EBP
+ ImmOffset = CDisp8 - Disp.getImm();
+ } else {
+ // Emit the normal disp32 encoding.
+ EmitByte(ModRMByte(2, RegOpcodeField, 4), CurByte, OS);
+ }
+
+ // Calculate what the SS field value should be...
+ static const unsigned SSTable[] = { ~0U, 0, 1, ~0U, 2, ~0U, ~0U, ~0U, 3 };
+ unsigned SS = SSTable[Scale.getImm()];
+
+ if (BaseReg == 0) {
+ // Handle the SIB byte for the case where there is no base, see Intel
+ // Manual 2A, table 2-7. The displacement has already been output.
+ unsigned IndexRegNo;
+ if (IndexReg.getReg())
+ IndexRegNo = GetX86RegNum(IndexReg);
+ else // Examples: [ESP+1*<noreg>+4] or [scaled idx]+disp32 (MOD=0,BASE=5)
+ IndexRegNo = 4;
+ EmitSIBByte(SS, IndexRegNo, 5, CurByte, OS);
+ } else {
+ unsigned IndexRegNo;
+ if (IndexReg.getReg())
+ IndexRegNo = GetX86RegNum(IndexReg);
+ else
+ IndexRegNo = 4; // For example [ESP+1*<noreg>+4]
+ EmitSIBByte(SS, IndexRegNo, GetX86RegNum(Base), CurByte, OS);
+ }
+
+ // Do we need to output a displacement?
+ if (ForceDisp8)
+ EmitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups, ImmOffset);
+ else if (ForceDisp32 || Disp.getImm() != 0)
+ EmitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(X86::reloc_signed_4byte),
+ CurByte, OS, Fixups);
+}
+
+/// EmitVEXOpcodePrefix - AVX instructions are encoded using an opcode prefix
+/// called VEX.
+void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
+ int MemOperand, const MCInst &MI,
+ const MCInstrDesc &Desc,
+ raw_ostream &OS) const {
+ assert(!(TSFlags & X86II::LOCK) && "Can't have LOCK VEX.");
+
+ uint64_t Encoding = TSFlags & X86II::EncodingMask;
+ bool HasEVEX_K = TSFlags & X86II::EVEX_K;
+ bool HasVEX_4V = TSFlags & X86II::VEX_4V;
+ bool HasVEX_4VOp3 = TSFlags & X86II::VEX_4VOp3;
+ bool HasMemOp4 = TSFlags & X86II::MemOp4;
+ bool HasEVEX_RC = TSFlags & X86II::EVEX_RC;
+
+  // VEX_R: opcode extension equivalent to REX.R in
+ // 1's complement (inverted) form
+ //
+ // 1: Same as REX_R=0 (must be 1 in 32-bit mode)
+ // 0: Same as REX_R=1 (64 bit mode only)
+ //
+ unsigned char VEX_R = 0x1;
+ unsigned char EVEX_R2 = 0x1;
+
+ // VEX_X: equivalent to REX.X, only used when a
+ // register is used for index in SIB Byte.
+ //
+ // 1: Same as REX.X=0 (must be 1 in 32-bit mode)
+ // 0: Same as REX.X=1 (64-bit mode only)
+ unsigned char VEX_X = 0x1;
+
+ // VEX_B:
+ //
+ // 1: Same as REX_B=0 (ignored in 32-bit mode)
+ // 0: Same as REX_B=1 (64 bit mode only)
+ //
+ unsigned char VEX_B = 0x1;
+
+ // VEX_W: opcode specific (use like REX.W, or used for
+ // opcode extension, or ignored, depending on the opcode byte)
+ unsigned char VEX_W = 0;
+
+  // VEX_5M (VEX m-mmmmm field):
+  //
+  //  0b00000: Reserved for future use
+  //  0b00001: implied 0F leading opcode
+  //  0b00010: implied 0F 38 leading opcode bytes
+  //  0b00011: implied 0F 3A leading opcode bytes
+  //  0b01000: XOP map select - 08h instructions with imm byte
+  //  0b01001: XOP map select - 09h instructions with no imm byte
+  //  0b01010: XOP map select - 0Ah instructions with imm dword
+  //  all other values: Reserved for future use
+  unsigned char VEX_5M = 0;
+
+ // VEX_4V (VEX vvvv field): a register specifier
+ // (in 1's complement form) or 1111 if unused.
+ unsigned char VEX_4V = 0xf;
+ unsigned char EVEX_V2 = 0x1;
+
+ // VEX_L (Vector Length):
+ //
+ // 0: scalar or 128-bit vector
+ // 1: 256-bit vector
+ //
+ unsigned char VEX_L = 0;
+ unsigned char EVEX_L2 = 0;
+
+ // VEX_PP: opcode extension providing equivalent
+ // functionality of a SIMD prefix
+ //
+ // 0b00: None
+ // 0b01: 66
+ // 0b10: F3
+ // 0b11: F2
+ //
+ unsigned char VEX_PP = 0;
+
+ // EVEX_U
+ unsigned char EVEX_U = 1; // Always '1' so far
+
+ // EVEX_z
+ unsigned char EVEX_z = 0;
+
+ // EVEX_b
+ unsigned char EVEX_b = 0;
+
+ // EVEX_rc
+ unsigned char EVEX_rc = 0;
+
+ // EVEX_aaa
+ unsigned char EVEX_aaa = 0;
+
+ bool EncodeRC = false;
+
+ if (TSFlags & X86II::VEX_W)
+ VEX_W = 1;
+
+ if (TSFlags & X86II::VEX_L)
+ VEX_L = 1;
+ if (TSFlags & X86II::EVEX_L2)
+ EVEX_L2 = 1;
+
+ if (HasEVEX_K && (TSFlags & X86II::EVEX_Z))
+ EVEX_z = 1;
+
+ if ((TSFlags & X86II::EVEX_B))
+ EVEX_b = 1;
+
+ switch (TSFlags & X86II::OpPrefixMask) {
+ default: break; // VEX_PP already correct
+ case X86II::PD: VEX_PP = 0x1; break; // 66
+ case X86II::XS: VEX_PP = 0x2; break; // F3
+ case X86II::XD: VEX_PP = 0x3; break; // F2
+ }
+
+ switch (TSFlags & X86II::OpMapMask) {
+ default: llvm_unreachable("Invalid prefix!");
+ case X86II::TB: VEX_5M = 0x1; break; // 0F
+ case X86II::T8: VEX_5M = 0x2; break; // 0F 38
+ case X86II::TA: VEX_5M = 0x3; break; // 0F 3A
+ case X86II::XOP8: VEX_5M = 0x8; break;
+ case X86II::XOP9: VEX_5M = 0x9; break;
+ case X86II::XOPA: VEX_5M = 0xA; break;
+ }
+
+ // Classify VEX_B, VEX_4V, VEX_R, VEX_X
+ unsigned NumOps = Desc.getNumOperands();
+ unsigned CurOp = X86II::getOperandBias(Desc);
+
+ switch (TSFlags & X86II::FormMask) {
+ default: llvm_unreachable("Unexpected form in EmitVEXOpcodePrefix!");
+ case X86II::RawFrm:
+ break;
+ case X86II::MRMDestMem: {
+    // MRMDestMem instruction forms:
+ // MemAddr, src1(ModR/M)
+ // MemAddr, src1(VEX_4V), src2(ModR/M)
+ // MemAddr, src1(ModR/M), imm8
+ //
+ if (X86II::isX86_64ExtendedReg(MI.getOperand(MemOperand +
+ X86::AddrBaseReg).getReg()))
+ VEX_B = 0x0;
+ if (X86II::isX86_64ExtendedReg(MI.getOperand(MemOperand +
+ X86::AddrIndexReg).getReg()))
+ VEX_X = 0x0;
+ if (X86II::is32ExtendedReg(MI.getOperand(MemOperand +
+ X86::AddrIndexReg).getReg()))
+ EVEX_V2 = 0x0;
+
+ CurOp += X86::AddrNumOperands;
+
+ if (HasEVEX_K)
+ EVEX_aaa = getWriteMaskRegisterEncoding(MI, CurOp++);
+
+ if (HasVEX_4V) {
+ VEX_4V = getVEXRegisterEncoding(MI, CurOp);
+ if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
+ EVEX_V2 = 0x0;
+ CurOp++;
+ }
+
+ const MCOperand &MO = MI.getOperand(CurOp);
+ if (MO.isReg()) {
+ if (X86II::isX86_64ExtendedReg(MO.getReg()))
+ VEX_R = 0x0;
+ if (X86II::is32ExtendedReg(MO.getReg()))
+ EVEX_R2 = 0x0;
+ }
+ break;
+ }
+ case X86II::MRMSrcMem:
+    // MRMSrcMem instruction forms:
+ // src1(ModR/M), MemAddr
+ // src1(ModR/M), src2(VEX_4V), MemAddr
+ // src1(ModR/M), MemAddr, imm8
+ // src1(ModR/M), MemAddr, src2(VEX_I8IMM)
+ //
+ // FMA4:
+ // dst(ModR/M.reg), src1(VEX_4V), src2(ModR/M), src3(VEX_I8IMM)
+ // dst(ModR/M.reg), src1(VEX_4V), src2(VEX_I8IMM), src3(ModR/M),
+ if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
+ VEX_R = 0x0;
+ if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
+ EVEX_R2 = 0x0;
+ CurOp++;
+
+ if (HasEVEX_K)
+ EVEX_aaa = getWriteMaskRegisterEncoding(MI, CurOp++);
+
+ if (HasVEX_4V) {
+ VEX_4V = getVEXRegisterEncoding(MI, CurOp);
+ if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
+ EVEX_V2 = 0x0;
+ CurOp++;
+ }
+
+ if (X86II::isX86_64ExtendedReg(
+ MI.getOperand(MemOperand+X86::AddrBaseReg).getReg()))
+ VEX_B = 0x0;
+ if (X86II::isX86_64ExtendedReg(
+ MI.getOperand(MemOperand+X86::AddrIndexReg).getReg()))
+ VEX_X = 0x0;
+ if (X86II::is32ExtendedReg(MI.getOperand(MemOperand +
+ X86::AddrIndexReg).getReg()))
+ EVEX_V2 = 0x0;
+
+ if (HasVEX_4VOp3)
+ // Instruction format for 4VOp3:
+ // src1(ModR/M), MemAddr, src3(VEX_4V)
+      // CurOp points to the start of the memory operand; any TIED_TO
+      // operands have been skipped and src1 has already been consumed.
+ // CurOp + X86::AddrNumOperands will point to src3.
+ VEX_4V = getVEXRegisterEncoding(MI, CurOp+X86::AddrNumOperands);
+ break;
+ case X86II::MRM0m: case X86II::MRM1m:
+ case X86II::MRM2m: case X86II::MRM3m:
+ case X86II::MRM4m: case X86II::MRM5m:
+ case X86II::MRM6m: case X86II::MRM7m: {
+    // MRM[0-7]m instruction forms:
+ // MemAddr
+ // src1(VEX_4V), MemAddr
+ if (HasVEX_4V) {
+ VEX_4V = getVEXRegisterEncoding(MI, CurOp);
+ if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
+ EVEX_V2 = 0x0;
+ CurOp++;
+ }
+
+ if (HasEVEX_K)
+ EVEX_aaa = getWriteMaskRegisterEncoding(MI, CurOp++);
+
+ if (X86II::isX86_64ExtendedReg(
+ MI.getOperand(MemOperand+X86::AddrBaseReg).getReg()))
+ VEX_B = 0x0;
+ if (X86II::isX86_64ExtendedReg(
+ MI.getOperand(MemOperand+X86::AddrIndexReg).getReg()))
+ VEX_X = 0x0;
+ break;
+ }
+ case X86II::MRMSrcReg:
+    // MRMSrcReg instruction forms:
+ // dst(ModR/M), src1(VEX_4V), src2(ModR/M), src3(VEX_I8IMM)
+ // dst(ModR/M), src1(ModR/M)
+ // dst(ModR/M), src1(ModR/M), imm8
+ //
+ // FMA4:
+ // dst(ModR/M.reg), src1(VEX_4V), src2(ModR/M), src3(VEX_I8IMM)
+ // dst(ModR/M.reg), src1(VEX_4V), src2(VEX_I8IMM), src3(ModR/M),
+ if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
+ VEX_R = 0x0;
+ if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
+ EVEX_R2 = 0x0;
+ CurOp++;
+
+ if (HasEVEX_K)
+ EVEX_aaa = getWriteMaskRegisterEncoding(MI, CurOp++);
+
+ if (HasVEX_4V) {
+ VEX_4V = getVEXRegisterEncoding(MI, CurOp);
+ if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
+ EVEX_V2 = 0x0;
+ CurOp++;
+ }
+
+ if (HasMemOp4) // Skip second register source (encoded in I8IMM)
+ CurOp++;
+
+ if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
+ VEX_B = 0x0;
+ if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
+ VEX_X = 0x0;
+ CurOp++;
+ if (HasVEX_4VOp3)
+ VEX_4V = getVEXRegisterEncoding(MI, CurOp++);
+ if (EVEX_b) {
+ if (HasEVEX_RC) {
+ unsigned RcOperand = NumOps-1;
+ assert(RcOperand >= CurOp);
+ EVEX_rc = MI.getOperand(RcOperand).getImm() & 0x3;
+ }
+ EncodeRC = true;
+ }
+ break;
+ case X86II::MRMDestReg:
+    // MRMDestReg instruction forms:
+ // dst(ModR/M), src(ModR/M)
+ // dst(ModR/M), src(ModR/M), imm8
+ // dst(ModR/M), src1(VEX_4V), src2(ModR/M)
+ if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
+ VEX_B = 0x0;
+ if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
+ VEX_X = 0x0;
+ CurOp++;
+
+ if (HasEVEX_K)
+ EVEX_aaa = getWriteMaskRegisterEncoding(MI, CurOp++);
+
+ if (HasVEX_4V) {
+ VEX_4V = getVEXRegisterEncoding(MI, CurOp);
+ if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
+ EVEX_V2 = 0x0;
+ CurOp++;
+ }
+
+ if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
+ VEX_R = 0x0;
+ if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
+ EVEX_R2 = 0x0;
+ if (EVEX_b)
+ EncodeRC = true;
+ break;
+ case X86II::MRM0r: case X86II::MRM1r:
+ case X86II::MRM2r: case X86II::MRM3r:
+ case X86II::MRM4r: case X86II::MRM5r:
+ case X86II::MRM6r: case X86II::MRM7r:
+    // MRM0r-MRM7r instruction forms:
+ // dst(VEX_4V), src(ModR/M), imm8
+ if (HasVEX_4V) {
+ VEX_4V = getVEXRegisterEncoding(MI, CurOp);
+ if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
+ EVEX_V2 = 0x0;
+ CurOp++;
+ }
+ if (HasEVEX_K)
+ EVEX_aaa = getWriteMaskRegisterEncoding(MI, CurOp++);
+
+ if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
+ VEX_B = 0x0;
+ if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
+ VEX_X = 0x0;
+ break;
+ }
+
+ if (Encoding == X86II::VEX || Encoding == X86II::XOP) {
+    // The VEX opcode prefix can be 2 or 3 bytes:
+ //
+ // 3 bytes:
+ // +-----+ +--------------+ +-------------------+
+ // | C4h | | RXB | m-mmmm | | W | vvvv | L | pp |
+ // +-----+ +--------------+ +-------------------+
+ // 2 bytes:
+ // +-----+ +-------------------+
+ // | C5h | | R | vvvv | L | pp |
+ // +-----+ +-------------------+
+ //
+ // XOP uses a similar prefix:
+ // +-----+ +--------------+ +-------------------+
+ // | 8Fh | | RXB | m-mmmm | | W | vvvv | L | pp |
+ // +-----+ +--------------+ +-------------------+
+ unsigned char LastByte = VEX_PP | (VEX_L << 2) | (VEX_4V << 3);
+
+    // Can we use the 2-byte VEX prefix?
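+    // For example, vaddps %xmm1, %xmm2, %xmm0 fits the 2-byte form and
+    // encodes as C5 E8 58 C1 (R=1, vvvv=~2, L=0, pp=0).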
+ if (Encoding == X86II::VEX && VEX_B && VEX_X && !VEX_W && (VEX_5M == 1)) {
+ EmitByte(0xC5, CurByte, OS);
+ EmitByte(LastByte | (VEX_R << 7), CurByte, OS);
+ return;
+ }
+
+    // 3-byte VEX prefix
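+    // For example, vaddps %xmm8, %xmm2, %xmm0 needs the 3-byte form (VEX.B
+    // must be 0 to reach XMM8) and encodes as C4 C1 68 58 C0.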
+ EmitByte(Encoding == X86II::XOP ? 0x8F : 0xC4, CurByte, OS);
+ EmitByte(VEX_R << 7 | VEX_X << 6 | VEX_B << 5 | VEX_5M, CurByte, OS);
+ EmitByte(LastByte | (VEX_W << 7), CurByte, OS);
+ } else {
+ assert(Encoding == X86II::EVEX && "unknown encoding!");
+    // The EVEX opcode prefix is always 4 bytes:
+ //
+ // +-----+ +--------------+ +-------------------+ +------------------------+
+ // | 62h | | RXBR' | 00mm | | W | vvvv | U | pp | | z | L'L | b | v' | aaa |
+ // +-----+ +--------------+ +-------------------+ +------------------------+
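+    // For example, vaddps %zmm1, %zmm2, %zmm0 encodes as 62 F1 6C 48 58 C1,
+    // where the fourth byte 0x48 packs z=0, L'L=10 (512-bit), b=0, V'=1 and
+    // aaa=000.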
+ assert((VEX_5M & 0x3) == VEX_5M
+ && "More than 2 significant bits in VEX.m-mmmm fields for EVEX!");
+
+ VEX_5M &= 0x3;
+
+ EmitByte(0x62, CurByte, OS);
+ EmitByte((VEX_R << 7) |
+ (VEX_X << 6) |
+ (VEX_B << 5) |
+ (EVEX_R2 << 4) |
+ VEX_5M, CurByte, OS);
+ EmitByte((VEX_W << 7) |
+ (VEX_4V << 3) |
+ (EVEX_U << 2) |
+ VEX_PP, CurByte, OS);
+ if (EncodeRC)
+ EmitByte((EVEX_z << 7) |
+ (EVEX_rc << 5) |
+ (EVEX_b << 4) |
+ (EVEX_V2 << 3) |
+ EVEX_aaa, CurByte, OS);
+ else
+ EmitByte((EVEX_z << 7) |
+ (EVEX_L2 << 6) |
+ (VEX_L << 5) |
+ (EVEX_b << 4) |
+ (EVEX_V2 << 3) |
+ EVEX_aaa, CurByte, OS);
+ }
+}
+
+/// DetermineREXPrefix - Determine if the MCInst has to be encoded with an
+/// X86-64 REX prefix which specifies 1) 64-bit instructions, 2) non-default
+/// operand size, and 3) use of X86-64 extended registers.
+static unsigned DetermineREXPrefix(const MCInst &MI, uint64_t TSFlags,
+ const MCInstrDesc &Desc) {
+ unsigned REX = 0;
+ bool UsesHighByteReg = false;
+
+ if (TSFlags & X86II::REX_W)
+ REX |= 1 << 3; // set REX.W
+
+ if (MI.getNumOperands() == 0) return REX;
+
+ unsigned NumOps = MI.getNumOperands();
+  // FIXME: MCInst should make the two-address form explicit.
+ bool isTwoAddr = NumOps > 1 &&
+ Desc.getOperandConstraint(1, MCOI::TIED_TO) != -1;
+
+ // If it accesses SPL, BPL, SIL, or DIL, then it requires a 0x40 REX prefix.
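+  // For example, movb %sil, %al must be emitted as 40 88 F0: without the
+  // empty REX prefix, reg field 6 would name DH rather than SIL.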
+ unsigned i = isTwoAddr ? 1 : 0;
+ for (; i != NumOps; ++i) {
+ const MCOperand &MO = MI.getOperand(i);
+ if (!MO.isReg()) continue;
+ unsigned Reg = MO.getReg();
+ if (Reg == X86::AH || Reg == X86::BH || Reg == X86::CH || Reg == X86::DH)
+ UsesHighByteReg = true;
+ if (!X86II::isX86_64NonExtLowByteReg(Reg)) continue;
+ // FIXME: The caller of DetermineREXPrefix slaps this prefix onto anything
+ // that returns non-zero.
+ REX |= 0x40; // REX fixed encoding prefix
+ break;
+ }
+
+ switch (TSFlags & X86II::FormMask) {
+ case X86II::MRMSrcReg:
+ if (MI.getOperand(0).isReg() &&
+ X86II::isX86_64ExtendedReg(MI.getOperand(0).getReg()))
+ REX |= 1 << 2; // set REX.R
+ i = isTwoAddr ? 2 : 1;
+ for (; i != NumOps; ++i) {
+ const MCOperand &MO = MI.getOperand(i);
+ if (MO.isReg() && X86II::isX86_64ExtendedReg(MO.getReg()))
+ REX |= 1 << 0; // set REX.B
+ }
+ break;
+ case X86II::MRMSrcMem: {
+ if (MI.getOperand(0).isReg() &&
+ X86II::isX86_64ExtendedReg(MI.getOperand(0).getReg()))
+ REX |= 1 << 2; // set REX.R
+ unsigned Bit = 0;
+ i = isTwoAddr ? 2 : 1;
+ for (; i != NumOps; ++i) {
+ const MCOperand &MO = MI.getOperand(i);
+ if (MO.isReg()) {
+ if (X86II::isX86_64ExtendedReg(MO.getReg()))
+ REX |= 1 << Bit; // set REX.B (Bit=0) and REX.X (Bit=1)
+ Bit++;
+ }
+ }
+ break;
+ }
+ case X86II::MRMXm:
+ case X86II::MRM0m: case X86II::MRM1m:
+ case X86II::MRM2m: case X86II::MRM3m:
+ case X86II::MRM4m: case X86II::MRM5m:
+ case X86II::MRM6m: case X86II::MRM7m:
+ case X86II::MRMDestMem: {
+ unsigned e = (isTwoAddr ? X86::AddrNumOperands+1 : X86::AddrNumOperands);
+ i = isTwoAddr ? 1 : 0;
+ if (NumOps > e && MI.getOperand(e).isReg() &&
+ X86II::isX86_64ExtendedReg(MI.getOperand(e).getReg()))
+ REX |= 1 << 2; // set REX.R
+ unsigned Bit = 0;
+ for (; i != e; ++i) {
+ const MCOperand &MO = MI.getOperand(i);
+ if (MO.isReg()) {
+ if (X86II::isX86_64ExtendedReg(MO.getReg()))
+ REX |= 1 << Bit; // REX.B (Bit=0) and REX.X (Bit=1)
+ Bit++;
+ }
+ }
+ break;
+ }
+ default:
+ if (MI.getOperand(0).isReg() &&
+ X86II::isX86_64ExtendedReg(MI.getOperand(0).getReg()))
+ REX |= 1 << 0; // set REX.B
+ i = isTwoAddr ? 2 : 1;
+ for (unsigned e = NumOps; i != e; ++i) {
+ const MCOperand &MO = MI.getOperand(i);
+ if (MO.isReg() && X86II::isX86_64ExtendedReg(MO.getReg()))
+ REX |= 1 << 2; // set REX.R
+ }
+ break;
+ }
+ if (REX && UsesHighByteReg)
+ report_fatal_error("Cannot encode high byte register in REX-prefixed instruction");
+
+ return REX;
+}
+
+/// EmitSegmentOverridePrefix - Emit segment override opcode prefix as needed
+void X86MCCodeEmitter::EmitSegmentOverridePrefix(unsigned &CurByte,
+ unsigned SegOperand,
+ const MCInst &MI,
+ raw_ostream &OS) const {
+ // Check for explicit segment override on memory operand.
+ switch (MI.getOperand(SegOperand).getReg()) {
+ default: llvm_unreachable("Unknown segment register!");
+ case 0: break;
+ case X86::CS: EmitByte(0x2E, CurByte, OS); break;
+ case X86::SS: EmitByte(0x36, CurByte, OS); break;
+ case X86::DS: EmitByte(0x3E, CurByte, OS); break;
+ case X86::ES: EmitByte(0x26, CurByte, OS); break;
+ case X86::FS: EmitByte(0x64, CurByte, OS); break;
+ case X86::GS: EmitByte(0x65, CurByte, OS); break;
+ }
+}
+
+/// EmitOpcodePrefix - Emit all instruction prefixes prior to the opcode.
+///
+/// MemOperand is the operand # of the start of a memory operand if present. If
+/// not present, it is -1.
+void X86MCCodeEmitter::EmitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
+ int MemOperand, const MCInst &MI,
+ const MCInstrDesc &Desc,
+ const MCSubtargetInfo &STI,
+ raw_ostream &OS) const {
+
+ // Emit the operand size opcode prefix as needed.
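+  // 0x66 toggles between the mode's default operand size and its alternate,
+  // so it is needed for OpSize32 instructions in 16-bit mode and for OpSize16
+  // instructions in 32/64-bit mode.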
+ if ((TSFlags & X86II::OpSizeMask) == (is16BitMode(STI) ? X86II::OpSize32
+ : X86II::OpSize16))
+ EmitByte(0x66, CurByte, OS);
+
+ // Emit the LOCK opcode prefix.
+ if (TSFlags & X86II::LOCK)
+ EmitByte(0xF0, CurByte, OS);
+
+ switch (TSFlags & X86II::OpPrefixMask) {
+ case X86II::PD: // 66
+ EmitByte(0x66, CurByte, OS);
+ break;
+ case X86II::XS: // F3
+ EmitByte(0xF3, CurByte, OS);
+ break;
+ case X86II::XD: // F2
+ EmitByte(0xF2, CurByte, OS);
+ break;
+ }
+
+ // Handle REX prefix.
+ // FIXME: Can this come before F2 etc to simplify emission?
+ if (is64BitMode(STI)) {
+ if (unsigned REX = DetermineREXPrefix(MI, TSFlags, Desc))
+ EmitByte(0x40 | REX, CurByte, OS);
+ }
+
+ // 0x0F escape code must be emitted just before the opcode.
+ switch (TSFlags & X86II::OpMapMask) {
+ case X86II::TB: // Two-byte opcode map
+ case X86II::T8: // 0F 38
+ case X86II::TA: // 0F 3A
+ EmitByte(0x0F, CurByte, OS);
+ break;
+ }
+
+ switch (TSFlags & X86II::OpMapMask) {
+ case X86II::T8: // 0F 38
+ EmitByte(0x38, CurByte, OS);
+ break;
+ case X86II::TA: // 0F 3A
+ EmitByte(0x3A, CurByte, OS);
+ break;
+ }
+}
+
+void X86MCCodeEmitter::
+encodeInstruction(const MCInst &MI, raw_ostream &OS,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ unsigned Opcode = MI.getOpcode();
+ const MCInstrDesc &Desc = MCII.get(Opcode);
+ uint64_t TSFlags = Desc.TSFlags;
+
+ // Pseudo instructions don't get encoded.
+ if ((TSFlags & X86II::FormMask) == X86II::Pseudo)
+ return;
+
+ unsigned NumOps = Desc.getNumOperands();
+ unsigned CurOp = X86II::getOperandBias(Desc);
+
+ // Keep track of the current byte being emitted.
+ unsigned CurByte = 0;
+
+ // Encoding type for this instruction.
+ uint64_t Encoding = TSFlags & X86II::EncodingMask;
+
+  // Does this instruction use the VEX.VVVV field?
+ bool HasVEX_4V = TSFlags & X86II::VEX_4V;
+ bool HasVEX_4VOp3 = TSFlags & X86II::VEX_4VOp3;
+ bool HasMemOp4 = TSFlags & X86II::MemOp4;
+ const unsigned MemOp4_I8IMMOperand = 2;
+
+  // Does this instruction use the EVEX.aaa field?
+ bool HasEVEX_K = TSFlags & X86II::EVEX_K;
+ bool HasEVEX_RC = TSFlags & X86II::EVEX_RC;
+
+ // Determine where the memory operand starts, if present.
+ int MemoryOperand = X86II::getMemoryOperandNo(TSFlags, Opcode);
+ if (MemoryOperand != -1) MemoryOperand += CurOp;
+
+ // Emit segment override opcode prefix as needed.
+ if (MemoryOperand >= 0)
+ EmitSegmentOverridePrefix(CurByte, MemoryOperand+X86::AddrSegmentReg,
+ MI, OS);
+
+ // Emit the repeat opcode prefix as needed.
+ if (TSFlags & X86II::REP)
+ EmitByte(0xF3, CurByte, OS);
+
+ // Emit the address size opcode prefix as needed.
+ bool need_address_override;
+ uint64_t AdSize = TSFlags & X86II::AdSizeMask;
+ if ((is16BitMode(STI) && AdSize == X86II::AdSize32) ||
+ (is32BitMode(STI) && AdSize == X86II::AdSize16) ||
+ (is64BitMode(STI) && AdSize == X86II::AdSize32)) {
+ need_address_override = true;
+ } else if (MemoryOperand < 0) {
+ need_address_override = false;
+ } else if (is64BitMode(STI)) {
+ assert(!Is16BitMemOperand(MI, MemoryOperand, STI));
+ need_address_override = Is32BitMemOperand(MI, MemoryOperand);
+ } else if (is32BitMode(STI)) {
+ assert(!Is64BitMemOperand(MI, MemoryOperand));
+ need_address_override = Is16BitMemOperand(MI, MemoryOperand, STI);
+ } else {
+ assert(is16BitMode(STI));
+ assert(!Is64BitMemOperand(MI, MemoryOperand));
+ need_address_override = !Is16BitMemOperand(MI, MemoryOperand, STI);
+ }
+
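+  // The 0x67 prefix toggles the address size between the mode's default and
+  // its alternate, e.g. it selects 32-bit addressing in 16-bit mode.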
+ if (need_address_override)
+ EmitByte(0x67, CurByte, OS);
+
+ if (Encoding == 0)
+ EmitOpcodePrefix(TSFlags, CurByte, MemoryOperand, MI, Desc, STI, OS);
+ else
+ EmitVEXOpcodePrefix(TSFlags, CurByte, MemoryOperand, MI, Desc, OS);
+
+ unsigned char BaseOpcode = X86II::getBaseOpcodeFor(TSFlags);
+
+ if (TSFlags & X86II::Has3DNow0F0FOpcode)
+ BaseOpcode = 0x0F; // Weird 3DNow! encoding.
+
+ unsigned SrcRegNum = 0;
+ switch (TSFlags & X86II::FormMask) {
+ default: errs() << "FORM: " << (TSFlags & X86II::FormMask) << "\n";
+ llvm_unreachable("Unknown FormMask value in X86MCCodeEmitter!");
+ case X86II::Pseudo:
+ llvm_unreachable("Pseudo instruction shouldn't be emitted");
+ case X86II::RawFrmDstSrc: {
+ unsigned siReg = MI.getOperand(1).getReg();
+ assert(((siReg == X86::SI && MI.getOperand(0).getReg() == X86::DI) ||
+ (siReg == X86::ESI && MI.getOperand(0).getReg() == X86::EDI) ||
+ (siReg == X86::RSI && MI.getOperand(0).getReg() == X86::RDI)) &&
+ "SI and DI register sizes do not match");
+ // Emit segment override opcode prefix as needed (not for %ds).
+ if (MI.getOperand(2).getReg() != X86::DS)
+ EmitSegmentOverridePrefix(CurByte, 2, MI, OS);
+ // Emit AdSize prefix as needed.
+ if ((!is32BitMode(STI) && siReg == X86::ESI) ||
+ (is32BitMode(STI) && siReg == X86::SI))
+ EmitByte(0x67, CurByte, OS);
+ CurOp += 3; // Consume operands.
+ EmitByte(BaseOpcode, CurByte, OS);
+ break;
+ }
+ case X86II::RawFrmSrc: {
+ unsigned siReg = MI.getOperand(0).getReg();
+ // Emit segment override opcode prefix as needed (not for %ds).
+ if (MI.getOperand(1).getReg() != X86::DS)
+ EmitSegmentOverridePrefix(CurByte, 1, MI, OS);
+ // Emit AdSize prefix as needed.
+ if ((!is32BitMode(STI) && siReg == X86::ESI) ||
+ (is32BitMode(STI) && siReg == X86::SI))
+ EmitByte(0x67, CurByte, OS);
+ CurOp += 2; // Consume operands.
+ EmitByte(BaseOpcode, CurByte, OS);
+ break;
+ }
+ case X86II::RawFrmDst: {
+ unsigned siReg = MI.getOperand(0).getReg();
+ // Emit AdSize prefix as needed.
+ if ((!is32BitMode(STI) && siReg == X86::EDI) ||
+ (is32BitMode(STI) && siReg == X86::DI))
+ EmitByte(0x67, CurByte, OS);
+ ++CurOp; // Consume operand.
+ EmitByte(BaseOpcode, CurByte, OS);
+ break;
+ }
+ case X86II::RawFrm:
+ EmitByte(BaseOpcode, CurByte, OS);
+ break;
+ case X86II::RawFrmMemOffs:
+ // Emit segment override opcode prefix as needed.
+ EmitSegmentOverridePrefix(CurByte, 1, MI, OS);
+ EmitByte(BaseOpcode, CurByte, OS);
+ EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(),
+ X86II::getSizeOfImm(TSFlags), getImmFixupKind(TSFlags),
+ CurByte, OS, Fixups);
+ ++CurOp; // skip segment operand
+ break;
+ case X86II::RawFrmImm8:
+ EmitByte(BaseOpcode, CurByte, OS);
+ EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(),
+ X86II::getSizeOfImm(TSFlags), getImmFixupKind(TSFlags),
+ CurByte, OS, Fixups);
+ EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(), 1, FK_Data_1, CurByte,
+ OS, Fixups);
+ break;
+ case X86II::RawFrmImm16:
+ EmitByte(BaseOpcode, CurByte, OS);
+ EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(),
+ X86II::getSizeOfImm(TSFlags), getImmFixupKind(TSFlags),
+ CurByte, OS, Fixups);
+ EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(), 2, FK_Data_2, CurByte,
+ OS, Fixups);
+ break;
+
+ case X86II::AddRegFrm:
+ EmitByte(BaseOpcode + GetX86RegNum(MI.getOperand(CurOp++)), CurByte, OS);
+ break;
+
+ case X86II::MRMDestReg:
+ EmitByte(BaseOpcode, CurByte, OS);
+ SrcRegNum = CurOp + 1;
+
+ if (HasEVEX_K) // Skip writemask
+ SrcRegNum++;
+
+ if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV)
+ ++SrcRegNum;
+
+ EmitRegModRMByte(MI.getOperand(CurOp),
+ GetX86RegNum(MI.getOperand(SrcRegNum)), CurByte, OS);
+ CurOp = SrcRegNum + 1;
+ break;
+
+ case X86II::MRMDestMem:
+ EmitByte(BaseOpcode, CurByte, OS);
+ SrcRegNum = CurOp + X86::AddrNumOperands;
+
+ if (HasEVEX_K) // Skip writemask
+ SrcRegNum++;
+
+ if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV)
+ ++SrcRegNum;
+
+ EmitMemModRMByte(MI, CurOp,
+ GetX86RegNum(MI.getOperand(SrcRegNum)),
+ TSFlags, CurByte, OS, Fixups, STI);
+ CurOp = SrcRegNum + 1;
+ break;
+
+ case X86II::MRMSrcReg:
+ EmitByte(BaseOpcode, CurByte, OS);
+ SrcRegNum = CurOp + 1;
+
+ if (HasEVEX_K) // Skip writemask
+ SrcRegNum++;
+
+ if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV)
+ ++SrcRegNum;
+
+ if (HasMemOp4) // Skip 2nd src (which is encoded in I8IMM)
+ ++SrcRegNum;
+
+ EmitRegModRMByte(MI.getOperand(SrcRegNum),
+ GetX86RegNum(MI.getOperand(CurOp)), CurByte, OS);
+
+    // Two operands were skipped with HasMemOp4; compensate accordingly.
+ CurOp = HasMemOp4 ? SrcRegNum : SrcRegNum + 1;
+ if (HasVEX_4VOp3)
+ ++CurOp;
+    // Do not count the rounding control operand.
+ if (HasEVEX_RC)
+ NumOps--;
+ break;
+
+ case X86II::MRMSrcMem: {
+ int AddrOperands = X86::AddrNumOperands;
+ unsigned FirstMemOp = CurOp+1;
+
+ if (HasEVEX_K) { // Skip writemask
+ ++AddrOperands;
+ ++FirstMemOp;
+ }
+
+ if (HasVEX_4V) {
+ ++AddrOperands;
+ ++FirstMemOp; // Skip the register source (which is encoded in VEX_VVVV).
+ }
+ if (HasMemOp4) // Skip second register source (encoded in I8IMM)
+ ++FirstMemOp;
+
+ EmitByte(BaseOpcode, CurByte, OS);
+
+ EmitMemModRMByte(MI, FirstMemOp, GetX86RegNum(MI.getOperand(CurOp)),
+ TSFlags, CurByte, OS, Fixups, STI);
+ CurOp += AddrOperands + 1;
+ if (HasVEX_4VOp3)
+ ++CurOp;
+ break;
+ }
+
+ case X86II::MRMXr:
+ case X86II::MRM0r: case X86II::MRM1r:
+ case X86II::MRM2r: case X86II::MRM3r:
+ case X86II::MRM4r: case X86II::MRM5r:
+ case X86II::MRM6r: case X86II::MRM7r: {
+ if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV).
+ ++CurOp;
+ if (HasEVEX_K) // Skip writemask
+ ++CurOp;
+ EmitByte(BaseOpcode, CurByte, OS);
+ uint64_t Form = TSFlags & X86II::FormMask;
+ EmitRegModRMByte(MI.getOperand(CurOp++),
+ (Form == X86II::MRMXr) ? 0 : Form-X86II::MRM0r,
+ CurByte, OS);
+ break;
+ }
+
+ case X86II::MRMXm:
+ case X86II::MRM0m: case X86II::MRM1m:
+ case X86II::MRM2m: case X86II::MRM3m:
+ case X86II::MRM4m: case X86II::MRM5m:
+ case X86II::MRM6m: case X86II::MRM7m: {
+ if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV).
+ ++CurOp;
+ if (HasEVEX_K) // Skip writemask
+ ++CurOp;
+ EmitByte(BaseOpcode, CurByte, OS);
+ uint64_t Form = TSFlags & X86II::FormMask;
+ EmitMemModRMByte(MI, CurOp, (Form == X86II::MRMXm) ? 0 : Form-X86II::MRM0m,
+ TSFlags, CurByte, OS, Fixups, STI);
+ CurOp += X86::AddrNumOperands;
+ break;
+ }
+ case X86II::MRM_C0: case X86II::MRM_C1: case X86II::MRM_C2:
+ case X86II::MRM_C3: case X86II::MRM_C4: case X86II::MRM_C5:
+ case X86II::MRM_C6: case X86II::MRM_C7: case X86II::MRM_C8:
+ case X86II::MRM_C9: case X86II::MRM_CA: case X86II::MRM_CB:
+ case X86II::MRM_CC: case X86II::MRM_CD: case X86II::MRM_CE:
+ case X86II::MRM_CF: case X86II::MRM_D0: case X86II::MRM_D1:
+ case X86II::MRM_D2: case X86II::MRM_D3: case X86II::MRM_D4:
+ case X86II::MRM_D5: case X86II::MRM_D6: case X86II::MRM_D7:
+ case X86II::MRM_D8: case X86II::MRM_D9: case X86II::MRM_DA:
+ case X86II::MRM_DB: case X86II::MRM_DC: case X86II::MRM_DD:
+ case X86II::MRM_DE: case X86II::MRM_DF: case X86II::MRM_E0:
+ case X86II::MRM_E1: case X86II::MRM_E2: case X86II::MRM_E3:
+ case X86II::MRM_E4: case X86II::MRM_E5: case X86II::MRM_E6:
+ case X86II::MRM_E7: case X86II::MRM_E8: case X86II::MRM_E9:
+ case X86II::MRM_EA: case X86II::MRM_EB: case X86II::MRM_EC:
+ case X86II::MRM_ED: case X86II::MRM_EE: case X86II::MRM_EF:
+ case X86II::MRM_F0: case X86II::MRM_F1: case X86II::MRM_F2:
+ case X86II::MRM_F3: case X86II::MRM_F4: case X86II::MRM_F5:
+ case X86II::MRM_F6: case X86II::MRM_F7: case X86II::MRM_F8:
+ case X86II::MRM_F9: case X86II::MRM_FA: case X86II::MRM_FB:
+ case X86II::MRM_FC: case X86II::MRM_FD: case X86II::MRM_FE:
+ case X86II::MRM_FF:
+ EmitByte(BaseOpcode, CurByte, OS);
+
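+    // For example, SFENCE is defined with form MRM_F8 and base opcode 0F AE,
+    // so this path emits 0F AE F8.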
+ uint64_t Form = TSFlags & X86II::FormMask;
+ EmitByte(0xC0 + Form - X86II::MRM_C0, CurByte, OS);
+ break;
+ }
+
+ // If there is a remaining operand, it must be a trailing immediate. Emit it
+ // according to the right size for the instruction. Some instructions
+ // (SSE4a extrq and insertq) have two trailing immediates.
+ while (CurOp != NumOps && NumOps - CurOp <= 2) {
+    // The last source register of a 4-operand AVX instruction is encoded
+    // in bits[7:4] of an immediate byte.
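+    // For example, vblendvps encodes its selector register there, as do the
+    // FMA4 src3 operands.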
+ if (TSFlags & X86II::VEX_I8IMM) {
+ const MCOperand &MO = MI.getOperand(HasMemOp4 ? MemOp4_I8IMMOperand
+ : CurOp);
+ ++CurOp;
+ unsigned RegNum = GetX86RegNum(MO) << 4;
+ if (X86II::isX86_64ExtendedReg(MO.getReg()))
+ RegNum |= 1 << 7;
+      // If there is an additional 5th operand, it must be an immediate, which
+      // is encoded in bits[3:0].
+ if (CurOp != NumOps) {
+ const MCOperand &MIMM = MI.getOperand(CurOp++);
+ if (MIMM.isImm()) {
+ unsigned Val = MIMM.getImm();
+ assert(Val < 16 && "Immediate operand value out of range");
+ RegNum |= Val;
+ }
+ }
+ EmitImmediate(MCOperand::createImm(RegNum), MI.getLoc(), 1, FK_Data_1,
+ CurByte, OS, Fixups);
+ } else {
+ EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(),
+ X86II::getSizeOfImm(TSFlags), getImmFixupKind(TSFlags),
+ CurByte, OS, Fixups);
+ }
+ }
+
+ if (TSFlags & X86II::Has3DNow0F0FOpcode)
+ EmitByte(X86II::getBaseOpcodeFor(TSFlags), CurByte, OS);
+
+#ifndef NDEBUG
+ // FIXME: Verify.
+ if (/*!Desc.isVariadic() &&*/ CurOp != NumOps) {
+ errs() << "Cannot encode all operands of: ";
+ MI.dump();
+ errs() << '\n';
+ abort();
+ }
+#endif
+}
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
new file mode 100644
index 0000000..53a6550
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
@@ -0,0 +1,451 @@
+//===-- X86MCTargetDesc.cpp - X86 Target Descriptions ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides X86 specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86MCTargetDesc.h"
+#include "InstPrinter/X86ATTInstPrinter.h"
+#include "InstPrinter/X86IntelInstPrinter.h"
+#include "X86MCAsmInfo.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/MC/MCCodeGenInfo.h"
+#include "llvm/MC/MCInstrAnalysis.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MachineLocation.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Host.h"
+#include "llvm/Support/TargetRegistry.h"
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#endif
+
+using namespace llvm;
+
+#define GET_REGINFO_MC_DESC
+#include "X86GenRegisterInfo.inc"
+
+#define GET_INSTRINFO_MC_DESC
+#include "X86GenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_MC_DESC
+#include "X86GenSubtargetInfo.inc"
+
+std::string X86_MC::ParseX86Triple(const Triple &TT) {
+ std::string FS;
+ if (TT.getArch() == Triple::x86_64)
+ FS = "+64bit-mode,-32bit-mode,-16bit-mode";
+ else if (TT.getEnvironment() != Triple::CODE16)
+ FS = "-64bit-mode,+32bit-mode,-16bit-mode";
+ else
+ FS = "-64bit-mode,-32bit-mode,+16bit-mode";
+
+ return FS;
+}
+
+unsigned X86_MC::getDwarfRegFlavour(const Triple &TT, bool isEH) {
+ if (TT.getArch() == Triple::x86_64)
+ return DWARFFlavour::X86_64;
+
+ if (TT.isOSDarwin())
+ return isEH ? DWARFFlavour::X86_32_DarwinEH : DWARFFlavour::X86_32_Generic;
+ if (TT.isOSCygMing())
+    // Unsupported for now; just a quick fallback.
+ return DWARFFlavour::X86_32_Generic;
+ return DWARFFlavour::X86_32_Generic;
+}
+
+void X86_MC::InitLLVM2SEHRegisterMapping(MCRegisterInfo *MRI) {
+ // FIXME: TableGen these.
+ for (unsigned Reg = X86::NoRegister+1; Reg < X86::NUM_TARGET_REGS; ++Reg) {
+ unsigned SEH = MRI->getEncodingValue(Reg);
+ MRI->mapLLVMRegToSEHReg(Reg, SEH);
+ }
+}
+
+MCSubtargetInfo *X86_MC::createX86MCSubtargetInfo(const Triple &TT,
+ StringRef CPU, StringRef FS) {
+ std::string ArchFS = X86_MC::ParseX86Triple(TT);
+ if (!FS.empty()) {
+ if (!ArchFS.empty())
+ ArchFS = (Twine(ArchFS) + "," + FS).str();
+ else
+ ArchFS = FS;
+ }
+
+ std::string CPUName = CPU;
+ if (CPUName.empty())
+ CPUName = "generic";
+
+ return createX86MCSubtargetInfoImpl(TT, CPUName, ArchFS);
+}
+
+static MCInstrInfo *createX86MCInstrInfo() {
+ MCInstrInfo *X = new MCInstrInfo();
+ InitX86MCInstrInfo(X);
+ return X;
+}
+
+static MCRegisterInfo *createX86MCRegisterInfo(const Triple &TT) {
+ unsigned RA = (TT.getArch() == Triple::x86_64)
+ ? X86::RIP // Should have dwarf #16.
+ : X86::EIP; // Should have dwarf #8.
+
+ MCRegisterInfo *X = new MCRegisterInfo();
+ InitX86MCRegisterInfo(X, RA, X86_MC::getDwarfRegFlavour(TT, false),
+ X86_MC::getDwarfRegFlavour(TT, true), RA);
+ X86_MC::InitLLVM2SEHRegisterMapping(X);
+ return X;
+}
+
+static MCAsmInfo *createX86MCAsmInfo(const MCRegisterInfo &MRI,
+ const Triple &TheTriple) {
+ bool is64Bit = TheTriple.getArch() == Triple::x86_64;
+
+ MCAsmInfo *MAI;
+ if (TheTriple.isOSBinFormatMachO()) {
+ if (is64Bit)
+ MAI = new X86_64MCAsmInfoDarwin(TheTriple);
+ else
+ MAI = new X86MCAsmInfoDarwin(TheTriple);
+ } else if (TheTriple.isOSBinFormatELF()) {
+ // Force the use of an ELF container.
+ MAI = new X86ELFMCAsmInfo(TheTriple);
+ } else if (TheTriple.isWindowsMSVCEnvironment() ||
+ TheTriple.isWindowsCoreCLREnvironment()) {
+ MAI = new X86MCAsmInfoMicrosoft(TheTriple);
+ } else if (TheTriple.isOSCygMing() ||
+ TheTriple.isWindowsItaniumEnvironment()) {
+ MAI = new X86MCAsmInfoGNUCOFF(TheTriple);
+ } else {
+ // The default is ELF.
+ MAI = new X86ELFMCAsmInfo(TheTriple);
+ }
+
+  // Set up the initial frame state.
+  // Calculate the number of bytes used to store the return address.
+ int stackGrowth = is64Bit ? -8 : -4;
+
+ // Initial state of the frame pointer is esp+stackGrowth.
+ unsigned StackPtr = is64Bit ? X86::RSP : X86::ESP;
+ MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(
+ nullptr, MRI.getDwarfRegNum(StackPtr, true), -stackGrowth);
+ MAI->addInitialFrameState(Inst);
+
+ // Add return address to move list
+ unsigned InstPtr = is64Bit ? X86::RIP : X86::EIP;
+ MCCFIInstruction Inst2 = MCCFIInstruction::createOffset(
+ nullptr, MRI.getDwarfRegNum(InstPtr, true), stackGrowth);
+ MAI->addInitialFrameState(Inst2);
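+  // On x86-64, for instance, this records that the CFA is initially %rsp+8
+  // and that the return address is stored at CFA-8.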
+
+ return MAI;
+}
+
+static MCCodeGenInfo *createX86MCCodeGenInfo(const Triple &TT, Reloc::Model RM,
+ CodeModel::Model CM,
+ CodeGenOpt::Level OL) {
+ MCCodeGenInfo *X = new MCCodeGenInfo();
+
+ bool is64Bit = TT.getArch() == Triple::x86_64;
+
+ if (RM == Reloc::Default) {
+    // Darwin defaults to PIC in 64-bit mode and dynamic-no-pic in 32-bit
+    // mode. Win64 requires rip-relative addressing, so we force PIC there.
+    // Otherwise we use the static relocation model by default.
+ if (TT.isOSDarwin()) {
+ if (is64Bit)
+ RM = Reloc::PIC_;
+ else
+ RM = Reloc::DynamicNoPIC;
+ } else if (TT.isOSWindows() && is64Bit)
+ RM = Reloc::PIC_;
+ else
+ RM = Reloc::Static;
+ }
+
+  // ELF and X86-64 don't have a distinct DynamicNoPIC model. DynamicNoPIC
+  // is defined as a model for code which may be used in static or dynamic
+  // executables but not necessarily a shared library. On X86-32 we just
+  // compile in -static mode; on X86-64 we use PIC.
+ if (RM == Reloc::DynamicNoPIC) {
+ if (is64Bit)
+ RM = Reloc::PIC_;
+ else if (!TT.isOSDarwin())
+ RM = Reloc::Static;
+ }
+
+  // If we are on Darwin, disallow the static relocation model in X86-64 mode,
+  // since the Mach-O file format doesn't support it.
+ if (RM == Reloc::Static && TT.isOSDarwin() && is64Bit)
+ RM = Reloc::PIC_;
+
+ // For static codegen, if we're not already set, use Small codegen.
+ if (CM == CodeModel::Default)
+ CM = CodeModel::Small;
+ else if (CM == CodeModel::JITDefault)
+ // 64-bit JIT places everything in the same buffer except external funcs.
+ CM = is64Bit ? CodeModel::Large : CodeModel::Small;
+
+ X->initMCCodeGenInfo(RM, CM, OL);
+ return X;
+}
+
+static MCInstPrinter *createX86MCInstPrinter(const Triple &T,
+ unsigned SyntaxVariant,
+ const MCAsmInfo &MAI,
+ const MCInstrInfo &MII,
+ const MCRegisterInfo &MRI) {
+ if (SyntaxVariant == 0)
+ return new X86ATTInstPrinter(MAI, MII, MRI);
+ if (SyntaxVariant == 1)
+ return new X86IntelInstPrinter(MAI, MII, MRI);
+ return nullptr;
+}
+
+static MCRelocationInfo *createX86MCRelocationInfo(const Triple &TheTriple,
+ MCContext &Ctx) {
+ if (TheTriple.isOSBinFormatMachO() && TheTriple.getArch() == Triple::x86_64)
+ return createX86_64MachORelocationInfo(Ctx);
+ else if (TheTriple.isOSBinFormatELF())
+ return createX86_64ELFRelocationInfo(Ctx);
+ // Default to the stock relocation info.
+ return llvm::createMCRelocationInfo(TheTriple, Ctx);
+}
+
+static MCInstrAnalysis *createX86MCInstrAnalysis(const MCInstrInfo *Info) {
+ return new MCInstrAnalysis(Info);
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeX86TargetMC() {
+ for (Target *T : {&TheX86_32Target, &TheX86_64Target}) {
+ // Register the MC asm info.
+ RegisterMCAsmInfoFn X(*T, createX86MCAsmInfo);
+
+ // Register the MC codegen info.
+ RegisterMCCodeGenInfoFn Y(*T, createX86MCCodeGenInfo);
+
+ // Register the MC instruction info.
+ TargetRegistry::RegisterMCInstrInfo(*T, createX86MCInstrInfo);
+
+ // Register the MC register info.
+ TargetRegistry::RegisterMCRegInfo(*T, createX86MCRegisterInfo);
+
+ // Register the MC subtarget info.
+ TargetRegistry::RegisterMCSubtargetInfo(*T,
+ X86_MC::createX86MCSubtargetInfo);
+
+ // Register the MC instruction analyzer.
+ TargetRegistry::RegisterMCInstrAnalysis(*T, createX86MCInstrAnalysis);
+
+ // Register the code emitter.
+ TargetRegistry::RegisterMCCodeEmitter(*T, createX86MCCodeEmitter);
+
+ // Register the object streamer.
+ TargetRegistry::RegisterCOFFStreamer(*T, createX86WinCOFFStreamer);
+
+ // Register the MCInstPrinter.
+ TargetRegistry::RegisterMCInstPrinter(*T, createX86MCInstPrinter);
+
+ // Register the MC relocation info.
+ TargetRegistry::RegisterMCRelocationInfo(*T, createX86MCRelocationInfo);
+ }
+
+ // Register the asm backend.
+ TargetRegistry::RegisterMCAsmBackend(TheX86_32Target,
+ createX86_32AsmBackend);
+ TargetRegistry::RegisterMCAsmBackend(TheX86_64Target,
+ createX86_64AsmBackend);
+}
+
+unsigned llvm::getX86SubSuperRegisterOrZero(unsigned Reg, unsigned Size,
+ bool High) {
+ switch (Size) {
+ default: return 0;
+ case 8:
+ if (High) {
+ switch (Reg) {
+ default: return getX86SubSuperRegisterOrZero(Reg, 64);
+ case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
+ return X86::SI;
+ case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
+ return X86::DI;
+ case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
+ return X86::BP;
+ case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
+ return X86::SP;
+ case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
+ return X86::AH;
+ case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
+ return X86::DH;
+ case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
+ return X86::CH;
+ case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
+ return X86::BH;
+ }
+ } else {
+ switch (Reg) {
+ default: return 0;
+ case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
+ return X86::AL;
+ case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
+ return X86::DL;
+ case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
+ return X86::CL;
+ case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
+ return X86::BL;
+ case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
+ return X86::SIL;
+ case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
+ return X86::DIL;
+ case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
+ return X86::BPL;
+ case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
+ return X86::SPL;
+ case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8:
+ return X86::R8B;
+ case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9:
+ return X86::R9B;
+ case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10:
+ return X86::R10B;
+ case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11:
+ return X86::R11B;
+ case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12:
+ return X86::R12B;
+ case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13:
+ return X86::R13B;
+ case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14:
+ return X86::R14B;
+ case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15:
+ return X86::R15B;
+ }
+ }
+ case 16:
+ switch (Reg) {
+ default: return 0;
+ case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
+ return X86::AX;
+ case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
+ return X86::DX;
+ case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
+ return X86::CX;
+ case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
+ return X86::BX;
+ case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
+ return X86::SI;
+ case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
+ return X86::DI;
+ case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
+ return X86::BP;
+ case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
+ return X86::SP;
+ case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8:
+ return X86::R8W;
+ case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9:
+ return X86::R9W;
+ case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10:
+ return X86::R10W;
+ case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11:
+ return X86::R11W;
+ case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12:
+ return X86::R12W;
+ case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13:
+ return X86::R13W;
+ case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14:
+ return X86::R14W;
+ case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15:
+ return X86::R15W;
+ }
+ case 32:
+ switch (Reg) {
+ default: return 0;
+ case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
+ return X86::EAX;
+ case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
+ return X86::EDX;
+ case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
+ return X86::ECX;
+ case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
+ return X86::EBX;
+ case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
+ return X86::ESI;
+ case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
+ return X86::EDI;
+ case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
+ return X86::EBP;
+ case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
+ return X86::ESP;
+ case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8:
+ return X86::R8D;
+ case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9:
+ return X86::R9D;
+ case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10:
+ return X86::R10D;
+ case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11:
+ return X86::R11D;
+ case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12:
+ return X86::R12D;
+ case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13:
+ return X86::R13D;
+ case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14:
+ return X86::R14D;
+ case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15:
+ return X86::R15D;
+ }
+ case 64:
+ switch (Reg) {
+ default: return 0;
+ case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
+ return X86::RAX;
+ case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
+ return X86::RDX;
+ case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
+ return X86::RCX;
+ case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
+ return X86::RBX;
+ case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
+ return X86::RSI;
+ case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
+ return X86::RDI;
+ case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
+ return X86::RBP;
+ case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
+ return X86::RSP;
+ case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8:
+ return X86::R8;
+ case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9:
+ return X86::R9;
+ case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10:
+ return X86::R10;
+ case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11:
+ return X86::R11;
+ case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12:
+ return X86::R12;
+ case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13:
+ return X86::R13;
+ case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14:
+ return X86::R14;
+ case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15:
+ return X86::R15;
+ }
+ }
+}
+
+unsigned llvm::getX86SubSuperRegister(unsigned Reg, unsigned Size, bool High) {
+ unsigned Res = getX86SubSuperRegisterOrZero(Reg, Size, High);
+ assert(Res != 0 && "Unexpected register or VT");
+ return Res;
+}
+
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
new file mode 100644
index 0000000..2d2836f
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
@@ -0,0 +1,129 @@
+//===-- X86MCTargetDesc.h - X86 Target Descriptions -------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides X86 specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_MCTARGETDESC_X86MCTARGETDESC_H
+#define LLVM_LIB_TARGET_X86_MCTARGETDESC_X86MCTARGETDESC_H
+
+#include "llvm/Support/DataTypes.h"
+#include <string>
+
+namespace llvm {
+class MCAsmBackend;
+class MCCodeEmitter;
+class MCContext;
+class MCInstrInfo;
+class MCObjectWriter;
+class MCRegisterInfo;
+class MCSubtargetInfo;
+class MCRelocationInfo;
+class MCStreamer;
+class Target;
+class Triple;
+class StringRef;
+class raw_ostream;
+class raw_pwrite_stream;
+
+extern Target TheX86_32Target, TheX86_64Target;
+
+/// Flavour of DWARF register numbers
+///
+namespace DWARFFlavour {
+ enum {
+ X86_64 = 0, X86_32_DarwinEH = 1, X86_32_Generic = 2
+ };
+}
+
+/// Native X86 register numbers
+///
+namespace N86 {
+ enum {
+ EAX = 0, ECX = 1, EDX = 2, EBX = 3, ESP = 4, EBP = 5, ESI = 6, EDI = 7
+ };
+}
+
+namespace X86_MC {
+std::string ParseX86Triple(const Triple &TT);
+
+unsigned getDwarfRegFlavour(const Triple &TT, bool isEH);
+
+void InitLLVM2SEHRegisterMapping(MCRegisterInfo *MRI);
+
+/// Create an X86 MCSubtargetInfo instance. This is exposed so that the asm
+/// parser, etc., do not need to go through TargetRegistry.
+MCSubtargetInfo *createX86MCSubtargetInfo(const Triple &TT, StringRef CPU,
+ StringRef FS);
+}
+
+MCCodeEmitter *createX86MCCodeEmitter(const MCInstrInfo &MCII,
+ const MCRegisterInfo &MRI,
+ MCContext &Ctx);
+
+MCAsmBackend *createX86_32AsmBackend(const Target &T, const MCRegisterInfo &MRI,
+ const Triple &TT, StringRef CPU);
+MCAsmBackend *createX86_64AsmBackend(const Target &T, const MCRegisterInfo &MRI,
+ const Triple &TT, StringRef CPU);
+
+/// Construct an X86 Windows COFF machine code streamer which will generate
+/// PE/COFF format object files.
+///
+/// Takes ownership of \p AB and \p CE.
+MCStreamer *createX86WinCOFFStreamer(MCContext &C, MCAsmBackend &AB,
+ raw_pwrite_stream &OS, MCCodeEmitter *CE,
+ bool RelaxAll, bool IncrementalLinkerCompatible);
+
+/// Construct an X86 Mach-O object writer.
+MCObjectWriter *createX86MachObjectWriter(raw_pwrite_stream &OS, bool Is64Bit,
+ uint32_t CPUType,
+ uint32_t CPUSubtype);
+
+/// Construct an X86 ELF object writer.
+MCObjectWriter *createX86ELFObjectWriter(raw_pwrite_stream &OS, bool IsELF64,
+ uint8_t OSABI, uint16_t EMachine);
+/// Construct an X86 Win COFF object writer.
+MCObjectWriter *createX86WinCOFFObjectWriter(raw_pwrite_stream &OS,
+ bool Is64Bit);
+
+/// Construct X86-64 Mach-O relocation info.
+MCRelocationInfo *createX86_64MachORelocationInfo(MCContext &Ctx);
+
+/// Construct X86-64 ELF relocation info.
+MCRelocationInfo *createX86_64ELFRelocationInfo(MCContext &Ctx);
+
+/// Returns the sub or super register of a specific X86 register.
+/// e.g. getX86SubSuperRegister(X86::EAX, 16) returns X86::AX.
+/// Aborts on error.
+unsigned getX86SubSuperRegister(unsigned, unsigned, bool High=false);
+
+/// Returns the sub or super register of a specific X86 register.
+/// Like getX86SubSuperRegister() but returns 0 on error.
+unsigned getX86SubSuperRegisterOrZero(unsigned, unsigned,
+ bool High = false);
+
+} // End llvm namespace
+
+
+// Defines symbolic names for X86 registers. This defines a mapping from
+// register name to register number.
+//
+#define GET_REGINFO_ENUM
+#include "X86GenRegisterInfo.inc"
+
+// Defines symbolic names for the X86 instructions.
+//
+#define GET_INSTRINFO_ENUM
+#include "X86GenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_ENUM
+#include "X86GenSubtargetInfo.inc"
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp
new file mode 100644
index 0000000..9bfe999
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp
@@ -0,0 +1,119 @@
+//===-- X86MachORelocationInfo.cpp ----------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCRelocationInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Object/MachO.h"
+
+using namespace llvm;
+using namespace object;
+using namespace MachO;
+
+namespace {
+class X86_64MachORelocationInfo : public MCRelocationInfo {
+public:
+ X86_64MachORelocationInfo(MCContext &Ctx) : MCRelocationInfo(Ctx) {}
+
+ const MCExpr *createExprForRelocation(RelocationRef Rel) override {
+ const MachOObjectFile *Obj = cast<MachOObjectFile>(Rel.getObject());
+
+ uint64_t RelType = Rel.getType();
+ symbol_iterator SymI = Rel.getSymbol();
+
+ ErrorOr<StringRef> SymNameOrErr = SymI->getName();
+ if (std::error_code EC = SymNameOrErr.getError())
+ report_fatal_error(EC.message());
+ StringRef SymName = *SymNameOrErr;
+ uint64_t SymAddr = SymI->getValue();
+
+ any_relocation_info RE = Obj->getRelocation(Rel.getRawDataRefImpl());
+ bool isPCRel = Obj->getAnyRelocationPCRel(RE);
+
+ MCSymbol *Sym = Ctx.getOrCreateSymbol(SymName);
+ // FIXME: check that the value is actually the same.
+ if (!Sym->isVariable())
+ Sym->setVariableValue(MCConstantExpr::create(SymAddr, Ctx));
+ const MCExpr *Expr = nullptr;
+
+ switch(RelType) {
+ case X86_64_RELOC_TLV:
+ Expr = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_TLVP, Ctx);
+ break;
+ case X86_64_RELOC_SIGNED_4:
+ Expr = MCBinaryExpr::createAdd(MCSymbolRefExpr::create(Sym, Ctx),
+ MCConstantExpr::create(4, Ctx),
+ Ctx);
+ break;
+ case X86_64_RELOC_SIGNED_2:
+ Expr = MCBinaryExpr::createAdd(MCSymbolRefExpr::create(Sym, Ctx),
+ MCConstantExpr::create(2, Ctx),
+ Ctx);
+ break;
+ case X86_64_RELOC_SIGNED_1:
+ Expr = MCBinaryExpr::createAdd(MCSymbolRefExpr::create(Sym, Ctx),
+ MCConstantExpr::create(1, Ctx),
+ Ctx);
+ break;
+ case X86_64_RELOC_GOT_LOAD:
+ Expr = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_GOTPCREL, Ctx);
+ break;
+ case X86_64_RELOC_GOT:
+ Expr = MCSymbolRefExpr::create(Sym, isPCRel ?
+ MCSymbolRefExpr::VK_GOTPCREL :
+ MCSymbolRefExpr::VK_GOT,
+ Ctx);
+ break;
+ case X86_64_RELOC_SUBTRACTOR:
+ {
+ Rel.moveNext();
+ any_relocation_info RENext =
+ Obj->getRelocation(Rel.getRawDataRefImpl());
+
+ // X86_64_SUBTRACTOR must be followed by a relocation of type
+ // X86_64_RELOC_UNSIGNED.
+ // NOTE: Scattered relocations don't exist on x86_64.
+ unsigned RType = Obj->getAnyRelocationType(RENext);
+ if (RType != X86_64_RELOC_UNSIGNED)
+ report_fatal_error("Expected X86_64_RELOC_UNSIGNED after "
+ "X86_64_RELOC_SUBTRACTOR.");
+
+ const MCExpr *LHS = MCSymbolRefExpr::create(Sym, Ctx);
+
+ symbol_iterator RSymI = Rel.getSymbol();
+ uint64_t RSymAddr = RSymI->getValue();
+ ErrorOr<StringRef> RSymName = RSymI->getName();
+ if (std::error_code EC = RSymName.getError())
+ report_fatal_error(EC.message());
+
+ MCSymbol *RSym = Ctx.getOrCreateSymbol(*RSymName);
+ if (!RSym->isVariable())
+ RSym->setVariableValue(MCConstantExpr::create(RSymAddr, Ctx));
+
+ const MCExpr *RHS = MCSymbolRefExpr::create(RSym, Ctx);
+
+ Expr = MCBinaryExpr::createSub(LHS, RHS, Ctx);
+ break;
+ }
+ default:
+ Expr = MCSymbolRefExpr::create(Sym, Ctx);
+ break;
+ }
+ return Expr;
+ }
+};
+} // End unnamed namespace
+
+/// createX86_64MachORelocationInfo - Construct an X86-64 Mach-O RelocationInfo.
+MCRelocationInfo *llvm::createX86_64MachORelocationInfo(MCContext &Ctx) {
+ return new X86_64MachORelocationInfo(Ctx);
+}
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
new file mode 100644
index 0000000..191ebea
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
@@ -0,0 +1,605 @@
+//===-- X86MachObjectWriter.cpp - X86 Mach-O Writer -----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "MCTargetDesc/X86FixupKinds.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCAsmLayout.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCMachObjectWriter.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/MachO.h"
+
+using namespace llvm;
+
+namespace {
+class X86MachObjectWriter : public MCMachObjectTargetWriter {
+ bool recordScatteredRelocation(MachObjectWriter *Writer,
+ const MCAssembler &Asm,
+ const MCAsmLayout &Layout,
+ const MCFragment *Fragment,
+ const MCFixup &Fixup,
+ MCValue Target,
+ unsigned Log2Size,
+ uint64_t &FixedValue);
+ void recordTLVPRelocation(MachObjectWriter *Writer,
+ const MCAssembler &Asm,
+ const MCAsmLayout &Layout,
+ const MCFragment *Fragment,
+ const MCFixup &Fixup,
+ MCValue Target,
+ uint64_t &FixedValue);
+
+ void RecordX86Relocation(MachObjectWriter *Writer,
+ const MCAssembler &Asm,
+ const MCAsmLayout &Layout,
+ const MCFragment *Fragment,
+ const MCFixup &Fixup,
+ MCValue Target,
+ uint64_t &FixedValue);
+ void RecordX86_64Relocation(MachObjectWriter *Writer, MCAssembler &Asm,
+ const MCAsmLayout &Layout,
+ const MCFragment *Fragment, const MCFixup &Fixup,
+ MCValue Target, uint64_t &FixedValue);
+
+public:
+ X86MachObjectWriter(bool Is64Bit, uint32_t CPUType, uint32_t CPUSubtype)
+ : MCMachObjectTargetWriter(Is64Bit, CPUType, CPUSubtype) {}
+
+ void recordRelocation(MachObjectWriter *Writer, MCAssembler &Asm,
+ const MCAsmLayout &Layout, const MCFragment *Fragment,
+ const MCFixup &Fixup, MCValue Target,
+ uint64_t &FixedValue) override {
+ if (Writer->is64Bit())
+ RecordX86_64Relocation(Writer, Asm, Layout, Fragment, Fixup, Target,
+ FixedValue);
+ else
+ RecordX86Relocation(Writer, Asm, Layout, Fragment, Fixup, Target,
+ FixedValue);
+ }
+};
+}
+
+static bool isFixupKindRIPRel(unsigned Kind) {
+ return Kind == X86::reloc_riprel_4byte ||
+ Kind == X86::reloc_riprel_4byte_movq_load;
+}
+
+static unsigned getFixupKindLog2Size(unsigned Kind) {
+ switch (Kind) {
+ default:
+ llvm_unreachable("invalid fixup kind!");
+ case FK_PCRel_1:
+ case FK_Data_1: return 0;
+ case FK_PCRel_2:
+ case FK_Data_2: return 1;
+ case FK_PCRel_4:
+ // FIXME: Remove these!!!
+ case X86::reloc_riprel_4byte:
+ case X86::reloc_riprel_4byte_movq_load:
+ case X86::reloc_signed_4byte:
+ case FK_Data_4: return 2;
+ case FK_Data_8: return 3;
+ }
+}
+
+void X86MachObjectWriter::RecordX86_64Relocation(
+ MachObjectWriter *Writer, MCAssembler &Asm, const MCAsmLayout &Layout,
+ const MCFragment *Fragment, const MCFixup &Fixup, MCValue Target,
+ uint64_t &FixedValue) {
+ unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind());
+ unsigned IsRIPRel = isFixupKindRIPRel(Fixup.getKind());
+ unsigned Log2Size = getFixupKindLog2Size(Fixup.getKind());
+
+ // See <reloc.h>.
+ uint32_t FixupOffset =
+ Layout.getFragmentOffset(Fragment) + Fixup.getOffset();
+ uint32_t FixupAddress =
+ Writer->getFragmentAddress(Fragment, Layout) + Fixup.getOffset();
+ int64_t Value = 0;
+ unsigned Index = 0;
+ unsigned IsExtern = 0;
+ unsigned Type = 0;
+ const MCSymbol *RelSymbol = nullptr;
+
+ Value = Target.getConstant();
+
+ if (IsPCRel) {
+    // Compensate for the relocation offset. Darwin x86_64 relocations only
+    // have the addend and appear to have attempted to define it to be the
+    // actual expression addend without the PCrel bias. However, instructions
+    // with data following the relocation are not accommodated (see the comment
+    // below regarding SIGNED{1,2,4}), so it isn't exactly that either.
+ Value += 1LL << Log2Size;
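+    // e.g. a 4-byte pc-relative fixup (Log2Size == 2) gets a bias of 4.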
+ }
+
+ if (Target.isAbsolute()) { // constant
+ // SymbolNum of 0 indicates the absolute section.
+ Type = MachO::X86_64_RELOC_UNSIGNED;
+
+    // FIXME: I believe this is broken; I don't think the linker can understand
+    // it. I think it would require a local relocation, but I'm not sure if
+    // that would work either. The official way to get an absolute PCrel
+    // relocation is to use an absolute symbol (which we don't support yet).
+ if (IsPCRel) {
+ IsExtern = 1;
+ Type = MachO::X86_64_RELOC_BRANCH;
+ }
+ } else if (Target.getSymB()) { // A - B + constant
+ const MCSymbol *A = &Target.getSymA()->getSymbol();
+ if (A->isTemporary())
+ A = &Writer->findAliasedSymbol(*A);
+ const MCSymbol *A_Base = Asm.getAtom(*A);
+
+ const MCSymbol *B = &Target.getSymB()->getSymbol();
+ if (B->isTemporary())
+ B = &Writer->findAliasedSymbol(*B);
+ const MCSymbol *B_Base = Asm.getAtom(*B);
+
+ // Neither symbol can be modified.
+ if (Target.getSymA()->getKind() != MCSymbolRefExpr::VK_None ||
+ Target.getSymB()->getKind() != MCSymbolRefExpr::VK_None) {
+ Asm.getContext().reportError(Fixup.getLoc(),
+ "unsupported relocation of modified symbol");
+ return;
+ }
+
+ // We don't support PCrel relocations of differences. Darwin 'as' doesn't
+ // implement most of these correctly.
+ if (IsPCRel) {
+ Asm.getContext().reportError(
+ Fixup.getLoc(), "unsupported pc-relative relocation of difference");
+ return;
+ }
+
+ // The support for the situation where one or both of the symbols would
+ // require a local relocation is handled just like if the symbols were
+ // external. This is certainly used in the case of debug sections where the
+ // section has only temporary symbols and thus the symbols don't have base
+ // symbols. This is encoded using the section ordinal and non-extern
+ // relocation entries.
+
+    // Darwin 'as' doesn't emit correct relocations for this (it ends up with a
+    // single SIGNED relocation); reject it for now, except in the case where
+    // neither symbol has a base (A_Base and B_Base are both null, hence
+    // equal).
+ if (A_Base == B_Base && A_Base) {
+ Asm.getContext().reportError(
+ Fixup.getLoc(), "unsupported relocation with identical base");
+ return;
+ }
+
+ // A subtraction expression where either symbol is undefined is a
+ // non-relocatable expression.
+ if (A->isUndefined() || B->isUndefined()) {
+ StringRef Name = A->isUndefined() ? A->getName() : B->getName();
+ Asm.getContext().reportError(Fixup.getLoc(),
+ "unsupported relocation with subtraction expression, symbol '" +
+ Name + "' can not be undefined in a subtraction expression");
+ return;
+ }
+
+ Value += Writer->getSymbolAddress(*A, Layout) -
+ (!A_Base ? 0 : Writer->getSymbolAddress(*A_Base, Layout));
+ Value -= Writer->getSymbolAddress(*B, Layout) -
+ (!B_Base ? 0 : Writer->getSymbolAddress(*B_Base, Layout));
+
+ if (!A_Base)
+ Index = A->getFragment()->getParent()->getOrdinal() + 1;
+ Type = MachO::X86_64_RELOC_UNSIGNED;
+
+ MachO::any_relocation_info MRE;
+ MRE.r_word0 = FixupOffset;
+ MRE.r_word1 =
+ (Index << 0) | (IsPCRel << 24) | (Log2Size << 25) | (Type << 28);
+ Writer->addRelocation(A_Base, Fragment->getParent(), MRE);
+
+ if (B_Base)
+ RelSymbol = B_Base;
+ else
+ Index = B->getFragment()->getParent()->getOrdinal() + 1;
+ Type = MachO::X86_64_RELOC_SUBTRACTOR;
+ } else {
+ const MCSymbol *Symbol = &Target.getSymA()->getSymbol();
+ if (Symbol->isTemporary() && Value) {
+ const MCSection &Sec = Symbol->getSection();
+ if (!Asm.getContext().getAsmInfo()->isSectionAtomizableBySymbols(Sec))
+ Symbol->setUsedInReloc();
+ }
+ RelSymbol = Asm.getAtom(*Symbol);
+
+ // Relocations inside debug sections always use local relocations when
+ // possible. This seems to be done because the debugger doesn't fully
+ // understand x86_64 relocation entries, and expects to find values that
+ // have already been fixed up.
+ if (Symbol->isInSection()) {
+ const MCSectionMachO &Section =
+ static_cast<const MCSectionMachO &>(*Fragment->getParent());
+ if (Section.hasAttribute(MachO::S_ATTR_DEBUG))
+ RelSymbol = nullptr;
+ }
+
+ // x86_64 almost always uses external relocations, except when there is no
+ // symbol to use as a base address (a local symbol with no preceding
+ // non-local symbol).
+ if (RelSymbol) {
+ // Add the local offset, if needed.
+ if (RelSymbol != Symbol)
+ Value += Layout.getSymbolOffset(*Symbol) -
+ Layout.getSymbolOffset(*RelSymbol);
+ } else if (Symbol->isInSection() && !Symbol->isVariable()) {
+ // The index is the section ordinal (1-based).
+ Index = Symbol->getFragment()->getParent()->getOrdinal() + 1;
+ Value += Writer->getSymbolAddress(*Symbol, Layout);
+
+ if (IsPCRel)
+ Value -= FixupAddress + (1 << Log2Size);
+ } else if (Symbol->isVariable()) {
+ const MCExpr *Value = Symbol->getVariableValue();
+ int64_t Res;
+ bool isAbs = Value->evaluateAsAbsolute(Res, Layout,
+ Writer->getSectionAddressMap());
+ if (isAbs) {
+ FixedValue = Res;
+ return;
+ } else {
+ Asm.getContext().reportError(Fixup.getLoc(),
+ "unsupported relocation of variable '" +
+ Symbol->getName() + "'");
+ return;
+ }
+ } else {
+ Asm.getContext().reportError(
+ Fixup.getLoc(), "unsupported relocation of undefined symbol '" +
+ Symbol->getName() + "'");
+ return;
+ }
+
+ MCSymbolRefExpr::VariantKind Modifier = Target.getSymA()->getKind();
+ if (IsPCRel) {
+ if (IsRIPRel) {
+ if (Modifier == MCSymbolRefExpr::VK_GOTPCREL) {
+ // x86_64 distinguishes movq foo@GOTPCREL so that the linker can
+ // rewrite the movq to an leaq at link time if the symbol ends up in
+ // the same linkage unit.
+ if (unsigned(Fixup.getKind()) == X86::reloc_riprel_4byte_movq_load)
+ Type = MachO::X86_64_RELOC_GOT_LOAD;
+ else
+ Type = MachO::X86_64_RELOC_GOT;
+ } else if (Modifier == MCSymbolRefExpr::VK_TLVP) {
+ Type = MachO::X86_64_RELOC_TLV;
+ } else if (Modifier != MCSymbolRefExpr::VK_None) {
+ Asm.getContext().reportError(
+ Fixup.getLoc(), "unsupported symbol modifier in relocation");
+ return;
+ } else {
+ Type = MachO::X86_64_RELOC_SIGNED;
+
+ // The Darwin x86_64 relocation format has a problem where it cannot
+ // encode an address (L<foo> + <constant>) which is outside the atom
+ // containing L<foo>. Generally, this shouldn't occur but it does
+ // happen when we have a RIPrel instruction with data following the
+ // relocation entry (e.g., movb $012, L0(%rip)). Even with the PCrel
+ // adjustment Darwin x86_64 uses, the offset is still negative and the
+ // linker has no way to recognize this.
+ //
+ // To work around this, Darwin uses several special relocation types
+ // to indicate the offsets. However, the specification or
+ // implementation of these seems to also be incomplete; they should
+ // adjust the addend as well based on the actual encoded instruction
+ // (the additional bias), but instead appear to just look at the final
+ // offset.
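+          //
+          // For example (sketching the arithmetic above): for
+          // movb $0x12, L0(%rip), the 4-byte displacement is followed by one
+          // byte of immediate data, so the constant should be -5 (the -4
+          // PCrel bias minus one trailing byte), and -(-5 + 4) == 1 selects
+          // X86_64_RELOC_SIGNED_1.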
+ switch (-(Target.getConstant() + (1LL << Log2Size))) {
+ case 1: Type = MachO::X86_64_RELOC_SIGNED_1; break;
+ case 2: Type = MachO::X86_64_RELOC_SIGNED_2; break;
+ case 4: Type = MachO::X86_64_RELOC_SIGNED_4; break;
+ }
+ }
+ } else {
+ if (Modifier != MCSymbolRefExpr::VK_None) {
+ Asm.getContext().reportError(
+ Fixup.getLoc(),
+ "unsupported symbol modifier in branch relocation");
+ return;
+ }
+
+ Type = MachO::X86_64_RELOC_BRANCH;
+ }
+ } else {
+ if (Modifier == MCSymbolRefExpr::VK_GOT) {
+ Type = MachO::X86_64_RELOC_GOT;
+ } else if (Modifier == MCSymbolRefExpr::VK_GOTPCREL) {
+ // GOTPCREL is allowed as a modifier on non-PCrel instructions, in which
+ // case all we do is set the PCrel bit in the relocation entry; this is
+ // used with exception handling, for example. The source is required to
+ // include any necessary offset directly.
+ Type = MachO::X86_64_RELOC_GOT;
+ IsPCRel = 1;
+ } else if (Modifier == MCSymbolRefExpr::VK_TLVP) {
+ Asm.getContext().reportError(
+ Fixup.getLoc(), "TLVP symbol modifier should have been rip-rel");
+ return;
+ } else if (Modifier != MCSymbolRefExpr::VK_None) {
+ Asm.getContext().reportError(
+ Fixup.getLoc(), "unsupported symbol modifier in relocation");
+ return;
+ } else {
+ Type = MachO::X86_64_RELOC_UNSIGNED;
+ unsigned Kind = Fixup.getKind();
+ if (Kind == X86::reloc_signed_4byte) {
+ Asm.getContext().reportError(
+ Fixup.getLoc(),
+ "32-bit absolute addressing is not supported in 64-bit mode");
+ return;
+ }
+ }
+ }
+ }
+
+ // x86_64 always writes custom values into the fixups.
+ FixedValue = Value;
+
+ // struct relocation_info (8 bytes)
+ MachO::any_relocation_info MRE;
+ MRE.r_word0 = FixupOffset;
+ MRE.r_word1 = (Index << 0) | (IsPCRel << 24) | (Log2Size << 25) |
+ (IsExtern << 27) | (Type << 28);
+ Writer->addRelocation(RelSymbol, Fragment->getParent(), MRE);
+}
+
+bool X86MachObjectWriter::recordScatteredRelocation(MachObjectWriter *Writer,
+ const MCAssembler &Asm,
+ const MCAsmLayout &Layout,
+ const MCFragment *Fragment,
+ const MCFixup &Fixup,
+ MCValue Target,
+ unsigned Log2Size,
+ uint64_t &FixedValue) {
+ uint64_t OriginalFixedValue = FixedValue;
+ uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset();
+ unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind());
+ unsigned Type = MachO::GENERIC_RELOC_VANILLA;
+
+ // See <reloc.h>.
+ const MCSymbol *A = &Target.getSymA()->getSymbol();
+
+ if (!A->getFragment()) {
+ Asm.getContext().reportError(
+ Fixup.getLoc(),
+ "symbol '" + A->getName() +
+ "' can not be undefined in a subtraction expression");
+ return false;
+ }
+
+ uint32_t Value = Writer->getSymbolAddress(*A, Layout);
+ uint64_t SecAddr = Writer->getSectionAddress(A->getFragment()->getParent());
+ FixedValue += SecAddr;
+ uint32_t Value2 = 0;
+
+ if (const MCSymbolRefExpr *B = Target.getSymB()) {
+ const MCSymbol *SB = &B->getSymbol();
+
+ if (!SB->getFragment()) {
+ Asm.getContext().reportError(
+ Fixup.getLoc(),
+ "symbol '" + B->getSymbol().getName() +
+ "' can not be undefined in a subtraction expression");
+ return false;
+ }
+
+ // Select the appropriate difference relocation type.
+ //
+ // Note that there is no longer any semantic difference between these two
+ // relocation types from the linkers point of view, this is done solely for
+ // pedantic compatibility with 'as'.
+ Type = A->isExternal() ? (unsigned)MachO::GENERIC_RELOC_SECTDIFF
+ : (unsigned)MachO::GENERIC_RELOC_LOCAL_SECTDIFF;
+ Value2 = Writer->getSymbolAddress(B->getSymbol(), Layout);
+ FixedValue -= Writer->getSectionAddress(SB->getFragment()->getParent());
+ }
+
+ // Relocations are written out in reverse order, so the PAIR comes first.
+ if (Type == MachO::GENERIC_RELOC_SECTDIFF ||
+ Type == MachO::GENERIC_RELOC_LOCAL_SECTDIFF) {
+ // If the offset is too large to fit in a scattered relocation,
+ // we're hosed. It's an unfortunate limitation of the MachO format.
+ if (FixupOffset > 0xffffff) {
+ char Buffer[32];
+ format("0x%x", FixupOffset).print(Buffer, sizeof(Buffer));
+ Asm.getContext().reportError(Fixup.getLoc(),
+ Twine("Section too large, can't encode "
+ "r_address (") + Buffer +
+ ") into 24 bits of scattered "
+ "relocation entry.");
+ return false;
+ }
+
+ MachO::any_relocation_info MRE;
+ MRE.r_word0 = ((0 << 0) | // r_address
+ (MachO::GENERIC_RELOC_PAIR << 24) | // r_type
+ (Log2Size << 28) |
+ (IsPCRel << 30) |
+ MachO::R_SCATTERED);
+ MRE.r_word1 = Value2;
+ Writer->addRelocation(nullptr, Fragment->getParent(), MRE);
+ } else {
+    // If the offset is more than 24 bits, it won't fit in a scattered
+    // relocation offset field, so we fall back to using a non-scattered
+ // relocation. This is a bit risky, as if the offset reaches out of
+ // the block and the linker is doing scattered loading on this
+ // symbol, things can go badly.
+ //
+ // Required for 'as' compatibility.
+ if (FixupOffset > 0xffffff) {
+ FixedValue = OriginalFixedValue;
+ return false;
+ }
+ }
+
+ MachO::any_relocation_info MRE;
+ MRE.r_word0 = ((FixupOffset << 0) |
+ (Type << 24) |
+ (Log2Size << 28) |
+ (IsPCRel << 30) |
+ MachO::R_SCATTERED);
+ MRE.r_word1 = Value;
+ Writer->addRelocation(nullptr, Fragment->getParent(), MRE);
+ return true;
+}
+
+void X86MachObjectWriter::recordTLVPRelocation(MachObjectWriter *Writer,
+ const MCAssembler &Asm,
+ const MCAsmLayout &Layout,
+ const MCFragment *Fragment,
+ const MCFixup &Fixup,
+ MCValue Target,
+ uint64_t &FixedValue) {
+ assert(Target.getSymA()->getKind() == MCSymbolRefExpr::VK_TLVP &&
+ !is64Bit() &&
+ "Should only be called with a 32-bit TLVP relocation!");
+
+ unsigned Log2Size = getFixupKindLog2Size(Fixup.getKind());
+ uint32_t Value = Layout.getFragmentOffset(Fragment)+Fixup.getOffset();
+ unsigned IsPCRel = 0;
+
+ // We're only going to have a second symbol in pic mode and it'll be a
+ // subtraction from the picbase. For 32-bit pic the addend is the difference
+ // between the picbase and the next address. For 32-bit static the addend is
+ // zero.
+ if (Target.getSymB()) {
+ // If this is a subtraction then we're pcrel.
+ uint32_t FixupAddress =
+ Writer->getFragmentAddress(Fragment, Layout) + Fixup.getOffset();
+ IsPCRel = 1;
+ FixedValue =
+ FixupAddress -
+ Writer->getSymbolAddress(Target.getSymB()->getSymbol(), Layout) +
+ Target.getConstant();
+ FixedValue += 1ULL << Log2Size;
+ } else {
+ FixedValue = 0;
+ }
+
+ // struct relocation_info (8 bytes)
+ MachO::any_relocation_info MRE;
+ MRE.r_word0 = Value;
+ MRE.r_word1 =
+ (IsPCRel << 24) | (Log2Size << 25) | (MachO::GENERIC_RELOC_TLV << 28);
+ Writer->addRelocation(&Target.getSymA()->getSymbol(), Fragment->getParent(),
+ MRE);
+}
+
+void X86MachObjectWriter::RecordX86Relocation(MachObjectWriter *Writer,
+ const MCAssembler &Asm,
+ const MCAsmLayout &Layout,
+ const MCFragment *Fragment,
+ const MCFixup &Fixup,
+ MCValue Target,
+ uint64_t &FixedValue) {
+ unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind());
+ unsigned Log2Size = getFixupKindLog2Size(Fixup.getKind());
+
+ // If this is a 32-bit TLVP reloc it's handled a bit differently.
+ if (Target.getSymA() &&
+ Target.getSymA()->getKind() == MCSymbolRefExpr::VK_TLVP) {
+ recordTLVPRelocation(Writer, Asm, Layout, Fragment, Fixup, Target,
+ FixedValue);
+ return;
+ }
+
+ // If this is a difference or a defined symbol plus an offset, then we need a
+ // scattered relocation entry. Differences always require scattered
+ // relocations.
+ if (Target.getSymB()) {
+ recordScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup,
+ Target, Log2Size, FixedValue);
+ return;
+ }
+
+ // Get the symbol data, if any.
+ const MCSymbol *A = nullptr;
+ if (Target.getSymA())
+ A = &Target.getSymA()->getSymbol();
+
+ // If this is an internal relocation with an offset, it also needs a scattered
+ // relocation entry.
+ uint32_t Offset = Target.getConstant();
+ if (IsPCRel)
+ Offset += 1 << Log2Size;
+  // Try to record the scattered relocation if needed. Fall back to a
+  // non-scattered relocation if necessary (see the comments in
+  // recordScatteredRelocation() for details).
+ if (Offset && A && !Writer->doesSymbolRequireExternRelocation(*A) &&
+ recordScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup, Target,
+ Log2Size, FixedValue))
+ return;
+
+ // See <reloc.h>.
+ uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset();
+ unsigned Index = 0;
+ unsigned Type = 0;
+ const MCSymbol *RelSymbol = nullptr;
+
+ if (Target.isAbsolute()) { // constant
+ // SymbolNum of 0 indicates the absolute section.
+ //
+ // FIXME: Currently, these are never generated (see code below). I cannot
+ // find a case where they are actually emitted.
+ Type = MachO::GENERIC_RELOC_VANILLA;
+ } else {
+ // Resolve constant variables.
+ if (A->isVariable()) {
+ int64_t Res;
+ if (A->getVariableValue()->evaluateAsAbsolute(
+ Res, Layout, Writer->getSectionAddressMap())) {
+ FixedValue = Res;
+ return;
+ }
+ }
+
+ // Check whether we need an external or internal relocation.
+ if (Writer->doesSymbolRequireExternRelocation(*A)) {
+ RelSymbol = A;
+ // For external relocations, make sure to offset the fixup value to
+ // compensate for the addend of the symbol address, if it was
+ // undefined. This occurs with weak definitions, for example.
+ if (!A->isUndefined())
+ FixedValue -= Layout.getSymbolOffset(*A);
+ } else {
+ // The index is the section ordinal (1-based).
+ const MCSection &Sec = A->getSection();
+ Index = Sec.getOrdinal() + 1;
+ FixedValue += Writer->getSectionAddress(&Sec);
+ }
+ if (IsPCRel)
+ FixedValue -= Writer->getSectionAddress(Fragment->getParent());
+
+ Type = MachO::GENERIC_RELOC_VANILLA;
+ }
+
+ // struct relocation_info (8 bytes)
+ MachO::any_relocation_info MRE;
+ MRE.r_word0 = FixupOffset;
+ MRE.r_word1 =
+ (Index << 0) | (IsPCRel << 24) | (Log2Size << 25) | (Type << 28);
+ Writer->addRelocation(RelSymbol, Fragment->getParent(), MRE);
+}
+
+MCObjectWriter *llvm::createX86MachObjectWriter(raw_pwrite_stream &OS,
+ bool Is64Bit, uint32_t CPUType,
+ uint32_t CPUSubtype) {
+ return createMachObjectWriter(new X86MachObjectWriter(Is64Bit,
+ CPUType,
+ CPUSubtype),
+ OS, /*IsLittleEndian=*/true);
+}
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
new file mode 100644
index 0000000..bd1bc99
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
@@ -0,0 +1,97 @@
+//===-- X86WinCOFFObjectWriter.cpp - X86 Win COFF Writer ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/X86FixupKinds.h"
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/MC/MCWinCOFFObjectWriter.h"
+#include "llvm/Support/COFF.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+
+namespace llvm {
+ class MCObjectWriter;
+}
+
+namespace {
+ class X86WinCOFFObjectWriter : public MCWinCOFFObjectTargetWriter {
+ public:
+ X86WinCOFFObjectWriter(bool Is64Bit);
+ ~X86WinCOFFObjectWriter() override;
+
+ unsigned getRelocType(const MCValue &Target, const MCFixup &Fixup,
+ bool IsCrossSection,
+ const MCAsmBackend &MAB) const override;
+ };
+}
+
+X86WinCOFFObjectWriter::X86WinCOFFObjectWriter(bool Is64Bit)
+ : MCWinCOFFObjectTargetWriter(Is64Bit ? COFF::IMAGE_FILE_MACHINE_AMD64
+ : COFF::IMAGE_FILE_MACHINE_I386) {}
+
+X86WinCOFFObjectWriter::~X86WinCOFFObjectWriter() {}
+
+unsigned X86WinCOFFObjectWriter::getRelocType(const MCValue &Target,
+ const MCFixup &Fixup,
+ bool IsCrossSection,
+ const MCAsmBackend &MAB) const {
+ unsigned FixupKind = IsCrossSection ? FK_PCRel_4 : Fixup.getKind();
+
+ MCSymbolRefExpr::VariantKind Modifier = Target.isAbsolute() ?
+ MCSymbolRefExpr::VK_None : Target.getSymA()->getKind();
+
+ if (getMachine() == COFF::IMAGE_FILE_MACHINE_AMD64) {
+ switch (FixupKind) {
+ case FK_PCRel_4:
+ case X86::reloc_riprel_4byte:
+ case X86::reloc_riprel_4byte_movq_load:
+ return COFF::IMAGE_REL_AMD64_REL32;
+ case FK_Data_4:
+ case X86::reloc_signed_4byte:
+ if (Modifier == MCSymbolRefExpr::VK_COFF_IMGREL32)
+ return COFF::IMAGE_REL_AMD64_ADDR32NB;
+ return COFF::IMAGE_REL_AMD64_ADDR32;
+ case FK_Data_8:
+ return COFF::IMAGE_REL_AMD64_ADDR64;
+ case FK_SecRel_2:
+ return COFF::IMAGE_REL_AMD64_SECTION;
+ case FK_SecRel_4:
+ return COFF::IMAGE_REL_AMD64_SECREL;
+ default:
+ llvm_unreachable("unsupported relocation type");
+ }
+ } else if (getMachine() == COFF::IMAGE_FILE_MACHINE_I386) {
+ switch (FixupKind) {
+ case FK_PCRel_4:
+ case X86::reloc_riprel_4byte:
+ case X86::reloc_riprel_4byte_movq_load:
+ return COFF::IMAGE_REL_I386_REL32;
+ case FK_Data_4:
+ case X86::reloc_signed_4byte:
+ if (Modifier == MCSymbolRefExpr::VK_COFF_IMGREL32)
+ return COFF::IMAGE_REL_I386_DIR32NB;
+ return COFF::IMAGE_REL_I386_DIR32;
+ case FK_SecRel_2:
+ return COFF::IMAGE_REL_I386_SECTION;
+ case FK_SecRel_4:
+ return COFF::IMAGE_REL_I386_SECREL;
+ default:
+ llvm_unreachable("unsupported relocation type");
+ }
+ } else
+ llvm_unreachable("Unsupported COFF machine type.");
+}
+
+MCObjectWriter *llvm::createX86WinCOFFObjectWriter(raw_pwrite_stream &OS,
+ bool Is64Bit) {
+ MCWinCOFFObjectTargetWriter *MOTW = new X86WinCOFFObjectWriter(Is64Bit);
+ return createWinCOFFObjectWriter(MOTW, OS);
+}
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp
new file mode 100644
index 0000000..d045118
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp
@@ -0,0 +1,60 @@
+//===-- X86WinCOFFStreamer.cpp - X86 Target WinCOFF Streamer ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86MCTargetDesc.h"
+#include "llvm/MC/MCWin64EH.h"
+#include "llvm/MC/MCWinCOFFStreamer.h"
+
+using namespace llvm;
+
+namespace {
+class X86WinCOFFStreamer : public MCWinCOFFStreamer {
+ Win64EH::UnwindEmitter EHStreamer;
+public:
+ X86WinCOFFStreamer(MCContext &C, MCAsmBackend &AB, MCCodeEmitter *CE,
+ raw_pwrite_stream &OS)
+ : MCWinCOFFStreamer(C, AB, *CE, OS) {}
+
+ void EmitWinEHHandlerData() override;
+ void EmitWindowsUnwindTables() override;
+ void FinishImpl() override;
+};
+
+void X86WinCOFFStreamer::EmitWinEHHandlerData() {
+ MCStreamer::EmitWinEHHandlerData();
+
+ // We have to emit the unwind info now, because this directive
+ // actually switches to the .xdata section!
+ EHStreamer.EmitUnwindInfo(*this, getCurrentWinFrameInfo());
+}
+
+void X86WinCOFFStreamer::EmitWindowsUnwindTables() {
+ if (!getNumWinFrameInfos())
+ return;
+ EHStreamer.Emit(*this);
+}
+
+void X86WinCOFFStreamer::FinishImpl() {
+ EmitFrames(nullptr);
+ EmitWindowsUnwindTables();
+
+ MCWinCOFFStreamer::FinishImpl();
+}
+}
+
+MCStreamer *llvm::createX86WinCOFFStreamer(MCContext &C, MCAsmBackend &AB,
+ raw_pwrite_stream &OS,
+ MCCodeEmitter *CE, bool RelaxAll,
+ bool IncrementalLinkerCompatible) {
+ X86WinCOFFStreamer *S = new X86WinCOFFStreamer(C, AB, CE, OS);
+ S->getAssembler().setRelaxAll(RelaxAll);
+ S->getAssembler().setIncrementalLinkerCompatible(IncrementalLinkerCompatible);
+ return S;
+}
+
diff --git a/contrib/llvm/lib/Target/X86/TargetInfo/X86TargetInfo.cpp b/contrib/llvm/lib/Target/X86/TargetInfo/X86TargetInfo.cpp
new file mode 100644
index 0000000..fceb083
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/TargetInfo/X86TargetInfo.cpp
@@ -0,0 +1,22 @@
+//===-- X86TargetInfo.cpp - X86 Target Implementation ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+Target llvm::TheX86_32Target, llvm::TheX86_64Target;
+
+extern "C" void LLVMInitializeX86TargetInfo() {
+ RegisterTarget<Triple::x86, /*HasJIT=*/true>
+ X(TheX86_32Target, "x86", "32-bit X86: Pentium-Pro and above");
+
+ RegisterTarget<Triple::x86_64, /*HasJIT=*/true>
+ Y(TheX86_64Target, "x86-64", "64-bit X86: EM64T and AMD64");
+}
diff --git a/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp
new file mode 100644
index 0000000..619f7c8
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp
@@ -0,0 +1,464 @@
+//===-- X86ShuffleDecode.cpp - X86 shuffle decode logic -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Define several functions to decode x86 specific shuffle semantics into a
+// generic vector mask.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86ShuffleDecode.h"
+#include "llvm/CodeGen/MachineValueType.h"
+
+//===----------------------------------------------------------------------===//
+// Vector Mask Decoding
+//===----------------------------------------------------------------------===//
+
+namespace llvm {
+
+void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
+  // Default to copying the destination value.
+ ShuffleMask.push_back(0);
+ ShuffleMask.push_back(1);
+ ShuffleMask.push_back(2);
+ ShuffleMask.push_back(3);
+
+ // Decode the immediate.
+ unsigned ZMask = Imm & 15;
+ unsigned CountD = (Imm >> 4) & 3;
+ unsigned CountS = (Imm >> 6) & 3;
+
+ // CountS selects which input element to use.
+ unsigned InVal = 4 + CountS;
+ // CountD specifies which element of destination to update.
+ ShuffleMask[CountD] = InVal;
+ // ZMask zaps values, potentially overriding the CountD elt.
+ if (ZMask & 1) ShuffleMask[0] = SM_SentinelZero;
+ if (ZMask & 2) ShuffleMask[1] = SM_SentinelZero;
+ if (ZMask & 4) ShuffleMask[2] = SM_SentinelZero;
+ if (ZMask & 8) ShuffleMask[3] = SM_SentinelZero;
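+
+  // For example (a sketch of the decode above): Imm = 0x4C gives CountS = 1,
+  // CountD = 0 and ZMask = 0b1100, so destination element 0 is replaced by
+  // input element 1 (mask value 4 + 1 = 5) and elements 2 and 3 are zeroed,
+  // yielding <5, 1, Zero, Zero>.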
+}
+
+// <3,1> or <6,7,2,3>
+void DecodeMOVHLPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask) {
+ for (unsigned i = NElts / 2; i != NElts; ++i)
+ ShuffleMask.push_back(NElts + i);
+
+ for (unsigned i = NElts / 2; i != NElts; ++i)
+ ShuffleMask.push_back(i);
+}
+
+// <0,2> or <0,1,4,5>
+void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask) {
+ for (unsigned i = 0; i != NElts / 2; ++i)
+ ShuffleMask.push_back(i);
+
+ for (unsigned i = 0; i != NElts / 2; ++i)
+ ShuffleMask.push_back(NElts + i);
+}
+
+void DecodeMOVSLDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
+ unsigned NumElts = VT.getVectorNumElements();
+ for (int i = 0, e = NumElts / 2; i < e; ++i) {
+ ShuffleMask.push_back(2 * i);
+ ShuffleMask.push_back(2 * i);
+ }
+}
+
+void DecodeMOVSHDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
+ unsigned NumElts = VT.getVectorNumElements();
+ for (int i = 0, e = NumElts / 2; i < e; ++i) {
+ ShuffleMask.push_back(2 * i + 1);
+ ShuffleMask.push_back(2 * i + 1);
+ }
+}
+
+void DecodeMOVDDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
+ unsigned VectorSizeInBits = VT.getSizeInBits();
+ unsigned ScalarSizeInBits = VT.getScalarSizeInBits();
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned NumLanes = VectorSizeInBits / 128;
+ unsigned NumLaneElts = NumElts / NumLanes;
+ unsigned NumLaneSubElts = 64 / ScalarSizeInBits;
+
+ for (unsigned l = 0; l < NumElts; l += NumLaneElts)
+ for (unsigned i = 0; i < NumLaneElts; i += NumLaneSubElts)
+ for (unsigned s = 0; s != NumLaneSubElts; s++)
+ ShuffleMask.push_back(l + s);
+}
+
+void DecodePSLLDQMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
+ unsigned VectorSizeInBits = VT.getSizeInBits();
+ unsigned NumElts = VectorSizeInBits / 8;
+ unsigned NumLanes = VectorSizeInBits / 128;
+ unsigned NumLaneElts = NumElts / NumLanes;
+
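+  // For example, PSLLDQ on v16i8 with Imm = 4 shifts each 16-byte lane left
+  // by four bytes, so the decoded mask should be <Z,Z,Z,Z,0,1,...,11> with Z
+  // denoting SM_SentinelZero.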
+ for (unsigned l = 0; l < NumElts; l += NumLaneElts)
+ for (unsigned i = 0; i < NumLaneElts; ++i) {
+ int M = SM_SentinelZero;
+ if (i >= Imm) M = i - Imm + l;
+ ShuffleMask.push_back(M);
+ }
+}
+
+void DecodePSRLDQMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
+ unsigned VectorSizeInBits = VT.getSizeInBits();
+ unsigned NumElts = VectorSizeInBits / 8;
+ unsigned NumLanes = VectorSizeInBits / 128;
+ unsigned NumLaneElts = NumElts / NumLanes;
+
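+  // For example, PSRLDQ on v16i8 with Imm = 4 shifts each 16-byte lane right
+  // by four bytes, so the decoded mask should be <4,5,...,15,Z,Z,Z,Z>.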
+ for (unsigned l = 0; l < NumElts; l += NumLaneElts)
+ for (unsigned i = 0; i < NumLaneElts; ++i) {
+ unsigned Base = i + Imm;
+ int M = Base + l;
+ if (Base >= NumLaneElts) M = SM_SentinelZero;
+ ShuffleMask.push_back(M);
+ }
+}
+
+void DecodePALIGNRMask(MVT VT, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask) {
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned Offset = Imm * (VT.getVectorElementType().getSizeInBits() / 8);
+
+ unsigned NumLanes = VT.getSizeInBits() / 128;
+ unsigned NumLaneElts = NumElts / NumLanes;
+
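+  // For example, PALIGNR on v16i8 with Imm = 4 shifts the concatenation of
+  // the two sources right by four bytes, so the decoded mask should be
+  // <4,...,15,16,17,18,19>, where indices >= 16 select the second source.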
+ for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+ for (unsigned i = 0; i != NumLaneElts; ++i) {
+ unsigned Base = i + Offset;
+      // If i+Offset is out of this lane then we actually need the other
+      // source.
+ if (Base >= NumLaneElts) Base += NumElts - NumLaneElts;
+ ShuffleMask.push_back(Base + l);
+ }
+ }
+}
+
+/// DecodePSHUFMask - This decodes the shuffle masks for pshufw, pshufd, and
+/// vpermilp*. VT indicates the type of the vector allowing it to handle
+/// different datatypes and vector widths.
+void DecodePSHUFMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
+ unsigned NumElts = VT.getVectorNumElements();
+
+ unsigned NumLanes = VT.getSizeInBits() / 128;
+ if (NumLanes == 0) NumLanes = 1; // Handle MMX
+ unsigned NumLaneElts = NumElts / NumLanes;
+
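+  // For example, pshufd $0x1B on v4i32 reads the two-bit fields 3, 2, 1, 0
+  // from the low bits upward, decoding to the reversal mask <3,2,1,0>.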
+ unsigned NewImm = Imm;
+ for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+ for (unsigned i = 0; i != NumLaneElts; ++i) {
+ ShuffleMask.push_back(NewImm % NumLaneElts + l);
+ NewImm /= NumLaneElts;
+ }
+ if (NumLaneElts == 4) NewImm = Imm; // reload imm
+ }
+}
+
+void DecodePSHUFHWMask(MVT VT, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask) {
+ unsigned NumElts = VT.getVectorNumElements();
+
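+  // For example, pshufhw $0x1B on v8i16 leaves the low quad unchanged and
+  // reverses the high quad, decoding to <0,1,2,3,7,6,5,4>.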
+ for (unsigned l = 0; l != NumElts; l += 8) {
+ unsigned NewImm = Imm;
+ for (unsigned i = 0, e = 4; i != e; ++i) {
+ ShuffleMask.push_back(l + i);
+ }
+ for (unsigned i = 4, e = 8; i != e; ++i) {
+ ShuffleMask.push_back(l + 4 + (NewImm & 3));
+ NewImm >>= 2;
+ }
+ }
+}
+
+void DecodePSHUFLWMask(MVT VT, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask) {
+ unsigned NumElts = VT.getVectorNumElements();
+
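+  // For example, pshuflw $0x1B on v8i16 reverses the low quad and leaves the
+  // high quad unchanged, decoding to <3,2,1,0,4,5,6,7>.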
+ for (unsigned l = 0; l != NumElts; l += 8) {
+ unsigned NewImm = Imm;
+ for (unsigned i = 0, e = 4; i != e; ++i) {
+ ShuffleMask.push_back(l + (NewImm & 3));
+ NewImm >>= 2;
+ }
+ for (unsigned i = 4, e = 8; i != e; ++i) {
+ ShuffleMask.push_back(l + i);
+ }
+ }
+}
+
+void DecodePSWAPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned NumHalfElts = NumElts / 2;
+
+ for (unsigned l = 0; l != NumHalfElts; ++l)
+ ShuffleMask.push_back(l + NumHalfElts);
+ for (unsigned h = 0; h != NumHalfElts; ++h)
+ ShuffleMask.push_back(h);
+}
+
+/// DecodeSHUFPMask - This decodes the shuffle masks for shufp*. VT indicates
+/// the type of the vector allowing it to handle different datatypes and vector
+/// widths.
+void DecodeSHUFPMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
+ unsigned NumElts = VT.getVectorNumElements();
+
+ unsigned NumLanes = VT.getSizeInBits() / 128;
+ unsigned NumLaneElts = NumElts / NumLanes;
+
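+  // For example, shufps $0x1B on v4f32 should decode to <3,2,5,4>: elements
+  // 3 and 2 come from the first source, and elements 1 and 0 (indices 5 and
+  // 4) from the second.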
+ unsigned NewImm = Imm;
+ for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+    // Each half of a lane comes from a different source.
+ for (unsigned s = 0; s != NumElts * 2; s += NumElts) {
+ for (unsigned i = 0; i != NumLaneElts / 2; ++i) {
+ ShuffleMask.push_back(NewImm % NumLaneElts + s + l);
+ NewImm /= NumLaneElts;
+ }
+ }
+ if (NumLaneElts == 4) NewImm = Imm; // reload imm
+ }
+}
+
+/// DecodeUNPCKHMask - This decodes the shuffle masks for unpckhps/unpckhpd
+/// and punpckh*. VT indicates the type of the vector allowing it to handle
+/// different datatypes and vector widths.
+void DecodeUNPCKHMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
+ unsigned NumElts = VT.getVectorNumElements();
+
+ // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
+ // independently on 128-bit lanes.
+ unsigned NumLanes = VT.getSizeInBits() / 128;
+ if (NumLanes == 0) NumLanes = 1; // Handle MMX
+ unsigned NumLaneElts = NumElts / NumLanes;
+
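+  // For example, unpckhps on v4f32 interleaves the high halves of the two
+  // sources, decoding to <2,6,3,7>.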
+ for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+ for (unsigned i = l + NumLaneElts / 2, e = l + NumLaneElts; i != e; ++i) {
+ ShuffleMask.push_back(i); // Reads from dest/src1
+ ShuffleMask.push_back(i + NumElts); // Reads from src/src2
+ }
+ }
+}
+
+/// DecodeUNPCKLMask - This decodes the shuffle masks for unpcklps/unpcklpd
+/// and punpckl*. VT indicates the type of the vector allowing it to handle
+/// different datatypes and vector widths.
+void DecodeUNPCKLMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
+ unsigned NumElts = VT.getVectorNumElements();
+
+ // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
+ // independently on 128-bit lanes.
+ unsigned NumLanes = VT.getSizeInBits() / 128;
+  if (NumLanes == 0) NumLanes = 1;  // Handle MMX
+ unsigned NumLaneElts = NumElts / NumLanes;
+
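+  // For example, unpcklps on v4f32 interleaves the low halves of the two
+  // sources, decoding to <0,4,1,5>.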
+ for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+ for (unsigned i = l, e = l + NumLaneElts / 2; i != e; ++i) {
+ ShuffleMask.push_back(i); // Reads from dest/src1
+ ShuffleMask.push_back(i + NumElts); // Reads from src/src2
+ }
+ }
+}
+
+/// \brief Decode a shuffle of packed values at 128-bit granularity
+/// (SHUFF32x4/SHUFF64x2/SHUFI32x4/SHUFI64x2) immediate mask into a shuffle
+/// mask.
+void decodeVSHUF64x2FamilyMask(MVT VT, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask) {
+ unsigned NumLanes = VT.getSizeInBits() / 128;
+ unsigned NumElementsInLane = 128 / VT.getScalarSizeInBits();
+ unsigned ControlBitsMask = NumLanes - 1;
+ unsigned NumControlBits = NumLanes / 2;
+
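+
+  // For example, on a v16f32 shuffle (four 128-bit lanes) with Imm = 0x4E,
+  // the two-bit fields select lanes 2 and 3 of the first source and lanes 0
+  // and 1 of the second, so the mask should be <8..11,12..15,16..19,20..23>.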
+ for (unsigned l = 0; l != NumLanes; ++l) {
+ unsigned LaneMask = (Imm >> (l * NumControlBits)) & ControlBitsMask;
+ // We actually need the other source.
+ if (l >= NumLanes / 2)
+ LaneMask += NumLanes;
+ for (unsigned i = 0; i != NumElementsInLane; ++i)
+ ShuffleMask.push_back(LaneMask * NumElementsInLane + i);
+ }
+}
+
+void DecodeVPERM2X128Mask(MVT VT, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask) {
+ unsigned HalfSize = VT.getVectorNumElements() / 2;
+
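+  // For example, vperm2f128 $0x31 on v8f32 selects the upper half of each
+  // source, decoding to <4,5,6,7,12,13,14,15>; bit 3 or bit 7 of the
+  // immediate would zero the corresponding half instead.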
+ for (unsigned l = 0; l != 2; ++l) {
+ unsigned HalfMask = Imm >> (l * 4);
+ unsigned HalfBegin = (HalfMask & 0x3) * HalfSize;
+ for (unsigned i = HalfBegin, e = HalfBegin + HalfSize; i != e; ++i)
+ ShuffleMask.push_back(HalfMask & 8 ? SM_SentinelZero : (int)i);
+ }
+}
+
+void DecodePSHUFBMask(ArrayRef<uint64_t> RawMask,
+ SmallVectorImpl<int> &ShuffleMask) {
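+  // For example, a raw mask byte with bit 7 set (e.g. 0x80) zeroes its
+  // element, while 0x13 in the upper half of a 32-byte vector selects byte
+  // 16 + (0x13 & 0xf) = 19.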
+ for (int i = 0, e = RawMask.size(); i < e; ++i) {
+ uint64_t M = RawMask[i];
+ if (M == (uint64_t)SM_SentinelUndef) {
+ ShuffleMask.push_back(M);
+ continue;
+ }
+    // For 32-byte AVX vectors, the base of the shuffle is the 16-byte half
+    // of the vector we're inside.
+ int Base = i < 16 ? 0 : 16;
+ // If the high bit (7) of the byte is set, the element is zeroed.
+ if (M & (1 << 7))
+ ShuffleMask.push_back(SM_SentinelZero);
+ else {
+ // Only the least significant 4 bits of the byte are used.
+ int Index = Base + (M & 0xf);
+ ShuffleMask.push_back(Index);
+ }
+ }
+}
+
+void DecodeBLENDMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
+ int ElementBits = VT.getScalarSizeInBits();
+ int NumElements = VT.getVectorNumElements();
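+  // For example, blendps $0xA on v4f32 takes elements 1 and 3 from the
+  // second source, decoding to <0,5,2,7>.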
+ for (int i = 0; i < NumElements; ++i) {
+ // If there are more than 8 elements in the vector, then any immediate blend
+ // mask applies to each 128-bit lane. There can never be more than
+ // 8 elements in a 128-bit lane with an immediate blend.
+ int Bit = NumElements > 8 ? i % (128 / ElementBits) : i;
+ assert(Bit < 8 &&
+ "Immediate blends only operate over 8 elements at a time!");
+ ShuffleMask.push_back(((Imm >> Bit) & 1) ? NumElements + i : i);
+ }
+}
+
+/// DecodeVPERMMask - this decodes the shuffle masks for VPERMQ/VPERMPD.
+/// No VT provided since it only works on 256-bit, 4 element vectors.
+void DecodeVPERMMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
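+  // For example, vpermq $0x1B decodes to the reversal mask <3,2,1,0>.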
+ for (unsigned i = 0; i != 4; ++i) {
+ ShuffleMask.push_back((Imm >> (2 * i)) & 3);
+ }
+}
+
+void DecodeZeroExtendMask(MVT SrcVT, MVT DstVT, SmallVectorImpl<int> &Mask) {
+ unsigned NumDstElts = DstVT.getVectorNumElements();
+ unsigned SrcScalarBits = SrcVT.getScalarSizeInBits();
+ unsigned DstScalarBits = DstVT.getScalarSizeInBits();
+ unsigned Scale = DstScalarBits / SrcScalarBits;
+ assert(SrcScalarBits < DstScalarBits &&
+ "Expected zero extension mask to increase scalar size");
+ assert(SrcVT.getVectorNumElements() >= NumDstElts &&
+ "Too many zero extension lanes");
+
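+  // For example, pmovzxbw from v16i8 to v8i16 has Scale = 2 and should
+  // decode to <0,Z,1,Z,2,Z,3,Z,4,Z,5,Z,6,Z,7,Z>.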
+ for (unsigned i = 0; i != NumDstElts; i++) {
+ Mask.push_back(i);
+ for (unsigned j = 1; j != Scale; j++)
+ Mask.push_back(SM_SentinelZero);
+ }
+}
+
+void DecodeZeroMoveLowMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
+ unsigned NumElts = VT.getVectorNumElements();
+ ShuffleMask.push_back(0);
+ for (unsigned i = 1; i < NumElts; i++)
+ ShuffleMask.push_back(SM_SentinelZero);
+}
+
+void DecodeScalarMoveMask(MVT VT, bool IsLoad, SmallVectorImpl<int> &Mask) {
+ // First element comes from the first element of second source.
+ // Remaining elements: Load zero extends / Move copies from first source.
+ unsigned NumElts = VT.getVectorNumElements();
+ Mask.push_back(NumElts);
+ for (unsigned i = 1; i < NumElts; i++)
+ Mask.push_back(IsLoad ? static_cast<int>(SM_SentinelZero) : i);
+}
+
+void DecodeEXTRQIMask(int Len, int Idx,
+ SmallVectorImpl<int> &ShuffleMask) {
+ // Only the bottom 6 bits are valid for each immediate.
+ Len &= 0x3F;
+ Idx &= 0x3F;
+
+ // We can only decode this bit extraction instruction as a shuffle if both the
+ // length and index work with whole bytes.
+ if (0 != (Len % 8) || 0 != (Idx % 8))
+ return;
+
+ // A length of zero is equivalent to a bit length of 64.
+ if (Len == 0)
+ Len = 64;
+
+ // If the length + index exceeds the bottom 64 bits the result is undefined.
+ if ((Len + Idx) > 64) {
+ ShuffleMask.append(16, SM_SentinelUndef);
+ return;
+ }
+
+  // Convert the length and index to work with bytes.
+ Len /= 8;
+ Idx /= 8;
+
+  // EXTRQ: Extract Len bytes starting from Idx. Zero pad the remaining bytes
+  // of the lower 64 bits. The upper 64 bits are undefined.
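+  //
+  // For example, Len = 16 and Idx = 8 extract two bytes starting at byte 1,
+  // so the mask should be <1,2,Z,Z,Z,Z,Z,Z,U,U,U,U,U,U,U,U>.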
+ for (int i = 0; i != Len; ++i)
+ ShuffleMask.push_back(i + Idx);
+ for (int i = Len; i != 8; ++i)
+ ShuffleMask.push_back(SM_SentinelZero);
+ for (int i = 8; i != 16; ++i)
+ ShuffleMask.push_back(SM_SentinelUndef);
+}
+
+void DecodeINSERTQIMask(int Len, int Idx,
+ SmallVectorImpl<int> &ShuffleMask) {
+ // Only the bottom 6 bits are valid for each immediate.
+ Len &= 0x3F;
+ Idx &= 0x3F;
+
+ // We can only decode this bit insertion instruction as a shuffle if both the
+ // length and index work with whole bytes.
+ if (0 != (Len % 8) || 0 != (Idx % 8))
+ return;
+
+ // A length of zero is equivalent to a bit length of 64.
+ if (Len == 0)
+ Len = 64;
+
+ // If the length + index exceeds the bottom 64 bits the result is undefined.
+ if ((Len + Idx) > 64) {
+ ShuffleMask.append(16, SM_SentinelUndef);
+ return;
+ }
+
+  // Convert the length and index to work with bytes.
+ Len /= 8;
+ Idx /= 8;
+
+  // INSERTQ: Extract the lowest Len bytes from the lower half of the second
+  // source and insert them over the first source starting at byte Idx. The
+  // upper 64 bits are undefined.
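+  //
+  // For example, Len = 16 and Idx = 8 insert two bytes of the second source
+  // at byte 1, so the mask should be <0,16,17,3,4,5,6,7,U,U,U,U,U,U,U,U>.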
+ for (int i = 0; i != Idx; ++i)
+ ShuffleMask.push_back(i);
+ for (int i = 0; i != Len; ++i)
+ ShuffleMask.push_back(i + 16);
+ for (int i = Idx + Len; i != 8; ++i)
+ ShuffleMask.push_back(i);
+ for (int i = 8; i != 16; ++i)
+ ShuffleMask.push_back(SM_SentinelUndef);
+}
+
+void DecodeVPERMVMask(ArrayRef<uint64_t> RawMask,
+ SmallVectorImpl<int> &ShuffleMask) {
+ for (int i = 0, e = RawMask.size(); i < e; ++i) {
+ uint64_t M = RawMask[i];
+ ShuffleMask.push_back((int)M);
+ }
+}
+
+void DecodeVPERMV3Mask(ArrayRef<uint64_t> RawMask,
+ SmallVectorImpl<int> &ShuffleMask) {
+ for (int i = 0, e = RawMask.size(); i < e; ++i) {
+ uint64_t M = RawMask[i];
+ ShuffleMask.push_back((int)M);
+ }
+}
+
+} // llvm namespace
diff --git a/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h
new file mode 100644
index 0000000..72db6a8
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h
@@ -0,0 +1,122 @@
+//===-- X86ShuffleDecode.h - X86 shuffle decode logic -----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Define several functions to decode x86 specific shuffle semantics into a
+// generic vector mask.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_UTILS_X86SHUFFLEDECODE_H
+#define LLVM_LIB_TARGET_X86_UTILS_X86SHUFFLEDECODE_H
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/ArrayRef.h"
+
+//===----------------------------------------------------------------------===//
+// Vector Mask Decoding
+//===----------------------------------------------------------------------===//
+
+namespace llvm {
+class MVT;
+
+enum { SM_SentinelUndef = -1, SM_SentinelZero = -2 };
+
+void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+
+// <3,1> or <6,7,2,3>
+void DecodeMOVHLPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask);
+
+// <0,2> or <0,1,4,5>
+void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask);
+
+void DecodeMOVSLDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
+
+void DecodeMOVSHDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
+
+void DecodeMOVDDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
+
+void DecodePSLLDQMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+
+void DecodePSRLDQMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+
+void DecodePALIGNRMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+
+void DecodePSHUFMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+
+void DecodePSHUFHWMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+
+void DecodePSHUFLWMask(MVT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+
+/// \brief Decodes a PSWAPD 3DNow! instruction.
+void DecodePSWAPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
+
+/// DecodeSHUFPMask - This decodes the shuffle masks for shufp*. VT indicates
+/// the type of the vector allowing it to handle different datatypes and vector
+/// widths.
+void DecodeSHUFPMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+
+/// DecodeUNPCKHMask - This decodes the shuffle masks for unpckhps/unpckhpd
+/// and punpckh*. VT indicates the type of the vector allowing it to handle
+/// different datatypes and vector widths.
+void DecodeUNPCKHMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
+
+/// DecodeUNPCKLMask - This decodes the shuffle masks for unpcklps/unpcklpd
+/// and punpckl*. VT indicates the type of the vector allowing it to handle
+/// different datatypes and vector widths.
+void DecodeUNPCKLMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
+
+/// \brief Decode a PSHUFB mask from a raw array of constants such as from
+/// BUILD_VECTOR.
+void DecodePSHUFBMask(ArrayRef<uint64_t> RawMask,
+ SmallVectorImpl<int> &ShuffleMask);
+
+/// \brief Decode a BLEND immediate mask into a shuffle mask.
+void DecodeBLENDMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+
+void DecodeVPERM2X128Mask(MVT VT, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask);
+
+/// \brief Decode a shuffle of packed values at 128-bit granularity
+/// immediate mask into a shuffle mask.
+void decodeVSHUF64x2FamilyMask(MVT VT, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask);
+
+/// DecodeVPERMMask - this decodes the shuffle masks for VPERMQ/VPERMPD.
+/// No VT provided since it only works on 256-bit, 4 element vectors.
+void DecodeVPERMMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+
+/// \brief Decode a zero extension instruction as a shuffle mask.
+void DecodeZeroExtendMask(MVT SrcVT, MVT DstVT,
+ SmallVectorImpl<int> &ShuffleMask);
+
+/// \brief Decode a move lower and zero upper instruction as a shuffle mask.
+void DecodeZeroMoveLowMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
+
+/// \brief Decode a scalar float move instruction as a shuffle mask.
+void DecodeScalarMoveMask(MVT VT, bool IsLoad,
+ SmallVectorImpl<int> &ShuffleMask);
+
+/// \brief Decode a SSE4A EXTRQ instruction as a v16i8 shuffle mask.
+void DecodeEXTRQIMask(int Len, int Idx,
+ SmallVectorImpl<int> &ShuffleMask);
+
+/// \brief Decode a SSE4A INSERTQ instruction as a v16i8 shuffle mask.
+void DecodeINSERTQIMask(int Len, int Idx,
+ SmallVectorImpl<int> &ShuffleMask);
+
+/// \brief Decode a VPERM W/D/Q/PS/PD mask from a raw array of constants.
+void DecodeVPERMVMask(ArrayRef<uint64_t> RawMask,
+ SmallVectorImpl<int> &ShuffleMask);
+
+/// \brief Decode a VPERMT2 W/D/Q/PS/PD mask from a raw array of constants.
+void DecodeVPERMV3Mask(ArrayRef<uint64_t> RawMask,
+ SmallVectorImpl<int> &ShuffleMask);
+} // llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86.h b/contrib/llvm/lib/Target/X86/X86.h
new file mode 100644
index 0000000..fbec662
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86.h
@@ -0,0 +1,76 @@
+//===-- X86.h - Top-level interface for X86 representation ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in the x86
+// target library, as used by the LLVM JIT.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86_H
+#define LLVM_LIB_TARGET_X86_X86_H
+
+#include "llvm/Support/CodeGen.h"
+
+namespace llvm {
+
+class FunctionPass;
+class ImmutablePass;
+class X86TargetMachine;
+
+/// This pass converts a legalized DAG into a X86-specific DAG, ready for
+/// instruction scheduling.
+FunctionPass *createX86ISelDag(X86TargetMachine &TM,
+ CodeGenOpt::Level OptLevel);
+
+/// This pass initializes a global base register for PIC on x86-32.
+FunctionPass *createX86GlobalBaseRegPass();
+
+/// This pass combines multiple accesses to local-dynamic TLS variables so that
+/// the TLS base address for the module is only fetched once per execution path
+/// through the function.
+FunctionPass *createCleanupLocalDynamicTLSPass();
+
+/// This function returns a pass which converts floating-point register
+/// references and pseudo instructions into floating-point stack references and
+/// physical instructions.
+FunctionPass *createX86FloatingPointStackifierPass();
+
+/// This pass inserts AVX vzeroupper instructions before each call to avoid
+/// transition penalty between functions encoded with AVX and SSE.
+FunctionPass *createX86IssueVZeroUpperPass();
+
+/// Return a pass that pads short functions with NOOPs.
+/// This will prevent a stall when returning on the Atom.
+FunctionPass *createX86PadShortFunctions();
+
+/// Return a pass that selectively replaces certain instructions (like add,
+/// sub, inc, dec, some shifts, and some multiplies) by equivalent LEA
+/// instructions, in order to eliminate execution delays in some processors.
+FunctionPass *createX86FixupLEAs();
+
+/// Return a pass that removes redundant address recalculations.
+FunctionPass *createX86OptimizeLEAs();
+
+/// Return a pass that optimizes the code-size of x86 call sequences. This is
+/// done by replacing esp-relative movs with pushes.
+FunctionPass *createX86CallFrameOptimization();
+
+/// Return an IR pass that inserts EH registration stack objects and explicit
+/// EH state updates. This pass must run after EH preparation, which does
+/// Windows-specific but architecture-neutral preparation.
+FunctionPass *createX86WinEHStatePass();
+
+/// Return a Machine IR pass that expands X86-specific pseudo
+/// instructions into a sequence of actual instructions. This pass
+/// must run after prologue/epilogue insertion and before lowering
+/// the MachineInstr to MC.
+FunctionPass *createX86ExpandPseudoPass();
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86.td b/contrib/llvm/lib/Target/X86/X86.td
new file mode 100644
index 0000000..8902a85
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86.td
@@ -0,0 +1,787 @@
+//===-- X86.td - Target definition file for the Intel X86 --*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a target description file for the Intel i386 architecture, referred
+// to here as the "X86" architecture.
+//
+//===----------------------------------------------------------------------===//
+
+// Get the target-independent interfaces which we are implementing...
+//
+include "llvm/Target/Target.td"
+
+//===----------------------------------------------------------------------===//
+// X86 Subtarget state
+//
+
+def Mode64Bit : SubtargetFeature<"64bit-mode", "In64BitMode", "true",
+ "64-bit mode (x86_64)">;
+def Mode32Bit : SubtargetFeature<"32bit-mode", "In32BitMode", "true",
+ "32-bit mode (80386)">;
+def Mode16Bit : SubtargetFeature<"16bit-mode", "In16BitMode", "true",
+ "16-bit mode (i8086)">;
+
+//===----------------------------------------------------------------------===//
+// X86 Subtarget features
+//===----------------------------------------------------------------------===//
+
+def FeatureCMOV : SubtargetFeature<"cmov","HasCMov", "true",
+ "Enable conditional move instructions">;
+
+def FeaturePOPCNT : SubtargetFeature<"popcnt", "HasPOPCNT", "true",
+ "Support POPCNT instruction">;
+
+def FeatureFXSR : SubtargetFeature<"fxsr", "HasFXSR", "true",
+ "Support fxsave/fxrestore instructions">;
+
+def FeatureXSAVE : SubtargetFeature<"xsave", "HasXSAVE", "true",
+ "Support xsave instructions">;
+
+def FeatureXSAVEOPT: SubtargetFeature<"xsaveopt", "HasXSAVEOPT", "true",
+ "Support xsaveopt instructions">;
+
+def FeatureXSAVEC : SubtargetFeature<"xsavec", "HasXSAVEC", "true",
+ "Support xsavec instructions">;
+
+def FeatureXSAVES : SubtargetFeature<"xsaves", "HasXSAVES", "true",
+ "Support xsaves instructions">;
+
+def FeatureSSE1 : SubtargetFeature<"sse", "X86SSELevel", "SSE1",
+ "Enable SSE instructions",
+ // SSE codegen depends on cmovs, and all
+ // SSE1+ processors support them.
+ [FeatureCMOV]>;
+def FeatureSSE2 : SubtargetFeature<"sse2", "X86SSELevel", "SSE2",
+ "Enable SSE2 instructions",
+ [FeatureSSE1]>;
+def FeatureSSE3 : SubtargetFeature<"sse3", "X86SSELevel", "SSE3",
+ "Enable SSE3 instructions",
+ [FeatureSSE2]>;
+def FeatureSSSE3 : SubtargetFeature<"ssse3", "X86SSELevel", "SSSE3",
+ "Enable SSSE3 instructions",
+ [FeatureSSE3]>;
+def FeatureSSE41 : SubtargetFeature<"sse4.1", "X86SSELevel", "SSE41",
+ "Enable SSE 4.1 instructions",
+ [FeatureSSSE3]>;
+def FeatureSSE42 : SubtargetFeature<"sse4.2", "X86SSELevel", "SSE42",
+ "Enable SSE 4.2 instructions",
+ [FeatureSSE41]>;
+// The MMX subtarget feature is separate from the rest of the SSE features
+// because it's important (for odd compatibility reasons) to be able to
+// turn it off explicitly while allowing SSE+ to be on.
+def FeatureMMX : SubtargetFeature<"mmx","X863DNowLevel", "MMX",
+ "Enable MMX instructions">;
+def Feature3DNow : SubtargetFeature<"3dnow", "X863DNowLevel", "ThreeDNow",
+ "Enable 3DNow! instructions",
+ [FeatureMMX]>;
+def Feature3DNowA : SubtargetFeature<"3dnowa", "X863DNowLevel", "ThreeDNowA",
+ "Enable 3DNow! Athlon instructions",
+ [Feature3DNow]>;
+// All x86-64 hardware has SSE2, but we don't mark SSE2 as an implied
+// feature, because SSE2 can be disabled (e.g. for compiling OS kernels)
+// without disabling 64-bit mode.
+def Feature64Bit : SubtargetFeature<"64bit", "HasX86_64", "true",
+ "Support 64-bit instructions",
+ [FeatureCMOV]>;
+def FeatureCMPXCHG16B : SubtargetFeature<"cx16", "HasCmpxchg16b", "true",
+ "64-bit with cmpxchg16b",
+ [Feature64Bit]>;
+def FeatureSlowBTMem : SubtargetFeature<"slow-bt-mem", "IsBTMemSlow", "true",
+ "Bit testing of memory is slow">;
+def FeatureSlowSHLD : SubtargetFeature<"slow-shld", "IsSHLDSlow", "true",
+ "SHLD instruction is slow">;
+// FIXME: This should not apply to CPUs that do not have SSE.
+def FeatureSlowUAMem16 : SubtargetFeature<"slow-unaligned-mem-16",
+ "IsUAMem16Slow", "true",
+ "Slow unaligned 16-byte memory access">;
+def FeatureSlowUAMem32 : SubtargetFeature<"slow-unaligned-mem-32",
+ "IsUAMem32Slow", "true",
+ "Slow unaligned 32-byte memory access">;
+def FeatureSSE4A : SubtargetFeature<"sse4a", "HasSSE4A", "true",
+ "Support SSE 4a instructions",
+ [FeatureSSE3]>;
+
+def FeatureAVX : SubtargetFeature<"avx", "X86SSELevel", "AVX",
+ "Enable AVX instructions",
+ [FeatureSSE42]>;
+def FeatureAVX2 : SubtargetFeature<"avx2", "X86SSELevel", "AVX2",
+ "Enable AVX2 instructions",
+ [FeatureAVX]>;
+def FeatureAVX512 : SubtargetFeature<"avx512f", "X86SSELevel", "AVX512F",
+ "Enable AVX-512 instructions",
+ [FeatureAVX2]>;
+def FeatureERI : SubtargetFeature<"avx512er", "HasERI", "true",
+ "Enable AVX-512 Exponential and Reciprocal Instructions",
+ [FeatureAVX512]>;
+def FeatureCDI : SubtargetFeature<"avx512cd", "HasCDI", "true",
+ "Enable AVX-512 Conflict Detection Instructions",
+ [FeatureAVX512]>;
+def FeaturePFI : SubtargetFeature<"avx512pf", "HasPFI", "true",
+ "Enable AVX-512 PreFetch Instructions",
+ [FeatureAVX512]>;
+def FeatureDQI : SubtargetFeature<"avx512dq", "HasDQI", "true",
+ "Enable AVX-512 Doubleword and Quadword Instructions",
+ [FeatureAVX512]>;
+def FeatureBWI : SubtargetFeature<"avx512bw", "HasBWI", "true",
+ "Enable AVX-512 Byte and Word Instructions",
+ [FeatureAVX512]>;
+def FeatureVLX : SubtargetFeature<"avx512vl", "HasVLX", "true",
+ "Enable AVX-512 Vector Length eXtensions",
+ [FeatureAVX512]>;
+def FeaturePKU : SubtargetFeature<"pku", "HasPKU", "true",
+ "Enable protection keys">;
+def FeaturePCLMUL : SubtargetFeature<"pclmul", "HasPCLMUL", "true",
+ "Enable packed carry-less multiplication instructions",
+ [FeatureSSE2]>;
+def FeatureFMA : SubtargetFeature<"fma", "HasFMA", "true",
+                                      "Enable three-operand fused multiply-add",
+                                      [FeatureAVX]>;
+def FeatureFMA4   : SubtargetFeature<"fma4", "HasFMA4", "true",
+                                      "Enable four-operand fused multiply-add",
+ [FeatureAVX, FeatureSSE4A]>;
+def FeatureXOP : SubtargetFeature<"xop", "HasXOP", "true",
+ "Enable XOP instructions",
+ [FeatureFMA4]>;
+def FeatureSSEUnalignedMem : SubtargetFeature<"sse-unaligned-mem",
+ "HasSSEUnalignedMem", "true",
+ "Allow unaligned memory operands with SSE instructions">;
+def FeatureAES : SubtargetFeature<"aes", "HasAES", "true",
+ "Enable AES instructions",
+ [FeatureSSE2]>;
+def FeatureTBM : SubtargetFeature<"tbm", "HasTBM", "true",
+ "Enable TBM instructions">;
+def FeatureMOVBE : SubtargetFeature<"movbe", "HasMOVBE", "true",
+ "Support MOVBE instruction">;
+def FeatureRDRAND : SubtargetFeature<"rdrnd", "HasRDRAND", "true",
+ "Support RDRAND instruction">;
+def FeatureF16C : SubtargetFeature<"f16c", "HasF16C", "true",
+ "Support 16-bit floating point conversion instructions",
+ [FeatureAVX]>;
+def FeatureFSGSBase : SubtargetFeature<"fsgsbase", "HasFSGSBase", "true",
+ "Support FS/GS Base instructions">;
+def FeatureLZCNT : SubtargetFeature<"lzcnt", "HasLZCNT", "true",
+ "Support LZCNT instruction">;
+def FeatureBMI : SubtargetFeature<"bmi", "HasBMI", "true",
+ "Support BMI instructions">;
+def FeatureBMI2 : SubtargetFeature<"bmi2", "HasBMI2", "true",
+ "Support BMI2 instructions">;
+def FeatureRTM : SubtargetFeature<"rtm", "HasRTM", "true",
+ "Support RTM instructions">;
+def FeatureHLE : SubtargetFeature<"hle", "HasHLE", "true",
+ "Support HLE">;
+def FeatureADX : SubtargetFeature<"adx", "HasADX", "true",
+ "Support ADX instructions">;
+def FeatureSHA : SubtargetFeature<"sha", "HasSHA", "true",
+ "Enable SHA instructions",
+ [FeatureSSE2]>;
+def FeaturePRFCHW : SubtargetFeature<"prfchw", "HasPRFCHW", "true",
+ "Support PRFCHW instructions">;
+def FeatureRDSEED : SubtargetFeature<"rdseed", "HasRDSEED", "true",
+ "Support RDSEED instruction">;
+def FeatureLAHFSAHF : SubtargetFeature<"sahf", "HasLAHFSAHF", "true",
+ "Support LAHF and SAHF instructions">;
+def FeatureMPX : SubtargetFeature<"mpx", "HasMPX", "true",
+ "Support MPX instructions">;
+def FeatureLEAForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true",
+ "Use LEA for adjusting the stack pointer">;
+def FeatureSlowDivide32 : SubtargetFeature<"idivl-to-divb",
+ "HasSlowDivide32", "true",
+ "Use 8-bit divide for positive values less than 256">;
+def FeatureSlowDivide64 : SubtargetFeature<"idivq-to-divw",
+ "HasSlowDivide64", "true",
+ "Use 16-bit divide for positive values less than 65536">;
+def FeaturePadShortFunctions : SubtargetFeature<"pad-short-functions",
+ "PadShortFunctions", "true",
+ "Pad short functions">;
+// TODO: This feature ought to be renamed.
+// What it really refers to are CPUs for which certain instructions
+// (which ones besides the example below?) are microcoded.
+// The best examples of this are the memory forms of CALL and PUSH
+// instructions, which should be avoided in favor of a MOV + register CALL/PUSH.
+def FeatureCallRegIndirect : SubtargetFeature<"call-reg-indirect",
+ "CallRegIndirect", "true",
+ "Call register indirect">;
+def FeatureLEAUsesAG : SubtargetFeature<"lea-uses-ag", "LEAUsesAG", "true",
+ "LEA instruction needs inputs at AG stage">;
+def FeatureSlowLEA : SubtargetFeature<"slow-lea", "SlowLEA", "true",
+ "LEA instruction with certain arguments is slow">;
+def FeatureSlowIncDec : SubtargetFeature<"slow-incdec", "SlowIncDec", "true",
+ "INC and DEC instructions are slower than ADD and SUB">;
+def FeatureSoftFloat
+ : SubtargetFeature<"soft-float", "UseSoftFloat", "true",
+ "Use software floating point features.">;
+
+//===----------------------------------------------------------------------===//
+// X86 processors supported.
+//===----------------------------------------------------------------------===//
+
+include "X86Schedule.td"
+
+def ProcIntelAtom : SubtargetFeature<"atom", "X86ProcFamily", "IntelAtom",
+ "Intel Atom processors">;
+def ProcIntelSLM : SubtargetFeature<"slm", "X86ProcFamily", "IntelSLM",
+ "Intel Silvermont processors">;
+
+class Proc<string Name, list<SubtargetFeature> Features>
+ : ProcessorModel<Name, GenericModel, Features>;
+
+def : Proc<"generic", [FeatureSlowUAMem16]>;
+def : Proc<"i386", [FeatureSlowUAMem16]>;
+def : Proc<"i486", [FeatureSlowUAMem16]>;
+def : Proc<"i586", [FeatureSlowUAMem16]>;
+def : Proc<"pentium", [FeatureSlowUAMem16]>;
+def : Proc<"pentium-mmx", [FeatureSlowUAMem16, FeatureMMX]>;
+def : Proc<"i686", [FeatureSlowUAMem16]>;
+def : Proc<"pentiumpro", [FeatureSlowUAMem16, FeatureCMOV]>;
+def : Proc<"pentium2", [FeatureSlowUAMem16, FeatureMMX, FeatureCMOV,
+ FeatureFXSR]>;
+def : Proc<"pentium3", [FeatureSlowUAMem16, FeatureMMX, FeatureSSE1,
+ FeatureFXSR]>;
+def : Proc<"pentium3m", [FeatureSlowUAMem16, FeatureMMX, FeatureSSE1,
+ FeatureFXSR, FeatureSlowBTMem]>;
+def : Proc<"pentium-m", [FeatureSlowUAMem16, FeatureMMX, FeatureSSE2,
+ FeatureFXSR, FeatureSlowBTMem]>;
+def : Proc<"pentium4", [FeatureSlowUAMem16, FeatureMMX, FeatureSSE2,
+ FeatureFXSR]>;
+def : Proc<"pentium4m", [FeatureSlowUAMem16, FeatureMMX, FeatureSSE2,
+ FeatureFXSR, FeatureSlowBTMem]>;
+
+// Intel Core Duo.
+def : ProcessorModel<"yonah", SandyBridgeModel,
+ [FeatureSlowUAMem16, FeatureMMX, FeatureSSE3, FeatureFXSR,
+ FeatureSlowBTMem]>;
+
+// NetBurst.
+def : Proc<"prescott",
+ [FeatureSlowUAMem16, FeatureMMX, FeatureSSE3, FeatureFXSR,
+ FeatureSlowBTMem]>;
+def : Proc<"nocona", [
+ FeatureSlowUAMem16,
+ FeatureMMX,
+ FeatureSSE3,
+ FeatureFXSR,
+ FeatureCMPXCHG16B,
+ FeatureSlowBTMem
+]>;
+
+// Intel Core 2 Solo/Duo.
+def : ProcessorModel<"core2", SandyBridgeModel, [
+ FeatureSlowUAMem16,
+ FeatureMMX,
+ FeatureSSSE3,
+ FeatureFXSR,
+ FeatureCMPXCHG16B,
+ FeatureSlowBTMem,
+ FeatureLAHFSAHF
+]>;
+def : ProcessorModel<"penryn", SandyBridgeModel, [
+ FeatureSlowUAMem16,
+ FeatureMMX,
+ FeatureSSE41,
+ FeatureFXSR,
+ FeatureCMPXCHG16B,
+ FeatureSlowBTMem,
+ FeatureLAHFSAHF
+]>;
+
+// Atom CPUs.
+class BonnellProc<string Name> : ProcessorModel<Name, AtomModel, [
+ ProcIntelAtom,
+ FeatureSlowUAMem16,
+ FeatureMMX,
+ FeatureSSSE3,
+ FeatureFXSR,
+ FeatureCMPXCHG16B,
+ FeatureMOVBE,
+ FeatureSlowBTMem,
+ FeatureLEAForSP,
+ FeatureSlowDivide32,
+ FeatureSlowDivide64,
+ FeatureCallRegIndirect,
+ FeatureLEAUsesAG,
+ FeaturePadShortFunctions,
+ FeatureLAHFSAHF
+]>;
+def : BonnellProc<"bonnell">;
+def : BonnellProc<"atom">; // Pin the generic name to the baseline.
+
+class SilvermontProc<string Name> : ProcessorModel<Name, SLMModel, [
+ ProcIntelSLM,
+ FeatureMMX,
+ FeatureSSE42,
+ FeatureFXSR,
+ FeatureCMPXCHG16B,
+ FeatureMOVBE,
+ FeaturePOPCNT,
+ FeaturePCLMUL,
+ FeatureAES,
+ FeatureSlowDivide64,
+ FeatureCallRegIndirect,
+ FeaturePRFCHW,
+ FeatureSlowLEA,
+ FeatureSlowIncDec,
+ FeatureSlowBTMem,
+ FeatureLAHFSAHF
+]>;
+def : SilvermontProc<"silvermont">;
+def : SilvermontProc<"slm">; // Legacy alias.
+
+// "Arrandale" along with corei3 and corei5
+class NehalemProc<string Name> : ProcessorModel<Name, SandyBridgeModel, [
+ FeatureMMX,
+ FeatureSSE42,
+ FeatureFXSR,
+ FeatureCMPXCHG16B,
+ FeatureSlowBTMem,
+ FeaturePOPCNT,
+ FeatureLAHFSAHF
+]>;
+def : NehalemProc<"nehalem">;
+def : NehalemProc<"corei7">;
+
+// Westmere is similar to Nehalem with some additional features; it is the
+// corei3/i5/i7 step on the path from Nehalem to Sandy Bridge.
+class WestmereProc<string Name> : ProcessorModel<Name, SandyBridgeModel, [
+ FeatureMMX,
+ FeatureSSE42,
+ FeatureFXSR,
+ FeatureCMPXCHG16B,
+ FeatureSlowBTMem,
+ FeaturePOPCNT,
+ FeatureAES,
+ FeaturePCLMUL,
+ FeatureLAHFSAHF
+]>;
+def : WestmereProc<"westmere">;
+
+// SSE is not listed here since LLVM treats AVX as a reimplementation of SSE,
+// rather than a superset.
+class SandyBridgeProc<string Name> : ProcessorModel<Name, SandyBridgeModel, [
+ FeatureMMX,
+ FeatureAVX,
+ FeatureFXSR,
+ FeatureCMPXCHG16B,
+ FeatureSlowBTMem,
+ FeatureSlowUAMem32,
+ FeaturePOPCNT,
+ FeatureAES,
+ FeaturePCLMUL,
+ FeatureXSAVE,
+ FeatureXSAVEOPT,
+ FeatureLAHFSAHF
+]>;
+def : SandyBridgeProc<"sandybridge">;
+def : SandyBridgeProc<"corei7-avx">; // Legacy alias.
+
+class IvyBridgeProc<string Name> : ProcessorModel<Name, SandyBridgeModel, [
+ FeatureMMX,
+ FeatureAVX,
+ FeatureFXSR,
+ FeatureCMPXCHG16B,
+ FeatureSlowBTMem,
+ FeatureSlowUAMem32,
+ FeaturePOPCNT,
+ FeatureAES,
+ FeaturePCLMUL,
+ FeatureXSAVE,
+ FeatureXSAVEOPT,
+ FeatureRDRAND,
+ FeatureF16C,
+ FeatureFSGSBase,
+ FeatureLAHFSAHF
+]>;
+def : IvyBridgeProc<"ivybridge">;
+def : IvyBridgeProc<"core-avx-i">; // Legacy alias.
+
+class HaswellProc<string Name> : ProcessorModel<Name, HaswellModel, [
+ FeatureMMX,
+ FeatureAVX2,
+ FeatureFXSR,
+ FeatureCMPXCHG16B,
+ FeatureSlowBTMem,
+ FeaturePOPCNT,
+ FeatureAES,
+ FeaturePCLMUL,
+ FeatureRDRAND,
+ FeatureXSAVE,
+ FeatureXSAVEOPT,
+ FeatureF16C,
+ FeatureFSGSBase,
+ FeatureMOVBE,
+ FeatureLZCNT,
+ FeatureBMI,
+ FeatureBMI2,
+ FeatureFMA,
+ FeatureRTM,
+ FeatureHLE,
+ FeatureSlowIncDec,
+ FeatureLAHFSAHF
+]>;
+def : HaswellProc<"haswell">;
+def : HaswellProc<"core-avx2">; // Legacy alias.
+
+class BroadwellProc<string Name> : ProcessorModel<Name, HaswellModel, [
+ FeatureMMX,
+ FeatureAVX2,
+ FeatureFXSR,
+ FeatureCMPXCHG16B,
+ FeatureSlowBTMem,
+ FeaturePOPCNT,
+ FeatureAES,
+ FeaturePCLMUL,
+ FeatureXSAVE,
+ FeatureXSAVEOPT,
+ FeatureRDRAND,
+ FeatureF16C,
+ FeatureFSGSBase,
+ FeatureMOVBE,
+ FeatureLZCNT,
+ FeatureBMI,
+ FeatureBMI2,
+ FeatureFMA,
+ FeatureRTM,
+ FeatureHLE,
+ FeatureADX,
+ FeatureRDSEED,
+ FeatureSlowIncDec,
+ FeatureLAHFSAHF
+]>;
+def : BroadwellProc<"broadwell">;
+
+// FIXME: define KNL model
+class KnightsLandingProc<string Name> : ProcessorModel<Name, HaswellModel, [
+ FeatureMMX,
+ FeatureAVX512,
+ FeatureFXSR,
+ FeatureERI,
+ FeatureCDI,
+ FeaturePFI,
+ FeatureCMPXCHG16B,
+ FeaturePOPCNT,
+ FeatureAES,
+ FeaturePCLMUL,
+ FeatureXSAVE,
+ FeatureXSAVEOPT,
+ FeatureRDRAND,
+ FeatureF16C,
+ FeatureFSGSBase,
+ FeatureMOVBE,
+ FeatureLZCNT,
+ FeatureBMI,
+ FeatureBMI2,
+ FeatureFMA,
+ FeatureRTM,
+ FeatureHLE,
+ FeatureSlowIncDec,
+ FeatureMPX,
+ FeatureLAHFSAHF
+]>;
+def : KnightsLandingProc<"knl">;
+
+// FIXME: define SKX model
+class SkylakeProc<string Name> : ProcessorModel<Name, HaswellModel, [
+ FeatureMMX,
+ FeatureAVX512,
+ FeatureFXSR,
+ FeatureCDI,
+ FeatureDQI,
+ FeatureBWI,
+ FeatureVLX,
+ FeaturePKU,
+ FeatureCMPXCHG16B,
+ FeatureSlowBTMem,
+ FeaturePOPCNT,
+ FeatureAES,
+ FeaturePCLMUL,
+ FeatureXSAVE,
+ FeatureXSAVEOPT,
+ FeatureRDRAND,
+ FeatureF16C,
+ FeatureFSGSBase,
+ FeatureMOVBE,
+ FeatureLZCNT,
+ FeatureBMI,
+ FeatureBMI2,
+ FeatureFMA,
+ FeatureRTM,
+ FeatureHLE,
+ FeatureADX,
+ FeatureRDSEED,
+ FeatureSlowIncDec,
+ FeatureMPX,
+ FeatureXSAVEC,
+ FeatureXSAVES,
+ FeatureLAHFSAHF
+]>;
+def : SkylakeProc<"skylake">;
+def : SkylakeProc<"skx">; // Legacy alias.
+
+
+// AMD CPUs.
+
+def : Proc<"k6", [FeatureSlowUAMem16, FeatureMMX]>;
+def : Proc<"k6-2", [FeatureSlowUAMem16, Feature3DNow]>;
+def : Proc<"k6-3", [FeatureSlowUAMem16, Feature3DNow]>;
+def : Proc<"athlon", [FeatureSlowUAMem16, Feature3DNowA,
+ FeatureSlowBTMem, FeatureSlowSHLD]>;
+def : Proc<"athlon-tbird", [FeatureSlowUAMem16, Feature3DNowA,
+ FeatureSlowBTMem, FeatureSlowSHLD]>;
+def : Proc<"athlon-4", [FeatureSlowUAMem16, FeatureSSE1, Feature3DNowA,
+ FeatureFXSR, FeatureSlowBTMem, FeatureSlowSHLD]>;
+def : Proc<"athlon-xp", [FeatureSlowUAMem16, FeatureSSE1, Feature3DNowA,
+ FeatureFXSR, FeatureSlowBTMem, FeatureSlowSHLD]>;
+def : Proc<"athlon-mp", [FeatureSlowUAMem16, FeatureSSE1, Feature3DNowA,
+ FeatureFXSR, FeatureSlowBTMem, FeatureSlowSHLD]>;
+def : Proc<"k8", [FeatureSlowUAMem16, FeatureSSE2, Feature3DNowA,
+ FeatureFXSR, Feature64Bit, FeatureSlowBTMem,
+ FeatureSlowSHLD]>;
+def : Proc<"opteron", [FeatureSlowUAMem16, FeatureSSE2, Feature3DNowA,
+ FeatureFXSR, Feature64Bit, FeatureSlowBTMem,
+ FeatureSlowSHLD]>;
+def : Proc<"athlon64", [FeatureSlowUAMem16, FeatureSSE2, Feature3DNowA,
+ FeatureFXSR, Feature64Bit, FeatureSlowBTMem,
+ FeatureSlowSHLD]>;
+def : Proc<"athlon-fx", [FeatureSlowUAMem16, FeatureSSE2, Feature3DNowA,
+ FeatureFXSR, Feature64Bit, FeatureSlowBTMem,
+ FeatureSlowSHLD]>;
+def : Proc<"k8-sse3", [FeatureSlowUAMem16, FeatureSSE3, Feature3DNowA,
+ FeatureFXSR, FeatureCMPXCHG16B, FeatureSlowBTMem,
+ FeatureSlowSHLD]>;
+def : Proc<"opteron-sse3", [FeatureSlowUAMem16, FeatureSSE3, Feature3DNowA,
+ FeatureFXSR, FeatureCMPXCHG16B, FeatureSlowBTMem,
+ FeatureSlowSHLD]>;
+def : Proc<"athlon64-sse3", [FeatureSlowUAMem16, FeatureSSE3, Feature3DNowA,
+ FeatureFXSR, FeatureCMPXCHG16B, FeatureSlowBTMem,
+ FeatureSlowSHLD]>;
+def : Proc<"amdfam10", [FeatureSSE4A, Feature3DNowA, FeatureFXSR,
+ FeatureCMPXCHG16B, FeatureLZCNT, FeaturePOPCNT,
+ FeatureSlowBTMem, FeatureSlowSHLD, FeatureLAHFSAHF]>;
+def : Proc<"barcelona", [FeatureSSE4A, Feature3DNowA, FeatureFXSR,
+ FeatureCMPXCHG16B, FeatureLZCNT, FeaturePOPCNT,
+ FeatureSlowBTMem, FeatureSlowSHLD, FeatureLAHFSAHF]>;
+
+// Bobcat
+def : Proc<"btver1", [
+ FeatureMMX,
+ FeatureSSSE3,
+ FeatureSSE4A,
+ FeatureFXSR,
+ FeatureCMPXCHG16B,
+ FeaturePRFCHW,
+ FeatureLZCNT,
+ FeaturePOPCNT,
+ FeatureXSAVE,
+ FeatureSlowSHLD,
+ FeatureLAHFSAHF
+]>;
+
+// Jaguar
+def : ProcessorModel<"btver2", BtVer2Model, [
+ FeatureMMX,
+ FeatureAVX,
+ FeatureFXSR,
+ FeatureSSE4A,
+ FeatureCMPXCHG16B,
+ FeaturePRFCHW,
+ FeatureAES,
+ FeaturePCLMUL,
+ FeatureBMI,
+ FeatureF16C,
+ FeatureMOVBE,
+ FeatureLZCNT,
+ FeaturePOPCNT,
+ FeatureXSAVE,
+ FeatureXSAVEOPT,
+ FeatureSlowSHLD,
+ FeatureLAHFSAHF
+]>;
+
+// Bulldozer
+def : Proc<"bdver1", [
+ FeatureXOP,
+ FeatureFMA4,
+ FeatureCMPXCHG16B,
+ FeatureAES,
+ FeaturePRFCHW,
+ FeaturePCLMUL,
+ FeatureMMX,
+ FeatureAVX,
+ FeatureFXSR,
+ FeatureSSE4A,
+ FeatureLZCNT,
+ FeaturePOPCNT,
+ FeatureXSAVE,
+ FeatureSlowSHLD,
+ FeatureLAHFSAHF
+]>;
+// Piledriver
+def : Proc<"bdver2", [
+ FeatureXOP,
+ FeatureFMA4,
+ FeatureCMPXCHG16B,
+ FeatureAES,
+ FeaturePRFCHW,
+ FeaturePCLMUL,
+ FeatureMMX,
+ FeatureAVX,
+ FeatureFXSR,
+ FeatureSSE4A,
+ FeatureF16C,
+ FeatureLZCNT,
+ FeaturePOPCNT,
+ FeatureXSAVE,
+ FeatureBMI,
+ FeatureTBM,
+ FeatureFMA,
+ FeatureSlowSHLD,
+ FeatureLAHFSAHF
+]>;
+
+// Steamroller
+def : Proc<"bdver3", [
+ FeatureXOP,
+ FeatureFMA4,
+ FeatureCMPXCHG16B,
+ FeatureAES,
+ FeaturePRFCHW,
+ FeaturePCLMUL,
+ FeatureMMX,
+ FeatureAVX,
+ FeatureFXSR,
+ FeatureSSE4A,
+ FeatureF16C,
+ FeatureLZCNT,
+ FeaturePOPCNT,
+ FeatureXSAVE,
+ FeatureBMI,
+ FeatureTBM,
+ FeatureFMA,
+ FeatureXSAVEOPT,
+ FeatureSlowSHLD,
+ FeatureFSGSBase,
+ FeatureLAHFSAHF
+]>;
+
+// Excavator
+def : Proc<"bdver4", [
+ FeatureMMX,
+ FeatureAVX2,
+ FeatureFXSR,
+ FeatureXOP,
+ FeatureFMA4,
+ FeatureCMPXCHG16B,
+ FeatureAES,
+ FeaturePRFCHW,
+ FeaturePCLMUL,
+ FeatureF16C,
+ FeatureLZCNT,
+ FeaturePOPCNT,
+ FeatureXSAVE,
+ FeatureBMI,
+ FeatureBMI2,
+ FeatureTBM,
+ FeatureFMA,
+ FeatureXSAVEOPT,
+ FeatureFSGSBase,
+ FeatureLAHFSAHF
+]>;
+
+def : Proc<"geode", [FeatureSlowUAMem16, Feature3DNowA]>;
+
+def : Proc<"winchip-c6", [FeatureSlowUAMem16, FeatureMMX]>;
+def : Proc<"winchip2", [FeatureSlowUAMem16, Feature3DNow]>;
+def : Proc<"c3", [FeatureSlowUAMem16, Feature3DNow]>;
+def : Proc<"c3-2", [FeatureSlowUAMem16, FeatureMMX, FeatureSSE1, FeatureFXSR]>;
+
+// We also provide a generic 64-bit specific x86 processor model which tries to
+// be good for modern chips without enabling instruction set encodings past the
+// basic SSE2 and 64-bit ones. It disables slow things from any mainstream and
+// modern 64-bit x86 chip, and enables features that are generally beneficial.
+//
+// We currently use the Sandy Bridge model as the default scheduling model as
+// we use it across Nehalem, Westmere, Sandy Bridge, and Ivy Bridge which
+// covers a huge swath of x86 processors. If there are specific scheduling
+// knobs which need to be tuned differently for AMD chips, we might consider
+// forming a common base for them.
+def : ProcessorModel<"x86-64", SandyBridgeModel,
+ [FeatureMMX, FeatureSSE2, FeatureFXSR, Feature64Bit,
+ FeatureSlowBTMem ]>;
+
+//===----------------------------------------------------------------------===//
+// Register File Description
+//===----------------------------------------------------------------------===//
+
+include "X86RegisterInfo.td"
+
+//===----------------------------------------------------------------------===//
+// Instruction Descriptions
+//===----------------------------------------------------------------------===//
+
+include "X86InstrInfo.td"
+
+def X86InstrInfo : InstrInfo;
+
+//===----------------------------------------------------------------------===//
+// Calling Conventions
+//===----------------------------------------------------------------------===//
+
+include "X86CallingConv.td"
+
+
+//===----------------------------------------------------------------------===//
+// Assembly Parser
+//===----------------------------------------------------------------------===//
+
+def ATTAsmParserVariant : AsmParserVariant {
+ int Variant = 0;
+
+ // Variant name.
+ string Name = "att";
+
+ // Discard comments in assembly strings.
+ string CommentDelimiter = "#";
+
+ // Recognize hard coded registers.
+ string RegisterPrefix = "%";
+}
+
+def IntelAsmParserVariant : AsmParserVariant {
+ int Variant = 1;
+
+ // Variant name.
+ string Name = "intel";
+
+ // Discard comments in assembly strings.
+ string CommentDelimiter = ";";
+
+ // Recognize hard coded registers.
+ string RegisterPrefix = "";
+}
+
+//===----------------------------------------------------------------------===//
+// Assembly Printers
+//===----------------------------------------------------------------------===//
+
+// The X86 target supports two different syntaxes for emitting machine code.
+// This is controlled by the -x86-asm-syntax={att|intel} flag.
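+// For example, "llc -x86-asm-syntax=intel foo.ll" prints Intel-syntax assembly.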
+def ATTAsmWriter : AsmWriter {
+ string AsmWriterClassName = "ATTInstPrinter";
+ int Variant = 0;
+}
+def IntelAsmWriter : AsmWriter {
+ string AsmWriterClassName = "IntelInstPrinter";
+ int Variant = 1;
+}
+
+def X86 : Target {
+ // Information about the instructions...
+ let InstructionSet = X86InstrInfo;
+ let AssemblyParserVariants = [ATTAsmParserVariant, IntelAsmParserVariant];
+ let AssemblyWriters = [ATTAsmWriter, IntelAsmWriter];
+}
diff --git a/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp b/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp
new file mode 100644
index 0000000..2170e62
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp
@@ -0,0 +1,706 @@
+//===-- X86AsmPrinter.cpp - Convert X86 LLVM code to AT&T assembly --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to X86 machine code.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86AsmPrinter.h"
+#include "InstPrinter/X86ATTInstPrinter.h"
+#include "MCTargetDesc/X86BaseInfo.h"
+#include "X86InstrInfo.h"
+#include "X86MachineFunctionInfo.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSectionCOFF.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/COFF.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// Primitive Helper Functions.
+//===----------------------------------------------------------------------===//
+
+/// runOnMachineFunction - Emit the function body.
+///
+bool X86AsmPrinter::runOnMachineFunction(MachineFunction &MF) {
+ Subtarget = &MF.getSubtarget<X86Subtarget>();
+
+ SMShadowTracker.startFunction(MF);
+
+ SetupMachineFunction(MF);
+
+ if (Subtarget->isTargetCOFF()) {
+ bool Intrn = MF.getFunction()->hasInternalLinkage();
+ OutStreamer->BeginCOFFSymbolDef(CurrentFnSym);
+ OutStreamer->EmitCOFFSymbolStorageClass(Intrn ? COFF::IMAGE_SYM_CLASS_STATIC
+ : COFF::IMAGE_SYM_CLASS_EXTERNAL);
+ OutStreamer->EmitCOFFSymbolType(COFF::IMAGE_SYM_DTYPE_FUNCTION
+ << COFF::SCT_COMPLEX_TYPE_SHIFT);
+ OutStreamer->EndCOFFSymbolDef();
+ }
+
+ // Emit the rest of the function body.
+ EmitFunctionBody();
+
+ // We didn't modify anything.
+ return false;
+}
+
+/// printSymbolOperand - Print a raw symbol reference operand. This handles
+/// jump tables, constant pools, global addresses and external symbols, all of
+/// which print to a label with various suffixes for relocation types etc.
+static void printSymbolOperand(X86AsmPrinter &P, const MachineOperand &MO,
+ raw_ostream &O) {
+ switch (MO.getType()) {
+ default: llvm_unreachable("unknown symbol type!");
+ case MachineOperand::MO_ConstantPoolIndex:
+ P.GetCPISymbol(MO.getIndex())->print(O, P.MAI);
+ P.printOffset(MO.getOffset(), O);
+ break;
+ case MachineOperand::MO_GlobalAddress: {
+ const GlobalValue *GV = MO.getGlobal();
+
+ MCSymbol *GVSym;
+ if (MO.getTargetFlags() == X86II::MO_DARWIN_STUB)
+ GVSym = P.getSymbolWithGlobalValueBase(GV, "$stub");
+ else if (MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY ||
+ MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY_PIC_BASE ||
+ MO.getTargetFlags() == X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE)
+ GVSym = P.getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
+ else
+ GVSym = P.getSymbol(GV);
+
+ // Handle dllimport linkage.
+ if (MO.getTargetFlags() == X86II::MO_DLLIMPORT)
+ GVSym =
+ P.OutContext.getOrCreateSymbol(Twine("__imp_") + GVSym->getName());
+
+ if (MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY ||
+ MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY_PIC_BASE) {
+ MCSymbol *Sym = P.getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
+ MachineModuleInfoImpl::StubValueTy &StubSym =
+ P.MMI->getObjFileInfo<MachineModuleInfoMachO>().getGVStubEntry(Sym);
+ if (!StubSym.getPointer())
+ StubSym = MachineModuleInfoImpl::
+ StubValueTy(P.getSymbol(GV), !GV->hasInternalLinkage());
+ } else if (MO.getTargetFlags() == X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE){
+ MCSymbol *Sym = P.getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
+ MachineModuleInfoImpl::StubValueTy &StubSym =
+ P.MMI->getObjFileInfo<MachineModuleInfoMachO>().getHiddenGVStubEntry(
+ Sym);
+ if (!StubSym.getPointer())
+ StubSym = MachineModuleInfoImpl::
+ StubValueTy(P.getSymbol(GV), !GV->hasInternalLinkage());
+ } else if (MO.getTargetFlags() == X86II::MO_DARWIN_STUB) {
+ MCSymbol *Sym = P.getSymbolWithGlobalValueBase(GV, "$stub");
+ MachineModuleInfoImpl::StubValueTy &StubSym =
+ P.MMI->getObjFileInfo<MachineModuleInfoMachO>().getFnStubEntry(Sym);
+ if (!StubSym.getPointer())
+ StubSym = MachineModuleInfoImpl::
+ StubValueTy(P.getSymbol(GV), !GV->hasInternalLinkage());
+ }
+
+ // If the name begins with a dollar-sign, enclose it in parens. We do this
+ // to avoid having it look like an integer immediate to the assembler.
+ if (GVSym->getName()[0] != '$')
+ GVSym->print(O, P.MAI);
+ else {
+ O << '(';
+ GVSym->print(O, P.MAI);
+ O << ')';
+ }
+ P.printOffset(MO.getOffset(), O);
+ break;
+ }
+ }
+
+ switch (MO.getTargetFlags()) {
+ default:
+ llvm_unreachable("Unknown target flag on GV operand");
+ case X86II::MO_NO_FLAG: // No flag.
+ break;
+ case X86II::MO_DARWIN_NONLAZY:
+ case X86II::MO_DLLIMPORT:
+ case X86II::MO_DARWIN_STUB:
+ // These affect the name of the symbol, not any suffix.
+ break;
+ case X86II::MO_GOT_ABSOLUTE_ADDRESS:
+ O << " + [.-";
+ P.MF->getPICBaseSymbol()->print(O, P.MAI);
+ O << ']';
+ break;
+ case X86II::MO_PIC_BASE_OFFSET:
+ case X86II::MO_DARWIN_NONLAZY_PIC_BASE:
+ case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE:
+ O << '-';
+ P.MF->getPICBaseSymbol()->print(O, P.MAI);
+ break;
+ case X86II::MO_TLSGD: O << "@TLSGD"; break;
+ case X86II::MO_TLSLD: O << "@TLSLD"; break;
+ case X86II::MO_TLSLDM: O << "@TLSLDM"; break;
+ case X86II::MO_GOTTPOFF: O << "@GOTTPOFF"; break;
+ case X86II::MO_INDNTPOFF: O << "@INDNTPOFF"; break;
+ case X86II::MO_TPOFF: O << "@TPOFF"; break;
+ case X86II::MO_DTPOFF: O << "@DTPOFF"; break;
+ case X86II::MO_NTPOFF: O << "@NTPOFF"; break;
+ case X86II::MO_GOTNTPOFF: O << "@GOTNTPOFF"; break;
+ case X86II::MO_GOTPCREL: O << "@GOTPCREL"; break;
+ case X86II::MO_GOT: O << "@GOT"; break;
+ case X86II::MO_GOTOFF: O << "@GOTOFF"; break;
+ case X86II::MO_PLT: O << "@PLT"; break;
+ case X86II::MO_TLVP: O << "@TLVP"; break;
+ case X86II::MO_TLVP_PIC_BASE:
+ O << "@TLVP" << '-';
+ P.MF->getPICBaseSymbol()->print(O, P.MAI);
+ break;
+ case X86II::MO_SECREL: O << "@SECREL32"; break;
+ }
+}
+
+static void printOperand(X86AsmPrinter &P, const MachineInstr *MI,
+ unsigned OpNo, raw_ostream &O,
+ const char *Modifier = nullptr, unsigned AsmVariant = 0);
+
+/// printPCRelImm - This is used to print an immediate value that ends up
+/// being encoded as a pc-relative value. These print slightly differently, for
+/// example, a $ is not emitted.
+static void printPCRelImm(X86AsmPrinter &P, const MachineInstr *MI,
+ unsigned OpNo, raw_ostream &O) {
+ const MachineOperand &MO = MI->getOperand(OpNo);
+ switch (MO.getType()) {
+ default: llvm_unreachable("Unknown pcrel immediate operand");
+ case MachineOperand::MO_Register:
+ // pc-relativeness was handled when computing the value in the reg.
+ printOperand(P, MI, OpNo, O);
+ return;
+ case MachineOperand::MO_Immediate:
+ O << MO.getImm();
+ return;
+ case MachineOperand::MO_GlobalAddress:
+ printSymbolOperand(P, MO, O);
+ return;
+ }
+}
+
+static void printOperand(X86AsmPrinter &P, const MachineInstr *MI,
+ unsigned OpNo, raw_ostream &O, const char *Modifier,
+ unsigned AsmVariant) {
+ const MachineOperand &MO = MI->getOperand(OpNo);
+ switch (MO.getType()) {
+ default: llvm_unreachable("unknown operand type!");
+ case MachineOperand::MO_Register: {
+ // FIXME: Enumerate AsmVariant so we can remove this magic number.
+ if (AsmVariant == 0) O << '%';
+ unsigned Reg = MO.getReg();
+ if (Modifier && strncmp(Modifier, "subreg", strlen("subreg")) == 0) {
+ unsigned Size = (strcmp(Modifier+6,"64") == 0) ? 64 :
+ (strcmp(Modifier+6,"32") == 0) ? 32 :
+ (strcmp(Modifier+6,"16") == 0) ? 16 : 8;
+ Reg = getX86SubSuperRegister(Reg, Size);
+ }
+ O << X86ATTInstPrinter::getRegisterName(Reg);
+ return;
+ }
+
+ case MachineOperand::MO_Immediate:
+ if (AsmVariant == 0) O << '$';
+ O << MO.getImm();
+ return;
+
+ case MachineOperand::MO_GlobalAddress: {
+ if (AsmVariant == 0) O << '$';
+ printSymbolOperand(P, MO, O);
+ break;
+ }
+ }
+}
+
+static void printLeaMemReference(X86AsmPrinter &P, const MachineInstr *MI,
+ unsigned Op, raw_ostream &O,
+ const char *Modifier = nullptr) {
+ const MachineOperand &BaseReg = MI->getOperand(Op+X86::AddrBaseReg);
+ const MachineOperand &IndexReg = MI->getOperand(Op+X86::AddrIndexReg);
+ const MachineOperand &DispSpec = MI->getOperand(Op+X86::AddrDisp);
+
+ // If we really don't want to print out (rip), don't.
+ bool HasBaseReg = BaseReg.getReg() != 0;
+ if (HasBaseReg && Modifier && !strcmp(Modifier, "no-rip") &&
+ BaseReg.getReg() == X86::RIP)
+ HasBaseReg = false;
+
+ // HasParenPart - True if we will print out the () part of the mem ref.
+ bool HasParenPart = IndexReg.getReg() || HasBaseReg;
+
+ switch (DispSpec.getType()) {
+ default:
+ llvm_unreachable("unknown operand type!");
+ case MachineOperand::MO_Immediate: {
+ int DispVal = DispSpec.getImm();
+ if (DispVal || !HasParenPart)
+ O << DispVal;
+ break;
+ }
+ case MachineOperand::MO_GlobalAddress:
+ case MachineOperand::MO_ConstantPoolIndex:
+ printSymbolOperand(P, DispSpec, O);
+ }
+
+ if (Modifier && strcmp(Modifier, "H") == 0)
+ O << "+8";
+
+ if (HasParenPart) {
+ assert(IndexReg.getReg() != X86::ESP &&
+ "X86 doesn't allow scaling by ESP");
+
+ O << '(';
+ if (HasBaseReg)
+ printOperand(P, MI, Op+X86::AddrBaseReg, O, Modifier);
+
+ if (IndexReg.getReg()) {
+ O << ',';
+ printOperand(P, MI, Op+X86::AddrIndexReg, O, Modifier);
+ unsigned ScaleVal = MI->getOperand(Op+X86::AddrScaleAmt).getImm();
+ if (ScaleVal != 1)
+ O << ',' << ScaleVal;
+ }
+ O << ')';
+ }
+}
+
+static void printMemReference(X86AsmPrinter &P, const MachineInstr *MI,
+ unsigned Op, raw_ostream &O,
+ const char *Modifier = nullptr) {
+ assert(isMem(MI, Op) && "Invalid memory reference!");
+ const MachineOperand &Segment = MI->getOperand(Op+X86::AddrSegmentReg);
+ if (Segment.getReg()) {
+ printOperand(P, MI, Op+X86::AddrSegmentReg, O, Modifier);
+ O << ':';
+ }
+ printLeaMemReference(P, MI, Op, O, Modifier);
+}
+
+static void printIntelMemReference(X86AsmPrinter &P, const MachineInstr *MI,
+ unsigned Op, raw_ostream &O,
+ const char *Modifier = nullptr,
+ unsigned AsmVariant = 1) {
+ const MachineOperand &BaseReg = MI->getOperand(Op+X86::AddrBaseReg);
+ unsigned ScaleVal = MI->getOperand(Op+X86::AddrScaleAmt).getImm();
+ const MachineOperand &IndexReg = MI->getOperand(Op+X86::AddrIndexReg);
+ const MachineOperand &DispSpec = MI->getOperand(Op+X86::AddrDisp);
+ const MachineOperand &SegReg = MI->getOperand(Op+X86::AddrSegmentReg);
+
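+ // The result reads like "[esp + 8*eax + 16]" (an illustrative example).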
+ // If this has a segment register, print it.
+ if (SegReg.getReg()) {
+ printOperand(P, MI, Op+X86::AddrSegmentReg, O, Modifier, AsmVariant);
+ O << ':';
+ }
+
+ O << '[';
+
+ bool NeedPlus = false;
+ if (BaseReg.getReg()) {
+ printOperand(P, MI, Op+X86::AddrBaseReg, O, Modifier, AsmVariant);
+ NeedPlus = true;
+ }
+
+ if (IndexReg.getReg()) {
+ if (NeedPlus) O << " + ";
+ if (ScaleVal != 1)
+ O << ScaleVal << '*';
+ printOperand(P, MI, Op+X86::AddrIndexReg, O, Modifier, AsmVariant);
+ NeedPlus = true;
+ }
+
+ if (!DispSpec.isImm()) {
+ if (NeedPlus) O << " + ";
+ printOperand(P, MI, Op+X86::AddrDisp, O, Modifier, AsmVariant);
+ } else {
+ int64_t DispVal = DispSpec.getImm();
+ if (DispVal || (!IndexReg.getReg() && !BaseReg.getReg())) {
+ if (NeedPlus) {
+ if (DispVal > 0)
+ O << " + ";
+ else {
+ O << " - ";
+ DispVal = -DispVal;
+ }
+ }
+ O << DispVal;
+ }
+ }
+ O << ']';
+}
+
+static bool printAsmMRegister(X86AsmPrinter &P, const MachineOperand &MO,
+ char Mode, raw_ostream &O) {
+ unsigned Reg = MO.getReg();
+ switch (Mode) {
+ default: return true; // Unknown mode.
+ case 'b': // Print QImode register
+ Reg = getX86SubSuperRegister(Reg, 8);
+ break;
+ case 'h': // Print QImode high register
+ Reg = getX86SubSuperRegister(Reg, 8, true);
+ break;
+ case 'w': // Print HImode register
+ Reg = getX86SubSuperRegister(Reg, 16);
+ break;
+ case 'k': // Print SImode register
+ Reg = getX86SubSuperRegister(Reg, 32);
+ break;
+ case 'q':
+ // Print 64-bit register names if 64-bit integer registers are available.
+ // Otherwise, print 32-bit register names.
+ Reg = getX86SubSuperRegister(Reg, P.getSubtarget().is64Bit() ? 64 : 32);
+ break;
+ }
+
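+ // For example, if MO names %eax: 'b' prints %al, 'h' prints %ah, 'w' prints
+ // %ax, and 'q' prints %rax (or %eax when 64-bit registers are unavailable).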
+ O << '%' << X86ATTInstPrinter::getRegisterName(Reg);
+ return false;
+}
+
+/// PrintAsmOperand - Print out an operand for an inline asm expression.
+///
+bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant,
+ const char *ExtraCode, raw_ostream &O) {
+ // Does this asm operand have a single letter operand modifier?
+ if (ExtraCode && ExtraCode[0]) {
+ if (ExtraCode[1] != 0) return true; // Unknown modifier.
+
+ const MachineOperand &MO = MI->getOperand(OpNo);
+
+ switch (ExtraCode[0]) {
+ default:
+ // See if this is a generic print operand
+ return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O);
+ case 'a': // This is an address. Currently only 'i' and 'r' are expected.
+ switch (MO.getType()) {
+ default:
+ return true;
+ case MachineOperand::MO_Immediate:
+ O << MO.getImm();
+ return false;
+ case MachineOperand::MO_ConstantPoolIndex:
+ case MachineOperand::MO_JumpTableIndex:
+ case MachineOperand::MO_ExternalSymbol:
+ llvm_unreachable("unexpected operand type!");
+ case MachineOperand::MO_GlobalAddress:
+ printSymbolOperand(*this, MO, O);
+ if (Subtarget->isPICStyleRIPRel())
+ O << "(%rip)";
+ return false;
+ case MachineOperand::MO_Register:
+ O << '(';
+ printOperand(*this, MI, OpNo, O);
+ O << ')';
+ return false;
+ }
+
+ case 'c': // Don't print "$" before a global var name or constant.
+ switch (MO.getType()) {
+ default:
+ printOperand(*this, MI, OpNo, O);
+ break;
+ case MachineOperand::MO_Immediate:
+ O << MO.getImm();
+ break;
+ case MachineOperand::MO_ConstantPoolIndex:
+ case MachineOperand::MO_JumpTableIndex:
+ case MachineOperand::MO_ExternalSymbol:
+ llvm_unreachable("unexpected operand type!");
+ case MachineOperand::MO_GlobalAddress:
+ printSymbolOperand(*this, MO, O);
+ break;
+ }
+ return false;
+
+ case 'A': // Print '*' before a register (it must be a register)
+ if (MO.isReg()) {
+ O << '*';
+ printOperand(*this, MI, OpNo, O);
+ return false;
+ }
+ return true;
+
+ case 'b': // Print QImode register
+ case 'h': // Print QImode high register
+ case 'w': // Print HImode register
+ case 'k': // Print SImode register
+ case 'q': // Print DImode register
+ if (MO.isReg())
+ return printAsmMRegister(*this, MO, ExtraCode[0], O);
+ printOperand(*this, MI, OpNo, O);
+ return false;
+
+ case 'P': // This is the operand of a call, treat specially.
+ printPCRelImm(*this, MI, OpNo, O);
+ return false;
+
+ case 'n': // Negate the immediate or print a '-' before the operand.
+ // Note: this is a temporary solution. It should be handled target
+ // independently as part of the 'MC' work.
+ if (MO.isImm()) {
+ O << -MO.getImm();
+ return false;
+ }
+ O << '-';
+ }
+ }
+
+ printOperand(*this, MI, OpNo, O, /*Modifier*/ nullptr, AsmVariant);
+ return false;
+}
+
+bool X86AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
+ unsigned OpNo, unsigned AsmVariant,
+ const char *ExtraCode,
+ raw_ostream &O) {
+ if (AsmVariant) {
+ printIntelMemReference(*this, MI, OpNo, O);
+ return false;
+ }
+
+ if (ExtraCode && ExtraCode[0]) {
+ if (ExtraCode[1] != 0) return true; // Unknown modifier.
+
+ switch (ExtraCode[0]) {
+ default: return true; // Unknown modifier.
+ case 'b': // Print QImode register
+ case 'h': // Print QImode high register
+ case 'w': // Print HImode register
+ case 'k': // Print SImode register
+ case 'q': // Print DImode register
+ // These only apply to registers, ignore on mem.
+ break;
+ case 'H':
+ printMemReference(*this, MI, OpNo, O, "H");
+ return false;
+ case 'P': // Don't print @PLT, but do print as memory.
+ printMemReference(*this, MI, OpNo, O, "no-rip");
+ return false;
+ }
+ }
+ printMemReference(*this, MI, OpNo, O);
+ return false;
+}
+
+void X86AsmPrinter::EmitStartOfAsmFile(Module &M) {
+ const Triple &TT = TM.getTargetTriple();
+
+ if (TT.isOSBinFormatMachO())
+ OutStreamer->SwitchSection(getObjFileLowering().getTextSection());
+
+ if (TT.isOSBinFormatCOFF()) {
+ // Emit an absolute @feat.00 symbol. This appears to be some kind of
+ // compiler features bitfield read by link.exe.
+ if (TT.getArch() == Triple::x86) {
+ MCSymbol *S = MMI->getContext().getOrCreateSymbol(StringRef("@feat.00"));
+ OutStreamer->BeginCOFFSymbolDef(S);
+ OutStreamer->EmitCOFFSymbolStorageClass(COFF::IMAGE_SYM_CLASS_STATIC);
+ OutStreamer->EmitCOFFSymbolType(COFF::IMAGE_SYM_DTYPE_NULL);
+ OutStreamer->EndCOFFSymbolDef();
+ // According to the PE-COFF spec, the LSB of this value marks the object
+ // for "registered SEH". This means that all SEH handler entry points
+ // must be registered in .sxdata. Use of any unregistered handlers will
+ // cause the process to terminate immediately. LLVM does not know how to
+ // register any SEH handlers, so its object files should be safe.
+ OutStreamer->EmitSymbolAttribute(S, MCSA_Global);
+ OutStreamer->EmitAssignment(
+ S, MCConstantExpr::create(int64_t(1), MMI->getContext()));
+ }
+ }
+ OutStreamer->EmitSyntaxDirective();
+}
+
+static void
+emitNonLazySymbolPointer(MCStreamer &OutStreamer, MCSymbol *StubLabel,
+ MachineModuleInfoImpl::StubValueTy &MCSym) {
+ // L_foo$stub:
+ OutStreamer.EmitLabel(StubLabel);
+ // .indirect_symbol _foo
+ OutStreamer.EmitSymbolAttribute(MCSym.getPointer(), MCSA_IndirectSymbol);
+
+ if (MCSym.getInt())
+ // External to current translation unit.
+ OutStreamer.EmitIntValue(0, 4/*size*/);
+ else
+ // Internal to current translation unit.
+ //
+ // When we place the LSDA into the TEXT section, the type info
+ // pointers need to be indirect and pc-rel. We accomplish this by
+ // using NLPs; however, sometimes the types are local to the file.
+ // We need to fill in the value for the NLP in those cases.
+ OutStreamer.EmitValue(
+ MCSymbolRefExpr::create(MCSym.getPointer(), OutStreamer.getContext()),
+ 4 /*size*/);
+}
+
+MCSymbol *X86AsmPrinter::GetCPISymbol(unsigned CPID) const {
+ if (Subtarget->isTargetKnownWindowsMSVC()) {
+ const MachineConstantPoolEntry &CPE =
+ MF->getConstantPool()->getConstants()[CPID];
+ if (!CPE.isMachineConstantPoolEntry()) {
+ const DataLayout &DL = MF->getDataLayout();
+ SectionKind Kind = CPE.getSectionKind(&DL);
+ const Constant *C = CPE.Val.ConstVal;
+ if (const MCSectionCOFF *S = dyn_cast<MCSectionCOFF>(
+ getObjFileLowering().getSectionForConstant(DL, Kind, C))) {
+ if (MCSymbol *Sym = S->getCOMDATSymbol()) {
+ if (Sym->isUndefined())
+ OutStreamer->EmitSymbolAttribute(Sym, MCSA_Global);
+ return Sym;
+ }
+ }
+ }
+ }
+
+ return AsmPrinter::GetCPISymbol(CPID);
+}
+
+void X86AsmPrinter::EmitEndOfAsmFile(Module &M) {
+ const Triple &TT = TM.getTargetTriple();
+
+ if (TT.isOSBinFormatMachO()) {
+ // All darwin targets use mach-o.
+ MachineModuleInfoMachO &MMIMacho =
+ MMI->getObjFileInfo<MachineModuleInfoMachO>();
+
+ // Output stubs for dynamically-linked functions.
+ MachineModuleInfoMachO::SymbolListTy Stubs;
+
+ Stubs = MMIMacho.GetFnStubList();
+ if (!Stubs.empty()) {
+ MCSection *TheSection = OutContext.getMachOSection(
+ "__IMPORT", "__jump_table",
+ MachO::S_SYMBOL_STUBS | MachO::S_ATTR_SELF_MODIFYING_CODE |
+ MachO::S_ATTR_PURE_INSTRUCTIONS,
+ 5, SectionKind::getMetadata());
+ OutStreamer->SwitchSection(TheSection);
+
+ for (const auto &Stub : Stubs) {
+ // L_foo$stub:
+ OutStreamer->EmitLabel(Stub.first);
+ // .indirect_symbol _foo
+ OutStreamer->EmitSymbolAttribute(Stub.second.getPointer(),
+ MCSA_IndirectSymbol);
+ // hlt; hlt; hlt; hlt; hlt (hlt = 0xf4).
+ const char HltInsts[] = "\xf4\xf4\xf4\xf4\xf4";
+ OutStreamer->EmitBytes(StringRef(HltInsts, 5));
+ }
+
+ Stubs.clear();
+ OutStreamer->AddBlankLine();
+ }
+
+ // Output stubs for external and common global variables.
+ Stubs = MMIMacho.GetGVStubList();
+ if (!Stubs.empty()) {
+ MCSection *TheSection = OutContext.getMachOSection(
+ "__IMPORT", "__pointers", MachO::S_NON_LAZY_SYMBOL_POINTERS,
+ SectionKind::getMetadata());
+ OutStreamer->SwitchSection(TheSection);
+
+ for (auto &Stub : Stubs)
+ emitNonLazySymbolPointer(*OutStreamer, Stub.first, Stub.second);
+
+ Stubs.clear();
+ OutStreamer->AddBlankLine();
+ }
+
+ Stubs = MMIMacho.GetHiddenGVStubList();
+ if (!Stubs.empty()) {
+ MCSection *TheSection = OutContext.getMachOSection(
+ "__IMPORT", "__pointers", MachO::S_NON_LAZY_SYMBOL_POINTERS,
+ SectionKind::getMetadata());
+ OutStreamer->SwitchSection(TheSection);
+
+ for (auto &Stub : Stubs)
+ emitNonLazySymbolPointer(*OutStreamer, Stub.first, Stub.second);
+
+ Stubs.clear();
+ OutStreamer->AddBlankLine();
+ }
+
+ SM.serializeToStackMapSection();
+ FM.serializeToFaultMapSection();
+
+ // Funny Darwin hack: This flag tells the linker that no global symbols
+ // contain code that falls through to other global symbols (e.g. the obvious
+ // implementation of multiple entry points). If this doesn't occur, the
+ // linker can safely perform dead code stripping. Since LLVM never
+ // generates code that does this, it is always safe to set.
+ OutStreamer->EmitAssemblerFlag(MCAF_SubsectionsViaSymbols);
+ }
+
+ if (TT.isKnownWindowsMSVCEnvironment() && MMI->usesVAFloatArgument()) {
+ StringRef SymbolName =
+ (TT.getArch() == Triple::x86_64) ? "_fltused" : "__fltused";
+ MCSymbol *S = MMI->getContext().getOrCreateSymbol(SymbolName);
+ OutStreamer->EmitSymbolAttribute(S, MCSA_Global);
+ }
+
+ if (TT.isOSBinFormatCOFF()) {
+ const TargetLoweringObjectFileCOFF &TLOFCOFF =
+ static_cast<const TargetLoweringObjectFileCOFF&>(getObjFileLowering());
+
+ std::string Flags;
+ raw_string_ostream FlagsOS(Flags);
+
+ for (const auto &Function : M)
+ TLOFCOFF.emitLinkerFlagsForGlobal(FlagsOS, &Function, *Mang);
+ for (const auto &Global : M.globals())
+ TLOFCOFF.emitLinkerFlagsForGlobal(FlagsOS, &Global, *Mang);
+ for (const auto &Alias : M.aliases())
+ TLOFCOFF.emitLinkerFlagsForGlobal(FlagsOS, &Alias, *Mang);
+
+ FlagsOS.flush();
+
+ // Output collected flags.
+ if (!Flags.empty()) {
+ OutStreamer->SwitchSection(TLOFCOFF.getDrectveSection());
+ OutStreamer->EmitBytes(Flags);
+ }
+
+ SM.serializeToStackMapSection();
+ }
+
+ if (TT.isOSBinFormatELF()) {
+ SM.serializeToStackMapSection();
+ FM.serializeToFaultMapSection();
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// Target Registry Stuff
+//===----------------------------------------------------------------------===//
+
+// Force static initialization.
+extern "C" void LLVMInitializeX86AsmPrinter() {
+ RegisterAsmPrinter<X86AsmPrinter> X(TheX86_32Target);
+ RegisterAsmPrinter<X86AsmPrinter> Y(TheX86_64Target);
+}
diff --git a/contrib/llvm/lib/Target/X86/X86AsmPrinter.h b/contrib/llvm/lib/Target/X86/X86AsmPrinter.h
new file mode 100644
index 0000000..9c8bd98
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86AsmPrinter.h
@@ -0,0 +1,131 @@
+//===-- X86AsmPrinter.h - X86 implementation of AsmPrinter ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86ASMPRINTER_H
+#define LLVM_LIB_TARGET_X86_X86ASMPRINTER_H
+
+#include "X86Subtarget.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/FaultMaps.h"
+#include "llvm/CodeGen/StackMaps.h"
+#include "llvm/Target/TargetMachine.h"
+
+// Implemented in X86MCInstLower.cpp
+namespace {
+ class X86MCInstLower;
+}
+
+namespace llvm {
+class MCStreamer;
+class MCSymbol;
+
+class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter {
+ const X86Subtarget *Subtarget;
+ StackMaps SM;
+ FaultMaps FM;
+
+ // This utility class tracks the length of a stackmap instruction's 'shadow'.
+ // It is used by the X86AsmPrinter to ensure that the stackmap shadow
+ // invariants (i.e. no other stackmaps, patchpoints, or control flow within
+ // the shadow) are met, while outputting a minimal number of NOPs for padding.
+ //
+ // To minimise the number of NOPs used, the shadow tracker counts the number
+ // of instruction bytes output since the last stackmap. Only if there are too
+ // few instruction bytes to cover the shadow are NOPs used for padding.
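+ //
+ // For example (illustrative numbers): if a stackmap requires an 8-byte
+ // shadow and only 5 bytes of instructions are emitted after it in the
+ // block, emitShadowPadding() pads with 3 bytes of NOPs.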
+ class StackMapShadowTracker {
+ public:
+ StackMapShadowTracker(TargetMachine &TM);
+ ~StackMapShadowTracker();
+ void startFunction(MachineFunction &MF);
+ void count(MCInst &Inst, const MCSubtargetInfo &STI);
+
+ // Called to signal the start of a shadow of RequiredSize bytes.
+ void reset(unsigned RequiredSize) {
+ RequiredShadowSize = RequiredSize;
+ CurrentShadowSize = 0;
+ InShadow = true;
+ }
+
+ // Called before every stackmap/patchpoint, and at the end of basic blocks,
+ // to emit any necessary padding-NOPs.
+ void emitShadowPadding(MCStreamer &OutStreamer, const MCSubtargetInfo &STI);
+ private:
+ TargetMachine &TM;
+ const MachineFunction *MF;
+ std::unique_ptr<MCCodeEmitter> CodeEmitter;
+ bool InShadow;
+
+ // RequiredShadowSize holds the length of the shadow specified in the most
+ // recently encountered STACKMAP instruction.
+ // CurrentShadowSize counts the number of bytes encoded since the most
+ // recently encountered STACKMAP, stopping when that number is greater than
+ // or equal to RequiredShadowSize.
+ unsigned RequiredShadowSize, CurrentShadowSize;
+ };
+
+ StackMapShadowTracker SMShadowTracker;
+
+ // All instructions emitted by the X86AsmPrinter should use this helper
+ // method.
+ //
+ // This helper function invokes the SMShadowTracker on each instruction before
+ // outputting it to OutStreamer. This allows the shadow tracker to minimise
+ // the number of NOPs used for stackmap padding.
+ void EmitAndCountInstruction(MCInst &Inst);
+ void LowerSTACKMAP(const MachineInstr &MI);
+ void LowerPATCHPOINT(const MachineInstr &MI, X86MCInstLower &MCIL);
+ void LowerSTATEPOINT(const MachineInstr &MI, X86MCInstLower &MCIL);
+ void LowerFAULTING_LOAD_OP(const MachineInstr &MI, X86MCInstLower &MCIL);
+
+ void LowerTlsAddr(X86MCInstLower &MCInstLowering, const MachineInstr &MI);
+
+ public:
+ explicit X86AsmPrinter(TargetMachine &TM,
+ std::unique_ptr<MCStreamer> Streamer)
+ : AsmPrinter(TM, std::move(Streamer)), SM(*this), FM(*this),
+ SMShadowTracker(TM) {}
+
+ const char *getPassName() const override {
+ return "X86 Assembly / Object Emitter";
+ }
+
+ const X86Subtarget &getSubtarget() const { return *Subtarget; }
+
+ void EmitStartOfAsmFile(Module &M) override;
+
+ void EmitEndOfAsmFile(Module &M) override;
+
+ void EmitInstruction(const MachineInstr *MI) override;
+
+ void EmitBasicBlockEnd(const MachineBasicBlock &MBB) override {
+ SMShadowTracker.emitShadowPadding(*OutStreamer, getSubtargetInfo());
+ }
+
+ bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant, const char *ExtraCode,
+ raw_ostream &OS) override;
+ bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant, const char *ExtraCode,
+ raw_ostream &OS) override;
+
+ /// \brief Return the symbol for the specified constant pool entry.
+ MCSymbol *GetCPISymbol(unsigned CPID) const override;
+
+ bool doInitialization(Module &M) override {
+ SMShadowTracker.reset(0);
+ SM.reset();
+ return AsmPrinter::doInitialization(M);
+ }
+
+ bool runOnMachineFunction(MachineFunction &F) override;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86CallFrameOptimization.cpp b/contrib/llvm/lib/Target/X86/X86CallFrameOptimization.cpp
new file mode 100644
index 0000000..fc6ee17
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86CallFrameOptimization.cpp
@@ -0,0 +1,558 @@
+//===----- X86CallFrameOptimization.cpp - Optimize x86 call sequences -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a pass that optimizes call sequences on x86.
+// Currently, it converts movs of function parameters onto the stack into
+// pushes. This is beneficial for two main reasons:
+// 1) The push instruction encoding is much smaller than an esp-relative mov
+// 2) It is possible to push memory arguments directly. So, if the
+// transformation is performed pre-reg-alloc, it can help relieve
+// register pressure.
+//
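+// For example (a sketch of the rewrite; pushes are emitted from the highest
+// stack slot down, so the values land in the same places):
+//   movl %eax, 4(%esp)         pushl %eax
+//   movl $42, (%esp)     ==>   pushl $42
+//   calll _foo                 calll _foo
+//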
+//===----------------------------------------------------------------------===//
+
+#include <algorithm>
+
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "X86MachineFunctionInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-cf-opt"
+
+static cl::opt<bool>
+ NoX86CFOpt("no-x86-call-frame-opt",
+ cl::desc("Avoid optimizing x86 call frames for size"),
+ cl::init(false), cl::Hidden);
+
+namespace {
+class X86CallFrameOptimization : public MachineFunctionPass {
+public:
+ X86CallFrameOptimization() : MachineFunctionPass(ID) {}
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+private:
+ // Information we know about a particular call site
+ struct CallContext {
+ CallContext()
+ : FrameSetup(nullptr), Call(nullptr), SPCopy(nullptr), ExpectedDist(0),
+ MovVector(4, nullptr), NoStackParams(false), UsePush(false) {}
+
+ // Iterator referring to the frame setup instruction
+ MachineBasicBlock::iterator FrameSetup;
+
+ // Actual call instruction
+ MachineInstr *Call;
+
+ // A copy of the stack pointer
+ MachineInstr *SPCopy;
+
+ // The total displacement of all passed parameters
+ int64_t ExpectedDist;
+
+ // The sequence of movs used to pass the parameters
+ SmallVector<MachineInstr *, 4> MovVector;
+
+ // True if this call site has no stack parameters
+ bool NoStackParams;
+
+ // True if this call site can use push instructions
+ bool UsePush;
+ };
+
+ typedef SmallVector<CallContext, 8> ContextVector;
+
+ bool isLegal(MachineFunction &MF);
+
+ bool isProfitable(MachineFunction &MF, ContextVector &CallSeqMap);
+
+ void collectCallInfo(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I, CallContext &Context);
+
+ bool adjustCallSequence(MachineFunction &MF, const CallContext &Context);
+
+ MachineInstr *canFoldIntoRegPush(MachineBasicBlock::iterator FrameSetup,
+ unsigned Reg);
+
+ enum InstClassification { Convert, Skip, Exit };
+
+ InstClassification classifyInstruction(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const X86RegisterInfo &RegInfo,
+ DenseSet<unsigned int> &UsedRegs);
+
+ const char *getPassName() const override { return "X86 Optimize Call Frame"; }
+
+ const TargetInstrInfo *TII;
+ const X86FrameLowering *TFL;
+ const X86Subtarget *STI;
+ const MachineRegisterInfo *MRI;
+ static char ID;
+};
+
+char X86CallFrameOptimization::ID = 0;
+}
+
+FunctionPass *llvm::createX86CallFrameOptimization() {
+ return new X86CallFrameOptimization();
+}
+
+// This checks whether the transformation is legal.
+// Also returns false in cases where it's potentially legal, but
+// we don't even want to try.
+bool X86CallFrameOptimization::isLegal(MachineFunction &MF) {
+ if (NoX86CFOpt.getValue())
+ return false;
+
+ // We currently only support call sequences where *all* parameters
+ // are passed on the stack.
+ // No point in running this in 64-bit mode, since some arguments are
+ // passed in-register in all common calling conventions, so the pattern
+ // we're looking for will never match.
+ if (STI->is64Bit())
+ return false;
+
+ // We can't encode multiple DW_CFA_GNU_args_size or DW_CFA_def_cfa_offset
+ // in the compact unwind encoding that Darwin uses. So, bail if there
+ // is a danger of that being generated.
+ if (STI->isTargetDarwin() &&
+ (!MF.getMMI().getLandingPads().empty() ||
+ (MF.getFunction()->needsUnwindTableEntry() && !TFL->hasFP(MF))))
+ return false;
+
+ // You would expect straight-line code between call-frame setup and
+ // call-frame destroy. You would be wrong. There are circumstances (e.g.
+ // CMOV_GR8 expansion of a select that feeds a function call!) where we can
+ // end up with the setup and the destroy in different basic blocks.
+ // This is bad, and breaks SP adjustment.
+ // So, check that all of the frames in the function are closed inside
+ // the same block, and, for good measure, that there are no nested frames.
+ unsigned FrameSetupOpcode = TII->getCallFrameSetupOpcode();
+ unsigned FrameDestroyOpcode = TII->getCallFrameDestroyOpcode();
+ for (MachineBasicBlock &BB : MF) {
+ bool InsideFrameSequence = false;
+ for (MachineInstr &MI : BB) {
+ if (MI.getOpcode() == FrameSetupOpcode) {
+ if (InsideFrameSequence)
+ return false;
+ InsideFrameSequence = true;
+ } else if (MI.getOpcode() == FrameDestroyOpcode) {
+ if (!InsideFrameSequence)
+ return false;
+ InsideFrameSequence = false;
+ }
+ }
+
+ if (InsideFrameSequence)
+ return false;
+ }
+
+ return true;
+}
+
+// Check whether this transformation is profitable for a particular
+// function - in terms of code size.
+bool X86CallFrameOptimization::isProfitable(MachineFunction &MF,
+ ContextVector &CallSeqVector) {
+ // This transformation is always a win when we do not expect to have
+ // a reserved call frame. Under other circumstances, it may be either
+ // a win or a loss, and requires a heuristic.
+ bool CannotReserveFrame = MF.getFrameInfo()->hasVarSizedObjects();
+ if (CannotReserveFrame)
+ return true;
+
+ // Don't do this when not optimizing for size.
+ if (!MF.getFunction()->optForSize())
+ return false;
+
+ unsigned StackAlign = TFL->getStackAlignment();
+
+ int64_t Advantage = 0;
+ for (auto CC : CallSeqVector) {
+ // Call sites where no parameters are passed on the stack
+ // do not affect the cost, since there needs to be no
+ // stack adjustment.
+ if (CC.NoStackParams)
+ continue;
+
+ if (!CC.UsePush) {
+ // If we don't use pushes for a particular call site,
+ // we pay for not having a reserved call frame with an
+ // additional sub/add esp pair. The cost is ~3 bytes per instruction,
+ // depending on the size of the constant.
+ // TODO: Callee-pop functions should have a smaller penalty, because
+ // an add is needed even with a reserved call frame.
+ Advantage -= 6;
+ } else {
+ // We can use pushes. First, account for the fixed costs.
+ // We'll need an add after the call.
+ Advantage -= 3;
+ // If we have to realign the stack, we'll also need a sub before the call.
+ if (CC.ExpectedDist % StackAlign)
+ Advantage -= 3;
+ // Now, for each push, we save ~3 bytes. For small constants, we actually
+ // save more (up to 5 bytes), but 3 should be a good approximation.
+ Advantage += (CC.ExpectedDist / 4) * 3;
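+ // Worked example (illustrative, assuming StackAlign == 4): a call passing
+ // three dwords (ExpectedDist == 12) saves 3*3 = 9 bytes from the pushes
+ // and pays 3 bytes for the post-call add, for a net Advantage of +6.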
+ }
+ }
+
+ return (Advantage >= 0);
+}
+
+bool X86CallFrameOptimization::runOnMachineFunction(MachineFunction &MF) {
+ STI = &MF.getSubtarget<X86Subtarget>();
+ TII = STI->getInstrInfo();
+ TFL = STI->getFrameLowering();
+ MRI = &MF.getRegInfo();
+
+ if (!isLegal(MF))
+ return false;
+
+ unsigned FrameSetupOpcode = TII->getCallFrameSetupOpcode();
+
+ bool Changed = false;
+
+ ContextVector CallSeqVector;
+
+ for (MachineFunction::iterator BB = MF.begin(), E = MF.end(); BB != E; ++BB)
+ for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ++I)
+ if (I->getOpcode() == FrameSetupOpcode) {
+ CallContext Context;
+ collectCallInfo(MF, *BB, I, Context);
+ CallSeqVector.push_back(Context);
+ }
+
+ if (!isProfitable(MF, CallSeqVector))
+ return false;
+
+ for (auto CC : CallSeqVector)
+ if (CC.UsePush)
+ Changed |= adjustCallSequence(MF, CC);
+
+ return Changed;
+}
+
+X86CallFrameOptimization::InstClassification
+X86CallFrameOptimization::classifyInstruction(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ const X86RegisterInfo &RegInfo, DenseSet<unsigned int> &UsedRegs) {
+ if (MI == MBB.end())
+ return Exit;
+
+ // The instructions we actually care about are movs onto the stack
+ int Opcode = MI->getOpcode();
+ if (Opcode == X86::MOV32mi || Opcode == X86::MOV32mr)
+ return Convert;
+
+ // Not all calling conventions have only stack MOVs between the stack
+ // adjust and the call.
+
+ // We want to tolerate other instructions, to cover more cases.
+ // In particular:
+ // a) PCrel calls, where we expect an additional COPY of the basereg.
+ // b) Passing frame-index addresses.
+ // c) Calling conventions that have inreg parameters. These generate
+ // both copies and movs into registers.
+ // To avoid creating lots of special cases, allow any instruction
+ // that does not write into memory, does not def or use the stack
+ // pointer, and does not def any register that was used by a preceding
+ // push.
+ // (Reading from memory is allowed, even if referenced through a
+ // frame index, since these will get adjusted properly in PEI)
+
+ // The reason for the last condition is that the pushes can't replace
+ // the movs in place, because the order must be reversed.
+ // So if we have a MOV32mr that uses EDX, then an instruction that defs
+ // EDX, and then the call, after the transformation the push will use
+ // the modified version of EDX, and not the original one.
+ // Since we are still in SSA form at this point, we only need to
+ // make sure we don't clobber any *physical* registers that were
+ // used by an earlier mov that will become a push.
+
+ if (MI->isCall() || MI->mayStore())
+ return Exit;
+
+ for (const MachineOperand &MO : MI->operands()) {
+ if (!MO.isReg())
+ continue;
+ unsigned int Reg = MO.getReg();
+ if (!RegInfo.isPhysicalRegister(Reg))
+ continue;
+ if (RegInfo.regsOverlap(Reg, RegInfo.getStackRegister()))
+ return Exit;
+ if (MO.isDef()) {
+ for (unsigned int U : UsedRegs)
+ if (RegInfo.regsOverlap(Reg, U))
+ return Exit;
+ }
+ }
+
+ return Skip;
+}
+
+void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ CallContext &Context) {
+ // Check that this particular call sequence is amenable to the
+ // transformation.
+ const X86RegisterInfo &RegInfo = *static_cast<const X86RegisterInfo *>(
+ STI->getRegisterInfo());
+ unsigned FrameDestroyOpcode = TII->getCallFrameDestroyOpcode();
+
+ // We expect to enter this at the beginning of a call sequence
+ assert(I->getOpcode() == TII->getCallFrameSetupOpcode());
+ MachineBasicBlock::iterator FrameSetup = I++;
+ Context.FrameSetup = FrameSetup;
+
+ // How much do we adjust the stack? This puts an upper bound on
+ // the number of parameters actually passed on it.
+ unsigned int MaxAdjust = FrameSetup->getOperand(0).getImm() / 4;
+
+ // A zero adjustment means no stack parameters
+ if (!MaxAdjust) {
+ Context.NoStackParams = true;
+ return;
+ }
+
+ // For globals in PIC mode, we can have some LEAs here.
+ // Ignore them, they don't bother us.
+ // TODO: Extend this to something that covers more cases.
+ while (I->getOpcode() == X86::LEA32r)
+ ++I;
+
+ // We expect a copy instruction here.
+ // TODO: The copy instruction is a lowering artifact.
+ // We should also support a copy-less version, where the stack
+ // pointer is used directly.
+ if (!I->isCopy() || !I->getOperand(0).isReg())
+ return;
+ Context.SPCopy = I++;
+
+ unsigned StackPtr = Context.SPCopy->getOperand(0).getReg();
+
+ // Scan the call setup sequence for the pattern we're looking for.
+ // We only handle a simple case - a sequence of MOV32mi or MOV32mr
+ // instructions that store a sequence of 32-bit values onto the stack, with
+ // no gaps between them.
+ if (MaxAdjust > 4)
+ Context.MovVector.resize(MaxAdjust, nullptr);
+
+ InstClassification Classification;
+ DenseSet<unsigned int> UsedRegs;
+
+ while ((Classification = classifyInstruction(MBB, I, RegInfo, UsedRegs)) !=
+ Exit) {
+ if (Classification == Skip) {
+ ++I;
+ continue;
+ }
+
+ // We know the instruction is a MOV32mi/MOV32mr.
+ // We only want movs of the form:
+ // movl imm/r32, k(%esp)
+ // If we run into something else, bail.
+ // Note that AddrBaseReg may, counter to its name, not be a register,
+ // but rather a frame index.
+ // TODO: Support the fi case. This should probably work now that we
+ // have the infrastructure to track the stack pointer within a call
+ // sequence.
+ if (!I->getOperand(X86::AddrBaseReg).isReg() ||
+ (I->getOperand(X86::AddrBaseReg).getReg() != StackPtr) ||
+ !I->getOperand(X86::AddrScaleAmt).isImm() ||
+ (I->getOperand(X86::AddrScaleAmt).getImm() != 1) ||
+ (I->getOperand(X86::AddrIndexReg).getReg() != X86::NoRegister) ||
+ (I->getOperand(X86::AddrSegmentReg).getReg() != X86::NoRegister) ||
+ !I->getOperand(X86::AddrDisp).isImm())
+ return;
+
+ int64_t StackDisp = I->getOperand(X86::AddrDisp).getImm();
+ assert(StackDisp >= 0 &&
+ "Negative stack displacement when passing parameters");
+
+ // We really don't want to consider the unaligned case.
+ if (StackDisp % 4)
+ return;
+ StackDisp /= 4;
+
+ assert((size_t)StackDisp < Context.MovVector.size() &&
+ "Function call has more parameters than the stack is adjusted for.");
+
+ // If the same stack slot is being filled twice, something's fishy.
+ if (Context.MovVector[StackDisp] != nullptr)
+ return;
+ Context.MovVector[StackDisp] = I;
+
+ for (const MachineOperand &MO : I->uses()) {
+ if (!MO.isReg())
+ continue;
+ unsigned int Reg = MO.getReg();
+ if (RegInfo.isPhysicalRegister(Reg))
+ UsedRegs.insert(Reg);
+ }
+
+ ++I;
+ }
+
+ // We now expect the end of the sequence. If we stopped early,
+ // or reached the end of the block without finding a call, bail.
+ if (I == MBB.end() || !I->isCall())
+ return;
+
+ Context.Call = I;
+ if ((++I)->getOpcode() != FrameDestroyOpcode)
+ return;
+
+ // Now, go through the vector, and see that we don't have any gaps,
+ // but only a series of 32-bit MOVs.
+ auto MMI = Context.MovVector.begin(), MME = Context.MovVector.end();
+ for (; MMI != MME; ++MMI, Context.ExpectedDist += 4)
+ if (*MMI == nullptr)
+ break;
+
+ // If the call had no parameters, do nothing
+ if (MMI == Context.MovVector.begin())
+ return;
+
+ // We are either at the last parameter or at a gap.
+ // Make sure it's not a gap.
+ for (; MMI != MME; ++MMI)
+ if (*MMI != nullptr)
+ return;
+
+ Context.UsePush = true;
+ return;
+}
+
+bool X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF,
+ const CallContext &Context) {
+ // Ok, we can in fact do the transformation for this call.
+ // Do not remove the FrameSetup instruction, but adjust the parameters.
+ // PEI will end up finalizing the handling of this.
+ MachineBasicBlock::iterator FrameSetup = Context.FrameSetup;
+ MachineBasicBlock &MBB = *(FrameSetup->getParent());
+ FrameSetup->getOperand(1).setImm(Context.ExpectedDist);
+
+ DebugLoc DL = FrameSetup->getDebugLoc();
+ // Now, iterate through the vector in reverse order, and replace the movs
+ // with pushes. MOVmi/MOVmr doesn't have any defs, so no need to
+ // replace uses.
+ for (int Idx = (Context.ExpectedDist / 4) - 1; Idx >= 0; --Idx) {
+ MachineBasicBlock::iterator MOV = *Context.MovVector[Idx];
+ MachineOperand PushOp = MOV->getOperand(X86::AddrNumOperands);
+ MachineBasicBlock::iterator Push = nullptr;
+ if (MOV->getOpcode() == X86::MOV32mi) {
+ unsigned PushOpcode = X86::PUSHi32;
+ // If the operand is a small (8-bit) immediate, we can use a
+ // PUSH instruction with a shorter encoding.
+ // Note that isImm() may fail even though this is a MOVmi, because
+ // the operand can also be a symbol.
+ if (PushOp.isImm()) {
+ int64_t Val = PushOp.getImm();
+ if (isInt<8>(Val))
+ PushOpcode = X86::PUSH32i8;
+ }
+ Push = BuildMI(MBB, Context.Call, DL, TII->get(PushOpcode))
+ .addOperand(PushOp);
+ } else {
+ unsigned int Reg = PushOp.getReg();
+
+ // If PUSHrmm is not slow on this target, try to fold the source of the
+ // push into the instruction.
+ bool SlowPUSHrmm = STI->isAtom() || STI->isSLM();
+
+ // Check that this is legal to fold. Right now, we're extremely
+ // conservative about that.
+ MachineInstr *DefMov = nullptr;
+ if (!SlowPUSHrmm && (DefMov = canFoldIntoRegPush(FrameSetup, Reg))) {
+ Push = BuildMI(MBB, Context.Call, DL, TII->get(X86::PUSH32rmm));
+
+ unsigned NumOps = DefMov->getDesc().getNumOperands();
+ for (unsigned i = NumOps - X86::AddrNumOperands; i != NumOps; ++i)
+ Push->addOperand(DefMov->getOperand(i));
+
+ DefMov->eraseFromParent();
+ } else {
+ Push = BuildMI(MBB, Context.Call, DL, TII->get(X86::PUSH32r))
+ .addReg(Reg)
+ .getInstr();
+ }
+ }
+
+ // For debugging, when using SP-based CFA, we need to adjust the CFA
+ // offset after each push.
+ // TODO: This is needed only if we require precise CFA.
+ if (!TFL->hasFP(MF))
+ TFL->BuildCFI(MBB, std::next(Push), DL,
+ MCCFIInstruction::createAdjustCfaOffset(nullptr, 4));
+
+ MBB.erase(MOV);
+ }
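+
+  // Net effect (illustrative only): "movl $1, (%esp); movl $2, 4(%esp); call"
+  // becomes "pushl $2; pushl $1; call", with ExpectedDist recorded on the
+  // FrameSetup above so PEI still sizes the call frame correctly.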
+
+ // The stack-pointer copy is no longer used in the call sequences.
+ // There should not be any other users, but we can't commit to that, so:
+ if (MRI->use_empty(Context.SPCopy->getOperand(0).getReg()))
+ Context.SPCopy->eraseFromParent();
+
+ // Once we've done this, we need to make sure PEI doesn't assume a reserved
+ // frame.
+ X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+ FuncInfo->setHasPushSequences(true);
+
+ return true;
+}
+
+MachineInstr *X86CallFrameOptimization::canFoldIntoRegPush(
+ MachineBasicBlock::iterator FrameSetup, unsigned Reg) {
+ // Do an extremely restricted form of load folding.
+ // ISel will often create patterns like:
+ // movl 4(%edi), %eax
+ // movl 8(%edi), %ecx
+ // movl 12(%edi), %edx
+ // movl %edx, 8(%esp)
+ // movl %ecx, 4(%esp)
+ // movl %eax, (%esp)
+ // call
+ // Get rid of those with prejudice.
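+  // When the fold applies, the loads disappear into memory pushes
+  // (illustrative only, assuming the pattern above):
+  //   pushl 12(%edi)
+  //   pushl 8(%edi)
+  //   pushl 4(%edi)
+  //   call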
+ if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ return nullptr;
+
+ // Make sure this is the only use of Reg.
+ if (!MRI->hasOneNonDBGUse(Reg))
+ return nullptr;
+
+ MachineBasicBlock::iterator DefMI = MRI->getVRegDef(Reg);
+
+ // Make sure the def is a MOV from memory.
+  // If the def is in another block, give up.
+ if (DefMI->getOpcode() != X86::MOV32rm ||
+ DefMI->getParent() != FrameSetup->getParent())
+ return nullptr;
+
+ // Make sure we don't have any instructions between DefMI and the
+ // push that make folding the load illegal.
+ for (auto I = DefMI; I != FrameSetup; ++I)
+ if (I->isLoadFoldBarrier())
+ return nullptr;
+
+ return DefMI;
+}
diff --git a/contrib/llvm/lib/Target/X86/X86CallingConv.h b/contrib/llvm/lib/Target/X86/X86CallingConv.h
new file mode 100644
index 0000000..a08160f
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86CallingConv.h
@@ -0,0 +1,107 @@
+//=== X86CallingConv.h - X86 Custom Calling Convention Routines -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the custom routines for the X86 Calling Convention that
+// aren't done by tablegen.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86CALLINGCONV_H
+#define LLVM_LIB_TARGET_X86_X86CALLINGCONV_H
+
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/IR/CallingConv.h"
+
+namespace llvm {
+
+inline bool CC_X86_32_VectorCallIndirect(unsigned &ValNo, MVT &ValVT,
+ MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags,
+ CCState &State) {
+ // Similar to CCPassIndirect, with the addition of inreg.
+ LocVT = MVT::i32;
+ LocInfo = CCValAssign::Indirect;
+ ArgFlags.setInReg();
+ return false; // Continue the search, but now for i32.
+}
+
+
+inline bool CC_X86_AnyReg_Error(unsigned &, MVT &, MVT &,
+ CCValAssign::LocInfo &, ISD::ArgFlagsTy &,
+ CCState &) {
+ llvm_unreachable("The AnyReg calling convention is only supported by the " \
+ "stackmap and patchpoint intrinsics.");
+  // Gracefully fall back to the X86 C calling convention on Release builds.
+ return false;
+}
+
+inline bool CC_X86_32_MCUInReg(unsigned &ValNo, MVT &ValVT,
+ MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags,
+ CCState &State) {
+ // This is similar to CCAssignToReg<[EAX, EDX, ECX]>, but makes sure
+ // not to split i64 and double between a register and stack
+ static const MCPhysReg RegList[] = {X86::EAX, X86::EDX, X86::ECX};
+ static const unsigned NumRegs = sizeof(RegList)/sizeof(RegList[0]);
+
+ SmallVectorImpl<CCValAssign> &PendingMembers = State.getPendingLocs();
+
+  // If this is the first part of a double/i64/i128, or if we're already
+  // in the middle of a split, add to the pending list. If this is not
+  // the end of the split, return; otherwise go on to process the pending
+  // list.
+ if (ArgFlags.isSplit() || !PendingMembers.empty()) {
+ PendingMembers.push_back(
+ CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo));
+ if (!ArgFlags.isSplitEnd())
+ return true;
+ }
+
+ // If there are no pending members, we are not in the middle of a split,
+ // so do the usual inreg stuff.
+ if (PendingMembers.empty()) {
+ if (unsigned Reg = State.AllocateReg(RegList)) {
+ State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+ return true;
+ }
+ return false;
+ }
+
+ assert(ArgFlags.isSplitEnd());
+
+ // We now have the entire original argument in PendingMembers, so decide
+ // whether to use registers or the stack.
+ // Per the MCU ABI:
+ // a) To use registers, we need to have enough of them free to contain
+ // the entire argument.
+ // b) We never want to use more than 2 registers for a single argument.
+
+ unsigned FirstFree = State.getFirstUnallocated(RegList);
+ bool UseRegs = PendingMembers.size() <= std::min(2U, NumRegs - FirstFree);
+
+ for (auto &It : PendingMembers) {
+ if (UseRegs)
+ It.convertToReg(State.AllocateReg(RegList[FirstFree++]));
+ else
+ It.convertToMem(State.AllocateStack(4, 4));
+ State.addLoc(It);
+ }
+
+ PendingMembers.clear();
+
+ return true;
+}
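+
+// Illustrative example (assumed, not part of the original header): for
+// f(i64 %a, i32 %b) under the MCU ABI, the two halves of %a form a split
+// pair assigned to EAX:EDX together, and %b then takes ECX. Had %b been an
+// i64, it would have gone entirely to the stack rather than be split
+// between ECX and memory.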
+
+} // End llvm namespace
+
+#endif
+
diff --git a/contrib/llvm/lib/Target/X86/X86CallingConv.td b/contrib/llvm/lib/Target/X86/X86CallingConv.td
new file mode 100644
index 0000000..54d88cb
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86CallingConv.td
@@ -0,0 +1,881 @@
+//===-- X86CallingConv.td - Calling Conventions X86 32/64 --*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This describes the calling conventions for the X86-32 and X86-64
+// architectures.
+//
+//===----------------------------------------------------------------------===//
+
+/// CCIfSubtarget - Match if the current subtarget has a feature F.
+class CCIfSubtarget<string F, CCAction A>
+ : CCIf<!strconcat("static_cast<const X86Subtarget&>"
+ "(State.getMachineFunction().getSubtarget()).", F),
+ A>;
+
+//===----------------------------------------------------------------------===//
+// Return Value Calling Conventions
+//===----------------------------------------------------------------------===//
+
+// Return-value conventions common to all X86 CC's.
+def RetCC_X86Common : CallingConv<[
+ // Scalar values are returned in AX first, then DX. For i8, the ABI
+ // requires the values to be in AL and AH, however this code uses AL and DL
+ // instead. This is because using AH for the second register conflicts with
+ // the way LLVM does multiple return values -- a return of {i16,i8} would end
+ // up in AX and AH, which overlap. Front-ends wishing to conform to the ABI
+ // for functions that return two i8 values are currently expected to pack the
+ // values into an i16 (which uses AX, and thus AL:AH).
+ //
+ // For code that doesn't care about the ABI, we allow returning more than two
+ // integer values in registers.
+ CCIfType<[i1], CCPromoteToType<i8>>,
+ CCIfType<[i8] , CCAssignToReg<[AL, DL, CL]>>,
+ CCIfType<[i16], CCAssignToReg<[AX, DX, CX]>>,
+ CCIfType<[i32], CCAssignToReg<[EAX, EDX, ECX]>>,
+ CCIfType<[i64], CCAssignToReg<[RAX, RDX, RCX]>>,
+
+ // Boolean vectors of AVX-512 are returned in SIMD registers.
+ // The call from AVX to AVX-512 function should work,
+ // since the boolean types in AVX/AVX2 are promoted by default.
+ CCIfType<[v2i1], CCPromoteToType<v2i64>>,
+ CCIfType<[v4i1], CCPromoteToType<v4i32>>,
+ CCIfType<[v8i1], CCPromoteToType<v8i16>>,
+ CCIfType<[v16i1], CCPromoteToType<v16i8>>,
+ CCIfType<[v32i1], CCPromoteToType<v32i8>>,
+ CCIfType<[v64i1], CCPromoteToType<v64i8>>,
+
+ // Vector types are returned in XMM0 and XMM1, when they fit. XMM2 and XMM3
+ // can only be used by ABI non-compliant code. If the target doesn't have XMM
+ // registers, it won't have vector types.
+ CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ CCAssignToReg<[XMM0,XMM1,XMM2,XMM3]>>,
+
+  // 256-bit vectors are returned in YMM0 and YMM1, when they fit. YMM2 and YMM3
+ // can only be used by ABI non-compliant code. This vector type is only
+ // supported while using the AVX target feature.
+ CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+ CCAssignToReg<[YMM0,YMM1,YMM2,YMM3]>>,
+
+ // 512-bit vectors are returned in ZMM0 and ZMM1, when they fit. ZMM2 and ZMM3
+ // can only be used by ABI non-compliant code. This vector type is only
+ // supported while using the AVX-512 target feature.
+ CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
+ CCAssignToReg<[ZMM0,ZMM1,ZMM2,ZMM3]>>,
+
+ // MMX vector types are always returned in MM0. If the target doesn't have
+ // MM0, it doesn't support these vector types.
+ CCIfType<[x86mmx], CCAssignToReg<[MM0]>>,
+
+ // Long double types are always returned in FP0 (even with SSE).
+ CCIfType<[f80], CCAssignToReg<[FP0, FP1]>>
+]>;
+
+// X86-32 C return-value convention.
+def RetCC_X86_32_C : CallingConv<[
+ // The X86-32 calling convention returns FP values in FP0, unless marked
+ // with "inreg" (used here to distinguish one kind of reg from another,
+ // weirdly; this is really the sse-regparm calling convention) in which
+ // case they use XMM0, otherwise it is the same as the common X86 calling
+ // conv.
+ CCIfInReg<CCIfSubtarget<"hasSSE2()",
+ CCIfType<[f32, f64], CCAssignToReg<[XMM0,XMM1,XMM2]>>>>,
+ CCIfType<[f32,f64], CCAssignToReg<[FP0, FP1]>>,
+ CCDelegateTo<RetCC_X86Common>
+]>;
+
+// X86-32 FastCC return-value convention.
+def RetCC_X86_32_Fast : CallingConv<[
+ // The X86-32 fastcc returns 1, 2, or 3 FP values in XMM0-2 if the target has
+ // SSE2.
+ // This can happen when a float, 2 x float, or 3 x float vector is split by
+ // target lowering, and is returned in 1-3 sse regs.
+ CCIfType<[f32], CCIfSubtarget<"hasSSE2()", CCAssignToReg<[XMM0,XMM1,XMM2]>>>,
+ CCIfType<[f64], CCIfSubtarget<"hasSSE2()", CCAssignToReg<[XMM0,XMM1,XMM2]>>>,
+
+ // For integers, ECX can be used as an extra return register
+ CCIfType<[i8], CCAssignToReg<[AL, DL, CL]>>,
+ CCIfType<[i16], CCAssignToReg<[AX, DX, CX]>>,
+ CCIfType<[i32], CCAssignToReg<[EAX, EDX, ECX]>>,
+
+ // Otherwise, it is the same as the common X86 calling convention.
+ CCDelegateTo<RetCC_X86Common>
+]>;
+
+// Intel_OCL_BI return-value convention.
+def RetCC_Intel_OCL_BI : CallingConv<[
+  // Vector types are returned in XMM0, XMM1, XMM2 and XMM3.
+ CCIfType<[f32, f64, v4i32, v2i64, v4f32, v2f64],
+ CCAssignToReg<[XMM0,XMM1,XMM2,XMM3]>>,
+
+ // 256-bit FP vectors
+ // No more than 4 registers
+ CCIfType<[v8f32, v4f64, v8i32, v4i64],
+ CCAssignToReg<[YMM0,YMM1,YMM2,YMM3]>>,
+
+ // 512-bit FP vectors
+ CCIfType<[v16f32, v8f64, v16i32, v8i64],
+ CCAssignToReg<[ZMM0,ZMM1,ZMM2,ZMM3]>>,
+
+ // i32, i64 in the standard way
+ CCDelegateTo<RetCC_X86Common>
+]>;
+
+// X86-32 HiPE return-value convention.
+def RetCC_X86_32_HiPE : CallingConv<[
+ // Promote all types to i32
+ CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+ // Return: HP, P, VAL1, VAL2
+ CCIfType<[i32], CCAssignToReg<[ESI, EBP, EAX, EDX]>>
+]>;
+
+// X86-32 vectorcall return-value convention.
+def RetCC_X86_32_VectorCall : CallingConv<[
+  // Vector types are returned in XMM0, XMM1, XMM2 and XMM3.
+ CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ CCAssignToReg<[XMM0,XMM1,XMM2,XMM3]>>,
+
+ // 256-bit FP vectors
+ CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+ CCAssignToReg<[YMM0,YMM1,YMM2,YMM3]>>,
+
+ // 512-bit FP vectors
+ CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
+ CCAssignToReg<[ZMM0,ZMM1,ZMM2,ZMM3]>>,
+
+ // Return integers in the standard way.
+ CCDelegateTo<RetCC_X86Common>
+]>;
+
+// X86-64 C return-value convention.
+def RetCC_X86_64_C : CallingConv<[
+  // The X86-64 calling convention returns FP values in XMM0, then XMM1.
+ CCIfType<[f32], CCAssignToReg<[XMM0, XMM1]>>,
+ CCIfType<[f64], CCAssignToReg<[XMM0, XMM1]>>,
+ CCIfType<[f128], CCAssignToReg<[XMM0, XMM1]>>,
+
+  // MMX vector types are returned in XMM0, then XMM1.
+ CCIfType<[x86mmx], CCAssignToReg<[XMM0, XMM1]>>,
+ CCDelegateTo<RetCC_X86Common>
+]>;
+
+// X86-Win64 C return-value convention.
+def RetCC_X86_Win64_C : CallingConv<[
+ // The X86-Win64 calling convention always returns __m64 values in RAX.
+ CCIfType<[x86mmx], CCBitConvertToType<i64>>,
+
+ // Otherwise, everything is the same as 'normal' X86-64 C CC.
+ CCDelegateTo<RetCC_X86_64_C>
+]>;
+
+// X86-64 HiPE return-value convention.
+def RetCC_X86_64_HiPE : CallingConv<[
+ // Promote all types to i64
+ CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,
+
+ // Return: HP, P, VAL1, VAL2
+ CCIfType<[i64], CCAssignToReg<[R15, RBP, RAX, RDX]>>
+]>;
+
+// X86-64 WebKit_JS return-value convention.
+def RetCC_X86_64_WebKit_JS : CallingConv<[
+ // Promote all types to i64
+ CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,
+
+ // Return: RAX
+ CCIfType<[i64], CCAssignToReg<[RAX]>>
+]>;
+
+// X86-64 AnyReg return-value convention. No explicit register is specified for
+// the return-value. The register allocator is allowed and expected to choose
+// any free register.
+//
+// This calling convention is currently only supported by the stackmap and
+// patchpoint intrinsics. All other uses will result in an assert on Debug
+// builds. On Release builds we fallback to the X86 C calling convention.
+def RetCC_X86_64_AnyReg : CallingConv<[
+ CCCustom<"CC_X86_AnyReg_Error">
+]>;
+
+// X86-64 HHVM return-value convention.
+def RetCC_X86_64_HHVM: CallingConv<[
+ // Promote all types to i64
+ CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,
+
+  // Return: may use any GP register except RSP and R12.
+ CCIfType<[i64], CCAssignToReg<[RBX, RBP, RDI, RSI, RDX, RCX, R8, R9,
+ RAX, R10, R11, R13, R14, R15]>>
+]>;
+
+// This is the root return-value convention for the X86-32 backend.
+def RetCC_X86_32 : CallingConv<[
+ // If FastCC, use RetCC_X86_32_Fast.
+ CCIfCC<"CallingConv::Fast", CCDelegateTo<RetCC_X86_32_Fast>>,
+ // If HiPE, use RetCC_X86_32_HiPE.
+ CCIfCC<"CallingConv::HiPE", CCDelegateTo<RetCC_X86_32_HiPE>>,
+ CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<RetCC_X86_32_VectorCall>>,
+
+ // Otherwise, use RetCC_X86_32_C.
+ CCDelegateTo<RetCC_X86_32_C>
+]>;
+
+// This is the root return-value convention for the X86-64 backend.
+def RetCC_X86_64 : CallingConv<[
+ // HiPE uses RetCC_X86_64_HiPE
+ CCIfCC<"CallingConv::HiPE", CCDelegateTo<RetCC_X86_64_HiPE>>,
+
+ // Handle JavaScript calls.
+ CCIfCC<"CallingConv::WebKit_JS", CCDelegateTo<RetCC_X86_64_WebKit_JS>>,
+ CCIfCC<"CallingConv::AnyReg", CCDelegateTo<RetCC_X86_64_AnyReg>>,
+
+ // Handle explicit CC selection
+ CCIfCC<"CallingConv::X86_64_Win64", CCDelegateTo<RetCC_X86_Win64_C>>,
+ CCIfCC<"CallingConv::X86_64_SysV", CCDelegateTo<RetCC_X86_64_C>>,
+
+ // Handle HHVM calls.
+ CCIfCC<"CallingConv::HHVM", CCDelegateTo<RetCC_X86_64_HHVM>>,
+
+ // Mingw64 and native Win64 use Win64 CC
+ CCIfSubtarget<"isTargetWin64()", CCDelegateTo<RetCC_X86_Win64_C>>,
+
+ // Otherwise, drop to normal X86-64 CC
+ CCDelegateTo<RetCC_X86_64_C>
+]>;
+
+// This is the return-value convention used for the entire X86 backend.
+def RetCC_X86 : CallingConv<[
+
+ // Check if this is the Intel OpenCL built-ins calling convention
+ CCIfCC<"CallingConv::Intel_OCL_BI", CCDelegateTo<RetCC_Intel_OCL_BI>>,
+
+ CCIfSubtarget<"is64Bit()", CCDelegateTo<RetCC_X86_64>>,
+ CCDelegateTo<RetCC_X86_32>
+]>;
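+
+// Illustrative dispatch (not part of the original file): a fastcc function
+// on a 32-bit target resolves RetCC_X86 -> RetCC_X86_32 -> RetCC_X86_32_Fast,
+// falling through to RetCC_X86_32_C only when no earlier predicate matches.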
+
+//===----------------------------------------------------------------------===//
+// X86-64 Argument Calling Conventions
+//===----------------------------------------------------------------------===//
+
+def CC_X86_64_C : CallingConv<[
+ // Handles byval parameters.
+ CCIfByVal<CCPassByVal<8, 8>>,
+
+ // Promote i1/i8/i16 arguments to i32.
+ CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+
+ // The 'nest' parameter, if any, is passed in R10.
+ CCIfNest<CCIfSubtarget<"isTarget64BitILP32()", CCAssignToReg<[R10D]>>>,
+ CCIfNest<CCAssignToReg<[R10]>>,
+
+ // The first 6 integer arguments are passed in integer registers.
+ CCIfType<[i32], CCAssignToReg<[EDI, ESI, EDX, ECX, R8D, R9D]>>,
+ CCIfType<[i64], CCAssignToReg<[RDI, RSI, RDX, RCX, R8 , R9 ]>>,
+
+ // The first 8 MMX vector arguments are passed in XMM registers on Darwin.
+ CCIfType<[x86mmx],
+ CCIfSubtarget<"isTargetDarwin()",
+ CCIfSubtarget<"hasSSE2()",
+ CCPromoteToType<v2i64>>>>,
+
+ // Boolean vectors of AVX-512 are passed in SIMD registers.
+ // The call from AVX to AVX-512 function should work,
+ // since the boolean types in AVX/AVX2 are promoted by default.
+ CCIfType<[v2i1], CCPromoteToType<v2i64>>,
+ CCIfType<[v4i1], CCPromoteToType<v4i32>>,
+ CCIfType<[v8i1], CCPromoteToType<v8i16>>,
+ CCIfType<[v16i1], CCPromoteToType<v16i8>>,
+ CCIfType<[v32i1], CCPromoteToType<v32i8>>,
+ CCIfType<[v64i1], CCPromoteToType<v64i8>>,
+
+ // The first 8 FP/Vector arguments are passed in XMM registers.
+ CCIfType<[f32, f64, f128, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ CCIfSubtarget<"hasSSE1()",
+ CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7]>>>,
+
+ // The first 8 256-bit vector arguments are passed in YMM registers, unless
+ // this is a vararg function.
+ // FIXME: This isn't precisely correct; the x86-64 ABI document says that
+ // fixed arguments to vararg functions are supposed to be passed in
+ // registers. Actually modeling that would be a lot of work, though.
+ CCIfNotVarArg<CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+ CCIfSubtarget<"hasFp256()",
+ CCAssignToReg<[YMM0, YMM1, YMM2, YMM3,
+ YMM4, YMM5, YMM6, YMM7]>>>>,
+
+ // The first 8 512-bit vector arguments are passed in ZMM registers.
+ CCIfNotVarArg<CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
+ CCIfSubtarget<"hasAVX512()",
+ CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3, ZMM4, ZMM5, ZMM6, ZMM7]>>>>,
+
+ // Integer/FP values get stored in stack slots that are 8 bytes in size and
+ // 8-byte aligned if there are no more registers to hold them.
+ CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>,
+
+  // Long doubles get stack slots whose size and alignment depend on the
+  // subtarget.
+ CCIfType<[f80, f128], CCAssignToStack<0, 0>>,
+
+ // Vectors get 16-byte stack slots that are 16-byte aligned.
+ CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>,
+
+ // 256-bit vectors get 32-byte stack slots that are 32-byte aligned.
+ CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+ CCAssignToStack<32, 32>>,
+
+ // 512-bit vectors get 64-byte stack slots that are 64-byte aligned.
+ CCIfType<[v16i32, v8i64, v16f32, v8f64],
+ CCAssignToStack<64, 64>>
+]>;
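+
+// Worked example (illustrative only): for f(i32 %a, double %b, <4 x float> %c)
+// under the SysV x86-64 rules above, %a is assigned EDI, %b XMM0 and %c XMM1;
+// a seventh integer argument would fall through to the 8-byte stack-slot rule.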
+
+// Calling convention for X86-64 HHVM.
+def CC_X86_64_HHVM : CallingConv<[
+ // Use all/any GP registers for args, except RSP.
+ CCIfType<[i64], CCAssignToReg<[RBX, R12, RBP, R15,
+ RDI, RSI, RDX, RCX, R8, R9,
+ RAX, R10, R11, R13, R14]>>
+]>;
+
+// Calling convention for helper functions in HHVM.
+def CC_X86_64_HHVM_C : CallingConv<[
+ // Pass the first argument in RBP.
+ CCIfType<[i64], CCAssignToReg<[RBP]>>,
+
+ // Otherwise it's the same as the regular C calling convention.
+ CCDelegateTo<CC_X86_64_C>
+]>;
+
+// Calling convention used on Win64
+def CC_X86_Win64_C : CallingConv<[
+ // FIXME: Handle byval stuff.
+ // FIXME: Handle varargs.
+
+ // Promote i1/i8/i16 arguments to i32.
+ CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+
+ // The 'nest' parameter, if any, is passed in R10.
+ CCIfNest<CCAssignToReg<[R10]>>,
+
+  // 128-bit vectors are passed by pointer
+ CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCPassIndirect<i64>>,
+
+
+  // 256-bit vectors are passed by pointer
+ CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], CCPassIndirect<i64>>,
+
+  // 512-bit vectors are passed by pointer
+ CCIfType<[v16i32, v16f32, v8f64, v8i64], CCPassIndirect<i64>>,
+
+ // The first 4 MMX vector arguments are passed in GPRs.
+ CCIfType<[x86mmx], CCBitConvertToType<i64>>,
+
+ // The first 4 integer arguments are passed in integer registers.
+ CCIfType<[i32], CCAssignToRegWithShadow<[ECX , EDX , R8D , R9D ],
+ [XMM0, XMM1, XMM2, XMM3]>>,
+
+ // Do not pass the sret argument in RCX, the Win64 thiscall calling
+ // convention requires "this" to be passed in RCX.
+ CCIfCC<"CallingConv::X86_ThisCall",
+ CCIfSRet<CCIfType<[i64], CCAssignToRegWithShadow<[RDX , R8 , R9 ],
+ [XMM1, XMM2, XMM3]>>>>,
+
+ CCIfType<[i64], CCAssignToRegWithShadow<[RCX , RDX , R8 , R9 ],
+ [XMM0, XMM1, XMM2, XMM3]>>,
+
+ // The first 4 FP/Vector arguments are passed in XMM registers.
+ CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ CCAssignToRegWithShadow<[XMM0, XMM1, XMM2, XMM3],
+ [RCX , RDX , R8 , R9 ]>>,
+
+ // Integer/FP values get stored in stack slots that are 8 bytes in size and
+ // 8-byte aligned if there are no more registers to hold them.
+ CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>,
+
+  // Long doubles get stack slots whose size and alignment depend on the
+  // subtarget.
+ CCIfType<[f80], CCAssignToStack<0, 0>>
+]>;
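+
+// Illustrative note: the shadow lists make register use positional, so for
+// f(i32 %a, double %b) the Win64 rules above assign ECX to %a and XMM1 (not
+// XMM0) to %b, since XMM0 is shadowed by the first argument slot.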
+
+def CC_X86_Win64_VectorCall : CallingConv<[
+ // The first 6 floating point and vector types of 128 bits or less use
+ // XMM0-XMM5.
+ CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5]>>,
+
+ // 256-bit vectors use YMM registers.
+ CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+ CCAssignToReg<[YMM0, YMM1, YMM2, YMM3, YMM4, YMM5]>>,
+
+ // 512-bit vectors use ZMM registers.
+ CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
+ CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3, ZMM4, ZMM5]>>,
+
+ // Delegate to fastcall to handle integer types.
+ CCDelegateTo<CC_X86_Win64_C>
+]>;
+
+
+def CC_X86_64_GHC : CallingConv<[
+ // Promote i8/i16/i32 arguments to i64.
+ CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,
+
+ // Pass in STG registers: Base, Sp, Hp, R1, R2, R3, R4, R5, R6, SpLim
+ CCIfType<[i64],
+ CCAssignToReg<[R13, RBP, R12, RBX, R14, RSI, RDI, R8, R9, R15]>>,
+
+ // Pass in STG registers: F1, F2, F3, F4, D1, D2
+ CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ CCIfSubtarget<"hasSSE1()",
+ CCAssignToReg<[XMM1, XMM2, XMM3, XMM4, XMM5, XMM6]>>>
+]>;
+
+def CC_X86_64_HiPE : CallingConv<[
+ // Promote i8/i16/i32 arguments to i64.
+ CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,
+
+ // Pass in VM's registers: HP, P, ARG0, ARG1, ARG2, ARG3
+ CCIfType<[i64], CCAssignToReg<[R15, RBP, RSI, RDX, RCX, R8]>>,
+
+ // Integer/FP values get stored in stack slots that are 8 bytes in size and
+ // 8-byte aligned if there are no more registers to hold them.
+ CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>
+]>;
+
+def CC_X86_64_WebKit_JS : CallingConv<[
+ // Promote i8/i16 arguments to i32.
+ CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+ // Only the first integer argument is passed in register.
+ CCIfType<[i32], CCAssignToReg<[EAX]>>,
+ CCIfType<[i64], CCAssignToReg<[RAX]>>,
+
+  // The remaining integer arguments are passed on the stack. 32-bit integer
+  // and floating-point arguments are aligned to 4 bytes and stored in 4-byte
+  // slots. 64-bit integer and floating-point arguments are aligned to 8 bytes
+  // and stored in 8-byte stack slots.
+ CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
+ CCIfType<[i64, f64], CCAssignToStack<8, 8>>
+]>;
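+
+// Illustrative example: under WebKit_JS, f(i64 %a, i64 %b) puts %a in RAX
+// and %b in an 8-byte stack slot; there is deliberately only one argument
+// register.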
+
+// No explicit register is specified for the AnyReg calling convention. The
+// register allocator may assign the arguments to any free register.
+//
+// This calling convention is currently only supported by the stackmap and
+// patchpoint intrinsics. All other uses will result in an assert on Debug
+// builds. On Release builds we fallback to the X86 C calling convention.
+def CC_X86_64_AnyReg : CallingConv<[
+ CCCustom<"CC_X86_AnyReg_Error">
+]>;
+
+//===----------------------------------------------------------------------===//
+// X86 C Calling Convention
+//===----------------------------------------------------------------------===//
+
+/// CC_X86_32_Vector_Common - In all X86-32 calling conventions, extra vector
+/// values are spilled on the stack.
+def CC_X86_32_Vector_Common : CallingConv<[
+ // Other SSE vectors get 16-byte stack slots that are 16-byte aligned.
+ CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>,
+
+ // 256-bit AVX vectors get 32-byte stack slots that are 32-byte aligned.
+ CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+ CCAssignToStack<32, 32>>,
+
+  // 512-bit AVX-512 vectors get 64-byte stack slots that are 64-byte aligned.
+ CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
+ CCAssignToStack<64, 64>>
+]>;
+
+// CC_X86_32_Vector_Standard - The first 3 vector arguments are passed in
+// vector registers
+def CC_X86_32_Vector_Standard : CallingConv<[
+ // SSE vector arguments are passed in XMM registers.
+ CCIfNotVarArg<CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ CCAssignToReg<[XMM0, XMM1, XMM2]>>>,
+
+ // AVX 256-bit vector arguments are passed in YMM registers.
+ CCIfNotVarArg<CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+ CCIfSubtarget<"hasFp256()",
+ CCAssignToReg<[YMM0, YMM1, YMM2]>>>>,
+
+ // AVX 512-bit vector arguments are passed in ZMM registers.
+ CCIfNotVarArg<CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
+ CCAssignToReg<[ZMM0, ZMM1, ZMM2]>>>,
+
+ CCDelegateTo<CC_X86_32_Vector_Common>
+]>;
+
+// CC_X86_32_Vector_Darwin - The first 4 vector arguments are passed in
+// vector registers.
+def CC_X86_32_Vector_Darwin : CallingConv<[
+ // SSE vector arguments are passed in XMM registers.
+ CCIfNotVarArg<CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>>,
+
+ // AVX 256-bit vector arguments are passed in YMM registers.
+ CCIfNotVarArg<CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+ CCIfSubtarget<"hasFp256()",
+ CCAssignToReg<[YMM0, YMM1, YMM2, YMM3]>>>>,
+
+ // AVX 512-bit vector arguments are passed in ZMM registers.
+ CCIfNotVarArg<CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
+ CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3]>>>,
+
+ CCDelegateTo<CC_X86_32_Vector_Common>
+]>;
+
+/// CC_X86_32_Common - In all X86-32 calling conventions, extra integers and FP
+/// values are spilled on the stack.
+def CC_X86_32_Common : CallingConv<[
+ // Handles byval parameters.
+ CCIfByVal<CCPassByVal<4, 4>>,
+
+ // The first 3 float or double arguments, if marked 'inreg' and if the call
+ // is not a vararg call and if SSE2 is available, are passed in SSE registers.
+ CCIfNotVarArg<CCIfInReg<CCIfType<[f32,f64],
+ CCIfSubtarget<"hasSSE2()",
+ CCAssignToReg<[XMM0,XMM1,XMM2]>>>>>,
+
+ // The first 3 __m64 vector arguments are passed in mmx registers if the
+ // call is not a vararg call.
+ CCIfNotVarArg<CCIfType<[x86mmx],
+ CCAssignToReg<[MM0, MM1, MM2]>>>,
+
+ // Integer/Float values get stored in stack slots that are 4 bytes in
+ // size and 4-byte aligned.
+ CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
+
+ // Doubles get 8-byte slots that are 4-byte aligned.
+ CCIfType<[f64], CCAssignToStack<8, 4>>,
+
+ // Long doubles get slots whose size depends on the subtarget.
+ CCIfType<[f80], CCAssignToStack<0, 4>>,
+
+ // Boolean vectors of AVX-512 are passed in SIMD registers.
+ // The call from AVX to AVX-512 function should work,
+ // since the boolean types in AVX/AVX2 are promoted by default.
+ CCIfType<[v2i1], CCPromoteToType<v2i64>>,
+ CCIfType<[v4i1], CCPromoteToType<v4i32>>,
+ CCIfType<[v8i1], CCPromoteToType<v8i16>>,
+ CCIfType<[v16i1], CCPromoteToType<v16i8>>,
+ CCIfType<[v32i1], CCPromoteToType<v32i8>>,
+ CCIfType<[v64i1], CCPromoteToType<v64i8>>,
+
+ // __m64 vectors get 8-byte stack slots that are 4-byte aligned. They are
+ // passed in the parameter area.
+ CCIfType<[x86mmx], CCAssignToStack<8, 4>>,
+
+ // Darwin passes vectors in a form that differs from the i386 psABI
+ CCIfSubtarget<"isTargetDarwin()", CCDelegateTo<CC_X86_32_Vector_Darwin>>,
+
+ // Otherwise, drop to 'normal' X86-32 CC
+ CCDelegateTo<CC_X86_32_Vector_Standard>
+]>;
+
+def CC_X86_32_C : CallingConv<[
+ // Promote i1/i8/i16 arguments to i32.
+ CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+
+ // The 'nest' parameter, if any, is passed in ECX.
+ CCIfNest<CCAssignToReg<[ECX]>>,
+
+ // The first 3 integer arguments, if marked 'inreg' and if the call is not
+ // a vararg call, are passed in integer registers.
+ CCIfNotVarArg<CCIfInReg<CCIfType<[i32], CCAssignToReg<[EAX, EDX, ECX]>>>>,
+
+ // Otherwise, same as everything else.
+ CCDelegateTo<CC_X86_32_Common>
+]>;
+
+def CC_X86_32_MCU : CallingConv<[
+ // Handles byval parameters. Note that, like FastCC, we can't rely on
+ // the delegation to CC_X86_32_Common because that happens after code that
+ // puts arguments in registers.
+ CCIfByVal<CCPassByVal<4, 4>>,
+
+ // Promote i1/i8/i16 arguments to i32.
+ CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+
+ // If the call is not a vararg call, some arguments may be passed
+ // in integer registers.
+ CCIfNotVarArg<CCIfType<[i32], CCCustom<"CC_X86_32_MCUInReg">>>,
+
+ // Otherwise, same as everything else.
+ CCDelegateTo<CC_X86_32_Common>
+]>;
+
+def CC_X86_32_FastCall : CallingConv<[
+ // Promote i1/i8/i16 arguments to i32.
+ CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+
+ // The 'nest' parameter, if any, is passed in EAX.
+ CCIfNest<CCAssignToReg<[EAX]>>,
+
+ // The first 2 integer arguments are passed in ECX/EDX
+ CCIfInReg<CCIfType<[i32], CCAssignToReg<[ECX, EDX]>>>,
+
+ // Otherwise, same as everything else.
+ CCDelegateTo<CC_X86_32_Common>
+]>;
+
+def CC_X86_32_VectorCall : CallingConv<[
+ // The first 6 floating point and vector types of 128 bits or less use
+ // XMM0-XMM5.
+ CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5]>>,
+
+ // 256-bit vectors use YMM registers.
+ CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+ CCAssignToReg<[YMM0, YMM1, YMM2, YMM3, YMM4, YMM5]>>,
+
+ // 512-bit vectors use ZMM registers.
+ CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
+ CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3, ZMM4, ZMM5]>>,
+
+ // Otherwise, pass it indirectly.
+ CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64,
+ v32i8, v16i16, v8i32, v4i64, v8f32, v4f64,
+ v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
+ CCCustom<"CC_X86_32_VectorCallIndirect">>,
+
+ // Delegate to fastcall to handle integer types.
+ CCDelegateTo<CC_X86_32_FastCall>
+]>;
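+
+// Illustrative example: a 32-bit vectorcall function taking seven
+// <4 x float> arguments puts the first six in XMM0-XMM5 and passes the
+// seventh indirectly via CC_X86_32_VectorCallIndirect above.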
+
+def CC_X86_32_ThisCall_Common : CallingConv<[
+ // The first integer argument is passed in ECX
+ CCIfType<[i32], CCAssignToReg<[ECX]>>,
+
+ // Otherwise, same as everything else.
+ CCDelegateTo<CC_X86_32_Common>
+]>;
+
+def CC_X86_32_ThisCall_Mingw : CallingConv<[
+ // Promote i1/i8/i16 arguments to i32.
+ CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+
+ CCDelegateTo<CC_X86_32_ThisCall_Common>
+]>;
+
+def CC_X86_32_ThisCall_Win : CallingConv<[
+ // Promote i1/i8/i16 arguments to i32.
+ CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+
+ // Pass sret arguments indirectly through stack.
+ CCIfSRet<CCAssignToStack<4, 4>>,
+
+ CCDelegateTo<CC_X86_32_ThisCall_Common>
+]>;
+
+def CC_X86_32_ThisCall : CallingConv<[
+ CCIfSubtarget<"isTargetCygMing()", CCDelegateTo<CC_X86_32_ThisCall_Mingw>>,
+ CCDelegateTo<CC_X86_32_ThisCall_Win>
+]>;
+
+def CC_X86_32_FastCC : CallingConv<[
+ // Handles byval parameters. Note that we can't rely on the delegation
+ // to CC_X86_32_Common for this because that happens after code that
+ // puts arguments in registers.
+ CCIfByVal<CCPassByVal<4, 4>>,
+
+ // Promote i1/i8/i16 arguments to i32.
+ CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+
+ // The 'nest' parameter, if any, is passed in EAX.
+ CCIfNest<CCAssignToReg<[EAX]>>,
+
+ // The first 2 integer arguments are passed in ECX/EDX
+ CCIfType<[i32], CCAssignToReg<[ECX, EDX]>>,
+
+ // The first 3 float or double arguments, if the call is not a vararg
+ // call and if SSE2 is available, are passed in SSE registers.
+ CCIfNotVarArg<CCIfType<[f32,f64],
+ CCIfSubtarget<"hasSSE2()",
+ CCAssignToReg<[XMM0,XMM1,XMM2]>>>>,
+
+ // Doubles get 8-byte slots that are 8-byte aligned.
+ CCIfType<[f64], CCAssignToStack<8, 8>>,
+
+ // Otherwise, same as everything else.
+ CCDelegateTo<CC_X86_32_Common>
+]>;
+
+def CC_X86_32_GHC : CallingConv<[
+ // Promote i8/i16 arguments to i32.
+ CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+ // Pass in STG registers: Base, Sp, Hp, R1
+ CCIfType<[i32], CCAssignToReg<[EBX, EBP, EDI, ESI]>>
+]>;
+
+def CC_X86_32_HiPE : CallingConv<[
+ // Promote i8/i16 arguments to i32.
+ CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+ // Pass in VM's registers: HP, P, ARG0, ARG1, ARG2
+ CCIfType<[i32], CCAssignToReg<[ESI, EBP, EAX, EDX, ECX]>>,
+
+ // Integer/Float values get stored in stack slots that are 4 bytes in
+ // size and 4-byte aligned.
+ CCIfType<[i32, f32], CCAssignToStack<4, 4>>
+]>;
+
+// X86-64 Intel OpenCL built-ins calling convention.
+def CC_Intel_OCL_BI : CallingConv<[
+
+ CCIfType<[i32], CCIfSubtarget<"isTargetWin64()", CCAssignToReg<[ECX, EDX, R8D, R9D]>>>,
+ CCIfType<[i64], CCIfSubtarget<"isTargetWin64()", CCAssignToReg<[RCX, RDX, R8, R9 ]>>>,
+
+ CCIfType<[i32], CCIfSubtarget<"is64Bit()", CCAssignToReg<[EDI, ESI, EDX, ECX]>>>,
+ CCIfType<[i64], CCIfSubtarget<"is64Bit()", CCAssignToReg<[RDI, RSI, RDX, RCX]>>>,
+
+ CCIfType<[i32], CCAssignToStack<4, 4>>,
+
+ // The SSE vector arguments are passed in XMM registers.
+ CCIfType<[f32, f64, v4i32, v2i64, v4f32, v2f64],
+ CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>,
+
+ // The 256-bit vector arguments are passed in YMM registers.
+ CCIfType<[v8f32, v4f64, v8i32, v4i64],
+ CCAssignToReg<[YMM0, YMM1, YMM2, YMM3]>>,
+
+ // The 512-bit vector arguments are passed in ZMM registers.
+ CCIfType<[v16f32, v8f64, v16i32, v8i64],
+ CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3]>>,
+
+ // Pass masks in mask registers
+ CCIfType<[v16i1, v8i1], CCAssignToReg<[K1]>>,
+
+ CCIfSubtarget<"isTargetWin64()", CCDelegateTo<CC_X86_Win64_C>>,
+ CCIfSubtarget<"is64Bit()", CCDelegateTo<CC_X86_64_C>>,
+ CCDelegateTo<CC_X86_32_C>
+]>;
+
+def CC_X86_32_Intr : CallingConv<[
+ CCAssignToStack<4, 4>
+]>;
+
+def CC_X86_64_Intr : CallingConv<[
+ CCAssignToStack<8, 8>
+]>;
+
+//===----------------------------------------------------------------------===//
+// X86 Root Argument Calling Conventions
+//===----------------------------------------------------------------------===//
+
+// This is the root argument convention for the X86-32 backend.
+def CC_X86_32 : CallingConv<[
+ CCIfSubtarget<"isTargetMCU()", CCDelegateTo<CC_X86_32_MCU>>,
+ CCIfCC<"CallingConv::X86_FastCall", CCDelegateTo<CC_X86_32_FastCall>>,
+ CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<CC_X86_32_VectorCall>>,
+ CCIfCC<"CallingConv::X86_ThisCall", CCDelegateTo<CC_X86_32_ThisCall>>,
+ CCIfCC<"CallingConv::Fast", CCDelegateTo<CC_X86_32_FastCC>>,
+ CCIfCC<"CallingConv::GHC", CCDelegateTo<CC_X86_32_GHC>>,
+ CCIfCC<"CallingConv::HiPE", CCDelegateTo<CC_X86_32_HiPE>>,
+ CCIfCC<"CallingConv::X86_INTR", CCDelegateTo<CC_X86_32_Intr>>,
+
+ // Otherwise, drop to normal X86-32 CC
+ CCDelegateTo<CC_X86_32_C>
+]>;
+
+// This is the root argument convention for the X86-64 backend.
+def CC_X86_64 : CallingConv<[
+ CCIfCC<"CallingConv::GHC", CCDelegateTo<CC_X86_64_GHC>>,
+ CCIfCC<"CallingConv::HiPE", CCDelegateTo<CC_X86_64_HiPE>>,
+ CCIfCC<"CallingConv::WebKit_JS", CCDelegateTo<CC_X86_64_WebKit_JS>>,
+ CCIfCC<"CallingConv::AnyReg", CCDelegateTo<CC_X86_64_AnyReg>>,
+ CCIfCC<"CallingConv::X86_64_Win64", CCDelegateTo<CC_X86_Win64_C>>,
+ CCIfCC<"CallingConv::X86_64_SysV", CCDelegateTo<CC_X86_64_C>>,
+ CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<CC_X86_Win64_VectorCall>>,
+ CCIfCC<"CallingConv::HHVM", CCDelegateTo<CC_X86_64_HHVM>>,
+ CCIfCC<"CallingConv::HHVM_C", CCDelegateTo<CC_X86_64_HHVM_C>>,
+ CCIfCC<"CallingConv::X86_INTR", CCDelegateTo<CC_X86_64_Intr>>,
+
+ // Mingw64 and native Win64 use Win64 CC
+ CCIfSubtarget<"isTargetWin64()", CCDelegateTo<CC_X86_Win64_C>>,
+
+ // Otherwise, drop to normal X86-64 CC
+ CCDelegateTo<CC_X86_64_C>
+]>;
+
+// This is the argument convention used for the entire X86 backend.
+def CC_X86 : CallingConv<[
+ CCIfCC<"CallingConv::Intel_OCL_BI", CCDelegateTo<CC_Intel_OCL_BI>>,
+ CCIfSubtarget<"is64Bit()", CCDelegateTo<CC_X86_64>>,
+ CCDelegateTo<CC_X86_32>
+]>;
+
+//===----------------------------------------------------------------------===//
+// Callee-saved Registers.
+//===----------------------------------------------------------------------===//
+
+def CSR_NoRegs : CalleeSavedRegs<(add)>;
+
+def CSR_32 : CalleeSavedRegs<(add ESI, EDI, EBX, EBP)>;
+def CSR_64 : CalleeSavedRegs<(add RBX, R12, R13, R14, R15, RBP)>;
+
+def CSR_32EHRet : CalleeSavedRegs<(add EAX, EDX, CSR_32)>;
+def CSR_64EHRet : CalleeSavedRegs<(add RAX, RDX, CSR_64)>;
+
+def CSR_Win64 : CalleeSavedRegs<(add RBX, RBP, RDI, RSI, R12, R13, R14, R15,
+ (sequence "XMM%u", 6, 15))>;
+
+// The function used by Darwin to obtain the address of a thread-local variable
+// uses rdi to pass a single parameter and rax for the return value. All other
+// GPRs are preserved.
+def CSR_64_TLS_Darwin : CalleeSavedRegs<(add CSR_64, RCX, RDX, RSI,
+ R8, R9, R10, R11)>;
+
+// All GPRs - except r11
+def CSR_64_RT_MostRegs : CalleeSavedRegs<(add CSR_64, RAX, RCX, RDX, RSI, RDI,
+ R8, R9, R10, RSP)>;
+
+// All registers - except r11
+def CSR_64_RT_AllRegs : CalleeSavedRegs<(add CSR_64_RT_MostRegs,
+ (sequence "XMM%u", 0, 15))>;
+def CSR_64_RT_AllRegs_AVX : CalleeSavedRegs<(add CSR_64_RT_MostRegs,
+ (sequence "YMM%u", 0, 15))>;
+
+def CSR_64_MostRegs : CalleeSavedRegs<(add RBX, RCX, RDX, RSI, RDI, R8, R9, R10,
+ R11, R12, R13, R14, R15, RBP,
+ (sequence "XMM%u", 0, 15))>;
+
+def CSR_32_AllRegs : CalleeSavedRegs<(add EAX, EBX, ECX, EDX, EBP, ESI,
+ EDI, ESP)>;
+def CSR_32_AllRegs_SSE : CalleeSavedRegs<(add CSR_32_AllRegs,
+ (sequence "XMM%u", 0, 7))>;
+
+def CSR_64_AllRegs : CalleeSavedRegs<(add CSR_64_MostRegs, RAX, RSP,
+ (sequence "XMM%u", 16, 31))>;
+def CSR_64_AllRegs_AVX : CalleeSavedRegs<(sub (add CSR_64_MostRegs, RAX, RSP,
+ (sequence "YMM%u", 0, 31)),
+ (sequence "XMM%u", 0, 15))>;
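+
+// Illustrative reading (assumed): the 'sub' strips the XMM0-XMM15 aliases
+// contributed by CSR_64_MostRegs so that only the full-width YMM registers
+// remain in the AVX save list.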
+
+// Standard C + YMM6-15
+def CSR_Win64_Intel_OCL_BI_AVX : CalleeSavedRegs<(add RBX, RBP, RDI, RSI, R12,
+ R13, R14, R15,
+ (sequence "YMM%u", 6, 15))>;
+
+def CSR_Win64_Intel_OCL_BI_AVX512 : CalleeSavedRegs<(add RBX, RBP, RDI, RSI,
+ R12, R13, R14, R15,
+ (sequence "ZMM%u", 6, 21),
+ K4, K5, K6, K7)>;
+// Standard C + XMM 8-15
+def CSR_64_Intel_OCL_BI : CalleeSavedRegs<(add CSR_64,
+ (sequence "XMM%u", 8, 15))>;
+
+// Standard C + YMM 8-15
+def CSR_64_Intel_OCL_BI_AVX : CalleeSavedRegs<(add CSR_64,
+ (sequence "YMM%u", 8, 15))>;
+
+def CSR_64_Intel_OCL_BI_AVX512 : CalleeSavedRegs<(add RBX, RDI, RSI, R14, R15,
+ (sequence "ZMM%u", 16, 31),
+ K4, K5, K6, K7)>;
+
+// Only R12 is preserved for PHP calls in HHVM.
+def CSR_64_HHVM : CalleeSavedRegs<(add R12)>;
diff --git a/contrib/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/contrib/llvm/lib/Target/X86/X86ExpandPseudo.cpp
new file mode 100644
index 0000000..a09d065
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86ExpandPseudo.cpp
@@ -0,0 +1,198 @@
+//===------- X86ExpandPseudo.cpp - Expand pseudo instructions -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that expands pseudo instructions into target
+// instructions to allow proper scheduling, if-conversion, other late
+// optimizations, or simply the encoding of the instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86FrameLowering.h"
+#include "X86InstrBuilder.h"
+#include "X86InstrInfo.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/Passes.h" // For IDs of passes that are preserved.
+#include "llvm/IR/GlobalValue.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-pseudo"
+
+namespace {
+class X86ExpandPseudo : public MachineFunctionPass {
+public:
+ static char ID;
+ X86ExpandPseudo() : MachineFunctionPass(ID) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addPreservedID(MachineLoopInfoID);
+ AU.addPreservedID(MachineDominatorsID);
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ const X86Subtarget *STI;
+ const X86InstrInfo *TII;
+ const X86RegisterInfo *TRI;
+ const X86FrameLowering *X86FL;
+
+ bool runOnMachineFunction(MachineFunction &Fn) override;
+
+ const char *getPassName() const override {
+ return "X86 pseudo instruction expansion pass";
+ }
+
+private:
+ bool ExpandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI);
+ bool ExpandMBB(MachineBasicBlock &MBB);
+};
+char X86ExpandPseudo::ID = 0;
+} // End anonymous namespace.
+
+/// If \p MBBI is a pseudo instruction, this method expands
+/// it to the corresponding (sequence of) actual instruction(s).
+/// \returns true if \p MBBI has been expanded.
+bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI) {
+ MachineInstr &MI = *MBBI;
+ unsigned Opcode = MI.getOpcode();
+ DebugLoc DL = MBBI->getDebugLoc();
+ switch (Opcode) {
+ default:
+ return false;
+ case X86::TCRETURNdi:
+ case X86::TCRETURNri:
+ case X86::TCRETURNmi:
+ case X86::TCRETURNdi64:
+ case X86::TCRETURNri64:
+ case X86::TCRETURNmi64: {
+ bool isMem = Opcode == X86::TCRETURNmi || Opcode == X86::TCRETURNmi64;
+ MachineOperand &JumpTarget = MBBI->getOperand(0);
+ MachineOperand &StackAdjust = MBBI->getOperand(isMem ? 5 : 1);
+ assert(StackAdjust.isImm() && "Expecting immediate value.");
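+
+    // Illustrative expansion (assumed example, not from the original commit):
+    //   TCRETURNdi64 @callee, 8  ==>  addq $8, %rsp; jmp callee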
+
+ // Adjust stack pointer.
+ int StackAdj = StackAdjust.getImm();
+
+ if (StackAdj) {
+ // Check for possible merge with preceding ADD instruction.
+ StackAdj += X86FL->mergeSPUpdates(MBB, MBBI, true);
+ X86FL->emitSPUpdate(MBB, MBBI, StackAdj, /*InEpilogue=*/true);
+ }
+
+ // Jump to label or value in register.
+ bool IsWin64 = STI->isTargetWin64();
+ if (Opcode == X86::TCRETURNdi || Opcode == X86::TCRETURNdi64) {
+ unsigned Op = (Opcode == X86::TCRETURNdi)
+ ? X86::TAILJMPd
+ : (IsWin64 ? X86::TAILJMPd64_REX : X86::TAILJMPd64);
+ MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(Op));
+ if (JumpTarget.isGlobal())
+ MIB.addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset(),
+ JumpTarget.getTargetFlags());
+ else {
+ assert(JumpTarget.isSymbol());
+ MIB.addExternalSymbol(JumpTarget.getSymbolName(),
+ JumpTarget.getTargetFlags());
+ }
+ } else if (Opcode == X86::TCRETURNmi || Opcode == X86::TCRETURNmi64) {
+ unsigned Op = (Opcode == X86::TCRETURNmi)
+ ? X86::TAILJMPm
+ : (IsWin64 ? X86::TAILJMPm64_REX : X86::TAILJMPm64);
+ MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(Op));
+ for (unsigned i = 0; i != 5; ++i)
+ MIB.addOperand(MBBI->getOperand(i));
+ } else if (Opcode == X86::TCRETURNri64) {
+ BuildMI(MBB, MBBI, DL,
+ TII->get(IsWin64 ? X86::TAILJMPr64_REX : X86::TAILJMPr64))
+ .addReg(JumpTarget.getReg(), RegState::Kill);
+ } else {
+ BuildMI(MBB, MBBI, DL, TII->get(X86::TAILJMPr))
+ .addReg(JumpTarget.getReg(), RegState::Kill);
+ }
+
+ MachineInstr *NewMI = std::prev(MBBI);
+ NewMI->copyImplicitOps(*MBBI->getParent()->getParent(), MBBI);
+
+ // Delete the pseudo instruction TCRETURN.
+ MBB.erase(MBBI);
+
+ return true;
+ }
+ case X86::EH_RETURN:
+ case X86::EH_RETURN64: {
+ MachineOperand &DestAddr = MBBI->getOperand(0);
+ assert(DestAddr.isReg() && "Offset should be in register!");
+ const bool Uses64BitFramePtr =
+ STI->isTarget64BitLP64() || STI->isTargetNaCl64();
+ unsigned StackPtr = TRI->getStackRegister();
+ BuildMI(MBB, MBBI, DL,
+ TII->get(Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr), StackPtr)
+ .addReg(DestAddr.getReg());
+    // The EH_RETURN pseudo is really removed during MC lowering.
+ return true;
+ }
+ case X86::IRET: {
+ // Adjust stack to erase error code
+ int64_t StackAdj = MBBI->getOperand(0).getImm();
+ X86FL->emitSPUpdate(MBB, MBBI, StackAdj, true);
+ // Replace pseudo with machine iret
+ BuildMI(MBB, MBBI, DL,
+ TII->get(STI->is64Bit() ? X86::IRET64 : X86::IRET32));
+ MBB.erase(MBBI);
+ return true;
+ }
+ case X86::EH_RESTORE: {
+ // Restore ESP and EBP, and optionally ESI if required.
+ bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality(
+ MBB.getParent()->getFunction()->getPersonalityFn()));
+ X86FL->restoreWin32EHStackPointers(MBB, MBBI, DL, /*RestoreSP=*/IsSEH);
+ MBBI->eraseFromParent();
+ return true;
+ }
+ }
+ llvm_unreachable("Previous switch has a fallthrough?");
+}
+
+/// Expand all pseudo instructions contained in \p MBB.
+/// \returns true if any expansion occurred for \p MBB.
+bool X86ExpandPseudo::ExpandMBB(MachineBasicBlock &MBB) {
+ bool Modified = false;
+
+ // MBBI may be invalidated by the expansion.
+ MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+ while (MBBI != E) {
+ MachineBasicBlock::iterator NMBBI = std::next(MBBI);
+ Modified |= ExpandMI(MBB, MBBI);
+ MBBI = NMBBI;
+ }
+
+ return Modified;
+}
+
+bool X86ExpandPseudo::runOnMachineFunction(MachineFunction &MF) {
+ STI = &static_cast<const X86Subtarget &>(MF.getSubtarget());
+ TII = STI->getInstrInfo();
+ TRI = STI->getRegisterInfo();
+ X86FL = STI->getFrameLowering();
+
+ bool Modified = false;
+ for (MachineBasicBlock &MBB : MF)
+ Modified |= ExpandMBB(MBB);
+ return Modified;
+}
+
+/// Returns an instance of the pseudo instruction expansion pass.
+FunctionPass *llvm::createX86ExpandPseudoPass() {
+ return new X86ExpandPseudo();
+}
diff --git a/contrib/llvm/lib/Target/X86/X86FastISel.cpp b/contrib/llvm/lib/Target/X86/X86FastISel.cpp
new file mode 100644
index 0000000..629d4d3
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86FastISel.cpp
@@ -0,0 +1,3607 @@
+//===-- X86FastISel.cpp - X86 FastISel implementation ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the X86-specific support for the FastISel class. Much
+// of the target-specific code is generated by tablegen in the file
+// X86GenFastISel.inc, which is #included here.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86CallingConv.h"
+#include "X86InstrBuilder.h"
+#include "X86InstrInfo.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86RegisterInfo.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/FastISel.h"
+#include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+namespace {
+
+class X86FastISel final : public FastISel {
+ /// Subtarget - Keep a pointer to the X86Subtarget around so that we can
+ /// make the right decision when generating code for different targets.
+ const X86Subtarget *Subtarget;
+
+ /// X86ScalarSSEf32, X86ScalarSSEf64 - Select between SSE or x87
+ /// floating point ops.
+ /// When SSE is available, use it for f32 operations.
+ /// When SSE2 is available, use it for f64 operations.
+ bool X86ScalarSSEf64;
+ bool X86ScalarSSEf32;
+
+public:
+ explicit X86FastISel(FunctionLoweringInfo &funcInfo,
+ const TargetLibraryInfo *libInfo)
+ : FastISel(funcInfo, libInfo) {
+ Subtarget = &funcInfo.MF->getSubtarget<X86Subtarget>();
+ X86ScalarSSEf64 = Subtarget->hasSSE2();
+ X86ScalarSSEf32 = Subtarget->hasSSE1();
+ }
+
+ bool fastSelectInstruction(const Instruction *I) override;
+
+ /// \brief The specified machine instr operand is a vreg, and that
+ /// vreg is being provided by the specified load instruction. If possible,
+  /// try to fold the load as an operand to the instruction, returning true
+  /// on success.
+ bool tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
+ const LoadInst *LI) override;
+
+ bool fastLowerArguments() override;
+ bool fastLowerCall(CallLoweringInfo &CLI) override;
+ bool fastLowerIntrinsicCall(const IntrinsicInst *II) override;
+
+#include "X86GenFastISel.inc"
+
+private:
+ bool X86FastEmitCompare(const Value *LHS, const Value *RHS, EVT VT, DebugLoc DL);
+
+ bool X86FastEmitLoad(EVT VT, X86AddressMode &AM, MachineMemOperand *MMO,
+ unsigned &ResultReg, unsigned Alignment = 1);
+
+ bool X86FastEmitStore(EVT VT, const Value *Val, X86AddressMode &AM,
+ MachineMemOperand *MMO = nullptr, bool Aligned = false);
+ bool X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill,
+ X86AddressMode &AM,
+ MachineMemOperand *MMO = nullptr, bool Aligned = false);
+
+ bool X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT, unsigned Src, EVT SrcVT,
+ unsigned &ResultReg);
+
+ bool X86SelectAddress(const Value *V, X86AddressMode &AM);
+ bool X86SelectCallAddress(const Value *V, X86AddressMode &AM);
+
+ bool X86SelectLoad(const Instruction *I);
+
+ bool X86SelectStore(const Instruction *I);
+
+ bool X86SelectRet(const Instruction *I);
+
+ bool X86SelectCmp(const Instruction *I);
+
+ bool X86SelectZExt(const Instruction *I);
+
+ bool X86SelectBranch(const Instruction *I);
+
+ bool X86SelectShift(const Instruction *I);
+
+ bool X86SelectDivRem(const Instruction *I);
+
+ bool X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I);
+
+ bool X86FastEmitSSESelect(MVT RetVT, const Instruction *I);
+
+ bool X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I);
+
+ bool X86SelectSelect(const Instruction *I);
+
+ bool X86SelectTrunc(const Instruction *I);
+
+ bool X86SelectFPExtOrFPTrunc(const Instruction *I, unsigned Opc,
+ const TargetRegisterClass *RC);
+
+ bool X86SelectFPExt(const Instruction *I);
+ bool X86SelectFPTrunc(const Instruction *I);
+ bool X86SelectSIToFP(const Instruction *I);
+
+ const X86InstrInfo *getInstrInfo() const {
+ return Subtarget->getInstrInfo();
+ }
+ const X86TargetMachine *getTargetMachine() const {
+ return static_cast<const X86TargetMachine *>(&TM);
+ }
+
+ bool handleConstantAddresses(const Value *V, X86AddressMode &AM);
+
+ unsigned X86MaterializeInt(const ConstantInt *CI, MVT VT);
+ unsigned X86MaterializeFP(const ConstantFP *CFP, MVT VT);
+ unsigned X86MaterializeGV(const GlobalValue *GV, MVT VT);
+ unsigned fastMaterializeConstant(const Constant *C) override;
+
+ unsigned fastMaterializeAlloca(const AllocaInst *C) override;
+
+ unsigned fastMaterializeFloatZero(const ConstantFP *CF) override;
+
+ /// isScalarFPTypeInSSEReg - Return true if the specified scalar FP type is
+ /// computed in an SSE register, not on the X87 floating point stack.
+ bool isScalarFPTypeInSSEReg(EVT VT) const {
+    return (VT == MVT::f64 && X86ScalarSSEf64) || // f64 when SSE2 is available
+           (VT == MVT::f32 && X86ScalarSSEf32);   // f32 when SSE1 is available
+ }
+
+ bool isTypeLegal(Type *Ty, MVT &VT, bool AllowI1 = false);
+
+ bool IsMemcpySmall(uint64_t Len);
+
+ bool TryEmitSmallMemcpy(X86AddressMode DestAM,
+ X86AddressMode SrcAM, uint64_t Len);
+
+ bool foldX86XALUIntrinsic(X86::CondCode &CC, const Instruction *I,
+ const Value *Cond);
+
+ const MachineInstrBuilder &addFullAddress(const MachineInstrBuilder &MIB,
+ X86AddressMode &AM);
+};
+
+} // end anonymous namespace.
+
+static std::pair<X86::CondCode, bool>
+getX86ConditionCode(CmpInst::Predicate Predicate) {
+ X86::CondCode CC = X86::COND_INVALID;
+ bool NeedSwap = false;
+ switch (Predicate) {
+ default: break;
+ // Floating-point Predicates
+ case CmpInst::FCMP_UEQ: CC = X86::COND_E; break;
+ case CmpInst::FCMP_OLT: NeedSwap = true; // fall-through
+ case CmpInst::FCMP_OGT: CC = X86::COND_A; break;
+ case CmpInst::FCMP_OLE: NeedSwap = true; // fall-through
+ case CmpInst::FCMP_OGE: CC = X86::COND_AE; break;
+ case CmpInst::FCMP_UGT: NeedSwap = true; // fall-through
+ case CmpInst::FCMP_ULT: CC = X86::COND_B; break;
+ case CmpInst::FCMP_UGE: NeedSwap = true; // fall-through
+ case CmpInst::FCMP_ULE: CC = X86::COND_BE; break;
+ case CmpInst::FCMP_ONE: CC = X86::COND_NE; break;
+ case CmpInst::FCMP_UNO: CC = X86::COND_P; break;
+ case CmpInst::FCMP_ORD: CC = X86::COND_NP; break;
+ case CmpInst::FCMP_OEQ: // fall-through
+ case CmpInst::FCMP_UNE: CC = X86::COND_INVALID; break;
+
+ // Integer Predicates
+ case CmpInst::ICMP_EQ: CC = X86::COND_E; break;
+ case CmpInst::ICMP_NE: CC = X86::COND_NE; break;
+ case CmpInst::ICMP_UGT: CC = X86::COND_A; break;
+ case CmpInst::ICMP_UGE: CC = X86::COND_AE; break;
+ case CmpInst::ICMP_ULT: CC = X86::COND_B; break;
+ case CmpInst::ICMP_ULE: CC = X86::COND_BE; break;
+ case CmpInst::ICMP_SGT: CC = X86::COND_G; break;
+ case CmpInst::ICMP_SGE: CC = X86::COND_GE; break;
+ case CmpInst::ICMP_SLT: CC = X86::COND_L; break;
+ case CmpInst::ICMP_SLE: CC = X86::COND_LE; break;
+ }
+
+ return std::make_pair(CC, NeedSwap);
+}
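+
+// For example (illustrative): ICMP_UGT maps directly to COND_A (JA/SETA),
+// while FCMP_OLT sets NeedSwap and falls through to the FCMP_OGT case, i.e.
+// "a < b" is evaluated as "b > a" with swapped operands.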
+
+static std::pair<unsigned, bool>
+getX86SSEConditionCode(CmpInst::Predicate Predicate) {
+ unsigned CC;
+ bool NeedSwap = false;
+
+ // SSE Condition code mapping:
+ // 0 - EQ
+ // 1 - LT
+ // 2 - LE
+ // 3 - UNORD
+ // 4 - NEQ
+ // 5 - NLT
+ // 6 - NLE
+ // 7 - ORD
+ switch (Predicate) {
+ default: llvm_unreachable("Unexpected predicate");
+ case CmpInst::FCMP_OEQ: CC = 0; break;
+ case CmpInst::FCMP_OGT: NeedSwap = true; // fall-through
+ case CmpInst::FCMP_OLT: CC = 1; break;
+ case CmpInst::FCMP_OGE: NeedSwap = true; // fall-through
+ case CmpInst::FCMP_OLE: CC = 2; break;
+ case CmpInst::FCMP_UNO: CC = 3; break;
+ case CmpInst::FCMP_UNE: CC = 4; break;
+ case CmpInst::FCMP_ULE: NeedSwap = true; // fall-through
+ case CmpInst::FCMP_UGE: CC = 5; break;
+ case CmpInst::FCMP_ULT: NeedSwap = true; // fall-through
+ case CmpInst::FCMP_UGT: CC = 6; break;
+ case CmpInst::FCMP_ORD: CC = 7; break;
+ case CmpInst::FCMP_UEQ:
+ case CmpInst::FCMP_ONE: CC = 8; break;
+ }
+
+ return std::make_pair(CC, NeedSwap);
+}
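+
+// For example (illustrative): FCMP_OGT sets NeedSwap and reuses CC 1 (LT),
+// so "a > b" is compiled as a swapped "b < a" comparison (e.g. cmpltss).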
+
+/// \brief Adds a complex addressing mode to the given machine instr builder.
+/// Note, this will constrain the index register. If it's not possible to
+/// constrain the given index register, then a new one will be created. The
+/// IndexReg field of the addressing mode will be updated to match in this case.
+const MachineInstrBuilder &
+X86FastISel::addFullAddress(const MachineInstrBuilder &MIB,
+ X86AddressMode &AM) {
+ // First constrain the index register. It needs to be a GR64_NOSP.
+ AM.IndexReg = constrainOperandRegClass(MIB->getDesc(), AM.IndexReg,
+ MIB->getNumOperands() +
+ X86::AddrIndexReg);
+ return ::addFullAddress(MIB, AM);
+}
+
+/// \brief Check if it is possible to fold the condition from the XALU intrinsic
+/// into the user. The condition code will only be updated on success.
+bool X86FastISel::foldX86XALUIntrinsic(X86::CondCode &CC, const Instruction *I,
+ const Value *Cond) {
+ if (!isa<ExtractValueInst>(Cond))
+ return false;
+
+ const auto *EV = cast<ExtractValueInst>(Cond);
+ if (!isa<IntrinsicInst>(EV->getAggregateOperand()))
+ return false;
+
+ const auto *II = cast<IntrinsicInst>(EV->getAggregateOperand());
+ MVT RetVT;
+ const Function *Callee = II->getCalledFunction();
+ Type *RetTy =
+ cast<StructType>(Callee->getReturnType())->getTypeAtIndex(0U);
+ if (!isTypeLegal(RetTy, RetVT))
+ return false;
+
+ if (RetVT != MVT::i32 && RetVT != MVT::i64)
+ return false;
+
+ X86::CondCode TmpCC;
+ switch (II->getIntrinsicID()) {
+ default: return false;
+ case Intrinsic::sadd_with_overflow:
+ case Intrinsic::ssub_with_overflow:
+ case Intrinsic::smul_with_overflow:
+ case Intrinsic::umul_with_overflow: TmpCC = X86::COND_O; break;
+ case Intrinsic::uadd_with_overflow:
+ case Intrinsic::usub_with_overflow: TmpCC = X86::COND_B; break;
+ }
+
+ // Check if both instructions are in the same basic block.
+ if (II->getParent() != I->getParent())
+ return false;
+
+ // Make sure nothing is in the way between the intrinsic and its user.
+ BasicBlock::const_iterator Start(I);
+ BasicBlock::const_iterator End(II);
+ for (auto Itr = std::prev(Start); Itr != End; --Itr) {
+ // We only expect extractvalue instructions between the intrinsic and the
+ // instruction to be selected.
+ if (!isa<ExtractValueInst>(Itr))
+ return false;
+
+ // Check that the extractvalue operand comes from the intrinsic.
+ const auto *EVI = cast<ExtractValueInst>(Itr);
+ if (EVI->getAggregateOperand() != II)
+ return false;
+ }
+
+ CC = TmpCC;
+ return true;
+}
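+
+// A hypothetical IR pattern that folds via the routine above:
+//   %res  = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b)
+//   %sum  = extractvalue { i32, i1 } %res, 0
+//   %obit = extractvalue { i32, i1 } %res, 1
+//   br i1 %obit, label %overflow, label %cont
+// The branch on %obit can then reuse the OF flag (X86::COND_O) already set
+// by the ADD implementing the intrinsic, avoiding a SETO/TEST pair.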
+
+bool X86FastISel::isTypeLegal(Type *Ty, MVT &VT, bool AllowI1) {
+ EVT evt = TLI.getValueType(DL, Ty, /*HandleUnknown=*/true);
+ if (evt == MVT::Other || !evt.isSimple())
+ // Unhandled type. Halt "fast" selection and bail.
+ return false;
+
+ VT = evt.getSimpleVT();
+ // For now, require SSE/SSE2 for performing floating-point operations,
+ // since x87 requires additional work.
+ if (VT == MVT::f64 && !X86ScalarSSEf64)
+ return false;
+ if (VT == MVT::f32 && !X86ScalarSSEf32)
+ return false;
+ // Similarly, no f80 support yet.
+ if (VT == MVT::f80)
+ return false;
+ // We only handle legal types. For example, on x86-32 the instruction
+ // selector contains all of the 64-bit instructions from x86-64,
+ // under the assumption that i64 won't be used if the target doesn't
+ // support it.
+ return (AllowI1 && VT == MVT::i1) || TLI.isTypeLegal(VT);
+}
+
+#include "X86GenCallingConv.inc"
+
+/// X86FastEmitLoad - Emit a machine instruction to load a value of type VT.
+/// The address, given in AM, is either a pre-computed base/index/displacement
+/// or a GlobalAddress. Return true and set the result register by reference
+/// if it is possible.
+bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM,
+ MachineMemOperand *MMO, unsigned &ResultReg,
+ unsigned Alignment) {
+ // Get opcode and regclass of the output for the given load instruction.
+ unsigned Opc = 0;
+ const TargetRegisterClass *RC = nullptr;
+ switch (VT.getSimpleVT().SimpleTy) {
+ default: return false;
+ case MVT::i1:
+ case MVT::i8:
+ Opc = X86::MOV8rm;
+ RC = &X86::GR8RegClass;
+ break;
+ case MVT::i16:
+ Opc = X86::MOV16rm;
+ RC = &X86::GR16RegClass;
+ break;
+ case MVT::i32:
+ Opc = X86::MOV32rm;
+ RC = &X86::GR32RegClass;
+ break;
+ case MVT::i64:
+ // Must be in x86-64 mode.
+ Opc = X86::MOV64rm;
+ RC = &X86::GR64RegClass;
+ break;
+ case MVT::f32:
+ if (X86ScalarSSEf32) {
+ Opc = Subtarget->hasAVX() ? X86::VMOVSSrm : X86::MOVSSrm;
+ RC = &X86::FR32RegClass;
+ } else {
+ Opc = X86::LD_Fp32m;
+ RC = &X86::RFP32RegClass;
+ }
+ break;
+ case MVT::f64:
+ if (X86ScalarSSEf64) {
+ Opc = Subtarget->hasAVX() ? X86::VMOVSDrm : X86::MOVSDrm;
+ RC = &X86::FR64RegClass;
+ } else {
+ Opc = X86::LD_Fp64m;
+ RC = &X86::RFP64RegClass;
+ }
+ break;
+ case MVT::f80:
+ // No f80 support yet.
+ return false;
+ case MVT::v4f32:
+ if (Alignment >= 16)
+ Opc = Subtarget->hasAVX() ? X86::VMOVAPSrm : X86::MOVAPSrm;
+ else
+ Opc = Subtarget->hasAVX() ? X86::VMOVUPSrm : X86::MOVUPSrm;
+ RC = &X86::VR128RegClass;
+ break;
+ case MVT::v2f64:
+ if (Alignment >= 16)
+ Opc = Subtarget->hasAVX() ? X86::VMOVAPDrm : X86::MOVAPDrm;
+ else
+ Opc = Subtarget->hasAVX() ? X86::VMOVUPDrm : X86::MOVUPDrm;
+ RC = &X86::VR128RegClass;
+ break;
+ case MVT::v4i32:
+ case MVT::v2i64:
+ case MVT::v8i16:
+ case MVT::v16i8:
+ if (Alignment >= 16)
+ Opc = Subtarget->hasAVX() ? X86::VMOVDQArm : X86::MOVDQArm;
+ else
+ Opc = Subtarget->hasAVX() ? X86::VMOVDQUrm : X86::MOVDQUrm;
+ RC = &X86::VR128RegClass;
+ break;
+ }
+
+ ResultReg = createResultReg(RC);
+ MachineInstrBuilder MIB =
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg);
+ addFullAddress(MIB, AM);
+ if (MMO)
+ MIB->addMemOperand(*FuncInfo.MF, MMO);
+ return true;
+}
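+
+// For example, a 'load <4 x float>' with 16-byte alignment selects MOVAPS
+// (VMOVAPSrm under AVX), while the same load with only 4-byte alignment
+// falls back to the unaligned MOVUPS form.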
+
+/// X86FastEmitStore - Emit a machine instruction to store a value Val of
+/// type VT. The address, given in AM, is either a pre-computed base pointer
+/// plus displacement offset or a GlobalAddress. Return true if it is
+/// possible.
+bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill,
+ X86AddressMode &AM,
+ MachineMemOperand *MMO, bool Aligned) {
+ bool HasSSE2 = Subtarget->hasSSE2();
+ bool HasSSE4A = Subtarget->hasSSE4A();
+ bool HasAVX = Subtarget->hasAVX();
+ bool IsNonTemporal = MMO && MMO->isNonTemporal();
+
+ // Get opcode and regclass of the output for the given store instruction.
+ unsigned Opc = 0;
+ switch (VT.getSimpleVT().SimpleTy) {
+ case MVT::f80: // No f80 support yet.
+ default: return false;
+ case MVT::i1: {
+ // Mask out all but lowest bit.
+ unsigned AndResult = createResultReg(&X86::GR8RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(X86::AND8ri), AndResult)
+ .addReg(ValReg, getKillRegState(ValIsKill)).addImm(1);
+ ValReg = AndResult;
+ }
+ // FALLTHROUGH, handling i1 as i8.
+ case MVT::i8: Opc = X86::MOV8mr; break;
+ case MVT::i16: Opc = X86::MOV16mr; break;
+ case MVT::i32:
+ Opc = (IsNonTemporal && HasSSE2) ? X86::MOVNTImr : X86::MOV32mr;
+ break;
+ case MVT::i64:
+ // Must be in x86-64 mode.
+ Opc = (IsNonTemporal && HasSSE2) ? X86::MOVNTI_64mr : X86::MOV64mr;
+ break;
+ case MVT::f32:
+ if (X86ScalarSSEf32) {
+ if (IsNonTemporal && HasSSE4A)
+ Opc = X86::MOVNTSS;
+ else
+ Opc = HasAVX ? X86::VMOVSSmr : X86::MOVSSmr;
+ } else
+ Opc = X86::ST_Fp32m;
+ break;
+ case MVT::f64:
+ if (X86ScalarSSEf64) {
+ if (IsNonTemporal && HasSSE4A)
+ Opc = X86::MOVNTSD;
+ else
+ Opc = HasAVX ? X86::VMOVSDmr : X86::MOVSDmr;
+ } else
+ Opc = X86::ST_Fp64m;
+ break;
+ case MVT::v4f32:
+ if (Aligned) {
+ if (IsNonTemporal)
+ Opc = HasAVX ? X86::VMOVNTPSmr : X86::MOVNTPSmr;
+ else
+ Opc = HasAVX ? X86::VMOVAPSmr : X86::MOVAPSmr;
+ } else
+ Opc = HasAVX ? X86::VMOVUPSmr : X86::MOVUPSmr;
+ break;
+ case MVT::v2f64:
+ if (Aligned) {
+ if (IsNonTemporal)
+ Opc = HasAVX ? X86::VMOVNTPDmr : X86::MOVNTPDmr;
+ else
+ Opc = HasAVX ? X86::VMOVAPDmr : X86::MOVAPDmr;
+ } else
+ Opc = HasAVX ? X86::VMOVUPDmr : X86::MOVUPDmr;
+ break;
+ case MVT::v4i32:
+ case MVT::v2i64:
+ case MVT::v8i16:
+ case MVT::v16i8:
+ if (Aligned) {
+ if (IsNonTemporal)
+ Opc = HasAVX ? X86::VMOVNTDQmr : X86::MOVNTDQmr;
+ else
+ Opc = HasAVX ? X86::VMOVDQAmr : X86::MOVDQAmr;
+ } else
+ Opc = HasAVX ? X86::VMOVDQUmr : X86::MOVDQUmr;
+ break;
+ }
+
+ MachineInstrBuilder MIB =
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc));
+ addFullAddress(MIB, AM).addReg(ValReg, getKillRegState(ValIsKill));
+ if (MMO)
+ MIB->addMemOperand(*FuncInfo.MF, MMO);
+
+ return true;
+}
+
+bool X86FastISel::X86FastEmitStore(EVT VT, const Value *Val,
+ X86AddressMode &AM,
+ MachineMemOperand *MMO, bool Aligned) {
+ // Handle 'null' like i32/i64 0.
+ if (isa<ConstantPointerNull>(Val))
+ Val = Constant::getNullValue(DL.getIntPtrType(Val->getContext()));
+
+ // If this is a store of a simple constant, fold the constant into the store.
+ if (const ConstantInt *CI = dyn_cast<ConstantInt>(Val)) {
+ unsigned Opc = 0;
+ bool Signed = true;
+ switch (VT.getSimpleVT().SimpleTy) {
+ default: break;
+ case MVT::i1: Signed = false; // FALLTHROUGH to handle as i8.
+ case MVT::i8: Opc = X86::MOV8mi; break;
+ case MVT::i16: Opc = X86::MOV16mi; break;
+ case MVT::i32: Opc = X86::MOV32mi; break;
+ case MVT::i64:
+ // Must be a 32-bit sign extended value.
+ if (isInt<32>(CI->getSExtValue()))
+ Opc = X86::MOV64mi32;
+ break;
+ }
+
+ if (Opc) {
+ MachineInstrBuilder MIB =
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc));
+ addFullAddress(MIB, AM).addImm(Signed ? (uint64_t) CI->getSExtValue()
+ : CI->getZExtValue());
+ if (MMO)
+ MIB->addMemOperand(*FuncInfo.MF, MMO);
+ return true;
+ }
+ }
+
+ unsigned ValReg = getRegForValue(Val);
+ if (ValReg == 0)
+ return false;
+
+ bool ValKill = hasTrivialKill(Val);
+ return X86FastEmitStore(VT, ValReg, ValKill, AM, MMO, Aligned);
+}
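+
+// For example, 'store i32 42, i32* %p' folds the immediate into the store,
+// producing a single 'movl $42, (%reg)' (MOV32mi) instead of materializing
+// the constant in a register first.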
+
+/// X86FastEmitExtend - Emit a machine instruction to extend a value Src of
+/// type SrcVT to type DstVT using the specified extension opcode Opc (e.g.
+/// ISD::SIGN_EXTEND).
+bool X86FastISel::X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT,
+ unsigned Src, EVT SrcVT,
+ unsigned &ResultReg) {
+ unsigned RR = fastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Opc,
+ Src, /*TODO: Kill=*/false);
+ if (RR == 0)
+ return false;
+
+ ResultReg = RR;
+ return true;
+}
+
+bool X86FastISel::handleConstantAddresses(const Value *V, X86AddressMode &AM) {
+ // Handle constant address.
+ if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
+ // Can't handle alternate code models yet.
+ if (TM.getCodeModel() != CodeModel::Small)
+ return false;
+
+ // Can't handle TLS yet.
+ if (GV->isThreadLocal())
+ return false;
+
+ // RIP-relative addresses can't have additional register operands, so if
+ // we've already folded stuff into the addressing mode, just force the
+ // global value into its own register, which we can use as the basereg.
+ if (!Subtarget->isPICStyleRIPRel() ||
+ (AM.Base.Reg == 0 && AM.IndexReg == 0)) {
+ // Okay, we've committed to selecting this global. Set up the address.
+ AM.GV = GV;
+
+ // Allow the subtarget to classify the global.
+ unsigned char GVFlags = Subtarget->ClassifyGlobalReference(GV, TM);
+
+ // If this reference is relative to the pic base, set it now.
+ if (isGlobalRelativeToPICBase(GVFlags)) {
+ // FIXME: How do we know Base.Reg is free??
+ AM.Base.Reg = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
+ }
+
+ // Unless the ABI requires an extra load, return a direct reference to
+ // the global.
+ if (!isGlobalStubReference(GVFlags)) {
+ if (Subtarget->isPICStyleRIPRel()) {
+ // Use rip-relative addressing if we can. Above we verified that the
+ // base and index registers are unused.
+ assert(AM.Base.Reg == 0 && AM.IndexReg == 0);
+ AM.Base.Reg = X86::RIP;
+ }
+ AM.GVOpFlags = GVFlags;
+ return true;
+ }
+
+ // Ok, we need to do a load from a stub. If we've already loaded from
+ // this stub, reuse the loaded pointer, otherwise emit the load now.
+ DenseMap<const Value *, unsigned>::iterator I = LocalValueMap.find(V);
+ unsigned LoadReg;
+ if (I != LocalValueMap.end() && I->second != 0) {
+ LoadReg = I->second;
+ } else {
+ // Issue load from stub.
+ unsigned Opc = 0;
+ const TargetRegisterClass *RC = nullptr;
+ X86AddressMode StubAM;
+ StubAM.Base.Reg = AM.Base.Reg;
+ StubAM.GV = GV;
+ StubAM.GVOpFlags = GVFlags;
+
+ // Prepare for inserting code in the local-value area.
+ SavePoint SaveInsertPt = enterLocalValueArea();
+
+ if (TLI.getPointerTy(DL) == MVT::i64) {
+ Opc = X86::MOV64rm;
+ RC = &X86::GR64RegClass;
+
+ if (Subtarget->isPICStyleRIPRel())
+ StubAM.Base.Reg = X86::RIP;
+ } else {
+ Opc = X86::MOV32rm;
+ RC = &X86::GR32RegClass;
+ }
+
+ LoadReg = createResultReg(RC);
+ MachineInstrBuilder LoadMI =
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), LoadReg);
+ addFullAddress(LoadMI, StubAM);
+
+ // Ok, back to normal mode.
+ leaveLocalValueArea(SaveInsertPt);
+
+ // Prevent loading GV stub multiple times in same MBB.
+ LocalValueMap[V] = LoadReg;
+ }
+
+ // Now construct the final address. Note that the Disp, Scale,
+ // and Index values may already be set here.
+ AM.Base.Reg = LoadReg;
+ AM.GV = nullptr;
+ return true;
+ }
+ }
+
+ // If all else fails, try to materialize the value in a register.
+ if (!AM.GV || !Subtarget->isPICStyleRIPRel()) {
+ if (AM.Base.Reg == 0) {
+ AM.Base.Reg = getRegForValue(V);
+ return AM.Base.Reg != 0;
+ }
+ if (AM.IndexReg == 0) {
+ assert(AM.Scale == 1 && "Scale with no index!");
+ AM.IndexReg = getRegForValue(V);
+ return AM.IndexReg != 0;
+ }
+ }
+
+ return false;
+}
+
+/// X86SelectAddress - Attempt to fill in an address from the given value.
+///
+bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) {
+ SmallVector<const Value *, 32> GEPs;
+redo_gep:
+ const User *U = nullptr;
+ unsigned Opcode = Instruction::UserOp1;
+ if (const Instruction *I = dyn_cast<Instruction>(V)) {
+ // Don't walk into other basic blocks; it's possible we haven't
+ // visited them yet, so the instructions may not yet be assigned
+ // virtual registers.
+ if (FuncInfo.StaticAllocaMap.count(static_cast<const AllocaInst *>(V)) ||
+ FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) {
+ Opcode = I->getOpcode();
+ U = I;
+ }
+ } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(V)) {
+ Opcode = C->getOpcode();
+ U = C;
+ }
+
+ if (PointerType *Ty = dyn_cast<PointerType>(V->getType()))
+ if (Ty->getAddressSpace() > 255)
+ // Fast instruction selection doesn't support the special
+ // address spaces.
+ return false;
+
+ switch (Opcode) {
+ default: break;
+ case Instruction::BitCast:
+ // Look past bitcasts.
+ return X86SelectAddress(U->getOperand(0), AM);
+
+ case Instruction::IntToPtr:
+ // Look past no-op inttoptrs.
+ if (TLI.getValueType(DL, U->getOperand(0)->getType()) ==
+ TLI.getPointerTy(DL))
+ return X86SelectAddress(U->getOperand(0), AM);
+ break;
+
+ case Instruction::PtrToInt:
+ // Look past no-op ptrtoints.
+ if (TLI.getValueType(DL, U->getType()) == TLI.getPointerTy(DL))
+ return X86SelectAddress(U->getOperand(0), AM);
+ break;
+
+ case Instruction::Alloca: {
+ // Do static allocas.
+ const AllocaInst *A = cast<AllocaInst>(V);
+ DenseMap<const AllocaInst *, int>::iterator SI =
+ FuncInfo.StaticAllocaMap.find(A);
+ if (SI != FuncInfo.StaticAllocaMap.end()) {
+ AM.BaseType = X86AddressMode::FrameIndexBase;
+ AM.Base.FrameIndex = SI->second;
+ return true;
+ }
+ break;
+ }
+
+ case Instruction::Add: {
+ // Adds of constants are common and easy enough.
+ if (const ConstantInt *CI = dyn_cast<ConstantInt>(U->getOperand(1))) {
+ uint64_t Disp = (int32_t)AM.Disp + (uint64_t)CI->getSExtValue();
+ // They have to fit in the 32-bit signed displacement field though.
+ if (isInt<32>(Disp)) {
+ AM.Disp = (uint32_t)Disp;
+ return X86SelectAddress(U->getOperand(0), AM);
+ }
+ }
+ break;
+ }
+
+ case Instruction::GetElementPtr: {
+ X86AddressMode SavedAM = AM;
+
+ // Pattern-match simple GEPs.
+ uint64_t Disp = (int32_t)AM.Disp;
+ unsigned IndexReg = AM.IndexReg;
+ unsigned Scale = AM.Scale;
+ gep_type_iterator GTI = gep_type_begin(U);
+ // Iterate through the indices, folding what we can. Constants can be
+ // folded, and one dynamic index can be handled, if the scale is supported.
+ for (User::const_op_iterator i = U->op_begin() + 1, e = U->op_end();
+ i != e; ++i, ++GTI) {
+ const Value *Op = *i;
+ if (StructType *STy = dyn_cast<StructType>(*GTI)) {
+ const StructLayout *SL = DL.getStructLayout(STy);
+ Disp += SL->getElementOffset(cast<ConstantInt>(Op)->getZExtValue());
+ continue;
+ }
+
+ // An array/variable index is always of the form i*S where S is the
+ // constant scale size. See if we can push the scale into immediates.
+ uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType());
+ for (;;) {
+ if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
+ // Constant-offset addressing.
+ Disp += CI->getSExtValue() * S;
+ break;
+ }
+ if (canFoldAddIntoGEP(U, Op)) {
+ // A compatible add with a constant operand. Fold the constant.
+ ConstantInt *CI =
+ cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1));
+ Disp += CI->getSExtValue() * S;
+ // Iterate on the other operand.
+ Op = cast<AddOperator>(Op)->getOperand(0);
+ continue;
+ }
+ if (IndexReg == 0 &&
+ (!AM.GV || !Subtarget->isPICStyleRIPRel()) &&
+ (S == 1 || S == 2 || S == 4 || S == 8)) {
+ // Scaled-index addressing.
+ Scale = S;
+ IndexReg = getRegForGEPIndex(Op).first;
+ if (IndexReg == 0)
+ return false;
+ break;
+ }
+ // Unsupported.
+ goto unsupported_gep;
+ }
+ }
+
+ // Check for displacement overflow.
+ if (!isInt<32>(Disp))
+ break;
+
+ AM.IndexReg = IndexReg;
+ AM.Scale = Scale;
+ AM.Disp = (uint32_t)Disp;
+ GEPs.push_back(V);
+
+ if (const GetElementPtrInst *GEP =
+ dyn_cast<GetElementPtrInst>(U->getOperand(0))) {
+ // Ok, the GEP indices were covered by constant-offset and scaled-index
+ // addressing. Update the address state and move on to examining the base.
+ V = GEP;
+ goto redo_gep;
+ } else if (X86SelectAddress(U->getOperand(0), AM)) {
+ return true;
+ }
+
+ // If we couldn't merge the gep value into this addr mode, revert to our
+ // saved address and just match the value instead of completely failing.
+ AM = SavedAM;
+
+ for (SmallVectorImpl<const Value *>::reverse_iterator
+ I = GEPs.rbegin(), E = GEPs.rend(); I != E; ++I)
+ if (handleConstantAddresses(*I, AM))
+ return true;
+
+ return false;
+ unsupported_gep:
+ // Ok, the GEP indices weren't all covered.
+ break;
+ }
+ }
+
+ return handleConstantAddresses(V, AM);
+}
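+
+// An illustrative GEP handled entirely by the matcher above:
+//   %p = getelementptr [4 x i32], [4 x i32]* %base, i64 0, i64 %i
+// The constant index contributes 0 to Disp and the dynamic index %i becomes
+// IndexReg with Scale = 4, so the access uses the addressing mode
+// [%base + %i*4] with no separate address arithmetic.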
+
+/// X86SelectCallAddress - Attempt to fill in an address from the given value.
+///
+bool X86FastISel::X86SelectCallAddress(const Value *V, X86AddressMode &AM) {
+ const User *U = nullptr;
+ unsigned Opcode = Instruction::UserOp1;
+ const Instruction *I = dyn_cast<Instruction>(V);
+ // Record if the value is defined in the same basic block.
+ //
+ // This information is crucial to know whether or not folding an
+ // operand is valid.
+ // Indeed, FastISel generates or reuses a virtual register for all
+ // operands of all instructions it selects. Obviously, the definition and
+ // its uses must use the same virtual register otherwise the produced
+ // code is incorrect.
+ // Before instruction selection, FunctionLoweringInfo::set sets the virtual
+ // registers for values that are alive across basic blocks. This ensures
+ // that the values are consistently set across basic blocks, even
+ // if different instruction selection mechanisms are used (e.g., a mix of
+ // SDISel and FastISel).
+ // For values local to a basic block, the instruction selection process
+ // generates these virtual registers with whatever method is appropriate
+ // for its needs. In particular, FastISel and SDISel do not share the way
+ // local virtual registers are set.
+ // Therefore, it is impossible (or at least unsafe) to share values
+ // between basic blocks unless they use the same instruction selection
+ // method, which is not guaranteed for X86.
+ // Moreover, things like hasOneUse could not be used accurately if we
+ // allowed references to values across basic blocks when they are not
+ // alive across basic blocks initially.
+ bool InMBB = true;
+ if (I) {
+ Opcode = I->getOpcode();
+ U = I;
+ InMBB = I->getParent() == FuncInfo.MBB->getBasicBlock();
+ } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(V)) {
+ Opcode = C->getOpcode();
+ U = C;
+ }
+
+ switch (Opcode) {
+ default: break;
+ case Instruction::BitCast:
+ // Look past bitcasts if its operand is in the same BB.
+ if (InMBB)
+ return X86SelectCallAddress(U->getOperand(0), AM);
+ break;
+
+ case Instruction::IntToPtr:
+ // Look past no-op inttoptrs if its operand is in the same BB.
+ if (InMBB &&
+ TLI.getValueType(DL, U->getOperand(0)->getType()) ==
+ TLI.getPointerTy(DL))
+ return X86SelectCallAddress(U->getOperand(0), AM);
+ break;
+
+ case Instruction::PtrToInt:
+ // Look past no-op ptrtoints if its operand is in the same BB.
+ if (InMBB && TLI.getValueType(DL, U->getType()) == TLI.getPointerTy(DL))
+ return X86SelectCallAddress(U->getOperand(0), AM);
+ break;
+ }
+
+ // Handle constant address.
+ if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
+ // Can't handle alternate code models yet.
+ if (TM.getCodeModel() != CodeModel::Small)
+ return false;
+
+ // RIP-relative addresses can't have additional register operands.
+ if (Subtarget->isPICStyleRIPRel() &&
+ (AM.Base.Reg != 0 || AM.IndexReg != 0))
+ return false;
+
+ // Can't handle DLL Import.
+ if (GV->hasDLLImportStorageClass())
+ return false;
+
+ // Can't handle TLS.
+ if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV))
+ if (GVar->isThreadLocal())
+ return false;
+
+ // Okay, we've committed to selecting this global. Set up the basic address.
+ AM.GV = GV;
+
+ // No ABI requires an extra load for anything other than DLLImport, which
+ // we rejected above. Return a direct reference to the global.
+ if (Subtarget->isPICStyleRIPRel()) {
+ // Use rip-relative addressing if we can. Above we verified that the
+ // base and index registers are unused.
+ assert(AM.Base.Reg == 0 && AM.IndexReg == 0);
+ AM.Base.Reg = X86::RIP;
+ } else if (Subtarget->isPICStyleStubPIC()) {
+ AM.GVOpFlags = X86II::MO_PIC_BASE_OFFSET;
+ } else if (Subtarget->isPICStyleGOT()) {
+ AM.GVOpFlags = X86II::MO_GOTOFF;
+ }
+
+ return true;
+ }
+
+ // If all else fails, try to materialize the value in a register.
+ if (!AM.GV || !Subtarget->isPICStyleRIPRel()) {
+ if (AM.Base.Reg == 0) {
+ AM.Base.Reg = getRegForValue(V);
+ return AM.Base.Reg != 0;
+ }
+ if (AM.IndexReg == 0) {
+ assert(AM.Scale == 1 && "Scale with no index!");
+ AM.IndexReg = getRegForValue(V);
+ return AM.IndexReg != 0;
+ }
+ }
+
+ return false;
+}
+
+/// X86SelectStore - Select and emit code to implement store instructions.
+bool X86FastISel::X86SelectStore(const Instruction *I) {
+ const StoreInst *S = cast<StoreInst>(I);
+
+ // Atomic stores need special handling.
+ if (S->isAtomic())
+ return false;
+
+ const Value *Val = S->getValueOperand();
+ const Value *Ptr = S->getPointerOperand();
+
+ MVT VT;
+ if (!isTypeLegal(Val->getType(), VT, /*AllowI1=*/true))
+ return false;
+
+ unsigned Alignment = S->getAlignment();
+ unsigned ABIAlignment = DL.getABITypeAlignment(Val->getType());
+ if (Alignment == 0) // Ensure that codegen never sees alignment 0
+ Alignment = ABIAlignment;
+ bool Aligned = Alignment >= ABIAlignment;
+
+ X86AddressMode AM;
+ if (!X86SelectAddress(Ptr, AM))
+ return false;
+
+ return X86FastEmitStore(VT, Val, AM, createMachineMemOperandFor(I), Aligned);
+}
+
+/// X86SelectRet - Select and emit code to implement ret instructions.
+bool X86FastISel::X86SelectRet(const Instruction *I) {
+ const ReturnInst *Ret = cast<ReturnInst>(I);
+ const Function &F = *I->getParent()->getParent();
+ const X86MachineFunctionInfo *X86MFInfo =
+ FuncInfo.MF->getInfo<X86MachineFunctionInfo>();
+
+ if (!FuncInfo.CanLowerReturn)
+ return false;
+
+ CallingConv::ID CC = F.getCallingConv();
+ if (CC != CallingConv::C &&
+ CC != CallingConv::Fast &&
+ CC != CallingConv::X86_FastCall &&
+ CC != CallingConv::X86_64_SysV)
+ return false;
+
+ if (Subtarget->isCallingConvWin64(CC))
+ return false;
+
+ // Don't handle popping bytes on return for now.
+ if (X86MFInfo->getBytesToPopOnReturn() != 0)
+ return false;
+
+ // fastcc with -tailcallopt is intended to provide a guaranteed
+ // tail call optimization. FastISel doesn't know how to do that.
+ if (CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt)
+ return false;
+
+ // Let SDISel handle vararg functions.
+ if (F.isVarArg())
+ return false;
+
+ // Build a list of return value registers.
+ SmallVector<unsigned, 4> RetRegs;
+
+ if (Ret->getNumOperands() > 0) {
+ SmallVector<ISD::OutputArg, 4> Outs;
+ GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI, DL);
+
+ // Analyze operands of the return, assigning locations to each operand.
+ SmallVector<CCValAssign, 16> ValLocs;
+ CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, ValLocs, I->getContext());
+ CCInfo.AnalyzeReturn(Outs, RetCC_X86);
+
+ const Value *RV = Ret->getOperand(0);
+ unsigned Reg = getRegForValue(RV);
+ if (Reg == 0)
+ return false;
+
+ // Only handle a single return value for now.
+ if (ValLocs.size() != 1)
+ return false;
+
+ CCValAssign &VA = ValLocs[0];
+
+ // Don't bother handling odd stuff for now.
+ if (VA.getLocInfo() != CCValAssign::Full)
+ return false;
+ // Only handle register returns for now.
+ if (!VA.isRegLoc())
+ return false;
+
+ // The calling-convention tables for x87 returns don't tell
+ // the whole story.
+ if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
+ return false;
+
+ unsigned SrcReg = Reg + VA.getValNo();
+ EVT SrcVT = TLI.getValueType(DL, RV->getType());
+ EVT DstVT = VA.getValVT();
+ // Special handling for extended integers.
+ if (SrcVT != DstVT) {
+ if (SrcVT != MVT::i1 && SrcVT != MVT::i8 && SrcVT != MVT::i16)
+ return false;
+
+ if (!Outs[0].Flags.isZExt() && !Outs[0].Flags.isSExt())
+ return false;
+
+ assert(DstVT == MVT::i32 && "X86 should always ext to i32");
+
+ if (SrcVT == MVT::i1) {
+ if (Outs[0].Flags.isSExt())
+ return false;
+ SrcReg = fastEmitZExtFromI1(MVT::i8, SrcReg, /*TODO: Kill=*/false);
+ SrcVT = MVT::i8;
+ }
+ unsigned Op = Outs[0].Flags.isZExt() ? ISD::ZERO_EXTEND :
+ ISD::SIGN_EXTEND;
+ SrcReg = fastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Op,
+ SrcReg, /*TODO: Kill=*/false);
+ }
+
+ // Make the copy.
+ unsigned DstReg = VA.getLocReg();
+ const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
+ // Avoid a cross-class copy. This is very unlikely.
+ if (!SrcRC->contains(DstReg))
+ return false;
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), DstReg).addReg(SrcReg);
+
+ // Add register to return instruction.
+ RetRegs.push_back(VA.getLocReg());
+ }
+
+ // All x86 ABIs require that for returning structs by value we copy
+ // the sret argument into %rax/%eax (depending on ABI) for the return.
+ // We saved the argument into a virtual register in the entry block,
+ // so now we copy the value out and into %rax/%eax.
+ if (F.hasStructRetAttr()) {
+ unsigned Reg = X86MFInfo->getSRetReturnReg();
+ assert(Reg &&
+ "SRetReturnReg should have been set in LowerFormalArguments()!");
+ unsigned RetReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), RetReg).addReg(Reg);
+ RetRegs.push_back(RetReg);
+ }
+
+ // Now emit the RET.
+ MachineInstrBuilder MIB =
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(Subtarget->is64Bit() ? X86::RETQ : X86::RETL));
+ for (unsigned i = 0, e = RetRegs.size(); i != e; ++i)
+ MIB.addReg(RetRegs[i], RegState::Implicit);
+ return true;
+}
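+
+// For example, returning an i1 from a function whose return is marked
+// zeroext first zero-extends the value through i8 up to i32 (X86 always
+// extends small integer returns to i32), then copies it into EAX ahead of
+// the RET.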
+
+/// X86SelectLoad - Select and emit code to implement load instructions.
+///
+bool X86FastISel::X86SelectLoad(const Instruction *I) {
+ const LoadInst *LI = cast<LoadInst>(I);
+
+ // Atomic loads need special handling.
+ if (LI->isAtomic())
+ return false;
+
+ MVT VT;
+ if (!isTypeLegal(LI->getType(), VT, /*AllowI1=*/true))
+ return false;
+
+ const Value *Ptr = LI->getPointerOperand();
+
+ X86AddressMode AM;
+ if (!X86SelectAddress(Ptr, AM))
+ return false;
+
+ unsigned Alignment = LI->getAlignment();
+ unsigned ABIAlignment = DL.getABITypeAlignment(LI->getType());
+ if (Alignment == 0) // Ensure that codegen never sees alignment 0
+ Alignment = ABIAlignment;
+
+ unsigned ResultReg = 0;
+ if (!X86FastEmitLoad(VT, AM, createMachineMemOperandFor(LI), ResultReg,
+ Alignment))
+ return false;
+
+ updateValueMap(I, ResultReg);
+ return true;
+}
+
+static unsigned X86ChooseCmpOpcode(EVT VT, const X86Subtarget *Subtarget) {
+ bool HasAVX = Subtarget->hasAVX();
+ bool X86ScalarSSEf32 = Subtarget->hasSSE1();
+ bool X86ScalarSSEf64 = Subtarget->hasSSE2();
+
+ switch (VT.getSimpleVT().SimpleTy) {
+ default: return 0;
+ case MVT::i8: return X86::CMP8rr;
+ case MVT::i16: return X86::CMP16rr;
+ case MVT::i32: return X86::CMP32rr;
+ case MVT::i64: return X86::CMP64rr;
+ case MVT::f32:
+ return X86ScalarSSEf32 ? (HasAVX ? X86::VUCOMISSrr : X86::UCOMISSrr) : 0;
+ case MVT::f64:
+ return X86ScalarSSEf64 ? (HasAVX ? X86::VUCOMISDrr : X86::UCOMISDrr) : 0;
+ }
+}
+
+/// If we have a comparison with a constant integer RHS, return an opcode
+/// that can fold the immediate into the compare (e.g. CMP32ri); otherwise
+/// return 0.
+static unsigned X86ChooseCmpImmediateOpcode(EVT VT, const ConstantInt *RHSC) {
+ int64_t Val = RHSC->getSExtValue();
+ switch (VT.getSimpleVT().SimpleTy) {
+ // Otherwise, we can't fold the immediate into this comparison.
+ default:
+ return 0;
+ case MVT::i8:
+ return X86::CMP8ri;
+ case MVT::i16:
+ if (isInt<8>(Val))
+ return X86::CMP16ri8;
+ return X86::CMP16ri;
+ case MVT::i32:
+ if (isInt<8>(Val))
+ return X86::CMP32ri8;
+ return X86::CMP32ri;
+ case MVT::i64:
+ if (isInt<8>(Val))
+ return X86::CMP64ri8;
+ // 64-bit comparisons are only valid if the immediate fits in a 32-bit sext
+ // field.
+ if (isInt<32>(Val))
+ return X86::CMP64ri32;
+ return 0;
+ }
+}
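+
+// E.g. 'icmp eq i32 %x, 7' selects CMP32ri8 ('cmpl $7, %reg' with a one-byte
+// immediate), while a constant such as 1000 needs CMP32ri and a full 32-bit
+// immediate field.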
+
+bool X86FastISel::X86FastEmitCompare(const Value *Op0, const Value *Op1,
+ EVT VT, DebugLoc CurDbgLoc) {
+ unsigned Op0Reg = getRegForValue(Op0);
+ if (Op0Reg == 0) return false;
+
+ // Handle 'null' like i32/i64 0.
+ if (isa<ConstantPointerNull>(Op1))
+ Op1 = Constant::getNullValue(DL.getIntPtrType(Op0->getContext()));
+
+ // We have two options: compare with register or immediate. If the RHS of
+ // the compare is an immediate that we can fold into this compare, use
+ // CMPri, otherwise use CMPrr.
+ if (const ConstantInt *Op1C = dyn_cast<ConstantInt>(Op1)) {
+ if (unsigned CompareImmOpc = X86ChooseCmpImmediateOpcode(VT, Op1C)) {
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, CurDbgLoc, TII.get(CompareImmOpc))
+ .addReg(Op0Reg)
+ .addImm(Op1C->getSExtValue());
+ return true;
+ }
+ }
+
+ unsigned CompareOpc = X86ChooseCmpOpcode(VT, Subtarget);
+ if (CompareOpc == 0) return false;
+
+ unsigned Op1Reg = getRegForValue(Op1);
+ if (Op1Reg == 0) return false;
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, CurDbgLoc, TII.get(CompareOpc))
+ .addReg(Op0Reg)
+ .addReg(Op1Reg);
+
+ return true;
+}
+
+bool X86FastISel::X86SelectCmp(const Instruction *I) {
+ const CmpInst *CI = cast<CmpInst>(I);
+
+ MVT VT;
+ if (!isTypeLegal(I->getOperand(0)->getType(), VT))
+ return false;
+
+ // Try to optimize or fold the cmp.
+ CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
+ unsigned ResultReg = 0;
+ switch (Predicate) {
+ default: break;
+ case CmpInst::FCMP_FALSE: {
+ ResultReg = createResultReg(&X86::GR32RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV32r0),
+ ResultReg);
+ ResultReg = fastEmitInst_extractsubreg(MVT::i8, ResultReg, /*Kill=*/true,
+ X86::sub_8bit);
+ if (!ResultReg)
+ return false;
+ break;
+ }
+ case CmpInst::FCMP_TRUE: {
+ ResultReg = createResultReg(&X86::GR8RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV8ri),
+ ResultReg).addImm(1);
+ break;
+ }
+ }
+
+ if (ResultReg) {
+ updateValueMap(I, ResultReg);
+ return true;
+ }
+
+ const Value *LHS = CI->getOperand(0);
+ const Value *RHS = CI->getOperand(1);
+
+ // The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x, 0.0.
+ // We don't have to materialize a zero constant for this case and can just use
+ // %x again on the RHS.
+ if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) {
+ const auto *RHSC = dyn_cast<ConstantFP>(RHS);
+ if (RHSC && RHSC->isNullValue())
+ RHS = LHS;
+ }
+
+ // FCMP_OEQ and FCMP_UNE cannot be checked with a single instruction.
+ static const unsigned SETFOpcTable[2][3] = {
+ { X86::SETEr, X86::SETNPr, X86::AND8rr },
+ { X86::SETNEr, X86::SETPr, X86::OR8rr }
+ };
+ const unsigned *SETFOpc = nullptr;
+ switch (Predicate) {
+ default: break;
+ case CmpInst::FCMP_OEQ: SETFOpc = &SETFOpcTable[0][0]; break;
+ case CmpInst::FCMP_UNE: SETFOpc = &SETFOpcTable[1][0]; break;
+ }
+
+ ResultReg = createResultReg(&X86::GR8RegClass);
+ if (SETFOpc) {
+ if (!X86FastEmitCompare(LHS, RHS, VT, I->getDebugLoc()))
+ return false;
+
+ unsigned FlagReg1 = createResultReg(&X86::GR8RegClass);
+ unsigned FlagReg2 = createResultReg(&X86::GR8RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[0]),
+ FlagReg1);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[1]),
+ FlagReg2);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[2]),
+ ResultReg).addReg(FlagReg1).addReg(FlagReg2);
+ updateValueMap(I, ResultReg);
+ return true;
+ }
+
+ X86::CondCode CC;
+ bool SwapArgs;
+ std::tie(CC, SwapArgs) = getX86ConditionCode(Predicate);
+ assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");
+ unsigned Opc = X86::getSETFromCond(CC);
+
+ if (SwapArgs)
+ std::swap(LHS, RHS);
+
+ // Emit a compare of LHS/RHS.
+ if (!X86FastEmitCompare(LHS, RHS, VT, I->getDebugLoc()))
+ return false;
+
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg);
+ updateValueMap(I, ResultReg);
+ return true;
+}
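+
+// For reference, 'fcmp oeq float %a, %b' lowers here to roughly (register
+// names illustrative):
+//   ucomiss %xmm1, %xmm0
+//   sete    %cl           ; ZF == 1
+//   setnp   %dl           ; PF == 0, i.e. the operands were ordered
+//   andb    %dl, %cl      ; equal AND ordered
+// since no single SETcc can test both flags at once.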
+
+bool X86FastISel::X86SelectZExt(const Instruction *I) {
+ EVT DstVT = TLI.getValueType(DL, I->getType());
+ if (!TLI.isTypeLegal(DstVT))
+ return false;
+
+ unsigned ResultReg = getRegForValue(I->getOperand(0));
+ if (ResultReg == 0)
+ return false;
+
+ // Handle zero-extension from i1 to i8, which is common.
+ MVT SrcVT = TLI.getSimpleValueType(DL, I->getOperand(0)->getType());
+ if (SrcVT.SimpleTy == MVT::i1) {
+ // Set the high bits to zero.
+ ResultReg = fastEmitZExtFromI1(MVT::i8, ResultReg, /*TODO: Kill=*/false);
+ SrcVT = MVT::i8;
+
+ if (ResultReg == 0)
+ return false;
+ }
+
+ if (DstVT == MVT::i64) {
+ // Handle extension to 64-bits via sub-register shenanigans.
+ unsigned MovInst;
+
+ switch (SrcVT.SimpleTy) {
+ case MVT::i8: MovInst = X86::MOVZX32rr8; break;
+ case MVT::i16: MovInst = X86::MOVZX32rr16; break;
+ case MVT::i32: MovInst = X86::MOV32rr; break;
+ default: llvm_unreachable("Unexpected zext to i64 source type");
+ }
+
+ unsigned Result32 = createResultReg(&X86::GR32RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(MovInst), Result32)
+ .addReg(ResultReg);
+
+ ResultReg = createResultReg(&X86::GR64RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::SUBREG_TO_REG),
+ ResultReg)
+ .addImm(0).addReg(Result32).addImm(X86::sub_32bit);
+ } else if (DstVT != MVT::i8) {
+ ResultReg = fastEmit_r(MVT::i8, DstVT.getSimpleVT(), ISD::ZERO_EXTEND,
+ ResultReg, /*Kill=*/true);
+ if (ResultReg == 0)
+ return false;
+ }
+
+ updateValueMap(I, ResultReg);
+ return true;
+}
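+
+// E.g. 'zext i1 %b to i64' masks %b down to its low bit in an 8-bit
+// register, widens it with MOVZX32rr8, then wraps the 32-bit result in a
+// SUBREG_TO_REG; the 32-bit def already zeroes the upper 32 bits, so no
+// 64-bit instruction is needed.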
+
+bool X86FastISel::X86SelectBranch(const Instruction *I) {
+ // Unconditional branches are selected by tablegen-generated code.
+ // Handle a conditional branch.
+ const BranchInst *BI = cast<BranchInst>(I);
+ MachineBasicBlock *TrueMBB = FuncInfo.MBBMap[BI->getSuccessor(0)];
+ MachineBasicBlock *FalseMBB = FuncInfo.MBBMap[BI->getSuccessor(1)];
+
+ // Fold the common case of a conditional branch with a comparison
+ // in the same block (values defined on other blocks may not have
+ // initialized registers).
+ X86::CondCode CC;
+ if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) {
+ if (CI->hasOneUse() && CI->getParent() == I->getParent()) {
+ EVT VT = TLI.getValueType(DL, CI->getOperand(0)->getType());
+
+ // Try to optimize or fold the cmp.
+ CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
+ switch (Predicate) {
+ default: break;
+ case CmpInst::FCMP_FALSE: fastEmitBranch(FalseMBB, DbgLoc); return true;
+ case CmpInst::FCMP_TRUE: fastEmitBranch(TrueMBB, DbgLoc); return true;
+ }
+
+ const Value *CmpLHS = CI->getOperand(0);
+ const Value *CmpRHS = CI->getOperand(1);
+
+ // The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x,
+ // 0.0.
+ // We don't have to materialize a zero constant for this case and can just
+ // use %x again on the RHS.
+ if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) {
+ const auto *CmpRHSC = dyn_cast<ConstantFP>(CmpRHS);
+ if (CmpRHSC && CmpRHSC->isNullValue())
+ CmpRHS = CmpLHS;
+ }
+
+ // Try to take advantage of fallthrough opportunities.
+ if (FuncInfo.MBB->isLayoutSuccessor(TrueMBB)) {
+ std::swap(TrueMBB, FalseMBB);
+ Predicate = CmpInst::getInversePredicate(Predicate);
+ }
+
+ // FCMP_OEQ and FCMP_UNE cannot be expressed with a single flag/condition
+ // code check. Instead two branch instructions are required to check all
+ // the flags. First we change the predicate to a supported condition code,
+ // which is used for the first branch. Later on we will emit the second
+ // branch.
+ bool NeedExtraBranch = false;
+ switch (Predicate) {
+ default: break;
+ case CmpInst::FCMP_OEQ:
+ std::swap(TrueMBB, FalseMBB); // fall-through
+ case CmpInst::FCMP_UNE:
+ NeedExtraBranch = true;
+ Predicate = CmpInst::FCMP_ONE;
+ break;
+ }
+
+ bool SwapArgs;
+ unsigned BranchOpc;
+ std::tie(CC, SwapArgs) = getX86ConditionCode(Predicate);
+ assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");
+
+ BranchOpc = X86::GetCondBranchFromCond(CC);
+ if (SwapArgs)
+ std::swap(CmpLHS, CmpRHS);
+
+ // Emit a compare of the LHS and RHS, setting the flags.
+ if (!X86FastEmitCompare(CmpLHS, CmpRHS, VT, CI->getDebugLoc()))
+ return false;
+
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BranchOpc))
+ .addMBB(TrueMBB);
+
+ // X86 requires a second branch to handle UNE (and OEQ, which is mapped
+ // to UNE above).
+ if (NeedExtraBranch) {
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JP_1))
+ .addMBB(TrueMBB);
+ }
+
+ finishCondBranch(BI->getParent(), TrueMBB, FalseMBB);
+ return true;
+ }
+ } else if (TruncInst *TI = dyn_cast<TruncInst>(BI->getCondition())) {
+ // Handle things like "%cond = trunc i32 %X to i1 / br i1 %cond", which
+ // typically happen for _Bool and C++ bools.
+ MVT SourceVT;
+ if (TI->hasOneUse() && TI->getParent() == I->getParent() &&
+ isTypeLegal(TI->getOperand(0)->getType(), SourceVT)) {
+ unsigned TestOpc = 0;
+ switch (SourceVT.SimpleTy) {
+ default: break;
+ case MVT::i8: TestOpc = X86::TEST8ri; break;
+ case MVT::i16: TestOpc = X86::TEST16ri; break;
+ case MVT::i32: TestOpc = X86::TEST32ri; break;
+ case MVT::i64: TestOpc = X86::TEST64ri32; break;
+ }
+ if (TestOpc) {
+ unsigned OpReg = getRegForValue(TI->getOperand(0));
+ if (OpReg == 0) return false;
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TestOpc))
+ .addReg(OpReg).addImm(1);
+
+ unsigned JmpOpc = X86::JNE_1;
+ if (FuncInfo.MBB->isLayoutSuccessor(TrueMBB)) {
+ std::swap(TrueMBB, FalseMBB);
+ JmpOpc = X86::JE_1;
+ }
+
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(JmpOpc))
+ .addMBB(TrueMBB);
+
+ finishCondBranch(BI->getParent(), TrueMBB, FalseMBB);
+ return true;
+ }
+ }
+ } else if (foldX86XALUIntrinsic(CC, BI, BI->getCondition())) {
+ // Fake-request the condition; otherwise the intrinsic might be completely
+ // optimized away.
+ unsigned TmpReg = getRegForValue(BI->getCondition());
+ if (TmpReg == 0)
+ return false;
+
+ unsigned BranchOpc = X86::GetCondBranchFromCond(CC);
+
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BranchOpc))
+ .addMBB(TrueMBB);
+ finishCondBranch(BI->getParent(), TrueMBB, FalseMBB);
+ return true;
+ }
+
+ // Otherwise do a clumsy setcc and re-test it.
+ // Note that i1 essentially gets ANY_EXTEND'ed to i8 where it isn't used
+ // in an explicit cast, so make sure to handle that correctly.
+ unsigned OpReg = getRegForValue(BI->getCondition());
+ if (OpReg == 0) return false;
+
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri))
+ .addReg(OpReg).addImm(1);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JNE_1))
+ .addMBB(TrueMBB);
+ finishCondBranch(BI->getParent(), TrueMBB, FalseMBB);
+ return true;
+}
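+
+// For example, a conditional branch on 'fcmp une float %a, %b' becomes
+// roughly:
+//   ucomiss %xmm1, %xmm0
+//   jne     .LBB_true    ; ZF == 0
+//   jp      .LBB_true    ; PF == 1: unordered also counts as "not equal"
+// which is the NeedExtraBranch case above; OEQ is handled the same way with
+// the successors swapped.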
+
+bool X86FastISel::X86SelectShift(const Instruction *I) {
+ unsigned CReg = 0, OpReg = 0;
+ const TargetRegisterClass *RC = nullptr;
+ if (I->getType()->isIntegerTy(8)) {
+ CReg = X86::CL;
+ RC = &X86::GR8RegClass;
+ switch (I->getOpcode()) {
+ case Instruction::LShr: OpReg = X86::SHR8rCL; break;
+ case Instruction::AShr: OpReg = X86::SAR8rCL; break;
+ case Instruction::Shl: OpReg = X86::SHL8rCL; break;
+ default: return false;
+ }
+ } else if (I->getType()->isIntegerTy(16)) {
+ CReg = X86::CX;
+ RC = &X86::GR16RegClass;
+ switch (I->getOpcode()) {
+ case Instruction::LShr: OpReg = X86::SHR16rCL; break;
+ case Instruction::AShr: OpReg = X86::SAR16rCL; break;
+ case Instruction::Shl: OpReg = X86::SHL16rCL; break;
+ default: return false;
+ }
+ } else if (I->getType()->isIntegerTy(32)) {
+ CReg = X86::ECX;
+ RC = &X86::GR32RegClass;
+ switch (I->getOpcode()) {
+ case Instruction::LShr: OpReg = X86::SHR32rCL; break;
+ case Instruction::AShr: OpReg = X86::SAR32rCL; break;
+ case Instruction::Shl: OpReg = X86::SHL32rCL; break;
+ default: return false;
+ }
+ } else if (I->getType()->isIntegerTy(64)) {
+ CReg = X86::RCX;
+ RC = &X86::GR64RegClass;
+ switch (I->getOpcode()) {
+ case Instruction::LShr: OpReg = X86::SHR64rCL; break;
+ case Instruction::AShr: OpReg = X86::SAR64rCL; break;
+ case Instruction::Shl: OpReg = X86::SHL64rCL; break;
+ default: return false;
+ }
+ } else {
+ return false;
+ }
+
+ MVT VT;
+ if (!isTypeLegal(I->getType(), VT))
+ return false;
+
+ unsigned Op0Reg = getRegForValue(I->getOperand(0));
+ if (Op0Reg == 0) return false;
+
+ unsigned Op1Reg = getRegForValue(I->getOperand(1));
+ if (Op1Reg == 0) return false;
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY),
+ CReg).addReg(Op1Reg);
+
+ // The shift instruction uses X86::CL. If we defined a super-register
+ // of X86::CL, emit a subreg KILL to precisely describe what we're doing here.
+ if (CReg != X86::CL)
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::KILL), X86::CL)
+ .addReg(CReg, RegState::Kill);
+
+ unsigned ResultReg = createResultReg(RC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(OpReg), ResultReg)
+ .addReg(Op0Reg);
+ updateValueMap(I, ResultReg);
+ return true;
+}
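+
+// E.g. 'shl i32 %x, %n' copies %n into ECX, emits a KILL of CL to record
+// that only the low byte is actually used, and then issues 'shll %cl, %reg'
+// (SHL32rCL).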
+
+bool X86FastISel::X86SelectDivRem(const Instruction *I) {
+ const static unsigned NumTypes = 4; // i8, i16, i32, i64
+ const static unsigned NumOps = 4; // SDiv, SRem, UDiv, URem
+ const static bool S = true; // IsSigned
+ const static bool U = false; // !IsSigned
+ const static unsigned Copy = TargetOpcode::COPY;
+ // For the X86 DIV/IDIV instruction, in most cases the dividend
+ // (numerator) must be in a specific register pair highreg:lowreg,
+ // producing the quotient in lowreg and the remainder in highreg.
+ // For most data types, to set up the instruction, the dividend is
+ // copied into lowreg, and lowreg is sign-extended or zero-extended
+ // into highreg. The exception is i8, where the dividend is defined
+ // as a single register rather than a register pair, and we
+ // therefore directly sign-extend or zero-extend the dividend into
+ // lowreg, instead of copying, and ignore the highreg.
+ const static struct DivRemEntry {
+ // The following portion depends only on the data type.
+ const TargetRegisterClass *RC;
+ unsigned LowInReg; // low part of the register pair
+ unsigned HighInReg; // high part of the register pair
+ // The following portion depends on both the data type and the operation.
+ struct DivRemResult {
+ unsigned OpDivRem; // The specific DIV/IDIV opcode to use.
+ unsigned OpSignExtend; // Opcode for sign-extending lowreg into
+ // highreg, or copying a zero into highreg.
+ unsigned OpCopy; // Opcode for copying dividend into lowreg, or
+ // zero/sign-extending into lowreg for i8.
+ unsigned DivRemResultReg; // Register containing the desired result.
+ bool IsOpSigned; // Whether to use signed or unsigned form.
+ } ResultTable[NumOps];
+ } OpTable[NumTypes] = {
+ { &X86::GR8RegClass, X86::AX, 0, {
+ { X86::IDIV8r, 0, X86::MOVSX16rr8, X86::AL, S }, // SDiv
+ { X86::IDIV8r, 0, X86::MOVSX16rr8, X86::AH, S }, // SRem
+ { X86::DIV8r, 0, X86::MOVZX16rr8, X86::AL, U }, // UDiv
+ { X86::DIV8r, 0, X86::MOVZX16rr8, X86::AH, U }, // URem
+ }
+ }, // i8
+ { &X86::GR16RegClass, X86::AX, X86::DX, {
+ { X86::IDIV16r, X86::CWD, Copy, X86::AX, S }, // SDiv
+ { X86::IDIV16r, X86::CWD, Copy, X86::DX, S }, // SRem
+ { X86::DIV16r, X86::MOV32r0, Copy, X86::AX, U }, // UDiv
+ { X86::DIV16r, X86::MOV32r0, Copy, X86::DX, U }, // URem
+ }
+ }, // i16
+ { &X86::GR32RegClass, X86::EAX, X86::EDX, {
+ { X86::IDIV32r, X86::CDQ, Copy, X86::EAX, S }, // SDiv
+ { X86::IDIV32r, X86::CDQ, Copy, X86::EDX, S }, // SRem
+ { X86::DIV32r, X86::MOV32r0, Copy, X86::EAX, U }, // UDiv
+ { X86::DIV32r, X86::MOV32r0, Copy, X86::EDX, U }, // URem
+ }
+ }, // i32
+ { &X86::GR64RegClass, X86::RAX, X86::RDX, {
+ { X86::IDIV64r, X86::CQO, Copy, X86::RAX, S }, // SDiv
+ { X86::IDIV64r, X86::CQO, Copy, X86::RDX, S }, // SRem
+ { X86::DIV64r, X86::MOV32r0, Copy, X86::RAX, U }, // UDiv
+ { X86::DIV64r, X86::MOV32r0, Copy, X86::RDX, U }, // URem
+ }
+ }, // i64
+ };
+
+ MVT VT;
+ if (!isTypeLegal(I->getType(), VT))
+ return false;
+
+ unsigned TypeIndex, OpIndex;
+ switch (VT.SimpleTy) {
+ default: return false;
+ case MVT::i8: TypeIndex = 0; break;
+ case MVT::i16: TypeIndex = 1; break;
+ case MVT::i32: TypeIndex = 2; break;
+ case MVT::i64: TypeIndex = 3;
+ if (!Subtarget->is64Bit())
+ return false;
+ break;
+ }
+
+ switch (I->getOpcode()) {
+ default: llvm_unreachable("Unexpected div/rem opcode");
+ case Instruction::SDiv: OpIndex = 0; break;
+ case Instruction::SRem: OpIndex = 1; break;
+ case Instruction::UDiv: OpIndex = 2; break;
+ case Instruction::URem: OpIndex = 3; break;
+ }
+
+ const DivRemEntry &TypeEntry = OpTable[TypeIndex];
+ const DivRemEntry::DivRemResult &OpEntry = TypeEntry.ResultTable[OpIndex];
+ unsigned Op0Reg = getRegForValue(I->getOperand(0));
+ if (Op0Reg == 0)
+ return false;
+ unsigned Op1Reg = getRegForValue(I->getOperand(1));
+ if (Op1Reg == 0)
+ return false;
+
+ // Move op0 into low-order input register.
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(OpEntry.OpCopy), TypeEntry.LowInReg).addReg(Op0Reg);
+ // Zero-extend or sign-extend into high-order input register.
+ if (OpEntry.OpSignExtend) {
+ if (OpEntry.IsOpSigned)
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(OpEntry.OpSignExtend));
+ else {
+ unsigned Zero32 = createResultReg(&X86::GR32RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(X86::MOV32r0), Zero32);
+
+ // Copy the zero into the appropriate sub/super/identical physical
+ // register. Unfortunately the operations needed are not uniform enough
+ // to fit neatly into the table above.
+ if (VT.SimpleTy == MVT::i16) {
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(Copy), TypeEntry.HighInReg)
+ .addReg(Zero32, 0, X86::sub_16bit);
+ } else if (VT.SimpleTy == MVT::i32) {
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(Copy), TypeEntry.HighInReg)
+ .addReg(Zero32);
+ } else if (VT.SimpleTy == MVT::i64) {
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::SUBREG_TO_REG), TypeEntry.HighInReg)
+ .addImm(0).addReg(Zero32).addImm(X86::sub_32bit);
+ }
+ }
+ }
+ // Generate the DIV/IDIV instruction.
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(OpEntry.OpDivRem)).addReg(Op1Reg);
+ // For i8 remainder, we can't reference AH directly, as we'll end
+ // up with bogus copies like %R9B = COPY %AH. Reference AX
+ // instead to prevent AH references in a REX instruction.
+ //
+ // The current assumption of the fast register allocator is that isel
+ // won't generate explicit references to the GPR8_NOREX registers. If
+ // the allocator and/or the backend get enhanced to be more robust in
+ // that regard, this can be, and should be, removed.
+ unsigned ResultReg = 0;
+ if ((I->getOpcode() == Instruction::SRem ||
+ I->getOpcode() == Instruction::URem) &&
+ OpEntry.DivRemResultReg == X86::AH && Subtarget->is64Bit()) {
+ unsigned SourceSuperReg = createResultReg(&X86::GR16RegClass);
+ unsigned ResultSuperReg = createResultReg(&X86::GR16RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(Copy), SourceSuperReg).addReg(X86::AX);
+
+ // Shift AX right by 8 bits instead of using AH.
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SHR16ri),
+ ResultSuperReg).addReg(SourceSuperReg).addImm(8);
+
+ // Now reference the 8-bit subreg of the result.
+ ResultReg = fastEmitInst_extractsubreg(MVT::i8, ResultSuperReg,
+ /*Kill=*/true, X86::sub_8bit);
+ }
+ // Copy the result out of the physreg if we haven't already.
+ if (!ResultReg) {
+ ResultReg = createResultReg(TypeEntry.RC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Copy), ResultReg)
+ .addReg(OpEntry.DivRemResultReg);
+ }
+ updateValueMap(I, ResultReg);
+
+ return true;
+}
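+
+// For example, 'urem i8 %a, %b' on x86-64 is emitted roughly as (register
+// names illustrative):
+//   movzbw %dl, %ax    ; zero-extend the dividend into AX
+//   divb   %bl         ; AL = quotient, AH = remainder
+//   movw   %ax, %cx
+//   shrw   $8, %cx     ; remainder ends up in CL without ever naming AH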
+
+/// \brief Emit a conditional move instruction (if they are supported) to
+/// lower the select.
+bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) {
+ // Check if the subtarget supports these instructions.
+ if (!Subtarget->hasCMov())
+ return false;
+
+ // FIXME: Add support for i8.
+ if (RetVT < MVT::i16 || RetVT > MVT::i64)
+ return false;
+
+ const Value *Cond = I->getOperand(0);
+ const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
+ bool NeedTest = true;
+ X86::CondCode CC = X86::COND_NE;
+
+ // Optimize conditions coming from a compare if both instructions are in the
+ // same basic block (values defined in other basic blocks may not have
+ // initialized registers).
+ const auto *CI = dyn_cast<CmpInst>(Cond);
+ if (CI && (CI->getParent() == I->getParent())) {
+ CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
+
+ // FCMP_OEQ and FCMP_UNE cannot be checked with a single instruction.
+ static const unsigned SETFOpcTable[2][3] = {
+ { X86::SETNPr, X86::SETEr, X86::TEST8rr },
+ { X86::SETPr, X86::SETNEr, X86::OR8rr }
+ };
+ const unsigned *SETFOpc = nullptr;
+ switch (Predicate) {
+ default: break;
+ case CmpInst::FCMP_OEQ:
+ SETFOpc = &SETFOpcTable[0][0];
+ Predicate = CmpInst::ICMP_NE;
+ break;
+ case CmpInst::FCMP_UNE:
+ SETFOpc = &SETFOpcTable[1][0];
+ Predicate = CmpInst::ICMP_NE;
+ break;
+ }
+
+ bool NeedSwap;
+ std::tie(CC, NeedSwap) = getX86ConditionCode(Predicate);
+ assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");
+
+ const Value *CmpLHS = CI->getOperand(0);
+ const Value *CmpRHS = CI->getOperand(1);
+ if (NeedSwap)
+ std::swap(CmpLHS, CmpRHS);
+
+ EVT CmpVT = TLI.getValueType(DL, CmpLHS->getType());
+ // Emit a compare of the LHS and RHS, setting the flags.
+ if (!X86FastEmitCompare(CmpLHS, CmpRHS, CmpVT, CI->getDebugLoc()))
+ return false;
+
+ if (SETFOpc) {
+ unsigned FlagReg1 = createResultReg(&X86::GR8RegClass);
+ unsigned FlagReg2 = createResultReg(&X86::GR8RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[0]),
+ FlagReg1);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[1]),
+ FlagReg2);
+ auto const &II = TII.get(SETFOpc[2]);
+ if (II.getNumDefs()) {
+ unsigned TmpReg = createResultReg(&X86::GR8RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, TmpReg)
+ .addReg(FlagReg2).addReg(FlagReg1);
+ } else {
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
+ .addReg(FlagReg2).addReg(FlagReg1);
+ }
+ }
+ NeedTest = false;
+ } else if (foldX86XALUIntrinsic(CC, I, Cond)) {
+ // Fake-request the condition; otherwise the intrinsic might be completely
+ // optimized away.
+ unsigned TmpReg = getRegForValue(Cond);
+ if (TmpReg == 0)
+ return false;
+
+ NeedTest = false;
+ }
+
+ if (NeedTest) {
+ // Selects operate on i1; however, CondReg is 8 bits wide and may contain
+ // garbage. Indeed, only the least significant bit is supposed to be
+ // accurate. If we read more than the lsb, we may see non-zero values
+ // where the lsb is zero. Therefore, we have to truncate CondReg to i1 for
+ // the select. This is achieved by performing a TEST against 1.
+ unsigned CondReg = getRegForValue(Cond);
+ if (CondReg == 0)
+ return false;
+ bool CondIsKill = hasTrivialKill(Cond);
+
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri))
+ .addReg(CondReg, getKillRegState(CondIsKill)).addImm(1);
+ }
+
+ const Value *LHS = I->getOperand(1);
+ const Value *RHS = I->getOperand(2);
+
+ unsigned RHSReg = getRegForValue(RHS);
+ bool RHSIsKill = hasTrivialKill(RHS);
+
+ unsigned LHSReg = getRegForValue(LHS);
+ bool LHSIsKill = hasTrivialKill(LHS);
+
+ if (!LHSReg || !RHSReg)
+ return false;
+
+ unsigned Opc = X86::getCMovFromCond(CC, RC->getSize());
+ unsigned ResultReg = fastEmitInst_rr(Opc, RC, RHSReg, RHSIsKill,
+ LHSReg, LHSIsKill);
+ updateValueMap(I, ResultReg);
+ return true;
+}
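+
+// E.g. 'select i1 %c, i32 %x, i32 %y' with a plain i1 condition becomes
+// roughly:
+//   testb   $1, %cl         ; use only the low bit of the i8 condition
+//   cmovnel %x_reg, %y_reg  ; on NE (bit set) the result takes %x, else %y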
+
+/// \brief Emit SSE or AVX instructions to lower the select.
+///
+/// Try to use SSE1/SSE2 instructions to simulate a select without branches.
+/// This lowers fp selects into a CMP/AND/ANDN/OR sequence when the necessary
+/// SSE instructions are available. If AVX is available, try to use a VBLENDV.
+bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) {
+ // Optimize conditions coming from a compare if both instructions are in the
+ // same basic block (values defined in other basic blocks may not have
+ // initialized registers).
+ const auto *CI = dyn_cast<FCmpInst>(I->getOperand(0));
+ if (!CI || (CI->getParent() != I->getParent()))
+ return false;
+
+ if (I->getType() != CI->getOperand(0)->getType() ||
+ !((Subtarget->hasSSE1() && RetVT == MVT::f32) ||
+ (Subtarget->hasSSE2() && RetVT == MVT::f64)))
+ return false;
+
+ const Value *CmpLHS = CI->getOperand(0);
+ const Value *CmpRHS = CI->getOperand(1);
+ CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
+
+ // The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x, 0.0.
+ // We don't have to materialize a zero constant for this case and can just use
+ // %x again on the RHS.
+ if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) {
+ const auto *CmpRHSC = dyn_cast<ConstantFP>(CmpRHS);
+ if (CmpRHSC && CmpRHSC->isNullValue())
+ CmpRHS = CmpLHS;
+ }
+
+ unsigned CC;
+ bool NeedSwap;
+ std::tie(CC, NeedSwap) = getX86SSEConditionCode(Predicate);
+ if (CC > 7)
+ return false;
+
+ if (NeedSwap)
+ std::swap(CmpLHS, CmpRHS);
+
+ // Choose the SSE instruction sequence based on data type (float or double).
+ static const unsigned OpcTable[2][4] = {
+ { X86::CMPSSrr, X86::FsANDPSrr, X86::FsANDNPSrr, X86::FsORPSrr },
+ { X86::CMPSDrr, X86::FsANDPDrr, X86::FsANDNPDrr, X86::FsORPDrr }
+ };
+
+ const unsigned *Opc = nullptr;
+ switch (RetVT.SimpleTy) {
+ default: return false;
+ case MVT::f32: Opc = &OpcTable[0][0]; break;
+ case MVT::f64: Opc = &OpcTable[1][0]; break;
+ }
+
+ const Value *LHS = I->getOperand(1);
+ const Value *RHS = I->getOperand(2);
+
+ unsigned LHSReg = getRegForValue(LHS);
+ bool LHSIsKill = hasTrivialKill(LHS);
+
+ unsigned RHSReg = getRegForValue(RHS);
+ bool RHSIsKill = hasTrivialKill(RHS);
+
+ unsigned CmpLHSReg = getRegForValue(CmpLHS);
+ bool CmpLHSIsKill = hasTrivialKill(CmpLHS);
+
+ unsigned CmpRHSReg = getRegForValue(CmpRHS);
+ bool CmpRHSIsKill = hasTrivialKill(CmpRHS);
+
+ if (!LHSReg || !RHSReg || !CmpLHSReg || !CmpRHSReg)
+ return false;
+
+ const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
+ unsigned ResultReg;
+
+ if (Subtarget->hasAVX()) {
+ const TargetRegisterClass *FR32 = &X86::FR32RegClass;
+ const TargetRegisterClass *VR128 = &X86::VR128RegClass;
+
+ // If we have AVX, create 1 blendv instead of 3 logic instructions.
+ // Blendv was introduced with SSE 4.1, but the 2 register form implicitly
+ // uses XMM0 as the selection register. That may need just as many
+ // instructions as the AND/ANDN/OR sequence due to register moves, so
+ // don't bother.
+ unsigned CmpOpcode =
+ (RetVT.SimpleTy == MVT::f32) ? X86::VCMPSSrr : X86::VCMPSDrr;
+ unsigned BlendOpcode =
+ (RetVT.SimpleTy == MVT::f32) ? X86::VBLENDVPSrr : X86::VBLENDVPDrr;
+
+ unsigned CmpReg = fastEmitInst_rri(CmpOpcode, FR32, CmpLHSReg, CmpLHSIsKill,
+ CmpRHSReg, CmpRHSIsKill, CC);
+ unsigned VBlendReg = fastEmitInst_rrr(BlendOpcode, VR128, RHSReg, RHSIsKill,
+ LHSReg, LHSIsKill, CmpReg, true);
+ ResultReg = createResultReg(RC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), ResultReg).addReg(VBlendReg);
+ } else {
+ unsigned CmpReg = fastEmitInst_rri(Opc[0], RC, CmpLHSReg, CmpLHSIsKill,
+ CmpRHSReg, CmpRHSIsKill, CC);
+ unsigned AndReg = fastEmitInst_rr(Opc[1], RC, CmpReg, /*IsKill=*/false,
+ LHSReg, LHSIsKill);
+ unsigned AndNReg = fastEmitInst_rr(Opc[2], RC, CmpReg, /*IsKill=*/true,
+ RHSReg, RHSIsKill);
+ ResultReg = fastEmitInst_rr(Opc[3], RC, AndNReg, /*IsKill=*/true,
+ AndReg, /*IsKill=*/true);
+ }
+ updateValueMap(I, ResultReg);
+ return true;
+}
+
+bool X86FastISel::X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I) {
+ // These are pseudo CMOV instructions and will be later expanded into control-
+ // flow.
+ unsigned Opc;
+ switch (RetVT.SimpleTy) {
+ default: return false;
+ case MVT::i8: Opc = X86::CMOV_GR8; break;
+ case MVT::i16: Opc = X86::CMOV_GR16; break;
+ case MVT::i32: Opc = X86::CMOV_GR32; break;
+ case MVT::f32: Opc = X86::CMOV_FR32; break;
+ case MVT::f64: Opc = X86::CMOV_FR64; break;
+ }
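+  // Each pseudo is later expanded into a branch diamond (a conditional
+  // branch around a copy, joined at a merge block) rather than a real CMOV,
+  // which is why any of the legal types above can be handled here.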
+
+ const Value *Cond = I->getOperand(0);
+ X86::CondCode CC = X86::COND_NE;
+
+ // Optimize conditions coming from a compare if both instructions are in the
+ // same basic block (values defined in other basic blocks may not have
+ // initialized registers).
+ const auto *CI = dyn_cast<CmpInst>(Cond);
+ if (CI && (CI->getParent() == I->getParent())) {
+ bool NeedSwap;
+ std::tie(CC, NeedSwap) = getX86ConditionCode(CI->getPredicate());
+ if (CC > X86::LAST_VALID_COND)
+ return false;
+
+ const Value *CmpLHS = CI->getOperand(0);
+ const Value *CmpRHS = CI->getOperand(1);
+
+ if (NeedSwap)
+ std::swap(CmpLHS, CmpRHS);
+
+ EVT CmpVT = TLI.getValueType(DL, CmpLHS->getType());
+ if (!X86FastEmitCompare(CmpLHS, CmpRHS, CmpVT, CI->getDebugLoc()))
+ return false;
+ } else {
+ unsigned CondReg = getRegForValue(Cond);
+ if (CondReg == 0)
+ return false;
+ bool CondIsKill = hasTrivialKill(Cond);
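+    // No foldable compare: test bit 0 of the i1 condition directly, e.g.
+    // (illustrative) "testb $1, %cl", and select on the default COND_NE.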
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri))
+ .addReg(CondReg, getKillRegState(CondIsKill)).addImm(1);
+ }
+
+ const Value *LHS = I->getOperand(1);
+ const Value *RHS = I->getOperand(2);
+
+ unsigned LHSReg = getRegForValue(LHS);
+ bool LHSIsKill = hasTrivialKill(LHS);
+
+ unsigned RHSReg = getRegForValue(RHS);
+ bool RHSIsKill = hasTrivialKill(RHS);
+
+ if (!LHSReg || !RHSReg)
+ return false;
+
+ const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
+
+ unsigned ResultReg =
+ fastEmitInst_rri(Opc, RC, RHSReg, RHSIsKill, LHSReg, LHSIsKill, CC);
+ updateValueMap(I, ResultReg);
+ return true;
+}
+
+bool X86FastISel::X86SelectSelect(const Instruction *I) {
+ MVT RetVT;
+ if (!isTypeLegal(I->getType(), RetVT))
+ return false;
+
+ // Check if we can fold the select.
+ if (const auto *CI = dyn_cast<CmpInst>(I->getOperand(0))) {
+ CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
+ const Value *Opnd = nullptr;
+ switch (Predicate) {
+ default: break;
+ case CmpInst::FCMP_FALSE: Opnd = I->getOperand(2); break;
+ case CmpInst::FCMP_TRUE: Opnd = I->getOperand(1); break;
+ }
+ // No need for a select anymore - this is an unconditional move.
+ if (Opnd) {
+ unsigned OpReg = getRegForValue(Opnd);
+ if (OpReg == 0)
+ return false;
+ bool OpIsKill = hasTrivialKill(Opnd);
+ const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
+ unsigned ResultReg = createResultReg(RC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), ResultReg)
+ .addReg(OpReg, getKillRegState(OpIsKill));
+ updateValueMap(I, ResultReg);
+ return true;
+ }
+ }
+
+ // First try to use real conditional move instructions.
+ if (X86FastEmitCMoveSelect(RetVT, I))
+ return true;
+
+ // Try to use a sequence of SSE instructions to simulate a conditional move.
+ if (X86FastEmitSSESelect(RetVT, I))
+ return true;
+
+  // Fall back to pseudo conditional move instructions, which will be later
+ // converted to control-flow.
+ if (X86FastEmitPseudoSelect(RetVT, I))
+ return true;
+
+ return false;
+}
+
+bool X86FastISel::X86SelectSIToFP(const Instruction *I) {
+ // The target-independent selection algorithm in FastISel already knows how
+ // to select a SINT_TO_FP if the target is SSE but not AVX.
+ // Early exit if the subtarget doesn't have AVX.
+ if (!Subtarget->hasAVX())
+ return false;
+
+ if (!I->getOperand(0)->getType()->isIntegerTy(32))
+ return false;
+
+ // Select integer to float/double conversion.
+ unsigned OpReg = getRegForValue(I->getOperand(0));
+ if (OpReg == 0)
+ return false;
+
+ const TargetRegisterClass *RC = nullptr;
+ unsigned Opcode;
+
+ if (I->getType()->isDoubleTy()) {
+ // sitofp int -> double
+ Opcode = X86::VCVTSI2SDrr;
+ RC = &X86::FR64RegClass;
+ } else if (I->getType()->isFloatTy()) {
+ // sitofp int -> float
+ Opcode = X86::VCVTSI2SSrr;
+ RC = &X86::FR32RegClass;
+ } else
+ return false;
+
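+  // The AVX form of cvtsi2ss/cvtsi2sd takes an extra source register that
+  // supplies the upper vector lanes of the result; an IMPLICIT_DEF satisfies
+  // that operand without tying the conversion to any particular prior value.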
+ unsigned ImplicitDefReg = createResultReg(RC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg);
+ unsigned ResultReg =
+ fastEmitInst_rr(Opcode, RC, ImplicitDefReg, true, OpReg, false);
+ updateValueMap(I, ResultReg);
+ return true;
+}
+
+// Helper method used by X86SelectFPExt and X86SelectFPTrunc.
+bool X86FastISel::X86SelectFPExtOrFPTrunc(const Instruction *I,
+ unsigned TargetOpc,
+ const TargetRegisterClass *RC) {
+ assert((I->getOpcode() == Instruction::FPExt ||
+ I->getOpcode() == Instruction::FPTrunc) &&
+ "Instruction must be an FPExt or FPTrunc!");
+
+ unsigned OpReg = getRegForValue(I->getOperand(0));
+ if (OpReg == 0)
+ return false;
+
+ unsigned ResultReg = createResultReg(RC);
+ MachineInstrBuilder MIB;
+ MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpc),
+ ResultReg);
+ if (Subtarget->hasAVX())
+ MIB.addReg(OpReg);
+ MIB.addReg(OpReg);
+ updateValueMap(I, ResultReg);
+ return true;
+}
+
+bool X86FastISel::X86SelectFPExt(const Instruction *I) {
+ if (X86ScalarSSEf64 && I->getType()->isDoubleTy() &&
+ I->getOperand(0)->getType()->isFloatTy()) {
+ // fpext from float to double.
+ unsigned Opc = Subtarget->hasAVX() ? X86::VCVTSS2SDrr : X86::CVTSS2SDrr;
+ return X86SelectFPExtOrFPTrunc(I, Opc, &X86::FR64RegClass);
+ }
+
+ return false;
+}
+
+bool X86FastISel::X86SelectFPTrunc(const Instruction *I) {
+ if (X86ScalarSSEf64 && I->getType()->isFloatTy() &&
+ I->getOperand(0)->getType()->isDoubleTy()) {
+ // fptrunc from double to float.
+ unsigned Opc = Subtarget->hasAVX() ? X86::VCVTSD2SSrr : X86::CVTSD2SSrr;
+ return X86SelectFPExtOrFPTrunc(I, Opc, &X86::FR32RegClass);
+ }
+
+ return false;
+}
+
+bool X86FastISel::X86SelectTrunc(const Instruction *I) {
+ EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType());
+ EVT DstVT = TLI.getValueType(DL, I->getType());
+
+ // This code only handles truncation to byte.
+ if (DstVT != MVT::i8 && DstVT != MVT::i1)
+ return false;
+ if (!TLI.isTypeLegal(SrcVT))
+ return false;
+
+ unsigned InputReg = getRegForValue(I->getOperand(0));
+ if (!InputReg)
+ // Unhandled operand. Halt "fast" selection and bail.
+ return false;
+
+ if (SrcVT == MVT::i8) {
+ // Truncate from i8 to i1; no code needed.
+ updateValueMap(I, InputReg);
+ return true;
+ }
+
+ bool KillInputReg = false;
+ if (!Subtarget->is64Bit()) {
+    // If we're on x86-32, we can't extract an i8 from a general register.
+ // First issue a copy to GR16_ABCD or GR32_ABCD.
+ const TargetRegisterClass *CopyRC =
+ (SrcVT == MVT::i16) ? &X86::GR16_ABCDRegClass : &X86::GR32_ABCDRegClass;
+ unsigned CopyReg = createResultReg(CopyRC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), CopyReg).addReg(InputReg);
+ InputReg = CopyReg;
+ KillInputReg = true;
+ }
+
+ // Issue an extract_subreg.
+ unsigned ResultReg = fastEmitInst_extractsubreg(MVT::i8,
+ InputReg, KillInputReg,
+ X86::sub_8bit);
+ if (!ResultReg)
+ return false;
+
+ updateValueMap(I, ResultReg);
+ return true;
+}
+
+bool X86FastISel::IsMemcpySmall(uint64_t Len) {
+ return Len <= (Subtarget->is64Bit() ? 32 : 16);
+}
+
+bool X86FastISel::TryEmitSmallMemcpy(X86AddressMode DestAM,
+ X86AddressMode SrcAM, uint64_t Len) {
+
+ // Make sure we don't bloat code by inlining very large memcpy's.
+ if (!IsMemcpySmall(Len))
+ return false;
+
+ bool i64Legal = Subtarget->is64Bit();
+
+ // We don't care about alignment here since we just emit integer accesses.
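+  // The loop peels off the widest legal integer access first, e.g.
+  // (illustrative) a 13-byte copy on x86-64 becomes one i64, one i32 and
+  // one i8 load/store pair (8 + 4 + 1 bytes).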
+ while (Len) {
+ MVT VT;
+ if (Len >= 8 && i64Legal)
+ VT = MVT::i64;
+ else if (Len >= 4)
+ VT = MVT::i32;
+ else if (Len >= 2)
+ VT = MVT::i16;
+ else
+ VT = MVT::i8;
+
+ unsigned Reg;
+ bool RV = X86FastEmitLoad(VT, SrcAM, nullptr, Reg);
+ RV &= X86FastEmitStore(VT, Reg, /*Kill=*/true, DestAM);
+ assert(RV && "Failed to emit load or store??");
+
+ unsigned Size = VT.getSizeInBits()/8;
+ Len -= Size;
+ DestAM.Disp += Size;
+ SrcAM.Disp += Size;
+ }
+
+ return true;
+}
+
+bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
+ // FIXME: Handle more intrinsics.
+ switch (II->getIntrinsicID()) {
+ default: return false;
+ case Intrinsic::convert_from_fp16:
+ case Intrinsic::convert_to_fp16: {
+ if (Subtarget->useSoftFloat() || !Subtarget->hasF16C())
+ return false;
+
+ const Value *Op = II->getArgOperand(0);
+ unsigned InputReg = getRegForValue(Op);
+ if (InputReg == 0)
+ return false;
+
+ // F16C only allows converting from float to half and from half to float.
+ bool IsFloatToHalf = II->getIntrinsicID() == Intrinsic::convert_to_fp16;
+ if (IsFloatToHalf) {
+ if (!Op->getType()->isFloatTy())
+ return false;
+ } else {
+ if (!II->getType()->isFloatTy())
+ return false;
+ }
+
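+    // The float->half path below is roughly (illustrative):
+    //   vcvtps2ph $0, %xmm0, %xmm0 ; vmovd %xmm0, %eax   ; result in %ax
+    // and the half->float path roughly:
+    //   movswl %ax, %eax ; vmovd %eax, %xmm0 ; vcvtph2ps %xmm0, %xmm0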
+ unsigned ResultReg = 0;
+ const TargetRegisterClass *RC = TLI.getRegClassFor(MVT::v8i16);
+ if (IsFloatToHalf) {
+ // 'InputReg' is implicitly promoted from register class FR32 to
+ // register class VR128 by method 'constrainOperandRegClass' which is
+ // directly called by 'fastEmitInst_ri'.
+ // Instruction VCVTPS2PHrr takes an extra immediate operand which is
+ // used to provide rounding control.
+ InputReg = fastEmitInst_ri(X86::VCVTPS2PHrr, RC, InputReg, false, 0);
+
+      // Move the lower 32 bits of the conversion result into a GR32 register.
+ ResultReg = createResultReg(&X86::GR32RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(X86::VMOVPDI2DIrr), ResultReg)
+ .addReg(InputReg, RegState::Kill);
+
+ // The result value is in the lower 16-bits of ResultReg.
+ unsigned RegIdx = X86::sub_16bit;
+ ResultReg = fastEmitInst_extractsubreg(MVT::i16, ResultReg, true, RegIdx);
+ } else {
+ assert(Op->getType()->isIntegerTy(16) && "Expected a 16-bit integer!");
+ // Explicitly sign-extend the input to 32-bit.
+ InputReg = fastEmit_r(MVT::i16, MVT::i32, ISD::SIGN_EXTEND, InputReg,
+ /*Kill=*/false);
+
+ // The following SCALAR_TO_VECTOR will be expanded into a VMOVDI2PDIrr.
+ InputReg = fastEmit_r(MVT::i32, MVT::v4i32, ISD::SCALAR_TO_VECTOR,
+ InputReg, /*Kill=*/true);
+
+ InputReg = fastEmitInst_r(X86::VCVTPH2PSrr, RC, InputReg, /*Kill=*/true);
+
+ // The result value is in the lower 32-bits of ResultReg.
+ // Emit an explicit copy from register class VR128 to register class FR32.
+ ResultReg = createResultReg(&X86::FR32RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), ResultReg)
+ .addReg(InputReg, RegState::Kill);
+ }
+
+ updateValueMap(II, ResultReg);
+ return true;
+ }
+ case Intrinsic::frameaddress: {
+ MachineFunction *MF = FuncInfo.MF;
+ if (MF->getTarget().getMCAsmInfo()->usesWindowsCFI())
+ return false;
+
+ Type *RetTy = II->getCalledFunction()->getReturnType();
+
+ MVT VT;
+ if (!isTypeLegal(RetTy, VT))
+ return false;
+
+ unsigned Opc;
+ const TargetRegisterClass *RC = nullptr;
+
+ switch (VT.SimpleTy) {
+ default: llvm_unreachable("Invalid result type for frameaddress.");
+ case MVT::i32: Opc = X86::MOV32rm; RC = &X86::GR32RegClass; break;
+ case MVT::i64: Opc = X86::MOV64rm; RC = &X86::GR64RegClass; break;
+ }
+
+ // This needs to be set before we call getPtrSizedFrameRegister, otherwise
+ // we get the wrong frame register.
+ MachineFrameInfo *MFI = MF->getFrameInfo();
+ MFI->setFrameAddressIsTaken(true);
+
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
+ unsigned FrameReg = RegInfo->getPtrSizedFrameRegister(*MF);
+ assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
+ (FrameReg == X86::EBP && VT == MVT::i32)) &&
+ "Invalid Frame Register!");
+
+    // Always make a copy of the frame register to a vreg first, so that we
+ // never directly reference the frame register (the TwoAddressInstruction-
+ // Pass doesn't like that).
+ unsigned SrcReg = createResultReg(RC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), SrcReg).addReg(FrameReg);
+
+ // Now recursively load from the frame address.
+ // movq (%rbp), %rax
+ // movq (%rax), %rax
+ // movq (%rax), %rax
+ // ...
+ unsigned DestReg;
+ unsigned Depth = cast<ConstantInt>(II->getOperand(0))->getZExtValue();
+ while (Depth--) {
+ DestReg = createResultReg(RC);
+ addDirectMem(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(Opc), DestReg), SrcReg);
+ SrcReg = DestReg;
+ }
+
+ updateValueMap(II, SrcReg);
+ return true;
+ }
+ case Intrinsic::memcpy: {
+ const MemCpyInst *MCI = cast<MemCpyInst>(II);
+ // Don't handle volatile or variable length memcpys.
+ if (MCI->isVolatile())
+ return false;
+
+ if (isa<ConstantInt>(MCI->getLength())) {
+ // Small memcpy's are common enough that we want to do them
+ // without a call if possible.
+ uint64_t Len = cast<ConstantInt>(MCI->getLength())->getZExtValue();
+ if (IsMemcpySmall(Len)) {
+ X86AddressMode DestAM, SrcAM;
+ if (!X86SelectAddress(MCI->getRawDest(), DestAM) ||
+ !X86SelectAddress(MCI->getRawSource(), SrcAM))
+ return false;
+ TryEmitSmallMemcpy(DestAM, SrcAM, Len);
+ return true;
+ }
+ }
+
+ unsigned SizeWidth = Subtarget->is64Bit() ? 64 : 32;
+ if (!MCI->getLength()->getType()->isIntegerTy(SizeWidth))
+ return false;
+
+ if (MCI->getSourceAddressSpace() > 255 || MCI->getDestAddressSpace() > 255)
+ return false;
+
+ return lowerCallTo(II, "memcpy", II->getNumArgOperands() - 2);
+ }
+ case Intrinsic::memset: {
+ const MemSetInst *MSI = cast<MemSetInst>(II);
+
+ if (MSI->isVolatile())
+ return false;
+
+ unsigned SizeWidth = Subtarget->is64Bit() ? 64 : 32;
+ if (!MSI->getLength()->getType()->isIntegerTy(SizeWidth))
+ return false;
+
+ if (MSI->getDestAddressSpace() > 255)
+ return false;
+
+ return lowerCallTo(II, "memset", II->getNumArgOperands() - 2);
+ }
+ case Intrinsic::stackprotector: {
+ // Emit code to store the stack guard onto the stack.
+ EVT PtrTy = TLI.getPointerTy(DL);
+
+ const Value *Op1 = II->getArgOperand(0); // The guard's value.
+ const AllocaInst *Slot = cast<AllocaInst>(II->getArgOperand(1));
+
+ MFI.setStackProtectorIndex(FuncInfo.StaticAllocaMap[Slot]);
+
+ // Grab the frame index.
+ X86AddressMode AM;
+ if (!X86SelectAddress(Slot, AM)) return false;
+ if (!X86FastEmitStore(PtrTy, Op1, AM)) return false;
+ return true;
+ }
+ case Intrinsic::dbg_declare: {
+ const DbgDeclareInst *DI = cast<DbgDeclareInst>(II);
+ X86AddressMode AM;
+ assert(DI->getAddress() && "Null address should be checked earlier!");
+ if (!X86SelectAddress(DI->getAddress(), AM))
+ return false;
+ const MCInstrDesc &II = TII.get(TargetOpcode::DBG_VALUE);
+ // FIXME may need to add RegState::Debug to any registers produced,
+ // although ESP/EBP should be the only ones at the moment.
+ assert(DI->getVariable()->isValidLocationForIntrinsic(DbgLoc) &&
+ "Expected inlined-at fields to agree");
+ addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II), AM)
+ .addImm(0)
+ .addMetadata(DI->getVariable())
+ .addMetadata(DI->getExpression());
+ return true;
+ }
+ case Intrinsic::trap: {
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TRAP));
+ return true;
+ }
+ case Intrinsic::sqrt: {
+ if (!Subtarget->hasSSE1())
+ return false;
+
+ Type *RetTy = II->getCalledFunction()->getReturnType();
+
+ MVT VT;
+ if (!isTypeLegal(RetTy, VT))
+ return false;
+
+ // Unfortunately we can't use fastEmit_r, because the AVX version of FSQRT
+ // is not generated by FastISel yet.
+ // FIXME: Update this code once tablegen can handle it.
+ static const unsigned SqrtOpc[2][2] = {
+ {X86::SQRTSSr, X86::VSQRTSSr},
+ {X86::SQRTSDr, X86::VSQRTSDr}
+ };
+ bool HasAVX = Subtarget->hasAVX();
+ unsigned Opc;
+ const TargetRegisterClass *RC;
+ switch (VT.SimpleTy) {
+ default: return false;
+ case MVT::f32: Opc = SqrtOpc[0][HasAVX]; RC = &X86::FR32RegClass; break;
+ case MVT::f64: Opc = SqrtOpc[1][HasAVX]; RC = &X86::FR64RegClass; break;
+ }
+
+ const Value *SrcVal = II->getArgOperand(0);
+ unsigned SrcReg = getRegForValue(SrcVal);
+
+ if (SrcReg == 0)
+ return false;
+
+ unsigned ImplicitDefReg = 0;
+ if (HasAVX) {
+ ImplicitDefReg = createResultReg(RC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg);
+ }
+
+ unsigned ResultReg = createResultReg(RC);
+ MachineInstrBuilder MIB;
+ MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc),
+ ResultReg);
+
+ if (ImplicitDefReg)
+ MIB.addReg(ImplicitDefReg);
+
+ MIB.addReg(SrcReg);
+
+ updateValueMap(II, ResultReg);
+ return true;
+ }
+ case Intrinsic::sadd_with_overflow:
+ case Intrinsic::uadd_with_overflow:
+ case Intrinsic::ssub_with_overflow:
+ case Intrinsic::usub_with_overflow:
+ case Intrinsic::smul_with_overflow:
+ case Intrinsic::umul_with_overflow: {
+ // This implements the basic lowering of the xalu with overflow intrinsics
+ // into add/sub/mul followed by either seto or setb.
+ const Function *Callee = II->getCalledFunction();
+ auto *Ty = cast<StructType>(Callee->getReturnType());
+ Type *RetTy = Ty->getTypeAtIndex(0U);
+ Type *CondTy = Ty->getTypeAtIndex(1);
+
+ MVT VT;
+ if (!isTypeLegal(RetTy, VT))
+ return false;
+
+ if (VT < MVT::i8 || VT > MVT::i64)
+ return false;
+
+ const Value *LHS = II->getArgOperand(0);
+ const Value *RHS = II->getArgOperand(1);
+
+ // Canonicalize immediate to the RHS.
+ if (isa<ConstantInt>(LHS) && !isa<ConstantInt>(RHS) &&
+ isCommutativeIntrinsic(II))
+ std::swap(LHS, RHS);
+
+ bool UseIncDec = false;
+ if (isa<ConstantInt>(RHS) && cast<ConstantInt>(RHS)->isOne())
+ UseIncDec = true;
+
+ unsigned BaseOpc, CondOpc;
+ switch (II->getIntrinsicID()) {
+ default: llvm_unreachable("Unexpected intrinsic!");
+ case Intrinsic::sadd_with_overflow:
+ BaseOpc = UseIncDec ? unsigned(X86ISD::INC) : unsigned(ISD::ADD);
+ CondOpc = X86::SETOr;
+ break;
+ case Intrinsic::uadd_with_overflow:
+ BaseOpc = ISD::ADD; CondOpc = X86::SETBr; break;
+ case Intrinsic::ssub_with_overflow:
+ BaseOpc = UseIncDec ? unsigned(X86ISD::DEC) : unsigned(ISD::SUB);
+ CondOpc = X86::SETOr;
+ break;
+ case Intrinsic::usub_with_overflow:
+ BaseOpc = ISD::SUB; CondOpc = X86::SETBr; break;
+ case Intrinsic::smul_with_overflow:
+ BaseOpc = X86ISD::SMUL; CondOpc = X86::SETOr; break;
+ case Intrinsic::umul_with_overflow:
+ BaseOpc = X86ISD::UMUL; CondOpc = X86::SETOr; break;
+ }
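+
+    // e.g. (illustrative) llvm.sadd.with.overflow.i32 lowers to an ADD32rr
+    // (or INC32r for a constant-1 RHS) followed by SETOr into the second
+    // result register.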
+
+ unsigned LHSReg = getRegForValue(LHS);
+ if (LHSReg == 0)
+ return false;
+ bool LHSIsKill = hasTrivialKill(LHS);
+
+ unsigned ResultReg = 0;
+ // Check if we have an immediate version.
+ if (const auto *CI = dyn_cast<ConstantInt>(RHS)) {
+ static const unsigned Opc[2][4] = {
+ { X86::INC8r, X86::INC16r, X86::INC32r, X86::INC64r },
+ { X86::DEC8r, X86::DEC16r, X86::DEC32r, X86::DEC64r }
+ };
+
+ if (BaseOpc == X86ISD::INC || BaseOpc == X86ISD::DEC) {
+ ResultReg = createResultReg(TLI.getRegClassFor(VT));
+ bool IsDec = BaseOpc == X86ISD::DEC;
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(Opc[IsDec][VT.SimpleTy-MVT::i8]), ResultReg)
+ .addReg(LHSReg, getKillRegState(LHSIsKill));
+ } else
+ ResultReg = fastEmit_ri(VT, VT, BaseOpc, LHSReg, LHSIsKill,
+ CI->getZExtValue());
+ }
+
+ unsigned RHSReg;
+ bool RHSIsKill;
+ if (!ResultReg) {
+ RHSReg = getRegForValue(RHS);
+ if (RHSReg == 0)
+ return false;
+ RHSIsKill = hasTrivialKill(RHS);
+ ResultReg = fastEmit_rr(VT, VT, BaseOpc, LHSReg, LHSIsKill, RHSReg,
+ RHSIsKill);
+ }
+
+ // FastISel doesn't have a pattern for all X86::MUL*r and X86::IMUL*r. Emit
+ // it manually.
+ if (BaseOpc == X86ISD::UMUL && !ResultReg) {
+ static const unsigned MULOpc[] =
+ { X86::MUL8r, X86::MUL16r, X86::MUL32r, X86::MUL64r };
+ static const unsigned Reg[] = { X86::AL, X86::AX, X86::EAX, X86::RAX };
+ // First copy the first operand into RAX, which is an implicit input to
+ // the X86::MUL*r instruction.
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), Reg[VT.SimpleTy-MVT::i8])
+ .addReg(LHSReg, getKillRegState(LHSIsKill));
+ ResultReg = fastEmitInst_r(MULOpc[VT.SimpleTy-MVT::i8],
+ TLI.getRegClassFor(VT), RHSReg, RHSIsKill);
+ } else if (BaseOpc == X86ISD::SMUL && !ResultReg) {
+ static const unsigned MULOpc[] =
+ { X86::IMUL8r, X86::IMUL16rr, X86::IMUL32rr, X86::IMUL64rr };
+ if (VT == MVT::i8) {
+ // Copy the first operand into AL, which is an implicit input to the
+ // X86::IMUL8r instruction.
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), X86::AL)
+ .addReg(LHSReg, getKillRegState(LHSIsKill));
+ ResultReg = fastEmitInst_r(MULOpc[0], TLI.getRegClassFor(VT), RHSReg,
+ RHSIsKill);
+ } else
+ ResultReg = fastEmitInst_rr(MULOpc[VT.SimpleTy-MVT::i8],
+ TLI.getRegClassFor(VT), LHSReg, LHSIsKill,
+ RHSReg, RHSIsKill);
+ }
+
+ if (!ResultReg)
+ return false;
+
+ unsigned ResultReg2 = FuncInfo.CreateRegs(CondTy);
+ assert((ResultReg+1) == ResultReg2 && "Nonconsecutive result registers.");
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CondOpc),
+ ResultReg2);
+
+ updateValueMap(II, ResultReg, 2);
+ return true;
+ }
+ case Intrinsic::x86_sse_cvttss2si:
+ case Intrinsic::x86_sse_cvttss2si64:
+ case Intrinsic::x86_sse2_cvttsd2si:
+ case Intrinsic::x86_sse2_cvttsd2si64: {
+ bool IsInputDouble;
+ switch (II->getIntrinsicID()) {
+ default: llvm_unreachable("Unexpected intrinsic.");
+ case Intrinsic::x86_sse_cvttss2si:
+ case Intrinsic::x86_sse_cvttss2si64:
+ if (!Subtarget->hasSSE1())
+ return false;
+ IsInputDouble = false;
+ break;
+ case Intrinsic::x86_sse2_cvttsd2si:
+ case Intrinsic::x86_sse2_cvttsd2si64:
+ if (!Subtarget->hasSSE2())
+ return false;
+ IsInputDouble = true;
+ break;
+ }
+
+ Type *RetTy = II->getCalledFunction()->getReturnType();
+ MVT VT;
+ if (!isTypeLegal(RetTy, VT))
+ return false;
+
+ static const unsigned CvtOpc[2][2][2] = {
+ { { X86::CVTTSS2SIrr, X86::VCVTTSS2SIrr },
+ { X86::CVTTSS2SI64rr, X86::VCVTTSS2SI64rr } },
+ { { X86::CVTTSD2SIrr, X86::VCVTTSD2SIrr },
+ { X86::CVTTSD2SI64rr, X86::VCVTTSD2SI64rr } }
+ };
+ bool HasAVX = Subtarget->hasAVX();
+ unsigned Opc;
+ switch (VT.SimpleTy) {
+ default: llvm_unreachable("Unexpected result type.");
+ case MVT::i32: Opc = CvtOpc[IsInputDouble][0][HasAVX]; break;
+ case MVT::i64: Opc = CvtOpc[IsInputDouble][1][HasAVX]; break;
+ }
+
+ // Check if we can fold insertelement instructions into the convert.
+ const Value *Op = II->getArgOperand(0);
+ while (auto *IE = dyn_cast<InsertElementInst>(Op)) {
+ const Value *Index = IE->getOperand(2);
+ if (!isa<ConstantInt>(Index))
+ break;
+ unsigned Idx = cast<ConstantInt>(Index)->getZExtValue();
+
+ if (Idx == 0) {
+ Op = IE->getOperand(1);
+ break;
+ }
+ Op = IE->getOperand(0);
+ }
+
+ unsigned Reg = getRegForValue(Op);
+ if (Reg == 0)
+ return false;
+
+ unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT));
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
+ .addReg(Reg);
+
+ updateValueMap(II, ResultReg);
+ return true;
+ }
+ }
+}
+
+bool X86FastISel::fastLowerArguments() {
+ if (!FuncInfo.CanLowerReturn)
+ return false;
+
+ const Function *F = FuncInfo.Fn;
+ if (F->isVarArg())
+ return false;
+
+ CallingConv::ID CC = F->getCallingConv();
+ if (CC != CallingConv::C)
+ return false;
+
+ if (Subtarget->isCallingConvWin64(CC))
+ return false;
+
+ if (!Subtarget->is64Bit())
+ return false;
+
+  // Only handle simple cases, i.e. up to six i32/i64 scalar arguments in GPRs
+  // and up to eight f32/f64 arguments in XMM registers.
+ unsigned GPRCnt = 0;
+ unsigned FPRCnt = 0;
+ unsigned Idx = 0;
+ for (auto const &Arg : F->args()) {
+ // The first argument is at index 1.
+ ++Idx;
+ if (F->getAttributes().hasAttribute(Idx, Attribute::ByVal) ||
+ F->getAttributes().hasAttribute(Idx, Attribute::InReg) ||
+ F->getAttributes().hasAttribute(Idx, Attribute::StructRet) ||
+ F->getAttributes().hasAttribute(Idx, Attribute::Nest))
+ return false;
+
+ Type *ArgTy = Arg.getType();
+ if (ArgTy->isStructTy() || ArgTy->isArrayTy() || ArgTy->isVectorTy())
+ return false;
+
+ EVT ArgVT = TLI.getValueType(DL, ArgTy);
+ if (!ArgVT.isSimple()) return false;
+ switch (ArgVT.getSimpleVT().SimpleTy) {
+ default: return false;
+ case MVT::i32:
+ case MVT::i64:
+ ++GPRCnt;
+ break;
+ case MVT::f32:
+ case MVT::f64:
+ if (!Subtarget->hasSSE1())
+ return false;
+ ++FPRCnt;
+ break;
+ }
+
+ if (GPRCnt > 6)
+ return false;
+
+ if (FPRCnt > 8)
+ return false;
+ }
+
+ static const MCPhysReg GPR32ArgRegs[] = {
+ X86::EDI, X86::ESI, X86::EDX, X86::ECX, X86::R8D, X86::R9D
+ };
+ static const MCPhysReg GPR64ArgRegs[] = {
+ X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8 , X86::R9
+ };
+ static const MCPhysReg XMMArgRegs[] = {
+ X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
+ X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
+ };
+
+ unsigned GPRIdx = 0;
+ unsigned FPRIdx = 0;
+ for (auto const &Arg : F->args()) {
+ MVT VT = TLI.getSimpleValueType(DL, Arg.getType());
+ const TargetRegisterClass *RC = TLI.getRegClassFor(VT);
+ unsigned SrcReg;
+ switch (VT.SimpleTy) {
+ default: llvm_unreachable("Unexpected value type.");
+ case MVT::i32: SrcReg = GPR32ArgRegs[GPRIdx++]; break;
+ case MVT::i64: SrcReg = GPR64ArgRegs[GPRIdx++]; break;
+ case MVT::f32: // fall-through
+ case MVT::f64: SrcReg = XMMArgRegs[FPRIdx++]; break;
+ }
+ unsigned DstReg = FuncInfo.MF->addLiveIn(SrcReg, RC);
+ // FIXME: Unfortunately it's necessary to emit a copy from the livein copy.
+ // Without this, EmitLiveInCopies may eliminate the livein if its only
+ // use is a bitcast (which isn't turned into an instruction).
+ unsigned ResultReg = createResultReg(RC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), ResultReg)
+ .addReg(DstReg, getKillRegState(true));
+ updateValueMap(&Arg, ResultReg);
+ }
+ return true;
+}
+
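+/// Compute how many bytes the callee pops off the stack on return. On 32-bit
+/// targets a callee normally pops the 4-byte hidden sret pointer, except for
+/// MSVCRT-style environments, fastcc/GHC/HiPE callees, and MCU targets, where
+/// nothing is popped.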
+static unsigned computeBytesPoppedByCallee(const X86Subtarget *Subtarget,
+ CallingConv::ID CC,
+ ImmutableCallSite *CS) {
+ if (Subtarget->is64Bit())
+ return 0;
+ if (Subtarget->getTargetTriple().isOSMSVCRT())
+ return 0;
+ if (CC == CallingConv::Fast || CC == CallingConv::GHC ||
+ CC == CallingConv::HiPE)
+ return 0;
+
+ if (CS)
+ if (CS->arg_empty() || !CS->paramHasAttr(1, Attribute::StructRet) ||
+ CS->paramHasAttr(1, Attribute::InReg) || Subtarget->isTargetMCU())
+ return 0;
+
+ return 4;
+}
+
+bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
+ auto &OutVals = CLI.OutVals;
+ auto &OutFlags = CLI.OutFlags;
+ auto &OutRegs = CLI.OutRegs;
+ auto &Ins = CLI.Ins;
+ auto &InRegs = CLI.InRegs;
+ CallingConv::ID CC = CLI.CallConv;
+ bool &IsTailCall = CLI.IsTailCall;
+ bool IsVarArg = CLI.IsVarArg;
+ const Value *Callee = CLI.Callee;
+ MCSymbol *Symbol = CLI.Symbol;
+
+ bool Is64Bit = Subtarget->is64Bit();
+ bool IsWin64 = Subtarget->isCallingConvWin64(CC);
+
+ // Handle only C, fastcc, and webkit_js calling conventions for now.
+ switch (CC) {
+ default: return false;
+ case CallingConv::C:
+ case CallingConv::Fast:
+ case CallingConv::WebKit_JS:
+ case CallingConv::X86_FastCall:
+ case CallingConv::X86_64_Win64:
+ case CallingConv::X86_64_SysV:
+ break;
+ }
+
+ // Allow SelectionDAG isel to handle tail calls.
+ if (IsTailCall)
+ return false;
+
+ // fastcc with -tailcallopt is intended to provide a guaranteed
+  // tail call optimization. FastISel doesn't know how to do that.
+ if (CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt)
+ return false;
+
+  // Don't know how to handle Win64 varargs yet. Nothing special is needed for
+  // x86-32, and the x86-64 (SysV) special handling is implemented below.
+ if (IsVarArg && IsWin64)
+ return false;
+
+ // Don't know about inalloca yet.
+ if (CLI.CS && CLI.CS->hasInAllocaArgument())
+ return false;
+
+ // Fast-isel doesn't know about callee-pop yet.
+ if (X86::isCalleePop(CC, Subtarget->is64Bit(), IsVarArg,
+ TM.Options.GuaranteedTailCallOpt))
+ return false;
+
+ SmallVector<MVT, 16> OutVTs;
+ SmallVector<unsigned, 16> ArgRegs;
+
+ // If this is a constant i1/i8/i16 argument, promote to i32 to avoid an extra
+ // instruction. This is safe because it is common to all FastISel supported
+ // calling conventions on x86.
+ for (int i = 0, e = OutVals.size(); i != e; ++i) {
+ Value *&Val = OutVals[i];
+ ISD::ArgFlagsTy Flags = OutFlags[i];
+ if (auto *CI = dyn_cast<ConstantInt>(Val)) {
+ if (CI->getBitWidth() < 32) {
+ if (Flags.isSExt())
+ Val = ConstantExpr::getSExt(CI, Type::getInt32Ty(CI->getContext()));
+ else
+ Val = ConstantExpr::getZExt(CI, Type::getInt32Ty(CI->getContext()));
+ }
+ }
+
+ // Passing bools around ends up doing a trunc to i1 and passing it.
+ // Codegen this as an argument + "and 1".
+ MVT VT;
+ auto *TI = dyn_cast<TruncInst>(Val);
+ unsigned ResultReg;
+ if (TI && TI->getType()->isIntegerTy(1) && CLI.CS &&
+ (TI->getParent() == CLI.CS->getInstruction()->getParent()) &&
+ TI->hasOneUse()) {
+ Value *PrevVal = TI->getOperand(0);
+ ResultReg = getRegForValue(PrevVal);
+
+ if (!ResultReg)
+ return false;
+
+ if (!isTypeLegal(PrevVal->getType(), VT))
+ return false;
+
+ ResultReg =
+ fastEmit_ri(VT, VT, ISD::AND, ResultReg, hasTrivialKill(PrevVal), 1);
+ } else {
+ if (!isTypeLegal(Val->getType(), VT))
+ return false;
+ ResultReg = getRegForValue(Val);
+ }
+
+ if (!ResultReg)
+ return false;
+
+ ArgRegs.push_back(ResultReg);
+ OutVTs.push_back(VT);
+ }
+
+ // Analyze operands of the call, assigning locations to each operand.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CC, IsVarArg, *FuncInfo.MF, ArgLocs, CLI.RetTy->getContext());
+
+ // Allocate shadow area for Win64
+ if (IsWin64)
+ CCInfo.AllocateStack(32, 8);
+
+ CCInfo.AnalyzeCallOperands(OutVTs, OutFlags, CC_X86);
+
+ // Get a count of how many bytes are to be pushed on the stack.
+ unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
+
+ // Issue CALLSEQ_START
+ unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackDown))
+ .addImm(NumBytes).addImm(0);
+
+ // Walk the register/memloc assignments, inserting copies/loads.
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign const &VA = ArgLocs[i];
+ const Value *ArgVal = OutVals[VA.getValNo()];
+ MVT ArgVT = OutVTs[VA.getValNo()];
+
+ if (ArgVT == MVT::x86mmx)
+ return false;
+
+ unsigned ArgReg = ArgRegs[VA.getValNo()];
+
+ // Promote the value if needed.
+ switch (VA.getLocInfo()) {
+ case CCValAssign::Full: break;
+ case CCValAssign::SExt: {
+ assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() &&
+ "Unexpected extend");
+ bool Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(), ArgReg,
+ ArgVT, ArgReg);
+ assert(Emitted && "Failed to emit a sext!"); (void)Emitted;
+ ArgVT = VA.getLocVT();
+ break;
+ }
+ case CCValAssign::ZExt: {
+ assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() &&
+ "Unexpected extend");
+ bool Emitted = X86FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(), ArgReg,
+ ArgVT, ArgReg);
+ assert(Emitted && "Failed to emit a zext!"); (void)Emitted;
+ ArgVT = VA.getLocVT();
+ break;
+ }
+ case CCValAssign::AExt: {
+ assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() &&
+ "Unexpected extend");
+ bool Emitted = X86FastEmitExtend(ISD::ANY_EXTEND, VA.getLocVT(), ArgReg,
+ ArgVT, ArgReg);
+ if (!Emitted)
+ Emitted = X86FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(), ArgReg,
+ ArgVT, ArgReg);
+ if (!Emitted)
+ Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(), ArgReg,
+ ArgVT, ArgReg);
+
+      assert(Emitted && "Failed to emit an aext!"); (void)Emitted;
+ ArgVT = VA.getLocVT();
+ break;
+ }
+ case CCValAssign::BCvt: {
+ ArgReg = fastEmit_r(ArgVT, VA.getLocVT(), ISD::BITCAST, ArgReg,
+ /*TODO: Kill=*/false);
+ assert(ArgReg && "Failed to emit a bitcast!");
+ ArgVT = VA.getLocVT();
+ break;
+ }
+ case CCValAssign::VExt:
+ // VExt has not been implemented, so this should be impossible to reach
+ // for now. However, fallback to Selection DAG isel once implemented.
+ return false;
+ case CCValAssign::AExtUpper:
+ case CCValAssign::SExtUpper:
+ case CCValAssign::ZExtUpper:
+ case CCValAssign::FPExt:
+ llvm_unreachable("Unexpected loc info!");
+ case CCValAssign::Indirect:
+ // FIXME: Indirect doesn't need extending, but fast-isel doesn't fully
+ // support this.
+ return false;
+ }
+
+ if (VA.isRegLoc()) {
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), VA.getLocReg()).addReg(ArgReg);
+ OutRegs.push_back(VA.getLocReg());
+ } else {
+ assert(VA.isMemLoc());
+
+ // Don't emit stores for undef values.
+ if (isa<UndefValue>(ArgVal))
+ continue;
+
+ unsigned LocMemOffset = VA.getLocMemOffset();
+ X86AddressMode AM;
+ AM.Base.Reg = RegInfo->getStackRegister();
+ AM.Disp = LocMemOffset;
+ ISD::ArgFlagsTy Flags = OutFlags[VA.getValNo()];
+ unsigned Alignment = DL.getABITypeAlignment(ArgVal->getType());
+ MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand(
+ MachinePointerInfo::getStack(*FuncInfo.MF, LocMemOffset),
+ MachineMemOperand::MOStore, ArgVT.getStoreSize(), Alignment);
+ if (Flags.isByVal()) {
+ X86AddressMode SrcAM;
+ SrcAM.Base.Reg = ArgReg;
+ if (!TryEmitSmallMemcpy(AM, SrcAM, Flags.getByValSize()))
+ return false;
+ } else if (isa<ConstantInt>(ArgVal) || isa<ConstantPointerNull>(ArgVal)) {
+ // If this is a really simple value, emit this with the Value* version
+ // of X86FastEmitStore. If it isn't simple, we don't want to do this,
+ // as it can cause us to reevaluate the argument.
+ if (!X86FastEmitStore(ArgVT, ArgVal, AM, MMO))
+ return false;
+ } else {
+ bool ValIsKill = hasTrivialKill(ArgVal);
+ if (!X86FastEmitStore(ArgVT, ArgReg, ValIsKill, AM, MMO))
+ return false;
+ }
+ }
+ }
+
+  // ELF / PIC requires the GOT pointer to be in the EBX register before any
+  // function call made via the PLT.
+ if (Subtarget->isPICStyleGOT()) {
+ unsigned Base = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), X86::EBX).addReg(Base);
+ }
+
+ if (Is64Bit && IsVarArg && !IsWin64) {
+ // From AMD64 ABI document:
+ // For calls that may call functions that use varargs or stdargs
+ // (prototype-less calls or calls to functions containing ellipsis (...) in
+ // the declaration) %al is used as hidden argument to specify the number
+ // of SSE registers used. The contents of %al do not need to match exactly
+  // the number of registers, but must be an upper bound on the number of SSE
+ // registers used and is in the range 0 - 8 inclusive.
+
+ // Count the number of XMM registers allocated.
+ static const MCPhysReg XMMArgRegs[] = {
+ X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
+ X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
+ };
+ unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
+ assert((Subtarget->hasSSE1() || !NumXMMRegs)
+ && "SSE registers cannot be used when SSE is disabled");
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV8ri),
+ X86::AL).addImm(NumXMMRegs);
+ }
+
+ // Materialize callee address in a register. FIXME: GV address can be
+ // handled with a CALLpcrel32 instead.
+ X86AddressMode CalleeAM;
+ if (!X86SelectCallAddress(Callee, CalleeAM))
+ return false;
+
+ unsigned CalleeOp = 0;
+ const GlobalValue *GV = nullptr;
+ if (CalleeAM.GV != nullptr) {
+ GV = CalleeAM.GV;
+ } else if (CalleeAM.Base.Reg != 0) {
+ CalleeOp = CalleeAM.Base.Reg;
+ } else
+ return false;
+
+ // Issue the call.
+ MachineInstrBuilder MIB;
+ if (CalleeOp) {
+ // Register-indirect call.
+ unsigned CallOpc = Is64Bit ? X86::CALL64r : X86::CALL32r;
+ MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CallOpc))
+ .addReg(CalleeOp);
+ } else {
+ // Direct call.
+ assert(GV && "Not a direct call");
+ unsigned CallOpc = Is64Bit ? X86::CALL64pcrel32 : X86::CALLpcrel32;
+
+ // See if we need any target-specific flags on the GV operand.
+ unsigned char OpFlags = 0;
+
+ // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
+    // external symbols must go through the PLT in PIC mode. If the symbol
+ // has hidden or protected visibility, or if it is static or local, then
+ // we don't need to use the PLT - we can directly call it.
+ if (Subtarget->isTargetELF() &&
+ TM.getRelocationModel() == Reloc::PIC_ &&
+ GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
+ OpFlags = X86II::MO_PLT;
+ } else if (Subtarget->isPICStyleStubAny() &&
+ !GV->isStrongDefinitionForLinker() &&
+ (!Subtarget->getTargetTriple().isMacOSX() ||
+ Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
+ // PC-relative references to external symbols should go through $stub,
+ // unless we're building with the leopard linker or later, which
+ // automatically synthesizes these stubs.
+ OpFlags = X86II::MO_DARWIN_STUB;
+ }
+
+ MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CallOpc));
+ if (Symbol)
+ MIB.addSym(Symbol, OpFlags);
+ else
+ MIB.addGlobalAddress(GV, 0, OpFlags);
+ }
+
+ // Add a register mask operand representing the call-preserved registers.
+ // Proper defs for return values will be added by setPhysRegsDeadExcept().
+ MIB.addRegMask(TRI.getCallPreservedMask(*FuncInfo.MF, CC));
+
+ // Add an implicit use GOT pointer in EBX.
+ if (Subtarget->isPICStyleGOT())
+ MIB.addReg(X86::EBX, RegState::Implicit);
+
+ if (Is64Bit && IsVarArg && !IsWin64)
+ MIB.addReg(X86::AL, RegState::Implicit);
+
+ // Add implicit physical register uses to the call.
+ for (auto Reg : OutRegs)
+ MIB.addReg(Reg, RegState::Implicit);
+
+ // Issue CALLSEQ_END
+ unsigned NumBytesForCalleeToPop =
+ computeBytesPoppedByCallee(Subtarget, CC, CLI.CS);
+ unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackUp))
+ .addImm(NumBytes).addImm(NumBytesForCalleeToPop);
+
+ // Now handle call return values.
+ SmallVector<CCValAssign, 16> RVLocs;
+ CCState CCRetInfo(CC, IsVarArg, *FuncInfo.MF, RVLocs,
+ CLI.RetTy->getContext());
+ CCRetInfo.AnalyzeCallResult(Ins, RetCC_X86);
+
+ // Copy all of the result registers out of their specified physreg.
+ unsigned ResultReg = FuncInfo.CreateRegs(CLI.RetTy);
+ for (unsigned i = 0; i != RVLocs.size(); ++i) {
+ CCValAssign &VA = RVLocs[i];
+ EVT CopyVT = VA.getValVT();
+ unsigned CopyReg = ResultReg + i;
+
+ // If this is x86-64, and we disabled SSE, we can't return FP values
+ if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
+ ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
+ report_fatal_error("SSE register return with SSE disabled");
+ }
+
+ // If we prefer to use the value in xmm registers, copy it out as f80 and
+ // use a truncate to move it from fp stack reg to xmm reg.
+ if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
+ isScalarFPTypeInSSEReg(VA.getValVT())) {
+ CopyVT = MVT::f80;
+ CopyReg = createResultReg(&X86::RFP80RegClass);
+ }
+
+ // Copy out the result.
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), CopyReg).addReg(VA.getLocReg());
+ InRegs.push_back(VA.getLocReg());
+
+ // Round the f80 to the right size, which also moves it to the appropriate
+ // xmm register. This is accomplished by storing the f80 value in memory
+ // and then loading it back.
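+    // e.g. (illustrative): fstps (%esp) followed by movss (%esp), %xmm0.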
+ if (CopyVT != VA.getValVT()) {
+ EVT ResVT = VA.getValVT();
+ unsigned Opc = ResVT == MVT::f32 ? X86::ST_Fp80m32 : X86::ST_Fp80m64;
+ unsigned MemSize = ResVT.getSizeInBits()/8;
+ int FI = MFI.CreateStackObject(MemSize, MemSize, false);
+ addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(Opc)), FI)
+ .addReg(CopyReg);
+ Opc = ResVT == MVT::f32 ? X86::MOVSSrm : X86::MOVSDrm;
+ addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(Opc), ResultReg + i), FI);
+ }
+ }
+
+ CLI.ResultReg = ResultReg;
+ CLI.NumResultRegs = RVLocs.size();
+ CLI.Call = MIB;
+
+ return true;
+}
+
+bool
+X86FastISel::fastSelectInstruction(const Instruction *I) {
+ switch (I->getOpcode()) {
+ default: break;
+ case Instruction::Load:
+ return X86SelectLoad(I);
+ case Instruction::Store:
+ return X86SelectStore(I);
+ case Instruction::Ret:
+ return X86SelectRet(I);
+ case Instruction::ICmp:
+ case Instruction::FCmp:
+ return X86SelectCmp(I);
+ case Instruction::ZExt:
+ return X86SelectZExt(I);
+ case Instruction::Br:
+ return X86SelectBranch(I);
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::Shl:
+ return X86SelectShift(I);
+ case Instruction::SDiv:
+ case Instruction::UDiv:
+ case Instruction::SRem:
+ case Instruction::URem:
+ return X86SelectDivRem(I);
+ case Instruction::Select:
+ return X86SelectSelect(I);
+ case Instruction::Trunc:
+ return X86SelectTrunc(I);
+ case Instruction::FPExt:
+ return X86SelectFPExt(I);
+ case Instruction::FPTrunc:
+ return X86SelectFPTrunc(I);
+ case Instruction::SIToFP:
+ return X86SelectSIToFP(I);
+ case Instruction::IntToPtr: // Deliberate fall-through.
+ case Instruction::PtrToInt: {
+ EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType());
+ EVT DstVT = TLI.getValueType(DL, I->getType());
+ if (DstVT.bitsGT(SrcVT))
+ return X86SelectZExt(I);
+ if (DstVT.bitsLT(SrcVT))
+ return X86SelectTrunc(I);
+ unsigned Reg = getRegForValue(I->getOperand(0));
+ if (Reg == 0) return false;
+ updateValueMap(I, Reg);
+ return true;
+ }
+ case Instruction::BitCast: {
+ // Select SSE2/AVX bitcasts between 128/256 bit vector types.
+ if (!Subtarget->hasSSE2())
+ return false;
+
+ EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType());
+ EVT DstVT = TLI.getValueType(DL, I->getType());
+
+ if (!SrcVT.isSimple() || !DstVT.isSimple())
+ return false;
+
+ if (!SrcVT.is128BitVector() &&
+ !(Subtarget->hasAVX() && SrcVT.is256BitVector()))
+ return false;
+
+ unsigned Reg = getRegForValue(I->getOperand(0));
+ if (Reg == 0)
+ return false;
+
+ // No instruction is needed for conversion. Reuse the register used by
+    // the first operand.
+ updateValueMap(I, Reg);
+ return true;
+ }
+ }
+
+ return false;
+}
+
+unsigned X86FastISel::X86MaterializeInt(const ConstantInt *CI, MVT VT) {
+ if (VT > MVT::i64)
+ return 0;
+
+ uint64_t Imm = CI->getZExtValue();
+ if (Imm == 0) {
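+    // Materialize zero with a single MOV32r0 (an xor), then narrow the
+    // 32-bit result with a subregister extract, or widen it to 64 bits with
+    // SUBREG_TO_REG; the 32-bit xor already zeroed the upper half.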
+ unsigned SrcReg = fastEmitInst_(X86::MOV32r0, &X86::GR32RegClass);
+ switch (VT.SimpleTy) {
+ default: llvm_unreachable("Unexpected value type");
+ case MVT::i1:
+ case MVT::i8:
+ return fastEmitInst_extractsubreg(MVT::i8, SrcReg, /*Kill=*/true,
+ X86::sub_8bit);
+ case MVT::i16:
+ return fastEmitInst_extractsubreg(MVT::i16, SrcReg, /*Kill=*/true,
+ X86::sub_16bit);
+ case MVT::i32:
+ return SrcReg;
+ case MVT::i64: {
+ unsigned ResultReg = createResultReg(&X86::GR64RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::SUBREG_TO_REG), ResultReg)
+ .addImm(0).addReg(SrcReg).addImm(X86::sub_32bit);
+ return ResultReg;
+ }
+ }
+ }
+
+ unsigned Opc = 0;
+ switch (VT.SimpleTy) {
+ default: llvm_unreachable("Unexpected value type");
+ case MVT::i1: VT = MVT::i8; // fall-through
+ case MVT::i8: Opc = X86::MOV8ri; break;
+ case MVT::i16: Opc = X86::MOV16ri; break;
+ case MVT::i32: Opc = X86::MOV32ri; break;
+ case MVT::i64: {
+ if (isUInt<32>(Imm))
+ Opc = X86::MOV32ri;
+ else if (isInt<32>(Imm))
+ Opc = X86::MOV64ri32;
+ else
+ Opc = X86::MOV64ri;
+ break;
+ }
+ }
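+  // A 64-bit immediate that fits in 32 unsigned bits is cheaper to build as
+  // a 32-bit mov (which implicitly zero-extends) wrapped in SUBREG_TO_REG.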
+ if (VT == MVT::i64 && Opc == X86::MOV32ri) {
+ unsigned SrcReg = fastEmitInst_i(Opc, &X86::GR32RegClass, Imm);
+ unsigned ResultReg = createResultReg(&X86::GR64RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::SUBREG_TO_REG), ResultReg)
+ .addImm(0).addReg(SrcReg).addImm(X86::sub_32bit);
+ return ResultReg;
+ }
+ return fastEmitInst_i(Opc, TLI.getRegClassFor(VT), Imm);
+}
+
+unsigned X86FastISel::X86MaterializeFP(const ConstantFP *CFP, MVT VT) {
+ if (CFP->isNullValue())
+ return fastMaterializeFloatZero(CFP);
+
+ // Can't handle alternate code models yet.
+ CodeModel::Model CM = TM.getCodeModel();
+ if (CM != CodeModel::Small && CM != CodeModel::Large)
+ return 0;
+
+ // Get opcode and regclass of the output for the given load instruction.
+ unsigned Opc = 0;
+ const TargetRegisterClass *RC = nullptr;
+ switch (VT.SimpleTy) {
+ default: return 0;
+ case MVT::f32:
+ if (X86ScalarSSEf32) {
+ Opc = Subtarget->hasAVX() ? X86::VMOVSSrm : X86::MOVSSrm;
+ RC = &X86::FR32RegClass;
+ } else {
+ Opc = X86::LD_Fp32m;
+ RC = &X86::RFP32RegClass;
+ }
+ break;
+ case MVT::f64:
+ if (X86ScalarSSEf64) {
+ Opc = Subtarget->hasAVX() ? X86::VMOVSDrm : X86::MOVSDrm;
+ RC = &X86::FR64RegClass;
+ } else {
+ Opc = X86::LD_Fp64m;
+ RC = &X86::RFP64RegClass;
+ }
+ break;
+ case MVT::f80:
+ // No f80 support yet.
+ return 0;
+ }
+
+ // MachineConstantPool wants an explicit alignment.
+ unsigned Align = DL.getPrefTypeAlignment(CFP->getType());
+ if (Align == 0) {
+ // Alignment of vector types. FIXME!
+ Align = DL.getTypeAllocSize(CFP->getType());
+ }
+
+ // x86-32 PIC requires a PIC base register for constant pools.
+ unsigned PICBase = 0;
+ unsigned char OpFlag = 0;
+ if (Subtarget->isPICStyleStubPIC()) { // Not dynamic-no-pic
+ OpFlag = X86II::MO_PIC_BASE_OFFSET;
+ PICBase = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
+ } else if (Subtarget->isPICStyleGOT()) {
+ OpFlag = X86II::MO_GOTOFF;
+ PICBase = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
+ } else if (Subtarget->isPICStyleRIPRel() &&
+ TM.getCodeModel() == CodeModel::Small) {
+ PICBase = X86::RIP;
+ }
+
+ // Create the load from the constant pool.
+ unsigned CPI = MCP.getConstantPoolIndex(CFP, Align);
+ unsigned ResultReg = createResultReg(RC);
+
+ if (CM == CodeModel::Large) {
+ unsigned AddrReg = createResultReg(&X86::GR64RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV64ri),
+ AddrReg)
+ .addConstantPoolIndex(CPI, 0, OpFlag);
+ MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(Opc), ResultReg);
+ addDirectMem(MIB, AddrReg);
+ MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand(
+ MachinePointerInfo::getConstantPool(*FuncInfo.MF),
+ MachineMemOperand::MOLoad, DL.getPointerSize(), Align);
+ MIB->addMemOperand(*FuncInfo.MF, MMO);
+ return ResultReg;
+ }
+
+ addConstantPoolReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(Opc), ResultReg),
+ CPI, PICBase, OpFlag);
+ return ResultReg;
+}
+
+unsigned X86FastISel::X86MaterializeGV(const GlobalValue *GV, MVT VT) {
+ // Can't handle alternate code models yet.
+ if (TM.getCodeModel() != CodeModel::Small)
+ return 0;
+
+ // Materialize addresses with LEA/MOV instructions.
+ X86AddressMode AM;
+ if (X86SelectAddress(GV, AM)) {
+ // If the expression is just a basereg, then we're done, otherwise we need
+ // to emit an LEA.
+ if (AM.BaseType == X86AddressMode::RegBase &&
+ AM.IndexReg == 0 && AM.Disp == 0 && AM.GV == nullptr)
+ return AM.Base.Reg;
+
+ unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT));
+ if (TM.getRelocationModel() == Reloc::Static &&
+ TLI.getPointerTy(DL) == MVT::i64) {
+      // The displacement could be more than 32 bits away, so we need to use
+      // an instruction with a 64-bit immediate.
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV64ri),
+ ResultReg)
+ .addGlobalAddress(GV);
+ } else {
+ unsigned Opc =
+ TLI.getPointerTy(DL) == MVT::i32
+ ? (Subtarget->isTarget64BitILP32() ? X86::LEA64_32r : X86::LEA32r)
+ : X86::LEA64r;
+ addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(Opc), ResultReg), AM);
+ }
+ return ResultReg;
+ }
+ return 0;
+}
+
+unsigned X86FastISel::fastMaterializeConstant(const Constant *C) {
+ EVT CEVT = TLI.getValueType(DL, C->getType(), true);
+
+ // Only handle simple types.
+ if (!CEVT.isSimple())
+ return 0;
+ MVT VT = CEVT.getSimpleVT();
+
+ if (const auto *CI = dyn_cast<ConstantInt>(C))
+ return X86MaterializeInt(CI, VT);
+ else if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C))
+ return X86MaterializeFP(CFP, VT);
+ else if (const GlobalValue *GV = dyn_cast<GlobalValue>(C))
+ return X86MaterializeGV(GV, VT);
+
+ return 0;
+}
+
+unsigned X86FastISel::fastMaterializeAlloca(const AllocaInst *C) {
+ // Fail on dynamic allocas. At this point, getRegForValue has already
+ // checked its CSE maps, so if we're here trying to handle a dynamic
+ // alloca, we're not going to succeed. X86SelectAddress has a
+ // check for dynamic allocas, because it's called directly from
+  // various places, but fastMaterializeAlloca also needs a check
+  // in order to avoid recursion between getRegForValue,
+  // X86SelectAddress, and fastMaterializeAlloca.
+ if (!FuncInfo.StaticAllocaMap.count(C))
+ return 0;
+ assert(C->isStaticAlloca() && "dynamic alloca in the static alloca map?");
+
+ X86AddressMode AM;
+ if (!X86SelectAddress(C, AM))
+ return 0;
+ unsigned Opc =
+ TLI.getPointerTy(DL) == MVT::i32
+ ? (Subtarget->isTarget64BitILP32() ? X86::LEA64_32r : X86::LEA32r)
+ : X86::LEA64r;
+ const TargetRegisterClass *RC = TLI.getRegClassFor(TLI.getPointerTy(DL));
+ unsigned ResultReg = createResultReg(RC);
+ addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(Opc), ResultReg), AM);
+ return ResultReg;
+}
+
+unsigned X86FastISel::fastMaterializeFloatZero(const ConstantFP *CF) {
+ MVT VT;
+ if (!isTypeLegal(CF->getType(), VT))
+ return 0;
+
+ // Get opcode and regclass for the given zero.
+ unsigned Opc = 0;
+ const TargetRegisterClass *RC = nullptr;
+ switch (VT.SimpleTy) {
+ default: return 0;
+ case MVT::f32:
+ if (X86ScalarSSEf32) {
+ Opc = X86::FsFLD0SS;
+ RC = &X86::FR32RegClass;
+ } else {
+ Opc = X86::LD_Fp032;
+ RC = &X86::RFP32RegClass;
+ }
+ break;
+ case MVT::f64:
+ if (X86ScalarSSEf64) {
+ Opc = X86::FsFLD0SD;
+ RC = &X86::FR64RegClass;
+ } else {
+ Opc = X86::LD_Fp064;
+ RC = &X86::RFP64RegClass;
+ }
+ break;
+ case MVT::f80:
+ // No f80 support yet.
+ return 0;
+ }
+
+ unsigned ResultReg = createResultReg(RC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg);
+ return ResultReg;
+}
+
+
+bool X86FastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
+ const LoadInst *LI) {
+ const Value *Ptr = LI->getPointerOperand();
+ X86AddressMode AM;
+ if (!X86SelectAddress(Ptr, AM))
+ return false;
+
+ const X86InstrInfo &XII = (const X86InstrInfo &)TII;
+
+ unsigned Size = DL.getTypeAllocSize(LI->getType());
+ unsigned Alignment = LI->getAlignment();
+
+ if (Alignment == 0) // Ensure that codegen never sees alignment 0
+ Alignment = DL.getABITypeAlignment(LI->getType());
+
+ SmallVector<MachineOperand, 8> AddrOps;
+ AM.getFullAddress(AddrOps);
+
+ MachineInstr *Result = XII.foldMemoryOperandImpl(
+ *FuncInfo.MF, MI, OpNo, AddrOps, FuncInfo.InsertPt, Size, Alignment,
+ /*AllowCommute=*/true);
+ if (!Result)
+ return false;
+
+ // The index register could be in the wrong register class. Unfortunately,
+  // foldMemoryOperandImpl could have commuted the instruction so it's not
+  // enough
+ // to just look at OpNo + the offset to the index reg. We actually need to
+  // scan the instruction to find the index reg and see if it's the correct reg
+ // class.
+ unsigned OperandNo = 0;
+ for (MachineInstr::mop_iterator I = Result->operands_begin(),
+ E = Result->operands_end(); I != E; ++I, ++OperandNo) {
+ MachineOperand &MO = *I;
+ if (!MO.isReg() || MO.isDef() || MO.getReg() != AM.IndexReg)
+ continue;
+ // Found the index reg, now try to rewrite it.
+ unsigned IndexReg = constrainOperandRegClass(Result->getDesc(),
+ MO.getReg(), OperandNo);
+ if (IndexReg == MO.getReg())
+ continue;
+ MO.setReg(IndexReg);
+ }
+
+ Result->addMemOperand(*FuncInfo.MF, createMachineMemOperandFor(LI));
+ MI->eraseFromParent();
+ return true;
+}
+
+
+namespace llvm {
+ FastISel *X86::createFastISel(FunctionLoweringInfo &funcInfo,
+ const TargetLibraryInfo *libInfo) {
+ return new X86FastISel(funcInfo, libInfo);
+ }
+}
diff --git a/contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp b/contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp
new file mode 100644
index 0000000..1dd69e8
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp
@@ -0,0 +1,410 @@
+//===-- X86FixupLEAs.cpp - use or replace LEA instructions -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the pass that finds instructions that can be
+// rewritten as LEA instructions in order to reduce pipeline delays.
+// When optimizing for size, it replaces suitable LEAs with INC or DEC.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-fixup-LEAs"
+
+STATISTIC(NumLEAs, "Number of LEA instructions created");
+
+namespace {
+class FixupLEAPass : public MachineFunctionPass {
+ enum RegUsageState { RU_NotUsed, RU_Write, RU_Read };
+ static char ID;
+ /// \brief Loop over all of the instructions in the basic block
+ /// replacing applicable instructions with LEA instructions,
+ /// where appropriate.
+ bool processBasicBlock(MachineFunction &MF, MachineFunction::iterator MFI);
+
+ const char *getPassName() const override { return "X86 LEA Fixup"; }
+
+ /// \brief Given a machine register, look for the instruction
+ /// which writes it in the current basic block. If found,
+ /// try to replace it with an equivalent LEA instruction.
+ /// If replacement succeeds, then also process the newly created
+ /// instruction.
+ void seekLEAFixup(MachineOperand &p, MachineBasicBlock::iterator &I,
+ MachineFunction::iterator MFI);
+
+ /// \brief Given a memory access or LEA instruction
+ /// whose address mode uses a base and/or index register, look for
+ /// an opportunity to replace the instruction which sets the base or index
+ /// register with an equivalent LEA instruction.
+ void processInstruction(MachineBasicBlock::iterator &I,
+ MachineFunction::iterator MFI);
+
+ /// \brief Given a LEA instruction which is unprofitable
+ /// on Silvermont try to replace it with an equivalent ADD instruction
+ void processInstructionForSLM(MachineBasicBlock::iterator &I,
+ MachineFunction::iterator MFI);
+
+ /// \brief Look for LEAs that add 1 to reg or subtract 1 from reg
+ /// and convert them to INC or DEC respectively.
+ bool fixupIncDec(MachineBasicBlock::iterator &I,
+ MachineFunction::iterator MFI) const;
+
+ /// \brief Determine if an instruction references a machine register
+ /// and, if so, whether it reads or writes the register.
+ RegUsageState usesRegister(MachineOperand &p, MachineBasicBlock::iterator I);
+
+ /// \brief Step backwards through a basic block, looking
+ /// for an instruction which writes a register within
+ /// a maximum of INSTR_DISTANCE_THRESHOLD instruction latency cycles.
+ MachineBasicBlock::iterator searchBackwards(MachineOperand &p,
+ MachineBasicBlock::iterator &I,
+ MachineFunction::iterator MFI);
+
+ /// \brief if an instruction can be converted to an
+ /// equivalent LEA, insert the new instruction into the basic block
+ /// and return a pointer to it. Otherwise, return zero.
+ MachineInstr *postRAConvertToLEA(MachineFunction::iterator &MFI,
+ MachineBasicBlock::iterator &MBBI) const;
+
+public:
+ FixupLEAPass() : MachineFunctionPass(ID) {}
+
+ /// \brief Loop over all of the basic blocks,
+ /// replacing instructions by equivalent LEA instructions
+ /// if needed and when possible.
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+private:
+ MachineFunction *MF;
+ const X86InstrInfo *TII; // Machine instruction info.
+ bool OptIncDec;
+ bool OptLEA;
+};
+char FixupLEAPass::ID = 0;
+}
+
+MachineInstr *
+FixupLEAPass::postRAConvertToLEA(MachineFunction::iterator &MFI,
+ MachineBasicBlock::iterator &MBBI) const {
+ MachineInstr *MI = MBBI;
+ MachineInstr *NewMI;
+ switch (MI->getOpcode()) {
+ case X86::MOV32rr:
+ case X86::MOV64rr: {
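+    // Rewrite a plain register-to-register move as an LEA, e.g.
+    // (illustrative) "mov %esi, %edi" -> "lea (%esi), %edi".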
+ const MachineOperand &Src = MI->getOperand(1);
+ const MachineOperand &Dest = MI->getOperand(0);
+ NewMI = BuildMI(*MF, MI->getDebugLoc(),
+ TII->get(MI->getOpcode() == X86::MOV32rr ? X86::LEA32r
+ : X86::LEA64r))
+ .addOperand(Dest)
+ .addOperand(Src)
+ .addImm(1)
+ .addReg(0)
+ .addImm(0)
+ .addReg(0);
+ MFI->insert(MBBI, NewMI); // Insert the new inst
+ return NewMI;
+ }
+ case X86::ADD64ri32:
+ case X86::ADD64ri8:
+ case X86::ADD64ri32_DB:
+ case X86::ADD64ri8_DB:
+ case X86::ADD32ri:
+ case X86::ADD32ri8:
+ case X86::ADD32ri_DB:
+ case X86::ADD32ri8_DB:
+ case X86::ADD16ri:
+ case X86::ADD16ri8:
+ case X86::ADD16ri_DB:
+ case X86::ADD16ri8_DB:
+ if (!MI->getOperand(2).isImm()) {
+ // convertToThreeAddress will call getImm()
+ // which requires isImm() to be true
+ return nullptr;
+ }
+ break;
+ case X86::ADD16rr:
+ case X86::ADD16rr_DB:
+ if (MI->getOperand(1).getReg() != MI->getOperand(2).getReg()) {
+      // If src1 != src2, then convertToThreeAddress will
+      // need to create a virtual register, which we cannot do
+      // after register allocation.
+ return nullptr;
+ }
+ }
+ return TII->convertToThreeAddress(MFI, MBBI, nullptr);
+}
+
+FunctionPass *llvm::createX86FixupLEAs() { return new FixupLEAPass(); }
+
+bool FixupLEAPass::runOnMachineFunction(MachineFunction &Func) {
+ MF = &Func;
+ const X86Subtarget &ST = Func.getSubtarget<X86Subtarget>();
+ OptIncDec = !ST.slowIncDec() || Func.getFunction()->optForMinSize();
+ OptLEA = ST.LEAusesAG() || ST.slowLEA();
+
+ if (!OptLEA && !OptIncDec)
+ return false;
+
+ TII = ST.getInstrInfo();
+
+ DEBUG(dbgs() << "Start X86FixupLEAs\n";);
+ // Process all basic blocks.
+ for (MachineFunction::iterator I = Func.begin(), E = Func.end(); I != E; ++I)
+ processBasicBlock(Func, I);
+ DEBUG(dbgs() << "End X86FixupLEAs\n";);
+
+ return true;
+}
+
+FixupLEAPass::RegUsageState
+FixupLEAPass::usesRegister(MachineOperand &p, MachineBasicBlock::iterator I) {
+ RegUsageState RegUsage = RU_NotUsed;
+ MachineInstr *MI = I;
+
+ for (unsigned int i = 0; i < MI->getNumOperands(); ++i) {
+ MachineOperand &opnd = MI->getOperand(i);
+ if (opnd.isReg() && opnd.getReg() == p.getReg()) {
+ if (opnd.isDef())
+ return RU_Write;
+ RegUsage = RU_Read;
+ }
+ }
+ return RegUsage;
+}
+
+/// getPreviousInstr - Given a reference to an instruction in a basic
+/// block, set it to the previous instruction in the block, wrapping
+/// around to the last instruction if the block branches to itself.
+/// Return true if a previous instruction was found.
+static inline bool getPreviousInstr(MachineBasicBlock::iterator &I,
+ MachineFunction::iterator MFI) {
+ if (I == MFI->begin()) {
+ if (MFI->isPredecessor(&*MFI)) {
+ I = --MFI->end();
+ return true;
+ } else
+ return false;
+ }
+ --I;
+ return true;
+}
+
+MachineBasicBlock::iterator
+FixupLEAPass::searchBackwards(MachineOperand &p, MachineBasicBlock::iterator &I,
+ MachineFunction::iterator MFI) {
+ int InstrDistance = 1;
+ MachineBasicBlock::iterator CurInst;
+ static const int INSTR_DISTANCE_THRESHOLD = 5;
+
+ CurInst = I;
+ bool Found;
+ Found = getPreviousInstr(CurInst, MFI);
+ while (Found && I != CurInst) {
+ if (CurInst->isCall() || CurInst->isInlineAsm())
+ break;
+ if (InstrDistance > INSTR_DISTANCE_THRESHOLD)
+ break; // too far back to make a difference
+ if (usesRegister(p, CurInst) == RU_Write) {
+ return CurInst;
+ }
+ InstrDistance += TII->getInstrLatency(
+ MF->getSubtarget().getInstrItineraryData(), CurInst);
+ Found = getPreviousInstr(CurInst, MFI);
+ }
+ return nullptr;
+}
+
+static inline bool isLEA(const int opcode) {
+ return opcode == X86::LEA16r || opcode == X86::LEA32r ||
+ opcode == X86::LEA64r || opcode == X86::LEA64_32r;
+}
+
+/// isLEASimpleIncOrDec - Does this LEA have one of these forms:
+/// lea %reg, 1(%reg)
+/// lea %reg, -1(%reg)
+static inline bool isLEASimpleIncOrDec(MachineInstr *LEA) {
+ unsigned SrcReg = LEA->getOperand(1 + X86::AddrBaseReg).getReg();
+ unsigned DstReg = LEA->getOperand(0).getReg();
+ unsigned AddrDispOp = 1 + X86::AddrDisp;
+ return SrcReg == DstReg &&
+ LEA->getOperand(1 + X86::AddrIndexReg).getReg() == 0 &&
+ LEA->getOperand(1 + X86::AddrSegmentReg).getReg() == 0 &&
+ LEA->getOperand(AddrDispOp).isImm() &&
+ (LEA->getOperand(AddrDispOp).getImm() == 1 ||
+ LEA->getOperand(AddrDispOp).getImm() == -1);
+}
+
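+// Convert a simple increment/decrement LEA into INC/DEC when INC/DEC are
+// not slow on the target (or when optimizing for size). For example, in
+// AT&T syntax, assuming EFLAGS are dead at this point:
+//   leal 1(%ecx), %ecx    ->   incl %ecx
+//   leaq -1(%rdx), %rdx   ->   decq %rdx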
+bool FixupLEAPass::fixupIncDec(MachineBasicBlock::iterator &I,
+ MachineFunction::iterator MFI) const {
+ MachineInstr *MI = I;
+ int Opcode = MI->getOpcode();
+ if (!isLEA(Opcode))
+ return false;
+
+ if (isLEASimpleIncOrDec(MI) && TII->isSafeToClobberEFLAGS(*MFI, I)) {
+ int NewOpcode;
+ bool isINC = MI->getOperand(4).getImm() == 1;
+ switch (Opcode) {
+ case X86::LEA16r:
+ NewOpcode = isINC ? X86::INC16r : X86::DEC16r;
+ break;
+ case X86::LEA32r:
+ case X86::LEA64_32r:
+ NewOpcode = isINC ? X86::INC32r : X86::DEC32r;
+ break;
+ case X86::LEA64r:
+ NewOpcode = isINC ? X86::INC64r : X86::DEC64r;
+ break;
+ }
+
+ MachineInstr *NewMI =
+ BuildMI(*MFI, I, MI->getDebugLoc(), TII->get(NewOpcode))
+ .addOperand(MI->getOperand(0))
+ .addOperand(MI->getOperand(1));
+ MFI->erase(I);
+ I = static_cast<MachineBasicBlock::iterator>(NewMI);
+ return true;
+ }
+ return false;
+}
+
+void FixupLEAPass::processInstruction(MachineBasicBlock::iterator &I,
+ MachineFunction::iterator MFI) {
+ // Process a load, store, or LEA instruction.
+ MachineInstr *MI = I;
+ int opcode = MI->getOpcode();
+ const MCInstrDesc &Desc = MI->getDesc();
+ int AddrOffset = X86II::getMemoryOperandNo(Desc.TSFlags, opcode);
+ if (AddrOffset >= 0) {
+ AddrOffset += X86II::getOperandBias(Desc);
+ MachineOperand &p = MI->getOperand(AddrOffset + X86::AddrBaseReg);
+ if (p.isReg() && p.getReg() != X86::ESP) {
+ seekLEAFixup(p, I, MFI);
+ }
+ MachineOperand &q = MI->getOperand(AddrOffset + X86::AddrIndexReg);
+ if (q.isReg() && q.getReg() != X86::ESP) {
+ seekLEAFixup(q, I, MFI);
+ }
+ }
+}
+
+void FixupLEAPass::seekLEAFixup(MachineOperand &p,
+ MachineBasicBlock::iterator &I,
+ MachineFunction::iterator MFI) {
+ MachineBasicBlock::iterator MBI = searchBackwards(p, I, MFI);
+ if (MBI) {
+ MachineInstr *NewMI = postRAConvertToLEA(MFI, MBI);
+ if (NewMI) {
+ ++NumLEAs;
+ DEBUG(dbgs() << "FixLEA: Candidate to replace:"; MBI->dump(););
+      // Now replace it with the equivalent LEA instruction...
+ DEBUG(dbgs() << "FixLEA: Replaced by: "; NewMI->dump(););
+ MFI->erase(MBI);
+ MachineBasicBlock::iterator J =
+ static_cast<MachineBasicBlock::iterator>(NewMI);
+ processInstruction(J, MFI);
+ }
+ }
+}
+
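+// On Silvermont, certain LEAs are more expensive than plain additions. When
+// the LEA writes one of its own source registers, split it into up to two
+// ADDs instead; e.g., assuming EFLAGS are dead here:
+//   leaq 4(%rdi,%rsi), %rdi   ->   addq %rsi, %rdi
+//                                  addq $4, %rdi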
+void FixupLEAPass::processInstructionForSLM(MachineBasicBlock::iterator &I,
+ MachineFunction::iterator MFI) {
+ MachineInstr *MI = I;
+ const int opcode = MI->getOpcode();
+ if (!isLEA(opcode))
+ return;
+ if (MI->getOperand(5).getReg() != 0 || !MI->getOperand(4).isImm() ||
+ !TII->isSafeToClobberEFLAGS(*MFI, I))
+ return;
+ const unsigned DstR = MI->getOperand(0).getReg();
+ const unsigned SrcR1 = MI->getOperand(1).getReg();
+ const unsigned SrcR2 = MI->getOperand(3).getReg();
+ if ((SrcR1 == 0 || SrcR1 != DstR) && (SrcR2 == 0 || SrcR2 != DstR))
+ return;
+ if (MI->getOperand(2).getImm() > 1)
+ return;
+ int addrr_opcode, addri_opcode;
+ switch (opcode) {
+ default:
+ llvm_unreachable("Unexpected LEA instruction");
+ case X86::LEA16r:
+ addrr_opcode = X86::ADD16rr;
+ addri_opcode = X86::ADD16ri;
+ break;
+ case X86::LEA32r:
+ addrr_opcode = X86::ADD32rr;
+ addri_opcode = X86::ADD32ri;
+ break;
+ case X86::LEA64_32r:
+ case X86::LEA64r:
+ addrr_opcode = X86::ADD64rr;
+ addri_opcode = X86::ADD64ri32;
+ break;
+ }
+ DEBUG(dbgs() << "FixLEA: Candidate to replace:"; I->dump(););
+ DEBUG(dbgs() << "FixLEA: Replaced by: ";);
+ MachineInstr *NewMI = nullptr;
+ const MachineOperand &Dst = MI->getOperand(0);
+  // Emit an ADD of the two source registers, writing to the LEA's destination
+ if (SrcR1 != 0 && SrcR2 != 0) {
+ const MachineOperand &Src1 = MI->getOperand(SrcR1 == DstR ? 1 : 3);
+ const MachineOperand &Src2 = MI->getOperand(SrcR1 == DstR ? 3 : 1);
+ NewMI = BuildMI(*MF, MI->getDebugLoc(), TII->get(addrr_opcode))
+ .addOperand(Dst)
+ .addOperand(Src1)
+ .addOperand(Src2);
+ MFI->insert(I, NewMI);
+ DEBUG(NewMI->dump(););
+ }
+  // Emit an ADD of the immediate displacement
+ if (MI->getOperand(4).getImm() != 0) {
+ const MachineOperand &SrcR = MI->getOperand(SrcR1 == DstR ? 1 : 3);
+ NewMI = BuildMI(*MF, MI->getDebugLoc(), TII->get(addri_opcode))
+ .addOperand(Dst)
+ .addOperand(SrcR)
+ .addImm(MI->getOperand(4).getImm());
+ MFI->insert(I, NewMI);
+ DEBUG(NewMI->dump(););
+ }
+ if (NewMI) {
+ MFI->erase(I);
+ I = static_cast<MachineBasicBlock::iterator>(NewMI);
+ }
+}
+
+bool FixupLEAPass::processBasicBlock(MachineFunction &MF,
+ MachineFunction::iterator MFI) {
+
+ for (MachineBasicBlock::iterator I = MFI->begin(); I != MFI->end(); ++I) {
+ if (OptIncDec)
+ if (fixupIncDec(I, MFI))
+ continue;
+
+ if (OptLEA) {
+ if (MF.getSubtarget<X86Subtarget>().isSLM())
+ processInstructionForSLM(I, MFI);
+ else
+ processInstruction(I, MFI);
+ }
+ }
+ return false;
+}
diff --git a/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp b/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp
new file mode 100644
index 0000000..97bb8ab
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp
@@ -0,0 +1,1651 @@
+//===-- X86FloatingPoint.cpp - Floating point Reg -> Stack converter ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the pass which converts floating point instructions from
+// pseudo registers into register stack instructions. This pass uses live
+// variable information to indicate where the FPn registers are used and their
+// lifetimes.
+//
+// The x87 hardware tracks liveness of the stack registers, so it is necessary
+// to implement exact liveness tracking between basic blocks. The CFG edges are
+// partitioned into bundles where the same FP registers must be live in
+// identical stack positions. Instructions are inserted at the end of each basic
+// block to rearrange the live registers to match the outgoing bundle.
+//
+// This approach avoids splitting critical edges at the potential cost of more
+// live register shuffling instructions when critical edges are present.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/EdgeBundles.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
+#include <bitset>
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-codegen"
+
+STATISTIC(NumFXCH, "Number of fxch instructions inserted");
+STATISTIC(NumFP , "Number of floating point instructions");
+
+namespace {
+ const unsigned ScratchFPReg = 7;
+
+ struct FPS : public MachineFunctionPass {
+ static char ID;
+ FPS() : MachineFunctionPass(ID) {
+ initializeEdgeBundlesPass(*PassRegistry::getPassRegistry());
+ // This is really only to keep valgrind quiet.
+ // The logic in isLive() is too much for it.
+ memset(Stack, 0, sizeof(Stack));
+ memset(RegMap, 0, sizeof(RegMap));
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<EdgeBundles>();
+ AU.addPreservedID(MachineLoopInfoID);
+ AU.addPreservedID(MachineDominatorsID);
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ const char *getPassName() const override { return "X86 FP Stackifier"; }
+
+ private:
+ const TargetInstrInfo *TII; // Machine instruction info.
+
+ // Two CFG edges are related if they leave the same block, or enter the same
+ // block. The transitive closure of an edge under this relation is a
+ // LiveBundle. It represents a set of CFG edges where the live FP stack
+ // registers must be allocated identically in the x87 stack.
+ //
+ // A LiveBundle is usually all the edges leaving a block, or all the edges
+ // entering a block, but it can contain more edges if critical edges are
+ // present.
+ //
+ // The set of live FP registers in a LiveBundle is calculated by bundleCFG,
+ // but the exact mapping of FP registers to stack slots is fixed later.
+ struct LiveBundle {
+ // Bit mask of live FP registers. Bit 0 = FP0, bit 1 = FP1, &c.
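+      // e.g. Mask == 0b0101 means FP0 and FP2 are live on this bundle's
+      // edges.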
+ unsigned Mask;
+
+ // Number of pre-assigned live registers in FixStack. This is 0 when the
+ // stack order has not yet been fixed.
+ unsigned FixCount;
+
+ // Assigned stack order for live-in registers.
+ // FixStack[i] == getStackEntry(i) for all i < FixCount.
+ unsigned char FixStack[8];
+
+ LiveBundle() : Mask(0), FixCount(0) {}
+
+ // Have the live registers been assigned a stack order yet?
+ bool isFixed() const { return !Mask || FixCount; }
+ };
+
+ // Numbered LiveBundle structs. LiveBundles[0] is used for all CFG edges
+ // with no live FP registers.
+ SmallVector<LiveBundle, 8> LiveBundles;
+
+ // The edge bundle analysis provides indices into the LiveBundles vector.
+ EdgeBundles *Bundles;
+
+    // Return a bitmask of FP registers in the block's live-in list.
+ static unsigned calcLiveInMask(MachineBasicBlock *MBB) {
+ unsigned Mask = 0;
+ for (const auto &LI : MBB->liveins()) {
+ if (LI.PhysReg < X86::FP0 || LI.PhysReg > X86::FP6)
+ continue;
+ Mask |= 1 << (LI.PhysReg - X86::FP0);
+ }
+ return Mask;
+ }
+
+ // Partition all the CFG edges into LiveBundles.
+ void bundleCFG(MachineFunction &MF);
+
+ MachineBasicBlock *MBB; // Current basic block
+
+ // The hardware keeps track of how many FP registers are live, so we have
+ // to model that exactly. Usually, each live register corresponds to an
+ // FP<n> register, but when dealing with calls, returns, and inline
+ // assembly, it is sometimes necessary to have live scratch registers.
+ unsigned Stack[8]; // FP<n> Registers in each stack slot...
+ unsigned StackTop; // The current top of the FP stack.
+
+ enum {
+ NumFPRegs = 8 // Including scratch pseudo-registers.
+ };
+
+ // For each live FP<n> register, point to its Stack[] entry.
+ // The first entries correspond to FP0-FP6, the rest are scratch registers
+ // used when we need slightly different live registers than what the
+ // register allocator thinks.
+ unsigned RegMap[NumFPRegs];
+
+ // Set up our stack model to match the incoming registers to MBB.
+ void setupBlockStack();
+
+ // Shuffle live registers to match the expectations of successor blocks.
+ void finishBlockStack();
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ void dumpStack() const {
+ dbgs() << "Stack contents:";
+ for (unsigned i = 0; i != StackTop; ++i) {
+ dbgs() << " FP" << Stack[i];
+ assert(RegMap[Stack[i]] == i && "Stack[] doesn't match RegMap[]!");
+ }
+ }
+#endif
+
+ /// getSlot - Return the stack slot number a particular register number is
+ /// in.
+ unsigned getSlot(unsigned RegNo) const {
+ assert(RegNo < NumFPRegs && "Regno out of range!");
+ return RegMap[RegNo];
+ }
+
+ /// isLive - Is RegNo currently live in the stack?
+ bool isLive(unsigned RegNo) const {
+ unsigned Slot = getSlot(RegNo);
+ return Slot < StackTop && Stack[Slot] == RegNo;
+ }
+
+ /// getStackEntry - Return the X86::FP<n> register in register ST(i).
+ unsigned getStackEntry(unsigned STi) const {
+ if (STi >= StackTop)
+ report_fatal_error("Access past stack top!");
+ return Stack[StackTop-1-STi];
+ }
+
+ /// getSTReg - Return the X86::ST(i) register which contains the specified
+ /// FP<RegNo> register.
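+    /// For example, with StackTop == 3, a register sitting in the bottom
+    /// slot (slot 0) lives in st(2), so this returns X86::ST2.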
+ unsigned getSTReg(unsigned RegNo) const {
+ return StackTop - 1 - getSlot(RegNo) + X86::ST0;
+ }
+
+ // pushReg - Push the specified FP<n> register onto the stack.
+ void pushReg(unsigned Reg) {
+ assert(Reg < NumFPRegs && "Register number out of range!");
+ if (StackTop >= 8)
+ report_fatal_error("Stack overflow!");
+ Stack[StackTop] = Reg;
+ RegMap[Reg] = StackTop++;
+ }
+
+ bool isAtTop(unsigned RegNo) const { return getSlot(RegNo) == StackTop-1; }
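+    // Bring FP<RegNo> to st(0), updating the stack model and emitting a
+    // single fxch; e.g. a register currently held in st(2) costs one
+    // "fxch %st(2)".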
+ void moveToTop(unsigned RegNo, MachineBasicBlock::iterator I) {
+ DebugLoc dl = I == MBB->end() ? DebugLoc() : I->getDebugLoc();
+ if (isAtTop(RegNo)) return;
+
+ unsigned STReg = getSTReg(RegNo);
+ unsigned RegOnTop = getStackEntry(0);
+
+ // Swap the slots the regs are in.
+ std::swap(RegMap[RegNo], RegMap[RegOnTop]);
+
+ // Swap stack slot contents.
+ if (RegMap[RegOnTop] >= StackTop)
+ report_fatal_error("Access past stack top!");
+ std::swap(Stack[RegMap[RegOnTop]], Stack[StackTop-1]);
+
+      // Emit an fxch to update the runtime processor's version of the state.
+ BuildMI(*MBB, I, dl, TII->get(X86::XCH_F)).addReg(STReg);
+ ++NumFXCH;
+ }
+
+ void duplicateToTop(unsigned RegNo, unsigned AsReg, MachineInstr *I) {
+ DebugLoc dl = I == MBB->end() ? DebugLoc() : I->getDebugLoc();
+ unsigned STReg = getSTReg(RegNo);
+ pushReg(AsReg); // New register on top of stack
+
+ BuildMI(*MBB, I, dl, TII->get(X86::LD_Frr)).addReg(STReg);
+ }
+
+ /// popStackAfter - Pop the current value off of the top of the FP stack
+ /// after the specified instruction.
+ void popStackAfter(MachineBasicBlock::iterator &I);
+
+ /// freeStackSlotAfter - Free the specified register from the register
+ /// stack, so that it is no longer in a register. If the register is
+ /// currently at the top of the stack, we just pop the current instruction,
+ /// otherwise we store the current top-of-stack into the specified slot,
+ /// then pop the top of stack.
+ void freeStackSlotAfter(MachineBasicBlock::iterator &I, unsigned Reg);
+
+ /// freeStackSlotBefore - Just the pop, no folding. Return the inserted
+ /// instruction.
+ MachineBasicBlock::iterator
+ freeStackSlotBefore(MachineBasicBlock::iterator I, unsigned FPRegNo);
+
+ /// Adjust the live registers to be the set in Mask.
+ void adjustLiveRegs(unsigned Mask, MachineBasicBlock::iterator I);
+
+ /// Shuffle the top FixCount stack entries such that FP reg FixStack[0] is
+ /// st(0), FP reg FixStack[1] is st(1) etc.
+ void shuffleStackTop(const unsigned char *FixStack, unsigned FixCount,
+ MachineBasicBlock::iterator I);
+
+ bool processBasicBlock(MachineFunction &MF, MachineBasicBlock &MBB);
+
+ void handleCall(MachineBasicBlock::iterator &I);
+ void handleZeroArgFP(MachineBasicBlock::iterator &I);
+ void handleOneArgFP(MachineBasicBlock::iterator &I);
+ void handleOneArgFPRW(MachineBasicBlock::iterator &I);
+ void handleTwoArgFP(MachineBasicBlock::iterator &I);
+ void handleCompareFP(MachineBasicBlock::iterator &I);
+ void handleCondMovFP(MachineBasicBlock::iterator &I);
+ void handleSpecialFP(MachineBasicBlock::iterator &I);
+
+ // Check if a COPY instruction is using FP registers.
+ static bool isFPCopy(MachineInstr *MI) {
+ unsigned DstReg = MI->getOperand(0).getReg();
+ unsigned SrcReg = MI->getOperand(1).getReg();
+
+ return X86::RFP80RegClass.contains(DstReg) ||
+ X86::RFP80RegClass.contains(SrcReg);
+ }
+
+ void setKillFlags(MachineBasicBlock &MBB) const;
+ };
+ char FPS::ID = 0;
+}
+
+FunctionPass *llvm::createX86FloatingPointStackifierPass() { return new FPS(); }
+
+/// getFPReg - Return the X86::FPx register number for the specified operand.
+/// For example, this returns 3 for X86::FP3.
+static unsigned getFPReg(const MachineOperand &MO) {
+ assert(MO.isReg() && "Expected an FP register!");
+ unsigned Reg = MO.getReg();
+ assert(Reg >= X86::FP0 && Reg <= X86::FP6 && "Expected FP register!");
+ return Reg - X86::FP0;
+}
+
+/// runOnMachineFunction - Loop over all of the basic blocks, transforming FP
+/// register references into FP stack references.
+///
+bool FPS::runOnMachineFunction(MachineFunction &MF) {
+ // We only need to run this pass if there are any FP registers used in this
+ // function. If it is all integer, there is nothing for us to do!
+ bool FPIsUsed = false;
+
+ static_assert(X86::FP6 == X86::FP0+6, "Register enums aren't sorted right!");
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ for (unsigned i = 0; i <= 6; ++i)
+ if (!MRI.reg_nodbg_empty(X86::FP0 + i)) {
+ FPIsUsed = true;
+ break;
+ }
+
+ // Early exit.
+ if (!FPIsUsed) return false;
+
+ Bundles = &getAnalysis<EdgeBundles>();
+ TII = MF.getSubtarget().getInstrInfo();
+
+ // Prepare cross-MBB liveness.
+ bundleCFG(MF);
+
+ StackTop = 0;
+
+ // Process the function in depth first order so that we process at least one
+ // of the predecessors for every reachable block in the function.
+ SmallPtrSet<MachineBasicBlock*, 8> Processed;
+ MachineBasicBlock *Entry = &MF.front();
+
+ bool Changed = false;
+ for (MachineBasicBlock *BB : depth_first_ext(Entry, Processed))
+ Changed |= processBasicBlock(MF, *BB);
+
+ // Process any unreachable blocks in arbitrary order now.
+ if (MF.size() != Processed.size())
+ for (MachineBasicBlock &BB : MF)
+ if (Processed.insert(&BB).second)
+ Changed |= processBasicBlock(MF, BB);
+
+ LiveBundles.clear();
+
+ return Changed;
+}
+
+/// bundleCFG - Scan all the basic blocks to determine consistent live-in and
+/// live-out sets for the FP registers. Consistent means that the set of
+/// registers live-out from a block is identical to the live-in set of all
+/// successors. This is not enforced by the normal live-in lists since
+/// registers may be implicitly defined, or not used by all successors.
+void FPS::bundleCFG(MachineFunction &MF) {
+ assert(LiveBundles.empty() && "Stale data in LiveBundles");
+ LiveBundles.resize(Bundles->getNumBundles());
+
+ // Gather the actual live-in masks for all MBBs.
+ for (MachineBasicBlock &MBB : MF) {
+ const unsigned Mask = calcLiveInMask(&MBB);
+ if (!Mask)
+ continue;
+ // Update MBB ingoing bundle mask.
+ LiveBundles[Bundles->getBundle(MBB.getNumber(), false)].Mask |= Mask;
+ }
+}
+
+/// processBasicBlock - Loop over all of the instructions in the basic block,
+/// transforming FP instructions into their stack form.
+///
+bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) {
+ bool Changed = false;
+ MBB = &BB;
+
+ setKillFlags(BB);
+ setupBlockStack();
+
+ for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I) {
+ MachineInstr *MI = I;
+ uint64_t Flags = MI->getDesc().TSFlags;
+
+ unsigned FPInstClass = Flags & X86II::FPTypeMask;
+ if (MI->isInlineAsm())
+ FPInstClass = X86II::SpecialFP;
+
+ if (MI->isCopy() && isFPCopy(MI))
+ FPInstClass = X86II::SpecialFP;
+
+ if (MI->isImplicitDef() &&
+ X86::RFP80RegClass.contains(MI->getOperand(0).getReg()))
+ FPInstClass = X86II::SpecialFP;
+
+ if (MI->isCall())
+ FPInstClass = X86II::SpecialFP;
+
+ if (FPInstClass == X86II::NotFP)
+ continue; // Efficiently ignore non-fp insts!
+
+ MachineInstr *PrevMI = nullptr;
+ if (I != BB.begin())
+ PrevMI = std::prev(I);
+
+ ++NumFP; // Keep track of # of pseudo instrs
+ DEBUG(dbgs() << "\nFPInst:\t" << *MI);
+
+    // Get the dead variables list now because the MI pointer may be deleted
+    // as part of processing!
+ SmallVector<unsigned, 8> DeadRegs;
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+ if (MO.isReg() && MO.isDead())
+ DeadRegs.push_back(MO.getReg());
+ }
+
+ switch (FPInstClass) {
+ case X86II::ZeroArgFP: handleZeroArgFP(I); break;
+ case X86II::OneArgFP: handleOneArgFP(I); break; // fstp ST(0)
+ case X86II::OneArgFPRW: handleOneArgFPRW(I); break; // ST(0) = fsqrt(ST(0))
+ case X86II::TwoArgFP: handleTwoArgFP(I); break;
+ case X86II::CompareFP: handleCompareFP(I); break;
+ case X86II::CondMovFP: handleCondMovFP(I); break;
+ case X86II::SpecialFP: handleSpecialFP(I); break;
+ default: llvm_unreachable("Unknown FP Type!");
+ }
+
+ // Check to see if any of the values defined by this instruction are dead
+ // after definition. If so, pop them.
+ for (unsigned i = 0, e = DeadRegs.size(); i != e; ++i) {
+ unsigned Reg = DeadRegs[i];
+ // Check if Reg is live on the stack. An inline-asm register operand that
+ // is in the clobber list and marked dead might not be live on the stack.
+ if (Reg >= X86::FP0 && Reg <= X86::FP6 && isLive(Reg-X86::FP0)) {
+ DEBUG(dbgs() << "Register FP#" << Reg-X86::FP0 << " is dead!\n");
+ freeStackSlotAfter(I, Reg-X86::FP0);
+ }
+ }
+
+    // Print out all of the instructions expanded to, when -debug is specified
+ DEBUG(
+ MachineBasicBlock::iterator PrevI(PrevMI);
+ if (I == PrevI) {
+ dbgs() << "Just deleted pseudo instruction\n";
+ } else {
+ MachineBasicBlock::iterator Start = I;
+ // Rewind to first instruction newly inserted.
+ while (Start != BB.begin() && std::prev(Start) != PrevI) --Start;
+ dbgs() << "Inserted instructions:\n\t";
+ Start->print(dbgs());
+ while (++Start != std::next(I)) {}
+ }
+ dumpStack();
+ );
+ (void)PrevMI;
+
+ Changed = true;
+ }
+
+ finishBlockStack();
+
+ return Changed;
+}
+
+/// setupBlockStack - Use the live bundles to set up our model of the stack
+/// to match predecessors' live out stack.
+void FPS::setupBlockStack() {
+ DEBUG(dbgs() << "\nSetting up live-ins for BB#" << MBB->getNumber()
+ << " derived from " << MBB->getName() << ".\n");
+ StackTop = 0;
+ // Get the live-in bundle for MBB.
+ const LiveBundle &Bundle =
+ LiveBundles[Bundles->getBundle(MBB->getNumber(), false)];
+
+ if (!Bundle.Mask) {
+ DEBUG(dbgs() << "Block has no FP live-ins.\n");
+ return;
+ }
+
+ // Depth-first iteration should ensure that we always have an assigned stack.
+ assert(Bundle.isFixed() && "Reached block before any predecessors");
+
+ // Push the fixed live-in registers.
+ for (unsigned i = Bundle.FixCount; i > 0; --i) {
+ MBB->addLiveIn(X86::ST0+i-1);
+ DEBUG(dbgs() << "Live-in st(" << (i-1) << "): %FP"
+ << unsigned(Bundle.FixStack[i-1]) << '\n');
+ pushReg(Bundle.FixStack[i-1]);
+ }
+
+ // Kill off unwanted live-ins. This can happen with a critical edge.
+ // FIXME: We could keep these live registers around as zombies. They may need
+ // to be revived at the end of a short block. It might save a few instrs.
+ adjustLiveRegs(calcLiveInMask(MBB), MBB->begin());
+ DEBUG(MBB->dump());
+}
+
+/// finishBlockStack - Revive live-outs that are implicitly defined out of
+/// MBB. Shuffle live registers to match the expected fixed stack of any
+/// predecessors, and ensure that all predecessors are expecting the same
+/// stack.
+void FPS::finishBlockStack() {
+ // The RET handling below takes care of return blocks for us.
+ if (MBB->succ_empty())
+ return;
+
+ DEBUG(dbgs() << "Setting up live-outs for BB#" << MBB->getNumber()
+ << " derived from " << MBB->getName() << ".\n");
+
+ // Get MBB's live-out bundle.
+ unsigned BundleIdx = Bundles->getBundle(MBB->getNumber(), true);
+ LiveBundle &Bundle = LiveBundles[BundleIdx];
+
+ // We may need to kill and define some registers to match successors.
+ // FIXME: This can probably be combined with the shuffle below.
+ MachineBasicBlock::iterator Term = MBB->getFirstTerminator();
+ adjustLiveRegs(Bundle.Mask, Term);
+
+ if (!Bundle.Mask) {
+ DEBUG(dbgs() << "No live-outs.\n");
+ return;
+ }
+
+ // Has the stack order been fixed yet?
+ DEBUG(dbgs() << "LB#" << BundleIdx << ": ");
+ if (Bundle.isFixed()) {
+ DEBUG(dbgs() << "Shuffling stack to match.\n");
+ shuffleStackTop(Bundle.FixStack, Bundle.FixCount, Term);
+ } else {
+ // Not fixed yet, we get to choose.
+ DEBUG(dbgs() << "Fixing stack order now.\n");
+ Bundle.FixCount = StackTop;
+ for (unsigned i = 0; i < StackTop; ++i)
+ Bundle.FixStack[i] = getStackEntry(i);
+ }
+}
+
+
+//===----------------------------------------------------------------------===//
+// Efficient Lookup Table Support
+//===----------------------------------------------------------------------===//
+
+namespace {
+ struct TableEntry {
+ uint16_t from;
+ uint16_t to;
+ bool operator<(const TableEntry &TE) const { return from < TE.from; }
+ friend bool operator<(const TableEntry &TE, unsigned V) {
+ return TE.from < V;
+ }
+ friend bool LLVM_ATTRIBUTE_UNUSED operator<(unsigned V,
+ const TableEntry &TE) {
+ return V < TE.from;
+ }
+ };
+}
+
+static int Lookup(ArrayRef<TableEntry> Table, unsigned Opcode) {
+ const TableEntry *I = std::lower_bound(Table.begin(), Table.end(), Opcode);
+ if (I != Table.end() && I->from == Opcode)
+ return I->to;
+ return -1;
+}
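+
+// For instance, Lookup(OpcodeTable, X86::ABS_Fp32) binary-searches the sorted
+// table below and returns X86::ABS_F; opcodes not in the table map to -1.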
+
+#ifdef NDEBUG
+#define ASSERT_SORTED(TABLE)
+#else
+#define ASSERT_SORTED(TABLE) \
+ { static bool TABLE##Checked = false; \
+ if (!TABLE##Checked) { \
+ assert(std::is_sorted(std::begin(TABLE), std::end(TABLE)) && \
+ "All lookup tables must be sorted for efficient access!"); \
+ TABLE##Checked = true; \
+ } \
+ }
+#endif
+
+//===----------------------------------------------------------------------===//
+// Register File -> Register Stack Mapping Methods
+//===----------------------------------------------------------------------===//
+
+// OpcodeTable - Sorted map of register instructions to their stack version.
+// The first element is a register-file pseudo instruction, the second is the
+// concrete X86 instruction which uses the register stack.
+//
+static const TableEntry OpcodeTable[] = {
+ { X86::ABS_Fp32 , X86::ABS_F },
+ { X86::ABS_Fp64 , X86::ABS_F },
+ { X86::ABS_Fp80 , X86::ABS_F },
+ { X86::ADD_Fp32m , X86::ADD_F32m },
+ { X86::ADD_Fp64m , X86::ADD_F64m },
+ { X86::ADD_Fp64m32 , X86::ADD_F32m },
+ { X86::ADD_Fp80m32 , X86::ADD_F32m },
+ { X86::ADD_Fp80m64 , X86::ADD_F64m },
+ { X86::ADD_FpI16m32 , X86::ADD_FI16m },
+ { X86::ADD_FpI16m64 , X86::ADD_FI16m },
+ { X86::ADD_FpI16m80 , X86::ADD_FI16m },
+ { X86::ADD_FpI32m32 , X86::ADD_FI32m },
+ { X86::ADD_FpI32m64 , X86::ADD_FI32m },
+ { X86::ADD_FpI32m80 , X86::ADD_FI32m },
+ { X86::CHS_Fp32 , X86::CHS_F },
+ { X86::CHS_Fp64 , X86::CHS_F },
+ { X86::CHS_Fp80 , X86::CHS_F },
+ { X86::CMOVBE_Fp32 , X86::CMOVBE_F },
+ { X86::CMOVBE_Fp64 , X86::CMOVBE_F },
+ { X86::CMOVBE_Fp80 , X86::CMOVBE_F },
+ { X86::CMOVB_Fp32 , X86::CMOVB_F },
+ { X86::CMOVB_Fp64 , X86::CMOVB_F },
+ { X86::CMOVB_Fp80 , X86::CMOVB_F },
+ { X86::CMOVE_Fp32 , X86::CMOVE_F },
+ { X86::CMOVE_Fp64 , X86::CMOVE_F },
+ { X86::CMOVE_Fp80 , X86::CMOVE_F },
+ { X86::CMOVNBE_Fp32 , X86::CMOVNBE_F },
+ { X86::CMOVNBE_Fp64 , X86::CMOVNBE_F },
+ { X86::CMOVNBE_Fp80 , X86::CMOVNBE_F },
+ { X86::CMOVNB_Fp32 , X86::CMOVNB_F },
+ { X86::CMOVNB_Fp64 , X86::CMOVNB_F },
+ { X86::CMOVNB_Fp80 , X86::CMOVNB_F },
+ { X86::CMOVNE_Fp32 , X86::CMOVNE_F },
+ { X86::CMOVNE_Fp64 , X86::CMOVNE_F },
+ { X86::CMOVNE_Fp80 , X86::CMOVNE_F },
+ { X86::CMOVNP_Fp32 , X86::CMOVNP_F },
+ { X86::CMOVNP_Fp64 , X86::CMOVNP_F },
+ { X86::CMOVNP_Fp80 , X86::CMOVNP_F },
+ { X86::CMOVP_Fp32 , X86::CMOVP_F },
+ { X86::CMOVP_Fp64 , X86::CMOVP_F },
+ { X86::CMOVP_Fp80 , X86::CMOVP_F },
+ { X86::COS_Fp32 , X86::COS_F },
+ { X86::COS_Fp64 , X86::COS_F },
+ { X86::COS_Fp80 , X86::COS_F },
+ { X86::DIVR_Fp32m , X86::DIVR_F32m },
+ { X86::DIVR_Fp64m , X86::DIVR_F64m },
+ { X86::DIVR_Fp64m32 , X86::DIVR_F32m },
+ { X86::DIVR_Fp80m32 , X86::DIVR_F32m },
+ { X86::DIVR_Fp80m64 , X86::DIVR_F64m },
+ { X86::DIVR_FpI16m32, X86::DIVR_FI16m},
+ { X86::DIVR_FpI16m64, X86::DIVR_FI16m},
+ { X86::DIVR_FpI16m80, X86::DIVR_FI16m},
+ { X86::DIVR_FpI32m32, X86::DIVR_FI32m},
+ { X86::DIVR_FpI32m64, X86::DIVR_FI32m},
+ { X86::DIVR_FpI32m80, X86::DIVR_FI32m},
+ { X86::DIV_Fp32m , X86::DIV_F32m },
+ { X86::DIV_Fp64m , X86::DIV_F64m },
+ { X86::DIV_Fp64m32 , X86::DIV_F32m },
+ { X86::DIV_Fp80m32 , X86::DIV_F32m },
+ { X86::DIV_Fp80m64 , X86::DIV_F64m },
+ { X86::DIV_FpI16m32 , X86::DIV_FI16m },
+ { X86::DIV_FpI16m64 , X86::DIV_FI16m },
+ { X86::DIV_FpI16m80 , X86::DIV_FI16m },
+ { X86::DIV_FpI32m32 , X86::DIV_FI32m },
+ { X86::DIV_FpI32m64 , X86::DIV_FI32m },
+ { X86::DIV_FpI32m80 , X86::DIV_FI32m },
+ { X86::ILD_Fp16m32 , X86::ILD_F16m },
+ { X86::ILD_Fp16m64 , X86::ILD_F16m },
+ { X86::ILD_Fp16m80 , X86::ILD_F16m },
+ { X86::ILD_Fp32m32 , X86::ILD_F32m },
+ { X86::ILD_Fp32m64 , X86::ILD_F32m },
+ { X86::ILD_Fp32m80 , X86::ILD_F32m },
+ { X86::ILD_Fp64m32 , X86::ILD_F64m },
+ { X86::ILD_Fp64m64 , X86::ILD_F64m },
+ { X86::ILD_Fp64m80 , X86::ILD_F64m },
+ { X86::ISTT_Fp16m32 , X86::ISTT_FP16m},
+ { X86::ISTT_Fp16m64 , X86::ISTT_FP16m},
+ { X86::ISTT_Fp16m80 , X86::ISTT_FP16m},
+ { X86::ISTT_Fp32m32 , X86::ISTT_FP32m},
+ { X86::ISTT_Fp32m64 , X86::ISTT_FP32m},
+ { X86::ISTT_Fp32m80 , X86::ISTT_FP32m},
+ { X86::ISTT_Fp64m32 , X86::ISTT_FP64m},
+ { X86::ISTT_Fp64m64 , X86::ISTT_FP64m},
+ { X86::ISTT_Fp64m80 , X86::ISTT_FP64m},
+ { X86::IST_Fp16m32 , X86::IST_F16m },
+ { X86::IST_Fp16m64 , X86::IST_F16m },
+ { X86::IST_Fp16m80 , X86::IST_F16m },
+ { X86::IST_Fp32m32 , X86::IST_F32m },
+ { X86::IST_Fp32m64 , X86::IST_F32m },
+ { X86::IST_Fp32m80 , X86::IST_F32m },
+ { X86::IST_Fp64m32 , X86::IST_FP64m },
+ { X86::IST_Fp64m64 , X86::IST_FP64m },
+ { X86::IST_Fp64m80 , X86::IST_FP64m },
+ { X86::LD_Fp032 , X86::LD_F0 },
+ { X86::LD_Fp064 , X86::LD_F0 },
+ { X86::LD_Fp080 , X86::LD_F0 },
+ { X86::LD_Fp132 , X86::LD_F1 },
+ { X86::LD_Fp164 , X86::LD_F1 },
+ { X86::LD_Fp180 , X86::LD_F1 },
+ { X86::LD_Fp32m , X86::LD_F32m },
+ { X86::LD_Fp32m64 , X86::LD_F32m },
+ { X86::LD_Fp32m80 , X86::LD_F32m },
+ { X86::LD_Fp64m , X86::LD_F64m },
+ { X86::LD_Fp64m80 , X86::LD_F64m },
+ { X86::LD_Fp80m , X86::LD_F80m },
+ { X86::MUL_Fp32m , X86::MUL_F32m },
+ { X86::MUL_Fp64m , X86::MUL_F64m },
+ { X86::MUL_Fp64m32 , X86::MUL_F32m },
+ { X86::MUL_Fp80m32 , X86::MUL_F32m },
+ { X86::MUL_Fp80m64 , X86::MUL_F64m },
+ { X86::MUL_FpI16m32 , X86::MUL_FI16m },
+ { X86::MUL_FpI16m64 , X86::MUL_FI16m },
+ { X86::MUL_FpI16m80 , X86::MUL_FI16m },
+ { X86::MUL_FpI32m32 , X86::MUL_FI32m },
+ { X86::MUL_FpI32m64 , X86::MUL_FI32m },
+ { X86::MUL_FpI32m80 , X86::MUL_FI32m },
+ { X86::SIN_Fp32 , X86::SIN_F },
+ { X86::SIN_Fp64 , X86::SIN_F },
+ { X86::SIN_Fp80 , X86::SIN_F },
+ { X86::SQRT_Fp32 , X86::SQRT_F },
+ { X86::SQRT_Fp64 , X86::SQRT_F },
+ { X86::SQRT_Fp80 , X86::SQRT_F },
+ { X86::ST_Fp32m , X86::ST_F32m },
+ { X86::ST_Fp64m , X86::ST_F64m },
+ { X86::ST_Fp64m32 , X86::ST_F32m },
+ { X86::ST_Fp80m32 , X86::ST_F32m },
+ { X86::ST_Fp80m64 , X86::ST_F64m },
+ { X86::ST_FpP80m , X86::ST_FP80m },
+ { X86::SUBR_Fp32m , X86::SUBR_F32m },
+ { X86::SUBR_Fp64m , X86::SUBR_F64m },
+ { X86::SUBR_Fp64m32 , X86::SUBR_F32m },
+ { X86::SUBR_Fp80m32 , X86::SUBR_F32m },
+ { X86::SUBR_Fp80m64 , X86::SUBR_F64m },
+ { X86::SUBR_FpI16m32, X86::SUBR_FI16m},
+ { X86::SUBR_FpI16m64, X86::SUBR_FI16m},
+ { X86::SUBR_FpI16m80, X86::SUBR_FI16m},
+ { X86::SUBR_FpI32m32, X86::SUBR_FI32m},
+ { X86::SUBR_FpI32m64, X86::SUBR_FI32m},
+ { X86::SUBR_FpI32m80, X86::SUBR_FI32m},
+ { X86::SUB_Fp32m , X86::SUB_F32m },
+ { X86::SUB_Fp64m , X86::SUB_F64m },
+ { X86::SUB_Fp64m32 , X86::SUB_F32m },
+ { X86::SUB_Fp80m32 , X86::SUB_F32m },
+ { X86::SUB_Fp80m64 , X86::SUB_F64m },
+ { X86::SUB_FpI16m32 , X86::SUB_FI16m },
+ { X86::SUB_FpI16m64 , X86::SUB_FI16m },
+ { X86::SUB_FpI16m80 , X86::SUB_FI16m },
+ { X86::SUB_FpI32m32 , X86::SUB_FI32m },
+ { X86::SUB_FpI32m64 , X86::SUB_FI32m },
+ { X86::SUB_FpI32m80 , X86::SUB_FI32m },
+ { X86::TST_Fp32 , X86::TST_F },
+ { X86::TST_Fp64 , X86::TST_F },
+ { X86::TST_Fp80 , X86::TST_F },
+ { X86::UCOM_FpIr32 , X86::UCOM_FIr },
+ { X86::UCOM_FpIr64 , X86::UCOM_FIr },
+ { X86::UCOM_FpIr80 , X86::UCOM_FIr },
+ { X86::UCOM_Fpr32 , X86::UCOM_Fr },
+ { X86::UCOM_Fpr64 , X86::UCOM_Fr },
+ { X86::UCOM_Fpr80 , X86::UCOM_Fr },
+};
+
+static unsigned getConcreteOpcode(unsigned Opcode) {
+ ASSERT_SORTED(OpcodeTable);
+ int Opc = Lookup(OpcodeTable, Opcode);
+ assert(Opc != -1 && "FP Stack instruction not in OpcodeTable!");
+ return Opc;
+}
+
+//===----------------------------------------------------------------------===//
+// Helper Methods
+//===----------------------------------------------------------------------===//
+
+// PopTable - Sorted map of instructions to their popping version. The first
+// element is an instruction, the second is the version which pops.
+//
+static const TableEntry PopTable[] = {
+ { X86::ADD_FrST0 , X86::ADD_FPrST0 },
+
+ { X86::DIVR_FrST0, X86::DIVR_FPrST0 },
+ { X86::DIV_FrST0 , X86::DIV_FPrST0 },
+
+ { X86::IST_F16m , X86::IST_FP16m },
+ { X86::IST_F32m , X86::IST_FP32m },
+
+ { X86::MUL_FrST0 , X86::MUL_FPrST0 },
+
+ { X86::ST_F32m , X86::ST_FP32m },
+ { X86::ST_F64m , X86::ST_FP64m },
+ { X86::ST_Frr , X86::ST_FPrr },
+
+ { X86::SUBR_FrST0, X86::SUBR_FPrST0 },
+ { X86::SUB_FrST0 , X86::SUB_FPrST0 },
+
+ { X86::UCOM_FIr , X86::UCOM_FIPr },
+
+ { X86::UCOM_FPr , X86::UCOM_FPPr },
+ { X86::UCOM_Fr , X86::UCOM_FPr },
+};
+
+/// popStackAfter - Pop the current value off of the top of the FP stack after
+/// the specified instruction. This attempts to be sneaky and combine the pop
+/// into the instruction itself if possible. The iterator is left pointing to
+/// the last instruction, be it a new pop instruction inserted, or the old
+/// instruction if it was modified in place.
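+/// For example, a plain store ST_F32m (fst) is rewritten in place to its
+/// popping form ST_FP32m (fstp) via PopTable, avoiding a separate pop.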
+///
+void FPS::popStackAfter(MachineBasicBlock::iterator &I) {
+ MachineInstr* MI = I;
+ DebugLoc dl = MI->getDebugLoc();
+ ASSERT_SORTED(PopTable);
+ if (StackTop == 0)
+ report_fatal_error("Cannot pop empty stack!");
+ RegMap[Stack[--StackTop]] = ~0; // Update state
+
+ // Check to see if there is a popping version of this instruction...
+ int Opcode = Lookup(PopTable, I->getOpcode());
+ if (Opcode != -1) {
+ I->setDesc(TII->get(Opcode));
+ if (Opcode == X86::UCOM_FPPr)
+ I->RemoveOperand(0);
+ } else { // Insert an explicit pop
+ I = BuildMI(*MBB, ++I, dl, TII->get(X86::ST_FPrr)).addReg(X86::ST0);
+ }
+}
+
+/// freeStackSlotAfter - Free the specified register from the register stack, so
+/// that it is no longer in a register. If the register is currently at the top
+/// of the stack, we just pop the current instruction, otherwise we store the
+/// current top-of-stack into the specified slot, then pop the top of stack.
+void FPS::freeStackSlotAfter(MachineBasicBlock::iterator &I, unsigned FPRegNo) {
+ if (getStackEntry(0) == FPRegNo) { // already at the top of stack? easy.
+ popStackAfter(I);
+ return;
+ }
+
+ // Otherwise, store the top of stack into the dead slot, killing the operand
+ // without having to add in an explicit xchg then pop.
+ //
+ I = freeStackSlotBefore(++I, FPRegNo);
+}
+
+/// freeStackSlotBefore - Free the specified register without trying any
+/// folding.
+MachineBasicBlock::iterator
+FPS::freeStackSlotBefore(MachineBasicBlock::iterator I, unsigned FPRegNo) {
+ unsigned STReg = getSTReg(FPRegNo);
+ unsigned OldSlot = getSlot(FPRegNo);
+ unsigned TopReg = Stack[StackTop-1];
+ Stack[OldSlot] = TopReg;
+ RegMap[TopReg] = OldSlot;
+ RegMap[FPRegNo] = ~0;
+ Stack[--StackTop] = ~0;
+ return BuildMI(*MBB, I, DebugLoc(), TII->get(X86::ST_FPrr))
+ .addReg(STReg)
+ .getInstr();
+}
+
+/// adjustLiveRegs - Kill and revive registers such that exactly the FP
+/// registers with a bit in Mask are live.
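+/// Rather than popping unwanted registers and loading zeros for missing
+/// ones, killed slots are first renamed to pending defs for free; e.g. a
+/// live-but-unwanted FP1 can simply become a required FP2 in place.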
+void FPS::adjustLiveRegs(unsigned Mask, MachineBasicBlock::iterator I) {
+ unsigned Defs = Mask;
+ unsigned Kills = 0;
+ for (unsigned i = 0; i < StackTop; ++i) {
+ unsigned RegNo = Stack[i];
+ if (!(Defs & (1 << RegNo)))
+ // This register is live, but we don't want it.
+ Kills |= (1 << RegNo);
+ else
+ // We don't need to imp-def this live register.
+ Defs &= ~(1 << RegNo);
+ }
+ assert((Kills & Defs) == 0 && "Register needs killing and def'ing?");
+
+ // Produce implicit-defs for free by using killed registers.
+ while (Kills && Defs) {
+ unsigned KReg = countTrailingZeros(Kills);
+ unsigned DReg = countTrailingZeros(Defs);
+ DEBUG(dbgs() << "Renaming %FP" << KReg << " as imp %FP" << DReg << "\n");
+ std::swap(Stack[getSlot(KReg)], Stack[getSlot(DReg)]);
+ std::swap(RegMap[KReg], RegMap[DReg]);
+ Kills &= ~(1 << KReg);
+ Defs &= ~(1 << DReg);
+ }
+
+ // Kill registers by popping.
+ if (Kills && I != MBB->begin()) {
+ MachineBasicBlock::iterator I2 = std::prev(I);
+ while (StackTop) {
+ unsigned KReg = getStackEntry(0);
+ if (!(Kills & (1 << KReg)))
+ break;
+ DEBUG(dbgs() << "Popping %FP" << KReg << "\n");
+ popStackAfter(I2);
+ Kills &= ~(1 << KReg);
+ }
+ }
+
+ // Manually kill the rest.
+ while (Kills) {
+ unsigned KReg = countTrailingZeros(Kills);
+ DEBUG(dbgs() << "Killing %FP" << KReg << "\n");
+ freeStackSlotBefore(I, KReg);
+ Kills &= ~(1 << KReg);
+ }
+
+ // Load zeros for all the imp-defs.
+ while(Defs) {
+ unsigned DReg = countTrailingZeros(Defs);
+ DEBUG(dbgs() << "Defining %FP" << DReg << " as 0\n");
+ BuildMI(*MBB, I, DebugLoc(), TII->get(X86::LD_F0));
+ pushReg(DReg);
+ Defs &= ~(1 << DReg);
+ }
+
+ // Now we should have the correct registers live.
+ DEBUG(dumpStack());
+ assert(StackTop == countPopulation(Mask) && "Live count mismatch");
+}
+
+/// shuffleStackTop - emit fxch instructions before I to shuffle the top
+/// FixCount entries into the order given by FixStack.
+/// FIXME: Is there a better algorithm than insertion sort?
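+/// For example, FixStack = {3, 1} with FixCount == 2 asks for FP3 in st(0)
+/// and FP1 in st(1); each misplaced entry costs at most two fxch
+/// instructions.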
+void FPS::shuffleStackTop(const unsigned char *FixStack,
+ unsigned FixCount,
+ MachineBasicBlock::iterator I) {
+ // Move items into place, starting from the desired stack bottom.
+ while (FixCount--) {
+ // Old register at position FixCount.
+ unsigned OldReg = getStackEntry(FixCount);
+ // Desired register at position FixCount.
+ unsigned Reg = FixStack[FixCount];
+ if (Reg == OldReg)
+ continue;
+ // (Reg st0) (OldReg st0) = (Reg OldReg st0)
+ moveToTop(Reg, I);
+ if (FixCount > 0)
+ moveToTop(OldReg, I);
+ }
+ DEBUG(dumpStack());
+}
+
+
+//===----------------------------------------------------------------------===//
+// Instruction transformation implementation
+//===----------------------------------------------------------------------===//
+
+void FPS::handleCall(MachineBasicBlock::iterator &I) {
+ unsigned STReturns = 0;
+
+ for (const auto &MO : I->operands()) {
+ if (!MO.isReg())
+ continue;
+
+ unsigned R = MO.getReg() - X86::FP0;
+
+ if (R < 8) {
+ assert(MO.isDef() && MO.isImplicit());
+ STReturns |= 1 << R;
+ }
+ }
+
+ unsigned N = countTrailingOnes(STReturns);
+
+ // FP registers used for function return must be consecutive starting at
+ // FP0.
+ assert(STReturns == 0 || (isMask_32(STReturns) && N <= 2));
+
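+  // Push the returned registers so that FP0 ends up in st(0); e.g. a call
+  // defining FP0 and FP1 (a two-register FP return) pushes FP1 first, then
+  // FP0.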
+ for (unsigned I = 0; I < N; ++I)
+ pushReg(N - I - 1);
+}
+
+/// handleZeroArgFP - ST(0) = fld0 ST(0) = flds <mem>
+///
+void FPS::handleZeroArgFP(MachineBasicBlock::iterator &I) {
+ MachineInstr *MI = I;
+ unsigned DestReg = getFPReg(MI->getOperand(0));
+
+ // Change from the pseudo instruction to the concrete instruction.
+ MI->RemoveOperand(0); // Remove the explicit ST(0) operand
+ MI->setDesc(TII->get(getConcreteOpcode(MI->getOpcode())));
+
+ // Result gets pushed on the stack.
+ pushReg(DestReg);
+}
+
+/// handleOneArgFP - fst <mem>, ST(0)
+///
+void FPS::handleOneArgFP(MachineBasicBlock::iterator &I) {
+ MachineInstr *MI = I;
+ unsigned NumOps = MI->getDesc().getNumOperands();
+ assert((NumOps == X86::AddrNumOperands + 1 || NumOps == 1) &&
+ "Can only handle fst* & ftst instructions!");
+
+ // Is this the last use of the source register?
+ unsigned Reg = getFPReg(MI->getOperand(NumOps-1));
+ bool KillsSrc = MI->killsRegister(X86::FP0+Reg);
+
+  // FISTP64m is strange because there isn't a non-popping version.
+  // If we have one _and_ we don't want to pop the operand, duplicate the value
+  // on the stack instead of moving it. This ensures that popping the value is
+  // always ok.
+ // Ditto FISTTP16m, FISTTP32m, FISTTP64m, ST_FpP80m.
+ //
+ if (!KillsSrc &&
+ (MI->getOpcode() == X86::IST_Fp64m32 ||
+ MI->getOpcode() == X86::ISTT_Fp16m32 ||
+ MI->getOpcode() == X86::ISTT_Fp32m32 ||
+ MI->getOpcode() == X86::ISTT_Fp64m32 ||
+ MI->getOpcode() == X86::IST_Fp64m64 ||
+ MI->getOpcode() == X86::ISTT_Fp16m64 ||
+ MI->getOpcode() == X86::ISTT_Fp32m64 ||
+ MI->getOpcode() == X86::ISTT_Fp64m64 ||
+ MI->getOpcode() == X86::IST_Fp64m80 ||
+ MI->getOpcode() == X86::ISTT_Fp16m80 ||
+ MI->getOpcode() == X86::ISTT_Fp32m80 ||
+ MI->getOpcode() == X86::ISTT_Fp64m80 ||
+ MI->getOpcode() == X86::ST_FpP80m)) {
+ duplicateToTop(Reg, ScratchFPReg, I);
+ } else {
+ moveToTop(Reg, I); // Move to the top of the stack...
+ }
+
+ // Convert from the pseudo instruction to the concrete instruction.
+ MI->RemoveOperand(NumOps-1); // Remove explicit ST(0) operand
+ MI->setDesc(TII->get(getConcreteOpcode(MI->getOpcode())));
+
+ if (MI->getOpcode() == X86::IST_FP64m ||
+ MI->getOpcode() == X86::ISTT_FP16m ||
+ MI->getOpcode() == X86::ISTT_FP32m ||
+ MI->getOpcode() == X86::ISTT_FP64m ||
+ MI->getOpcode() == X86::ST_FP80m) {
+ if (StackTop == 0)
+ report_fatal_error("Stack empty??");
+ --StackTop;
+ } else if (KillsSrc) { // Last use of operand?
+ popStackAfter(I);
+ }
+}
+
+
+/// handleOneArgFPRW: Handle instructions that read from the top of stack and
+/// replace the value with a newly computed value. These instructions may have
+/// non-fp operands after their FP operands.
+///
+/// Examples:
+/// R1 = fchs R2
+/// R1 = fadd R2, [mem]
+///
+void FPS::handleOneArgFPRW(MachineBasicBlock::iterator &I) {
+ MachineInstr *MI = I;
+#ifndef NDEBUG
+ unsigned NumOps = MI->getDesc().getNumOperands();
+ assert(NumOps >= 2 && "FPRW instructions must have 2 ops!!");
+#endif
+
+ // Is this the last use of the source register?
+ unsigned Reg = getFPReg(MI->getOperand(1));
+ bool KillsSrc = MI->killsRegister(X86::FP0+Reg);
+
+ if (KillsSrc) {
+ // If this is the last use of the source register, just make sure it's on
+ // the top of the stack.
+ moveToTop(Reg, I);
+ if (StackTop == 0)
+ report_fatal_error("Stack cannot be empty!");
+ --StackTop;
+ pushReg(getFPReg(MI->getOperand(0)));
+ } else {
+ // If this is not the last use of the source register, _copy_ it to the top
+ // of the stack.
+ duplicateToTop(Reg, getFPReg(MI->getOperand(0)), I);
+ }
+
+ // Change from the pseudo instruction to the concrete instruction.
+ MI->RemoveOperand(1); // Drop the source operand.
+ MI->RemoveOperand(0); // Drop the destination operand.
+ MI->setDesc(TII->get(getConcreteOpcode(MI->getOpcode())));
+}
+
+
+//===----------------------------------------------------------------------===//
+// Define tables of various ways to map pseudo instructions
+//
+
+// ForwardST0Table - Map: A = B op C into: ST(0) = ST(0) op ST(i)
+static const TableEntry ForwardST0Table[] = {
+ { X86::ADD_Fp32 , X86::ADD_FST0r },
+ { X86::ADD_Fp64 , X86::ADD_FST0r },
+ { X86::ADD_Fp80 , X86::ADD_FST0r },
+ { X86::DIV_Fp32 , X86::DIV_FST0r },
+ { X86::DIV_Fp64 , X86::DIV_FST0r },
+ { X86::DIV_Fp80 , X86::DIV_FST0r },
+ { X86::MUL_Fp32 , X86::MUL_FST0r },
+ { X86::MUL_Fp64 , X86::MUL_FST0r },
+ { X86::MUL_Fp80 , X86::MUL_FST0r },
+ { X86::SUB_Fp32 , X86::SUB_FST0r },
+ { X86::SUB_Fp64 , X86::SUB_FST0r },
+ { X86::SUB_Fp80 , X86::SUB_FST0r },
+};
+
+// ReverseST0Table - Map: A = B op C into: ST(0) = ST(i) op ST(0)
+static const TableEntry ReverseST0Table[] = {
+ { X86::ADD_Fp32 , X86::ADD_FST0r }, // commutative
+ { X86::ADD_Fp64 , X86::ADD_FST0r }, // commutative
+ { X86::ADD_Fp80 , X86::ADD_FST0r }, // commutative
+ { X86::DIV_Fp32 , X86::DIVR_FST0r },
+ { X86::DIV_Fp64 , X86::DIVR_FST0r },
+ { X86::DIV_Fp80 , X86::DIVR_FST0r },
+ { X86::MUL_Fp32 , X86::MUL_FST0r }, // commutative
+ { X86::MUL_Fp64 , X86::MUL_FST0r }, // commutative
+ { X86::MUL_Fp80 , X86::MUL_FST0r }, // commutative
+ { X86::SUB_Fp32 , X86::SUBR_FST0r },
+ { X86::SUB_Fp64 , X86::SUBR_FST0r },
+ { X86::SUB_Fp80 , X86::SUBR_FST0r },
+};
+
+// ForwardSTiTable - Map: A = B op C into: ST(i) = ST(0) op ST(i)
+static const TableEntry ForwardSTiTable[] = {
+ { X86::ADD_Fp32 , X86::ADD_FrST0 }, // commutative
+ { X86::ADD_Fp64 , X86::ADD_FrST0 }, // commutative
+ { X86::ADD_Fp80 , X86::ADD_FrST0 }, // commutative
+ { X86::DIV_Fp32 , X86::DIVR_FrST0 },
+ { X86::DIV_Fp64 , X86::DIVR_FrST0 },
+ { X86::DIV_Fp80 , X86::DIVR_FrST0 },
+ { X86::MUL_Fp32 , X86::MUL_FrST0 }, // commutative
+ { X86::MUL_Fp64 , X86::MUL_FrST0 }, // commutative
+ { X86::MUL_Fp80 , X86::MUL_FrST0 }, // commutative
+ { X86::SUB_Fp32 , X86::SUBR_FrST0 },
+ { X86::SUB_Fp64 , X86::SUBR_FrST0 },
+ { X86::SUB_Fp80 , X86::SUBR_FrST0 },
+};
+
+// ReverseSTiTable - Map: A = B op C into: ST(i) = ST(i) op ST(0)
+static const TableEntry ReverseSTiTable[] = {
+ { X86::ADD_Fp32 , X86::ADD_FrST0 },
+ { X86::ADD_Fp64 , X86::ADD_FrST0 },
+ { X86::ADD_Fp80 , X86::ADD_FrST0 },
+ { X86::DIV_Fp32 , X86::DIV_FrST0 },
+ { X86::DIV_Fp64 , X86::DIV_FrST0 },
+ { X86::DIV_Fp80 , X86::DIV_FrST0 },
+ { X86::MUL_Fp32 , X86::MUL_FrST0 },
+ { X86::MUL_Fp64 , X86::MUL_FrST0 },
+ { X86::MUL_Fp80 , X86::MUL_FrST0 },
+ { X86::SUB_Fp32 , X86::SUB_FrST0 },
+ { X86::SUB_Fp64 , X86::SUB_FrST0 },
+ { X86::SUB_Fp80 , X86::SUB_FrST0 },
+};
+
+
+/// handleTwoArgFP - Handle instructions like FADD and friends which are virtual
+/// instructions which need to be simplified and possibly transformed.
+///
+/// Result: ST(0) = fsub ST(0), ST(i)
+/// ST(i) = fsub ST(0), ST(i)
+/// ST(0) = fsubr ST(0), ST(i)
+/// ST(i) = fsubr ST(0), ST(i)
+///
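+/// For example, if Op0 is already in st(0) and Op1 stays live afterwards,
+/// the forward ST(0) form (ForwardST0Table) is chosen and st(0) is updated
+/// in place.
+///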
+void FPS::handleTwoArgFP(MachineBasicBlock::iterator &I) {
+ ASSERT_SORTED(ForwardST0Table); ASSERT_SORTED(ReverseST0Table);
+ ASSERT_SORTED(ForwardSTiTable); ASSERT_SORTED(ReverseSTiTable);
+ MachineInstr *MI = I;
+
+ unsigned NumOperands = MI->getDesc().getNumOperands();
+ assert(NumOperands == 3 && "Illegal TwoArgFP instruction!");
+ unsigned Dest = getFPReg(MI->getOperand(0));
+ unsigned Op0 = getFPReg(MI->getOperand(NumOperands-2));
+ unsigned Op1 = getFPReg(MI->getOperand(NumOperands-1));
+ bool KillsOp0 = MI->killsRegister(X86::FP0+Op0);
+ bool KillsOp1 = MI->killsRegister(X86::FP0+Op1);
+ DebugLoc dl = MI->getDebugLoc();
+
+ unsigned TOS = getStackEntry(0);
+
+ // One of our operands must be on the top of the stack. If neither is yet, we
+ // need to move one.
+ if (Op0 != TOS && Op1 != TOS) { // No operand at TOS?
+ // We can choose to move either operand to the top of the stack. If one of
+ // the operands is killed by this instruction, we want that one so that we
+ // can update right on top of the old version.
+ if (KillsOp0) {
+ moveToTop(Op0, I); // Move dead operand to TOS.
+ TOS = Op0;
+ } else if (KillsOp1) {
+ moveToTop(Op1, I);
+ TOS = Op1;
+ } else {
+ // All of the operands are live after this instruction executes, so we
+ // cannot update on top of any operand. Because of this, we must
+ // duplicate one of the stack elements to the top. It doesn't matter
+ // which one we pick.
+ //
+ duplicateToTop(Op0, Dest, I);
+ Op0 = TOS = Dest;
+ KillsOp0 = true;
+ }
+ } else if (!KillsOp0 && !KillsOp1) {
+ // If we DO have one of our operands at the top of the stack, but we don't
+ // have a dead operand, we must duplicate one of the operands to a new slot
+ // on the stack.
+ duplicateToTop(Op0, Dest, I);
+ Op0 = TOS = Dest;
+ KillsOp0 = true;
+ }
+
+ // Now we know that one of our operands is on the top of the stack, and at
+ // least one of our operands is killed by this instruction.
+ assert((TOS == Op0 || TOS == Op1) && (KillsOp0 || KillsOp1) &&
+ "Stack conditions not set up right!");
+
+ // We decide which form to use based on what is on the top of the stack, and
+ // which operand is killed by this instruction.
+ ArrayRef<TableEntry> InstTable;
+ bool isForward = TOS == Op0;
+ bool updateST0 = (TOS == Op0 && !KillsOp1) || (TOS == Op1 && !KillsOp0);
+ if (updateST0) {
+ if (isForward)
+ InstTable = ForwardST0Table;
+ else
+ InstTable = ReverseST0Table;
+ } else {
+ if (isForward)
+ InstTable = ForwardSTiTable;
+ else
+ InstTable = ReverseSTiTable;
+ }
+
+ int Opcode = Lookup(InstTable, MI->getOpcode());
+ assert(Opcode != -1 && "Unknown TwoArgFP pseudo instruction!");
+
+ // NotTOS - The register which is not on the top of stack...
+ unsigned NotTOS = (TOS == Op0) ? Op1 : Op0;
+
+ // Replace the old instruction with a new instruction
+ MBB->remove(I++);
+ I = BuildMI(*MBB, I, dl, TII->get(Opcode)).addReg(getSTReg(NotTOS));
+
+ // If both operands are killed, pop one off of the stack in addition to
+ // overwriting the other one.
+ if (KillsOp0 && KillsOp1 && Op0 != Op1) {
+ assert(!updateST0 && "Should have updated other operand!");
+ popStackAfter(I); // Pop the top of stack
+ }
+
+ // Update stack information so that we know the destination register is now on
+ // the stack.
+ unsigned UpdatedSlot = getSlot(updateST0 ? TOS : NotTOS);
+ assert(UpdatedSlot < StackTop && Dest < 7);
+ Stack[UpdatedSlot] = Dest;
+ RegMap[Dest] = UpdatedSlot;
+ MBB->getParent()->DeleteMachineInstr(MI); // Remove the old instruction
+}
+
+/// handleCompareFP - Handle FUCOM and FUCOMI instructions, which have two FP
+/// register arguments and no explicit destinations.
+///
+void FPS::handleCompareFP(MachineBasicBlock::iterator &I) {
+ ASSERT_SORTED(ForwardST0Table); ASSERT_SORTED(ReverseST0Table);
+ ASSERT_SORTED(ForwardSTiTable); ASSERT_SORTED(ReverseSTiTable);
+ MachineInstr *MI = I;
+
+ unsigned NumOperands = MI->getDesc().getNumOperands();
+ assert(NumOperands == 2 && "Illegal FUCOM* instruction!");
+ unsigned Op0 = getFPReg(MI->getOperand(NumOperands-2));
+ unsigned Op1 = getFPReg(MI->getOperand(NumOperands-1));
+ bool KillsOp0 = MI->killsRegister(X86::FP0+Op0);
+ bool KillsOp1 = MI->killsRegister(X86::FP0+Op1);
+
+ // Make sure the first operand is on the top of stack, the other one can be
+ // anywhere.
+ moveToTop(Op0, I);
+
+ // Change from the pseudo instruction to the concrete instruction.
+ MI->getOperand(0).setReg(getSTReg(Op1));
+ MI->RemoveOperand(1);
+ MI->setDesc(TII->get(getConcreteOpcode(MI->getOpcode())));
+
+ // If any of the operands are killed by this instruction, free them.
+ if (KillsOp0) freeStackSlotAfter(I, Op0);
+ if (KillsOp1 && Op0 != Op1) freeStackSlotAfter(I, Op1);
+}
+
+/// handleCondMovFP - Handle two address conditional move instructions. These
+/// instructions move a st(i) register to st(0) iff a condition is true. These
+/// instructions require that the first operand is at the top of the stack, but
+/// otherwise don't modify the stack at all.
+void FPS::handleCondMovFP(MachineBasicBlock::iterator &I) {
+ MachineInstr *MI = I;
+
+ unsigned Op0 = getFPReg(MI->getOperand(0));
+ unsigned Op1 = getFPReg(MI->getOperand(2));
+ bool KillsOp1 = MI->killsRegister(X86::FP0+Op1);
+
+ // The first operand *must* be on the top of the stack.
+ moveToTop(Op0, I);
+
+ // Change the second operand to the stack register that the operand is in.
+ // Change from the pseudo instruction to the concrete instruction.
+ MI->RemoveOperand(0);
+ MI->RemoveOperand(1);
+ MI->getOperand(0).setReg(getSTReg(Op1));
+ MI->setDesc(TII->get(getConcreteOpcode(MI->getOpcode())));
+
+ // If we kill the second operand, make sure to pop it from the stack.
+ if (Op0 != Op1 && KillsOp1) {
+ // Get this value off of the register stack.
+ freeStackSlotAfter(I, Op1);
+ }
+}
+
+
+/// handleSpecialFP - Handle special instructions which behave unlike other
+/// floating point instructions. This is primarily intended for use by pseudo
+/// instructions.
+///
+void FPS::handleSpecialFP(MachineBasicBlock::iterator &Inst) {
+ MachineInstr *MI = Inst;
+
+ if (MI->isCall()) {
+ handleCall(Inst);
+ return;
+ }
+
+ switch (MI->getOpcode()) {
+ default: llvm_unreachable("Unknown SpecialFP instruction!");
+ case TargetOpcode::COPY: {
+    // Handle FP <- FP copies; both operands must be FP stack registers here.
+ const MachineOperand &MO1 = MI->getOperand(1);
+ const MachineOperand &MO0 = MI->getOperand(0);
+ bool KillsSrc = MI->killsRegister(MO1.getReg());
+
+ // FP <- FP copy.
+ unsigned DstFP = getFPReg(MO0);
+ unsigned SrcFP = getFPReg(MO1);
+ assert(isLive(SrcFP) && "Cannot copy dead register");
+ if (KillsSrc) {
+ // If the input operand is killed, we can just change the owner of the
+ // incoming stack slot into the result.
+ unsigned Slot = getSlot(SrcFP);
+ Stack[Slot] = DstFP;
+ RegMap[DstFP] = Slot;
+ } else {
+ // For COPY we just duplicate the specified value to a new stack slot.
+ // This could be made better, but would require substantial changes.
+ duplicateToTop(SrcFP, DstFP, Inst);
+ }
+ break;
+ }
+
+ case TargetOpcode::IMPLICIT_DEF: {
+ // All FP registers must be explicitly defined, so load a 0 instead.
+ unsigned Reg = MI->getOperand(0).getReg() - X86::FP0;
+ DEBUG(dbgs() << "Emitting LD_F0 for implicit FP" << Reg << '\n');
+ BuildMI(*MBB, Inst, MI->getDebugLoc(), TII->get(X86::LD_F0));
+ pushReg(Reg);
+ break;
+ }
+
+ case TargetOpcode::INLINEASM: {
+ // The inline asm MachineInstr currently only *uses* FP registers for the
+ // 'f' constraint. These should be turned into the current ST(x) register
+ // in the machine instr.
+ //
+ // There are special rules for x87 inline assembly. The compiler must know
+ // exactly how many registers are popped and pushed implicitly by the asm.
+ // Otherwise it is not possible to restore the stack state after the inline
+ // asm.
+ //
+ // There are 3 kinds of input operands:
+ //
+ // 1. Popped inputs. These must appear at the stack top in ST0-STn. A
+ // popped input operand must be in a fixed stack slot, and it is either
+ // tied to an output operand, or in the clobber list. The MI has ST use
+ // and def operands for these inputs.
+ //
+ // 2. Fixed inputs. These inputs appear in fixed stack slots, but are
+ // preserved by the inline asm. The fixed stack slots must be STn-STm
+ // following the popped inputs. A fixed input operand cannot be tied to
+ // an output or appear in the clobber list. The MI has ST use operands
+ // and no defs for these inputs.
+ //
+ // 3. Preserved inputs. These inputs use the "f" constraint which is
+ // represented as an FP register. The inline asm won't change these
+ // stack slots.
+ //
+ // Outputs must be in ST registers, FP outputs are not allowed. Clobbered
+ // registers do not count as output operands. The inline asm changes the
+ // stack as if it popped all the popped inputs and then pushed all the
+ // output operands.
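+ // Hypothetical example: an x87 asm such as
+ //   asm("fpatan" : "=t"(r) : "0"(a), "u"(b) : "st(1)");
+ // has two popped inputs -- ST0 is tied to the output and ST1 is in the
+ // clobber list -- so it is modeled as popping both inputs and pushing the
+ // single result.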
+
+ // Scan the assembly for ST registers used, defined and clobbered. We can
+ // only tell clobbers from defs by looking at the asm descriptor.
+ unsigned STUses = 0, STDefs = 0, STClobbers = 0, STDeadDefs = 0;
+ unsigned NumOps = 0;
+ SmallSet<unsigned, 1> FRegIdx;
+ unsigned RCID;
+
+ for (unsigned i = InlineAsm::MIOp_FirstOperand, e = MI->getNumOperands();
+ i != e && MI->getOperand(i).isImm(); i += 1 + NumOps) {
+ unsigned Flags = MI->getOperand(i).getImm();
+
+ NumOps = InlineAsm::getNumOperandRegisters(Flags);
+ if (NumOps != 1)
+ continue;
+ const MachineOperand &MO = MI->getOperand(i + 1);
+ if (!MO.isReg())
+ continue;
+ unsigned STReg = MO.getReg() - X86::FP0;
+ if (STReg >= 8)
+ continue;
+
+ // If the flag has a register class constraint, this must be an operand
+ // with constraint "f". Record its index and continue.
+ if (InlineAsm::hasRegClassConstraint(Flags, RCID)) {
+ FRegIdx.insert(i + 1);
+ continue;
+ }
+
+ switch (InlineAsm::getKind(Flags)) {
+ case InlineAsm::Kind_RegUse:
+ STUses |= (1u << STReg);
+ break;
+ case InlineAsm::Kind_RegDef:
+ case InlineAsm::Kind_RegDefEarlyClobber:
+ STDefs |= (1u << STReg);
+ if (MO.isDead())
+ STDeadDefs |= (1u << STReg);
+ break;
+ case InlineAsm::Kind_Clobber:
+ STClobbers |= (1u << STReg);
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (STUses && !isMask_32(STUses))
+ MI->emitError("fixed input regs must be last on the x87 stack");
+ unsigned NumSTUses = countTrailingOnes(STUses);
+
+ // Defs must be contiguous from the stack top. ST0-STn.
+ if (STDefs && !isMask_32(STDefs)) {
+ MI->emitError("output regs must be last on the x87 stack");
+ STDefs = NextPowerOf2(STDefs) - 1;
+ }
+ unsigned NumSTDefs = countTrailingOnes(STDefs);
+
+ // So must the clobbered stack slots. ST0-STm, m >= n.
+ if (STClobbers && !isMask_32(STDefs | STClobbers))
+ MI->emitError("clobbers must be last on the x87 stack");
+
+ // Popped inputs are the ones that are also clobbered or defined.
+ unsigned STPopped = STUses & (STDefs | STClobbers);
+ if (STPopped && !isMask_32(STPopped))
+ MI->emitError("implicitly popped regs must be last on the x87 stack");
+ unsigned NumSTPopped = countTrailingOnes(STPopped);
+
+ DEBUG(dbgs() << "Asm uses " << NumSTUses << " fixed regs, pops "
+ << NumSTPopped << ", and defines " << NumSTDefs << " regs.\n");
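+ // Worked example (illustrative numbers, matching the fpatan case above):
+ // STUses = 0b011, STDefs = 0b001, STClobbers = 0b010 gives
+ // STPopped = 0b011 & 0b011 = 0b011, i.e. the asm pops ST0 and ST1 and
+ // defines a single output in ST0.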
+
+#ifndef NDEBUG
+ // If any input operand uses constraint "f", all output register
+ // constraints must be early-clobber defs.
+ for (unsigned I = 0, E = MI->getNumOperands(); I < E; ++I)
+ if (FRegIdx.count(I)) {
+ assert((1 << getFPReg(MI->getOperand(I)) & STDefs) == 0 &&
+ "Operands with constraint \"f\" cannot overlap with defs");
+ }
+#endif
+
+ // Collect all FP registers (register operands with constraints "t", "u",
+ // and "f") to kill after the instruction.
+ unsigned FPKills = ((1u << NumFPRegs) - 1) & ~0xff;
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ MachineOperand &Op = MI->getOperand(i);
+ if (!Op.isReg() || Op.getReg() < X86::FP0 || Op.getReg() > X86::FP6)
+ continue;
+ unsigned FPReg = getFPReg(Op);
+
+ // If we kill this operand, make sure to pop it from the stack after the
+ // asm. We just remember it for now, and pop them all off at the end in
+ // a batch.
+ if (Op.isUse() && Op.isKill())
+ FPKills |= 1U << FPReg;
+ }
+
+ // Do not include registers that are implicitly popped by defs/clobbers.
+ FPKills &= ~(STDefs | STClobbers);
+
+ // Now we can rearrange the live registers to match what was requested.
+ unsigned char STUsesArray[8];
+
+ for (unsigned I = 0; I < NumSTUses; ++I)
+ STUsesArray[I] = I;
+
+ shuffleStackTop(STUsesArray, NumSTUses, Inst);
+ DEBUG({dbgs() << "Before asm: "; dumpStack();});
+
+ // With the stack layout fixed, rewrite the FP registers.
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ MachineOperand &Op = MI->getOperand(i);
+ if (!Op.isReg() || Op.getReg() < X86::FP0 || Op.getReg() > X86::FP6)
+ continue;
+
+ unsigned FPReg = getFPReg(Op);
+
+ if (FRegIdx.count(i))
+ // Operand with constraint "f".
+ Op.setReg(getSTReg(FPReg));
+ else
+ // Operand with a single register class constraint ("t" or "u").
+ Op.setReg(X86::ST0 + FPReg);
+ }
+
+ // Simulate the inline asm popping its inputs and pushing its outputs.
+ StackTop -= NumSTPopped;
+
+ for (unsigned i = 0; i < NumSTDefs; ++i)
+ pushReg(NumSTDefs - i - 1);
+
+ // If this asm kills any FP registers (is the last use of them) we must
+ // explicitly emit pop instructions for them. Do this now after the asm has
+ // executed so that the ST(x) numbers are not off (which would happen if we
+ // did this inline with operand rewriting).
+ //
+ // Note: this might be a non-optimal pop sequence. We might be able to do
+ // better by trying to pop in stack order or something.
+ while (FPKills) {
+ unsigned FPReg = countTrailingZeros(FPKills);
+ if (isLive(FPReg))
+ freeStackSlotAfter(Inst, FPReg);
+ FPKills &= ~(1U << FPReg);
+ }
+
+ // Don't delete the inline asm!
+ return;
+ }
+
+ case X86::RETQ:
+ case X86::RETL:
+ case X86::RETIL:
+ case X86::RETIQ:
+ // If RET has an FP register use operand, pass the first one in ST(0) and
+ // the second one in ST(1).
+
+ // Find the register operands.
+ unsigned FirstFPRegOp = ~0U, SecondFPRegOp = ~0U;
+ unsigned LiveMask = 0;
+
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ MachineOperand &Op = MI->getOperand(i);
+ if (!Op.isReg() || Op.getReg() < X86::FP0 || Op.getReg() > X86::FP6)
+ continue;
+ // FP Register uses must be kills unless there are two uses of the same
+ // register, in which case only one will be a kill.
+ assert(Op.isUse() &&
+ (Op.isKill() || // Marked kill.
+ getFPReg(Op) == FirstFPRegOp || // Second instance.
+ MI->killsRegister(Op.getReg())) && // Later use is marked kill.
+ "Ret only defs operands, and values aren't live beyond it");
+
+ if (FirstFPRegOp == ~0U)
+ FirstFPRegOp = getFPReg(Op);
+ else {
+ assert(SecondFPRegOp == ~0U && "More than two fp operands!");
+ SecondFPRegOp = getFPReg(Op);
+ }
+ LiveMask |= (1 << getFPReg(Op));
+
+ // Remove the operand so that later passes don't see it.
+ MI->RemoveOperand(i);
+ --i, --e;
+ }
+
+ // We may have been carrying spurious live-ins, so make sure only the returned
+ // registers are left live.
+ adjustLiveRegs(LiveMask, MI);
+ if (!LiveMask) return; // Quick check to see if any are possible.
+
+ // There are only four possibilities here:
+ // 1) we are returning a single FP value. In this case, it has to be in
+ // ST(0) already, so just declare success by removing the value from the
+ // FP Stack.
+ if (SecondFPRegOp == ~0U) {
+ // Assert that the top of stack contains the right FP register.
+ assert(StackTop == 1 && FirstFPRegOp == getStackEntry(0) &&
+ "Top of stack not the right register for RET!");
+
+ // Ok, everything is good, mark the value as not being on the stack
+ // anymore so that our assertion about the stack being empty at end of
+ // block doesn't fire.
+ StackTop = 0;
+ return;
+ }
+
+ // Otherwise, we are returning two values:
+ // 2) If returning the same value for both, we only have one thing in the FP
+ // stack. Consider: RET FP1, FP1
+ if (StackTop == 1) {
+ assert(FirstFPRegOp == SecondFPRegOp && FirstFPRegOp == getStackEntry(0) &&
+ "Stack misconfiguration for RET!");
+
+ // Duplicate the TOS so that we return it twice. Just pick some other FPx
+ // register to hold it.
+ unsigned NewReg = ScratchFPReg;
+ duplicateToTop(FirstFPRegOp, NewReg, MI);
+ FirstFPRegOp = NewReg;
+ }
+
+ // Okay, we know we have two different FPx operands now:
+ assert(StackTop == 2 && "Must have two values live!");
+
+ // 3) If SecondFPRegOp is currently in ST(0) and FirstFPRegOp is currently
+ // in ST(1), emit an fxch to bring FirstFPRegOp to the top.
+ if (getStackEntry(0) == SecondFPRegOp) {
+ assert(getStackEntry(1) == FirstFPRegOp && "Unknown regs live");
+ moveToTop(FirstFPRegOp, MI);
+ }
+
+ // 4) Finally, FirstFPRegOp must be in ST(0) and SecondFPRegOp must be in
+ // ST(1). Just remove both from our understanding of the stack and return.
+ assert(getStackEntry(0) == FirstFPRegOp && "Unknown regs live");
+ assert(getStackEntry(1) == SecondFPRegOp && "Unknown regs live");
+ StackTop = 0;
+ return;
+ }
+
+ Inst = MBB->erase(Inst); // Remove the pseudo instruction
+
+ // We want to leave Inst pointing to the previous instruction, but what if
+ // we just erased the first instruction?
+ if (Inst == MBB->begin()) {
+ DEBUG(dbgs() << "Inserting dummy KILL\n");
+ Inst = BuildMI(*MBB, Inst, DebugLoc(), TII->get(TargetOpcode::KILL));
+ } else
+ --Inst;
+}
+
+void FPS::setKillFlags(MachineBasicBlock &MBB) const {
+ const TargetRegisterInfo *TRI =
+ MBB.getParent()->getSubtarget().getRegisterInfo();
+ LivePhysRegs LPR(TRI);
+
+ LPR.addLiveOuts(&MBB);
+
+ for (MachineBasicBlock::reverse_iterator I = MBB.rbegin(), E = MBB.rend();
+ I != E; ++I) {
+ if (I->isDebugValue())
+ continue;
+
+ std::bitset<8> Defs;
+ SmallVector<MachineOperand *, 2> Uses;
+ MachineInstr &MI = *I;
+
+ for (auto &MO : I->operands()) {
+ if (!MO.isReg())
+ continue;
+
+ unsigned Reg = MO.getReg() - X86::FP0;
+
+ if (Reg >= 8)
+ continue;
+
+ if (MO.isDef()) {
+ Defs.set(Reg);
+ if (!LPR.contains(MO.getReg()))
+ MO.setIsDead();
+ } else
+ Uses.push_back(&MO);
+ }
+
+ for (auto *MO : Uses)
+ if (Defs.test(getFPReg(*MO)) || !LPR.contains(MO->getReg()))
+ MO->setIsKill();
+
+ LPR.stepBackward(MI);
+ }
+}
diff --git a/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp b/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp
new file mode 100644
index 0000000..8b5fd27
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp
@@ -0,0 +1,2698 @@
+//===-- X86FrameLowering.cpp - X86 Frame Information ----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 implementation of TargetFrameLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86FrameLowering.h"
+#include "X86InstrBuilder.h"
+#include "X86InstrInfo.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/WinEHFuncInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Support/Debug.h"
+#include <cstdlib>
+
+using namespace llvm;
+
+X86FrameLowering::X86FrameLowering(const X86Subtarget &STI,
+ unsigned StackAlignOverride)
+ : TargetFrameLowering(StackGrowsDown, StackAlignOverride,
+ STI.is64Bit() ? -8 : -4),
+ STI(STI), TII(*STI.getInstrInfo()), TRI(STI.getRegisterInfo()) {
+ // Cache a bunch of frame-related predicates for this subtarget.
+ SlotSize = TRI->getSlotSize();
+ Is64Bit = STI.is64Bit();
+ IsLP64 = STI.isTarget64BitLP64();
+ // Standard x86-64 and NaCl use 64-bit frame/stack pointers; x32 uses
+ // 32-bit ones.
+ Uses64BitFramePtr = STI.isTarget64BitLP64() || STI.isTargetNaCl64();
+ StackPtr = TRI->getStackRegister();
+}
+
+bool X86FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
+ return !MF.getFrameInfo()->hasVarSizedObjects() &&
+ !MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences();
+}
+
+/// canSimplifyCallFramePseudos - If there is a reserved call frame, the
+/// call frame pseudos can be simplified. Having an FP, as in the default
+/// implementation, is not sufficient here since we can't always use it.
+/// Use a more nuanced condition.
+bool
+X86FrameLowering::canSimplifyCallFramePseudos(const MachineFunction &MF) const {
+ return hasReservedCallFrame(MF) ||
+ (hasFP(MF) && !TRI->needsStackRealignment(MF)) ||
+ TRI->hasBasePointer(MF);
+}
+
+// needsFrameIndexResolution - Do we need to perform FI resolution for
+// this function? Normally, this is required only when the function
+// has any stack objects. However, FI resolution actually has another job,
+// not apparent from the name: it resolves call frame setup/destroy pseudos
+// that were not simplified earlier.
+// So, this is required for x86 functions that have push sequences even
+// when there are no stack objects.
+bool
+X86FrameLowering::needsFrameIndexResolution(const MachineFunction &MF) const {
+ return MF.getFrameInfo()->hasStackObjects() ||
+ MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences();
+}
+
+/// hasFP - Return true if the specified function should have a dedicated frame
+/// pointer register. This is true if the function has variable sized allocas
+/// or if frame pointer elimination is disabled.
+bool X86FrameLowering::hasFP(const MachineFunction &MF) const {
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ const MachineModuleInfo &MMI = MF.getMMI();
+
+ return (MF.getTarget().Options.DisableFramePointerElim(MF) ||
+ TRI->needsStackRealignment(MF) ||
+ MFI->hasVarSizedObjects() ||
+ MFI->isFrameAddressTaken() || MFI->hasOpaqueSPAdjustment() ||
+ MF.getInfo<X86MachineFunctionInfo>()->getForceFramePointer() ||
+ MMI.callsUnwindInit() || MMI.hasEHFunclets() || MMI.callsEHReturn() ||
+ MFI->hasStackMap() || MFI->hasPatchPoint());
+}
+
+static unsigned getSUBriOpcode(unsigned IsLP64, int64_t Imm) {
+ if (IsLP64) {
+ if (isInt<8>(Imm))
+ return X86::SUB64ri8;
+ return X86::SUB64ri32;
+ } else {
+ if (isInt<8>(Imm))
+ return X86::SUB32ri8;
+ return X86::SUB32ri;
+ }
+}
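+// For example (illustrative values): getSUBriOpcode(/*IsLP64=*/true, 8)
+// selects SUB64ri8 (sign-extended 8-bit immediate), while an adjustment of
+// 4096 does not fit in 8 bits and selects SUB64ri32.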
+
+static unsigned getADDriOpcode(unsigned IsLP64, int64_t Imm) {
+ if (IsLP64) {
+ if (isInt<8>(Imm))
+ return X86::ADD64ri8;
+ return X86::ADD64ri32;
+ } else {
+ if (isInt<8>(Imm))
+ return X86::ADD32ri8;
+ return X86::ADD32ri;
+ }
+}
+
+static unsigned getSUBrrOpcode(unsigned isLP64) {
+ return isLP64 ? X86::SUB64rr : X86::SUB32rr;
+}
+
+static unsigned getADDrrOpcode(unsigned isLP64) {
+ return isLP64 ? X86::ADD64rr : X86::ADD32rr;
+}
+
+static unsigned getANDriOpcode(bool IsLP64, int64_t Imm) {
+ if (IsLP64) {
+ if (isInt<8>(Imm))
+ return X86::AND64ri8;
+ return X86::AND64ri32;
+ }
+ if (isInt<8>(Imm))
+ return X86::AND32ri8;
+ return X86::AND32ri;
+}
+
+static unsigned getLEArOpcode(unsigned IsLP64) {
+ return IsLP64 ? X86::LEA64r : X86::LEA32r;
+}
+
+/// findDeadCallerSavedReg - Return a caller-saved register that isn't live
+/// when it reaches the "return" instruction. We can then pop a stack object
+/// to this register without worrying about clobbering it.
+static unsigned findDeadCallerSavedReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MBBI,
+ const X86RegisterInfo *TRI,
+ bool Is64Bit) {
+ const MachineFunction *MF = MBB.getParent();
+ const Function *F = MF->getFunction();
+ if (!F || MF->getMMI().callsEHReturn())
+ return 0;
+
+ const TargetRegisterClass &AvailableRegs = *TRI->getGPRsForTailCall(*MF);
+
+ unsigned Opc = MBBI->getOpcode();
+ switch (Opc) {
+ default: return 0;
+ case X86::RETL:
+ case X86::RETQ:
+ case X86::RETIL:
+ case X86::RETIQ:
+ case X86::TCRETURNdi:
+ case X86::TCRETURNri:
+ case X86::TCRETURNmi:
+ case X86::TCRETURNdi64:
+ case X86::TCRETURNri64:
+ case X86::TCRETURNmi64:
+ case X86::EH_RETURN:
+ case X86::EH_RETURN64: {
+ SmallSet<uint16_t, 8> Uses;
+ for (unsigned i = 0, e = MBBI->getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MBBI->getOperand(i);
+ if (!MO.isReg() || MO.isDef())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (!Reg)
+ continue;
+ for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
+ Uses.insert(*AI);
+ }
+
+ for (auto CS : AvailableRegs)
+ if (!Uses.count(CS) && CS != X86::RIP)
+ return CS;
+ }
+ }
+
+ return 0;
+}
+
+static bool isEAXLiveIn(MachineFunction &MF) {
+ for (MachineRegisterInfo::livein_iterator II = MF.getRegInfo().livein_begin(),
+ EE = MF.getRegInfo().livein_end(); II != EE; ++II) {
+ unsigned Reg = II->first;
+
+ if (Reg == X86::RAX || Reg == X86::EAX || Reg == X86::AX ||
+ Reg == X86::AH || Reg == X86::AL)
+ return true;
+ }
+
+ return false;
+}
+
+/// Check if the flags need to be preserved before the terminators.
+/// This would be the case if EFLAGS is live-in to the region composed
+/// of the terminators, or live-out of that region without being
+/// defined by a terminator.
+static bool
+flagsNeedToBePreservedBeforeTheTerminators(const MachineBasicBlock &MBB) {
+ for (const MachineInstr &MI : MBB.terminators()) {
+ bool BreakNext = false;
+ for (const MachineOperand &MO : MI.operands()) {
+ if (!MO.isReg())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (Reg != X86::EFLAGS)
+ continue;
+
+ // This terminator needs an EFLAGS value that is not defined by an
+ // earlier terminator:
+ // EFLAGS is live-in to the region composed of the terminators.
+ if (!MO.isDef())
+ return true;
+ // This terminator defines the eflags, i.e., we don't need to preserve it.
+ // However, we still need to check this specific terminator does not
+ // read a live-in value.
+ BreakNext = true;
+ }
+ // We found a definition of the eflags, no need to preserve them.
+ if (BreakNext)
+ return false;
+ }
+
+ // None of the terminators use or define the eflags.
+ // Check if they are live-out, that would imply we need to preserve them.
+ for (const MachineBasicBlock *Succ : MBB.successors())
+ if (Succ->isLiveIn(X86::EFLAGS))
+ return true;
+
+ return false;
+}
+
+/// emitSPUpdate - Emit a series of instructions to increment / decrement the
+/// stack pointer by a constant value.
+void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MBBI,
+ int64_t NumBytes, bool InEpilogue) const {
+ bool isSub = NumBytes < 0;
+ uint64_t Offset = isSub ? -NumBytes : NumBytes;
+
+ uint64_t Chunk = (1LL << 31) - 1;
+ DebugLoc DL = MBB.findDebugLoc(MBBI);
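+ // Illustrative magnitude check: a 5 GiB adjustment (Offset = 0x140000000)
+ // exceeds Chunk = 0x7fffffff, so the loop below prefers to materialize the
+ // offset in a scratch register and emit a single SUB/ADD rather than three
+ // chained immediate updates.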
+
+ while (Offset) {
+ if (Offset > Chunk) {
+ // Rather than emit a long series of instructions for large offsets,
+ // load the offset into a register and do one sub/add
+ unsigned Reg = 0;
+
+ if (isSub && !isEAXLiveIn(*MBB.getParent()))
+ Reg = (unsigned)(Is64Bit ? X86::RAX : X86::EAX);
+ else
+ Reg = findDeadCallerSavedReg(MBB, MBBI, TRI, Is64Bit);
+
+ if (Reg) {
+ unsigned Opc = Is64Bit ? X86::MOV64ri : X86::MOV32ri;
+ BuildMI(MBB, MBBI, DL, TII.get(Opc), Reg)
+ .addImm(Offset);
+ Opc = isSub
+ ? getSUBrrOpcode(Is64Bit)
+ : getADDrrOpcode(Is64Bit);
+ MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr)
+ .addReg(StackPtr)
+ .addReg(Reg);
+ MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.
+ Offset = 0;
+ continue;
+ }
+ }
+
+ uint64_t ThisVal = std::min(Offset, Chunk);
+ if (ThisVal == (Is64Bit ? 8 : 4)) {
+ // Use push / pop instead.
+ unsigned Reg = isSub
+ ? (unsigned)(Is64Bit ? X86::RAX : X86::EAX)
+ : findDeadCallerSavedReg(MBB, MBBI, TRI, Is64Bit);
+ if (Reg) {
+ unsigned Opc = isSub
+ ? (Is64Bit ? X86::PUSH64r : X86::PUSH32r)
+ : (Is64Bit ? X86::POP64r : X86::POP32r);
+ MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc))
+ .addReg(Reg, getDefRegState(!isSub) | getUndefRegState(isSub));
+ if (isSub)
+ MI->setFlag(MachineInstr::FrameSetup);
+ else
+ MI->setFlag(MachineInstr::FrameDestroy);
+ Offset -= ThisVal;
+ continue;
+ }
+ }
+
+ MachineInstrBuilder MI = BuildStackAdjustment(
+ MBB, MBBI, DL, isSub ? -ThisVal : ThisVal, InEpilogue);
+ if (isSub)
+ MI.setMIFlag(MachineInstr::FrameSetup);
+ else
+ MI.setMIFlag(MachineInstr::FrameDestroy);
+
+ Offset -= ThisVal;
+ }
+}
+
+MachineInstrBuilder X86FrameLowering::BuildStackAdjustment(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc DL,
+ int64_t Offset, bool InEpilogue) const {
+ assert(Offset != 0 && "zero offset stack adjustment requested");
+
+ // On Atom, using LEA to adjust SP is preferred, but using it in the epilogue
+ // is tricky.
+ bool UseLEA;
+ if (!InEpilogue) {
+ // Check if inserting the prologue at the beginning
+ // of MBB would require to use LEA operations.
+ // We need to use LEA operations if EFLAGS is live in, because
+ // it means an instruction will read it before it gets defined.
+ UseLEA = STI.useLeaForSP() || MBB.isLiveIn(X86::EFLAGS);
+ } else {
+ // If we can use LEA for SP but we shouldn't, check that none
+ // of the terminators uses the eflags. Otherwise we will insert
+ // an ADD that will redefine the eflags and break the condition.
+ // Alternatively, we could move the ADD, but this may not be possible
+ // and is an optimization anyway.
+ UseLEA = canUseLEAForSPInEpilogue(*MBB.getParent());
+ if (UseLEA && !STI.useLeaForSP())
+ UseLEA = flagsNeedToBePreservedBeforeTheTerminators(MBB);
+ // If the assert below fires, it means we do not do the right thing
+ // in canUseAsEpilogue.
+ assert((UseLEA || !flagsNeedToBePreservedBeforeTheTerminators(MBB)) &&
+ "We shouldn't have allowed this insertion point");
+ }
+
+ MachineInstrBuilder MI;
+ if (UseLEA) {
+ MI = addRegOffset(BuildMI(MBB, MBBI, DL,
+ TII.get(getLEArOpcode(Uses64BitFramePtr)),
+ StackPtr),
+ StackPtr, false, Offset);
+ } else {
+ bool IsSub = Offset < 0;
+ uint64_t AbsOffset = IsSub ? -Offset : Offset;
+ unsigned Opc = IsSub ? getSUBriOpcode(Uses64BitFramePtr, AbsOffset)
+ : getADDriOpcode(Uses64BitFramePtr, AbsOffset);
+ MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr)
+ .addReg(StackPtr)
+ .addImm(AbsOffset);
+ MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.
+ }
+ return MI;
+}
+
+int X86FrameLowering::mergeSPUpdates(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MBBI,
+ bool doMergeWithPrevious) const {
+ if ((doMergeWithPrevious && MBBI == MBB.begin()) ||
+ (!doMergeWithPrevious && MBBI == MBB.end()))
+ return 0;
+
+ MachineBasicBlock::iterator PI = doMergeWithPrevious ? std::prev(MBBI) : MBBI;
+ MachineBasicBlock::iterator NI = doMergeWithPrevious ? nullptr
+ : std::next(MBBI);
+ unsigned Opc = PI->getOpcode();
+ int Offset = 0;
+
+ if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 ||
+ Opc == X86::ADD32ri || Opc == X86::ADD32ri8 ||
+ Opc == X86::LEA32r || Opc == X86::LEA64_32r) &&
+ PI->getOperand(0).getReg() == StackPtr){
+ Offset += PI->getOperand(2).getImm();
+ MBB.erase(PI);
+ if (!doMergeWithPrevious) MBBI = NI;
+ } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 ||
+ Opc == X86::SUB32ri || Opc == X86::SUB32ri8) &&
+ PI->getOperand(0).getReg() == StackPtr) {
+ Offset -= PI->getOperand(2).getImm();
+ MBB.erase(PI);
+ if (!doMergeWithPrevious) MBBI = NI;
+ }
+
+ return Offset;
+}
+
+void X86FrameLowering::BuildCFI(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, DebugLoc DL,
+ MCCFIInstruction CFIInst) const {
+ MachineFunction &MF = *MBB.getParent();
+ unsigned CFIIndex = MF.getMMI().addFrameInst(CFIInst);
+ BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+}
+
+void
+X86FrameLowering::emitCalleeSavedFrameMoves(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ DebugLoc DL) const {
+ MachineFunction &MF = *MBB.getParent();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ MachineModuleInfo &MMI = MF.getMMI();
+ const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
+
+ // Add callee saved registers to move list.
+ const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
+ if (CSI.empty()) return;
+
+ // Calculate offsets.
+ for (std::vector<CalleeSavedInfo>::const_iterator
+ I = CSI.begin(), E = CSI.end(); I != E; ++I) {
+ int64_t Offset = MFI->getObjectOffset(I->getFrameIdx());
+ unsigned Reg = I->getReg();
+
+ unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true);
+ BuildCFI(MBB, MBBI, DL,
+ MCCFIInstruction::createOffset(nullptr, DwarfReg, Offset));
+ }
+}
+
+MachineInstr *X86FrameLowering::emitStackProbe(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ DebugLoc DL,
+ bool InProlog) const {
+ const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
+ if (STI.isTargetWindowsCoreCLR()) {
+ if (InProlog) {
+ return emitStackProbeInlineStub(MF, MBB, MBBI, DL, true);
+ } else {
+ return emitStackProbeInline(MF, MBB, MBBI, DL, false);
+ }
+ } else {
+ return emitStackProbeCall(MF, MBB, MBBI, DL, InProlog);
+ }
+}
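+// In short: Windows CoreCLR targets get the inline probe loop (emitted as a
+// stub first when in the prolog and expanded later by inlineStackProbe);
+// all other targets call the platform's chkstk-style helper.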
+
+void X86FrameLowering::inlineStackProbe(MachineFunction &MF,
+ MachineBasicBlock &PrologMBB) const {
+ const StringRef ChkStkStubSymbol = "__chkstk_stub";
+ MachineInstr *ChkStkStub = nullptr;
+
+ for (MachineInstr &MI : PrologMBB) {
+ if (MI.isCall() && MI.getOperand(0).isSymbol() &&
+ ChkStkStubSymbol == MI.getOperand(0).getSymbolName()) {
+ ChkStkStub = &MI;
+ break;
+ }
+ }
+
+ if (ChkStkStub != nullptr) {
+ MachineBasicBlock::iterator MBBI = std::next(ChkStkStub->getIterator());
+ assert(std::prev(MBBI).operator==(ChkStkStub) &&
+ "MBBI expected after __chkstk_stub.");
+ DebugLoc DL = PrologMBB.findDebugLoc(MBBI);
+ emitStackProbeInline(MF, PrologMBB, MBBI, DL, true);
+ ChkStkStub->eraseFromParent();
+ }
+}
+
+MachineInstr *X86FrameLowering::emitStackProbeInline(
+ MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, DebugLoc DL, bool InProlog) const {
+ const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
+ assert(STI.is64Bit() && "different expansion needed for 32 bit");
+ assert(STI.isTargetWindowsCoreCLR() && "custom expansion expects CoreCLR");
+ const TargetInstrInfo &TII = *STI.getInstrInfo();
+ const BasicBlock *LLVM_BB = MBB.getBasicBlock();
+
+ // RAX contains the number of bytes of desired stack adjustment.
+ // The handling here assumes this value has already been updated so as to
+ // maintain stack alignment.
+ //
+ // We need to exit with RSP modified by this amount and execute suitable
+ // page touches to notify the OS that we're growing the stack responsibly.
+ // All stack probing must be done without modifying RSP.
+ //
+ // MBB:
+ // SizeReg = RAX;
+ // ZeroReg = 0
+ // CopyReg = RSP
+ // Flags, TestReg = CopyReg - SizeReg
+ // FinalReg = !Flags.Ovf ? TestReg : ZeroReg
+ // LimitReg = gs magic thread env access
+ // if FinalReg >= LimitReg goto ContinueMBB
+ // RoundBB:
+ // RoundReg = page address of FinalReg
+ // LoopMBB:
+ // LoopReg = PHI(LimitReg,ProbeReg)
+ // ProbeReg = LoopReg - PageSize
+ // [ProbeReg] = 0
+ // if (ProbeReg > RoundReg) goto LoopMBB
+ // ContinueMBB:
+ // RSP = RSP - RAX
+ // [rest of original MBB]
+
+ // Set up the new basic blocks
+ MachineBasicBlock *RoundMBB = MF.CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *LoopMBB = MF.CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *ContinueMBB = MF.CreateMachineBasicBlock(LLVM_BB);
+
+ MachineFunction::iterator MBBIter = std::next(MBB.getIterator());
+ MF.insert(MBBIter, RoundMBB);
+ MF.insert(MBBIter, LoopMBB);
+ MF.insert(MBBIter, ContinueMBB);
+
+ // Split MBB and move the tail portion down to ContinueMBB.
+ MachineBasicBlock::iterator BeforeMBBI = std::prev(MBBI);
+ ContinueMBB->splice(ContinueMBB->begin(), &MBB, MBBI, MBB.end());
+ ContinueMBB->transferSuccessorsAndUpdatePHIs(&MBB);
+
+ // Some useful constants
+ const int64_t ThreadEnvironmentStackLimit = 0x10;
+ const int64_t PageSize = 0x1000;
+ const int64_t PageMask = ~(PageSize - 1);
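+ // PageMask clears the low 12 bits; e.g. 0x12345678 & PageMask = 0x12345000
+ // (illustrative value).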
+
+ // Registers we need. For the normal case we use virtual
+ // registers. For the prolog expansion we use RAX, RCX and RDX.
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ const TargetRegisterClass *RegClass = &X86::GR64RegClass;
+ const unsigned SizeReg = InProlog ? (unsigned)X86::RAX
+ : MRI.createVirtualRegister(RegClass),
+ ZeroReg = InProlog ? (unsigned)X86::RCX
+ : MRI.createVirtualRegister(RegClass),
+ CopyReg = InProlog ? (unsigned)X86::RDX
+ : MRI.createVirtualRegister(RegClass),
+ TestReg = InProlog ? (unsigned)X86::RDX
+ : MRI.createVirtualRegister(RegClass),
+ FinalReg = InProlog ? (unsigned)X86::RDX
+ : MRI.createVirtualRegister(RegClass),
+ RoundedReg = InProlog ? (unsigned)X86::RDX
+ : MRI.createVirtualRegister(RegClass),
+ LimitReg = InProlog ? (unsigned)X86::RCX
+ : MRI.createVirtualRegister(RegClass),
+ JoinReg = InProlog ? (unsigned)X86::RCX
+ : MRI.createVirtualRegister(RegClass),
+ ProbeReg = InProlog ? (unsigned)X86::RCX
+ : MRI.createVirtualRegister(RegClass);
+
+ // SP-relative offsets where we can save RCX and RDX.
+ int64_t RCXShadowSlot = 0;
+ int64_t RDXShadowSlot = 0;
+
+ // If inlining in the prolog, save RCX and RDX.
+ // Future optimization: don't save or restore if not live in.
+ if (InProlog) {
+ // Compute the offsets. We need to account for things already
+ // pushed onto the stack at this point: return address, frame
+ // pointer (if used), and callee saves.
+ X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+ const int64_t CalleeSaveSize = X86FI->getCalleeSavedFrameSize();
+ const bool HasFP = hasFP(MF);
+ RCXShadowSlot = 8 + CalleeSaveSize + (HasFP ? 8 : 0);
+ RDXShadowSlot = RCXShadowSlot + 8;
+ // Emit the saves.
+ addRegOffset(BuildMI(&MBB, DL, TII.get(X86::MOV64mr)), X86::RSP, false,
+ RCXShadowSlot)
+ .addReg(X86::RCX);
+ addRegOffset(BuildMI(&MBB, DL, TII.get(X86::MOV64mr)), X86::RSP, false,
+ RDXShadowSlot)
+ .addReg(X86::RDX);
+ } else {
+ // Not in the prolog. Copy RAX to a virtual reg.
+ BuildMI(&MBB, DL, TII.get(X86::MOV64rr), SizeReg).addReg(X86::RAX);
+ }
+
+ // Add code to MBB to check for overflow and set the new target stack pointer
+ // to zero if so.
+ BuildMI(&MBB, DL, TII.get(X86::XOR64rr), ZeroReg)
+ .addReg(ZeroReg, RegState::Undef)
+ .addReg(ZeroReg, RegState::Undef);
+ BuildMI(&MBB, DL, TII.get(X86::MOV64rr), CopyReg).addReg(X86::RSP);
+ BuildMI(&MBB, DL, TII.get(X86::SUB64rr), TestReg)
+ .addReg(CopyReg)
+ .addReg(SizeReg);
+ BuildMI(&MBB, DL, TII.get(X86::CMOVB64rr), FinalReg)
+ .addReg(TestReg)
+ .addReg(ZeroReg);
+
+ // FinalReg now holds final stack pointer value, or zero if
+ // allocation would overflow. Compare against the current stack
+ // limit from the thread environment block. Note this limit is the
+ // lowest touched page on the stack, not the point at which the OS
+ // will cause an overflow exception, so this is just an optimization
+ // to avoid unnecessarily touching pages that are below the current
+ // SP but already committed to the stack by the OS.
+ BuildMI(&MBB, DL, TII.get(X86::MOV64rm), LimitReg)
+ .addReg(0)
+ .addImm(1)
+ .addReg(0)
+ .addImm(ThreadEnvironmentStackLimit)
+ .addReg(X86::GS);
+ BuildMI(&MBB, DL, TII.get(X86::CMP64rr)).addReg(FinalReg).addReg(LimitReg);
+ // Jump if the desired stack pointer is at or above the stack limit.
+ BuildMI(&MBB, DL, TII.get(X86::JAE_1)).addMBB(ContinueMBB);
+
+ // Add code to roundMBB to round the final stack pointer to a page boundary.
+ BuildMI(RoundMBB, DL, TII.get(X86::AND64ri32), RoundedReg)
+ .addReg(FinalReg)
+ .addImm(PageMask);
+ BuildMI(RoundMBB, DL, TII.get(X86::JMP_1)).addMBB(LoopMBB);
+
+ // LimitReg now holds the current stack limit, RoundedReg page-rounded
+ // final RSP value. Add code to loopMBB to decrement LimitReg page-by-page
+ // and probe until we reach RoundedReg.
+ if (!InProlog) {
+ BuildMI(LoopMBB, DL, TII.get(X86::PHI), JoinReg)
+ .addReg(LimitReg)
+ .addMBB(RoundMBB)
+ .addReg(ProbeReg)
+ .addMBB(LoopMBB);
+ }
+
+ addRegOffset(BuildMI(LoopMBB, DL, TII.get(X86::LEA64r), ProbeReg), JoinReg,
+ false, -PageSize);
+
+ // Probe by storing a byte onto the stack.
+ BuildMI(LoopMBB, DL, TII.get(X86::MOV8mi))
+ .addReg(ProbeReg)
+ .addImm(1)
+ .addReg(0)
+ .addImm(0)
+ .addReg(0)
+ .addImm(0);
+ BuildMI(LoopMBB, DL, TII.get(X86::CMP64rr))
+ .addReg(RoundedReg)
+ .addReg(ProbeReg);
+ BuildMI(LoopMBB, DL, TII.get(X86::JNE_1)).addMBB(LoopMBB);
+
+ MachineBasicBlock::iterator ContinueMBBI = ContinueMBB->getFirstNonPHI();
+
+ // If in prolog, restore RDX and RCX.
+ if (InProlog) {
+ addRegOffset(BuildMI(*ContinueMBB, ContinueMBBI, DL, TII.get(X86::MOV64rm),
+ X86::RCX),
+ X86::RSP, false, RCXShadowSlot);
+ addRegOffset(BuildMI(*ContinueMBB, ContinueMBBI, DL, TII.get(X86::MOV64rm),
+ X86::RDX),
+ X86::RSP, false, RDXShadowSlot);
+ }
+
+ // Now that the probing is done, add code to continueMBB to update
+ // the stack pointer for real.
+ BuildMI(*ContinueMBB, ContinueMBBI, DL, TII.get(X86::SUB64rr), X86::RSP)
+ .addReg(X86::RSP)
+ .addReg(SizeReg);
+
+ // Add the control flow edges we need.
+ MBB.addSuccessor(ContinueMBB);
+ MBB.addSuccessor(RoundMBB);
+ RoundMBB->addSuccessor(LoopMBB);
+ LoopMBB->addSuccessor(ContinueMBB);
+ LoopMBB->addSuccessor(LoopMBB);
+
+ // Mark all the instructions added to the prolog as frame setup.
+ if (InProlog) {
+ for (++BeforeMBBI; BeforeMBBI != MBB.end(); ++BeforeMBBI) {
+ BeforeMBBI->setFlag(MachineInstr::FrameSetup);
+ }
+ for (MachineInstr &MI : *RoundMBB) {
+ MI.setFlag(MachineInstr::FrameSetup);
+ }
+ for (MachineInstr &MI : *LoopMBB) {
+ MI.setFlag(MachineInstr::FrameSetup);
+ }
+ for (MachineBasicBlock::iterator CMBBI = ContinueMBB->begin();
+ CMBBI != ContinueMBBI; ++CMBBI) {
+ CMBBI->setFlag(MachineInstr::FrameSetup);
+ }
+ }
+
+ // Possible TODO: physreg liveness for InProlog case.
+
+ return ContinueMBBI;
+}
+
+MachineInstr *X86FrameLowering::emitStackProbeCall(
+ MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, DebugLoc DL, bool InProlog) const {
+ bool IsLargeCodeModel = MF.getTarget().getCodeModel() == CodeModel::Large;
+
+ unsigned CallOp;
+ if (Is64Bit)
+ CallOp = IsLargeCodeModel ? X86::CALL64r : X86::CALL64pcrel32;
+ else
+ CallOp = X86::CALLpcrel32;
+
+ const char *Symbol;
+ if (Is64Bit) {
+ if (STI.isTargetCygMing()) {
+ Symbol = "___chkstk_ms";
+ } else {
+ Symbol = "__chkstk";
+ }
+ } else if (STI.isTargetCygMing())
+ Symbol = "_alloca";
+ else
+ Symbol = "_chkstk";
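+ // Symbol summary: 64-bit cygwin/mingw targets use ___chkstk_ms and other
+ // 64-bit Windows targets use __chkstk; 32-bit cygwin/mingw targets use
+ // _alloca and the rest use _chkstk.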
+
+ MachineInstrBuilder CI;
+ MachineBasicBlock::iterator ExpansionMBBI = std::prev(MBBI);
+
+ // All current stack probes take AX and SP as input, clobber flags, and
+ // preserve all registers. x86_64 probes leave RSP unmodified.
+ if (Is64Bit && MF.getTarget().getCodeModel() == CodeModel::Large) {
+ // For the large code model, we have to call through a register. Use R11,
+ // as it is scratch in all supported calling conventions.
+ BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri), X86::R11)
+ .addExternalSymbol(Symbol);
+ CI = BuildMI(MBB, MBBI, DL, TII.get(CallOp)).addReg(X86::R11);
+ } else {
+ CI = BuildMI(MBB, MBBI, DL, TII.get(CallOp)).addExternalSymbol(Symbol);
+ }
+
+ unsigned AX = Is64Bit ? X86::RAX : X86::EAX;
+ unsigned SP = Is64Bit ? X86::RSP : X86::ESP;
+ CI.addReg(AX, RegState::Implicit)
+ .addReg(SP, RegState::Implicit)
+ .addReg(AX, RegState::Define | RegState::Implicit)
+ .addReg(SP, RegState::Define | RegState::Implicit)
+ .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
+
+ if (Is64Bit) {
+ // MSVC x64's __chkstk and cygwin/mingw's ___chkstk_ms do not adjust %rsp
+ // themselves. They also do not clobber %rax, so we can reuse it when
+ // adjusting %rsp.
+ BuildMI(MBB, MBBI, DL, TII.get(X86::SUB64rr), X86::RSP)
+ .addReg(X86::RSP)
+ .addReg(X86::RAX);
+ }
+
+ if (InProlog) {
+ // Apply the frame setup flag to all inserted instrs.
+ for (++ExpansionMBBI; ExpansionMBBI != MBBI; ++ExpansionMBBI)
+ ExpansionMBBI->setFlag(MachineInstr::FrameSetup);
+ }
+
+ return MBBI;
+}
+
+MachineInstr *X86FrameLowering::emitStackProbeInlineStub(
+ MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, DebugLoc DL, bool InProlog) const {
+
+ assert(InProlog && "ChkStkStub called outside prolog!");
+
+ BuildMI(MBB, MBBI, DL, TII.get(X86::CALLpcrel32))
+ .addExternalSymbol("__chkstk_stub");
+
+ return MBBI;
+}
+
+static unsigned calculateSetFPREG(uint64_t SPAdjust) {
+ // Win64 ABI has a less restrictive limitation of 240; 128 works equally well
+ // and might require smaller successive adjustments.
+ const uint64_t Win64MaxSEHOffset = 128;
+ uint64_t SEHFrameOffset = std::min(SPAdjust, Win64MaxSEHOffset);
+ // Win64 ABI requires 16-byte alignment for the UWOP_SET_FPREG opcode.
+ return SEHFrameOffset & -16;
+}
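+// Illustrative values: SPAdjust = 88 gives min(88, 128) & -16 = 80, while
+// SPAdjust = 520 is clamped to 128, which is already 16-byte aligned.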
+
+// If we're forcing a stack realignment we can't rely on just the frame
+// info, we need to know the ABI stack alignment as well in case we
+// have a call out. Otherwise just make sure we have some alignment - we'll
+// go with the minimum SlotSize.
+uint64_t X86FrameLowering::calculateMaxStackAlign(const MachineFunction &MF) const {
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ uint64_t MaxAlign = MFI->getMaxAlignment(); // Desired stack alignment.
+ unsigned StackAlign = getStackAlignment();
+ if (MF.getFunction()->hasFnAttribute("stackrealign")) {
+ if (MFI->hasCalls())
+ MaxAlign = (StackAlign > MaxAlign) ? StackAlign : MaxAlign;
+ else if (MaxAlign < SlotSize)
+ MaxAlign = SlotSize;
+ }
+ return MaxAlign;
+}
+
+void X86FrameLowering::BuildStackAlignAND(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ DebugLoc DL, unsigned Reg,
+ uint64_t MaxAlign) const {
+ uint64_t Val = -MaxAlign;
+ unsigned AndOp = getANDriOpcode(Uses64BitFramePtr, Val);
+ MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(AndOp), Reg)
+ .addReg(Reg)
+ .addImm(Val)
+ .setMIFlag(MachineInstr::FrameSetup);
+
+ // The EFLAGS implicit def is dead.
+ MI->getOperand(3).setIsDead();
+}
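+// E.g. MaxAlign = 32 yields Val = -32, so the AND rounds the register down
+// to a 32-byte boundary (illustrative value).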
+
+/// emitPrologue - Push callee-saved registers onto the stack, which
+/// automatically adjusts the stack pointer. Adjust the stack pointer to allocate
+/// space for local variables. Also emit labels used by the exception handler to
+/// generate the exception handling frames.
+
+/*
+ Here's a gist of what gets emitted:
+
+ ; Establish frame pointer, if needed
+ [if needs FP]
+ push %rbp
+ .cfi_def_cfa_offset 16
+ .cfi_offset %rbp, -16
+ .seh_pushreg %rbp
+ mov %rsp, %rbp
+ .cfi_def_cfa_register %rbp
+
+ ; Spill general-purpose registers
+ [for all callee-saved GPRs]
+ pushq %<reg>
+ [if not needs FP]
+ .cfi_def_cfa_offset (offset from RETADDR)
+ .seh_pushreg %<reg>
+
+ ; If the required stack alignment > default stack alignment
+ ; rsp needs to be re-aligned. This creates a "re-alignment gap"
+ ; of unknown size in the stack frame.
+ [if stack needs re-alignment]
+ and $MASK, %rsp
+
+ ; Allocate space for locals
+ [if target is Windows and allocated space > 4096 bytes]
+ ; Windows needs special care for allocations larger
+ ; than one page.
+ mov $NNN, %rax
+ call ___chkstk_ms/___chkstk
+ sub %rax, %rsp
+ [else]
+ sub $NNN, %rsp
+
+ [if needs FP]
+ .seh_stackalloc (size of XMM spill slots)
+ .seh_setframe %rbp, SEHFrameOffset ; = size of all spill slots
+ [else]
+ .seh_stackalloc NNN
+
+ ; Spill XMMs
+ ; Note that, while only the Windows 64 ABI specifies XMMs as callee-preserved,
+ ; they may get spilled on any platform, if the current function
+ ; calls @llvm.eh.unwind.init
+ [if needs FP]
+ [for all callee-saved XMM registers]
+ movaps %<xmm reg>, -MMM(%rbp)
+ [for all callee-saved XMM registers]
+ .seh_savexmm %<xmm reg>, (-MMM + SEHFrameOffset)
+ ; i.e. the offset relative to (%rbp - SEHFrameOffset)
+ [else]
+ [for all callee-saved XMM registers]
+ movaps %<xmm reg>, KKK(%rsp)
+ [for all callee-saved XMM registers]
+ .seh_savexmm %<xmm reg>, KKK
+
+ .seh_endprologue
+
+ [if needs base pointer]
+ mov %rsp, %rbx
+ [if needs to restore base pointer]
+ mov %rsp, -MMM(%rbp)
+
+ ; Emit CFI info
+ [if needs FP]
+ [for all callee-saved registers]
+ .cfi_offset %<reg>, (offset from %rbp)
+ [else]
+ .cfi_def_cfa_offset (offset from RETADDR)
+ [for all callee-saved registers]
+ .cfi_offset %<reg>, (offset from %rsp)
+
+ Notes:
+ - .seh directives are emitted only for Windows 64 ABI
+ - .cfi directives are emitted for all other ABIs
+ - for 32-bit code, substitute %e?? registers for %r??
+*/
+
+void X86FrameLowering::emitPrologue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const {
+ assert(&STI == &MF.getSubtarget<X86Subtarget>() &&
+ "MF used frame lowering for wrong subtarget");
+ MachineBasicBlock::iterator MBBI = MBB.begin();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ const Function *Fn = MF.getFunction();
+ MachineModuleInfo &MMI = MF.getMMI();
+ X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+ uint64_t MaxAlign = calculateMaxStackAlign(MF); // Desired stack alignment.
+ uint64_t StackSize = MFI->getStackSize(); // Number of bytes to allocate.
+ bool IsFunclet = MBB.isEHFuncletEntry();
+ EHPersonality Personality = EHPersonality::Unknown;
+ if (Fn->hasPersonalityFn())
+ Personality = classifyEHPersonality(Fn->getPersonalityFn());
+ bool FnHasClrFunclet =
+ MMI.hasEHFunclets() && Personality == EHPersonality::CoreCLR;
+ bool IsClrFunclet = IsFunclet && FnHasClrFunclet;
+ bool HasFP = hasFP(MF);
+ bool IsWin64CC = STI.isCallingConvWin64(Fn->getCallingConv());
+ bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
+ bool NeedsWinCFI = IsWin64Prologue && Fn->needsUnwindTableEntry();
+ bool NeedsDwarfCFI =
+ !IsWin64Prologue && (MMI.hasDebugInfo() || Fn->needsUnwindTableEntry());
+ unsigned FramePtr = TRI->getFrameRegister(MF);
+ const unsigned MachineFramePtr =
+ STI.isTarget64BitILP32()
+ ? getX86SubSuperRegister(FramePtr, 64) : FramePtr;
+ unsigned BasePtr = TRI->getBaseRegister();
+
+ // Debug location must be unknown since the first debug location is used
+ // to determine the end of the prologue.
+ DebugLoc DL;
+
+ // Add RETADDR move area to callee saved frame size.
+ int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
+ if (TailCallReturnAddrDelta && IsWin64Prologue)
+ report_fatal_error("Can't handle guaranteed tail call under win64 yet");
+
+ if (TailCallReturnAddrDelta < 0)
+ X86FI->setCalleeSavedFrameSize(
+ X86FI->getCalleeSavedFrameSize() - TailCallReturnAddrDelta);
+
+ bool UseStackProbe = (STI.isOSWindows() && !STI.isTargetMachO());
+
+ // The default stack probe size is 4096 if the function has no
+ // "stack-probe-size" attribute.
+ unsigned StackProbeSize = 4096;
+ if (Fn->hasFnAttribute("stack-probe-size"))
+ Fn->getFnAttribute("stack-probe-size")
+ .getValueAsString()
+ .getAsInteger(0, StackProbeSize);
+
+ // If this is x86-64, the Red Zone is not disabled, we are a leaf function
+ // using at most 128 bytes of stack space, and we have no frame pointer,
+ // calls, or dynamic allocas, then we do not need to adjust the stack
+ // pointer (we fit in the Red Zone). We also check that we don't push and
+ // pop from the stack.
+ if (Is64Bit && !Fn->hasFnAttribute(Attribute::NoRedZone) &&
+ !TRI->needsStackRealignment(MF) &&
+ !MFI->hasVarSizedObjects() && // No dynamic alloca.
+ !MFI->adjustsStack() && // No calls.
+ !IsWin64CC && // Win64 has no Red Zone
+ !MFI->hasOpaqueSPAdjustment() && // Don't push and pop.
+ !MF.shouldSplitStack()) { // Regular stack
+ uint64_t MinSize = X86FI->getCalleeSavedFrameSize();
+ if (HasFP) MinSize += SlotSize;
+ StackSize = std::max(MinSize, StackSize > 128 ? StackSize - 128 : 0);
+ MFI->setStackSize(StackSize);
+ }
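+ // Illustrative example: a 200-byte leaf frame with no callee saves and no
+ // FP shrinks to max(0, 200 - 128) = 72 bytes of explicit adjustment; the
+ // remaining 128 bytes live in the red zone below RSP.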
+
+ // Insert stack pointer adjustment for later moving of return addr. Only
+ // applies to tail call optimized functions where the callee argument stack
+ // size is bigger than the callers.
+ if (TailCallReturnAddrDelta < 0) {
+ BuildStackAdjustment(MBB, MBBI, DL, TailCallReturnAddrDelta,
+ /*InEpilogue=*/false)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+
+ // Mapping for machine moves:
+ //
+ // DST: VirtualFP AND
+ // SRC: VirtualFP => DW_CFA_def_cfa_offset
+ // ELSE => DW_CFA_def_cfa
+ //
+ // SRC: VirtualFP AND
+ // DST: Register => DW_CFA_def_cfa_register
+ //
+ // ELSE
+ // OFFSET < 0 => DW_CFA_offset_extended_sf
+ // REG < 64 => DW_CFA_offset + Reg
+ // ELSE => DW_CFA_offset_extended
+
+ uint64_t NumBytes = 0;
+ int stackGrowth = -SlotSize;
+
+ // Find the funclet establisher parameter
+ unsigned Establisher = X86::NoRegister;
+ if (IsClrFunclet)
+ Establisher = Uses64BitFramePtr ? X86::RCX : X86::ECX;
+ else if (IsFunclet)
+ Establisher = Uses64BitFramePtr ? X86::RDX : X86::EDX;
+
+ if (IsWin64Prologue && IsFunclet && !IsClrFunclet) {
+ // Immediately spill establisher into the home slot.
+ // The runtime cares about this.
+ // MOV64mr %rdx, 16(%rsp)
+ unsigned MOVmr = Uses64BitFramePtr ? X86::MOV64mr : X86::MOV32mr;
+ addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(MOVmr)), StackPtr, true, 16)
+ .addReg(Establisher)
+ .setMIFlag(MachineInstr::FrameSetup);
+ MBB.addLiveIn(Establisher);
+ }
+
+ if (HasFP) {
+ // Calculate required stack adjustment.
+ uint64_t FrameSize = StackSize - SlotSize;
+ // If required, include space for extra hidden slot for stashing base pointer.
+ if (X86FI->getRestoreBasePointer())
+ FrameSize += SlotSize;
+
+ NumBytes = FrameSize - X86FI->getCalleeSavedFrameSize();
+
+ // Callee-saved registers are pushed on stack before the stack is realigned.
+ if (TRI->needsStackRealignment(MF) && !IsWin64Prologue)
+ NumBytes = RoundUpToAlignment(NumBytes, MaxAlign);
+
+ // Get the offset of the stack slot for the EBP register, which is
+ // guaranteed to be the last slot by processFunctionBeforeFrameFinalized.
+ // Update the frame offset adjustment.
+ if (!IsFunclet)
+ MFI->setOffsetAdjustment(-NumBytes);
+ else
+ assert(MFI->getOffsetAdjustment() == -(int)NumBytes &&
+ "should calculate same local variable offset for funclets");
+
+ // Save EBP/RBP into the appropriate stack slot.
+ BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::PUSH64r : X86::PUSH32r))
+ .addReg(MachineFramePtr, RegState::Kill)
+ .setMIFlag(MachineInstr::FrameSetup);
+
+ if (NeedsDwarfCFI) {
+ // Mark the place where EBP/RBP was saved.
+ // Define the current CFA rule to use the provided offset.
+ assert(StackSize);
+ BuildCFI(MBB, MBBI, DL,
+ MCCFIInstruction::createDefCfaOffset(nullptr, 2 * stackGrowth));
+
+ // Change the rule for the FramePtr to be an "offset" rule.
+ unsigned DwarfFramePtr = TRI->getDwarfRegNum(MachineFramePtr, true);
+ BuildCFI(MBB, MBBI, DL, MCCFIInstruction::createOffset(
+ nullptr, DwarfFramePtr, 2 * stackGrowth));
+ }
+
+ if (NeedsWinCFI) {
+ BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg))
+ .addImm(FramePtr)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+
+ if (!IsWin64Prologue && !IsFunclet) {
+ // Update EBP with the new base value.
+ BuildMI(MBB, MBBI, DL,
+ TII.get(Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr),
+ FramePtr)
+ .addReg(StackPtr)
+ .setMIFlag(MachineInstr::FrameSetup);
+
+ if (NeedsDwarfCFI) {
+ // Mark effective beginning of when frame pointer becomes valid.
+ // Define the current CFA to use the EBP/RBP register.
+ unsigned DwarfFramePtr = TRI->getDwarfRegNum(MachineFramePtr, true);
+ BuildCFI(MBB, MBBI, DL, MCCFIInstruction::createDefCfaRegister(
+ nullptr, DwarfFramePtr));
+ }
+ }
+
+ // Mark the FramePtr as live-in in every block. Don't do this again for
+ // funclet prologues.
+ if (!IsFunclet) {
+ for (MachineBasicBlock &EveryMBB : MF)
+ EveryMBB.addLiveIn(MachineFramePtr);
+ }
+ } else {
+ assert(!IsFunclet && "funclets without FPs not yet implemented");
+ NumBytes = StackSize - X86FI->getCalleeSavedFrameSize();
+ }
+
+ // For EH funclets, only allocate enough space for outgoing calls. Save the
+ // NumBytes value that we would've used for the parent frame.
+ unsigned ParentFrameNumBytes = NumBytes;
+ if (IsFunclet)
+ NumBytes = getWinEHFuncletFrameSize(MF);
+
+ // Skip the callee-saved push instructions.
+ bool PushedRegs = false;
+ int StackOffset = 2 * stackGrowth;
+
+ while (MBBI != MBB.end() &&
+ MBBI->getFlag(MachineInstr::FrameSetup) &&
+ (MBBI->getOpcode() == X86::PUSH32r ||
+ MBBI->getOpcode() == X86::PUSH64r)) {
+ PushedRegs = true;
+ unsigned Reg = MBBI->getOperand(0).getReg();
+ ++MBBI;
+
+ if (!HasFP && NeedsDwarfCFI) {
+ // Mark callee-saved push instruction.
+ // Define the current CFA rule to use the provided offset.
+ assert(StackSize);
+ BuildCFI(MBB, MBBI, DL,
+ MCCFIInstruction::createDefCfaOffset(nullptr, StackOffset));
+ StackOffset += stackGrowth;
+ }
+
+ if (NeedsWinCFI) {
+ BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg)).addImm(Reg).setMIFlag(
+ MachineInstr::FrameSetup);
+ }
+ }
+
+ // Realign stack after we pushed callee-saved registers (so that we'll be
+ // able to calculate their offsets from the frame pointer).
+ // Don't do this for Win64, it needs to realign the stack after the prologue.
+ if (!IsWin64Prologue && !IsFunclet && TRI->needsStackRealignment(MF)) {
+ assert(HasFP && "There should be a frame pointer if stack is realigned.");
+ BuildStackAlignAND(MBB, MBBI, DL, StackPtr, MaxAlign);
+ }
+
+ // If there is a SUB32ri of ESP immediately before this instruction, merge
+ // the two. This can be the case when tail call elimination is enabled and
+ // the callee has more arguments than the caller.
+ NumBytes -= mergeSPUpdates(MBB, MBBI, true);
+
+ // Adjust stack pointer: ESP -= numbytes.
+
+ // Windows and cygwin/mingw require a prologue helper routine when allocating
+ // more than 4K bytes on the stack. Windows uses __chkstk and cygwin/mingw
+ // uses __alloca. __alloca and the 32-bit version of __chkstk will probe the
+ // stack and adjust the stack pointer in one go. The 64-bit version of
+ // __chkstk is only responsible for probing the stack. The 64-bit prologue is
+ // responsible for adjusting the stack pointer. Touching the stack at 4K
+ // increments is necessary to ensure that the guard pages used by the OS
+ // virtual memory manager are allocated in correct sequence.
+ uint64_t AlignedNumBytes = NumBytes;
+ if (IsWin64Prologue && !IsFunclet && TRI->needsStackRealignment(MF))
+ AlignedNumBytes = RoundUpToAlignment(AlignedNumBytes, MaxAlign);
+ if (AlignedNumBytes >= StackProbeSize && UseStackProbe) {
+ // Check whether EAX is livein for this function.
+ bool isEAXAlive = isEAXLiveIn(MF);
+
+ if (isEAXAlive) {
+ // EAX is live-in here, which we only know how to handle on 32-bit
+ // targets; it must never be live-in in the x64 case, so assert that.
+ assert(!Is64Bit && "EAX is livein in x64 case!");
+
+ // Save EAX
+ BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH32r))
+ .addReg(X86::EAX, RegState::Kill)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+
+ if (Is64Bit) {
+ // Handle the 64-bit Windows ABI case where we need to call __chkstk.
+ // Function prologue is responsible for adjusting the stack pointer.
+ if (isUInt<32>(NumBytes)) {
+ BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX)
+ .addImm(NumBytes)
+ .setMIFlag(MachineInstr::FrameSetup);
+ } else if (isInt<32>(NumBytes)) {
+ BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri32), X86::RAX)
+ .addImm(NumBytes)
+ .setMIFlag(MachineInstr::FrameSetup);
+ } else {
+ BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri), X86::RAX)
+ .addImm(NumBytes)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+ } else {
+ // If EAX is live, allocate NumBytes-4 bytes on the stack; the 4 bytes
+ // already pushed to save EAX complete the allocation.
+ BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX)
+ .addImm(isEAXAlive ? NumBytes - 4 : NumBytes)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+
+ // Call __chkstk, __chkstk_ms, or __alloca.
+ emitStackProbe(MF, MBB, MBBI, DL, true);
+
+ if (isEAXAlive) {
+ // Restore EAX
+ MachineInstr *MI =
+ addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm), X86::EAX),
+ StackPtr, false, NumBytes - 4);
+ MI->setFlag(MachineInstr::FrameSetup);
+ MBB.insert(MBBI, MI);
+ }
+ } else if (NumBytes) {
+ emitSPUpdate(MBB, MBBI, -(int64_t)NumBytes, /*InEpilogue=*/false);
+ }
+
+ if (NeedsWinCFI && NumBytes)
+ BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_StackAlloc))
+ .addImm(NumBytes)
+ .setMIFlag(MachineInstr::FrameSetup);
+
+ int SEHFrameOffset = 0;
+ unsigned SPOrEstablisher;
+ if (IsFunclet) {
+ if (IsClrFunclet) {
+ // The establisher parameter passed to a CLR funclet is actually a pointer
+ // to the (mostly empty) frame of its nearest enclosing funclet; we have
+ // to find the root function establisher frame by loading the PSPSym from
+ // the intermediate frame.
+ unsigned PSPSlotOffset = getPSPSlotOffsetFromSP(MF);
+ MachinePointerInfo NoInfo;
+ MBB.addLiveIn(Establisher);
+ addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64rm), Establisher),
+ Establisher, false, PSPSlotOffset)
+ .addMemOperand(MF.getMachineMemOperand(
+ NoInfo, MachineMemOperand::MOLoad, SlotSize, SlotSize));
+ // Save the root establisher back into the current funclet's (mostly
+ // empty) frame, in case a sub-funclet or the GC needs it.
+ addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64mr)), StackPtr,
+ false, PSPSlotOffset)
+ .addReg(Establisher)
+ .addMemOperand(
+ MF.getMachineMemOperand(NoInfo, MachineMemOperand::MOStore |
+ MachineMemOperand::MOVolatile,
+ SlotSize, SlotSize));
+ }
+ SPOrEstablisher = Establisher;
+ } else {
+ SPOrEstablisher = StackPtr;
+ }
+
+ if (IsWin64Prologue && HasFP) {
+ // Set RBP to a small fixed offset from RSP. In the funclet case, we base
+ // this calculation on the incoming establisher, which holds the value of
+ // RSP from the parent frame at the end of the prologue.
+ SEHFrameOffset = calculateSetFPREG(ParentFrameNumBytes);
+ if (SEHFrameOffset)
+ addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::LEA64r), FramePtr),
+ SPOrEstablisher, false, SEHFrameOffset);
+ else
+ BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64rr), FramePtr)
+ .addReg(SPOrEstablisher);
+
+ // If this is not a funclet, emit the CFI describing our frame pointer.
+ if (NeedsWinCFI && !IsFunclet) {
+ BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SetFrame))
+ .addImm(FramePtr)
+ .addImm(SEHFrameOffset)
+ .setMIFlag(MachineInstr::FrameSetup);
+ if (isAsynchronousEHPersonality(Personality))
+ MF.getWinEHFuncInfo()->SEHSetFrameOffset = SEHFrameOffset;
+ }
+ } else if (IsFunclet && STI.is32Bit()) {
+ // Reset EBP / ESI to something good for funclets.
+ MBBI = restoreWin32EHStackPointers(MBB, MBBI, DL);
+ // If we're a catch funclet, we can be returned to via catchret. Save ESP
+ // into the registration node so that the runtime will restore it for us.
+ if (!MBB.isCleanupFuncletEntry()) {
+ assert(Personality == EHPersonality::MSVC_CXX);
+ unsigned FrameReg;
+ int FI = MF.getWinEHFuncInfo()->EHRegNodeFrameIndex;
+ int64_t EHRegOffset = getFrameIndexReference(MF, FI, FrameReg);
+ // ESP is the first field, so no extra displacement is needed.
+ addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32mr)), FrameReg,
+ false, EHRegOffset)
+ .addReg(X86::ESP);
+ }
+ }
+
+ while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup)) {
+ const MachineInstr *FrameInstr = &*MBBI;
+ ++MBBI;
+
+ if (NeedsWinCFI) {
+ int FI;
+ if (unsigned Reg = TII.isStoreToStackSlot(FrameInstr, FI)) {
+ if (X86::FR64RegClass.contains(Reg)) {
+ unsigned IgnoredFrameReg;
+ int Offset = getFrameIndexReference(MF, FI, IgnoredFrameReg);
+ Offset += SEHFrameOffset;
+
+ BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SaveXMM))
+ .addImm(Reg)
+ .addImm(Offset)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+ }
+ }
+ }
+
+ if (NeedsWinCFI)
+ BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_EndPrologue))
+ .setMIFlag(MachineInstr::FrameSetup);
+
+ if (FnHasClrFunclet && !IsFunclet) {
+ // Save the so-called Initial-SP (i.e. the value of the stack pointer
+ // immediately after the prolog) into the PSPSlot so that funclets
+ // and the GC can recover it.
+ unsigned PSPSlotOffset = getPSPSlotOffsetFromSP(MF);
+ auto PSPInfo = MachinePointerInfo::getFixedStack(
+ MF, MF.getWinEHFuncInfo()->PSPSymFrameIdx);
+ addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64mr)), StackPtr, false,
+ PSPSlotOffset)
+ .addReg(StackPtr)
+ .addMemOperand(MF.getMachineMemOperand(
+ PSPInfo, MachineMemOperand::MOStore | MachineMemOperand::MOVolatile,
+ SlotSize, SlotSize));
+ }
+
+ // Realign stack after we spilled callee-saved registers (so that we'll be
+ // able to calculate their offsets from the frame pointer).
+ // Win64 requires aligning the stack after the prologue.
+ if (IsWin64Prologue && TRI->needsStackRealignment(MF)) {
+ assert(HasFP && "There should be a frame pointer if stack is realigned.");
+ BuildStackAlignAND(MBB, MBBI, DL, SPOrEstablisher, MaxAlign);
+ }
+
+ // We already dealt with stack realignment and funclets above.
+ if (IsFunclet && STI.is32Bit())
+ return;
+
+ // If we need a base pointer, set it up here. It's whatever the value
+ // of the stack pointer is at this point. Any variable size objects
+ // will be allocated after this, so we can still use the base pointer
+ // to reference locals.
+ if (TRI->hasBasePointer(MF)) {
+ // Update the base pointer with the current stack pointer.
+ unsigned Opc = Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr;
+ BuildMI(MBB, MBBI, DL, TII.get(Opc), BasePtr)
+ .addReg(SPOrEstablisher)
+ .setMIFlag(MachineInstr::FrameSetup);
+ if (X86FI->getRestoreBasePointer()) {
+ // Stash value of base pointer. Saving RSP instead of EBP shortens
+ // dependence chain. Used by SjLj EH.
+ unsigned Opm = Uses64BitFramePtr ? X86::MOV64mr : X86::MOV32mr;
+ addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opm)),
+ FramePtr, true, X86FI->getRestoreBasePointerOffset())
+ .addReg(SPOrEstablisher)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+
+ if (X86FI->getHasSEHFramePtrSave() && !IsFunclet) {
+ // Stash the value of the frame pointer relative to the base pointer for
+ // Win32 EH. This supports Win32 EH, which does the inverse of the above:
+ // it recovers the frame pointer from the base pointer rather than the
+ // other way around.
+ unsigned Opm = Uses64BitFramePtr ? X86::MOV64mr : X86::MOV32mr;
+ unsigned UsedReg;
+ int Offset =
+ getFrameIndexReference(MF, X86FI->getSEHFramePtrSaveIndex(), UsedReg);
+ assert(UsedReg == BasePtr);
+ addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opm)), UsedReg, true, Offset)
+ .addReg(FramePtr)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+ }
+
+ if (((!HasFP && NumBytes) || PushedRegs) && NeedsDwarfCFI) {
+ // Mark end of stack pointer adjustment.
+ if (!HasFP && NumBytes) {
+ // Define the current CFA rule to use the provided offset.
+ assert(StackSize);
+ BuildCFI(MBB, MBBI, DL, MCCFIInstruction::createDefCfaOffset(
+ nullptr, -StackSize + stackGrowth));
+ }
+
+ // Emit DWARF info specifying the offsets of the callee-saved registers.
+ if (PushedRegs)
+ emitCalleeSavedFrameMoves(MBB, MBBI, DL);
+ }
+}
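+
+// Worked example for the frameless case above (hypothetical numbers): on
+// x86-64, stackGrowth is -SlotSize == -8, so a function that allocates
+// StackSize == 40 bytes of locals ends its prologue with
+// '.cfi_def_cfa_offset 48' -- the allocation plus the pushed return address.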
+
+bool X86FrameLowering::canUseLEAForSPInEpilogue(
+ const MachineFunction &MF) const {
+ // We can't use LEA instructions for adjusting the stack pointer if this is a
+ // leaf function in the Win64 ABI. Only ADD instructions may be used to
+ // deallocate the stack.
+ // This means that we can use LEA for SP in two situations:
+ // 1. We *aren't* using the Win64 ABI which means we are free to use LEA.
+ // 2. We *have* a frame pointer which means we are permitted to use LEA.
+ return !MF.getTarget().getMCAsmInfo()->usesWindowsCFI() || hasFP(MF);
+}
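+
+// For example (hypothetical epilogues): with a frame pointer,
+//   lea -8(%rbp), %rsp ; pop %rbx ; pop %rbp ; ret
+// is acceptable everywhere, whereas a frameless function under the Win64 ABI
+// must deallocate with an ADD:
+//   add $40, %rsp ; ret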
+
+static bool isFuncletReturnInstr(MachineInstr *MI) {
+ switch (MI->getOpcode()) {
+ case X86::CATCHRET:
+ case X86::CLEANUPRET:
+ return true;
+ default:
+ return false;
+ }
+ llvm_unreachable("impossible");
+}
+
+// CLR funclets use a special "Previous Stack Pointer Symbol" slot on the
+// stack. It holds a pointer to the bottom of the root function frame. The
+// establisher frame pointer passed to a nested funclet may point to the
+// (mostly empty) frame of its parent funclet, but it will need to find
+// the frame of the root function to access locals. To facilitate this,
+// every funclet copies the pointer to the bottom of the root function
+// frame into a PSPSym slot in its own (mostly empty) stack frame. Using the
+// same offset for the PSPSym in the root function frame that's used in the
+// funclets' frames allows each funclet to dynamically accept any ancestor
+// frame as its establisher argument (the runtime doesn't guarantee the
+// immediate parent for some reason lost to history), and also allows the GC,
+// which uses the PSPSym for some bookkeeping, to find it in any funclet's
+// frame with only a single offset reported for the entire method.
+unsigned
+X86FrameLowering::getPSPSlotOffsetFromSP(const MachineFunction &MF) const {
+ const WinEHFuncInfo &Info = *MF.getWinEHFuncInfo();
+ // getFrameIndexReferenceFromSP has an out ref parameter for the stack
+ // pointer register; pass a dummy that we ignore
+ unsigned SPReg;
+ int Offset = getFrameIndexReferenceFromSP(MF, Info.PSPSymFrameIdx, SPReg);
+ assert(Offset >= 0);
+ return static_cast<unsigned>(Offset);
+}
+
+unsigned
+X86FrameLowering::getWinEHFuncletFrameSize(const MachineFunction &MF) const {
+ // This is the size of the pushed CSRs.
+ unsigned CSSize =
+ MF.getInfo<X86MachineFunctionInfo>()->getCalleeSavedFrameSize();
+ // This is the amount of stack a funclet needs to allocate.
+ unsigned UsedSize;
+ EHPersonality Personality =
+ classifyEHPersonality(MF.getFunction()->getPersonalityFn());
+ if (Personality == EHPersonality::CoreCLR) {
+ // CLR funclets need to hold enough space to include the PSPSym, at the
+ // same offset from the stack pointer (immediately after the prolog) as it
+ // resides at in the main function.
+ UsedSize = getPSPSlotOffsetFromSP(MF) + SlotSize;
+ } else {
+ // Other funclets just need enough stack for outgoing call arguments.
+ UsedSize = MF.getFrameInfo()->getMaxCallFrameSize();
+ }
+ // RBP is not included in the callee-saved register block. After pushing RBP,
+ // everything is 16-byte aligned. Everything we allocate before an outgoing
+ // call must also be 16-byte aligned.
+ unsigned FrameSizeMinusRBP =
+ RoundUpToAlignment(CSSize + UsedSize, getStackAlignment());
+ // Subtract out the size of the callee saved registers. This is how much stack
+ // each funclet will allocate.
+ return FrameSizeMinusRBP - CSSize;
+}
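+
+// A minimal standalone sketch (hypothetical helper, not an LLVM API) of the
+// arithmetic above, assuming the usual 16-byte stack alignment:
+static unsigned funcletAllocSketch(unsigned CSSize, unsigned UsedSize) {
+ // Round the pushed CSRs plus the funclet's own usage up to the alignment...
+ unsigned FrameSizeMinusRBP = (CSSize + UsedSize + 15) & ~15u;
+ // ...then subtract the CSRs, which are pushed rather than allocated.
+ return FrameSizeMinusRBP - CSSize; // e.g. (40, 32) -> 80 - 40 == 40
+}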
+
+void X86FrameLowering::emitEpilogue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const {
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+ MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
+ DebugLoc DL;
+ if (MBBI != MBB.end())
+ DL = MBBI->getDebugLoc();
+ // Standard x86_64 and NaCl use 64-bit frame/stack pointers; x32 uses 32-bit.
+ const bool Is64BitILP32 = STI.isTarget64BitILP32();
+ unsigned FramePtr = TRI->getFrameRegister(MF);
+ unsigned MachineFramePtr =
+ Is64BitILP32 ? getX86SubSuperRegister(FramePtr, 64) : FramePtr;
+
+ bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
+ bool NeedsWinCFI =
+ IsWin64Prologue && MF.getFunction()->needsUnwindTableEntry();
+ bool IsFunclet = isFuncletReturnInstr(MBBI);
+ MachineBasicBlock *TargetMBB = nullptr;
+
+ // Get the number of bytes to allocate from the FrameInfo.
+ uint64_t StackSize = MFI->getStackSize();
+ uint64_t MaxAlign = calculateMaxStackAlign(MF);
+ unsigned CSSize = X86FI->getCalleeSavedFrameSize();
+ uint64_t NumBytes = 0;
+
+ if (MBBI->getOpcode() == X86::CATCHRET) {
+ // SEH shouldn't use catchret.
+ assert(!isAsynchronousEHPersonality(
+ classifyEHPersonality(MF.getFunction()->getPersonalityFn())) &&
+ "SEH should not use CATCHRET");
+
+ NumBytes = getWinEHFuncletFrameSize(MF);
+ assert(hasFP(MF) && "EH funclets without FP not yet implemented");
+ TargetMBB = MBBI->getOperand(0).getMBB();
+
+ // Pop EBP.
+ BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::POP64r : X86::POP32r),
+ MachineFramePtr)
+ .setMIFlag(MachineInstr::FrameDestroy);
+ } else if (MBBI->getOpcode() == X86::CLEANUPRET) {
+ NumBytes = getWinEHFuncletFrameSize(MF);
+ assert(hasFP(MF) && "EH funclets without FP not yet implemented");
+ BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::POP64r : X86::POP32r),
+ MachineFramePtr)
+ .setMIFlag(MachineInstr::FrameDestroy);
+ } else if (hasFP(MF)) {
+ // Calculate required stack adjustment.
+ uint64_t FrameSize = StackSize - SlotSize;
+ NumBytes = FrameSize - CSSize;
+
+ // Callee-saved registers were pushed on stack before the stack was
+ // realigned.
+ if (TRI->needsStackRealignment(MF) && !IsWin64Prologue)
+ NumBytes = RoundUpToAlignment(FrameSize, MaxAlign);
+
+ // Pop EBP.
+ BuildMI(MBB, MBBI, DL,
+ TII.get(Is64Bit ? X86::POP64r : X86::POP32r), MachineFramePtr)
+ .setMIFlag(MachineInstr::FrameDestroy);
+ } else {
+ NumBytes = StackSize - CSSize;
+ }
+ uint64_t SEHStackAllocAmt = NumBytes;
+
+ // Skip the callee-saved pop instructions.
+ while (MBBI != MBB.begin()) {
+ MachineBasicBlock::iterator PI = std::prev(MBBI);
+ unsigned Opc = PI->getOpcode();
+
+ if ((Opc != X86::POP32r || !PI->getFlag(MachineInstr::FrameDestroy)) &&
+ (Opc != X86::POP64r || !PI->getFlag(MachineInstr::FrameDestroy)) &&
+ Opc != X86::DBG_VALUE && !PI->isTerminator())
+ break;
+
+ --MBBI;
+ }
+ MachineBasicBlock::iterator FirstCSPop = MBBI;
+
+ if (TargetMBB) {
+ // Fill EAX/RAX with the address of the target block.
+ unsigned ReturnReg = STI.is64Bit() ? X86::RAX : X86::EAX;
+ if (STI.is64Bit()) {
+ // LEA64r TargetMBB(%rip), %rax
+ BuildMI(MBB, FirstCSPop, DL, TII.get(X86::LEA64r), ReturnReg)
+ .addReg(X86::RIP)
+ .addImm(0)
+ .addReg(0)
+ .addMBB(TargetMBB)
+ .addReg(0);
+ } else {
+ // MOV32ri $TargetMBB, %eax
+ BuildMI(MBB, FirstCSPop, DL, TII.get(X86::MOV32ri), ReturnReg)
+ .addMBB(TargetMBB);
+ }
+ // Record that we've taken the address of TargetMBB and no longer just
+ // reference it in a terminator.
+ TargetMBB->setHasAddressTaken();
+ }
+
+ if (MBBI != MBB.end())
+ DL = MBBI->getDebugLoc();
+
+ // If there is an ADD32ri or SUB32ri of ESP immediately before this
+ // instruction, merge the two instructions.
+ if (NumBytes || MFI->hasVarSizedObjects())
+ NumBytes += mergeSPUpdates(MBB, MBBI, true);
+
+ // If dynamic alloca is used, then reset ESP to point to the last callee-saved
+ // slot before popping them off. The same applies when the stack was
+ // realigned. Don't do this if this was a funclet epilogue, since funclets
+ // do not perform realignment or dynamic stack allocation.
+ if ((TRI->needsStackRealignment(MF) || MFI->hasVarSizedObjects()) &&
+ !IsFunclet) {
+ if (TRI->needsStackRealignment(MF))
+ MBBI = FirstCSPop;
+ unsigned SEHFrameOffset = calculateSetFPREG(SEHStackAllocAmt);
+ uint64_t LEAAmount =
+ IsWin64Prologue ? SEHStackAllocAmt - SEHFrameOffset : -CSSize;
+
+ // There are only two legal forms of epilogue:
+ // - add SEHAllocationSize, %rsp
+ // - lea SEHAllocationSize(%FramePtr), %rsp
+ //
+ // 'mov %FramePtr, %rsp' will not be recognized as an epilogue sequence.
+ // However, we may use this sequence if we have a frame pointer because the
+ // effects of the prologue can safely be undone.
+ if (LEAAmount != 0) {
+ unsigned Opc = getLEArOpcode(Uses64BitFramePtr);
+ addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr),
+ FramePtr, false, LEAAmount);
+ --MBBI;
+ } else {
+ unsigned Opc = (Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr);
+ BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr)
+ .addReg(FramePtr);
+ --MBBI;
+ }
+ } else if (NumBytes) {
+ // Adjust stack pointer back: ESP += numbytes.
+ emitSPUpdate(MBB, MBBI, NumBytes, /*InEpilogue=*/true);
+ --MBBI;
+ }
+
+ // The Windows unwinder will not invoke a function's exception handler if the
+ // IP is in the prologue or the epilogue. This behavior causes a problem when a
+ // call immediately precedes an epilogue, because the return address points
+ // into the epilogue. To cope with that, we insert an epilogue marker here,
+ // then replace it with a 'nop' if it ends up immediately after a CALL in the
+ // final emitted code.
+ if (NeedsWinCFI)
+ BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_Epilogue));
+
+ // Add the return addr area delta back since we are not tail calling.
+ int Offset = -1 * X86FI->getTCReturnAddrDelta();
+ assert(Offset >= 0 && "TCDelta should never be positive");
+ if (Offset) {
+ MBBI = MBB.getFirstTerminator();
+
+ // Check for possible merge with preceding ADD instruction.
+ Offset += mergeSPUpdates(MBB, MBBI, true);
+ emitSPUpdate(MBB, MBBI, Offset, /*InEpilogue=*/true);
+ }
+}
+
+// NOTE: this only has a subset of the full frame index logic. In
+// particular, the FI < 0 and AfterFPPop logic is handled in
+// X86RegisterInfo::eliminateFrameIndex, but not here. Possibly
+// (probably?) it should be moved into here.
+int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
+ unsigned &FrameReg) const {
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+
+ // We can't calculate offset from frame pointer if the stack is realigned,
+ // so enforce usage of stack/base pointer. The base pointer is used when we
+ // have dynamic allocas in addition to dynamic realignment.
+ if (TRI->hasBasePointer(MF))
+ FrameReg = TRI->getBaseRegister();
+ else if (TRI->needsStackRealignment(MF))
+ FrameReg = TRI->getStackRegister();
+ else
+ FrameReg = TRI->getFrameRegister(MF);
+
+ // Offset will hold the offset from the stack pointer at function entry to the
+ // object.
+ // We need to factor in additional offsets applied during the prologue to the
+ // frame, base, and stack pointer depending on which is used.
+ int Offset = MFI->getObjectOffset(FI) - getOffsetOfLocalArea();
+ const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+ unsigned CSSize = X86FI->getCalleeSavedFrameSize();
+ uint64_t StackSize = MFI->getStackSize();
+ bool HasFP = hasFP(MF);
+ bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
+ int64_t FPDelta = 0;
+
+ if (IsWin64Prologue) {
+ assert(!MFI->hasCalls() || (StackSize % 16) == 8);
+
+ // Calculate required stack adjustment.
+ uint64_t FrameSize = StackSize - SlotSize;
+ // If required, include space for an extra hidden slot to stash the base pointer.
+ if (X86FI->getRestoreBasePointer())
+ FrameSize += SlotSize;
+ uint64_t NumBytes = FrameSize - CSSize;
+
+ uint64_t SEHFrameOffset = calculateSetFPREG(NumBytes);
+ if (FI && FI == X86FI->getFAIndex())
+ return -SEHFrameOffset;
+
+ // FPDelta is the offset between the "traditional" FP location (the old base
+ // pointer followed by the return address) and the location required by the
+ // restricted Win64 prologue.
+ // Add FPDelta to all offsets below that go through the frame pointer.
+ FPDelta = FrameSize - SEHFrameOffset;
+ assert((!MFI->hasCalls() || (FPDelta % 16) == 0) &&
+ "FPDelta isn't aligned per the Win64 ABI!");
+ }
+
+ if (TRI->hasBasePointer(MF)) {
+ assert(HasFP && "VLAs and dynamic stack realign, but no FP?!");
+ if (FI < 0) {
+ // Skip the saved EBP.
+ return Offset + SlotSize + FPDelta;
+ } else {
+ assert((-(Offset + StackSize)) % MFI->getObjectAlignment(FI) == 0);
+ return Offset + StackSize;
+ }
+ } else if (TRI->needsStackRealignment(MF)) {
+ if (FI < 0) {
+ // Skip the saved EBP.
+ return Offset + SlotSize + FPDelta;
+ } else {
+ assert((-(Offset + StackSize)) % MFI->getObjectAlignment(FI) == 0);
+ return Offset + StackSize;
+ }
+ // FIXME: Support tail calls
+ } else {
+ if (!HasFP)
+ return Offset + StackSize;
+
+ // Skip the saved EBP.
+ Offset += SlotSize;
+
+ // Skip the RETADDR move area
+ int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
+ if (TailCallReturnAddrDelta < 0)
+ Offset -= TailCallReturnAddrDelta;
+ }
+
+ return Offset + FPDelta;
+}
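+
+// Worked example for the Win64 branch above (hypothetical frame): with
+// StackSize == 88, SlotSize == 8 and CSSize == 16, FrameSize is 80 and
+// NumBytes is 64. Assuming calculateSetFPREG leaves a 64-byte adjustment
+// unclamped, SEHFrameOffset == 64 and FPDelta == 80 - 64 == 16, which is
+// then folded into every frame-pointer-relative offset returned above.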
+
+// Simplified from getFrameIndexReference keeping only StackPointer cases
+int X86FrameLowering::getFrameIndexReferenceFromSP(const MachineFunction &MF,
+ int FI,
+ unsigned &FrameReg) const {
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ // Does not include any dynamic realign.
+ const uint64_t StackSize = MFI->getStackSize();
+ {
+#ifndef NDEBUG
+ // LLVM arranges the stack as follows:
+ // ...
+ // ARG2
+ // ARG1
+ // RETADDR
+ // PUSH RBP <-- RBP points here
+ // PUSH CSRs
+ // ~~~~~~~ <-- possible stack realignment (non-win64)
+ // ...
+ // STACK OBJECTS
+ // ... <-- RSP after prologue points here
+ // ~~~~~~~ <-- possible stack realignment (win64)
+ //
+ // if (hasVarSizedObjects()):
+ // ... <-- "base pointer" (ESI/RBX) points here
+ // DYNAMIC ALLOCAS
+ // ... <-- RSP points here
+ //
+ // Case 1: In the simple case of no stack realignment and no dynamic
+ // allocas, both "fixed" stack objects (arguments and CSRs) are addressable
+ // with fixed offsets from RSP.
+ //
+ // Case 2: In the case of stack realignment with no dynamic allocas, fixed
+ // stack objects are addressed with RBP and regular stack objects with RSP.
+ //
+ // Case 3: In the case of dynamic allocas and stack realignment, RSP is used
+ // to address stack arguments for outgoing calls and nothing else. The "base
+ // pointer" points to local variables, and RBP points to fixed objects.
+ //
+ // In cases 2 and 3, we can only answer for non-fixed stack objects, and the
+ // answer we give is relative to the SP after the prologue, and not the
+ // SP in the middle of the function.
+
+ assert((!MFI->isFixedObjectIndex(FI) || !TRI->needsStackRealignment(MF) ||
+ STI.isTargetWin64()) &&
+ "offset from fixed object to SP is not static");
+
+ // We don't handle tail calls, and shouldn't be seeing them either.
+ int TailCallReturnAddrDelta =
+ MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta();
+ assert(!(TailCallReturnAddrDelta < 0) && "we don't handle this case!");
+#endif
+ }
+
+ // Fill in FrameReg output argument.
+ FrameReg = TRI->getStackRegister();
+
+ // This is how the math works out:
+ //
+ // %rsp grows (i.e. gets lower) left to right. Each box below is
+ // one word (eight bytes). Obj0 is the stack slot we're trying to
+ // get to.
+ //
+ // ----------------------------------
+ // | BP | Obj0 | Obj1 | ... | ObjN |
+ // ----------------------------------
+ // ^ ^ ^ ^
+ // A B C E
+ //
+ // A is the incoming stack pointer.
+ // (B - A) is the local area offset (-8 for x86-64) [1]
+ // (C - A) is the Offset returned by MFI->getObjectOffset for Obj0 [2]
+ //
+ // |(E - B)| is the StackSize (absolute value, positive). For a
+ // stack that grows down, this works out to be (B - E). [3]
+ //
+ // E is also the value of %rsp after stack has been set up, and we
+ // want (C - E) -- the value we can add to %rsp to get to Obj0. Now
+ // (C - E) == (C - A) - (B - A) + (B - E)
+ // { Using [1], [2] and [3] above }
+ // == getObjectOffset - LocalAreaOffset + StackSize
+ //
+
+ // Get the Offset from the StackPointer
+ int Offset = MFI->getObjectOffset(FI) - getOffsetOfLocalArea();
+
+ return Offset + StackSize;
+}
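+
+// Plugging hypothetical numbers into the derivation above: with the x86-64
+// local area offset of -8, getObjectOffset(Obj0) == -24 and StackSize == 32,
+//   (C - E) == (-24) - (-8) + 32 == 16,
+// i.e. Obj0 is addressable as 16(%rsp) once the prologue has run.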
+
+bool X86FrameLowering::assignCalleeSavedSpillSlots(
+ MachineFunction &MF, const TargetRegisterInfo *TRI,
+ std::vector<CalleeSavedInfo> &CSI) const {
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+
+ unsigned CalleeSavedFrameSize = 0;
+ int SpillSlotOffset = getOffsetOfLocalArea() + X86FI->getTCReturnAddrDelta();
+
+ if (hasFP(MF)) {
+ // emitPrologue always spills the frame register first.
+ SpillSlotOffset -= SlotSize;
+ MFI->CreateFixedSpillStackObject(SlotSize, SpillSlotOffset);
+
+ // Since emitPrologue and emitEpilogue will handle spilling and restoring of
+ // the frame register, we can delete it from the CSI list and not have to
+ // worry about avoiding it later.
+ unsigned FPReg = TRI->getFrameRegister(MF);
+ for (unsigned i = 0; i < CSI.size(); ++i) {
+ if (TRI->regsOverlap(CSI[i].getReg(),FPReg)) {
+ CSI.erase(CSI.begin() + i);
+ break;
+ }
+ }
+ }
+
+ // Assign slots for GPRs. It increases frame size.
+ for (unsigned i = CSI.size(); i != 0; --i) {
+ unsigned Reg = CSI[i - 1].getReg();
+
+ if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg))
+ continue;
+
+ SpillSlotOffset -= SlotSize;
+ CalleeSavedFrameSize += SlotSize;
+
+ int SlotIndex = MFI->CreateFixedSpillStackObject(SlotSize, SpillSlotOffset);
+ CSI[i - 1].setFrameIdx(SlotIndex);
+ }
+
+ X86FI->setCalleeSavedFrameSize(CalleeSavedFrameSize);
+
+ // Assign slots for XMMs.
+ for (unsigned i = CSI.size(); i != 0; --i) {
+ unsigned Reg = CSI[i - 1].getReg();
+ if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg))
+ continue;
+
+ const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+ // ensure alignment
+ SpillSlotOffset -= std::abs(SpillSlotOffset) % RC->getAlignment();
+ // spill into slot
+ SpillSlotOffset -= RC->getSize();
+ int SlotIndex =
+ MFI->CreateFixedSpillStackObject(RC->getSize(), SpillSlotOffset);
+ CSI[i - 1].setFrameIdx(SlotIndex);
+ MFI->ensureMaxAlignment(RC->getAlignment());
+ }
+
+ return true;
+}
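+
+// e.g. (hypothetical x86-64 function with a frame pointer and a zero
+// tail-call delta): SpillSlotOffset starts at -8, the frame register save
+// takes the fixed slot at -16, two GPR CSRs land at -24 and -32
+// (CalleeSavedFrameSize == 16), and a following 16-byte XMM slot is aligned
+// down before being carved out, ending up at -48.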
+
+bool X86FrameLowering::spillCalleeSavedRegisters(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const {
+ DebugLoc DL = MBB.findDebugLoc(MI);
+
+ // Don't save CSRs in 32-bit EH funclets. The caller saves EBX, EBP, ESI, EDI
+ // for us, and there are no XMM CSRs on Win32.
+ if (MBB.isEHFuncletEntry() && STI.is32Bit() && STI.isOSWindows())
+ return true;
+
+ // Push GPRs. It increases frame size.
+ unsigned Opc = STI.is64Bit() ? X86::PUSH64r : X86::PUSH32r;
+ for (unsigned i = CSI.size(); i != 0; --i) {
+ unsigned Reg = CSI[i - 1].getReg();
+
+ if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg))
+ continue;
+ // Add the callee-saved register as live-in. It's killed at the spill.
+ MBB.addLiveIn(Reg);
+
+ BuildMI(MBB, MI, DL, TII.get(Opc)).addReg(Reg, RegState::Kill)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+
+ // Spill the XMM regs. X86 has no push/pop instructions for XMM registers,
+ // so spill them to the stack frame instead.
+ for (unsigned i = CSI.size(); i != 0; --i) {
+ unsigned Reg = CSI[i-1].getReg();
+ if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg))
+ continue;
+ // Add the callee-saved register as live-in. It's killed at the spill.
+ MBB.addLiveIn(Reg);
+ const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+
+ TII.storeRegToStackSlot(MBB, MI, Reg, true, CSI[i - 1].getFrameIdx(), RC,
+ TRI);
+ --MI;
+ MI->setFlag(MachineInstr::FrameSetup);
+ ++MI;
+ }
+
+ return true;
+}
+
+bool X86FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const {
+ if (CSI.empty())
+ return false;
+
+ if (isFuncletReturnInstr(MI) && STI.isOSWindows()) {
+ // Don't restore CSRs in 32-bit EH funclets. Matches
+ // spillCalleeSavedRegisters.
+ if (STI.is32Bit())
+ return true;
+ // Don't restore CSRs before an SEH catchret. SEH except blocks do not form
+ // funclets. emitEpilogue transforms these to normal jumps.
+ if (MI->getOpcode() == X86::CATCHRET) {
+ const Function *Func = MBB.getParent()->getFunction();
+ bool IsSEH = isAsynchronousEHPersonality(
+ classifyEHPersonality(Func->getPersonalityFn()));
+ if (IsSEH)
+ return true;
+ }
+ }
+
+ DebugLoc DL = MBB.findDebugLoc(MI);
+
+ // Reload XMMs from stack frame.
+ for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+ unsigned Reg = CSI[i].getReg();
+ if (X86::GR64RegClass.contains(Reg) ||
+ X86::GR32RegClass.contains(Reg))
+ continue;
+
+ const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+ TII.loadRegFromStackSlot(MBB, MI, Reg, CSI[i].getFrameIdx(), RC, TRI);
+ }
+
+ // POP GPRs.
+ unsigned Opc = STI.is64Bit() ? X86::POP64r : X86::POP32r;
+ for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+ unsigned Reg = CSI[i].getReg();
+ if (!X86::GR64RegClass.contains(Reg) &&
+ !X86::GR32RegClass.contains(Reg))
+ continue;
+
+ BuildMI(MBB, MI, DL, TII.get(Opc), Reg)
+ .setMIFlag(MachineInstr::FrameDestroy);
+ }
+ return true;
+}
+
+void X86FrameLowering::determineCalleeSaves(MachineFunction &MF,
+ BitVector &SavedRegs,
+ RegScavenger *RS) const {
+ TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
+
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+
+ X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+ int64_t TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
+
+ if (TailCallReturnAddrDelta < 0) {
+ // create RETURNADDR area
+ // arg
+ // arg
+ // RETADDR
+ // { ...
+ // RETADDR area
+ // ...
+ // }
+ // [EBP]
+ MFI->CreateFixedObject(-TailCallReturnAddrDelta,
+ TailCallReturnAddrDelta - SlotSize, true);
+ }
+
+ // Spill the BasePtr if it's used.
+ if (TRI->hasBasePointer(MF)) {
+ SavedRegs.set(TRI->getBaseRegister());
+
+ // Allocate a spill slot for EBP if we have a base pointer and EH funclets.
+ if (MF.getMMI().hasEHFunclets()) {
+ int FI = MFI->CreateSpillStackObject(SlotSize, SlotSize);
+ X86FI->setHasSEHFramePtrSave(true);
+ X86FI->setSEHFramePtrSaveIndex(FI);
+ }
+ }
+}
+
+static bool
+HasNestArgument(const MachineFunction *MF) {
+ const Function *F = MF->getFunction();
+ for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
+ I != E; I++) {
+ if (I->hasNestAttr())
+ return true;
+ }
+ return false;
+}
+
+/// GetScratchRegister - Get a temp register for performing work in the
+/// segmented stack and the Erlang/HiPE stack prologue. Depending on platform
+/// and the properties of the function either one or two registers will be
+/// needed. Set primary to true for the first register, false for the second.
+static unsigned
+GetScratchRegister(bool Is64Bit, bool IsLP64, const MachineFunction &MF, bool Primary) {
+ CallingConv::ID CallingConvention = MF.getFunction()->getCallingConv();
+
+ // Erlang stuff.
+ if (CallingConvention == CallingConv::HiPE) {
+ if (Is64Bit)
+ return Primary ? X86::R14 : X86::R13;
+ else
+ return Primary ? X86::EBX : X86::EDI;
+ }
+
+ if (Is64Bit) {
+ if (IsLP64)
+ return Primary ? X86::R11 : X86::R12;
+ else
+ return Primary ? X86::R11D : X86::R12D;
+ }
+
+ bool IsNested = HasNestArgument(&MF);
+
+ if (CallingConvention == CallingConv::X86_FastCall ||
+ CallingConvention == CallingConv::Fast) {
+ if (IsNested)
+ report_fatal_error("Segmented stacks do not support fastcall with "
+ "nested functions.");
+ return Primary ? X86::EAX : X86::ECX;
+ }
+ if (IsNested)
+ return Primary ? X86::EDX : X86::EAX;
+ return Primary ? X86::ECX : X86::EAX;
+}
+
+// The stack limit in the TCB is set to this many bytes above the actual stack
+// limit.
+static const uint64_t kSplitStackAvailable = 256;
+
+void X86FrameLowering::adjustForSegmentedStacks(
+ MachineFunction &MF, MachineBasicBlock &PrologueMBB) const {
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ uint64_t StackSize;
+ unsigned TlsReg, TlsOffset;
+ DebugLoc DL;
+
+ unsigned ScratchReg = GetScratchRegister(Is64Bit, IsLP64, MF, true);
+ assert(!MF.getRegInfo().isLiveIn(ScratchReg) &&
+ "Scratch register is live-in");
+
+ if (MF.getFunction()->isVarArg())
+ report_fatal_error("Segmented stacks do not support vararg functions.");
+ if (!STI.isTargetLinux() && !STI.isTargetDarwin() && !STI.isTargetWin32() &&
+ !STI.isTargetWin64() && !STI.isTargetFreeBSD() &&
+ !STI.isTargetDragonFly())
+ report_fatal_error("Segmented stacks not supported on this platform.");
+
+ // Eventually StackSize will be calculated by a link-time pass, which will
+ // also decide whether checking code needs to be injected into this particular
+ // prologue.
+ StackSize = MFI->getStackSize();
+
+ // Do not generate a prologue for functions with a stack of size zero.
+ if (StackSize == 0)
+ return;
+
+ MachineBasicBlock *allocMBB = MF.CreateMachineBasicBlock();
+ MachineBasicBlock *checkMBB = MF.CreateMachineBasicBlock();
+ X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+ bool IsNested = false;
+
+ // We need to know if the function has a nest argument only in 64 bit mode.
+ if (Is64Bit)
+ IsNested = HasNestArgument(&MF);
+
+ // The MOV R10, RAX needs to be in a different block, since the RET we emit in
+ // allocMBB needs to be the last (terminating) instruction.
+
+ for (const auto &LI : PrologueMBB.liveins()) {
+ allocMBB->addLiveIn(LI);
+ checkMBB->addLiveIn(LI);
+ }
+
+ if (IsNested)
+ allocMBB->addLiveIn(IsLP64 ? X86::R10 : X86::R10D);
+
+ MF.push_front(allocMBB);
+ MF.push_front(checkMBB);
+
+ // When the frame size is less than 256 we just compare the stack
+ // boundary directly to the value of the stack pointer, per gcc.
+ bool CompareStackPointer = StackSize < kSplitStackAvailable;
+
+ // Read the limit of the current stacklet from the stack_guard location.
+ if (Is64Bit) {
+ if (STI.isTargetLinux()) {
+ TlsReg = X86::FS;
+ TlsOffset = IsLP64 ? 0x70 : 0x40;
+ } else if (STI.isTargetDarwin()) {
+ TlsReg = X86::GS;
+ TlsOffset = 0x60 + 90*8; // See pthread_machdep.h. Steal TLS slot 90.
+ } else if (STI.isTargetWin64()) {
+ TlsReg = X86::GS;
+ TlsOffset = 0x28; // pvArbitrary, reserved for application use
+ } else if (STI.isTargetFreeBSD()) {
+ TlsReg = X86::FS;
+ TlsOffset = 0x18;
+ } else if (STI.isTargetDragonFly()) {
+ TlsReg = X86::FS;
+ TlsOffset = 0x20; // use tls_tcb.tcb_segstack
+ } else {
+ report_fatal_error("Segmented stacks not supported on this platform.");
+ }
+
+ if (CompareStackPointer)
+ ScratchReg = IsLP64 ? X86::RSP : X86::ESP;
+ else
+ BuildMI(checkMBB, DL, TII.get(IsLP64 ? X86::LEA64r : X86::LEA64_32r), ScratchReg).addReg(X86::RSP)
+ .addImm(1).addReg(0).addImm(-StackSize).addReg(0);
+
+ BuildMI(checkMBB, DL, TII.get(IsLP64 ? X86::CMP64rm : X86::CMP32rm)).addReg(ScratchReg)
+ .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg);
+ } else {
+ if (STI.isTargetLinux()) {
+ TlsReg = X86::GS;
+ TlsOffset = 0x30;
+ } else if (STI.isTargetDarwin()) {
+ TlsReg = X86::GS;
+ TlsOffset = 0x48 + 90*4;
+ } else if (STI.isTargetWin32()) {
+ TlsReg = X86::FS;
+ TlsOffset = 0x14; // pvArbitrary, reserved for application use
+ } else if (STI.isTargetDragonFly()) {
+ TlsReg = X86::FS;
+ TlsOffset = 0x10; // use tls_tcb.tcb_segstack
+ } else if (STI.isTargetFreeBSD()) {
+ report_fatal_error("Segmented stacks not supported on FreeBSD i386.");
+ } else {
+ report_fatal_error("Segmented stacks not supported on this platform.");
+ }
+
+ if (CompareStackPointer)
+ ScratchReg = X86::ESP;
+ else
+ BuildMI(checkMBB, DL, TII.get(X86::LEA32r), ScratchReg).addReg(X86::ESP)
+ .addImm(1).addReg(0).addImm(-StackSize).addReg(0);
+
+ if (STI.isTargetLinux() || STI.isTargetWin32() || STI.isTargetWin64() ||
+ STI.isTargetDragonFly()) {
+ BuildMI(checkMBB, DL, TII.get(X86::CMP32rm)).addReg(ScratchReg)
+ .addReg(0).addImm(0).addReg(0).addImm(TlsOffset).addReg(TlsReg);
+ } else if (STI.isTargetDarwin()) {
+ // TlsOffset doesn't fit into a mod r/m byte so we need an extra register.
+ unsigned ScratchReg2;
+ bool SaveScratch2;
+ if (CompareStackPointer) {
+ // The primary scratch register is available for holding the TLS offset.
+ ScratchReg2 = GetScratchRegister(Is64Bit, IsLP64, MF, true);
+ SaveScratch2 = false;
+ } else {
+ // Need to use a second register to hold the TLS offset
+ ScratchReg2 = GetScratchRegister(Is64Bit, IsLP64, MF, false);
+
+ // Unfortunately, with fastcc the second scratch register may hold an
+ // argument.
+ SaveScratch2 = MF.getRegInfo().isLiveIn(ScratchReg2);
+ }
+
+ // If Scratch2 is live-in then it needs to be saved.
+ assert((!MF.getRegInfo().isLiveIn(ScratchReg2) || SaveScratch2) &&
+ "Scratch register is live-in and not saved");
+
+ if (SaveScratch2)
+ BuildMI(checkMBB, DL, TII.get(X86::PUSH32r))
+ .addReg(ScratchReg2, RegState::Kill);
+
+ BuildMI(checkMBB, DL, TII.get(X86::MOV32ri), ScratchReg2)
+ .addImm(TlsOffset);
+ BuildMI(checkMBB, DL, TII.get(X86::CMP32rm))
+ .addReg(ScratchReg)
+ .addReg(ScratchReg2).addImm(1).addReg(0)
+ .addImm(0)
+ .addReg(TlsReg);
+
+ if (SaveScratch2)
+ BuildMI(checkMBB, DL, TII.get(X86::POP32r), ScratchReg2);
+ }
+ }
+
+ // This jump is taken if SP >= (Stacklet Limit + Stack Space required).
+ // It jumps to normal execution of the function body.
+ BuildMI(checkMBB, DL, TII.get(X86::JA_1)).addMBB(&PrologueMBB);
+
+ // On 32 bit we first push the argument size and then the frame size. On 64
+ // bit, we pass the stack frame size in r10 and the argument size in r11.
+ if (Is64Bit) {
+ // Functions with nested arguments use R10, so it needs to be saved across
+ // the call to __morestack.
+
+ const unsigned RegAX = IsLP64 ? X86::RAX : X86::EAX;
+ const unsigned Reg10 = IsLP64 ? X86::R10 : X86::R10D;
+ const unsigned Reg11 = IsLP64 ? X86::R11 : X86::R11D;
+ const unsigned MOVrr = IsLP64 ? X86::MOV64rr : X86::MOV32rr;
+ const unsigned MOVri = IsLP64 ? X86::MOV64ri : X86::MOV32ri;
+
+ if (IsNested)
+ BuildMI(allocMBB, DL, TII.get(MOVrr), RegAX).addReg(Reg10);
+
+ BuildMI(allocMBB, DL, TII.get(MOVri), Reg10)
+ .addImm(StackSize);
+ BuildMI(allocMBB, DL, TII.get(MOVri), Reg11)
+ .addImm(X86FI->getArgumentStackSize());
+ } else {
+ BuildMI(allocMBB, DL, TII.get(X86::PUSHi32))
+ .addImm(X86FI->getArgumentStackSize());
+ BuildMI(allocMBB, DL, TII.get(X86::PUSHi32))
+ .addImm(StackSize);
+ }
+
+ // __morestack is in libgcc
+ if (Is64Bit && MF.getTarget().getCodeModel() == CodeModel::Large) {
+ // Under the large code model, we cannot assume that __morestack lives
+ // within 2^31 bytes of the call site, so we cannot use pc-relative
+ // addressing. We cannot perform the call via a temporary register,
+ // as the rax register may be used to store the static chain, and all
+ // other suitable registers may be either callee-save or used for
+ // parameter passing. We cannot use the stack at this point either
+ // because __morestack manipulates the stack directly.
+ //
+ // To avoid these issues, perform an indirect call via a read-only memory
+ // location containing the address.
+ //
+ // This solution is not perfect, as it assumes that the .rodata section
+ // is laid out within 2^31 bytes of each function body, but this seems
+ // to be sufficient for JIT.
+ BuildMI(allocMBB, DL, TII.get(X86::CALL64m))
+ .addReg(X86::RIP)
+ .addImm(0)
+ .addReg(0)
+ .addExternalSymbol("__morestack_addr")
+ .addReg(0);
+ MF.getMMI().setUsesMorestackAddr(true);
+ } else {
+ if (Is64Bit)
+ BuildMI(allocMBB, DL, TII.get(X86::CALL64pcrel32))
+ .addExternalSymbol("__morestack");
+ else
+ BuildMI(allocMBB, DL, TII.get(X86::CALLpcrel32))
+ .addExternalSymbol("__morestack");
+ }
+
+ if (IsNested)
+ BuildMI(allocMBB, DL, TII.get(X86::MORESTACK_RET_RESTORE_R10));
+ else
+ BuildMI(allocMBB, DL, TII.get(X86::MORESTACK_RET));
+
+ allocMBB->addSuccessor(&PrologueMBB);
+
+ checkMBB->addSuccessor(allocMBB);
+ checkMBB->addSuccessor(&PrologueMBB);
+
+#ifdef XDEBUG
+ MF.verify();
+#endif
+}
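+
+// On x86-64 Linux with LP64, the blocks built above reduce to roughly the
+// following (a sketch; the scratch registers follow GetScratchRegister, and
+// StackSize/ArgSize stand for the concrete immediates):
+//   checkMBB:
+//     lea  -StackSize(%rsp), %r11
+//     cmp  %fs:0x70, %r11
+//     ja   <function body>        # enough room on the current stacklet
+//   allocMBB:
+//     mov  $StackSize, %r10       # frame size for __morestack
+//     mov  $ArgSize, %r11         # incoming argument area size
+//     call __morestack
+//     ret                         # then return to the original caller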
+
+/// Erlang programs may need a special prologue to handle the stack size they
+/// might need at runtime. That is because Erlang/OTP does not implement a C
+/// stack but uses a custom hybrid stack/heap architecture instead.
+/// (for more information see Eric Stenman's Ph.D. thesis:
+/// http://publications.uu.se/uu/fulltext/nbn_se_uu_diva-2688.pdf)
+///
+/// CheckStack:
+/// temp0 = sp - MaxStack
+/// if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart
+/// OldStart:
+/// ...
+/// IncStack:
+/// call inc_stack # doubles the stack space
+/// temp0 = sp - MaxStack
+/// if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart
+void X86FrameLowering::adjustForHiPEPrologue(
+ MachineFunction &MF, MachineBasicBlock &PrologueMBB) const {
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ DebugLoc DL;
+ // HiPE-specific values
+ const unsigned HipeLeafWords = 24;
+ const unsigned CCRegisteredArgs = Is64Bit ? 6 : 5;
+ const unsigned Guaranteed = HipeLeafWords * SlotSize;
+ unsigned CallerStkArity = MF.getFunction()->arg_size() > CCRegisteredArgs ?
+ MF.getFunction()->arg_size() - CCRegisteredArgs : 0;
+ unsigned MaxStack = MFI->getStackSize() + CallerStkArity*SlotSize + SlotSize;
+
+ assert(STI.isTargetLinux() &&
+ "HiPE prologue is only supported on Linux operating systems.");
+
+ // Compute the largest caller's frame that is needed to fit the callees'
+ // frames. This 'MaxStack' is computed from:
+ //
+ // a) the fixed frame size, which is the space needed for all spilled temps,
+ // b) outgoing on-stack parameter areas, and
+ // c) the minimum stack space this function needs to make available for the
+ // functions it calls (a tunable ABI property).
+ if (MFI->hasCalls()) {
+ unsigned MoreStackForCalls = 0;
+
+ for (MachineFunction::iterator MBBI = MF.begin(), MBBE = MF.end();
+ MBBI != MBBE; ++MBBI)
+ for (MachineBasicBlock::iterator MI = MBBI->begin(), ME = MBBI->end();
+ MI != ME; ++MI) {
+ if (!MI->isCall())
+ continue;
+
+ // Get callee operand.
+ const MachineOperand &MO = MI->getOperand(0);
+
+ // Only take account of global function calls (no closures etc.).
+ if (!MO.isGlobal())
+ continue;
+
+ const Function *F = dyn_cast<Function>(MO.getGlobal());
+ if (!F)
+ continue;
+
+ // Do not update 'MaxStack' for primitive and built-in functions
+ // (encoded with names either starting with "erlang."/"bif_" or not
+ // having a ".", such as a simple <Module>.<Function>.<Arity>, or an
+ // "_", such as the BIF "suspend_0") as they are executed on another
+ // stack.
+ if (F->getName().find("erlang.") != StringRef::npos ||
+ F->getName().find("bif_") != StringRef::npos ||
+ F->getName().find_first_of("._") == StringRef::npos)
+ continue;
+
+ unsigned CalleeStkArity =
+ F->arg_size() > CCRegisteredArgs ? F->arg_size()-CCRegisteredArgs : 0;
+ if (HipeLeafWords - 1 > CalleeStkArity)
+ MoreStackForCalls = std::max(MoreStackForCalls,
+ (HipeLeafWords - 1 - CalleeStkArity) * SlotSize);
+ }
+ MaxStack += MoreStackForCalls;
+ }
+
+ // If the stack frame needed is larger than the guaranteed amount, then
+ // runtime checks and calls to the "inc_stack_0" BIF should be inserted in
+ // the assembly prologue.
+ if (MaxStack > Guaranteed) {
+ MachineBasicBlock *stackCheckMBB = MF.CreateMachineBasicBlock();
+ MachineBasicBlock *incStackMBB = MF.CreateMachineBasicBlock();
+
+ for (const auto &LI : PrologueMBB.liveins()) {
+ stackCheckMBB->addLiveIn(LI);
+ incStackMBB->addLiveIn(LI);
+ }
+
+ MF.push_front(incStackMBB);
+ MF.push_front(stackCheckMBB);
+
+ unsigned ScratchReg, SPReg, PReg, SPLimitOffset;
+ unsigned LEAop, CMPop, CALLop;
+ if (Is64Bit) {
+ SPReg = X86::RSP;
+ PReg = X86::RBP;
+ LEAop = X86::LEA64r;
+ CMPop = X86::CMP64rm;
+ CALLop = X86::CALL64pcrel32;
+ SPLimitOffset = 0x90;
+ } else {
+ SPReg = X86::ESP;
+ PReg = X86::EBP;
+ LEAop = X86::LEA32r;
+ CMPop = X86::CMP32rm;
+ CALLop = X86::CALLpcrel32;
+ SPLimitOffset = 0x4c;
+ }
+
+ ScratchReg = GetScratchRegister(Is64Bit, IsLP64, MF, true);
+ assert(!MF.getRegInfo().isLiveIn(ScratchReg) &&
+ "HiPE prologue scratch register is live-in");
+
+ // Create new MBB for StackCheck:
+ addRegOffset(BuildMI(stackCheckMBB, DL, TII.get(LEAop), ScratchReg),
+ SPReg, false, -MaxStack);
+ // SPLimitOffset is in a fixed heap location (pointed by BP).
+ addRegOffset(BuildMI(stackCheckMBB, DL, TII.get(CMPop))
+ .addReg(ScratchReg), PReg, false, SPLimitOffset);
+ BuildMI(stackCheckMBB, DL, TII.get(X86::JAE_1)).addMBB(&PrologueMBB);
+
+ // Create new MBB for IncStack:
+ BuildMI(incStackMBB, DL, TII.get(CALLop)).
+ addExternalSymbol("inc_stack_0");
+ addRegOffset(BuildMI(incStackMBB, DL, TII.get(LEAop), ScratchReg),
+ SPReg, false, -MaxStack);
+ addRegOffset(BuildMI(incStackMBB, DL, TII.get(CMPop))
+ .addReg(ScratchReg), PReg, false, SPLimitOffset);
+ BuildMI(incStackMBB, DL, TII.get(X86::JLE_1)).addMBB(incStackMBB);
+
+ stackCheckMBB->addSuccessor(&PrologueMBB, {99, 100});
+ stackCheckMBB->addSuccessor(incStackMBB, {1, 100});
+ incStackMBB->addSuccessor(&PrologueMBB, {99, 100});
+ incStackMBB->addSuccessor(incStackMBB, {1, 100});
+ }
+#ifdef XDEBUG
+ MF.verify();
+#endif
+}
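+
+// Worked example (hypothetical numbers): on x86-64, Guaranteed == 24 * 8 ==
+// 192 bytes. A function with a 64-byte frame and 8 incoming arguments
+// (CallerStkArity == 2) that calls a 7-argument Erlang function
+// (CalleeStkArity == 1) ends up with
+//   MaxStack == 64 + 2*8 + 8 + (24 - 1 - 1)*8 == 264 > 192,
+// so the stackCheckMBB/incStackMBB pair above is emitted.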
+
+bool X86FrameLowering::adjustStackWithPops(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, DebugLoc DL, int Offset) const {
+
+ if (Offset <= 0)
+ return false;
+
+ if (Offset % SlotSize)
+ return false;
+
+ int NumPops = Offset / SlotSize;
+ // This is only worth it if we have at most 2 pops.
+ if (NumPops != 1 && NumPops != 2)
+ return false;
+
+ // Handle only the trivial case where the adjustment directly follows
+ // a call. This is the most common one, anyway.
+ if (MBBI == MBB.begin())
+ return false;
+ MachineBasicBlock::iterator Prev = std::prev(MBBI);
+ if (!Prev->isCall() || !Prev->getOperand(1).isRegMask())
+ return false;
+
+ unsigned Regs[2];
+ unsigned FoundRegs = 0;
+
+ auto RegMask = Prev->getOperand(1);
+
+ auto &RegClass =
+ Is64Bit ? X86::GR64_NOREX_NOSPRegClass : X86::GR32_NOREX_NOSPRegClass;
+ // Try to find up to NumPops free registers.
+ for (auto Candidate : RegClass) {
+
+ // Poor man's liveness:
+ // Since we're immediately after a call, any register that is clobbered
+ // by the call and not defined by it can be considered dead.
+ if (!RegMask.clobbersPhysReg(Candidate))
+ continue;
+
+ bool IsDef = false;
+ for (const MachineOperand &MO : Prev->implicit_operands()) {
+ if (MO.isReg() && MO.isDef() && MO.getReg() == Candidate) {
+ IsDef = true;
+ break;
+ }
+ }
+
+ if (IsDef)
+ continue;
+
+ Regs[FoundRegs++] = Candidate;
+ if (FoundRegs == (unsigned)NumPops)
+ break;
+ }
+
+ if (FoundRegs == 0)
+ return false;
+
+ // If we found only one free register, but need two, reuse the same one twice.
+ while (FoundRegs < (unsigned)NumPops)
+ Regs[FoundRegs++] = Regs[0];
+
+ for (int i = 0; i < NumPops; ++i)
+ BuildMI(MBB, MBBI, DL,
+ TII.get(STI.is64Bit() ? X86::POP64r : X86::POP32r), Regs[i]);
+
+ return true;
+}
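+
+// e.g. (hypothetical, 32-bit, SlotSize == 4): an 8-byte adjustment directly
+// after a call becomes two one-byte pops into call-clobbered registers, say
+//   pop %ecx
+//   pop %edx
+// instead of the three-byte 'add $8, %esp'.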
+
+void X86FrameLowering::
+eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const {
+ bool reserveCallFrame = hasReservedCallFrame(MF);
+ unsigned Opcode = I->getOpcode();
+ bool isDestroy = Opcode == TII.getCallFrameDestroyOpcode();
+ DebugLoc DL = I->getDebugLoc();
+ uint64_t Amount = !reserveCallFrame ? I->getOperand(0).getImm() : 0;
+ uint64_t InternalAmt = (isDestroy || Amount) ? I->getOperand(1).getImm() : 0;
+ I = MBB.erase(I);
+
+ if (!reserveCallFrame) {
+ // If the stack pointer can be changed after the prologue, turn the
+ // adjcallstackdown instruction into a 'sub ESP, <amt>' and the
+ // adjcallstackup instruction into an 'add ESP, <amt>'.
+
+ // We need to keep the stack aligned properly. To do this, we round the
+ // amount of space needed for the outgoing arguments up to the next
+ // alignment boundary.
+ unsigned StackAlign = getStackAlignment();
+ Amount = RoundUpToAlignment(Amount, StackAlign);
+
+ MachineModuleInfo &MMI = MF.getMMI();
+ const Function *Fn = MF.getFunction();
+ bool WindowsCFI = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
+ bool DwarfCFI = !WindowsCFI &&
+ (MMI.hasDebugInfo() || Fn->needsUnwindTableEntry());
+
+ // If we have any exception handlers in this function, and we adjust
+ // the SP before calls, we may need to indicate this to the unwinder
+ // using GNU_ARGS_SIZE. Note that this may be necessary even when
+ // Amount == 0, because the preceding function may have set a non-0
+ // GNU_ARGS_SIZE.
+ // TODO: We don't need to reset this between subsequent functions,
+ // if it didn't change.
+ bool HasDwarfEHHandlers = !WindowsCFI &&
+ !MF.getMMI().getLandingPads().empty();
+
+ if (HasDwarfEHHandlers && !isDestroy &&
+ MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences())
+ BuildCFI(MBB, I, DL,
+ MCCFIInstruction::createGnuArgsSize(nullptr, Amount));
+
+ if (Amount == 0)
+ return;
+
+ // Factor out the amount that gets handled inside the sequence
+ // (Pushes of argument for frame setup, callee pops for frame destroy)
+ Amount -= InternalAmt;
+
+ // TODO: This is needed only if we require precise CFA.
+ // If this is a callee-pop calling convention, emit a CFA adjust for
+ // the amount the callee popped.
+ if (isDestroy && InternalAmt && DwarfCFI && !hasFP(MF))
+ BuildCFI(MBB, I, DL,
+ MCCFIInstruction::createAdjustCfaOffset(nullptr, -InternalAmt));
+
+ if (Amount) {
+ // Add Amount to SP to destroy a frame, and subtract to setup.
+ int Offset = isDestroy ? Amount : -Amount;
+
+ if (!(Fn->optForMinSize() &&
+ adjustStackWithPops(MBB, I, DL, Offset)))
+ BuildStackAdjustment(MBB, I, DL, Offset, /*InEpilogue=*/false);
+ }
+
+ if (DwarfCFI && !hasFP(MF)) {
+ // If we don't have FP, but need to generate unwind information,
+ // we need to set the correct CFA offset after the stack adjustment.
+ // How much we adjust the CFA offset depends on whether we're emitting
+ // CFI only for EH purposes or for debugging. EH only requires the CFA
+ // offset to be correct at each call site, while for debugging we want
+ // it to be more precise.
+ int CFAOffset = Amount;
+ // TODO: When not using precise CFA, we also need to adjust for the
+ // InternalAmt here.
+
+ if (CFAOffset) {
+ CFAOffset = isDestroy ? -CFAOffset : CFAOffset;
+ BuildCFI(MBB, I, DL,
+ MCCFIInstruction::createAdjustCfaOffset(nullptr, CFAOffset));
+ }
+ }
+
+ return;
+ }
+
+ if (isDestroy && InternalAmt) {
+ // If we are performing frame pointer elimination and if the callee pops
+ // something off the stack pointer, add it back. We do this until we have
+ // more advanced stack pointer tracking ability.
+ // We are not tracking the stack pointer adjustment by the callee, so make
+ // sure we restore the stack pointer immediately after the call, there may
+ // be spill code inserted between the CALL and ADJCALLSTACKUP instructions.
+ MachineBasicBlock::iterator B = MBB.begin();
+ while (I != B && !std::prev(I)->isCall())
+ --I;
+ BuildStackAdjustment(MBB, I, DL, -InternalAmt, /*InEpilogue=*/false);
+ }
+}
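+
+// Sketch of the common non-reserved path above (hypothetical numbers): a
+// 20-byte outgoing argument area rounds up to 32 under the usual 16-byte
+// stack alignment, so ADJCALLSTACKDOWN becomes 'sub $32, %esp' before the
+// call and ADJCALLSTACKUP becomes 'add $32, %esp' after it, less whatever
+// InternalAmt the push/pop sequence itself already covered.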
+
+bool X86FrameLowering::canUseAsEpilogue(const MachineBasicBlock &MBB) const {
+ assert(MBB.getParent() && "Block is not attached to a function!");
+
+ // Win64 has strict requirements for the epilogue, and we are not taking a
+ // chance at messing with them.
+ // I.e., unless this block is already an exit block, we can't use
+ // it as an epilogue.
+ if (STI.isTargetWin64() && !MBB.succ_empty() && !MBB.isReturnBlock())
+ return false;
+
+ if (canUseLEAForSPInEpilogue(*MBB.getParent()))
+ return true;
+
+ // If we cannot use LEA to adjust SP, we may need to use ADD, which
+ // clobbers the EFLAGS. Check that we do not need to preserve it,
+ // otherwise, conservatively assume this is not
+ // safe to insert the epilogue here.
+ return !flagsNeedToBePreservedBeforeTheTerminators(MBB);
+}
+
+bool X86FrameLowering::enableShrinkWrapping(const MachineFunction &MF) const {
+ // If we may need to emit frameless compact unwind information, give
+ // up as this is currently broken: PR25614.
+ return MF.getFunction()->hasFnAttribute(Attribute::NoUnwind) || hasFP(MF);
+}
+
+MachineBasicBlock::iterator X86FrameLowering::restoreWin32EHStackPointers(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ DebugLoc DL, bool RestoreSP) const {
+ assert(STI.isTargetWindowsMSVC() && "funclets only supported in MSVC env");
+ assert(STI.isTargetWin32() && "EBP/ESI restoration only required on win32");
+ assert(STI.is32Bit() && !Uses64BitFramePtr &&
+ "restoring EBP/ESI on non-32-bit target");
+
+ MachineFunction &MF = *MBB.getParent();
+ unsigned FramePtr = TRI->getFrameRegister(MF);
+ unsigned BasePtr = TRI->getBaseRegister();
+ WinEHFuncInfo &FuncInfo = *MF.getWinEHFuncInfo();
+ X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+
+ // FIXME: Don't set FrameSetup flag in catchret case.
+
+ int FI = FuncInfo.EHRegNodeFrameIndex;
+ int EHRegSize = MFI->getObjectSize(FI);
+
+ if (RestoreSP) {
+ // MOV32rm -EHRegSize(%ebp), %esp
+ addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32rm), X86::ESP),
+ X86::EBP, true, -EHRegSize)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+
+ unsigned UsedReg;
+ int EHRegOffset = getFrameIndexReference(MF, FI, UsedReg);
+ int EndOffset = -EHRegOffset - EHRegSize;
+ FuncInfo.EHRegNodeEndOffset = EndOffset;
+
+ if (UsedReg == FramePtr) {
+ // ADD $offset, %ebp
+ unsigned ADDri = getADDriOpcode(false, EndOffset);
+ BuildMI(MBB, MBBI, DL, TII.get(ADDri), FramePtr)
+ .addReg(FramePtr)
+ .addImm(EndOffset)
+ .setMIFlag(MachineInstr::FrameSetup)
+ ->getOperand(3)
+ .setIsDead();
+ assert(EndOffset >= 0 &&
+ "end of registration object above normal EBP position!");
+ } else if (UsedReg == BasePtr) {
+ // LEA offset(%ebp), %esi
+ addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::LEA32r), BasePtr),
+ FramePtr, false, EndOffset)
+ .setMIFlag(MachineInstr::FrameSetup);
+ // MOV32rm SavedEBPOffset(%esi), %ebp
+ assert(X86FI->getHasSEHFramePtrSave());
+ int Offset =
+ getFrameIndexReference(MF, X86FI->getSEHFramePtrSaveIndex(), UsedReg);
+ assert(UsedReg == BasePtr);
+ addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32rm), FramePtr),
+ UsedReg, true, Offset)
+ .setMIFlag(MachineInstr::FrameSetup);
+ } else {
+ llvm_unreachable("32-bit frames with WinEH must use FramePtr or BasePtr");
+ }
+ return MBBI;
+}
+
+unsigned X86FrameLowering::getWinEHParentFrameOffset(const MachineFunction &MF) const {
+ // RDX, the parent frame pointer, is homed into 16(%rsp) in the prologue.
+ unsigned Offset = 16;
+ // RBP is immediately pushed.
+ Offset += SlotSize;
+ // All callee-saved registers are then pushed.
+ Offset += MF.getInfo<X86MachineFunctionInfo>()->getCalleeSavedFrameSize();
+ // Every funclet allocates enough stack space for the largest outgoing call.
+ Offset += getWinEHFuncletFrameSize(MF);
+ return Offset;
+}
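+
+// e.g. (hypothetical frame): with two pushed CSRs (16 bytes) and a 40-byte
+// funclet allocation, the homed parent frame pointer sits
+// 16 + 8 + 16 + 40 == 80 bytes above the funclet's established stack pointer.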
+
+void X86FrameLowering::processFunctionBeforeFrameFinalized(
+ MachineFunction &MF, RegScavenger *RS) const {
+ // If this function isn't doing Win64-style C++ EH, we don't need to do
+ // anything.
+ const Function *Fn = MF.getFunction();
+ if (!STI.is64Bit() || !MF.getMMI().hasEHFunclets() ||
+ classifyEHPersonality(Fn->getPersonalityFn()) != EHPersonality::MSVC_CXX)
+ return;
+
+ // Win64 C++ EH needs to allocate the UnwindHelp object at some fixed offset
+ // relative to RSP after the prologue. Find the offset of the last fixed
+ // object, so that we can allocate a slot immediately following it. If there
+ // were no fixed objects, use offset -SlotSize, which is immediately after the
+ // return address. Fixed objects have negative frame indices.
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ int64_t MinFixedObjOffset = -SlotSize;
+ for (int I = MFI->getObjectIndexBegin(); I < 0; ++I)
+ MinFixedObjOffset = std::min(MinFixedObjOffset, MFI->getObjectOffset(I));
+
+ int64_t UnwindHelpOffset = MinFixedObjOffset - SlotSize;
+ int UnwindHelpFI =
+ MFI->CreateFixedObject(SlotSize, UnwindHelpOffset, /*Immutable=*/false);
+ MF.getWinEHFuncInfo()->UnwindHelpFrameIdx = UnwindHelpFI;
+
+ // Store -2 into UnwindHelp on function entry. We have to scan forwards past
+ // other frame setup instructions.
+ MachineBasicBlock &MBB = MF.front();
+ auto MBBI = MBB.begin();
+ while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup))
+ ++MBBI;
+
+ DebugLoc DL = MBB.findDebugLoc(MBBI);
+ addFrameReference(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64mi32)),
+ UnwindHelpFI)
+ .addImm(-2);
+}
diff --git a/contrib/llvm/lib/Target/X86/X86FrameLowering.h b/contrib/llvm/lib/Target/X86/X86FrameLowering.h
new file mode 100644
index 0000000..3ab41b4
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86FrameLowering.h
@@ -0,0 +1,203 @@
+//===-- X86TargetFrameLowering.h - Define frame lowering for X86 -*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class implements X86-specific bits of TargetFrameLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86FRAMELOWERING_H
+#define LLVM_LIB_TARGET_X86_X86FRAMELOWERING_H
+
+#include "llvm/Target/TargetFrameLowering.h"
+
+namespace llvm {
+
+class MachineInstrBuilder;
+class MCCFIInstruction;
+class X86Subtarget;
+class X86RegisterInfo;
+
+class X86FrameLowering : public TargetFrameLowering {
+public:
+ X86FrameLowering(const X86Subtarget &STI, unsigned StackAlignOverride);
+
+ // Cached subtarget predicates.
+
+ const X86Subtarget &STI;
+ const TargetInstrInfo &TII;
+ const X86RegisterInfo *TRI;
+
+ unsigned SlotSize;
+
+ /// Is64Bit implies that x86_64 instructions are available.
+ bool Is64Bit;
+
+ bool IsLP64;
+
+ /// True if the 64-bit frame or stack pointer should be used. True for most
+ /// 64-bit targets with the exception of x32. If this is false, 32-bit
+ /// instruction operands should be used to manipulate StackPtr and FramePtr.
+ bool Uses64BitFramePtr;
+
+ unsigned StackPtr;
+
+ /// Emit target stack probe code. This is required for all
+ /// large stack allocations on Windows. The caller is required to materialize
+ /// the number of bytes to probe in RAX/EAX. Returns instruction just
+ /// after the expansion.
+ MachineInstr *emitStackProbe(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, DebugLoc DL,
+ bool InProlog) const;
+
+ /// Replace a StackProbe inline-stub with the actual probe code inline.
+ void inlineStackProbe(MachineFunction &MF,
+ MachineBasicBlock &PrologMBB) const override;
+
+ void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ DebugLoc DL) const;
+
+ /// emitProlog/emitEpilog - These methods insert prolog and epilog code into
+ /// the function.
+ void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+ void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+
+ void adjustForSegmentedStacks(MachineFunction &MF,
+ MachineBasicBlock &PrologueMBB) const override;
+
+ void adjustForHiPEPrologue(MachineFunction &MF,
+ MachineBasicBlock &PrologueMBB) const override;
+
+ void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
+ RegScavenger *RS = nullptr) const override;
+
+ bool
+ assignCalleeSavedSpillSlots(MachineFunction &MF,
+ const TargetRegisterInfo *TRI,
+ std::vector<CalleeSavedInfo> &CSI) const override;
+
+ bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const override;
+
+ bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const override;
+
+ bool hasFP(const MachineFunction &MF) const override;
+ bool hasReservedCallFrame(const MachineFunction &MF) const override;
+ bool canSimplifyCallFramePseudos(const MachineFunction &MF) const override;
+ bool needsFrameIndexResolution(const MachineFunction &MF) const override;
+
+ int getFrameIndexReference(const MachineFunction &MF, int FI,
+ unsigned &FrameReg) const override;
+
+ int getFrameIndexReferenceFromSP(const MachineFunction &MF, int FI,
+ unsigned &FrameReg) const override;
+
+ void eliminateCallFramePseudoInstr(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI) const override;
+
+ unsigned getWinEHParentFrameOffset(const MachineFunction &MF) const override;
+
+ void processFunctionBeforeFrameFinalized(MachineFunction &MF,
+ RegScavenger *RS) const override;
+
+ /// Check the instruction before/after the passed instruction. If
+ /// it is an ADD/SUB/LEA instruction, it is deleted and the stack adjustment
+ /// is returned as a positive value for ADD/LEA and a negative value for SUB.
+ int mergeSPUpdates(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
+ bool doMergeWithPrevious) const;
+
+ /// Emit a series of instructions to increment / decrement the stack
+ /// pointer by a constant value.
+ void emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
+ int64_t NumBytes, bool InEpilogue) const;
+
+ /// Check that LEA can be used on SP in an epilogue sequence for \p MF.
+ bool canUseLEAForSPInEpilogue(const MachineFunction &MF) const;
+
+ /// Check whether or not the given \p MBB can be used as an epilogue
+ /// for the target.
+ /// The epilogue will be inserted before the first terminator of that block.
+ /// This method is used by the shrink-wrapping pass to decide if
+ /// \p MBB will be correctly handled by the target.
+ bool canUseAsEpilogue(const MachineBasicBlock &MBB) const override;
+
+ /// Returns true if the target will correctly handle shrink wrapping.
+ bool enableShrinkWrapping(const MachineFunction &MF) const override;
+
+ /// convertArgMovsToPushes - This method tries to convert a call sequence
+ /// that uses sub and mov instructions to put the argument onto the stack
+ /// into a series of pushes.
+ /// Returns true if the transformation succeeded, false if not.
+ bool convertArgMovsToPushes(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ uint64_t Amount) const;
+
+ /// Wraps up getting a CFI index and building a MachineInstr for it.
+ void BuildCFI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ DebugLoc DL, MCCFIInstruction CFIInst) const;
+
+ /// Sets up EBP and optionally ESI based on the incoming EBP value. Only
+ /// needed for 32-bit. Used in funclet prologues and at catchret destinations.
+ MachineBasicBlock::iterator
+ restoreWin32EHStackPointers(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, DebugLoc DL,
+ bool RestoreSP = false) const;
+
+private:
+ uint64_t calculateMaxStackAlign(const MachineFunction &MF) const;
+
+ /// Emit target stack probe as a call to a helper function
+ MachineInstr *emitStackProbeCall(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ DebugLoc DL, bool InProlog) const;
+
+ /// Emit target stack probe as an inline sequence.
+ MachineInstr *emitStackProbeInline(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ DebugLoc DL, bool InProlog) const;
+
+ /// Emit a stub to later inline the target stack probe.
+ MachineInstr *emitStackProbeInlineStub(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ DebugLoc DL, bool InProlog) const;
+
+ /// Aligns the stack pointer by ANDing it with -MaxAlign.
+ void BuildStackAlignAND(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, DebugLoc DL,
+ unsigned Reg, uint64_t MaxAlign) const;
+
+ /// Make small positive stack adjustments using POPs.
+ bool adjustStackWithPops(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, DebugLoc DL,
+ int Offset) const;
+
+ /// Adjusts the stack pointer using LEA, SUB, or ADD.
+ MachineInstrBuilder BuildStackAdjustment(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ DebugLoc DL, int64_t Offset,
+ bool InEpilogue) const;
+
+ unsigned getPSPSlotOffsetFromSP(const MachineFunction &MF) const;
+
+ unsigned getWinEHFuncletFrameSize(const MachineFunction &MF) const;
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
new file mode 100644
index 0000000..868ae4e
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -0,0 +1,3012 @@
+//===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a DAG pattern matching instruction selector for X86,
+// converting from a legalized dag to a X86 dag.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrBuilder.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86RegisterInfo.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include <stdint.h>
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-isel"
+
+STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor");
+
+//===----------------------------------------------------------------------===//
+// Pattern Matcher Implementation
+//===----------------------------------------------------------------------===//
+
+namespace {
+  /// This corresponds to X86AddressMode, but uses SDValues instead of register
+ /// numbers for the leaves of the matched tree.
+ struct X86ISelAddressMode {
+ enum {
+ RegBase,
+ FrameIndexBase
+ } BaseType;
+
+ // This is really a union, discriminated by BaseType!
+ SDValue Base_Reg;
+ int Base_FrameIndex;
+
+ unsigned Scale;
+ SDValue IndexReg;
+ int32_t Disp;
+ SDValue Segment;
+ const GlobalValue *GV;
+ const Constant *CP;
+ const BlockAddress *BlockAddr;
+ const char *ES;
+ MCSymbol *MCSym;
+ int JT;
+ unsigned Align; // CP alignment.
+ unsigned char SymbolFlags; // X86II::MO_*
+
+ X86ISelAddressMode()
+ : BaseType(RegBase), Base_FrameIndex(0), Scale(1), IndexReg(), Disp(0),
+ Segment(), GV(nullptr), CP(nullptr), BlockAddr(nullptr), ES(nullptr),
+ MCSym(nullptr), JT(-1), Align(0), SymbolFlags(X86II::MO_NO_FLAG) {}
+
+ bool hasSymbolicDisplacement() const {
+ return GV != nullptr || CP != nullptr || ES != nullptr ||
+ MCSym != nullptr || JT != -1 || BlockAddr != nullptr;
+ }
+
+ bool hasBaseOrIndexReg() const {
+ return BaseType == FrameIndexBase ||
+ IndexReg.getNode() != nullptr || Base_Reg.getNode() != nullptr;
+ }
+
+ /// Return true if this addressing mode is already RIP-relative.
+ bool isRIPRelative() const {
+ if (BaseType != RegBase) return false;
+ if (RegisterSDNode *RegNode =
+ dyn_cast_or_null<RegisterSDNode>(Base_Reg.getNode()))
+ return RegNode->getReg() == X86::RIP;
+ return false;
+ }
+
+ void setBaseReg(SDValue Reg) {
+ BaseType = RegBase;
+ Base_Reg = Reg;
+ }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ void dump() {
+ dbgs() << "X86ISelAddressMode " << this << '\n';
+ dbgs() << "Base_Reg ";
+ if (Base_Reg.getNode())
+ Base_Reg.getNode()->dump();
+ else
+ dbgs() << "nul";
+ dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n'
+ << " Scale" << Scale << '\n'
+ << "IndexReg ";
+ if (IndexReg.getNode())
+ IndexReg.getNode()->dump();
+ else
+ dbgs() << "nul";
+ dbgs() << " Disp " << Disp << '\n'
+ << "GV ";
+ if (GV)
+ GV->dump();
+ else
+ dbgs() << "nul";
+ dbgs() << " CP ";
+ if (CP)
+ CP->dump();
+ else
+ dbgs() << "nul";
+ dbgs() << '\n'
+ << "ES ";
+ if (ES)
+ dbgs() << ES;
+ else
+ dbgs() << "nul";
+ dbgs() << " MCSym ";
+ if (MCSym)
+ dbgs() << MCSym;
+ else
+ dbgs() << "nul";
+ dbgs() << " JT" << JT << " Align" << Align << '\n';
+ }
+#endif
+ };
+}
+
+namespace {
+ //===--------------------------------------------------------------------===//
+ /// ISel - X86-specific code to select X86 machine instructions for
+ /// SelectionDAG operations.
+ ///
+ class X86DAGToDAGISel final : public SelectionDAGISel {
+ /// Keep a pointer to the X86Subtarget around so that we can
+ /// make the right decision when generating code for different targets.
+ const X86Subtarget *Subtarget;
+
+ /// If true, selector should try to optimize for code size instead of
+ /// performance.
+ bool OptForSize;
+
+ public:
+ explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOpt::Level OptLevel)
+ : SelectionDAGISel(tm, OptLevel), OptForSize(false) {}
+
+ const char *getPassName() const override {
+ return "X86 DAG->DAG Instruction Selection";
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ // Reset the subtarget each time through.
+ Subtarget = &MF.getSubtarget<X86Subtarget>();
+ SelectionDAGISel::runOnMachineFunction(MF);
+ return true;
+ }
+
+ void EmitFunctionEntryCode() override;
+
+ bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override;
+
+ void PreprocessISelDAG() override;
+
+ inline bool immSext8(SDNode *N) const {
+ return isInt<8>(cast<ConstantSDNode>(N)->getSExtValue());
+ }
+
+ // True if the 64-bit immediate fits in a 32-bit sign-extended field.
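+    // For example, 0x000000007fffffff and 0xffffffff80000000 fit, while
+    // 0x0000000080000000 does not (sign-extending its low 32 bits gives a
+    // different value).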
+ inline bool i64immSExt32(SDNode *N) const {
+ uint64_t v = cast<ConstantSDNode>(N)->getZExtValue();
+ return (int64_t)v == (int32_t)v;
+ }
+
+// Include the pieces autogenerated from the target description.
+#include "X86GenDAGISel.inc"
+
+ private:
+ SDNode *Select(SDNode *N) override;
+ SDNode *selectGather(SDNode *N, unsigned Opc);
+ SDNode *selectAtomicLoadArith(SDNode *Node, MVT NVT);
+
+ bool foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM);
+ bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM);
+ bool matchWrapper(SDValue N, X86ISelAddressMode &AM);
+ bool matchAddress(SDValue N, X86ISelAddressMode &AM);
+ bool matchAdd(SDValue N, X86ISelAddressMode &AM, unsigned Depth);
+ bool matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
+ unsigned Depth);
+ bool matchAddressBase(SDValue N, X86ISelAddressMode &AM);
+ bool selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
+ SDValue &Scale, SDValue &Index, SDValue &Disp,
+ SDValue &Segment);
+ bool selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base,
+ SDValue &Scale, SDValue &Index, SDValue &Disp,
+ SDValue &Segment);
+ bool selectMOV64Imm32(SDValue N, SDValue &Imm);
+ bool selectLEAAddr(SDValue N, SDValue &Base,
+ SDValue &Scale, SDValue &Index, SDValue &Disp,
+ SDValue &Segment);
+ bool selectLEA64_32Addr(SDValue N, SDValue &Base,
+ SDValue &Scale, SDValue &Index, SDValue &Disp,
+ SDValue &Segment);
+ bool selectTLSADDRAddr(SDValue N, SDValue &Base,
+ SDValue &Scale, SDValue &Index, SDValue &Disp,
+ SDValue &Segment);
+ bool selectScalarSSELoad(SDNode *Root, SDValue N,
+ SDValue &Base, SDValue &Scale,
+ SDValue &Index, SDValue &Disp,
+ SDValue &Segment,
+ SDValue &NodeWithChain);
+
+ bool tryFoldLoad(SDNode *P, SDValue N,
+ SDValue &Base, SDValue &Scale,
+ SDValue &Index, SDValue &Disp,
+ SDValue &Segment);
+
+ /// Implement addressing mode selection for inline asm expressions.
+ bool SelectInlineAsmMemoryOperand(const SDValue &Op,
+ unsigned ConstraintID,
+ std::vector<SDValue> &OutOps) override;
+
+ void emitSpecialCodeForMain();
+
+ inline void getAddressOperands(X86ISelAddressMode &AM, SDLoc DL,
+ SDValue &Base, SDValue &Scale,
+ SDValue &Index, SDValue &Disp,
+ SDValue &Segment) {
+ Base = (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
+ ? CurDAG->getTargetFrameIndex(
+ AM.Base_FrameIndex,
+ TLI->getPointerTy(CurDAG->getDataLayout()))
+ : AM.Base_Reg;
+ Scale = getI8Imm(AM.Scale, DL);
+ Index = AM.IndexReg;
+ // These are 32-bit even in 64-bit mode since RIP-relative offset
+ // is 32-bit.
+ if (AM.GV)
+ Disp = CurDAG->getTargetGlobalAddress(AM.GV, SDLoc(),
+ MVT::i32, AM.Disp,
+ AM.SymbolFlags);
+ else if (AM.CP)
+ Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32,
+ AM.Align, AM.Disp, AM.SymbolFlags);
+ else if (AM.ES) {
+ assert(!AM.Disp && "Non-zero displacement is ignored with ES.");
+ Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i32, AM.SymbolFlags);
+ } else if (AM.MCSym) {
+ assert(!AM.Disp && "Non-zero displacement is ignored with MCSym.");
+      assert(AM.SymbolFlags == 0 && "Unexpected symbol flags with MCSym.");
+ Disp = CurDAG->getMCSymbol(AM.MCSym, MVT::i32);
+ } else if (AM.JT != -1) {
+ assert(!AM.Disp && "Non-zero displacement is ignored with JT.");
+ Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i32, AM.SymbolFlags);
+ } else if (AM.BlockAddr)
+ Disp = CurDAG->getTargetBlockAddress(AM.BlockAddr, MVT::i32, AM.Disp,
+ AM.SymbolFlags);
+ else
+ Disp = CurDAG->getTargetConstant(AM.Disp, DL, MVT::i32);
+
+ if (AM.Segment.getNode())
+ Segment = AM.Segment;
+ else
+ Segment = CurDAG->getRegister(0, MVT::i32);
+ }
+
+ // Utility function to determine whether we should avoid selecting
+    // immediate forms of instructions for better code size.
+ // At a high level, we'd like to avoid such instructions when
+ // we have similar constants used within the same basic block
+ // that can be kept in a register.
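+    // For example (illustrative), two "cmpl $12345678, (mem)" in one block
+    // can become one "movl $12345678, %reg" plus two register-based
+    // compares, which is smaller overall.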
+ //
+ bool shouldAvoidImmediateInstFormsForSize(SDNode *N) const {
+ uint32_t UseCount = 0;
+
+ // Do not want to hoist if we're not optimizing for size.
+ // TODO: We'd like to remove this restriction.
+ // See the comment in X86InstrInfo.td for more info.
+ if (!OptForSize)
+ return false;
+
+ // Walk all the users of the immediate.
+ for (SDNode::use_iterator UI = N->use_begin(),
+ UE = N->use_end(); (UI != UE) && (UseCount < 2); ++UI) {
+
+ SDNode *User = *UI;
+
+ // This user is already selected. Count it as a legitimate use and
+ // move on.
+ if (User->isMachineOpcode()) {
+ UseCount++;
+ continue;
+ }
+
+ // We want to count stores of immediates as real uses.
+ if (User->getOpcode() == ISD::STORE &&
+ User->getOperand(1).getNode() == N) {
+ UseCount++;
+ continue;
+ }
+
+ // We don't currently match users that have > 2 operands (except
+      // for stores, which are handled above).
+      // Those instructions won't match in ISel, for now, and would
+ // be counted incorrectly.
+ // This may change in the future as we add additional instruction
+ // types.
+ if (User->getNumOperands() != 2)
+ continue;
+
+ // Immediates that are used for offsets as part of stack
+ // manipulation should be left alone. These are typically
+ // used to indicate SP offsets for argument passing and
+ // will get pulled into stores/pushes (implicitly).
+ if (User->getOpcode() == X86ISD::ADD ||
+ User->getOpcode() == ISD::ADD ||
+ User->getOpcode() == X86ISD::SUB ||
+ User->getOpcode() == ISD::SUB) {
+
+ // Find the other operand of the add/sub.
+ SDValue OtherOp = User->getOperand(0);
+ if (OtherOp.getNode() == N)
+ OtherOp = User->getOperand(1);
+
+ // Don't count if the other operand is SP.
+ RegisterSDNode *RegNode;
+ if (OtherOp->getOpcode() == ISD::CopyFromReg &&
+ (RegNode = dyn_cast_or_null<RegisterSDNode>(
+ OtherOp->getOperand(1).getNode())))
+ if ((RegNode->getReg() == X86::ESP) ||
+ (RegNode->getReg() == X86::RSP))
+ continue;
+ }
+
+ // ... otherwise, count this and move on.
+ UseCount++;
+ }
+
+      // If the immediate has more than one use, recommend hoisting it.
+ return (UseCount > 1);
+ }
+
+ /// Return a target constant with the specified value of type i8.
+ inline SDValue getI8Imm(unsigned Imm, SDLoc DL) {
+ return CurDAG->getTargetConstant(Imm, DL, MVT::i8);
+ }
+
+ /// Return a target constant with the specified value, of type i32.
+ inline SDValue getI32Imm(unsigned Imm, SDLoc DL) {
+ return CurDAG->getTargetConstant(Imm, DL, MVT::i32);
+ }
+
+ /// Return an SDNode that returns the value of the global base register.
+ /// Output instructions required to initialize the global base register,
+ /// if necessary.
+ SDNode *getGlobalBaseReg();
+
+ /// Return a reference to the TargetMachine, casted to the target-specific
+ /// type.
+ const X86TargetMachine &getTargetMachine() const {
+ return static_cast<const X86TargetMachine &>(TM);
+ }
+
+ /// Return a reference to the TargetInstrInfo, casted to the target-specific
+ /// type.
+ const X86InstrInfo *getInstrInfo() const {
+ return Subtarget->getInstrInfo();
+ }
+
+ /// \brief Address-mode matching performs shift-of-and to and-of-shift
+ /// reassociation in order to expose more scaled addressing
+ /// opportunities.
+ bool ComplexPatternFuncMutatesDAG() const override {
+ return true;
+ }
+ };
+}
+
+
+bool
+X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const {
+ if (OptLevel == CodeGenOpt::None) return false;
+
+ if (!N.hasOneUse())
+ return false;
+
+ if (N.getOpcode() != ISD::LOAD)
+ return true;
+
+ // If N is a load, do additional profitability checks.
+ if (U == Root) {
+ switch (U->getOpcode()) {
+ default: break;
+ case X86ISD::ADD:
+ case X86ISD::SUB:
+ case X86ISD::AND:
+ case X86ISD::XOR:
+ case X86ISD::OR:
+ case ISD::ADD:
+ case ISD::ADDC:
+ case ISD::ADDE:
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR: {
+ SDValue Op1 = U->getOperand(1);
+
+    // If the other operand is an 8-bit immediate we should fold the immediate
+ // instead. This reduces code size.
+ // e.g.
+ // movl 4(%esp), %eax
+ // addl $4, %eax
+ // vs.
+ // movl $4, %eax
+ // addl 4(%esp), %eax
+    // The former is 2 bytes shorter. In the case where the increment is 1,
+ // the saving can be 4 bytes (by using incl %eax).
+ if (ConstantSDNode *Imm = dyn_cast<ConstantSDNode>(Op1))
+ if (Imm->getAPIntValue().isSignedIntN(8))
+ return false;
+
+ // If the other operand is a TLS address, we should fold it instead.
+ // This produces
+ // movl %gs:0, %eax
+ // leal i@NTPOFF(%eax), %eax
+ // instead of
+ // movl $i@NTPOFF, %eax
+ // addl %gs:0, %eax
+ // if the block also has an access to a second TLS address this will save
+ // a load.
+ // FIXME: This is probably also true for non-TLS addresses.
+ if (Op1.getOpcode() == X86ISD::Wrapper) {
+ SDValue Val = Op1.getOperand(0);
+ if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
+ return false;
+ }
+ }
+ }
+ }
+
+ return true;
+}
+
+/// Replace the original chain operand of the call with
+/// load's chain operand and move load below the call's chain operand.
+static void moveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load,
+ SDValue Call, SDValue OrigChain) {
+ SmallVector<SDValue, 8> Ops;
+ SDValue Chain = OrigChain.getOperand(0);
+ if (Chain.getNode() == Load.getNode())
+ Ops.push_back(Load.getOperand(0));
+ else {
+ assert(Chain.getOpcode() == ISD::TokenFactor &&
+ "Unexpected chain operand");
+ for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i)
+ if (Chain.getOperand(i).getNode() == Load.getNode())
+ Ops.push_back(Load.getOperand(0));
+ else
+ Ops.push_back(Chain.getOperand(i));
+ SDValue NewChain =
+ CurDAG->getNode(ISD::TokenFactor, SDLoc(Load), MVT::Other, Ops);
+ Ops.clear();
+ Ops.push_back(NewChain);
+ }
+ Ops.append(OrigChain->op_begin() + 1, OrigChain->op_end());
+ CurDAG->UpdateNodeOperands(OrigChain.getNode(), Ops);
+ CurDAG->UpdateNodeOperands(Load.getNode(), Call.getOperand(0),
+ Load.getOperand(1), Load.getOperand(2));
+
+ Ops.clear();
+ Ops.push_back(SDValue(Load.getNode(), 1));
+ Ops.append(Call->op_begin() + 1, Call->op_end());
+ CurDAG->UpdateNodeOperands(Call.getNode(), Ops);
+}
+
+/// Return true if call address is a load and it can be
+/// moved below CALLSEQ_START and the chains leading up to the call.
+/// Return the CALLSEQ_START by reference as a second output.
+/// In the case of a tail call, there isn't a callseq node between the call
+/// chain and the load.
+static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
+ // The transformation is somewhat dangerous if the call's chain was glued to
+  // the call. After moveBelowOrigChain the load is moved between the call and
+  // the chain; this can create a cycle if the load is not folded. So it is
+ // *really* important that we are sure the load will be folded.
+ if (Callee.getNode() == Chain.getNode() || !Callee.hasOneUse())
+ return false;
+ LoadSDNode *LD = dyn_cast<LoadSDNode>(Callee.getNode());
+ if (!LD ||
+ LD->isVolatile() ||
+ LD->getAddressingMode() != ISD::UNINDEXED ||
+ LD->getExtensionType() != ISD::NON_EXTLOAD)
+ return false;
+
+ // Now let's find the callseq_start.
+ while (HasCallSeq && Chain.getOpcode() != ISD::CALLSEQ_START) {
+ if (!Chain.hasOneUse())
+ return false;
+ Chain = Chain.getOperand(0);
+ }
+
+ if (!Chain.getNumOperands())
+ return false;
+ // Since we are not checking for AA here, conservatively abort if the chain
+ // writes to memory. It's not safe to move the callee (a load) across a store.
+ if (isa<MemSDNode>(Chain.getNode()) &&
+ cast<MemSDNode>(Chain.getNode())->writeMem())
+ return false;
+ if (Chain.getOperand(0).getNode() == Callee.getNode())
+ return true;
+ if (Chain.getOperand(0).getOpcode() == ISD::TokenFactor &&
+ Callee.getValue(1).isOperandOf(Chain.getOperand(0).getNode()) &&
+ Callee.getValue(1).hasOneUse())
+ return true;
+ return false;
+}
+
+void X86DAGToDAGISel::PreprocessISelDAG() {
+ // OptForSize is used in pattern predicates that isel is matching.
+ OptForSize = MF->getFunction()->optForSize();
+
+ for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
+ E = CurDAG->allnodes_end(); I != E; ) {
+ SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues.
+
+ if (OptLevel != CodeGenOpt::None &&
+        // Only do this when the target doesn't favor register-indirect
+        // calls.
+ ((N->getOpcode() == X86ISD::CALL && !Subtarget->callRegIndirect()) ||
+ (N->getOpcode() == X86ISD::TC_RETURN &&
+         // Only do this if the load can be folded into TC_RETURN.
+ (Subtarget->is64Bit() ||
+ getTargetMachine().getRelocationModel() != Reloc::PIC_)))) {
+ /// Also try moving call address load from outside callseq_start to just
+ /// before the call to allow it to be folded.
+ ///
+ /// [Load chain]
+ /// ^
+ /// |
+ /// [Load]
+ /// ^ ^
+ /// | |
+ /// / \--
+ /// / |
+ ///[CALLSEQ_START] |
+ /// ^ |
+ /// | |
+ /// [LOAD/C2Reg] |
+ /// | |
+ /// \ /
+ /// \ /
+ /// [CALL]
+ bool HasCallSeq = N->getOpcode() == X86ISD::CALL;
+ SDValue Chain = N->getOperand(0);
+ SDValue Load = N->getOperand(1);
+ if (!isCalleeLoad(Load, Chain, HasCallSeq))
+ continue;
+ moveBelowOrigChain(CurDAG, Load, SDValue(N, 0), Chain);
+ ++NumLoadMoved;
+ continue;
+ }
+
+    // Lower fpround and fpextend nodes that target the FP stack to a store and
+    // a load through the stack. This is a gross hack. We would like to simply mark
+ // these as being illegal, but when we do that, legalize produces these when
+ // it expands calls, then expands these in the same legalize pass. We would
+ // like dag combine to be able to hack on these between the call expansion
+ // and the node legalization. As such this pass basically does "really
+ // late" legalization of these inline with the X86 isel pass.
+ // FIXME: This should only happen when not compiled with -O0.
+ if (N->getOpcode() != ISD::FP_ROUND && N->getOpcode() != ISD::FP_EXTEND)
+ continue;
+
+ MVT SrcVT = N->getOperand(0).getSimpleValueType();
+ MVT DstVT = N->getSimpleValueType(0);
+
+ // If any of the sources are vectors, no fp stack involved.
+ if (SrcVT.isVector() || DstVT.isVector())
+ continue;
+
+ // If the source and destination are SSE registers, then this is a legal
+ // conversion that should not be lowered.
+ const X86TargetLowering *X86Lowering =
+ static_cast<const X86TargetLowering *>(TLI);
+ bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
+ bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
+ if (SrcIsSSE && DstIsSSE)
+ continue;
+
+ if (!SrcIsSSE && !DstIsSSE) {
+ // If this is an FPStack extension, it is a noop.
+ if (N->getOpcode() == ISD::FP_EXTEND)
+ continue;
+ // If this is a value-preserving FPStack truncation, it is a noop.
+ if (N->getConstantOperandVal(1))
+ continue;
+ }
+
+ // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
+ // FPStack has extload and truncstore. SSE can fold direct loads into other
+ // operations. Based on this, decide what we want to do.
+ MVT MemVT;
+ if (N->getOpcode() == ISD::FP_ROUND)
+ MemVT = DstVT; // FP_ROUND must use DstVT, we can't do a 'trunc load'.
+ else
+ MemVT = SrcIsSSE ? SrcVT : DstVT;
+
+ SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
+ SDLoc dl(N);
+
+ // FIXME: optimize the case where the src/dest is a load or store?
+ SDValue Store = CurDAG->getTruncStore(CurDAG->getEntryNode(), dl,
+ N->getOperand(0),
+ MemTmp, MachinePointerInfo(), MemVT,
+ false, false, 0);
+ SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store, MemTmp,
+ MachinePointerInfo(),
+ MemVT, false, false, false, 0);
+
+ // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
+    // extload we created. This will cause general havoc on the DAG because
+ // anything below the conversion could be folded into other existing nodes.
+ // To avoid invalidating 'I', back it up to the convert node.
+ --I;
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
+
+ // Now that we did that, the node is dead. Increment the iterator to the
+ // next node to process, then delete N.
+ ++I;
+ CurDAG->DeleteNode(N);
+ }
+}
+
+
+/// Emit any code that needs to be executed only in the main function.
+void X86DAGToDAGISel::emitSpecialCodeForMain() {
+ if (Subtarget->isTargetCygMing()) {
+ TargetLowering::ArgListTy Args;
+ auto &DL = CurDAG->getDataLayout();
+
+ TargetLowering::CallLoweringInfo CLI(*CurDAG);
+ CLI.setChain(CurDAG->getRoot())
+ .setCallee(CallingConv::C, Type::getVoidTy(*CurDAG->getContext()),
+ CurDAG->getExternalSymbol("__main", TLI->getPointerTy(DL)),
+ std::move(Args), 0);
+ const TargetLowering &TLI = CurDAG->getTargetLoweringInfo();
+ std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI);
+ CurDAG->setRoot(Result.second);
+ }
+}
+
+void X86DAGToDAGISel::EmitFunctionEntryCode() {
+ // If this is main, emit special code for main.
+ if (const Function *Fn = MF->getFunction())
+ if (Fn->hasExternalLinkage() && Fn->getName() == "main")
+ emitSpecialCodeForMain();
+}
+
+static bool isDispSafeForFrameIndex(int64_t Val) {
+ // On 64-bit platforms, we can run into an issue where a frame index
+ // includes a displacement that, when added to the explicit displacement,
+ // will overflow the displacement field. Assuming that the frame index
+ // displacement fits into a 31-bit integer (which is only slightly more
+ // aggressive than the current fundamental assumption that it fits into
+ // a 32-bit integer), a 31-bit disp should always be safe.
+ return isInt<31>(Val);
+}
+
+bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset,
+ X86ISelAddressMode &AM) {
+ // Cannot combine ExternalSymbol displacements with integer offsets.
+ if (Offset != 0 && (AM.ES || AM.MCSym))
+ return true;
+ int64_t Val = AM.Disp + Offset;
+ CodeModel::Model M = TM.getCodeModel();
+ if (Subtarget->is64Bit()) {
+ if (!X86::isOffsetSuitableForCodeModel(Val, M,
+ AM.hasSymbolicDisplacement()))
+ return true;
+ // In addition to the checks required for a register base, check that
+ // we do not try to use an unsafe Disp with a frame index.
+ if (AM.BaseType == X86ISelAddressMode::FrameIndexBase &&
+ !isDispSafeForFrameIndex(Val))
+ return true;
+ }
+ AM.Disp = Val;
+ return false;
+}
+
+bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM){
+ SDValue Address = N->getOperand(1);
+
+ // load gs:0 -> GS segment register.
+ // load fs:0 -> FS segment register.
+ //
+ // This optimization is valid because the GNU TLS model defines that
+ // gs:0 (or fs:0 on X86-64) contains its own address.
+ // For more information see http://people.redhat.com/drepper/tls.pdf
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Address))
+ if (C->getSExtValue() == 0 && AM.Segment.getNode() == nullptr &&
+ Subtarget->isTargetLinux())
+ switch (N->getPointerInfo().getAddrSpace()) {
+ case 256:
+ AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
+ return false;
+ case 257:
+ AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
+ return false;
+ }
+
+ return true;
+}
+
+/// Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes into an addressing
+/// mode. These wrap things that will resolve down into a symbol reference.
+/// If no match is possible, this returns true, otherwise it returns false.
+bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) {
+ // If the addressing mode already has a symbol as the displacement, we can
+ // never match another symbol.
+ if (AM.hasSymbolicDisplacement())
+ return true;
+
+ SDValue N0 = N.getOperand(0);
+ CodeModel::Model M = TM.getCodeModel();
+
+ // Handle X86-64 rip-relative addresses. We check this before checking direct
+ // folding because RIP is preferable to non-RIP accesses.
+ if (Subtarget->is64Bit() && N.getOpcode() == X86ISD::WrapperRIP &&
+ // Under X86-64 non-small code model, GV (and friends) are 64-bits, so
+ // they cannot be folded into immediate fields.
+ // FIXME: This can be improved for kernel and other models?
+ (M == CodeModel::Small || M == CodeModel::Kernel)) {
+ // Base and index reg must be 0 in order to use %rip as base.
+ if (AM.hasBaseOrIndexReg())
+ return true;
+ if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) {
+ X86ISelAddressMode Backup = AM;
+ AM.GV = G->getGlobal();
+ AM.SymbolFlags = G->getTargetFlags();
+ if (foldOffsetIntoAddress(G->getOffset(), AM)) {
+ AM = Backup;
+ return true;
+ }
+ } else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
+ X86ISelAddressMode Backup = AM;
+ AM.CP = CP->getConstVal();
+ AM.Align = CP->getAlignment();
+ AM.SymbolFlags = CP->getTargetFlags();
+ if (foldOffsetIntoAddress(CP->getOffset(), AM)) {
+ AM = Backup;
+ return true;
+ }
+ } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(N0)) {
+ AM.ES = S->getSymbol();
+ AM.SymbolFlags = S->getTargetFlags();
+ } else if (auto *S = dyn_cast<MCSymbolSDNode>(N0)) {
+ AM.MCSym = S->getMCSymbol();
+ } else if (JumpTableSDNode *J = dyn_cast<JumpTableSDNode>(N0)) {
+ AM.JT = J->getIndex();
+ AM.SymbolFlags = J->getTargetFlags();
+ } else if (BlockAddressSDNode *BA = dyn_cast<BlockAddressSDNode>(N0)) {
+ X86ISelAddressMode Backup = AM;
+ AM.BlockAddr = BA->getBlockAddress();
+ AM.SymbolFlags = BA->getTargetFlags();
+ if (foldOffsetIntoAddress(BA->getOffset(), AM)) {
+ AM = Backup;
+ return true;
+ }
+ } else
+ llvm_unreachable("Unhandled symbol reference node.");
+
+ if (N.getOpcode() == X86ISD::WrapperRIP)
+ AM.setBaseReg(CurDAG->getRegister(X86::RIP, MVT::i64));
+ return false;
+ }
+
+ // Handle the case when globals fit in our immediate field: This is true for
+ // X86-32 always and X86-64 when in -mcmodel=small mode. In 64-bit
+ // mode, this only applies to a non-RIP-relative computation.
+ if (!Subtarget->is64Bit() ||
+ M == CodeModel::Small || M == CodeModel::Kernel) {
+ assert(N.getOpcode() != X86ISD::WrapperRIP &&
+ "RIP-relative addressing already handled");
+ if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) {
+ AM.GV = G->getGlobal();
+ AM.Disp += G->getOffset();
+ AM.SymbolFlags = G->getTargetFlags();
+ } else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
+ AM.CP = CP->getConstVal();
+ AM.Align = CP->getAlignment();
+ AM.Disp += CP->getOffset();
+ AM.SymbolFlags = CP->getTargetFlags();
+ } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(N0)) {
+ AM.ES = S->getSymbol();
+ AM.SymbolFlags = S->getTargetFlags();
+ } else if (auto *S = dyn_cast<MCSymbolSDNode>(N0)) {
+ AM.MCSym = S->getMCSymbol();
+ } else if (JumpTableSDNode *J = dyn_cast<JumpTableSDNode>(N0)) {
+ AM.JT = J->getIndex();
+ AM.SymbolFlags = J->getTargetFlags();
+ } else if (BlockAddressSDNode *BA = dyn_cast<BlockAddressSDNode>(N0)) {
+ AM.BlockAddr = BA->getBlockAddress();
+ AM.Disp += BA->getOffset();
+ AM.SymbolFlags = BA->getTargetFlags();
+ } else
+ llvm_unreachable("Unhandled symbol reference node.");
+ return false;
+ }
+
+ return true;
+}
+
+/// Add the specified node to the specified addressing mode, returning true if
+/// it cannot be done. This just pattern matches for the addressing mode.
+bool X86DAGToDAGISel::matchAddress(SDValue N, X86ISelAddressMode &AM) {
+ if (matchAddressRecursively(N, AM, 0))
+ return true;
+
+ // Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has
+ // a smaller encoding and avoids a scaled-index.
+ if (AM.Scale == 2 &&
+ AM.BaseType == X86ISelAddressMode::RegBase &&
+ AM.Base_Reg.getNode() == nullptr) {
+ AM.Base_Reg = AM.IndexReg;
+ AM.Scale = 1;
+ }
+
+ // Post-processing: Convert foo to foo(%rip), even in non-PIC mode,
+ // because it has a smaller encoding.
+ // TODO: Which other code models can use this?
+ if (TM.getCodeModel() == CodeModel::Small &&
+ Subtarget->is64Bit() &&
+ AM.Scale == 1 &&
+ AM.BaseType == X86ISelAddressMode::RegBase &&
+ AM.Base_Reg.getNode() == nullptr &&
+ AM.IndexReg.getNode() == nullptr &&
+ AM.SymbolFlags == X86II::MO_NO_FLAG &&
+ AM.hasSymbolicDisplacement())
+ AM.Base_Reg = CurDAG->getRegister(X86::RIP, MVT::i64);
+
+ return false;
+}
+
+bool X86DAGToDAGISel::matchAdd(SDValue N, X86ISelAddressMode &AM,
+ unsigned Depth) {
+ // Add an artificial use to this node so that we can keep track of
+ // it if it gets CSE'd with a different node.
+ HandleSDNode Handle(N);
+
+ X86ISelAddressMode Backup = AM;
+ if (!matchAddressRecursively(N.getOperand(0), AM, Depth+1) &&
+ !matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1))
+ return false;
+ AM = Backup;
+
+ // Try again after commuting the operands.
+ if (!matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1) &&
+ !matchAddressRecursively(Handle.getValue().getOperand(0), AM, Depth+1))
+ return false;
+ AM = Backup;
+
+ // If we couldn't fold both operands into the address at the same time,
+ // see if we can just put each operand into a register and fold at least
+ // the add.
+ if (AM.BaseType == X86ISelAddressMode::RegBase &&
+ !AM.Base_Reg.getNode() &&
+ !AM.IndexReg.getNode()) {
+ N = Handle.getValue();
+ AM.Base_Reg = N.getOperand(0);
+ AM.IndexReg = N.getOperand(1);
+ AM.Scale = 1;
+ return false;
+ }
+ N = Handle.getValue();
+ return true;
+}
+
+// Insert a node into the DAG at least before the Pos node's position. This
+// will reposition the node as needed, and will assign it a node ID that is <=
+// the Pos node's ID. Note that this does *not* preserve the uniqueness of node
+// IDs! The selection DAG must no longer depend on their uniqueness when this
+// is used.
+static void insertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) {
+ if (N.getNode()->getNodeId() == -1 ||
+ N.getNode()->getNodeId() > Pos.getNode()->getNodeId()) {
+ DAG.RepositionNode(Pos.getNode()->getIterator(), N.getNode());
+ N.getNode()->setNodeId(Pos.getNode()->getNodeId());
+ }
+}
+
+// Transform "(X >> (8-C1)) & (0xff << C1)" to "((X >> 8) & 0xff) << C1" if
+// safe. This allows us to convert the shift and and into an h-register
+// extract and a scaled index. Returns false if the simplification is
+// performed.
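+// For example, with C1 == 2: "(X >> 6) & 0x3fc" becomes
+// "((X >> 8) & 0xff) << 2", where "(X >> 8) & 0xff" maps to an h-register
+// extract and the "<< 2" becomes a scale of 4 (illustrative).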
+static bool foldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N,
+ uint64_t Mask,
+ SDValue Shift, SDValue X,
+ X86ISelAddressMode &AM) {
+ if (Shift.getOpcode() != ISD::SRL ||
+ !isa<ConstantSDNode>(Shift.getOperand(1)) ||
+ !Shift.hasOneUse())
+ return true;
+
+ int ScaleLog = 8 - Shift.getConstantOperandVal(1);
+ if (ScaleLog <= 0 || ScaleLog >= 4 ||
+ Mask != (0xffu << ScaleLog))
+ return true;
+
+ MVT VT = N.getSimpleValueType();
+ SDLoc DL(N);
+ SDValue Eight = DAG.getConstant(8, DL, MVT::i8);
+ SDValue NewMask = DAG.getConstant(0xff, DL, VT);
+ SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, X, Eight);
+ SDValue And = DAG.getNode(ISD::AND, DL, VT, Srl, NewMask);
+ SDValue ShlCount = DAG.getConstant(ScaleLog, DL, MVT::i8);
+ SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And, ShlCount);
+
+ // Insert the new nodes into the topological ordering. We must do this in
+ // a valid topological ordering as nothing is going to go back and re-sort
+ // these nodes. We continually insert before 'N' in sequence as this is
+ // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
+ // hierarchy left to express.
+ insertDAGNode(DAG, N, Eight);
+ insertDAGNode(DAG, N, Srl);
+ insertDAGNode(DAG, N, NewMask);
+ insertDAGNode(DAG, N, And);
+ insertDAGNode(DAG, N, ShlCount);
+ insertDAGNode(DAG, N, Shl);
+ DAG.ReplaceAllUsesWith(N, Shl);
+ AM.IndexReg = And;
+ AM.Scale = (1 << ScaleLog);
+ return false;
+}
+
+// Transforms "(X << C1) & C2" to "(X & (C2>>C1)) << C1" if safe and if this
+// allows us to fold the shift into this addressing mode. Returns false if the
+// transform succeeded.
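+// For example, with C1 == 2 and C2 == 0x3fc: "(X << 2) & 0x3fc" becomes
+// "(X & 0xff) << 2", letting the shift fold into a scale of 4 (illustrative).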
+static bool foldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N,
+ uint64_t Mask,
+ SDValue Shift, SDValue X,
+ X86ISelAddressMode &AM) {
+ if (Shift.getOpcode() != ISD::SHL ||
+ !isa<ConstantSDNode>(Shift.getOperand(1)))
+ return true;
+
+ // Not likely to be profitable if either the AND or SHIFT node has more
+ // than one use (unless all uses are for address computation). Besides,
+ // isel mechanism requires their node ids to be reused.
+ if (!N.hasOneUse() || !Shift.hasOneUse())
+ return true;
+
+ // Verify that the shift amount is something we can fold.
+ unsigned ShiftAmt = Shift.getConstantOperandVal(1);
+ if (ShiftAmt != 1 && ShiftAmt != 2 && ShiftAmt != 3)
+ return true;
+
+ MVT VT = N.getSimpleValueType();
+ SDLoc DL(N);
+ SDValue NewMask = DAG.getConstant(Mask >> ShiftAmt, DL, VT);
+ SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, NewMask);
+ SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, NewAnd, Shift.getOperand(1));
+
+ // Insert the new nodes into the topological ordering. We must do this in
+ // a valid topological ordering as nothing is going to go back and re-sort
+ // these nodes. We continually insert before 'N' in sequence as this is
+ // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
+ // hierarchy left to express.
+ insertDAGNode(DAG, N, NewMask);
+ insertDAGNode(DAG, N, NewAnd);
+ insertDAGNode(DAG, N, NewShift);
+ DAG.ReplaceAllUsesWith(N, NewShift);
+
+ AM.Scale = 1 << ShiftAmt;
+ AM.IndexReg = NewAnd;
+ return false;
+}
+
+// Implement some heroics to detect shifts of masked values where the mask can
+// be replaced by extending the shift and undoing that in the addressing mode
+// scale. Patterns such as (shl (srl x, c1), c2) are canonicalized into (and
+// (srl x, SHIFT), MASK) by DAGCombines that don't know the shl can be done in
+// the addressing mode. This results in code such as:
+//
+// int f(short *y, int *lookup_table) {
+// ...
+// return *y + lookup_table[*y >> 11];
+// }
+//
+// Turning into:
+// movzwl (%rdi), %eax
+// movl %eax, %ecx
+// shrl $11, %ecx
+// addl (%rsi,%rcx,4), %eax
+//
+// Instead of:
+// movzwl (%rdi), %eax
+// movl %eax, %ecx
+// shrl $9, %ecx
+// andl $124, %rcx
+// addl (%rsi,%rcx), %eax
+//
+// Note that this function assumes the mask is provided as a mask *after* the
+// value is shifted. The input chain may or may not match that, but computing
+// such a mask is trivial.
+static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
+ uint64_t Mask,
+ SDValue Shift, SDValue X,
+ X86ISelAddressMode &AM) {
+ if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse() ||
+ !isa<ConstantSDNode>(Shift.getOperand(1)))
+ return true;
+
+ unsigned ShiftAmt = Shift.getConstantOperandVal(1);
+ unsigned MaskLZ = countLeadingZeros(Mask);
+ unsigned MaskTZ = countTrailingZeros(Mask);
+
+ // The amount of shift we're trying to fit into the addressing mode is taken
+ // from the trailing zeros of the mask.
+ unsigned AMShiftAmt = MaskTZ;
+
+ // There is nothing we can do here unless the mask is removing some bits.
+ // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
+ if (AMShiftAmt <= 0 || AMShiftAmt > 3) return true;
+
+  // We also need to ensure that the mask is a contiguous run of ones.
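+  // For example, Mask == 0x7f8 passes: MaskTZ == 3, MaskLZ == 53, and
+  // countTrailingOnes(Mask >> 3) == 8, so 8 + 3 + 53 == 64.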
+ if (countTrailingOnes(Mask >> MaskTZ) + MaskTZ + MaskLZ != 64) return true;
+
+ // Scale the leading zero count down based on the actual size of the value.
+ // Also scale it down based on the size of the shift.
+ MaskLZ -= (64 - X.getSimpleValueType().getSizeInBits()) + ShiftAmt;
+
+ // The final check is to ensure that any masked out high bits of X are
+ // already known to be zero. Otherwise, the mask has a semantic impact
+ // other than masking out a couple of low bits. Unfortunately, because of
+ // the mask, zero extensions will be removed from operands in some cases.
+ // This code works extra hard to look through extensions because we can
+ // replace them with zero extensions cheaply if necessary.
+ bool ReplacingAnyExtend = false;
+ if (X.getOpcode() == ISD::ANY_EXTEND) {
+ unsigned ExtendBits = X.getSimpleValueType().getSizeInBits() -
+ X.getOperand(0).getSimpleValueType().getSizeInBits();
+ // Assume that we'll replace the any-extend with a zero-extend, and
+ // narrow the search to the extended value.
+ X = X.getOperand(0);
+ MaskLZ = ExtendBits > MaskLZ ? 0 : MaskLZ - ExtendBits;
+ ReplacingAnyExtend = true;
+ }
+ APInt MaskedHighBits =
+ APInt::getHighBitsSet(X.getSimpleValueType().getSizeInBits(), MaskLZ);
+ APInt KnownZero, KnownOne;
+ DAG.computeKnownBits(X, KnownZero, KnownOne);
+ if (MaskedHighBits != KnownZero) return true;
+
+ // We've identified a pattern that can be transformed into a single shift
+ // and an addressing mode. Make it so.
+ MVT VT = N.getSimpleValueType();
+ if (ReplacingAnyExtend) {
+ assert(X.getValueType() != VT);
+ // We looked through an ANY_EXTEND node, insert a ZERO_EXTEND.
+ SDValue NewX = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(X), VT, X);
+ insertDAGNode(DAG, N, NewX);
+ X = NewX;
+ }
+ SDLoc DL(N);
+ SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
+ SDValue NewSRL = DAG.getNode(ISD::SRL, DL, VT, X, NewSRLAmt);
+ SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
+ SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewSRL, NewSHLAmt);
+
+ // Insert the new nodes into the topological ordering. We must do this in
+ // a valid topological ordering as nothing is going to go back and re-sort
+ // these nodes. We continually insert before 'N' in sequence as this is
+ // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
+ // hierarchy left to express.
+ insertDAGNode(DAG, N, NewSRLAmt);
+ insertDAGNode(DAG, N, NewSRL);
+ insertDAGNode(DAG, N, NewSHLAmt);
+ insertDAGNode(DAG, N, NewSHL);
+ DAG.ReplaceAllUsesWith(N, NewSHL);
+
+ AM.Scale = 1 << AMShiftAmt;
+ AM.IndexReg = NewSRL;
+ return false;
+}
+
+bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
+ unsigned Depth) {
+ SDLoc dl(N);
+ DEBUG({
+ dbgs() << "MatchAddress: ";
+ AM.dump();
+ });
+ // Limit recursion.
+ if (Depth > 5)
+ return matchAddressBase(N, AM);
+
+ // If this is already a %rip relative address, we can only merge immediates
+ // into it. Instead of handling this in every case, we handle it here.
+ // RIP relative addressing: %rip + 32-bit displacement!
+ if (AM.isRIPRelative()) {
+    // FIXME: JumpTable and ExternalSymbol addresses currently don't like
+ // displacements. It isn't very important, but this should be fixed for
+ // consistency.
+ if (!(AM.ES || AM.MCSym) && AM.JT != -1)
+ return true;
+
+ if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N))
+ if (!foldOffsetIntoAddress(Cst->getSExtValue(), AM))
+ return false;
+ return true;
+ }
+
+ switch (N.getOpcode()) {
+ default: break;
+ case ISD::LOCAL_RECOVER: {
+ if (!AM.hasSymbolicDisplacement() && AM.Disp == 0)
+ if (const auto *ESNode = dyn_cast<MCSymbolSDNode>(N.getOperand(0))) {
+ // Use the symbol and don't prefix it.
+ AM.MCSym = ESNode->getMCSymbol();
+ return false;
+ }
+ break;
+ }
+ case ISD::Constant: {
+ uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
+ if (!foldOffsetIntoAddress(Val, AM))
+ return false;
+ break;
+ }
+
+ case X86ISD::Wrapper:
+ case X86ISD::WrapperRIP:
+ if (!matchWrapper(N, AM))
+ return false;
+ break;
+
+ case ISD::LOAD:
+ if (!matchLoadInAddress(cast<LoadSDNode>(N), AM))
+ return false;
+ break;
+
+ case ISD::FrameIndex:
+ if (AM.BaseType == X86ISelAddressMode::RegBase &&
+ AM.Base_Reg.getNode() == nullptr &&
+ (!Subtarget->is64Bit() || isDispSafeForFrameIndex(AM.Disp))) {
+ AM.BaseType = X86ISelAddressMode::FrameIndexBase;
+ AM.Base_FrameIndex = cast<FrameIndexSDNode>(N)->getIndex();
+ return false;
+ }
+ break;
+
+ case ISD::SHL:
+ if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
+ break;
+
+ if (ConstantSDNode
+ *CN = dyn_cast<ConstantSDNode>(N.getNode()->getOperand(1))) {
+ unsigned Val = CN->getZExtValue();
+ // Note that we handle x<<1 as (,x,2) rather than (x,x) here so
+ // that the base operand remains free for further matching. If
+ // the base doesn't end up getting used, a post-processing step
+ // in MatchAddress turns (,x,2) into (x,x), which is cheaper.
+ if (Val == 1 || Val == 2 || Val == 3) {
+ AM.Scale = 1 << Val;
+ SDValue ShVal = N.getNode()->getOperand(0);
+
+ // Okay, we know that we have a scale by now. However, if the scaled
+ // value is an add of something and a constant, we can fold the
+ // constant into the disp field here.
+ if (CurDAG->isBaseWithConstantOffset(ShVal)) {
+ AM.IndexReg = ShVal.getNode()->getOperand(0);
+ ConstantSDNode *AddVal =
+ cast<ConstantSDNode>(ShVal.getNode()->getOperand(1));
+ uint64_t Disp = (uint64_t)AddVal->getSExtValue() << Val;
+ if (!foldOffsetIntoAddress(Disp, AM))
+ return false;
+ }
+
+ AM.IndexReg = ShVal;
+ return false;
+ }
+ }
+ break;
+
+ case ISD::SRL: {
+ // Scale must not be used already.
+ if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
+
+ SDValue And = N.getOperand(0);
+ if (And.getOpcode() != ISD::AND) break;
+ SDValue X = And.getOperand(0);
+
+ // We only handle up to 64-bit values here as those are what matter for
+ // addressing mode optimizations.
+ if (X.getSimpleValueType().getSizeInBits() > 64) break;
+
+ // The mask used for the transform is expected to be post-shift, but we
+ // found the shift first so just apply the shift to the mask before passing
+ // it down.
+ if (!isa<ConstantSDNode>(N.getOperand(1)) ||
+ !isa<ConstantSDNode>(And.getOperand(1)))
+ break;
+ uint64_t Mask = And.getConstantOperandVal(1) >> N.getConstantOperandVal(1);
+
+ // Try to fold the mask and shift into the scale, and return false if we
+ // succeed.
+ if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, N, X, AM))
+ return false;
+ break;
+ }
+
+ case ISD::SMUL_LOHI:
+ case ISD::UMUL_LOHI:
+ // A mul_lohi where we need the low part can be folded as a plain multiply.
+ if (N.getResNo() != 0) break;
+ // FALL THROUGH
+ case ISD::MUL:
+ case X86ISD::MUL_IMM:
+ // X*[3,5,9] -> X+X*[2,4,8]
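+    // e.g. x*5 can then be matched as "leal (%reg,%reg,4), %dst"
+    // (illustrative).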
+ if (AM.BaseType == X86ISelAddressMode::RegBase &&
+ AM.Base_Reg.getNode() == nullptr &&
+ AM.IndexReg.getNode() == nullptr) {
+ if (ConstantSDNode
+ *CN = dyn_cast<ConstantSDNode>(N.getNode()->getOperand(1)))
+ if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 ||
+ CN->getZExtValue() == 9) {
+ AM.Scale = unsigned(CN->getZExtValue())-1;
+
+ SDValue MulVal = N.getNode()->getOperand(0);
+ SDValue Reg;
+
+ // Okay, we know that we have a scale by now. However, if the scaled
+ // value is an add of something and a constant, we can fold the
+ // constant into the disp field here.
+ if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() &&
+ isa<ConstantSDNode>(MulVal.getNode()->getOperand(1))) {
+ Reg = MulVal.getNode()->getOperand(0);
+ ConstantSDNode *AddVal =
+ cast<ConstantSDNode>(MulVal.getNode()->getOperand(1));
+ uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue();
+ if (foldOffsetIntoAddress(Disp, AM))
+ Reg = N.getNode()->getOperand(0);
+ } else {
+ Reg = N.getNode()->getOperand(0);
+ }
+
+ AM.IndexReg = AM.Base_Reg = Reg;
+ return false;
+ }
+ }
+ break;
+
+ case ISD::SUB: {
+ // Given A-B, if A can be completely folded into the address and
+ // the index field with the index field unused, use -B as the index.
+    // This is a win if A has multiple parts that can be folded into
+    // the address. Also, this saves a mov if the base register has
+    // other uses, since it avoids a two-address sub instruction; however,
+    // it costs an additional mov if the index register has other uses.
+
+ // Add an artificial use to this node so that we can keep track of
+ // it if it gets CSE'd with a different node.
+ HandleSDNode Handle(N);
+
+ // Test if the LHS of the sub can be folded.
+ X86ISelAddressMode Backup = AM;
+ if (matchAddressRecursively(N.getNode()->getOperand(0), AM, Depth+1)) {
+ AM = Backup;
+ break;
+ }
+ // Test if the index field is free for use.
+ if (AM.IndexReg.getNode() || AM.isRIPRelative()) {
+ AM = Backup;
+ break;
+ }
+
+ int Cost = 0;
+ SDValue RHS = Handle.getValue().getNode()->getOperand(1);
+ // If the RHS involves a register with multiple uses, this
+ // transformation incurs an extra mov, due to the neg instruction
+ // clobbering its operand.
+ if (!RHS.getNode()->hasOneUse() ||
+ RHS.getNode()->getOpcode() == ISD::CopyFromReg ||
+ RHS.getNode()->getOpcode() == ISD::TRUNCATE ||
+ RHS.getNode()->getOpcode() == ISD::ANY_EXTEND ||
+ (RHS.getNode()->getOpcode() == ISD::ZERO_EXTEND &&
+ RHS.getNode()->getOperand(0).getValueType() == MVT::i32))
+ ++Cost;
+ // If the base is a register with multiple uses, this
+ // transformation may save a mov.
+ if ((AM.BaseType == X86ISelAddressMode::RegBase &&
+ AM.Base_Reg.getNode() &&
+ !AM.Base_Reg.getNode()->hasOneUse()) ||
+ AM.BaseType == X86ISelAddressMode::FrameIndexBase)
+ --Cost;
+ // If the folded LHS was interesting, this transformation saves
+ // address arithmetic.
+ if ((AM.hasSymbolicDisplacement() && !Backup.hasSymbolicDisplacement()) +
+ ((AM.Disp != 0) && (Backup.Disp == 0)) +
+ (AM.Segment.getNode() && !Backup.Segment.getNode()) >= 2)
+ --Cost;
+ // If it doesn't look like it may be an overall win, don't do it.
+ if (Cost >= 0) {
+ AM = Backup;
+ break;
+ }
+
+ // Ok, the transformation is legal and appears profitable. Go for it.
+ SDValue Zero = CurDAG->getConstant(0, dl, N.getValueType());
+ SDValue Neg = CurDAG->getNode(ISD::SUB, dl, N.getValueType(), Zero, RHS);
+ AM.IndexReg = Neg;
+ AM.Scale = 1;
+
+ // Insert the new nodes into the topological ordering.
+ insertDAGNode(*CurDAG, N, Zero);
+ insertDAGNode(*CurDAG, N, Neg);
+ return false;
+ }
+
+ case ISD::ADD:
+ if (!matchAdd(N, AM, Depth))
+ return false;
+ break;
+
+ case ISD::OR:
+ // We want to look through a transform in InstCombine and DAGCombiner that
+ // turns 'add' into 'or', so we can treat this 'or' exactly like an 'add'.
+ // Example: (or (and x, 1), (shl y, 3)) --> (add (and x, 1), (shl y, 3))
+ // An 'lea' can then be used to match the shift (multiply) and add:
+ // and $1, %esi
+ // lea (%rsi, %rdi, 8), %rax
+ if (CurDAG->haveNoCommonBitsSet(N.getOperand(0), N.getOperand(1)) &&
+ !matchAdd(N, AM, Depth))
+ return false;
+ break;
+
+ case ISD::AND: {
+ // Perform some heroic transforms on an and of a constant-count shift
+ // with a constant to enable use of the scaled offset field.
+
+ // Scale must not be used already.
+ if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
+
+ SDValue Shift = N.getOperand(0);
+ if (Shift.getOpcode() != ISD::SRL && Shift.getOpcode() != ISD::SHL) break;
+ SDValue X = Shift.getOperand(0);
+
+ // We only handle up to 64-bit values here as those are what matter for
+ // addressing mode optimizations.
+ if (X.getSimpleValueType().getSizeInBits() > 64) break;
+
+ if (!isa<ConstantSDNode>(N.getOperand(1)))
+ break;
+ uint64_t Mask = N.getConstantOperandVal(1);
+
+ // Try to fold the mask and shift into an extract and scale.
+ if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask, Shift, X, AM))
+ return false;
+
+ // Try to fold the mask and shift directly into the scale.
+ if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, Shift, X, AM))
+ return false;
+
+ // Try to swap the mask and shift to place shifts which can be done as
+ // a scale on the outside of the mask.
+ if (!foldMaskedShiftToScaledMask(*CurDAG, N, Mask, Shift, X, AM))
+ return false;
+ break;
+ }
+ }
+
+ return matchAddressBase(N, AM);
+}
+
+/// Helper for MatchAddress. Add the specified node to the
+/// specified addressing mode without any further recursion.
+bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) {
+ // Is the base register already occupied?
+ if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base_Reg.getNode()) {
+ // If so, check to see if the scale index register is set.
+ if (!AM.IndexReg.getNode()) {
+ AM.IndexReg = N;
+ AM.Scale = 1;
+ return false;
+ }
+
+ // Otherwise, we cannot select it.
+ return true;
+ }
+
+ // Default, generate it as a register.
+ AM.BaseType = X86ISelAddressMode::RegBase;
+ AM.Base_Reg = N;
+ return false;
+}
+
+bool X86DAGToDAGISel::selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base,
+ SDValue &Scale, SDValue &Index,
+ SDValue &Disp, SDValue &Segment) {
+
+ MaskedGatherScatterSDNode *Mgs = dyn_cast<MaskedGatherScatterSDNode>(Parent);
+ if (!Mgs)
+ return false;
+ X86ISelAddressMode AM;
+ unsigned AddrSpace = Mgs->getPointerInfo().getAddrSpace();
+ // AddrSpace 256 -> GS, 257 -> FS.
+ if (AddrSpace == 256)
+ AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
+ if (AddrSpace == 257)
+ AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
+
+ SDLoc DL(N);
+ Base = Mgs->getBasePtr();
+ Index = Mgs->getIndex();
+ unsigned ScalarSize = Mgs->getValue().getValueType().getScalarSizeInBits();
+ Scale = getI8Imm(ScalarSize/8, DL);
+
+  // If Base is 0, the whole address is in the index and Scale is 1.
+ if (isa<ConstantSDNode>(Base)) {
+ assert(cast<ConstantSDNode>(Base)->isNullValue() &&
+ "Unexpected base in gather/scatter");
+ Scale = getI8Imm(1, DL);
+ Base = CurDAG->getRegister(0, MVT::i32);
+ }
+ if (AM.Segment.getNode())
+ Segment = AM.Segment;
+ else
+ Segment = CurDAG->getRegister(0, MVT::i32);
+ Disp = CurDAG->getTargetConstant(0, DL, MVT::i32);
+ return true;
+}
+
+/// Returns true if it is able to pattern match an addressing mode.
+/// It returns, by reference, the operands which make up the maximal
+/// addressing mode it can match.
+///
+/// Parent is the parent node of the addr operand that is being matched. It
+/// is always a load, store, atomic node, or null. It is only null when
+/// checking memory operands for inline asm nodes.
+bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
+ SDValue &Scale, SDValue &Index,
+ SDValue &Disp, SDValue &Segment) {
+ X86ISelAddressMode AM;
+
+ if (Parent &&
+ // This list of opcodes are all the nodes that have an "addr:$ptr" operand
+ // that are not a MemSDNode, and thus don't have proper addrspace info.
+ Parent->getOpcode() != ISD::INTRINSIC_W_CHAIN && // unaligned loads, fixme
+ Parent->getOpcode() != ISD::INTRINSIC_VOID && // nontemporal stores
+ Parent->getOpcode() != X86ISD::TLSCALL && // Fixme
+ Parent->getOpcode() != X86ISD::EH_SJLJ_SETJMP && // setjmp
+ Parent->getOpcode() != X86ISD::EH_SJLJ_LONGJMP) { // longjmp
+ unsigned AddrSpace =
+ cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace();
+ // AddrSpace 256 -> GS, 257 -> FS.
+ if (AddrSpace == 256)
+ AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
+ if (AddrSpace == 257)
+ AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
+ }
+
+ if (matchAddress(N, AM))
+ return false;
+
+ MVT VT = N.getSimpleValueType();
+ if (AM.BaseType == X86ISelAddressMode::RegBase) {
+ if (!AM.Base_Reg.getNode())
+ AM.Base_Reg = CurDAG->getRegister(0, VT);
+ }
+
+ if (!AM.IndexReg.getNode())
+ AM.IndexReg = CurDAG->getRegister(0, VT);
+
+ getAddressOperands(AM, SDLoc(N), Base, Scale, Index, Disp, Segment);
+ return true;
+}
+
+/// Match a scalar SSE load. In particular, we want to match a load whose top
+/// elements are either undef or zeros. The load flavor is derived from the
+/// type of N, which is either v4f32 or v2f64.
+///
+/// We also return:
+///   PatternNodeWithChain: this is the matched node that has a chain input and
+/// output.
+bool X86DAGToDAGISel::selectScalarSSELoad(SDNode *Root,
+ SDValue N, SDValue &Base,
+ SDValue &Scale, SDValue &Index,
+ SDValue &Disp, SDValue &Segment,
+ SDValue &PatternNodeWithChain) {
+ if (N.getOpcode() == ISD::SCALAR_TO_VECTOR) {
+ PatternNodeWithChain = N.getOperand(0);
+ if (ISD::isNON_EXTLoad(PatternNodeWithChain.getNode()) &&
+ PatternNodeWithChain.hasOneUse() &&
+ IsProfitableToFold(N.getOperand(0), N.getNode(), Root) &&
+ IsLegalToFold(N.getOperand(0), N.getNode(), Root, OptLevel)) {
+ LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain);
+ if (!selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp, Segment))
+ return false;
+ return true;
+ }
+ }
+
+ // Also handle the case where we explicitly require zeros in the top
+ // elements. This is a vector shuffle from the zero vector.
+ if (N.getOpcode() == X86ISD::VZEXT_MOVL && N.getNode()->hasOneUse() &&
+ // Check to see if the top elements are all zeros (or bitcast of zeros).
+ N.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
+ N.getOperand(0).getNode()->hasOneUse() &&
+ ISD::isNON_EXTLoad(N.getOperand(0).getOperand(0).getNode()) &&
+ N.getOperand(0).getOperand(0).hasOneUse() &&
+ IsProfitableToFold(N.getOperand(0), N.getNode(), Root) &&
+ IsLegalToFold(N.getOperand(0), N.getNode(), Root, OptLevel)) {
+ // Okay, this is a zero extending load. Fold it.
+ LoadSDNode *LD = cast<LoadSDNode>(N.getOperand(0).getOperand(0));
+ if (!selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp, Segment))
+ return false;
+ PatternNodeWithChain = SDValue(LD, 0);
+ return true;
+ }
+ return false;
+}
+
+
+bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) {
+ if (const ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N)) {
+ uint64_t ImmVal = CN->getZExtValue();
+ if ((uint32_t)ImmVal != (uint64_t)ImmVal)
+ return false;
+
+ Imm = CurDAG->getTargetConstant(ImmVal, SDLoc(N), MVT::i64);
+ return true;
+ }
+
+ // In static codegen with small code model, we can get the address of a label
+ // into a register with 'movl'. TableGen has already made sure we're looking
+ // at a label of some kind.
+ assert(N->getOpcode() == X86ISD::Wrapper &&
+ "Unexpected node type for MOV32ri64");
+ N = N.getOperand(0);
+
+ if (N->getOpcode() != ISD::TargetConstantPool &&
+ N->getOpcode() != ISD::TargetJumpTable &&
+ N->getOpcode() != ISD::TargetGlobalAddress &&
+ N->getOpcode() != ISD::TargetExternalSymbol &&
+ N->getOpcode() != ISD::MCSymbol &&
+ N->getOpcode() != ISD::TargetBlockAddress)
+ return false;
+
+ Imm = N;
+ return TM.getCodeModel() == CodeModel::Small;
+}
+
+bool X86DAGToDAGISel::selectLEA64_32Addr(SDValue N, SDValue &Base,
+ SDValue &Scale, SDValue &Index,
+ SDValue &Disp, SDValue &Segment) {
+ if (!selectLEAAddr(N, Base, Scale, Index, Disp, Segment))
+ return false;
+
+ SDLoc DL(N);
+ RegisterSDNode *RN = dyn_cast<RegisterSDNode>(Base);
+ if (RN && RN->getReg() == 0)
+ Base = CurDAG->getRegister(0, MVT::i64);
+  else if (Base.getValueType() == MVT::i32 &&
+           !isa<FrameIndexSDNode>(Base)) {
+ // Base could already be %rip, particularly in the x32 ABI.
+ Base = SDValue(CurDAG->getMachineNode(
+ TargetOpcode::SUBREG_TO_REG, DL, MVT::i64,
+ CurDAG->getTargetConstant(0, DL, MVT::i64),
+ Base,
+ CurDAG->getTargetConstant(X86::sub_32bit, DL, MVT::i32)),
+ 0);
+ }
+
+ RN = dyn_cast<RegisterSDNode>(Index);
+ if (RN && RN->getReg() == 0)
+ Index = CurDAG->getRegister(0, MVT::i64);
+ else {
+ assert(Index.getValueType() == MVT::i32 &&
+ "Expect to be extending 32-bit registers for use in LEA");
+ Index = SDValue(CurDAG->getMachineNode(
+ TargetOpcode::SUBREG_TO_REG, DL, MVT::i64,
+ CurDAG->getTargetConstant(0, DL, MVT::i64),
+ Index,
+ CurDAG->getTargetConstant(X86::sub_32bit, DL,
+ MVT::i32)),
+ 0);
+ }
+
+ return true;
+}
+
+/// Calls SelectAddr and determines if the maximal addressing
+/// mode it matches can be cost-effectively emitted as an LEA instruction.
+bool X86DAGToDAGISel::selectLEAAddr(SDValue N,
+ SDValue &Base, SDValue &Scale,
+ SDValue &Index, SDValue &Disp,
+ SDValue &Segment) {
+ X86ISelAddressMode AM;
+
+  // Set AM.Segment to prevent matchAddress from using one. LEA doesn't
+  // support segments.
+ SDValue Copy = AM.Segment;
+ SDValue T = CurDAG->getRegister(0, MVT::i32);
+ AM.Segment = T;
+ if (matchAddress(N, AM))
+ return false;
+  assert(T == AM.Segment);
+ AM.Segment = Copy;
+
+ MVT VT = N.getSimpleValueType();
+ unsigned Complexity = 0;
+ if (AM.BaseType == X86ISelAddressMode::RegBase)
+ if (AM.Base_Reg.getNode())
+ Complexity = 1;
+ else
+ AM.Base_Reg = CurDAG->getRegister(0, VT);
+ else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
+ Complexity = 4;
+
+ if (AM.IndexReg.getNode())
+ Complexity++;
+ else
+ AM.IndexReg = CurDAG->getRegister(0, VT);
+
+  // Don't match just leal(,%reg,2); it's cheaper to do addl %reg, %reg or a
+  // simple shift.
+ if (AM.Scale > 1)
+ Complexity++;
+
+ // FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA
+ // to a LEA. This is determined with some experimentation but is by no means
+ // optimal (especially for code size consideration). LEA is nice because of
+ // its three-address nature. Tweak the cost function again when we can run
+ // convertToThreeAddress() at register allocation time.
+ if (AM.hasSymbolicDisplacement()) {
+ // For X86-64, always use LEA to materialize RIP-relative addresses.
+ if (Subtarget->is64Bit())
+ Complexity = 4;
+ else
+ Complexity += 2;
+ }
+
+ if (AM.Disp && (AM.Base_Reg.getNode() || AM.IndexReg.getNode()))
+ Complexity++;
+
+ // If it isn't worth using an LEA, reject it.
+ if (Complexity <= 2)
+ return false;
+
+ getAddressOperands(AM, SDLoc(N), Base, Scale, Index, Disp, Segment);
+ return true;
+}
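+
+// For illustration, two data points for the heuristic above: base + index +
+// displacement (e.g. 4(%rax,%rcx)) scores 1 + 1 + 1 = 3 and is emitted as an
+// LEA, while a lone scaled index (e.g. (,%rax,2)) scores only 2 and is
+// rejected in favor of an add or a shift.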
+
+/// This is only run on TargetGlobalTLSAddress nodes.
+bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base,
+ SDValue &Scale, SDValue &Index,
+ SDValue &Disp, SDValue &Segment) {
+ assert(N.getOpcode() == ISD::TargetGlobalTLSAddress);
+ const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(N);
+
+ X86ISelAddressMode AM;
+ AM.GV = GA->getGlobal();
+ AM.Disp += GA->getOffset();
+ AM.Base_Reg = CurDAG->getRegister(0, N.getValueType());
+ AM.SymbolFlags = GA->getTargetFlags();
+
+ if (N.getValueType() == MVT::i32) {
+ AM.Scale = 1;
+ AM.IndexReg = CurDAG->getRegister(X86::EBX, MVT::i32);
+ } else {
+ AM.IndexReg = CurDAG->getRegister(0, MVT::i64);
+ }
+
+ getAddressOperands(AM, SDLoc(N), Base, Scale, Index, Disp, Segment);
+ return true;
+}
+
+
+bool X86DAGToDAGISel::tryFoldLoad(SDNode *P, SDValue N,
+ SDValue &Base, SDValue &Scale,
+ SDValue &Index, SDValue &Disp,
+ SDValue &Segment) {
+ if (!ISD::isNON_EXTLoad(N.getNode()) ||
+ !IsProfitableToFold(N, P, P) ||
+ !IsLegalToFold(N, P, P, OptLevel))
+ return false;
+
+ return selectAddr(N.getNode(),
+ N.getOperand(1), Base, Scale, Index, Disp, Segment);
+}
+
+/// Return an SDNode that returns the value of the global base register.
+/// Output instructions required to initialize the global base register,
+/// if necessary.
+SDNode *X86DAGToDAGISel::getGlobalBaseReg() {
+ unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF);
+ auto &DL = MF->getDataLayout();
+ return CurDAG->getRegister(GlobalBaseReg, TLI->getPointerTy(DL)).getNode();
+}
+
+/// Atomic opcode table
+///
+enum AtomicOpc {
+ ADD,
+ SUB,
+ INC,
+ DEC,
+ OR,
+ AND,
+ XOR,
+ AtomicOpcEnd
+};
+
+enum AtomicSz {
+ ConstantI8,
+ I8,
+ SextConstantI16,
+ ConstantI16,
+ I16,
+ SextConstantI32,
+ ConstantI32,
+ I32,
+ SextConstantI64,
+ ConstantI64,
+ I64,
+ AtomicSzEnd
+};
+
+static const uint16_t AtomicOpcTbl[AtomicOpcEnd][AtomicSzEnd] = {
+ {
+ X86::LOCK_ADD8mi,
+ X86::LOCK_ADD8mr,
+ X86::LOCK_ADD16mi8,
+ X86::LOCK_ADD16mi,
+ X86::LOCK_ADD16mr,
+ X86::LOCK_ADD32mi8,
+ X86::LOCK_ADD32mi,
+ X86::LOCK_ADD32mr,
+ X86::LOCK_ADD64mi8,
+ X86::LOCK_ADD64mi32,
+ X86::LOCK_ADD64mr,
+ },
+ {
+ X86::LOCK_SUB8mi,
+ X86::LOCK_SUB8mr,
+ X86::LOCK_SUB16mi8,
+ X86::LOCK_SUB16mi,
+ X86::LOCK_SUB16mr,
+ X86::LOCK_SUB32mi8,
+ X86::LOCK_SUB32mi,
+ X86::LOCK_SUB32mr,
+ X86::LOCK_SUB64mi8,
+ X86::LOCK_SUB64mi32,
+ X86::LOCK_SUB64mr,
+ },
+ {
+ 0,
+ X86::LOCK_INC8m,
+ 0,
+ 0,
+ X86::LOCK_INC16m,
+ 0,
+ 0,
+ X86::LOCK_INC32m,
+ 0,
+ 0,
+ X86::LOCK_INC64m,
+ },
+ {
+ 0,
+ X86::LOCK_DEC8m,
+ 0,
+ 0,
+ X86::LOCK_DEC16m,
+ 0,
+ 0,
+ X86::LOCK_DEC32m,
+ 0,
+ 0,
+ X86::LOCK_DEC64m,
+ },
+ {
+ X86::LOCK_OR8mi,
+ X86::LOCK_OR8mr,
+ X86::LOCK_OR16mi8,
+ X86::LOCK_OR16mi,
+ X86::LOCK_OR16mr,
+ X86::LOCK_OR32mi8,
+ X86::LOCK_OR32mi,
+ X86::LOCK_OR32mr,
+ X86::LOCK_OR64mi8,
+ X86::LOCK_OR64mi32,
+ X86::LOCK_OR64mr,
+ },
+ {
+ X86::LOCK_AND8mi,
+ X86::LOCK_AND8mr,
+ X86::LOCK_AND16mi8,
+ X86::LOCK_AND16mi,
+ X86::LOCK_AND16mr,
+ X86::LOCK_AND32mi8,
+ X86::LOCK_AND32mi,
+ X86::LOCK_AND32mr,
+ X86::LOCK_AND64mi8,
+ X86::LOCK_AND64mi32,
+ X86::LOCK_AND64mr,
+ },
+ {
+ X86::LOCK_XOR8mi,
+ X86::LOCK_XOR8mr,
+ X86::LOCK_XOR16mi8,
+ X86::LOCK_XOR16mi,
+ X86::LOCK_XOR16mr,
+ X86::LOCK_XOR32mi8,
+ X86::LOCK_XOR32mi,
+ X86::LOCK_XOR32mr,
+ X86::LOCK_XOR64mi8,
+ X86::LOCK_XOR64mi32,
+ X86::LOCK_XOR64mr,
+ }
+};
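+
+// For illustration, the table is indexed as AtomicOpcTbl[op][size-class];
+// e.g. AtomicOpcTbl[ADD][SextConstantI32] is X86::LOCK_ADD32mi8, the 32-bit
+// lock-add form taking a sign-extended 8-bit immediate. The zero entries in
+// the INC and DEC rows exist because those operations take no immediate and
+// so have no constant-operand forms.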
+
+// Return the target constant operand for atomic-load-op and do simple
+// translations, such as from atomic-load-add to lock-sub. The return value is
+// one of the following three cases:
+// + a target constant, if the operand can be encoded as an immediate;
+// + empty, if the operand is no longer needed once the new op is selected;
+// + non-empty (the possibly rewritten value operand), otherwise.
+static SDValue getAtomicLoadArithTargetConstant(SelectionDAG *CurDAG,
+ SDLoc dl,
+ enum AtomicOpc &Op, MVT NVT,
+ SDValue Val,
+ const X86Subtarget *Subtarget) {
+ if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Val)) {
+ int64_t CNVal = CN->getSExtValue();
+ // Quit if not 32-bit imm.
+ if ((int32_t)CNVal != CNVal)
+ return Val;
+    // Bail out early for INT32_MIN: negating it (as the SUB translation
+    // below would, since it is negative) overflows, producing an immediate
+    // that does not fit in the 32 bits available to SUB. It still fits in
+    // 32 bits for ADD (where it is not negated), so return it as a target
+    // constant unchanged.
+ if (CNVal == INT32_MIN)
+ return CurDAG->getTargetConstant(CNVal, dl, NVT);
+ // For atomic-load-add, we could do some optimizations.
+ if (Op == ADD) {
+ // Translate to INC/DEC if ADD by 1 or -1.
+ if (((CNVal == 1) || (CNVal == -1)) && !Subtarget->slowIncDec()) {
+ Op = (CNVal == 1) ? INC : DEC;
+ // No more constant operand after being translated into INC/DEC.
+ return SDValue();
+ }
+ // Translate to SUB if ADD by negative value.
+ if (CNVal < 0) {
+ Op = SUB;
+ CNVal = -CNVal;
+ }
+ }
+ return CurDAG->getTargetConstant(CNVal, dl, NVT);
+ }
+
+ // If the value operand is single-used, try to optimize it.
+ if (Op == ADD && Val.hasOneUse()) {
+ // Translate (atomic-load-add ptr (sub 0 x)) back to (lock-sub x).
+ if (Val.getOpcode() == ISD::SUB && X86::isZeroNode(Val.getOperand(0))) {
+ Op = SUB;
+ return Val.getOperand(1);
+ }
+    // A special case for i16: it needs truncation since, in most cases, it
+    // is promoted to i32. We will translate
+    // (atomic-load-add (truncate (sub 0 x))) to (lock-sub (EXTRACT_SUBREG x))
+ if (Val.getOpcode() == ISD::TRUNCATE && NVT == MVT::i16 &&
+ Val.getOperand(0).getOpcode() == ISD::SUB &&
+ X86::isZeroNode(Val.getOperand(0).getOperand(0))) {
+ Op = SUB;
+ Val = Val.getOperand(0);
+ return CurDAG->getTargetExtractSubreg(X86::sub_16bit, dl, NVT,
+ Val.getOperand(1));
+ }
+ }
+
+ return Val;
+}
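+
+// For illustration: an atomic-load-add of +1 or -1 becomes LOCK INC/DEC
+// (leaving no immediate), an add of -5 becomes a LOCK SUB of 5, and an add
+// of (sub 0, x) becomes a LOCK SUB of x; anything else is returned
+// unchanged.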
+
+SDNode *X86DAGToDAGISel::selectAtomicLoadArith(SDNode *Node, MVT NVT) {
+ if (Node->hasAnyUseOfValue(0))
+ return nullptr;
+
+ SDLoc dl(Node);
+
+ // Optimize common patterns for __sync_or_and_fetch and similar arith
+ // operations where the result is not used. This allows us to use the "lock"
+ // version of the arithmetic instruction.
+ SDValue Chain = Node->getOperand(0);
+ SDValue Ptr = Node->getOperand(1);
+ SDValue Val = Node->getOperand(2);
+ SDValue Base, Scale, Index, Disp, Segment;
+ if (!selectAddr(Node, Ptr, Base, Scale, Index, Disp, Segment))
+ return nullptr;
+
+ // Which index into the table.
+ enum AtomicOpc Op;
+ switch (Node->getOpcode()) {
+ default:
+ return nullptr;
+ case ISD::ATOMIC_LOAD_OR:
+ Op = OR;
+ break;
+ case ISD::ATOMIC_LOAD_AND:
+ Op = AND;
+ break;
+ case ISD::ATOMIC_LOAD_XOR:
+ Op = XOR;
+ break;
+ case ISD::ATOMIC_LOAD_ADD:
+ Op = ADD;
+ break;
+ }
+
+ Val = getAtomicLoadArithTargetConstant(CurDAG, dl, Op, NVT, Val, Subtarget);
+ bool isUnOp = !Val.getNode();
+ bool isCN = Val.getNode() && (Val.getOpcode() == ISD::TargetConstant);
+
+ unsigned Opc = 0;
+ switch (NVT.SimpleTy) {
+ default: return nullptr;
+ case MVT::i8:
+ if (isCN)
+ Opc = AtomicOpcTbl[Op][ConstantI8];
+ else
+ Opc = AtomicOpcTbl[Op][I8];
+ break;
+ case MVT::i16:
+ if (isCN) {
+ if (immSext8(Val.getNode()))
+ Opc = AtomicOpcTbl[Op][SextConstantI16];
+ else
+ Opc = AtomicOpcTbl[Op][ConstantI16];
+ } else
+ Opc = AtomicOpcTbl[Op][I16];
+ break;
+ case MVT::i32:
+ if (isCN) {
+ if (immSext8(Val.getNode()))
+ Opc = AtomicOpcTbl[Op][SextConstantI32];
+ else
+ Opc = AtomicOpcTbl[Op][ConstantI32];
+ } else
+ Opc = AtomicOpcTbl[Op][I32];
+ break;
+ case MVT::i64:
+ if (isCN) {
+ if (immSext8(Val.getNode()))
+ Opc = AtomicOpcTbl[Op][SextConstantI64];
+ else if (i64immSExt32(Val.getNode()))
+ Opc = AtomicOpcTbl[Op][ConstantI64];
+ else
+ llvm_unreachable("True 64 bits constant in SelectAtomicLoadArith");
+ } else
+ Opc = AtomicOpcTbl[Op][I64];
+ break;
+ }
+
+ assert(Opc != 0 && "Invalid arith lock transform!");
+
+ // Building the new node.
+ SDValue Ret;
+ if (isUnOp) {
+ SDValue Ops[] = { Base, Scale, Index, Disp, Segment, Chain };
+ Ret = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops), 0);
+ } else {
+ SDValue Ops[] = { Base, Scale, Index, Disp, Segment, Val, Chain };
+ Ret = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops), 0);
+ }
+
+ // Copying the MachineMemOperand.
+ MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+ MemOp[0] = cast<MemSDNode>(Node)->getMemOperand();
+ cast<MachineSDNode>(Ret)->setMemRefs(MemOp, MemOp + 1);
+
+ // We need to have two outputs as that is what the original instruction had.
+ // So we add a dummy, undefined output. This is safe as we checked first
+ // that no-one uses our output anyway.
+ SDValue Undef = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
+ dl, NVT), 0);
+ SDValue RetVals[] = { Undef, Ret };
+ return CurDAG->getMergeValues(RetVals, dl).getNode();
+}
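+
+// For illustration: for something like __sync_fetch_and_add(p, 4) whose
+// result is unused, the code above selects a single "lock addl $4, (p)",
+// picking the mi8/mi/mr form out of the table based on the value operand,
+// and pairs it with an IMPLICIT_DEF standing in for the unused result.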
+
+/// Test whether the given X86ISD::CMP node has any uses which require the SF
+/// or OF bits to be accurate.
+static bool hasNoSignedComparisonUses(SDNode *N) {
+ // Examine each user of the node.
+ for (SDNode::use_iterator UI = N->use_begin(),
+ UE = N->use_end(); UI != UE; ++UI) {
+ // Only examine CopyToReg uses.
+ if (UI->getOpcode() != ISD::CopyToReg)
+ return false;
+ // Only examine CopyToReg uses that copy to EFLAGS.
+ if (cast<RegisterSDNode>(UI->getOperand(1))->getReg() !=
+ X86::EFLAGS)
+ return false;
+ // Examine each user of the CopyToReg use.
+ for (SDNode::use_iterator FlagUI = UI->use_begin(),
+ FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) {
+ // Only examine the Flag result.
+ if (FlagUI.getUse().getResNo() != 1) continue;
+ // Anything unusual: assume conservatively.
+ if (!FlagUI->isMachineOpcode()) return false;
+ // Examine the opcode of the user.
+ switch (FlagUI->getMachineOpcode()) {
+ // These comparisons don't treat the most significant bit specially.
+ case X86::SETAr: case X86::SETAEr: case X86::SETBr: case X86::SETBEr:
+ case X86::SETEr: case X86::SETNEr: case X86::SETPr: case X86::SETNPr:
+ case X86::SETAm: case X86::SETAEm: case X86::SETBm: case X86::SETBEm:
+ case X86::SETEm: case X86::SETNEm: case X86::SETPm: case X86::SETNPm:
+ case X86::JA_1: case X86::JAE_1: case X86::JB_1: case X86::JBE_1:
+ case X86::JE_1: case X86::JNE_1: case X86::JP_1: case X86::JNP_1:
+ case X86::CMOVA16rr: case X86::CMOVA16rm:
+ case X86::CMOVA32rr: case X86::CMOVA32rm:
+ case X86::CMOVA64rr: case X86::CMOVA64rm:
+ case X86::CMOVAE16rr: case X86::CMOVAE16rm:
+ case X86::CMOVAE32rr: case X86::CMOVAE32rm:
+ case X86::CMOVAE64rr: case X86::CMOVAE64rm:
+ case X86::CMOVB16rr: case X86::CMOVB16rm:
+ case X86::CMOVB32rr: case X86::CMOVB32rm:
+ case X86::CMOVB64rr: case X86::CMOVB64rm:
+ case X86::CMOVBE16rr: case X86::CMOVBE16rm:
+ case X86::CMOVBE32rr: case X86::CMOVBE32rm:
+ case X86::CMOVBE64rr: case X86::CMOVBE64rm:
+ case X86::CMOVE16rr: case X86::CMOVE16rm:
+ case X86::CMOVE32rr: case X86::CMOVE32rm:
+ case X86::CMOVE64rr: case X86::CMOVE64rm:
+ case X86::CMOVNE16rr: case X86::CMOVNE16rm:
+ case X86::CMOVNE32rr: case X86::CMOVNE32rm:
+ case X86::CMOVNE64rr: case X86::CMOVNE64rm:
+ case X86::CMOVNP16rr: case X86::CMOVNP16rm:
+ case X86::CMOVNP32rr: case X86::CMOVNP32rm:
+ case X86::CMOVNP64rr: case X86::CMOVNP64rm:
+ case X86::CMOVP16rr: case X86::CMOVP16rm:
+ case X86::CMOVP32rr: case X86::CMOVP32rm:
+ case X86::CMOVP64rr: case X86::CMOVP64rm:
+ continue;
+ // Anything else: assume conservatively.
+ default: return false;
+ }
+ }
+ }
+ return true;
+}
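+
+// For illustration: the list above contains exactly the unsigned and
+// equality/parity conditions (A, AE, B, BE, E, NE, P, NP), which read only
+// CF, ZF, and PF. A signed user such as SETG or JL reads SF and OF, so it
+// hits the conservative default and the function returns false.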
+
+/// Check whether or not the chain ending in StoreNode is suitable for doing
+/// the {load; increment or decrement; store} to modify transformation.
+static bool isLoadIncOrDecStore(StoreSDNode *StoreNode, unsigned Opc,
+ SDValue StoredVal, SelectionDAG *CurDAG,
+ LoadSDNode* &LoadNode, SDValue &InputChain) {
+
+  // Is the value stored the result of a DEC or INC?
+ if (!(Opc == X86ISD::DEC || Opc == X86ISD::INC)) return false;
+
+  // Is the stored value result 0 of the load?
+ if (StoredVal.getResNo() != 0) return false;
+
+  // Are there other uses of the loaded value than the inc or dec?
+ if (!StoredVal.getNode()->hasNUsesOfValue(1, 0)) return false;
+
+  // Is the store non-extending and non-indexed?
+ if (!ISD::isNormalStore(StoreNode) || StoreNode->isNonTemporal())
+ return false;
+
+ SDValue Load = StoredVal->getOperand(0);
+ // Is the stored value a non-extending and non-indexed load?
+ if (!ISD::isNormalLoad(Load.getNode())) return false;
+
+ // Return LoadNode by reference.
+ LoadNode = cast<LoadSDNode>(Load);
+  // Is the size of the value one that we can handle (i.e., 64, 32, 16, or 8
+  // bits)?
+ EVT LdVT = LoadNode->getMemoryVT();
+ if (LdVT != MVT::i64 && LdVT != MVT::i32 && LdVT != MVT::i16 &&
+ LdVT != MVT::i8)
+ return false;
+
+ // Is store the only read of the loaded value?
+ if (!Load.hasOneUse())
+ return false;
+
+ // Is the address of the store the same as the load?
+ if (LoadNode->getBasePtr() != StoreNode->getBasePtr() ||
+ LoadNode->getOffset() != StoreNode->getOffset())
+ return false;
+
+ // Check if the chain is produced by the load or is a TokenFactor with
+ // the load output chain as an operand. Return InputChain by reference.
+ SDValue Chain = StoreNode->getChain();
+
+ bool ChainCheck = false;
+ if (Chain == Load.getValue(1)) {
+ ChainCheck = true;
+ InputChain = LoadNode->getChain();
+ } else if (Chain.getOpcode() == ISD::TokenFactor) {
+ SmallVector<SDValue, 4> ChainOps;
+ for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) {
+ SDValue Op = Chain.getOperand(i);
+ if (Op == Load.getValue(1)) {
+ ChainCheck = true;
+ continue;
+ }
+
+ // Make sure using Op as part of the chain would not cause a cycle here.
+ // In theory, we could check whether the chain node is a predecessor of
+ // the load. But that can be very expensive. Instead visit the uses and
+ // make sure they all have smaller node id than the load.
+ int LoadId = LoadNode->getNodeId();
+ for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
+ UE = UI->use_end(); UI != UE; ++UI) {
+ if (UI.getUse().getResNo() != 0)
+ continue;
+ if (UI->getNodeId() > LoadId)
+ return false;
+ }
+
+ ChainOps.push_back(Op);
+ }
+
+ if (ChainCheck)
+ // Make a new TokenFactor with all the other input chains except
+ // for the load.
+ InputChain = CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain),
+ MVT::Other, ChainOps);
+ }
+ if (!ChainCheck)
+ return false;
+
+ return true;
+}
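+
+// For illustration, the shape matched above is roughly:
+//   t1: i32,ch = load [p]
+//   t2: i32    = X86ISD::INC t1
+//         ch   = store t2, [p]
+// with t1 and t2 otherwise unused and both memory operands identical, which
+// the caller then collapses into a single "incl (p)".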
+
+/// Get the appropriate X86 opcode for an in-memory increment or decrement.
+/// Opc should be X86ISD::DEC or X86ISD::INC.
+static unsigned getFusedLdStOpcode(EVT &LdVT, unsigned Opc) {
+ if (Opc == X86ISD::DEC) {
+ if (LdVT == MVT::i64) return X86::DEC64m;
+ if (LdVT == MVT::i32) return X86::DEC32m;
+ if (LdVT == MVT::i16) return X86::DEC16m;
+ if (LdVT == MVT::i8) return X86::DEC8m;
+ } else {
+ assert(Opc == X86ISD::INC && "unrecognized opcode");
+ if (LdVT == MVT::i64) return X86::INC64m;
+ if (LdVT == MVT::i32) return X86::INC32m;
+ if (LdVT == MVT::i16) return X86::INC16m;
+ if (LdVT == MVT::i8) return X86::INC8m;
+ }
+ llvm_unreachable("unrecognized size for LdVT");
+}
+
+/// Customized ISel for GATHER operations.
+SDNode *X86DAGToDAGISel::selectGather(SDNode *Node, unsigned Opc) {
+ // Operands of Gather: VSrc, Base, VIdx, VMask, Scale
+ SDValue Chain = Node->getOperand(0);
+ SDValue VSrc = Node->getOperand(2);
+ SDValue Base = Node->getOperand(3);
+ SDValue VIdx = Node->getOperand(4);
+ SDValue VMask = Node->getOperand(5);
+ ConstantSDNode *Scale = dyn_cast<ConstantSDNode>(Node->getOperand(6));
+ if (!Scale)
+ return nullptr;
+
+ SDVTList VTs = CurDAG->getVTList(VSrc.getValueType(), VSrc.getValueType(),
+ MVT::Other);
+
+ SDLoc DL(Node);
+
+ // Memory Operands: Base, Scale, Index, Disp, Segment
+ SDValue Disp = CurDAG->getTargetConstant(0, DL, MVT::i32);
+ SDValue Segment = CurDAG->getRegister(0, MVT::i32);
+ const SDValue Ops[] = { VSrc, Base, getI8Imm(Scale->getSExtValue(), DL), VIdx,
+ Disp, Segment, VMask, Chain};
+ SDNode *ResNode = CurDAG->getMachineNode(Opc, DL, VTs, Ops);
+ // Node has 2 outputs: VDst and MVT::Other.
+ // ResNode has 3 outputs: VDst, VMask_wb, and MVT::Other.
+ // We replace VDst of Node with VDst of ResNode, and Other of Node with Other
+ // of ResNode.
+ ReplaceUses(SDValue(Node, 0), SDValue(ResNode, 0));
+ ReplaceUses(SDValue(Node, 1), SDValue(ResNode, 2));
+ return ResNode;
+}
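+
+// For illustration: the gather instructions take the usual five-operand
+// memory reference with the vector index register in the index slot; in
+// AT&T syntax, roughly "vgatherdps %xmm2, (%rax,%xmm1,4), %xmm0", where the
+// mask (%xmm2 here) is consumed and written back as the extra VMask_wb
+// result mentioned above.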
+
+SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
+ MVT NVT = Node->getSimpleValueType(0);
+ unsigned Opc, MOpc;
+ unsigned Opcode = Node->getOpcode();
+ SDLoc dl(Node);
+
+ DEBUG(dbgs() << "Selecting: "; Node->dump(CurDAG); dbgs() << '\n');
+
+ if (Node->isMachineOpcode()) {
+ DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n');
+ Node->setNodeId(-1);
+ return nullptr; // Already selected.
+ }
+
+ switch (Opcode) {
+ default: break;
+ case ISD::BRIND: {
+ if (Subtarget->isTargetNaCl())
+      // NaCl has its own pass in which jmp %r32 instructions are converted
+      // to jmp %r64, so leave the instruction alone here.
+ break;
+ if (Subtarget->isTarget64BitILP32()) {
+      // Convert the 32-bit target register to a zero-extended 64-bit
+      // version of it. This is needed because x86-64 can do many things,
+      // but jmp %r32 is not one of them.
+ const SDValue &Target = Node->getOperand(1);
+ assert(Target.getSimpleValueType() == llvm::MVT::i32);
+ SDValue ZextTarget = CurDAG->getZExtOrTrunc(Target, dl, EVT(MVT::i64));
+ SDValue Brind = CurDAG->getNode(ISD::BRIND, dl, MVT::Other,
+ Node->getOperand(0), ZextTarget);
+ ReplaceUses(SDValue(Node, 0), Brind);
+ SelectCode(ZextTarget.getNode());
+ SelectCode(Brind.getNode());
+ return nullptr;
+ }
+ break;
+ }
+ case ISD::INTRINSIC_W_CHAIN: {
+ unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
+ switch (IntNo) {
+ default: break;
+ case Intrinsic::x86_avx2_gather_d_pd:
+ case Intrinsic::x86_avx2_gather_d_pd_256:
+ case Intrinsic::x86_avx2_gather_q_pd:
+ case Intrinsic::x86_avx2_gather_q_pd_256:
+ case Intrinsic::x86_avx2_gather_d_ps:
+ case Intrinsic::x86_avx2_gather_d_ps_256:
+ case Intrinsic::x86_avx2_gather_q_ps:
+ case Intrinsic::x86_avx2_gather_q_ps_256:
+ case Intrinsic::x86_avx2_gather_d_q:
+ case Intrinsic::x86_avx2_gather_d_q_256:
+ case Intrinsic::x86_avx2_gather_q_q:
+ case Intrinsic::x86_avx2_gather_q_q_256:
+ case Intrinsic::x86_avx2_gather_d_d:
+ case Intrinsic::x86_avx2_gather_d_d_256:
+ case Intrinsic::x86_avx2_gather_q_d:
+ case Intrinsic::x86_avx2_gather_q_d_256: {
+ if (!Subtarget->hasAVX2())
+ break;
+ unsigned Opc;
+ switch (IntNo) {
+ default: llvm_unreachable("Impossible intrinsic");
+ case Intrinsic::x86_avx2_gather_d_pd: Opc = X86::VGATHERDPDrm; break;
+ case Intrinsic::x86_avx2_gather_d_pd_256: Opc = X86::VGATHERDPDYrm; break;
+ case Intrinsic::x86_avx2_gather_q_pd: Opc = X86::VGATHERQPDrm; break;
+ case Intrinsic::x86_avx2_gather_q_pd_256: Opc = X86::VGATHERQPDYrm; break;
+ case Intrinsic::x86_avx2_gather_d_ps: Opc = X86::VGATHERDPSrm; break;
+ case Intrinsic::x86_avx2_gather_d_ps_256: Opc = X86::VGATHERDPSYrm; break;
+ case Intrinsic::x86_avx2_gather_q_ps: Opc = X86::VGATHERQPSrm; break;
+ case Intrinsic::x86_avx2_gather_q_ps_256: Opc = X86::VGATHERQPSYrm; break;
+ case Intrinsic::x86_avx2_gather_d_q: Opc = X86::VPGATHERDQrm; break;
+ case Intrinsic::x86_avx2_gather_d_q_256: Opc = X86::VPGATHERDQYrm; break;
+ case Intrinsic::x86_avx2_gather_q_q: Opc = X86::VPGATHERQQrm; break;
+ case Intrinsic::x86_avx2_gather_q_q_256: Opc = X86::VPGATHERQQYrm; break;
+ case Intrinsic::x86_avx2_gather_d_d: Opc = X86::VPGATHERDDrm; break;
+ case Intrinsic::x86_avx2_gather_d_d_256: Opc = X86::VPGATHERDDYrm; break;
+ case Intrinsic::x86_avx2_gather_q_d: Opc = X86::VPGATHERQDrm; break;
+ case Intrinsic::x86_avx2_gather_q_d_256: Opc = X86::VPGATHERQDYrm; break;
+ }
+ SDNode *RetVal = selectGather(Node, Opc);
+ if (RetVal)
+        // We already called ReplaceUses inside selectGather.
+ return nullptr;
+ break;
+ }
+ }
+ break;
+ }
+ case X86ISD::GlobalBaseReg:
+ return getGlobalBaseReg();
+
+ case X86ISD::SHRUNKBLEND: {
+ // SHRUNKBLEND selects like a regular VSELECT.
+ SDValue VSelect = CurDAG->getNode(
+ ISD::VSELECT, SDLoc(Node), Node->getValueType(0), Node->getOperand(0),
+ Node->getOperand(1), Node->getOperand(2));
+ ReplaceUses(SDValue(Node, 0), VSelect);
+ SelectCode(VSelect.getNode());
+ // We already called ReplaceUses.
+ return nullptr;
+ }
+
+ case ISD::ATOMIC_LOAD_XOR:
+ case ISD::ATOMIC_LOAD_AND:
+ case ISD::ATOMIC_LOAD_OR:
+ case ISD::ATOMIC_LOAD_ADD: {
+ SDNode *RetVal = selectAtomicLoadArith(Node, NVT);
+ if (RetVal)
+ return RetVal;
+ break;
+ }
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR: {
+ // For operations of the form (x << C1) op C2, check if we can use a smaller
+ // encoding for C2 by transforming it into (x op (C2>>C1)) << C1.
+ SDValue N0 = Node->getOperand(0);
+ SDValue N1 = Node->getOperand(1);
+
+ if (N0->getOpcode() != ISD::SHL || !N0->hasOneUse())
+ break;
+
+ // i8 is unshrinkable, i16 should be promoted to i32.
+ if (NVT != MVT::i32 && NVT != MVT::i64)
+ break;
+
+ ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N1);
+ ConstantSDNode *ShlCst = dyn_cast<ConstantSDNode>(N0->getOperand(1));
+ if (!Cst || !ShlCst)
+ break;
+
+ int64_t Val = Cst->getSExtValue();
+ uint64_t ShlVal = ShlCst->getZExtValue();
+
+ // Make sure that we don't change the operation by removing bits.
+ // This only matters for OR and XOR, AND is unaffected.
+ uint64_t RemovedBitsMask = (1ULL << ShlVal) - 1;
+ if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0)
+ break;
+
+ unsigned ShlOp, AddOp, Op;
+ MVT CstVT = NVT;
+
+ // Check the minimum bitwidth for the new constant.
+ // TODO: AND32ri is the same as AND64ri32 with zext imm.
+ // TODO: MOV32ri+OR64r is cheaper than MOV64ri64+OR64rr
+ // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32.
+ if (!isInt<8>(Val) && isInt<8>(Val >> ShlVal))
+ CstVT = MVT::i8;
+ else if (!isInt<32>(Val) && isInt<32>(Val >> ShlVal))
+ CstVT = MVT::i32;
+
+ // Bail if there is no smaller encoding.
+ if (NVT == CstVT)
+ break;
+
+ switch (NVT.SimpleTy) {
+ default: llvm_unreachable("Unsupported VT!");
+ case MVT::i32:
+ assert(CstVT == MVT::i8);
+ ShlOp = X86::SHL32ri;
+ AddOp = X86::ADD32rr;
+
+ switch (Opcode) {
+ default: llvm_unreachable("Impossible opcode");
+ case ISD::AND: Op = X86::AND32ri8; break;
+ case ISD::OR: Op = X86::OR32ri8; break;
+ case ISD::XOR: Op = X86::XOR32ri8; break;
+ }
+ break;
+ case MVT::i64:
+ assert(CstVT == MVT::i8 || CstVT == MVT::i32);
+ ShlOp = X86::SHL64ri;
+ AddOp = X86::ADD64rr;
+
+ switch (Opcode) {
+ default: llvm_unreachable("Impossible opcode");
+ case ISD::AND: Op = CstVT==MVT::i8? X86::AND64ri8 : X86::AND64ri32; break;
+ case ISD::OR: Op = CstVT==MVT::i8? X86::OR64ri8 : X86::OR64ri32; break;
+ case ISD::XOR: Op = CstVT==MVT::i8? X86::XOR64ri8 : X86::XOR64ri32; break;
+ }
+ break;
+ }
+
+ // Emit the smaller op and the shift.
+ SDValue NewCst = CurDAG->getTargetConstant(Val >> ShlVal, dl, CstVT);
+ SDNode *New = CurDAG->getMachineNode(Op, dl, NVT, N0->getOperand(0),NewCst);
+ if (ShlVal == 1)
+ return CurDAG->SelectNodeTo(Node, AddOp, NVT, SDValue(New, 0),
+ SDValue(New, 0));
+ return CurDAG->SelectNodeTo(Node, ShlOp, NVT, SDValue(New, 0),
+ getI8Imm(ShlVal, dl));
+ }
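+  // For illustration of the transform above: (x << 8) | 0x0F00 becomes
+  // (x | 0x0F) << 8, shrinking the OR immediate from the 32-bit OR32ri form
+  // to the sign-extended 8-bit OR32ri8 form; the shift is then re-emitted
+  // after the narrowed op, or replaced by an add of the value to itself
+  // when ShlVal is 1.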
+ case X86ISD::UMUL8:
+ case X86ISD::SMUL8: {
+ SDValue N0 = Node->getOperand(0);
+ SDValue N1 = Node->getOperand(1);
+
+ Opc = (Opcode == X86ISD::SMUL8 ? X86::IMUL8r : X86::MUL8r);
+
+ SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::AL,
+ N0, SDValue()).getValue(1);
+
+ SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32);
+ SDValue Ops[] = {N1, InFlag};
+ SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
+
+ ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
+ ReplaceUses(SDValue(Node, 1), SDValue(CNode, 1));
+ return nullptr;
+ }
+
+ case X86ISD::UMUL: {
+ SDValue N0 = Node->getOperand(0);
+ SDValue N1 = Node->getOperand(1);
+
+ unsigned LoReg;
+ switch (NVT.SimpleTy) {
+ default: llvm_unreachable("Unsupported VT!");
+ case MVT::i8: LoReg = X86::AL; Opc = X86::MUL8r; break;
+ case MVT::i16: LoReg = X86::AX; Opc = X86::MUL16r; break;
+ case MVT::i32: LoReg = X86::EAX; Opc = X86::MUL32r; break;
+ case MVT::i64: LoReg = X86::RAX; Opc = X86::MUL64r; break;
+ }
+
+ SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
+ N0, SDValue()).getValue(1);
+
+ SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::i32);
+ SDValue Ops[] = {N1, InFlag};
+ SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
+
+ ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
+ ReplaceUses(SDValue(Node, 1), SDValue(CNode, 1));
+ ReplaceUses(SDValue(Node, 2), SDValue(CNode, 2));
+ return nullptr;
+ }
+
+ case ISD::SMUL_LOHI:
+ case ISD::UMUL_LOHI: {
+ SDValue N0 = Node->getOperand(0);
+ SDValue N1 = Node->getOperand(1);
+
+ bool isSigned = Opcode == ISD::SMUL_LOHI;
+ bool hasBMI2 = Subtarget->hasBMI2();
+ if (!isSigned) {
+ switch (NVT.SimpleTy) {
+ default: llvm_unreachable("Unsupported VT!");
+ case MVT::i8: Opc = X86::MUL8r; MOpc = X86::MUL8m; break;
+ case MVT::i16: Opc = X86::MUL16r; MOpc = X86::MUL16m; break;
+ case MVT::i32: Opc = hasBMI2 ? X86::MULX32rr : X86::MUL32r;
+ MOpc = hasBMI2 ? X86::MULX32rm : X86::MUL32m; break;
+ case MVT::i64: Opc = hasBMI2 ? X86::MULX64rr : X86::MUL64r;
+ MOpc = hasBMI2 ? X86::MULX64rm : X86::MUL64m; break;
+ }
+ } else {
+ switch (NVT.SimpleTy) {
+ default: llvm_unreachable("Unsupported VT!");
+ case MVT::i8: Opc = X86::IMUL8r; MOpc = X86::IMUL8m; break;
+ case MVT::i16: Opc = X86::IMUL16r; MOpc = X86::IMUL16m; break;
+ case MVT::i32: Opc = X86::IMUL32r; MOpc = X86::IMUL32m; break;
+ case MVT::i64: Opc = X86::IMUL64r; MOpc = X86::IMUL64m; break;
+ }
+ }
+
+ unsigned SrcReg, LoReg, HiReg;
+ switch (Opc) {
+ default: llvm_unreachable("Unknown MUL opcode!");
+ case X86::IMUL8r:
+ case X86::MUL8r:
+ SrcReg = LoReg = X86::AL; HiReg = X86::AH;
+ break;
+ case X86::IMUL16r:
+ case X86::MUL16r:
+ SrcReg = LoReg = X86::AX; HiReg = X86::DX;
+ break;
+ case X86::IMUL32r:
+ case X86::MUL32r:
+ SrcReg = LoReg = X86::EAX; HiReg = X86::EDX;
+ break;
+ case X86::IMUL64r:
+ case X86::MUL64r:
+ SrcReg = LoReg = X86::RAX; HiReg = X86::RDX;
+ break;
+ case X86::MULX32rr:
+ SrcReg = X86::EDX; LoReg = HiReg = 0;
+ break;
+ case X86::MULX64rr:
+ SrcReg = X86::RDX; LoReg = HiReg = 0;
+ break;
+ }
+
+ SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
+ bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
+    // Multiplication is commutative.
+ if (!foldedLoad) {
+ foldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
+ if (foldedLoad)
+ std::swap(N0, N1);
+ }
+
+ SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, SrcReg,
+ N0, SDValue()).getValue(1);
+ SDValue ResHi, ResLo;
+
+ if (foldedLoad) {
+ SDValue Chain;
+ SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
+ InFlag };
+ if (MOpc == X86::MULX32rm || MOpc == X86::MULX64rm) {
+ SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Other, MVT::Glue);
+ SDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
+ ResHi = SDValue(CNode, 0);
+ ResLo = SDValue(CNode, 1);
+ Chain = SDValue(CNode, 2);
+ InFlag = SDValue(CNode, 3);
+ } else {
+ SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
+ SDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
+ Chain = SDValue(CNode, 0);
+ InFlag = SDValue(CNode, 1);
+ }
+
+ // Update the chain.
+ ReplaceUses(N1.getValue(1), Chain);
+ } else {
+ SDValue Ops[] = { N1, InFlag };
+ if (Opc == X86::MULX32rr || Opc == X86::MULX64rr) {
+ SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Glue);
+ SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
+ ResHi = SDValue(CNode, 0);
+ ResLo = SDValue(CNode, 1);
+ InFlag = SDValue(CNode, 2);
+ } else {
+ SDVTList VTs = CurDAG->getVTList(MVT::Glue);
+ SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
+ InFlag = SDValue(CNode, 0);
+ }
+ }
+
+ // Prevent use of AH in a REX instruction by referencing AX instead.
+ if (HiReg == X86::AH && Subtarget->is64Bit() &&
+ !SDValue(Node, 1).use_empty()) {
+ SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
+ X86::AX, MVT::i16, InFlag);
+ InFlag = Result.getValue(2);
+ // Get the low part if needed. Don't use getCopyFromReg for aliasing
+ // registers.
+ if (!SDValue(Node, 0).use_empty())
+        ReplaceUses(SDValue(Node, 0),
+ CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result));
+
+ // Shift AX down 8 bits.
+ Result = SDValue(CurDAG->getMachineNode(X86::SHR16ri, dl, MVT::i16,
+ Result,
+ CurDAG->getTargetConstant(8, dl, MVT::i8)),
+ 0);
+ // Then truncate it down to i8.
+ ReplaceUses(SDValue(Node, 1),
+ CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result));
+ }
+ // Copy the low half of the result, if it is needed.
+ if (!SDValue(Node, 0).use_empty()) {
+ if (!ResLo.getNode()) {
+ assert(LoReg && "Register for low half is not defined!");
+ ResLo = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg, NVT,
+ InFlag);
+ InFlag = ResLo.getValue(2);
+ }
+ ReplaceUses(SDValue(Node, 0), ResLo);
+ DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG); dbgs() << '\n');
+ }
+ // Copy the high half of the result, if it is needed.
+ if (!SDValue(Node, 1).use_empty()) {
+ if (!ResHi.getNode()) {
+ assert(HiReg && "Register for high half is not defined!");
+ ResHi = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg, NVT,
+ InFlag);
+ InFlag = ResHi.getValue(2);
+ }
+ ReplaceUses(SDValue(Node, 1), ResHi);
+ DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG); dbgs() << '\n');
+ }
+
+ return nullptr;
+ }
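+  // For illustration: without BMI2, an i32 umul_lohi selects to "mull %ecx"
+  // with the product split across EDX:EAX; with BMI2, MULX32rr reads the
+  // implicit EDX source and writes both halves to arbitrary registers
+  // (result 0 being the high half above) without clobbering EFLAGS.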
+
+ case ISD::SDIVREM:
+ case ISD::UDIVREM:
+ case X86ISD::SDIVREM8_SEXT_HREG:
+ case X86ISD::UDIVREM8_ZEXT_HREG: {
+ SDValue N0 = Node->getOperand(0);
+ SDValue N1 = Node->getOperand(1);
+
+ bool isSigned = (Opcode == ISD::SDIVREM ||
+ Opcode == X86ISD::SDIVREM8_SEXT_HREG);
+ if (!isSigned) {
+ switch (NVT.SimpleTy) {
+ default: llvm_unreachable("Unsupported VT!");
+ case MVT::i8: Opc = X86::DIV8r; MOpc = X86::DIV8m; break;
+ case MVT::i16: Opc = X86::DIV16r; MOpc = X86::DIV16m; break;
+ case MVT::i32: Opc = X86::DIV32r; MOpc = X86::DIV32m; break;
+ case MVT::i64: Opc = X86::DIV64r; MOpc = X86::DIV64m; break;
+ }
+ } else {
+ switch (NVT.SimpleTy) {
+ default: llvm_unreachable("Unsupported VT!");
+ case MVT::i8: Opc = X86::IDIV8r; MOpc = X86::IDIV8m; break;
+ case MVT::i16: Opc = X86::IDIV16r; MOpc = X86::IDIV16m; break;
+ case MVT::i32: Opc = X86::IDIV32r; MOpc = X86::IDIV32m; break;
+ case MVT::i64: Opc = X86::IDIV64r; MOpc = X86::IDIV64m; break;
+ }
+ }
+
+ unsigned LoReg, HiReg, ClrReg;
+ unsigned SExtOpcode;
+ switch (NVT.SimpleTy) {
+ default: llvm_unreachable("Unsupported VT!");
+ case MVT::i8:
+ LoReg = X86::AL; ClrReg = HiReg = X86::AH;
+ SExtOpcode = X86::CBW;
+ break;
+ case MVT::i16:
+ LoReg = X86::AX; HiReg = X86::DX;
+ ClrReg = X86::DX;
+ SExtOpcode = X86::CWD;
+ break;
+ case MVT::i32:
+ LoReg = X86::EAX; ClrReg = HiReg = X86::EDX;
+ SExtOpcode = X86::CDQ;
+ break;
+ case MVT::i64:
+ LoReg = X86::RAX; ClrReg = HiReg = X86::RDX;
+ SExtOpcode = X86::CQO;
+ break;
+ }
+
+ SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
+ bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
+ bool signBitIsZero = CurDAG->SignBitIsZero(N0);
+
+ SDValue InFlag;
+ if (NVT == MVT::i8 && (!isSigned || signBitIsZero)) {
+ // Special case for div8, just use a move with zero extension to AX to
+ // clear the upper 8 bits (AH).
+ SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Move, Chain;
+ if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
+ SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
+ Move =
+ SDValue(CurDAG->getMachineNode(X86::MOVZX32rm8, dl, MVT::i32,
+ MVT::Other, Ops), 0);
+ Chain = Move.getValue(1);
+ ReplaceUses(N0.getValue(1), Chain);
+ } else {
+ Move =
+ SDValue(CurDAG->getMachineNode(X86::MOVZX32rr8, dl, MVT::i32, N0),0);
+ Chain = CurDAG->getEntryNode();
+ }
+ Chain = CurDAG->getCopyToReg(Chain, dl, X86::EAX, Move, SDValue());
+ InFlag = Chain.getValue(1);
+ } else {
+ InFlag =
+ CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl,
+ LoReg, N0, SDValue()).getValue(1);
+ if (isSigned && !signBitIsZero) {
+ // Sign extend the low part into the high part.
+ InFlag =
+ SDValue(CurDAG->getMachineNode(SExtOpcode, dl, MVT::Glue, InFlag),0);
+ } else {
+ // Zero out the high part, effectively zero extending the input.
+ SDValue ClrNode = SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, NVT), 0);
+ switch (NVT.SimpleTy) {
+ case MVT::i16:
+ ClrNode =
+ SDValue(CurDAG->getMachineNode(
+ TargetOpcode::EXTRACT_SUBREG, dl, MVT::i16, ClrNode,
+ CurDAG->getTargetConstant(X86::sub_16bit, dl,
+ MVT::i32)),
+ 0);
+ break;
+ case MVT::i32:
+ break;
+ case MVT::i64:
+ ClrNode =
+ SDValue(CurDAG->getMachineNode(
+ TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
+ CurDAG->getTargetConstant(0, dl, MVT::i64), ClrNode,
+ CurDAG->getTargetConstant(X86::sub_32bit, dl,
+ MVT::i32)),
+ 0);
+ break;
+ default:
+ llvm_unreachable("Unexpected division source");
+ }
+
+ InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, ClrReg,
+ ClrNode, InFlag).getValue(1);
+ }
+ }
+
+ if (foldedLoad) {
+ SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
+ InFlag };
+ SDNode *CNode =
+ CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Glue, Ops);
+ InFlag = SDValue(CNode, 1);
+ // Update the chain.
+ ReplaceUses(N1.getValue(1), SDValue(CNode, 0));
+ } else {
+ InFlag =
+ SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, N1, InFlag), 0);
+ }
+
+ // Prevent use of AH in a REX instruction by explicitly copying it to
+ // an ABCD_L register.
+ //
+ // The current assumption of the register allocator is that isel
+ // won't generate explicit references to the GR8_ABCD_H registers. If
+ // the allocator and/or the backend get enhanced to be more robust in
+ // that regard, this can be, and should be, removed.
+ if (HiReg == X86::AH && !SDValue(Node, 1).use_empty()) {
+ SDValue AHCopy = CurDAG->getRegister(X86::AH, MVT::i8);
+ unsigned AHExtOpcode =
+ isSigned ? X86::MOVSX32_NOREXrr8 : X86::MOVZX32_NOREXrr8;
+
+ SDNode *RNode = CurDAG->getMachineNode(AHExtOpcode, dl, MVT::i32,
+ MVT::Glue, AHCopy, InFlag);
+ SDValue Result(RNode, 0);
+ InFlag = SDValue(RNode, 1);
+
+ if (Opcode == X86ISD::UDIVREM8_ZEXT_HREG ||
+ Opcode == X86ISD::SDIVREM8_SEXT_HREG) {
+ if (Node->getValueType(1) == MVT::i64) {
+          // It's not possible to directly movsx AH to a 64-bit register,
+          // because the destination would need the REX prefix, but an
+          // h-register cannot be encoded in a REX instruction.
+ assert(Opcode != X86ISD::SDIVREM8_SEXT_HREG &&
+ "Unexpected i64 sext of h-register");
+ Result =
+ SDValue(CurDAG->getMachineNode(
+ TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
+ CurDAG->getTargetConstant(0, dl, MVT::i64), Result,
+ CurDAG->getTargetConstant(X86::sub_32bit, dl,
+ MVT::i32)),
+ 0);
+ }
+ } else {
+ Result =
+ CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result);
+ }
+ ReplaceUses(SDValue(Node, 1), Result);
+ DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n');
+ }
+ // Copy the division (low) result, if it is needed.
+ if (!SDValue(Node, 0).use_empty()) {
+ SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
+ LoReg, NVT, InFlag);
+ InFlag = Result.getValue(2);
+ ReplaceUses(SDValue(Node, 0), Result);
+ DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n');
+ }
+ // Copy the remainder (high) result, if it is needed.
+ if (!SDValue(Node, 1).use_empty()) {
+ SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
+ HiReg, NVT, InFlag);
+ InFlag = Result.getValue(2);
+ ReplaceUses(SDValue(Node, 1), Result);
+ DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n');
+ }
+ return nullptr;
+ }
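+  // For illustration: an i32 sdiv/srem pair selects to CDQ (sign-extending
+  // EAX into EDX) followed by "idivl %ecx"; the quotient is then read from
+  // EAX and the remainder from EDX. The unsigned path zeroes the high
+  // register with MOV32r0 instead of sign-extending.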
+
+ case X86ISD::CMP:
+ case X86ISD::SUB: {
+    // Sometimes a SUB is used to perform a comparison.
+ if (Opcode == X86ISD::SUB && Node->hasAnyUseOfValue(0))
+ // This node is not a CMP.
+ break;
+ SDValue N0 = Node->getOperand(0);
+ SDValue N1 = Node->getOperand(1);
+
+ if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse() &&
+ hasNoSignedComparisonUses(Node))
+ N0 = N0.getOperand(0);
+
+ // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to
+ // use a smaller encoding.
+ // Look past the truncate if CMP is the only use of it.
+ if ((N0.getNode()->getOpcode() == ISD::AND ||
+ (N0.getResNo() == 0 && N0.getNode()->getOpcode() == X86ISD::AND)) &&
+ N0.getNode()->hasOneUse() &&
+ N0.getValueType() != MVT::i8 &&
+ X86::isZeroNode(N1)) {
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getNode()->getOperand(1));
+ if (!C) break;
+
+ // For example, convert "testl %eax, $8" to "testb %al, $8"
+ if ((C->getZExtValue() & ~UINT64_C(0xff)) == 0 &&
+ (!(C->getZExtValue() & 0x80) ||
+ hasNoSignedComparisonUses(Node))) {
+ SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), dl, MVT::i8);
+ SDValue Reg = N0.getNode()->getOperand(0);
+
+ // On x86-32, only the ABCD registers have 8-bit subregisters.
+ if (!Subtarget->is64Bit()) {
+ const TargetRegisterClass *TRC;
+ switch (N0.getSimpleValueType().SimpleTy) {
+ case MVT::i32: TRC = &X86::GR32_ABCDRegClass; break;
+ case MVT::i16: TRC = &X86::GR16_ABCDRegClass; break;
+ default: llvm_unreachable("Unsupported TEST operand type!");
+ }
+ SDValue RC = CurDAG->getTargetConstant(TRC->getID(), dl, MVT::i32);
+ Reg = SDValue(CurDAG->getMachineNode(X86::COPY_TO_REGCLASS, dl,
+ Reg.getValueType(), Reg, RC), 0);
+ }
+
+ // Extract the l-register.
+ SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl,
+ MVT::i8, Reg);
+
+ // Emit a testb.
+ SDNode *NewNode = CurDAG->getMachineNode(X86::TEST8ri, dl, MVT::i32,
+ Subreg, Imm);
+        // Replace SUB|CMP with TEST; since SUB has two outputs while TEST
+        // has only one, do not call ReplaceAllUsesWith.
+ ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
+ SDValue(NewNode, 0));
+ return nullptr;
+ }
+
+ // For example, "testl %eax, $2048" to "testb %ah, $8".
+ if ((C->getZExtValue() & ~UINT64_C(0xff00)) == 0 &&
+ (!(C->getZExtValue() & 0x8000) ||
+ hasNoSignedComparisonUses(Node))) {
+ // Shift the immediate right by 8 bits.
+ SDValue ShiftedImm = CurDAG->getTargetConstant(C->getZExtValue() >> 8,
+ dl, MVT::i8);
+ SDValue Reg = N0.getNode()->getOperand(0);
+
+ // Put the value in an ABCD register.
+ const TargetRegisterClass *TRC;
+ switch (N0.getSimpleValueType().SimpleTy) {
+ case MVT::i64: TRC = &X86::GR64_ABCDRegClass; break;
+ case MVT::i32: TRC = &X86::GR32_ABCDRegClass; break;
+ case MVT::i16: TRC = &X86::GR16_ABCDRegClass; break;
+ default: llvm_unreachable("Unsupported TEST operand type!");
+ }
+ SDValue RC = CurDAG->getTargetConstant(TRC->getID(), dl, MVT::i32);
+ Reg = SDValue(CurDAG->getMachineNode(X86::COPY_TO_REGCLASS, dl,
+ Reg.getValueType(), Reg, RC), 0);
+
+ // Extract the h-register.
+ SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_8bit_hi, dl,
+ MVT::i8, Reg);
+
+ // Emit a testb. The EXTRACT_SUBREG becomes a COPY that can only
+ // target GR8_NOREX registers, so make sure the register class is
+ // forced.
+ SDNode *NewNode = CurDAG->getMachineNode(X86::TEST8ri_NOREX, dl,
+ MVT::i32, Subreg, ShiftedImm);
+        // Replace SUB|CMP with TEST; since SUB has two outputs while TEST
+        // has only one, do not call ReplaceAllUsesWith.
+ ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
+ SDValue(NewNode, 0));
+ return nullptr;
+ }
+
+ // For example, "testl %eax, $32776" to "testw %ax, $32776".
+ if ((C->getZExtValue() & ~UINT64_C(0xffff)) == 0 &&
+ N0.getValueType() != MVT::i16 &&
+ (!(C->getZExtValue() & 0x8000) ||
+ hasNoSignedComparisonUses(Node))) {
+ SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), dl,
+ MVT::i16);
+ SDValue Reg = N0.getNode()->getOperand(0);
+
+ // Extract the 16-bit subregister.
+ SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_16bit, dl,
+ MVT::i16, Reg);
+
+ // Emit a testw.
+ SDNode *NewNode = CurDAG->getMachineNode(X86::TEST16ri, dl, MVT::i32,
+ Subreg, Imm);
+        // Replace SUB|CMP with TEST; since SUB has two outputs while TEST
+        // has only one, do not call ReplaceAllUsesWith.
+ ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
+ SDValue(NewNode, 0));
+ return nullptr;
+ }
+
+ // For example, "testq %rax, $268468232" to "testl %eax, $268468232".
+ if ((C->getZExtValue() & ~UINT64_C(0xffffffff)) == 0 &&
+ N0.getValueType() == MVT::i64 &&
+ (!(C->getZExtValue() & 0x80000000) ||
+ hasNoSignedComparisonUses(Node))) {
+ SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), dl,
+ MVT::i32);
+ SDValue Reg = N0.getNode()->getOperand(0);
+
+ // Extract the 32-bit subregister.
+ SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_32bit, dl,
+ MVT::i32, Reg);
+
+ // Emit a testl.
+ SDNode *NewNode = CurDAG->getMachineNode(X86::TEST32ri, dl, MVT::i32,
+ Subreg, Imm);
+        // Replace SUB|CMP with TEST; since SUB has two outputs while TEST
+        // has only one, do not call ReplaceAllUsesWith.
+ ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
+ SDValue(NewNode, 0));
+ return nullptr;
+ }
+ }
+ break;
+ }
+ case ISD::STORE: {
+ // Change a chain of {load; incr or dec; store} of the same value into
+ // a simple increment or decrement through memory of that value, if the
+ // uses of the modified value and its address are suitable.
+ // The DEC64m tablegen pattern is currently not able to match the case where
+ // the EFLAGS on the original DEC are used. (This also applies to
+ // {INC,DEC}X{64,32,16,8}.)
+ // We'll need to improve tablegen to allow flags to be transferred from a
+    // node in the pattern to the result node, probably with a new keyword.
+    // For example, we currently have this:
+ // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
+ // [(store (add (loadi64 addr:$dst), -1), addr:$dst),
+ // (implicit EFLAGS)]>;
+    // but may need something like this:
+ // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
+ // [(store (add (loadi64 addr:$dst), -1), addr:$dst),
+ // (transferrable EFLAGS)]>;
+
+ StoreSDNode *StoreNode = cast<StoreSDNode>(Node);
+ SDValue StoredVal = StoreNode->getOperand(1);
+ unsigned Opc = StoredVal->getOpcode();
+
+ LoadSDNode *LoadNode = nullptr;
+ SDValue InputChain;
+ if (!isLoadIncOrDecStore(StoreNode, Opc, StoredVal, CurDAG,
+ LoadNode, InputChain))
+ break;
+
+ SDValue Base, Scale, Index, Disp, Segment;
+ if (!selectAddr(LoadNode, LoadNode->getBasePtr(),
+ Base, Scale, Index, Disp, Segment))
+ break;
+
+ MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(2);
+ MemOp[0] = StoreNode->getMemOperand();
+ MemOp[1] = LoadNode->getMemOperand();
+ const SDValue Ops[] = { Base, Scale, Index, Disp, Segment, InputChain };
+ EVT LdVT = LoadNode->getMemoryVT();
+ unsigned newOpc = getFusedLdStOpcode(LdVT, Opc);
+ MachineSDNode *Result = CurDAG->getMachineNode(newOpc,
+ SDLoc(Node),
+ MVT::i32, MVT::Other, Ops);
+ Result->setMemRefs(MemOp, MemOp + 2);
+
+ ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1));
+ ReplaceUses(SDValue(StoredVal.getNode(), 1), SDValue(Result, 0));
+
+ return Result;
+ }
+ }
+
+ SDNode *ResNode = SelectCode(Node);
+
+ DEBUG(dbgs() << "=> ";
+ if (ResNode == nullptr || ResNode == Node)
+ Node->dump(CurDAG);
+ else
+ ResNode->dump(CurDAG);
+ dbgs() << '\n');
+
+ return ResNode;
+}
+
+bool X86DAGToDAGISel::
+SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID,
+ std::vector<SDValue> &OutOps) {
+ SDValue Op0, Op1, Op2, Op3, Op4;
+ switch (ConstraintID) {
+ default:
+ llvm_unreachable("Unexpected asm memory constraint");
+ case InlineAsm::Constraint_i:
+ // FIXME: It seems strange that 'i' is needed here since it's supposed to
+ // be an immediate and not a memory constraint.
+ // Fallthrough.
+ case InlineAsm::Constraint_o: // offsetable ??
+ case InlineAsm::Constraint_v: // not offsetable ??
+ case InlineAsm::Constraint_m: // memory
+ case InlineAsm::Constraint_X:
+ if (!selectAddr(nullptr, Op, Op0, Op1, Op2, Op3, Op4))
+ return true;
+ break;
+ }
+
+ OutOps.push_back(Op0);
+ OutOps.push_back(Op1);
+ OutOps.push_back(Op2);
+ OutOps.push_back(Op3);
+ OutOps.push_back(Op4);
+ return false;
+}
+
+/// This pass converts a legalized DAG into an X86-specific DAG,
+/// ready for instruction scheduling.
+FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM,
+ CodeGenOpt::Level OptLevel) {
+ return new X86DAGToDAGISel(TM, OptLevel);
+}
diff --git a/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp
new file mode 100644
index 0000000..d31aab0
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -0,0 +1,28765 @@
+//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that X86 uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86ISelLowering.h"
+#include "Utils/X86ShuffleDecode.h"
+#include "X86CallingConv.h"
+#include "X86FrameLowering.h"
+#include "X86InstrBuilder.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86ShuffleDecodeConstantPool.h"
+#include "X86TargetMachine.h"
+#include "X86TargetObjectFile.h"
+#include "llvm/ADT/SmallBitVector.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/CodeGen/IntrinsicLowering.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/WinEHFuncInfo.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetOptions.h"
+#include "X86IntrinsicsInfo.h"
+#include <bitset>
+#include <numeric>
+#include <cctype>
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-isel"
+
+STATISTIC(NumTailCalls, "Number of tail calls");
+
+static cl::opt<bool> ExperimentalVectorWideningLegalization(
+ "x86-experimental-vector-widening-legalization", cl::init(false),
+ cl::desc("Enable an experimental vector type legalization through widening "
+ "rather than promotion."),
+ cl::Hidden);
+
+X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
+ const X86Subtarget &STI)
+ : TargetLowering(TM), Subtarget(&STI) {
+ X86ScalarSSEf64 = Subtarget->hasSSE2();
+ X86ScalarSSEf32 = Subtarget->hasSSE1();
+ MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize());
+
+ // Set up the TargetLowering object.
+
+ // X86 is weird. It always uses i8 for shift amounts and setcc results.
+ setBooleanContents(ZeroOrOneBooleanContent);
+ // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
+ setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
+
+ // For 64-bit, since we have so many registers, use the ILP scheduler.
+  // For 32-bit, use register-pressure-specific scheduling.
+ // For Atom, always use ILP scheduling.
+ if (Subtarget->isAtom())
+ setSchedulingPreference(Sched::ILP);
+ else if (Subtarget->is64Bit())
+ setSchedulingPreference(Sched::ILP);
+ else
+ setSchedulingPreference(Sched::RegPressure);
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
+ setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
+
+ // Bypass expensive divides on Atom when compiling with O2.
+ if (TM.getOptLevel() >= CodeGenOpt::Default) {
+ if (Subtarget->hasSlowDivide32())
+ addBypassSlowDiv(32, 8);
+ if (Subtarget->hasSlowDivide64() && Subtarget->is64Bit())
+ addBypassSlowDiv(64, 16);
+ }
+
+ if (Subtarget->isTargetKnownWindowsMSVC()) {
+ // Setup Windows compiler runtime calls.
+ setLibcallName(RTLIB::SDIV_I64, "_alldiv");
+ setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
+ setLibcallName(RTLIB::SREM_I64, "_allrem");
+ setLibcallName(RTLIB::UREM_I64, "_aullrem");
+ setLibcallName(RTLIB::MUL_I64, "_allmul");
+ setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
+ setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
+ setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
+ setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
+ setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
+ }
+
+ if (Subtarget->isTargetDarwin()) {
+ // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
+ setUseUnderscoreSetJmp(false);
+ setUseUnderscoreLongJmp(false);
+ } else if (Subtarget->isTargetWindowsGNU()) {
+    // MS runtime is weird: it exports _setjmp, but plain longjmp!
+ setUseUnderscoreSetJmp(true);
+ setUseUnderscoreLongJmp(false);
+ } else {
+ setUseUnderscoreSetJmp(true);
+ setUseUnderscoreLongJmp(true);
+ }
+
+ // Set up the register classes.
+ addRegisterClass(MVT::i8, &X86::GR8RegClass);
+ addRegisterClass(MVT::i16, &X86::GR16RegClass);
+ addRegisterClass(MVT::i32, &X86::GR32RegClass);
+ if (Subtarget->is64Bit())
+ addRegisterClass(MVT::i64, &X86::GR64RegClass);
+
+ for (MVT VT : MVT::integer_valuetypes())
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
+
+ // We don't accept any truncstore of integer registers.
+ setTruncStoreAction(MVT::i64, MVT::i32, Expand);
+ setTruncStoreAction(MVT::i64, MVT::i16, Expand);
+ setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
+ setTruncStoreAction(MVT::i32, MVT::i16, Expand);
+ setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
+ setTruncStoreAction(MVT::i16, MVT::i8, Expand);
+
+ setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+
+ // SETOEQ and SETUNE require checking two conditions.
+ setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
+ setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
+ setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
+ setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
+ setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
+ setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
+
+ // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
+ // operation.
+ setOperationAction(ISD::UINT_TO_FP , MVT::i1 , Promote);
+ setOperationAction(ISD::UINT_TO_FP , MVT::i8 , Promote);
+ setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote);
+
+ if (Subtarget->is64Bit()) {
+ if (!Subtarget->useSoftFloat() && Subtarget->hasAVX512())
+ // f32/f64 are legal, f80 is custom.
+ setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom);
+ else
+ setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote);
+ setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
+ } else if (!Subtarget->useSoftFloat()) {
+ // We have an algorithm for SSE2->double, and we turn this into a
+ // 64-bit FILD followed by conditional FADD for other targets.
+ setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
+ // We have an algorithm for SSE2, and we turn this into a 64-bit
+ // FILD or VCVTUSI2SS/SD for other targets.
+ setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom);
+ }
+
+ // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
+ // this operation.
+ setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote);
+ setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote);
+
+ if (!Subtarget->useSoftFloat()) {
+ // SSE has no i16 to fp conversion, only i32
+ if (X86ScalarSSEf32) {
+ setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
+ // f32 and f64 cases are Legal, f80 case is not
+ setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);
+ } else {
+ setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Custom);
+ setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);
+ }
+ } else {
+ setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
+ setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Promote);
+ }
+
+ // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
+ // this operation.
+ setOperationAction(ISD::FP_TO_SINT , MVT::i1 , Promote);
+ setOperationAction(ISD::FP_TO_SINT , MVT::i8 , Promote);
+
+ if (!Subtarget->useSoftFloat()) {
+    // In 32-bit mode these are custom lowered. In 64-bit mode f32 and f64
+    // are Legal, and f80 is custom lowered.
+ setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom);
+ setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom);
+
+ if (X86ScalarSSEf32) {
+ setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote);
+ // f32 and f64 cases are Legal, f80 case is not
+ setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
+ } else {
+ setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom);
+ setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
+ }
+ } else {
+ setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote);
+ setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Expand);
+ setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Expand);
+ }
+
+ // Handle FP_TO_UINT by promoting the destination to a larger signed
+ // conversion.
+ setOperationAction(ISD::FP_TO_UINT , MVT::i1 , Promote);
+ setOperationAction(ISD::FP_TO_UINT , MVT::i8 , Promote);
+ setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote);
+
+ if (Subtarget->is64Bit()) {
+ if (!Subtarget->useSoftFloat() && Subtarget->hasAVX512()) {
+ // FP_TO_UINT-i32/i64 is legal for f32/f64, but custom for f80.
+ setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom);
+ setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom);
+ } else {
+ setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote);
+ setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand);
+ }
+ } else if (!Subtarget->useSoftFloat()) {
+ // Since AVX is a superset of SSE3, only check for SSE here.
+ if (Subtarget->hasSSE1() && !Subtarget->hasSSE3())
+ // Expand FP_TO_UINT into a select.
+ // FIXME: We would like to use a Custom expander here eventually to do
+ // the optimal thing for SSE vs. the default expansion in the legalizer.
+ setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Expand);
+ else
+ // With AVX512 we can use vcvts[ds]2usi for f32/f64->i32, f80 is custom.
+ // With SSE3 we can use fisttpll to convert to a signed i64; without
+ // SSE, we're stuck with a fistpll.
+ setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom);
+
+ setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom);
+ }
+
+  // TODO: when we have SSE, these could be more efficient by using movd/movq.
+ if (!X86ScalarSSEf64) {
+ setOperationAction(ISD::BITCAST , MVT::f32 , Expand);
+ setOperationAction(ISD::BITCAST , MVT::i32 , Expand);
+ if (Subtarget->is64Bit()) {
+ setOperationAction(ISD::BITCAST , MVT::f64 , Expand);
+ // Without SSE, i64->f64 goes through memory.
+ setOperationAction(ISD::BITCAST , MVT::i64 , Expand);
+ }
+ }
+
+ // Scalar integer divide and remainder are lowered to use operations that
+ // produce two results, to match the available instructions. This exposes
+ // the two-result form to trivial CSE, which is able to combine x/y and x%y
+ // into a single instruction.
+ //
+ // Scalar integer multiply-high is also lowered to use two-result
+ // operations, to match the available instructions. However, plain multiply
+ // (low) operations are left as Legal, as there are single-result
+ // instructions for this in x86. Using the two-result multiply instructions
+ // when both high and low results are needed must be arranged by dagcombine.
+ for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
+ setOperationAction(ISD::MULHS, VT, Expand);
+ setOperationAction(ISD::MULHU, VT, Expand);
+ setOperationAction(ISD::SDIV, VT, Expand);
+ setOperationAction(ISD::UDIV, VT, Expand);
+ setOperationAction(ISD::SREM, VT, Expand);
+ setOperationAction(ISD::UREM, VT, Expand);
+
+    // Add/Sub overflow ops with MVT::Glue are lowered to EFLAGS dependences.
+ setOperationAction(ISD::ADDC, VT, Custom);
+ setOperationAction(ISD::ADDE, VT, Custom);
+ setOperationAction(ISD::SUBC, VT, Custom);
+ setOperationAction(ISD::SUBE, VT, Custom);
+ }
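+  // For example, both results of
+  //   %q = sdiv i32 %x, %y
+  //   %r = srem i32 %x, %y
+  // end up selected from one ISD::SDIVREM node, which maps onto a single
+  // IDIV (quotient in EAX, remainder in EDX).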
+
+ setOperationAction(ISD::BR_JT , MVT::Other, Expand);
+ setOperationAction(ISD::BRCOND , MVT::Other, Custom);
+ setOperationAction(ISD::BR_CC , MVT::f32, Expand);
+ setOperationAction(ISD::BR_CC , MVT::f64, Expand);
+ setOperationAction(ISD::BR_CC , MVT::f80, Expand);
+ setOperationAction(ISD::BR_CC , MVT::f128, Expand);
+ setOperationAction(ISD::BR_CC , MVT::i8, Expand);
+ setOperationAction(ISD::BR_CC , MVT::i16, Expand);
+ setOperationAction(ISD::BR_CC , MVT::i32, Expand);
+ setOperationAction(ISD::BR_CC , MVT::i64, Expand);
+ setOperationAction(ISD::SELECT_CC , MVT::f32, Expand);
+ setOperationAction(ISD::SELECT_CC , MVT::f64, Expand);
+ setOperationAction(ISD::SELECT_CC , MVT::f80, Expand);
+ setOperationAction(ISD::SELECT_CC , MVT::f128, Expand);
+ setOperationAction(ISD::SELECT_CC , MVT::i8, Expand);
+ setOperationAction(ISD::SELECT_CC , MVT::i16, Expand);
+ setOperationAction(ISD::SELECT_CC , MVT::i32, Expand);
+ setOperationAction(ISD::SELECT_CC , MVT::i64, Expand);
+ if (Subtarget->is64Bit())
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
+ setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand);
+
+ if (Subtarget->is32Bit() && Subtarget->isTargetKnownWindowsMSVC()) {
+    // On 32-bit MSVC, `fmodf(f32)` is not defined; only `fmod(f64)` is.
+    // We should promote the value to 64 bits to solve this.
+    // This is what the CRT headers do: `fmodf` is an inline header
+    // function that casts to f64 and calls `fmod`.
+ setOperationAction(ISD::FREM , MVT::f32 , Promote);
+ } else {
+ setOperationAction(ISD::FREM , MVT::f32 , Expand);
+ }
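+  // In effect the promoted form computes what the CRT's inline fmodf does:
+  //   float fmodf(float x, float y) { return (float)fmod((double)x, (double)y); }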
+
+ setOperationAction(ISD::FREM , MVT::f64 , Expand);
+ setOperationAction(ISD::FREM , MVT::f80 , Expand);
+ setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom);
+
+  // Promote the i8 variants and force them up to i32, which has a shorter
+  // encoding.
+ setOperationAction(ISD::CTTZ , MVT::i8 , Promote);
+ AddPromotedToType (ISD::CTTZ , MVT::i8 , MVT::i32);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF , MVT::i8 , Promote);
+ AddPromotedToType (ISD::CTTZ_ZERO_UNDEF , MVT::i8 , MVT::i32);
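+  // BSF/TZCNT have no 8-bit form, so the count must run in a wider register
+  // anyway; the 32-bit form also avoids the 0x66 operand-size prefix that
+  // the 16-bit form needs. For plain CTTZ the promotion ORs in bit 8 so a
+  // zero i8 input still yields 8.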
+ if (Subtarget->hasBMI()) {
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , Expand);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Expand);
+ if (Subtarget->is64Bit())
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
+ } else {
+ setOperationAction(ISD::CTTZ , MVT::i16 , Custom);
+ setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
+ if (Subtarget->is64Bit())
+ setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
+ }
+
+ if (Subtarget->hasLZCNT()) {
+ // When promoting the i8 variants, force them to i32 for a shorter
+ // encoding.
+ setOperationAction(ISD::CTLZ , MVT::i8 , Promote);
+ AddPromotedToType (ISD::CTLZ , MVT::i8 , MVT::i32);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Promote);
+ AddPromotedToType (ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Expand);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Expand);
+ if (Subtarget->is64Bit())
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
+ } else {
+ setOperationAction(ISD::CTLZ , MVT::i8 , Custom);
+ setOperationAction(ISD::CTLZ , MVT::i16 , Custom);
+ setOperationAction(ISD::CTLZ , MVT::i32 , Custom);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Custom);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Custom);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Custom);
+ if (Subtarget->is64Bit()) {
+ setOperationAction(ISD::CTLZ , MVT::i64 , Custom);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
+ }
+ }
+
+ // Special handling for half-precision floating point conversions.
+ // If we don't have F16C support, then lower half float conversions
+ // into library calls.
+ if (Subtarget->useSoftFloat() || !Subtarget->hasF16C()) {
+ setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
+ setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
+ }
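+  // With these Expanded, an f16 -> f32 extension becomes a call to the
+  // compiler-rt helper __gnu_h2f_ieee and an f32 -> f16 round a call to
+  // __gnu_f2h_ieee (the default RuntimeLibcalls names for these nodes).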
+
+ // There's never any support for operations beyond MVT::f32.
+ setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
+ setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
+ setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
+ setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);
+
+ setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
+ setTruncStoreAction(MVT::f32, MVT::f16, Expand);
+ setTruncStoreAction(MVT::f64, MVT::f16, Expand);
+ setTruncStoreAction(MVT::f80, MVT::f16, Expand);
+
+ if (Subtarget->hasPOPCNT()) {
+ setOperationAction(ISD::CTPOP , MVT::i8 , Promote);
+ } else {
+ setOperationAction(ISD::CTPOP , MVT::i8 , Expand);
+ setOperationAction(ISD::CTPOP , MVT::i16 , Expand);
+ setOperationAction(ISD::CTPOP , MVT::i32 , Expand);
+ if (Subtarget->is64Bit())
+ setOperationAction(ISD::CTPOP , MVT::i64 , Expand);
+ }
+
+ setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
+
+ if (!Subtarget->hasMOVBE())
+ setOperationAction(ISD::BSWAP , MVT::i16 , Expand);
+
+ // These should be promoted to a larger select which is supported.
+ setOperationAction(ISD::SELECT , MVT::i1 , Promote);
+ // X86 wants to expand cmov itself.
+ setOperationAction(ISD::SELECT , MVT::i8 , Custom);
+ setOperationAction(ISD::SELECT , MVT::i16 , Custom);
+ setOperationAction(ISD::SELECT , MVT::i32 , Custom);
+ setOperationAction(ISD::SELECT , MVT::f32 , Custom);
+ setOperationAction(ISD::SELECT , MVT::f64 , Custom);
+ setOperationAction(ISD::SELECT , MVT::f80 , Custom);
+ setOperationAction(ISD::SELECT , MVT::f128 , Custom);
+ setOperationAction(ISD::SETCC , MVT::i8 , Custom);
+ setOperationAction(ISD::SETCC , MVT::i16 , Custom);
+ setOperationAction(ISD::SETCC , MVT::i32 , Custom);
+ setOperationAction(ISD::SETCC , MVT::f32 , Custom);
+ setOperationAction(ISD::SETCC , MVT::f64 , Custom);
+ setOperationAction(ISD::SETCC , MVT::f80 , Custom);
+ setOperationAction(ISD::SETCC , MVT::f128 , Custom);
+ setOperationAction(ISD::SETCCE , MVT::i8 , Custom);
+ setOperationAction(ISD::SETCCE , MVT::i16 , Custom);
+ setOperationAction(ISD::SETCCE , MVT::i32 , Custom);
+ if (Subtarget->is64Bit()) {
+ setOperationAction(ISD::SELECT , MVT::i64 , Custom);
+ setOperationAction(ISD::SETCC , MVT::i64 , Custom);
+ setOperationAction(ISD::SETCCE , MVT::i64 , Custom);
+ }
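+  // SELECT is custom for i8 in particular because CMOVcc only exists for
+  // 16/32/64-bit GPRs; an i8 select has to use a wider cmov or a branch
+  // sequence instead of a nonexistent 8-bit cmov.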
+ setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);
+  // NOTE: EH_SJLJ_SETJMP/_LONGJMP as supported here is NOT intended to
+  // support SjLj exception handling; it is a lightweight setjmp/longjmp
+  // replacement to support continuations, user-level threading, and so on.
+  // As a result, no other SjLj exception interfaces are implemented; please
+  // don't build your own exception handling on top of them.
+  // LLVM/Clang supports zero-cost DWARF exception handling.
+ setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
+ setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
+
+ // Darwin ABI issue.
+ setOperationAction(ISD::ConstantPool , MVT::i32 , Custom);
+ setOperationAction(ISD::JumpTable , MVT::i32 , Custom);
+ setOperationAction(ISD::GlobalAddress , MVT::i32 , Custom);
+ setOperationAction(ISD::GlobalTLSAddress, MVT::i32 , Custom);
+ if (Subtarget->is64Bit())
+ setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
+ setOperationAction(ISD::ExternalSymbol , MVT::i32 , Custom);
+ setOperationAction(ISD::BlockAddress , MVT::i32 , Custom);
+ if (Subtarget->is64Bit()) {
+ setOperationAction(ISD::ConstantPool , MVT::i64 , Custom);
+ setOperationAction(ISD::JumpTable , MVT::i64 , Custom);
+ setOperationAction(ISD::GlobalAddress , MVT::i64 , Custom);
+ setOperationAction(ISD::ExternalSymbol, MVT::i64 , Custom);
+ setOperationAction(ISD::BlockAddress , MVT::i64 , Custom);
+ }
+  // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
+ setOperationAction(ISD::SHL_PARTS , MVT::i32 , Custom);
+ setOperationAction(ISD::SRA_PARTS , MVT::i32 , Custom);
+ setOperationAction(ISD::SRL_PARTS , MVT::i32 , Custom);
+ if (Subtarget->is64Bit()) {
+ setOperationAction(ISD::SHL_PARTS , MVT::i64 , Custom);
+ setOperationAction(ISD::SRA_PARTS , MVT::i64 , Custom);
+ setOperationAction(ISD::SRL_PARTS , MVT::i64 , Custom);
+ }
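+  // Roughly, a 64-bit shift-left on 32-bit x86 becomes a SHLD for the high
+  // half, a SHL for the low half, and a test of bit 5 of the amount (i.e.
+  // amount >= 32) that selects between the two halves.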
+
+ if (Subtarget->hasSSE1())
+ setOperationAction(ISD::PREFETCH , MVT::Other, Legal);
+
+ setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom);
+
+ // Expand certain atomics
+ for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
+ setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
+ setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
+ }
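+  // ATOMIC_LOAD_SUB is custom so it can be re-expressed as an atomic add of
+  // the negated operand, reusing the LOCK ADD/XADD patterns.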
+
+ if (Subtarget->hasCmpxchg16b()) {
+ setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
+ }
+
+ // FIXME - use subtarget debug flags
+ if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetELF() &&
+ !Subtarget->isTargetCygMing() && !Subtarget->isTargetWin64()) {
+ setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
+ }
+
+ setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
+ setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
+
+ setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
+ setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
+
+ setOperationAction(ISD::TRAP, MVT::Other, Legal);
+ setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
+
+ // VASTART needs to be custom lowered to use the VarArgsFrameIndex
+ setOperationAction(ISD::VASTART , MVT::Other, Custom);
+ setOperationAction(ISD::VAEND , MVT::Other, Expand);
+ if (Subtarget->is64Bit()) {
+ setOperationAction(ISD::VAARG , MVT::Other, Custom);
+ setOperationAction(ISD::VACOPY , MVT::Other, Custom);
+ } else {
+ // TargetInfo::CharPtrBuiltinVaList
+ setOperationAction(ISD::VAARG , MVT::Other, Expand);
+ setOperationAction(ISD::VACOPY , MVT::Other, Expand);
+ }
+
+ setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
+ setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
+
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
+
+ // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
+ setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
+ setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
+
+ if (!Subtarget->useSoftFloat() && X86ScalarSSEf64) {
+ // f32 and f64 use SSE.
+ // Set up the FP register classes.
+ addRegisterClass(MVT::f32, &X86::FR32RegClass);
+ addRegisterClass(MVT::f64, &X86::FR64RegClass);
+
+ // Use ANDPD to simulate FABS.
+ setOperationAction(ISD::FABS , MVT::f64, Custom);
+ setOperationAction(ISD::FABS , MVT::f32, Custom);
+
+    // Use XORPS/XORPD to simulate FNEG.
+ setOperationAction(ISD::FNEG , MVT::f64, Custom);
+ setOperationAction(ISD::FNEG , MVT::f32, Custom);
+
+ // Use ANDPD and ORPD to simulate FCOPYSIGN.
+ setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
+
+ // Lower this to FGETSIGNx86 plus an AND.
+ setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
+ setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
+
+ // We don't support sin/cos/fmod
+ setOperationAction(ISD::FSIN , MVT::f64, Expand);
+ setOperationAction(ISD::FCOS , MVT::f64, Expand);
+ setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
+ setOperationAction(ISD::FSIN , MVT::f32, Expand);
+ setOperationAction(ISD::FCOS , MVT::f32, Expand);
+ setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
+
+ // Expand FP immediates into loads from the stack, except for the special
+ // cases we handle.
+ addLegalFPImmediate(APFloat(+0.0)); // xorpd
+ addLegalFPImmediate(APFloat(+0.0f)); // xorps
+ } else if (!Subtarget->useSoftFloat() && X86ScalarSSEf32) {
+ // Use SSE for f32, x87 for f64.
+ // Set up the FP register classes.
+ addRegisterClass(MVT::f32, &X86::FR32RegClass);
+ addRegisterClass(MVT::f64, &X86::RFP64RegClass);
+
+ // Use ANDPS to simulate FABS.
+ setOperationAction(ISD::FABS , MVT::f32, Custom);
+
+    // Use XORPS to simulate FNEG.
+ setOperationAction(ISD::FNEG , MVT::f32, Custom);
+
+ setOperationAction(ISD::UNDEF, MVT::f64, Expand);
+
+ // Use ANDPS and ORPS to simulate FCOPYSIGN.
+ setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
+
+ // We don't support sin/cos/fmod
+ setOperationAction(ISD::FSIN , MVT::f32, Expand);
+ setOperationAction(ISD::FCOS , MVT::f32, Expand);
+ setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
+
+ // Special cases we handle for FP constants.
+ addLegalFPImmediate(APFloat(+0.0f)); // xorps
+ addLegalFPImmediate(APFloat(+0.0)); // FLD0
+ addLegalFPImmediate(APFloat(+1.0)); // FLD1
+ addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
+ addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
+
+ if (!TM.Options.UnsafeFPMath) {
+ setOperationAction(ISD::FSIN , MVT::f64, Expand);
+ setOperationAction(ISD::FCOS , MVT::f64, Expand);
+ setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
+ }
+ } else if (!Subtarget->useSoftFloat()) {
+ // f32 and f64 in x87.
+ // Set up the FP register classes.
+ addRegisterClass(MVT::f64, &X86::RFP64RegClass);
+ addRegisterClass(MVT::f32, &X86::RFP32RegClass);
+
+ setOperationAction(ISD::UNDEF, MVT::f64, Expand);
+ setOperationAction(ISD::UNDEF, MVT::f32, Expand);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
+
+ if (!TM.Options.UnsafeFPMath) {
+ setOperationAction(ISD::FSIN , MVT::f64, Expand);
+ setOperationAction(ISD::FSIN , MVT::f32, Expand);
+ setOperationAction(ISD::FCOS , MVT::f64, Expand);
+ setOperationAction(ISD::FCOS , MVT::f32, Expand);
+ setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
+ setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
+ }
+ addLegalFPImmediate(APFloat(+0.0)); // FLD0
+ addLegalFPImmediate(APFloat(+1.0)); // FLD1
+ addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
+ addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
+ addLegalFPImmediate(APFloat(+0.0f)); // FLD0
+ addLegalFPImmediate(APFloat(+1.0f)); // FLD1
+ addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
+ addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
+ }
+
+ // We don't support FMA.
+ setOperationAction(ISD::FMA, MVT::f64, Expand);
+ setOperationAction(ISD::FMA, MVT::f32, Expand);
+
+  // Long double always uses X87, except for f128, which is kept in SSE
+  // registers (FR128) when it is available.
+ if (!Subtarget->useSoftFloat()) {
+ if (Subtarget->is64Bit() && Subtarget->hasMMX()) {
+ addRegisterClass(MVT::f128, &X86::FR128RegClass);
+ ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat);
+ setOperationAction(ISD::FABS , MVT::f128, Custom);
+ setOperationAction(ISD::FNEG , MVT::f128, Custom);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
+ }
+
+ addRegisterClass(MVT::f80, &X86::RFP80RegClass);
+ setOperationAction(ISD::UNDEF, MVT::f80, Expand);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
+ {
+ APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended);
+ addLegalFPImmediate(TmpFlt); // FLD0
+ TmpFlt.changeSign();
+ addLegalFPImmediate(TmpFlt); // FLD0/FCHS
+
+ bool ignored;
+ APFloat TmpFlt2(+1.0);
+ TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
+ &ignored);
+ addLegalFPImmediate(TmpFlt2); // FLD1
+ TmpFlt2.changeSign();
+ addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
+ }
+
+ if (!TM.Options.UnsafeFPMath) {
+ setOperationAction(ISD::FSIN , MVT::f80, Expand);
+ setOperationAction(ISD::FCOS , MVT::f80, Expand);
+ setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
+ }
+
+ setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
+ setOperationAction(ISD::FCEIL, MVT::f80, Expand);
+ setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
+ setOperationAction(ISD::FRINT, MVT::f80, Expand);
+ setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
+ setOperationAction(ISD::FMA, MVT::f80, Expand);
+ }
+
+ // Always use a library call for pow.
+ setOperationAction(ISD::FPOW , MVT::f32 , Expand);
+ setOperationAction(ISD::FPOW , MVT::f64 , Expand);
+ setOperationAction(ISD::FPOW , MVT::f80 , Expand);
+
+ setOperationAction(ISD::FLOG, MVT::f80, Expand);
+ setOperationAction(ISD::FLOG2, MVT::f80, Expand);
+ setOperationAction(ISD::FLOG10, MVT::f80, Expand);
+ setOperationAction(ISD::FEXP, MVT::f80, Expand);
+ setOperationAction(ISD::FEXP2, MVT::f80, Expand);
+ setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
+ setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
+
+ // First set operation action for all vector types to either promote
+ // (for widening) or expand (for scalarization). Then we will selectively
+ // turn on ones that can be effectively codegen'd.
+ for (MVT VT : MVT::vector_valuetypes()) {
+ setOperationAction(ISD::ADD , VT, Expand);
+ setOperationAction(ISD::SUB , VT, Expand);
+ setOperationAction(ISD::FADD, VT, Expand);
+ setOperationAction(ISD::FNEG, VT, Expand);
+ setOperationAction(ISD::FSUB, VT, Expand);
+ setOperationAction(ISD::MUL , VT, Expand);
+ setOperationAction(ISD::FMUL, VT, Expand);
+ setOperationAction(ISD::SDIV, VT, Expand);
+ setOperationAction(ISD::UDIV, VT, Expand);
+ setOperationAction(ISD::FDIV, VT, Expand);
+ setOperationAction(ISD::SREM, VT, Expand);
+ setOperationAction(ISD::UREM, VT, Expand);
+ setOperationAction(ISD::LOAD, VT, Expand);
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
+ setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
+ setOperationAction(ISD::FABS, VT, Expand);
+ setOperationAction(ISD::FSIN, VT, Expand);
+    setOperationAction(ISD::FCOS, VT, Expand);
+    setOperationAction(ISD::FSINCOS, VT, Expand);
+ setOperationAction(ISD::FREM, VT, Expand);
+ setOperationAction(ISD::FMA, VT, Expand);
+ setOperationAction(ISD::FPOWI, VT, Expand);
+ setOperationAction(ISD::FSQRT, VT, Expand);
+ setOperationAction(ISD::FCOPYSIGN, VT, Expand);
+ setOperationAction(ISD::FFLOOR, VT, Expand);
+ setOperationAction(ISD::FCEIL, VT, Expand);
+ setOperationAction(ISD::FTRUNC, VT, Expand);
+ setOperationAction(ISD::FRINT, VT, Expand);
+ setOperationAction(ISD::FNEARBYINT, VT, Expand);
+ setOperationAction(ISD::SMUL_LOHI, VT, Expand);
+ setOperationAction(ISD::MULHS, VT, Expand);
+ setOperationAction(ISD::UMUL_LOHI, VT, Expand);
+ setOperationAction(ISD::MULHU, VT, Expand);
+ setOperationAction(ISD::SDIVREM, VT, Expand);
+ setOperationAction(ISD::UDIVREM, VT, Expand);
+ setOperationAction(ISD::FPOW, VT, Expand);
+ setOperationAction(ISD::CTPOP, VT, Expand);
+ setOperationAction(ISD::CTTZ, VT, Expand);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
+ setOperationAction(ISD::CTLZ, VT, Expand);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
+ setOperationAction(ISD::SHL, VT, Expand);
+ setOperationAction(ISD::SRA, VT, Expand);
+ setOperationAction(ISD::SRL, VT, Expand);
+ setOperationAction(ISD::ROTL, VT, Expand);
+ setOperationAction(ISD::ROTR, VT, Expand);
+ setOperationAction(ISD::BSWAP, VT, Expand);
+ setOperationAction(ISD::SETCC, VT, Expand);
+ setOperationAction(ISD::FLOG, VT, Expand);
+ setOperationAction(ISD::FLOG2, VT, Expand);
+ setOperationAction(ISD::FLOG10, VT, Expand);
+ setOperationAction(ISD::FEXP, VT, Expand);
+ setOperationAction(ISD::FEXP2, VT, Expand);
+ setOperationAction(ISD::FP_TO_UINT, VT, Expand);
+ setOperationAction(ISD::FP_TO_SINT, VT, Expand);
+ setOperationAction(ISD::UINT_TO_FP, VT, Expand);
+ setOperationAction(ISD::SINT_TO_FP, VT, Expand);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
+ setOperationAction(ISD::TRUNCATE, VT, Expand);
+ setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
+ setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
+ setOperationAction(ISD::ANY_EXTEND, VT, Expand);
+ setOperationAction(ISD::VSELECT, VT, Expand);
+ setOperationAction(ISD::SELECT_CC, VT, Expand);
+ for (MVT InnerVT : MVT::vector_valuetypes()) {
+ setTruncStoreAction(InnerVT, VT, Expand);
+
+ setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
+
+      // N.B.: ISD::EXTLOAD legality is basically ignored except for i1-like
+      // types; we have to deal with them whether we ask for Expansion or not.
+      // Setting Expand causes its own optimization problems though, so leave
+      // them legal.
+ if (VT.getVectorElementType() == MVT::i1)
+ setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
+
+ // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
+ // split/scalarized right now.
+ if (VT.getVectorElementType() == MVT::f16)
+ setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
+ }
+ }
+
+  // FIXME: In order to prevent SSE instructions from being expanded to MMX
+  // ones with -msoft-float, disable use of MMX as well.
+ if (!Subtarget->useSoftFloat() && Subtarget->hasMMX()) {
+ addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
+    // No operations on x86mmx are supported; everything uses intrinsics.
+ }
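+  // E.g. even a plain MMX add is only reachable through an intrinsic:
+  //   %r = call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %a, x86_mmx %b)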
+
+ // MMX-sized vectors (other than x86mmx) are expected to be expanded
+ // into smaller operations.
+ for (MVT MMXTy : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v1i64}) {
+ setOperationAction(ISD::MULHS, MMXTy, Expand);
+ setOperationAction(ISD::AND, MMXTy, Expand);
+ setOperationAction(ISD::OR, MMXTy, Expand);
+ setOperationAction(ISD::XOR, MMXTy, Expand);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MMXTy, Expand);
+ setOperationAction(ISD::SELECT, MMXTy, Expand);
+ setOperationAction(ISD::BITCAST, MMXTy, Expand);
+ }
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v1i64, Expand);
+
+ if (!Subtarget->useSoftFloat() && Subtarget->hasSSE1()) {
+ addRegisterClass(MVT::v4f32, &X86::VR128RegClass);
+
+ setOperationAction(ISD::FADD, MVT::v4f32, Legal);
+ setOperationAction(ISD::FSUB, MVT::v4f32, Legal);
+ setOperationAction(ISD::FMUL, MVT::v4f32, Legal);
+ setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
+ setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
+ setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
+ setOperationAction(ISD::FABS, MVT::v4f32, Custom);
+ setOperationAction(ISD::LOAD, MVT::v4f32, Legal);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
+ setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
+ setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
+ }
+
+ if (!Subtarget->useSoftFloat() && Subtarget->hasSSE2()) {
+ addRegisterClass(MVT::v2f64, &X86::VR128RegClass);
+
+ // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
+ // registers cannot be used even for integer operations.
+ addRegisterClass(MVT::v16i8, &X86::VR128RegClass);
+ addRegisterClass(MVT::v8i16, &X86::VR128RegClass);
+ addRegisterClass(MVT::v4i32, &X86::VR128RegClass);
+ addRegisterClass(MVT::v2i64, &X86::VR128RegClass);
+
+ setOperationAction(ISD::ADD, MVT::v16i8, Legal);
+ setOperationAction(ISD::ADD, MVT::v8i16, Legal);
+ setOperationAction(ISD::ADD, MVT::v4i32, Legal);
+ setOperationAction(ISD::ADD, MVT::v2i64, Legal);
+ setOperationAction(ISD::MUL, MVT::v16i8, Custom);
+ setOperationAction(ISD::MUL, MVT::v4i32, Custom);
+ setOperationAction(ISD::MUL, MVT::v2i64, Custom);
+ setOperationAction(ISD::UMUL_LOHI, MVT::v4i32, Custom);
+ setOperationAction(ISD::SMUL_LOHI, MVT::v4i32, Custom);
+ setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
+ setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
+ setOperationAction(ISD::SUB, MVT::v16i8, Legal);
+ setOperationAction(ISD::SUB, MVT::v8i16, Legal);
+ setOperationAction(ISD::SUB, MVT::v4i32, Legal);
+ setOperationAction(ISD::SUB, MVT::v2i64, Legal);
+ setOperationAction(ISD::MUL, MVT::v8i16, Legal);
+ setOperationAction(ISD::FADD, MVT::v2f64, Legal);
+ setOperationAction(ISD::FSUB, MVT::v2f64, Legal);
+ setOperationAction(ISD::FMUL, MVT::v2f64, Legal);
+ setOperationAction(ISD::FDIV, MVT::v2f64, Legal);
+ setOperationAction(ISD::FSQRT, MVT::v2f64, Legal);
+ setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
+ setOperationAction(ISD::FABS, MVT::v2f64, Custom);
+
+ setOperationAction(ISD::SMAX, MVT::v8i16, Legal);
+ setOperationAction(ISD::UMAX, MVT::v16i8, Legal);
+ setOperationAction(ISD::SMIN, MVT::v8i16, Legal);
+ setOperationAction(ISD::UMIN, MVT::v16i8, Legal);
+
+ setOperationAction(ISD::SETCC, MVT::v2i64, Custom);
+ setOperationAction(ISD::SETCC, MVT::v16i8, Custom);
+ setOperationAction(ISD::SETCC, MVT::v8i16, Custom);
+ setOperationAction(ISD::SETCC, MVT::v4i32, Custom);
+
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
+
+ setOperationAction(ISD::CTPOP, MVT::v16i8, Custom);
+ setOperationAction(ISD::CTPOP, MVT::v8i16, Custom);
+ setOperationAction(ISD::CTPOP, MVT::v4i32, Custom);
+ setOperationAction(ISD::CTPOP, MVT::v2i64, Custom);
+
+ setOperationAction(ISD::CTTZ, MVT::v16i8, Custom);
+ setOperationAction(ISD::CTTZ, MVT::v8i16, Custom);
+ setOperationAction(ISD::CTTZ, MVT::v4i32, Custom);
+ // ISD::CTTZ v2i64 - scalarization is faster.
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i8, Custom);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i16, Custom);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom);
+ // ISD::CTTZ_ZERO_UNDEF v2i64 - scalarization is faster.
+
+ // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
+ for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
+ setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+ setOperationAction(ISD::VSELECT, VT, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+ }
+
+ // We support custom legalizing of sext and anyext loads for specific
+ // memory vector types which we can load as a scalar (or sequence of
+ // scalars) and extend in-register to a legal 128-bit vector type. For sext
+ // loads these must work with a single scalar load.
+ for (MVT VT : MVT::integer_vector_valuetypes()) {
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom);
+ }
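+    // E.g. a v4i8 sextload can be done as one 32-bit scalar load followed by
+    // punpck unpacks and arithmetic shifts, instead of four byte loads and
+    // inserts.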
+
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom);
+ setOperationAction(ISD::VSELECT, MVT::v2f64, Custom);
+ setOperationAction(ISD::VSELECT, MVT::v2i64, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
+
+ if (Subtarget->is64Bit()) {
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
+ }
+
+ // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
+ for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
+ setOperationAction(ISD::AND, VT, Promote);
+ AddPromotedToType (ISD::AND, VT, MVT::v2i64);
+ setOperationAction(ISD::OR, VT, Promote);
+ AddPromotedToType (ISD::OR, VT, MVT::v2i64);
+ setOperationAction(ISD::XOR, VT, Promote);
+ AddPromotedToType (ISD::XOR, VT, MVT::v2i64);
+ setOperationAction(ISD::LOAD, VT, Promote);
+ AddPromotedToType (ISD::LOAD, VT, MVT::v2i64);
+ setOperationAction(ISD::SELECT, VT, Promote);
+ AddPromotedToType (ISD::SELECT, VT, MVT::v2i64);
+ }
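+    // E.g. (and (v4i32 a), (v4i32 b)) is rewritten as
+    //   (v4i32 (bitcast (and (v2i64 (bitcast a)), (v2i64 (bitcast b)))))
+    // so all 128-bit bitwise logic funnels into the single v2i64
+    // PAND/POR/PXOR patterns.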
+
+ // Custom lower v2i64 and v2f64 selects.
+ setOperationAction(ISD::LOAD, MVT::v2f64, Legal);
+ setOperationAction(ISD::LOAD, MVT::v2i64, Legal);
+ setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
+ setOperationAction(ISD::SELECT, MVT::v2i64, Custom);
+
+ setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
+
+ setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
+
+ setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
+    // As there is no 64-bit GPR available, we need to build a special custom
+    // sequence to convert from v2i32 to v2f32.
+ if (!Subtarget->is64Bit())
+ setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);
+
+ setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
+ setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);
+
+ for (MVT VT : MVT::fp_vector_valuetypes())
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);
+
+ setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
+ setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
+ setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
+ }
+
+ if (!Subtarget->useSoftFloat() && Subtarget->hasSSE41()) {
+ for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
+ setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
+ setOperationAction(ISD::FCEIL, RoundedTy, Legal);
+ setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
+ setOperationAction(ISD::FRINT, RoundedTy, Legal);
+ setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
+ }
+
+ setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
+ setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
+ setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
+ setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
+ setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
+ setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
+ setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
+ setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
+
+ // FIXME: Do we need to handle scalar-to-vector here?
+ setOperationAction(ISD::MUL, MVT::v4i32, Legal);
+
+ // We directly match byte blends in the backend as they match the VSELECT
+ // condition form.
+ setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
+
+ // SSE41 brings specific instructions for doing vector sign extend even in
+ // cases where we don't have SRA.
+ for (MVT VT : MVT::integer_vector_valuetypes()) {
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
+ }
+
+ // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, MVT::v8i8, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i8, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
+
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i16, MVT::v8i8, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i8, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
+
+ // i8 and i16 vectors are custom because the source register and source
+    // memory operand types are not the same width. f32 vectors are
+ // custom since the immediate controlling the insert encodes additional
+ // information.
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
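+    // (E.g. PINSRW takes a 32-bit GPR but only an m16 memory operand, exactly
+    // the register/memory width mismatch described above.)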
+
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
+
+    // FIXME: These should be Legal, but that's only for the case where
+    // the index is constant. For now, custom expand to deal with that.
+ if (Subtarget->is64Bit()) {
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
+ }
+ }
+
+ if (Subtarget->hasSSE2()) {
+ setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
+ setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
+ setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);
+
+ setOperationAction(ISD::SRL, MVT::v8i16, Custom);
+ setOperationAction(ISD::SRL, MVT::v16i8, Custom);
+
+ setOperationAction(ISD::SHL, MVT::v8i16, Custom);
+ setOperationAction(ISD::SHL, MVT::v16i8, Custom);
+
+ setOperationAction(ISD::SRA, MVT::v8i16, Custom);
+ setOperationAction(ISD::SRA, MVT::v16i8, Custom);
+
+ // In the customized shift lowering, the legal cases in AVX2 will be
+ // recognized.
+ setOperationAction(ISD::SRL, MVT::v2i64, Custom);
+ setOperationAction(ISD::SRL, MVT::v4i32, Custom);
+
+ setOperationAction(ISD::SHL, MVT::v2i64, Custom);
+ setOperationAction(ISD::SHL, MVT::v4i32, Custom);
+
+ setOperationAction(ISD::SRA, MVT::v2i64, Custom);
+ setOperationAction(ISD::SRA, MVT::v4i32, Custom);
+ }
+
+ if (Subtarget->hasXOP()) {
+ setOperationAction(ISD::ROTL, MVT::v16i8, Custom);
+ setOperationAction(ISD::ROTL, MVT::v8i16, Custom);
+ setOperationAction(ISD::ROTL, MVT::v4i32, Custom);
+ setOperationAction(ISD::ROTL, MVT::v2i64, Custom);
+ setOperationAction(ISD::ROTL, MVT::v32i8, Custom);
+ setOperationAction(ISD::ROTL, MVT::v16i16, Custom);
+ setOperationAction(ISD::ROTL, MVT::v8i32, Custom);
+ setOperationAction(ISD::ROTL, MVT::v4i64, Custom);
+ }
+
+ if (!Subtarget->useSoftFloat() && Subtarget->hasFp256()) {
+ addRegisterClass(MVT::v32i8, &X86::VR256RegClass);
+ addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
+ addRegisterClass(MVT::v8i32, &X86::VR256RegClass);
+ addRegisterClass(MVT::v8f32, &X86::VR256RegClass);
+ addRegisterClass(MVT::v4i64, &X86::VR256RegClass);
+ addRegisterClass(MVT::v4f64, &X86::VR256RegClass);
+
+ setOperationAction(ISD::LOAD, MVT::v8f32, Legal);
+ setOperationAction(ISD::LOAD, MVT::v4f64, Legal);
+ setOperationAction(ISD::LOAD, MVT::v4i64, Legal);
+
+ setOperationAction(ISD::FADD, MVT::v8f32, Legal);
+ setOperationAction(ISD::FSUB, MVT::v8f32, Legal);
+ setOperationAction(ISD::FMUL, MVT::v8f32, Legal);
+ setOperationAction(ISD::FDIV, MVT::v8f32, Legal);
+ setOperationAction(ISD::FSQRT, MVT::v8f32, Legal);
+ setOperationAction(ISD::FFLOOR, MVT::v8f32, Legal);
+ setOperationAction(ISD::FCEIL, MVT::v8f32, Legal);
+ setOperationAction(ISD::FTRUNC, MVT::v8f32, Legal);
+ setOperationAction(ISD::FRINT, MVT::v8f32, Legal);
+ setOperationAction(ISD::FNEARBYINT, MVT::v8f32, Legal);
+ setOperationAction(ISD::FNEG, MVT::v8f32, Custom);
+ setOperationAction(ISD::FABS, MVT::v8f32, Custom);
+
+ setOperationAction(ISD::FADD, MVT::v4f64, Legal);
+ setOperationAction(ISD::FSUB, MVT::v4f64, Legal);
+ setOperationAction(ISD::FMUL, MVT::v4f64, Legal);
+ setOperationAction(ISD::FDIV, MVT::v4f64, Legal);
+ setOperationAction(ISD::FSQRT, MVT::v4f64, Legal);
+ setOperationAction(ISD::FFLOOR, MVT::v4f64, Legal);
+ setOperationAction(ISD::FCEIL, MVT::v4f64, Legal);
+ setOperationAction(ISD::FTRUNC, MVT::v4f64, Legal);
+ setOperationAction(ISD::FRINT, MVT::v4f64, Legal);
+ setOperationAction(ISD::FNEARBYINT, MVT::v4f64, Legal);
+ setOperationAction(ISD::FNEG, MVT::v4f64, Custom);
+ setOperationAction(ISD::FABS, MVT::v4f64, Custom);
+
+ // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
+ // even though v8i16 is a legal type.
+ setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Promote);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Promote);
+ setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
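+    // I.e. (fp_to_sint:v8i16 (v8f32 x)) first converts to v8i32 (one
+    // VCVTTPS2DQ on AVX) and the result is then truncated to v8i16.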
+
+ setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Promote);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
+ setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal);
+
+ setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);
+
+ for (MVT VT : MVT::fp_vector_valuetypes())
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);
+
+ setOperationAction(ISD::SRL, MVT::v16i16, Custom);
+ setOperationAction(ISD::SRL, MVT::v32i8, Custom);
+
+ setOperationAction(ISD::SHL, MVT::v16i16, Custom);
+ setOperationAction(ISD::SHL, MVT::v32i8, Custom);
+
+ setOperationAction(ISD::SRA, MVT::v16i16, Custom);
+ setOperationAction(ISD::SRA, MVT::v32i8, Custom);
+
+ setOperationAction(ISD::SETCC, MVT::v32i8, Custom);
+ setOperationAction(ISD::SETCC, MVT::v16i16, Custom);
+ setOperationAction(ISD::SETCC, MVT::v8i32, Custom);
+ setOperationAction(ISD::SETCC, MVT::v4i64, Custom);
+
+ setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
+ setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
+ setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
+
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v4i64, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v16i16, Custom);
+ setOperationAction(ISD::ANY_EXTEND, MVT::v4i64, Custom);
+ setOperationAction(ISD::ANY_EXTEND, MVT::v8i32, Custom);
+ setOperationAction(ISD::ANY_EXTEND, MVT::v16i16, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
+
+ setOperationAction(ISD::CTPOP, MVT::v32i8, Custom);
+ setOperationAction(ISD::CTPOP, MVT::v16i16, Custom);
+ setOperationAction(ISD::CTPOP, MVT::v8i32, Custom);
+ setOperationAction(ISD::CTPOP, MVT::v4i64, Custom);
+
+ setOperationAction(ISD::CTTZ, MVT::v32i8, Custom);
+ setOperationAction(ISD::CTTZ, MVT::v16i16, Custom);
+ setOperationAction(ISD::CTTZ, MVT::v8i32, Custom);
+ setOperationAction(ISD::CTTZ, MVT::v4i64, Custom);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v32i8, Custom);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i16, Custom);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i32, Custom);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i64, Custom);
+
+ if (Subtarget->hasAnyFMA()) {
+ setOperationAction(ISD::FMA, MVT::v8f32, Legal);
+ setOperationAction(ISD::FMA, MVT::v4f64, Legal);
+ setOperationAction(ISD::FMA, MVT::v4f32, Legal);
+ setOperationAction(ISD::FMA, MVT::v2f64, Legal);
+ setOperationAction(ISD::FMA, MVT::f32, Legal);
+ setOperationAction(ISD::FMA, MVT::f64, Legal);
+ }
+
+ if (Subtarget->hasInt256()) {
+ setOperationAction(ISD::ADD, MVT::v4i64, Legal);
+ setOperationAction(ISD::ADD, MVT::v8i32, Legal);
+ setOperationAction(ISD::ADD, MVT::v16i16, Legal);
+ setOperationAction(ISD::ADD, MVT::v32i8, Legal);
+
+ setOperationAction(ISD::SUB, MVT::v4i64, Legal);
+ setOperationAction(ISD::SUB, MVT::v8i32, Legal);
+ setOperationAction(ISD::SUB, MVT::v16i16, Legal);
+ setOperationAction(ISD::SUB, MVT::v32i8, Legal);
+
+ setOperationAction(ISD::MUL, MVT::v4i64, Custom);
+ setOperationAction(ISD::MUL, MVT::v8i32, Legal);
+ setOperationAction(ISD::MUL, MVT::v16i16, Legal);
+ setOperationAction(ISD::MUL, MVT::v32i8, Custom);
+
+ setOperationAction(ISD::UMUL_LOHI, MVT::v8i32, Custom);
+ setOperationAction(ISD::SMUL_LOHI, MVT::v8i32, Custom);
+ setOperationAction(ISD::MULHU, MVT::v16i16, Legal);
+ setOperationAction(ISD::MULHS, MVT::v16i16, Legal);
+
+ setOperationAction(ISD::SMAX, MVT::v32i8, Legal);
+ setOperationAction(ISD::SMAX, MVT::v16i16, Legal);
+ setOperationAction(ISD::SMAX, MVT::v8i32, Legal);
+ setOperationAction(ISD::UMAX, MVT::v32i8, Legal);
+ setOperationAction(ISD::UMAX, MVT::v16i16, Legal);
+ setOperationAction(ISD::UMAX, MVT::v8i32, Legal);
+ setOperationAction(ISD::SMIN, MVT::v32i8, Legal);
+ setOperationAction(ISD::SMIN, MVT::v16i16, Legal);
+ setOperationAction(ISD::SMIN, MVT::v8i32, Legal);
+ setOperationAction(ISD::UMIN, MVT::v32i8, Legal);
+ setOperationAction(ISD::UMIN, MVT::v16i16, Legal);
+ setOperationAction(ISD::UMIN, MVT::v8i32, Legal);
+
+      // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
+      // when we have a 256-bit-wide blend with immediate.
+ setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
+
+ // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32, MVT::v8i8, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i8, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32, MVT::v8i16, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i16, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i32, Legal);
+
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32, MVT::v8i8, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i8, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32, MVT::v8i16, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i16, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i32, Legal);
+ } else {
+ setOperationAction(ISD::ADD, MVT::v4i64, Custom);
+ setOperationAction(ISD::ADD, MVT::v8i32, Custom);
+ setOperationAction(ISD::ADD, MVT::v16i16, Custom);
+ setOperationAction(ISD::ADD, MVT::v32i8, Custom);
+
+ setOperationAction(ISD::SUB, MVT::v4i64, Custom);
+ setOperationAction(ISD::SUB, MVT::v8i32, Custom);
+ setOperationAction(ISD::SUB, MVT::v16i16, Custom);
+ setOperationAction(ISD::SUB, MVT::v32i8, Custom);
+
+ setOperationAction(ISD::MUL, MVT::v4i64, Custom);
+ setOperationAction(ISD::MUL, MVT::v8i32, Custom);
+ setOperationAction(ISD::MUL, MVT::v16i16, Custom);
+ setOperationAction(ISD::MUL, MVT::v32i8, Custom);
+
+ setOperationAction(ISD::SMAX, MVT::v32i8, Custom);
+ setOperationAction(ISD::SMAX, MVT::v16i16, Custom);
+ setOperationAction(ISD::SMAX, MVT::v8i32, Custom);
+ setOperationAction(ISD::UMAX, MVT::v32i8, Custom);
+ setOperationAction(ISD::UMAX, MVT::v16i16, Custom);
+ setOperationAction(ISD::UMAX, MVT::v8i32, Custom);
+ setOperationAction(ISD::SMIN, MVT::v32i8, Custom);
+ setOperationAction(ISD::SMIN, MVT::v16i16, Custom);
+ setOperationAction(ISD::SMIN, MVT::v8i32, Custom);
+ setOperationAction(ISD::UMIN, MVT::v32i8, Custom);
+ setOperationAction(ISD::UMIN, MVT::v16i16, Custom);
+ setOperationAction(ISD::UMIN, MVT::v8i32, Custom);
+ }
+
+ // In the customized shift lowering, the legal cases in AVX2 will be
+ // recognized.
+ setOperationAction(ISD::SRL, MVT::v4i64, Custom);
+ setOperationAction(ISD::SRL, MVT::v8i32, Custom);
+
+ setOperationAction(ISD::SHL, MVT::v4i64, Custom);
+ setOperationAction(ISD::SHL, MVT::v8i32, Custom);
+
+ setOperationAction(ISD::SRA, MVT::v4i64, Custom);
+ setOperationAction(ISD::SRA, MVT::v8i32, Custom);
+
+ // Custom lower several nodes for 256-bit types.
+ for (MVT VT : MVT::vector_valuetypes()) {
+ if (VT.getScalarSizeInBits() >= 32) {
+ setOperationAction(ISD::MLOAD, VT, Legal);
+ setOperationAction(ISD::MSTORE, VT, Legal);
+ }
+ // Extract subvector is special because the value type
+ // (result) is 128-bit but the source is 256-bit wide.
+ if (VT.is128BitVector()) {
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
+ }
+ // Do not attempt to custom lower other non-256-bit vectors
+ if (!VT.is256BitVector())
+ continue;
+
+ setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+ setOperationAction(ISD::VSELECT, VT, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
+ }
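+    // E.g. extracting the upper half, (extract_subvector (v8f32 x), 4),
+    // selects to a single VEXTRACTF128 with immediate 1.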
+
+ if (Subtarget->hasInt256())
+ setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
+
+ // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
+ for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
+ setOperationAction(ISD::AND, VT, Promote);
+ AddPromotedToType (ISD::AND, VT, MVT::v4i64);
+ setOperationAction(ISD::OR, VT, Promote);
+ AddPromotedToType (ISD::OR, VT, MVT::v4i64);
+ setOperationAction(ISD::XOR, VT, Promote);
+ AddPromotedToType (ISD::XOR, VT, MVT::v4i64);
+ setOperationAction(ISD::LOAD, VT, Promote);
+ AddPromotedToType (ISD::LOAD, VT, MVT::v4i64);
+ setOperationAction(ISD::SELECT, VT, Promote);
+ AddPromotedToType (ISD::SELECT, VT, MVT::v4i64);
+ }
+ }
+
+ if (!Subtarget->useSoftFloat() && Subtarget->hasAVX512()) {
+ addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
+ addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
+ addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
+ addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
+
+ addRegisterClass(MVT::i1, &X86::VK1RegClass);
+ addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
+ addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
+
+ for (MVT VT : MVT::fp_vector_valuetypes())
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);
+
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v16i32, MVT::v16i8, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v16i32, MVT::v16i8, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v16i32, MVT::v16i16, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v16i32, MVT::v16i16, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v32i16, MVT::v32i8, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v32i16, MVT::v32i8, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i64, MVT::v8i8, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v8i64, MVT::v8i8, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i64, MVT::v8i16, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v8i64, MVT::v8i16, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i64, MVT::v8i32, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v8i64, MVT::v8i32, Legal);
+
+ setOperationAction(ISD::BR_CC, MVT::i1, Expand);
+ setOperationAction(ISD::SETCC, MVT::i1, Custom);
+ setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
+ setOperationAction(ISD::XOR, MVT::i1, Legal);
+ setOperationAction(ISD::OR, MVT::i1, Legal);
+ setOperationAction(ISD::AND, MVT::i1, Legal);
+ setOperationAction(ISD::SUB, MVT::i1, Custom);
+ setOperationAction(ISD::ADD, MVT::i1, Custom);
+ setOperationAction(ISD::MUL, MVT::i1, Custom);
+ setOperationAction(ISD::LOAD, MVT::v16f32, Legal);
+ setOperationAction(ISD::LOAD, MVT::v8f64, Legal);
+ setOperationAction(ISD::LOAD, MVT::v8i64, Legal);
+ setOperationAction(ISD::LOAD, MVT::v16i32, Legal);
+ setOperationAction(ISD::LOAD, MVT::v16i1, Legal);
+
+ setOperationAction(ISD::FADD, MVT::v16f32, Legal);
+ setOperationAction(ISD::FSUB, MVT::v16f32, Legal);
+ setOperationAction(ISD::FMUL, MVT::v16f32, Legal);
+ setOperationAction(ISD::FDIV, MVT::v16f32, Legal);
+ setOperationAction(ISD::FSQRT, MVT::v16f32, Legal);
+ setOperationAction(ISD::FNEG, MVT::v16f32, Custom);
+ setOperationAction(ISD::FABS, MVT::v16f32, Custom);
+
+ setOperationAction(ISD::FADD, MVT::v8f64, Legal);
+ setOperationAction(ISD::FSUB, MVT::v8f64, Legal);
+ setOperationAction(ISD::FMUL, MVT::v8f64, Legal);
+ setOperationAction(ISD::FDIV, MVT::v8f64, Legal);
+ setOperationAction(ISD::FSQRT, MVT::v8f64, Legal);
+ setOperationAction(ISD::FNEG, MVT::v8f64, Custom);
+ setOperationAction(ISD::FABS, MVT::v8f64, Custom);
+ setOperationAction(ISD::FMA, MVT::v8f64, Legal);
+ setOperationAction(ISD::FMA, MVT::v16f32, Legal);
+
+ setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v8i1, Custom);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v16i1, Custom);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v16i8, Promote);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v16i16, Promote);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v16i8, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v16i16, Custom);
+ setOperationAction(ISD::FP_ROUND, MVT::v8f32, Legal);
+ setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal);
+
+ setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
+ setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
+ setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
+ setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
+ setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
+    if (Subtarget->hasVLX()) {
+ setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
+ setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
+ setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
+ setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
+ setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
+
+ setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
+ setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
+ setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
+ setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
+ setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
+ } else {
+ setOperationAction(ISD::MLOAD, MVT::v8i32, Custom);
+ setOperationAction(ISD::MLOAD, MVT::v8f32, Custom);
+ setOperationAction(ISD::MSTORE, MVT::v8i32, Custom);
+ setOperationAction(ISD::MSTORE, MVT::v8f32, Custom);
+ }
+ setOperationAction(ISD::TRUNCATE, MVT::i1, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i1, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i1, Custom);
+ if (Subtarget->hasDQI()) {
+ setOperationAction(ISD::TRUNCATE, MVT::v2i1, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v4i1, Custom);
+
+ setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal);
+ setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal);
+ if (Subtarget->hasVLX()) {
+ setOperationAction(ISD::SINT_TO_FP, MVT::v4i64, Legal);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Legal);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v4i64, Legal);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Legal);
+ setOperationAction(ISD::FP_TO_SINT, MVT::v4i64, Legal);
+ setOperationAction(ISD::FP_TO_SINT, MVT::v2i64, Legal);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v4i64, Legal);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Legal);
+ }
+ }
+ if (Subtarget->hasVLX()) {
+ setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
+ setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
+ setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
+ }
+ setOperationAction(ISD::TRUNCATE, MVT::v8i1, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v16i1, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
+ setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);
+ setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v16i8, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v8i16, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom);
+ if (Subtarget->hasDQI()) {
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Custom);
+ }
+ setOperationAction(ISD::FFLOOR, MVT::v16f32, Legal);
+ setOperationAction(ISD::FFLOOR, MVT::v8f64, Legal);
+ setOperationAction(ISD::FCEIL, MVT::v16f32, Legal);
+ setOperationAction(ISD::FCEIL, MVT::v8f64, Legal);
+ setOperationAction(ISD::FTRUNC, MVT::v16f32, Legal);
+ setOperationAction(ISD::FTRUNC, MVT::v8f64, Legal);
+ setOperationAction(ISD::FRINT, MVT::v16f32, Legal);
+ setOperationAction(ISD::FRINT, MVT::v8f64, Legal);
+ setOperationAction(ISD::FNEARBYINT, MVT::v16f32, Legal);
+ setOperationAction(ISD::FNEARBYINT, MVT::v8f64, Legal);
+
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Custom);
+
+ setOperationAction(ISD::SETCC, MVT::v16i1, Custom);
+ setOperationAction(ISD::SETCC, MVT::v8i1, Custom);
+
+ setOperationAction(ISD::MUL, MVT::v8i64, Custom);
+
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16i1, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i1, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i1, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v8i1, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v16i1, Custom);
+ setOperationAction(ISD::SELECT, MVT::v8f64, Custom);
+ setOperationAction(ISD::SELECT, MVT::v8i64, Custom);
+ setOperationAction(ISD::SELECT, MVT::v16f32, Custom);
+ setOperationAction(ISD::SELECT, MVT::v16i1, Custom);
+ setOperationAction(ISD::SELECT, MVT::v8i1, Custom);
+
+ setOperationAction(ISD::SMAX, MVT::v16i32, Legal);
+ setOperationAction(ISD::SMAX, MVT::v8i64, Legal);
+ setOperationAction(ISD::UMAX, MVT::v16i32, Legal);
+ setOperationAction(ISD::UMAX, MVT::v8i64, Legal);
+ setOperationAction(ISD::SMIN, MVT::v16i32, Legal);
+ setOperationAction(ISD::SMIN, MVT::v8i64, Legal);
+ setOperationAction(ISD::UMIN, MVT::v16i32, Legal);
+ setOperationAction(ISD::UMIN, MVT::v8i64, Legal);
+
+ setOperationAction(ISD::ADD, MVT::v8i64, Legal);
+ setOperationAction(ISD::ADD, MVT::v16i32, Legal);
+
+ setOperationAction(ISD::SUB, MVT::v8i64, Legal);
+ setOperationAction(ISD::SUB, MVT::v16i32, Legal);
+
+ setOperationAction(ISD::MUL, MVT::v16i32, Legal);
+
+ setOperationAction(ISD::SRL, MVT::v8i64, Custom);
+ setOperationAction(ISD::SRL, MVT::v16i32, Custom);
+
+ setOperationAction(ISD::SHL, MVT::v8i64, Custom);
+ setOperationAction(ISD::SHL, MVT::v16i32, Custom);
+
+ setOperationAction(ISD::SRA, MVT::v8i64, Custom);
+ setOperationAction(ISD::SRA, MVT::v16i32, Custom);
+
+ setOperationAction(ISD::AND, MVT::v8i64, Legal);
+ setOperationAction(ISD::OR, MVT::v8i64, Legal);
+ setOperationAction(ISD::XOR, MVT::v8i64, Legal);
+ setOperationAction(ISD::AND, MVT::v16i32, Legal);
+ setOperationAction(ISD::OR, MVT::v16i32, Legal);
+ setOperationAction(ISD::XOR, MVT::v16i32, Legal);
+
+ if (Subtarget->hasCDI()) {
+ setOperationAction(ISD::CTLZ, MVT::v8i64, Legal);
+ setOperationAction(ISD::CTLZ, MVT::v16i32, Legal);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i64, Expand);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v16i32, Expand);
+
+ setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
+ setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
+ setOperationAction(ISD::CTLZ, MVT::v16i16, Custom);
+ setOperationAction(ISD::CTLZ, MVT::v32i8, Custom);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i16, Expand);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v16i8, Expand);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v16i16, Expand);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v32i8, Expand);
+
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i64, Custom);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i32, Custom);
+
+ if (Subtarget->hasVLX()) {
+ setOperationAction(ISD::CTLZ, MVT::v4i64, Legal);
+ setOperationAction(ISD::CTLZ, MVT::v8i32, Legal);
+ setOperationAction(ISD::CTLZ, MVT::v2i64, Legal);
+ setOperationAction(ISD::CTLZ, MVT::v4i32, Legal);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i64, Expand);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i32, Expand);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v2i64, Expand);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i32, Expand);
+
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i64, Custom);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i32, Custom);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom);
+ } else {
+ setOperationAction(ISD::CTLZ, MVT::v4i64, Custom);
+ setOperationAction(ISD::CTLZ, MVT::v8i32, Custom);
+ setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
+ setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i64, Expand);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i32, Expand);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v2i64, Expand);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i32, Expand);
+ }
+ } // Subtarget->hasCDI()
+
+ if (Subtarget->hasDQI()) {
+ setOperationAction(ISD::MUL, MVT::v2i64, Legal);
+ setOperationAction(ISD::MUL, MVT::v4i64, Legal);
+ setOperationAction(ISD::MUL, MVT::v8i64, Legal);
+ }
+ // Custom lower several nodes.
+ for (MVT VT : MVT::vector_valuetypes()) {
+ unsigned EltSize = VT.getVectorElementType().getSizeInBits();
+ if (EltSize == 1) {
+ setOperationAction(ISD::AND, VT, Legal);
+ setOperationAction(ISD::OR, VT, Legal);
+ setOperationAction(ISD::XOR, VT, Legal);
+ }
+ if ((VT.is128BitVector() || VT.is256BitVector()) && EltSize >= 32) {
+ setOperationAction(ISD::MGATHER, VT, Custom);
+ setOperationAction(ISD::MSCATTER, VT, Custom);
+ }
+ // Extract subvector is special because the value type
+ // (result) is 256/128-bit but the source is 512-bit wide.
+ if (VT.is128BitVector() || VT.is256BitVector()) {
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
+ }
+ if (VT.getVectorElementType() == MVT::i1)
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
+
+ // Do not attempt to custom lower other non-512-bit vectors
+ if (!VT.is512BitVector())
+ continue;
+
+ if (EltSize >= 32) {
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+ setOperationAction(ISD::VSELECT, VT, Legal);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
+ setOperationAction(ISD::MLOAD, VT, Legal);
+ setOperationAction(ISD::MSTORE, VT, Legal);
+ setOperationAction(ISD::MGATHER, VT, Legal);
+ setOperationAction(ISD::MSCATTER, VT, Custom);
+ }
+ }
+ for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) {
+ setOperationAction(ISD::SELECT, VT, Promote);
+ AddPromotedToType (ISD::SELECT, VT, MVT::v8i64);
+ }
+ }// has AVX-512
+
+ if (!Subtarget->useSoftFloat() && Subtarget->hasBWI()) {
+ addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
+ addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
+
+ addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
+ addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
+
+ setOperationAction(ISD::LOAD, MVT::v32i16, Legal);
+ setOperationAction(ISD::LOAD, MVT::v64i8, Legal);
+ setOperationAction(ISD::SETCC, MVT::v32i1, Custom);
+ setOperationAction(ISD::SETCC, MVT::v64i1, Custom);
+ setOperationAction(ISD::ADD, MVT::v32i16, Legal);
+ setOperationAction(ISD::ADD, MVT::v64i8, Legal);
+ setOperationAction(ISD::SUB, MVT::v32i16, Legal);
+ setOperationAction(ISD::SUB, MVT::v64i8, Legal);
+ setOperationAction(ISD::MUL, MVT::v32i16, Legal);
+ setOperationAction(ISD::MULHS, MVT::v32i16, Legal);
+ setOperationAction(ISD::MULHU, MVT::v32i16, Legal);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i16, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i8, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom);
+ setOperationAction(ISD::SELECT, MVT::v32i1, Custom);
+ setOperationAction(ISD::SELECT, MVT::v64i1, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i16, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i8, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i1, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i1, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom);
+ setOperationAction(ISD::VSELECT, MVT::v32i16, Legal);
+ setOperationAction(ISD::VSELECT, MVT::v64i8, Legal);
+ setOperationAction(ISD::TRUNCATE, MVT::v32i1, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v64i1, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i1, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i1, Custom);
+
+ setOperationAction(ISD::SMAX, MVT::v64i8, Legal);
+ setOperationAction(ISD::SMAX, MVT::v32i16, Legal);
+ setOperationAction(ISD::UMAX, MVT::v64i8, Legal);
+ setOperationAction(ISD::UMAX, MVT::v32i16, Legal);
+ setOperationAction(ISD::SMIN, MVT::v64i8, Legal);
+ setOperationAction(ISD::SMIN, MVT::v32i16, Legal);
+ setOperationAction(ISD::UMIN, MVT::v64i8, Legal);
+ setOperationAction(ISD::UMIN, MVT::v32i16, Legal);
+
+ setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
+ setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
+ if (Subtarget->hasVLX())
+ setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
+
+ if (Subtarget->hasCDI()) {
+ setOperationAction(ISD::CTLZ, MVT::v32i16, Custom);
+ setOperationAction(ISD::CTLZ, MVT::v64i8, Custom);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v32i16, Expand);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v64i8, Expand);
+ }
+
+ for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
+ setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+ setOperationAction(ISD::VSELECT, VT, Legal);
+ setOperationAction(ISD::SRL, VT, Custom);
+ setOperationAction(ISD::SHL, VT, Custom);
+ setOperationAction(ISD::SRA, VT, Custom);
+
+ setOperationAction(ISD::AND, VT, Promote);
+ AddPromotedToType (ISD::AND, VT, MVT::v8i64);
+ setOperationAction(ISD::OR, VT, Promote);
+ AddPromotedToType (ISD::OR, VT, MVT::v8i64);
+ setOperationAction(ISD::XOR, VT, Promote);
+ AddPromotedToType (ISD::XOR, VT, MVT::v8i64);
+ }
+ }
+
+ if (!Subtarget->useSoftFloat() && Subtarget->hasVLX()) {
+ addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
+ addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
+
+ setOperationAction(ISD::SETCC, MVT::v4i1, Custom);
+ setOperationAction(ISD::SETCC, MVT::v2i1, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i1, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i1, Custom);
+ setOperationAction(ISD::SELECT, MVT::v4i1, Custom);
+ setOperationAction(ISD::SELECT, MVT::v2i1, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v4i1, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v2i1, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i1, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i1, Custom);
+
+ setOperationAction(ISD::AND, MVT::v8i32, Legal);
+ setOperationAction(ISD::OR, MVT::v8i32, Legal);
+ setOperationAction(ISD::XOR, MVT::v8i32, Legal);
+ setOperationAction(ISD::AND, MVT::v4i32, Legal);
+ setOperationAction(ISD::OR, MVT::v4i32, Legal);
+ setOperationAction(ISD::XOR, MVT::v4i32, Legal);
+ setOperationAction(ISD::SRA, MVT::v2i64, Custom);
+ setOperationAction(ISD::SRA, MVT::v4i64, Custom);
+
+ setOperationAction(ISD::SMAX, MVT::v2i64, Legal);
+ setOperationAction(ISD::SMAX, MVT::v4i64, Legal);
+ setOperationAction(ISD::UMAX, MVT::v2i64, Legal);
+ setOperationAction(ISD::UMAX, MVT::v4i64, Legal);
+ setOperationAction(ISD::SMIN, MVT::v2i64, Legal);
+ setOperationAction(ISD::SMIN, MVT::v4i64, Legal);
+ setOperationAction(ISD::UMIN, MVT::v2i64, Legal);
+ setOperationAction(ISD::UMIN, MVT::v4i64, Legal);
+ }
+
+ // We want to custom lower some of our intrinsics.
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
+ setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
+ if (!Subtarget->is64Bit()) {
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
+ }
+
+ // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
+ // handle type legalization for these operations here.
+ //
+ // FIXME: We really should do custom legalization for addition and
+ // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
+ // than generic legalization for 64-bit multiplication-with-overflow, though.
+ for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
+ if (VT == MVT::i64 && !Subtarget->is64Bit())
+ continue;
+ // Add/Sub/Mul with overflow operations are custom lowered.
+ setOperationAction(ISD::SADDO, VT, Custom);
+ setOperationAction(ISD::UADDO, VT, Custom);
+ setOperationAction(ISD::SSUBO, VT, Custom);
+ setOperationAction(ISD::USUBO, VT, Custom);
+ setOperationAction(ISD::SMULO, VT, Custom);
+ setOperationAction(ISD::UMULO, VT, Custom);
+ }
+
+ if (!Subtarget->is64Bit()) {
+ // These libcalls are not available in 32-bit.
+ setLibcallName(RTLIB::SHL_I128, nullptr);
+ setLibcallName(RTLIB::SRL_I128, nullptr);
+ setLibcallName(RTLIB::SRA_I128, nullptr);
+ }
+
+ // Combine sin / cos into one node or libcall if possible.
+ if (Subtarget->hasSinCos()) {
+ setLibcallName(RTLIB::SINCOS_F32, "sincosf");
+ setLibcallName(RTLIB::SINCOS_F64, "sincos");
+ if (Subtarget->isTargetDarwin()) {
+ // For MacOSX, we don't want the normal expansion of a libcall to sincos.
+ // We want to issue a libcall to __sincos_stret to avoid memory traffic.
+ setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
+ setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
+ }
+ }
+
+ if (Subtarget->isTargetWin64()) {
+ setOperationAction(ISD::SDIV, MVT::i128, Custom);
+ setOperationAction(ISD::UDIV, MVT::i128, Custom);
+ setOperationAction(ISD::SREM, MVT::i128, Custom);
+ setOperationAction(ISD::UREM, MVT::i128, Custom);
+ setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
+ setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
+ }
+
+ // We have target-specific dag combine patterns for the following nodes:
+ setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
+ setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
+ setTargetDAGCombine(ISD::BITCAST);
+ setTargetDAGCombine(ISD::VSELECT);
+ setTargetDAGCombine(ISD::SELECT);
+ setTargetDAGCombine(ISD::SHL);
+ setTargetDAGCombine(ISD::SRA);
+ setTargetDAGCombine(ISD::SRL);
+ setTargetDAGCombine(ISD::OR);
+ setTargetDAGCombine(ISD::AND);
+ setTargetDAGCombine(ISD::ADD);
+ setTargetDAGCombine(ISD::FADD);
+ setTargetDAGCombine(ISD::FSUB);
+ setTargetDAGCombine(ISD::FNEG);
+ setTargetDAGCombine(ISD::FMA);
+ setTargetDAGCombine(ISD::FMINNUM);
+ setTargetDAGCombine(ISD::FMAXNUM);
+ setTargetDAGCombine(ISD::SUB);
+ setTargetDAGCombine(ISD::LOAD);
+ setTargetDAGCombine(ISD::MLOAD);
+ setTargetDAGCombine(ISD::STORE);
+ setTargetDAGCombine(ISD::MSTORE);
+ setTargetDAGCombine(ISD::TRUNCATE);
+ setTargetDAGCombine(ISD::ZERO_EXTEND);
+ setTargetDAGCombine(ISD::ANY_EXTEND);
+ setTargetDAGCombine(ISD::SIGN_EXTEND);
+ setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
+ setTargetDAGCombine(ISD::SINT_TO_FP);
+ setTargetDAGCombine(ISD::UINT_TO_FP);
+ setTargetDAGCombine(ISD::SETCC);
+ setTargetDAGCombine(ISD::BUILD_VECTOR);
+ setTargetDAGCombine(ISD::MUL);
+ setTargetDAGCombine(ISD::XOR);
+ setTargetDAGCombine(ISD::MSCATTER);
+ setTargetDAGCombine(ISD::MGATHER);
+
+ computeRegisterProperties(Subtarget->getRegisterInfo());
+
+ MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
+ MaxStoresPerMemsetOptSize = 8;
+ MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
+ MaxStoresPerMemcpyOptSize = 4;
+ MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
+ MaxStoresPerMemmoveOptSize = 4;
+ setPrefLoopAlignment(4); // 2^4 bytes.
+
+ // A predictable cmov does not hurt on an in-order CPU.
+ // FIXME: Use a CPU attribute to trigger this, not a CPU model.
+ PredictableSelectIsExpensive = !Subtarget->isAtom();
+ EnableExtLdPromotion = true;
+ setPrefFunctionAlignment(4); // 2^4 bytes.
+
+ verifyIntrinsicTables();
+}
+
+// This has so far only been implemented for 64-bit MachO.
+bool X86TargetLowering::useLoadStackGuardNode() const {
+ return Subtarget->isTargetMachO() && Subtarget->is64Bit();
+}
+
+TargetLoweringBase::LegalizeTypeAction
+X86TargetLowering::getPreferredVectorAction(EVT VT) const {
+ if (ExperimentalVectorWideningLegalization &&
+ VT.getVectorNumElements() != 1 &&
+ VT.getVectorElementType().getSimpleVT() != MVT::i1)
+ return TypeWidenVector;
+
+ return TargetLoweringBase::getPreferredVectorAction(VT);
+}
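+// Illustration: with widening legalization enabled, an illegal type such as
+// v3i32 would be widened to the next legal vector (v4i32) rather than split
+// or scalarized; single-element and i1-element vectors keep the default
+// TargetLoweringBase policy.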
+
+EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
+ EVT VT) const {
+ if (!VT.isVector())
+ return Subtarget->hasAVX512() ? MVT::i1: MVT::i8;
+
+ if (VT.isSimple()) {
+ MVT VVT = VT.getSimpleVT();
+ const unsigned NumElts = VVT.getVectorNumElements();
+ const MVT EltVT = VVT.getVectorElementType();
+ if (VVT.is512BitVector()) {
+ if (Subtarget->hasAVX512())
+ if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
+ EltVT == MVT::f32 || EltVT == MVT::f64)
+ switch(NumElts) {
+ case 8: return MVT::v8i1;
+ case 16: return MVT::v16i1;
+ }
+ if (Subtarget->hasBWI())
+ if (EltVT == MVT::i8 || EltVT == MVT::i16)
+ switch(NumElts) {
+ case 32: return MVT::v32i1;
+ case 64: return MVT::v64i1;
+ }
+ }
+
+ if (VVT.is256BitVector() || VVT.is128BitVector()) {
+ if (Subtarget->hasVLX())
+ if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
+ EltVT == MVT::f32 || EltVT == MVT::f64)
+ switch(NumElts) {
+ case 2: return MVT::v2i1;
+ case 4: return MVT::v4i1;
+ case 8: return MVT::v8i1;
+ }
+ if (Subtarget->hasBWI() && Subtarget->hasVLX())
+ if (EltVT == MVT::i8 || EltVT == MVT::i16)
+ switch(NumElts) {
+ case 8: return MVT::v8i1;
+ case 16: return MVT::v16i1;
+ case 32: return MVT::v32i1;
+ }
+ }
+ }
+
+ return VT.changeVectorElementTypeToInteger();
+}
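+// Illustration of the resulting SETCC types (assuming the listed features):
+//   scalar i64 with AVX-512          -> i1
+//   v16f32 with AVX-512 (512-bit)    -> v16i1 (a mask register)
+//   v4f32 without VLX                -> v4i32 (element-wise integer mask)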
+
+/// Helper for getByValTypeAlignment to determine
+/// the desired ByVal argument alignment.
+static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
+ if (MaxAlign == 16)
+ return;
+ if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
+ if (VTy->getBitWidth() == 128)
+ MaxAlign = 16;
+ } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
+ unsigned EltAlign = 0;
+ getMaxByValAlign(ATy->getElementType(), EltAlign);
+ if (EltAlign > MaxAlign)
+ MaxAlign = EltAlign;
+ } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
+ for (auto *EltTy : STy->elements()) {
+ unsigned EltAlign = 0;
+ getMaxByValAlign(EltTy, EltAlign);
+ if (EltAlign > MaxAlign)
+ MaxAlign = EltAlign;
+ if (MaxAlign == 16)
+ break;
+ }
+ }
+}
+
+/// Return the desired alignment for ByVal aggregate
+/// function arguments in the caller parameter area. For X86, aggregates
+/// that contain SSE vectors are placed at 16-byte boundaries while the rest
+/// are at 4-byte boundaries.
+unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
+ const DataLayout &DL) const {
+ if (Subtarget->is64Bit()) {
+ // Max of 8 and alignment of type.
+ unsigned TyAlign = DL.getABITypeAlignment(Ty);
+ if (TyAlign > 8)
+ return TyAlign;
+ return 8;
+ }
+
+ unsigned Align = 4;
+ if (Subtarget->hasSSE1())
+ getMaxByValAlign(Ty, Align);
+ return Align;
+}
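+// Illustration with a hypothetical type:
+//   struct S { __m128 V; int I; };   // contains a 128-bit SSE vector
+// On x86-32 with SSE1, getMaxByValAlign finds the vector member, so the
+// byval alignment becomes 16; without SSE it stays 4. On x86-64 the result
+// is max(8, ABI alignment of S).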
+
+/// Returns the target-specific optimal type for load
+/// and store operations as a result of memset, memcpy, and memmove
+/// lowering. If DstAlign is zero, the destination alignment can satisfy any
+/// constraint. Similarly, if SrcAlign is zero there is no need to check it
+/// against an alignment requirement, probably because the source does not
+/// need to be loaded. If 'IsMemset' is true, this is expanding a memset; if
+/// 'ZeroMemset' is also true, it is a memset of zero. 'MemcpyStrSrc'
+/// indicates whether the memcpy source is constant, so it does not need to
+/// be loaded.
+/// It returns EVT::Other if the type should be determined using generic
+/// target-independent logic.
+EVT
+X86TargetLowering::getOptimalMemOpType(uint64_t Size,
+ unsigned DstAlign, unsigned SrcAlign,
+ bool IsMemset, bool ZeroMemset,
+ bool MemcpyStrSrc,
+ MachineFunction &MF) const {
+ const Function *F = MF.getFunction();
+ if ((!IsMemset || ZeroMemset) &&
+ !F->hasFnAttribute(Attribute::NoImplicitFloat)) {
+ if (Size >= 16 &&
+ (!Subtarget->isUnalignedMem16Slow() ||
+ ((DstAlign == 0 || DstAlign >= 16) &&
+ (SrcAlign == 0 || SrcAlign >= 16)))) {
+ if (Size >= 32) {
+ // FIXME: Check if unaligned 32-byte accesses are slow.
+ if (Subtarget->hasInt256())
+ return MVT::v8i32;
+ if (Subtarget->hasFp256())
+ return MVT::v8f32;
+ }
+ if (Subtarget->hasSSE2())
+ return MVT::v4i32;
+ if (Subtarget->hasSSE1())
+ return MVT::v4f32;
+ } else if (!MemcpyStrSrc && Size >= 8 &&
+ !Subtarget->is64Bit() &&
+ Subtarget->hasSSE2()) {
+ // Do not use f64 to lower memcpy if source is string constant. It's
+ // better to use i32 to avoid the loads.
+ return MVT::f64;
+ }
+ }
+ // This is a compromise. If we reach here, unaligned accesses may be slow on
+ // this target. However, creating smaller, aligned accesses could be even
+ // slower and would certainly be a lot more code.
+ if (Subtarget->is64Bit() && Size >= 8)
+ return MVT::i64;
+ return MVT::i32;
+}
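+// Illustration (assumed subtargets): a 64-byte zero-memset with fast
+// unaligned 16-byte accesses would return v8i32 given AVX2 and v4i32 given
+// only SSE2; on a target where unaligned vector accesses are slow it falls
+// back to i64 (64-bit) or i32 stores.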
+
+bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
+ if (VT == MVT::f32)
+ return X86ScalarSSEf32;
+ else if (VT == MVT::f64)
+ return X86ScalarSSEf64;
+ return true;
+}
+
+bool
+X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
+ unsigned,
+ unsigned,
+ bool *Fast) const {
+ if (Fast) {
+ switch (VT.getSizeInBits()) {
+ default:
+ // 8-byte and under are always assumed to be fast.
+ *Fast = true;
+ break;
+ case 128:
+ *Fast = !Subtarget->isUnalignedMem16Slow();
+ break;
+ case 256:
+ *Fast = !Subtarget->isUnalignedMem32Slow();
+ break;
+ // TODO: What about AVX-512 (512-bit) accesses?
+ }
+ }
+ // Misaligned accesses of any size are always allowed.
+ return true;
+}
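+// Illustration: a misaligned 256-bit load is always *allowed* here; the
+// out-parameter merely reports whether it is expected to be fast, e.g.
+// *Fast == !isUnalignedMem32Slow() for 256-bit accesses.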
+
+/// Return the entry encoding for a jump table in the
+/// current function. The returned value is a member of the
+/// MachineJumpTableInfo::JTEntryKind enum.
+unsigned X86TargetLowering::getJumpTableEncoding() const {
+ // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
+ // symbol.
+ if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
+ Subtarget->isPICStyleGOT())
+ return MachineJumpTableInfo::EK_Custom32;
+
+ // Otherwise, use the normal jump table encoding heuristics.
+ return TargetLowering::getJumpTableEncoding();
+}
+
+bool X86TargetLowering::useSoftFloat() const {
+ return Subtarget->useSoftFloat();
+}
+
+const MCExpr *
+X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
+ const MachineBasicBlock *MBB,
+ unsigned uid,MCContext &Ctx) const{
+ assert(MBB->getParent()->getTarget().getRelocationModel() == Reloc::PIC_ &&
+ Subtarget->isPICStyleGOT());
+ // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
+ // entries.
+ return MCSymbolRefExpr::create(MBB->getSymbol(),
+ MCSymbolRefExpr::VK_GOTOFF, Ctx);
+}
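+// Illustration (hypothetical label): in GOT PIC mode each 32-bit jump table
+// entry is emitted as
+//   .long .LBB0_2@GOTOFF
+// i.e. the basic block symbol's offset relative to the GOT base.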
+
+/// Returns relocation base for the given PIC jumptable.
+SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
+ SelectionDAG &DAG) const {
+ if (!Subtarget->is64Bit())
+ // This doesn't have SDLoc associated with it, but is not really the
+ // same as a Register.
+ return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
+ getPointerTy(DAG.getDataLayout()));
+ return Table;
+}
+
+/// This returns the relocation base for the given PIC jumptable,
+/// the same as getPICJumpTableRelocBase, but as an MCExpr.
+const MCExpr *X86TargetLowering::
+getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
+ MCContext &Ctx) const {
+ // X86-64 uses RIP relative addressing based on the jump table label.
+ if (Subtarget->isPICStyleRIPRel())
+ return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
+
+ // Otherwise, the reference is relative to the PIC base.
+ return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
+}
+
+std::pair<const TargetRegisterClass *, uint8_t>
+X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
+ MVT VT) const {
+ const TargetRegisterClass *RRC = nullptr;
+ uint8_t Cost = 1;
+ switch (VT.SimpleTy) {
+ default:
+ return TargetLowering::findRepresentativeClass(TRI, VT);
+ case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
+ RRC = Subtarget->is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
+ break;
+ case MVT::x86mmx:
+ RRC = &X86::VR64RegClass;
+ break;
+ case MVT::f32: case MVT::f64:
+ case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
+ case MVT::v4f32: case MVT::v2f64:
+ case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32:
+ case MVT::v4f64:
+ RRC = &X86::VR128RegClass;
+ break;
+ }
+ return std::make_pair(RRC, Cost);
+}
+
+bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
+ unsigned &Offset) const {
+ if (!Subtarget->isTargetLinux())
+ return false;
+
+ if (Subtarget->is64Bit()) {
+    // %fs:0x28, unless we're using a Kernel code model, in which case it's
+    // %gs:0x28.
+ Offset = 0x28;
+ if (getTargetMachine().getCodeModel() == CodeModel::Kernel)
+ AddressSpace = 256;
+ else
+ AddressSpace = 257;
+ } else {
+ // %gs:0x14 on i386
+ Offset = 0x14;
+ AddressSpace = 256;
+ }
+ return true;
+}
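+// Illustration (assumed Linux targets): address space 257 maps to %fs and
+// 256 to %gs, so the stack cookie load would come out as
+//   movq %fs:0x28, %rax    # x86-64, default code model
+//   movq %gs:0x28, %rax    # x86-64, kernel code model
+//   movl %gs:0x14, %eax    # i386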
+
+Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
+ if (!Subtarget->isTargetAndroid())
+ return TargetLowering::getSafeStackPointerLocation(IRB);
+
+ // Android provides a fixed TLS slot for the SafeStack pointer. See the
+ // definition of TLS_SLOT_SAFESTACK in
+ // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
+ unsigned AddressSpace, Offset;
+ if (Subtarget->is64Bit()) {
+    // %fs:0x48, unless we're using a Kernel code model, in which case it's
+    // %gs:0x48.
+ Offset = 0x48;
+ if (getTargetMachine().getCodeModel() == CodeModel::Kernel)
+ AddressSpace = 256;
+ else
+ AddressSpace = 257;
+ } else {
+ // %gs:0x24 on i386
+ Offset = 0x24;
+ AddressSpace = 256;
+ }
+
+ return ConstantExpr::getIntToPtr(
+ ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
+ Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
+}
+
+bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
+ unsigned DestAS) const {
+ assert(SrcAS != DestAS && "Expected different address spaces!");
+
+ return SrcAS < 256 && DestAS < 256;
+}
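+// Illustration: an addrspacecast between two ordinary address spaces (both
+// < 256) is a no-op because the pointer representation is identical; casts
+// involving the segment-relative spaces 256 (%gs) and 257 (%fs) are not.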
+
+//===----------------------------------------------------------------------===//
+// Return Value Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+#include "X86GenCallingConv.inc"
+
+bool X86TargetLowering::CanLowerReturn(
+ CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
+ SmallVector<CCValAssign, 16> RVLocs;
+ CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
+ return CCInfo.CheckReturn(Outs, RetCC_X86);
+}
+
+const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
+ static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
+ return ScratchRegs;
+}
+
+SDValue
+X86TargetLowering::LowerReturn(SDValue Chain,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ SDLoc dl, SelectionDAG &DAG) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+
+ if (CallConv == CallingConv::X86_INTR && !Outs.empty())
+ report_fatal_error("X86 interrupts may not return any value");
+
+ SmallVector<CCValAssign, 16> RVLocs;
+ CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
+ CCInfo.AnalyzeReturn(Outs, RetCC_X86);
+
+ SDValue Flag;
+ SmallVector<SDValue, 6> RetOps;
+ RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
+ // Operand #1 = Bytes To Pop
+ RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
+ MVT::i16));
+
+ // Copy the result values into the output registers.
+ for (unsigned i = 0; i != RVLocs.size(); ++i) {
+ CCValAssign &VA = RVLocs[i];
+ assert(VA.isRegLoc() && "Can only return in registers!");
+ SDValue ValToCopy = OutVals[i];
+ EVT ValVT = ValToCopy.getValueType();
+
+ // Promote values to the appropriate types.
+ if (VA.getLocInfo() == CCValAssign::SExt)
+ ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
+ else if (VA.getLocInfo() == CCValAssign::ZExt)
+ ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
+ else if (VA.getLocInfo() == CCValAssign::AExt) {
+ if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
+ ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
+ else
+ ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
+ }
+ else if (VA.getLocInfo() == CCValAssign::BCvt)
+ ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
+
+ assert(VA.getLocInfo() != CCValAssign::FPExt &&
+ "Unexpected FP-extend for return value.");
+
+ // If this is x86-64, and we disabled SSE, we can't return FP values,
+ // or SSE or MMX vectors.
+ if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
+ VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
+ (Subtarget->is64Bit() && !Subtarget->hasSSE1())) {
+ report_fatal_error("SSE register return with SSE disabled");
+ }
+ // Likewise we can't return F64 values with SSE1 only. gcc does so, but
+ // llvm-gcc has never done it right and no one has noticed, so this
+ // should be OK for now.
+ if (ValVT == MVT::f64 &&
+ (Subtarget->is64Bit() && !Subtarget->hasSSE2()))
+ report_fatal_error("SSE2 register return with SSE2 disabled");
+
+ // Returns in ST0/ST1 are handled specially: these are pushed as operands to
+ // the RET instruction and handled by the FP Stackifier.
+ if (VA.getLocReg() == X86::FP0 ||
+ VA.getLocReg() == X86::FP1) {
+ // If this is a copy from an xmm register to ST(0), use an FPExtend to
+ // change the value to the FP stack register class.
+ if (isScalarFPTypeInSSEReg(VA.getValVT()))
+ ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
+ RetOps.push_back(ValToCopy);
+ // Don't emit a copytoreg.
+ continue;
+ }
+
+ // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
+ // which is returned in RAX / RDX.
+ if (Subtarget->is64Bit()) {
+ if (ValVT == MVT::x86mmx) {
+ if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
+ ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
+ ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
+ ValToCopy);
+ // If we don't have SSE2 available, convert to v4f32 so the generated
+ // register is legal.
+ if (!Subtarget->hasSSE2())
+ ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
+ }
+ }
+ }
+
+ Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
+ Flag = Chain.getValue(1);
+ RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+ }
+
+ // All x86 ABIs require that for returning structs by value we copy
+ // the sret argument into %rax/%eax (depending on ABI) for the return.
+ // We saved the argument into a virtual register in the entry block,
+ // so now we copy the value out and into %rax/%eax.
+ //
+ // Checking Function.hasStructRetAttr() here is insufficient because the IR
+ // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
+ // false, then an sret argument may be implicitly inserted in the SelDAG. In
+ // either case FuncInfo->setSRetReturnReg() will have been called.
+ if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
+ SDValue Val = DAG.getCopyFromReg(Chain, dl, SRetReg,
+ getPointerTy(MF.getDataLayout()));
+
+ unsigned RetValReg
+ = (Subtarget->is64Bit() && !Subtarget->isTarget64BitILP32()) ?
+ X86::RAX : X86::EAX;
+ Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
+ Flag = Chain.getValue(1);
+
+ // RAX/EAX now acts like a return value.
+ RetOps.push_back(
+ DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
+ }
+
+ RetOps[0] = Chain; // Update chain.
+
+ // Add the flag if we have it.
+ if (Flag.getNode())
+ RetOps.push_back(Flag);
+
+ X86ISD::NodeType opcode = X86ISD::RET_FLAG;
+ if (CallConv == CallingConv::X86_INTR)
+ opcode = X86ISD::IRET;
+ return DAG.getNode(opcode, dl, MVT::Other, RetOps);
+}
+
+bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
+ if (N->getNumValues() != 1)
+ return false;
+ if (!N->hasNUsesOfValue(1, 0))
+ return false;
+
+ SDValue TCChain = Chain;
+ SDNode *Copy = *N->use_begin();
+ if (Copy->getOpcode() == ISD::CopyToReg) {
+ // If the copy has a glue operand, we conservatively assume it isn't safe to
+ // perform a tail call.
+ if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
+ return false;
+ TCChain = Copy->getOperand(0);
+ } else if (Copy->getOpcode() != ISD::FP_EXTEND)
+ return false;
+
+ bool HasRet = false;
+ for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
+ UI != UE; ++UI) {
+ if (UI->getOpcode() != X86ISD::RET_FLAG)
+ return false;
+    // If we are returning more than one value, we can definitely
+    // not make a tail call; see PR19530.
+ if (UI->getNumOperands() > 4)
+ return false;
+ if (UI->getNumOperands() == 4 &&
+ UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
+ return false;
+ HasRet = true;
+ }
+
+ if (!HasRet)
+ return false;
+
+ Chain = TCChain;
+ return true;
+}
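+// Illustration (hypothetical function): for
+//   int f(int x, int y) { return x + y; }
+// the ADD node has a single use, a CopyToReg whose only user is an
+// X86ISD::RET_FLAG, so the value is considered "used by return only" and the
+// chain may be rewired for a tail-call-style return.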
+
+EVT
+X86TargetLowering::getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT,
+ ISD::NodeType ExtendKind) const {
+ MVT ReturnMVT;
+ // TODO: Is this also valid on 32-bit?
+ if (Subtarget->is64Bit() && VT == MVT::i1 && ExtendKind == ISD::ZERO_EXTEND)
+ ReturnMVT = MVT::i8;
+ else
+ ReturnMVT = MVT::i32;
+
+ EVT MinVT = getRegisterType(Context, ReturnMVT);
+ return VT.bitsLT(MinVT) ? MinVT : VT;
+}
+
+/// Lower the result values of a call into the
+/// appropriate copies out of appropriate physical registers.
+///
+SDValue
+X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ SDLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const {
+
+ // Assign locations to each value returned by this call.
+ SmallVector<CCValAssign, 16> RVLocs;
+ bool Is64Bit = Subtarget->is64Bit();
+ CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
+ *DAG.getContext());
+ CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
+
+ // Copy all of the result registers out of their specified physreg.
+ for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
+ CCValAssign &VA = RVLocs[i];
+ EVT CopyVT = VA.getLocVT();
+
+ // If this is x86-64, and we disabled SSE, we can't return FP values
+ if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) &&
+ ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
+ report_fatal_error("SSE register return with SSE disabled");
+ }
+
+ // If we prefer to use the value in xmm registers, copy it out as f80 and
+ // use a truncate to move it from fp stack reg to xmm reg.
+ bool RoundAfterCopy = false;
+ if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
+ isScalarFPTypeInSSEReg(VA.getValVT())) {
+ CopyVT = MVT::f80;
+ RoundAfterCopy = (CopyVT != VA.getLocVT());
+ }
+
+ Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
+ CopyVT, InFlag).getValue(1);
+ SDValue Val = Chain.getValue(0);
+
+ if (RoundAfterCopy)
+ Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
+ // This truncation won't change the value.
+ DAG.getIntPtrConstant(1, dl));
+
+ if (VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1)
+ Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
+
+ InFlag = Chain.getValue(2);
+ InVals.push_back(Val);
+ }
+
+ return Chain;
+}
+
+//===----------------------------------------------------------------------===//
+// C & StdCall & Fast Calling Convention implementation
+//===----------------------------------------------------------------------===//
+// The StdCall calling convention is the standard for many Windows API
+// routines. It differs from the C calling convention only slightly: the
+// callee cleans up the stack rather than the caller, and symbols are
+// decorated in a particular way. It doesn't support any vector arguments.
+// For info on the fast calling convention, see the Fast Calling Convention
+// (tail call) implementation in LowerX86_32FastCCCallTo.
+
+/// Determines whether a call uses struct return semantics.
+enum StructReturnType {
+ NotStructReturn,
+ RegStructReturn,
+ StackStructReturn
+};
+static StructReturnType
+callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs, bool IsMCU) {
+ if (Outs.empty())
+ return NotStructReturn;
+
+ const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
+ if (!Flags.isSRet())
+ return NotStructReturn;
+ if (Flags.isInReg() || IsMCU)
+ return RegStructReturn;
+ return StackStructReturn;
+}
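+// Illustration (hypothetical IR):
+//   call void @g(%struct.S* sret %tmp)        -> StackStructReturn
+//   call void @g(%struct.S* inreg sret %tmp)  -> RegStructReturn
+// A call whose first argument lacks the sret attribute is NotStructReturn.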
+
+/// Determines whether a function uses struct return semantics.
+static StructReturnType
+argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins, bool IsMCU) {
+ if (Ins.empty())
+ return NotStructReturn;
+
+ const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
+ if (!Flags.isSRet())
+ return NotStructReturn;
+ if (Flags.isInReg() || IsMCU)
+ return RegStructReturn;
+ return StackStructReturn;
+}
+
+/// Make a copy of an aggregate at the address specified by "Src" to the
+/// address "Dst" with size and alignment information specified by the
+/// specific parameter attribute. The copy will be passed as a byval function
+/// parameter.
+static SDValue
+CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
+ ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
+ SDLoc dl) {
+ SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
+
+ return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
+ /*isVolatile*/false, /*AlwaysInline=*/true,
+ /*isTailCall*/false,
+ MachinePointerInfo(), MachinePointerInfo());
+}
+
+/// Return true if the calling convention is one that we can guarantee TCO for.
+static bool canGuaranteeTCO(CallingConv::ID CC) {
+ return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
+ CC == CallingConv::HiPE || CC == CallingConv::HHVM);
+}
+
+/// Return true if we might ever do TCO for calls with this calling convention.
+static bool mayTailCallThisCC(CallingConv::ID CC) {
+ switch (CC) {
+ // C calling conventions:
+ case CallingConv::C:
+ case CallingConv::X86_64_Win64:
+ case CallingConv::X86_64_SysV:
+ // Callee pop conventions:
+ case CallingConv::X86_ThisCall:
+ case CallingConv::X86_StdCall:
+ case CallingConv::X86_VectorCall:
+ case CallingConv::X86_FastCall:
+ return true;
+ default:
+ return canGuaranteeTCO(CC);
+ }
+}
+
+/// Return true if the function is being made into a tailcall target by
+/// changing its ABI.
+static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
+ return GuaranteedTailCallOpt && canGuaranteeTCO(CC);
+}
+
+bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
+ auto Attr =
+ CI->getParent()->getParent()->getFnAttribute("disable-tail-calls");
+ if (!CI->isTailCall() || Attr.getValueAsString() == "true")
+ return false;
+
+ CallSite CS(CI);
+ CallingConv::ID CalleeCC = CS.getCallingConv();
+ if (!mayTailCallThisCC(CalleeCC))
+ return false;
+
+ return true;
+}
+
+SDValue
+X86TargetLowering::LowerMemArgument(SDValue Chain,
+ CallingConv::ID CallConv,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ SDLoc dl, SelectionDAG &DAG,
+ const CCValAssign &VA,
+ MachineFrameInfo *MFI,
+ unsigned i) const {
+ // Create the nodes corresponding to a load from this parameter slot.
+ ISD::ArgFlagsTy Flags = Ins[i].Flags;
+ bool AlwaysUseMutable = shouldGuaranteeTCO(
+ CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
+ bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
+ EVT ValVT;
+
+ // If value is passed by pointer we have address passed instead of the value
+ // itself.
+ bool ExtendedInMem = VA.isExtInLoc() &&
+ VA.getValVT().getScalarType() == MVT::i1;
+
+ if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
+ ValVT = VA.getLocVT();
+ else
+ ValVT = VA.getValVT();
+
+  // Calculate the SP offset of an interrupt parameter; it re-uses the slot
+  // normally taken by the return address.
+ int Offset = 0;
+ if (CallConv == CallingConv::X86_INTR) {
+ const X86Subtarget& Subtarget =
+ static_cast<const X86Subtarget&>(DAG.getSubtarget());
+    // X86 interrupts may take one or two arguments.
+    // On the stack there is no return address as in a regular call, so the
+    // offset of the last argument must be set to -4/-8 bytes while the
+    // offset of the first argument (when there are two) is 0 bytes; e.g.
+    // with two arguments on x86-64, i == 0 yields offset 0 and i == 1
+    // yields offset -8.
+ Offset = (Subtarget.is64Bit() ? 8 : 4) * ((i + 1) % Ins.size() - 1);
+ }
+
+  // FIXME: For now, all byval parameter objects are marked mutable. This can
+  // be changed with more analysis.
+  // In case of tail call optimization, mark all arguments mutable, since
+  // they could be overwritten by the lowering of arguments in a tail call.
+ if (Flags.isByVal()) {
+ unsigned Bytes = Flags.getByValSize();
+ if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
+ int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
+ // Adjust SP offset of interrupt parameter.
+ if (CallConv == CallingConv::X86_INTR) {
+ MFI->setObjectOffset(FI, Offset);
+ }
+ return DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
+ } else {
+ int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
+ VA.getLocMemOffset(), isImmutable);
+ // Adjust SP offset of interrupt parameter.
+ if (CallConv == CallingConv::X86_INTR) {
+ MFI->setObjectOffset(FI, Offset);
+ }
+
+ SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
+ SDValue Val = DAG.getLoad(
+ ValVT, dl, Chain, FIN,
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), false,
+ false, false, 0);
+ return ExtendedInMem ?
+ DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val) : Val;
+ }
+}
+
+// FIXME: Get this from tablegen.
+static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
+ const X86Subtarget *Subtarget) {
+ assert(Subtarget->is64Bit());
+
+ if (Subtarget->isCallingConvWin64(CallConv)) {
+ static const MCPhysReg GPR64ArgRegsWin64[] = {
+ X86::RCX, X86::RDX, X86::R8, X86::R9
+ };
+ return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
+ }
+
+ static const MCPhysReg GPR64ArgRegs64Bit[] = {
+ X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
+ };
+ return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
+}
+
+// FIXME: Get this from tablegen.
+static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
+ CallingConv::ID CallConv,
+ const X86Subtarget *Subtarget) {
+ assert(Subtarget->is64Bit());
+ if (Subtarget->isCallingConvWin64(CallConv)) {
+    // The XMM registers which might contain var arg parameters are shadowed
+    // by their paired GPRs, so we only need to save the GPRs to their home
+    // slots.
+ // TODO: __vectorcall will change this.
+ return None;
+ }
+
+ const Function *Fn = MF.getFunction();
+ bool NoImplicitFloatOps = Fn->hasFnAttribute(Attribute::NoImplicitFloat);
+ bool isSoftFloat = Subtarget->useSoftFloat();
+ assert(!(isSoftFloat && NoImplicitFloatOps) &&
+ "SSE register cannot be used when SSE is disabled!");
+ if (isSoftFloat || NoImplicitFloatOps || !Subtarget->hasSSE1())
+ // Kernel mode asks for SSE to be disabled, so there are no XMM argument
+ // registers.
+ return None;
+
+ static const MCPhysReg XMMArgRegs64Bit[] = {
+ X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
+ X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
+ };
+ return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
+}
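+// Illustration: on SysV x86-64 the register save area created below holds
+// all 6 argument GPRs and 8 argument XMMs, 6 * 8 + 8 * 16 = 176 bytes, with
+// gp_offset = NumIntRegs * 8 and fp_offset = 48 + NumXMMRegs * 16 as the
+// initial va_list offsets.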
+
+SDValue X86TargetLowering::LowerFormalArguments(
+ SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+ const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
+
+ const Function* Fn = MF.getFunction();
+ if (Fn->hasExternalLinkage() &&
+ Subtarget->isTargetCygMing() &&
+ Fn->getName() == "main")
+ FuncInfo->setForceFramePointer(true);
+
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ bool Is64Bit = Subtarget->is64Bit();
+ bool IsWin64 = Subtarget->isCallingConvWin64(CallConv);
+
+ assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
+ "Var args not supported with calling convention fastcc, ghc or hipe");
+
+ if (CallConv == CallingConv::X86_INTR) {
+ bool isLegal = Ins.size() == 1 ||
+ (Ins.size() == 2 && ((Is64Bit && Ins[1].VT == MVT::i64) ||
+ (!Is64Bit && Ins[1].VT == MVT::i32)));
+ if (!isLegal)
+ report_fatal_error("X86 interrupts may take one or two arguments");
+ }
+
+ // Assign locations to all of the incoming arguments.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
+
+ // Allocate shadow area for Win64
+ if (IsWin64)
+ CCInfo.AllocateStack(32, 8);
+
+ CCInfo.AnalyzeFormalArguments(Ins, CC_X86);
+
+ unsigned LastVal = ~0U;
+ SDValue ArgValue;
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+ // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
+ // places.
+ assert(VA.getValNo() != LastVal &&
+ "Don't support value assigned to multiple locs yet");
+ (void)LastVal;
+ LastVal = VA.getValNo();
+
+ if (VA.isRegLoc()) {
+ EVT RegVT = VA.getLocVT();
+ const TargetRegisterClass *RC;
+ if (RegVT == MVT::i32)
+ RC = &X86::GR32RegClass;
+ else if (Is64Bit && RegVT == MVT::i64)
+ RC = &X86::GR64RegClass;
+ else if (RegVT == MVT::f32)
+ RC = &X86::FR32RegClass;
+ else if (RegVT == MVT::f64)
+ RC = &X86::FR64RegClass;
+ else if (RegVT == MVT::f128)
+ RC = &X86::FR128RegClass;
+ else if (RegVT.is512BitVector())
+ RC = &X86::VR512RegClass;
+ else if (RegVT.is256BitVector())
+ RC = &X86::VR256RegClass;
+ else if (RegVT.is128BitVector())
+ RC = &X86::VR128RegClass;
+ else if (RegVT == MVT::x86mmx)
+ RC = &X86::VR64RegClass;
+ else if (RegVT == MVT::i1)
+ RC = &X86::VK1RegClass;
+ else if (RegVT == MVT::v8i1)
+ RC = &X86::VK8RegClass;
+ else if (RegVT == MVT::v16i1)
+ RC = &X86::VK16RegClass;
+ else if (RegVT == MVT::v32i1)
+ RC = &X86::VK32RegClass;
+ else if (RegVT == MVT::v64i1)
+ RC = &X86::VK64RegClass;
+ else
+ llvm_unreachable("Unknown argument type!");
+
+ unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
+ ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
+
+ // If this is an 8 or 16-bit value, it is really passed promoted to 32
+ // bits. Insert an assert[sz]ext to capture this, then truncate to the
+ // right size.
+ if (VA.getLocInfo() == CCValAssign::SExt)
+ ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
+ DAG.getValueType(VA.getValVT()));
+ else if (VA.getLocInfo() == CCValAssign::ZExt)
+ ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
+ DAG.getValueType(VA.getValVT()));
+ else if (VA.getLocInfo() == CCValAssign::BCvt)
+ ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
+
+ if (VA.isExtInLoc()) {
+ // Handle MMX values passed in XMM regs.
+ if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
+ ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
+ else
+ ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
+ }
+ } else {
+ assert(VA.isMemLoc());
+ ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i);
+ }
+
+ // If value is passed via pointer - do a load.
+ if (VA.getLocInfo() == CCValAssign::Indirect)
+ ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue,
+ MachinePointerInfo(), false, false, false, 0);
+
+ InVals.push_back(ArgValue);
+ }
+
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ // All x86 ABIs require that for returning structs by value we copy the
+ // sret argument into %rax/%eax (depending on ABI) for the return. Save
+ // the argument into a virtual register so that we can access it from the
+ // return points.
+ if (Ins[i].Flags.isSRet()) {
+ unsigned Reg = FuncInfo->getSRetReturnReg();
+ if (!Reg) {
+ MVT PtrTy = getPointerTy(DAG.getDataLayout());
+ Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
+ FuncInfo->setSRetReturnReg(Reg);
+ }
+ SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[i]);
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
+ break;
+ }
+ }
+
+ unsigned StackSize = CCInfo.getNextStackOffset();
+ // Align stack specially for tail calls.
+ if (shouldGuaranteeTCO(CallConv,
+ MF.getTarget().Options.GuaranteedTailCallOpt))
+ StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
+
+ // If the function takes variable number of arguments, make a frame index for
+ // the start of the first vararg value... for expansion of llvm.va_start. We
+ // can skip this if there are no va_start calls.
+ if (MFI->hasVAStart() &&
+ (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
+ CallConv != CallingConv::X86_ThisCall))) {
+ FuncInfo->setVarArgsFrameIndex(
+ MFI->CreateFixedObject(1, StackSize, true));
+ }
+
+ // Figure out if XMM registers are in use.
+ assert(!(Subtarget->useSoftFloat() &&
+ Fn->hasFnAttribute(Attribute::NoImplicitFloat)) &&
+ "SSE register cannot be used when SSE is disabled!");
+
+ // 64-bit calling conventions support varargs and register parameters, so we
+ // have to do extra work to spill them in the prologue.
+ if (Is64Bit && isVarArg && MFI->hasVAStart()) {
+ // Find the first unallocated argument registers.
+ ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
+ ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
+ unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
+ unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
+ assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
+ "SSE register cannot be used when SSE is disabled!");
+
+ // Gather all the live in physical registers.
+ SmallVector<SDValue, 6> LiveGPRs;
+ SmallVector<SDValue, 8> LiveXMMRegs;
+ SDValue ALVal;
+ for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
+ unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
+ LiveGPRs.push_back(
+ DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
+ }
+ if (!ArgXMMs.empty()) {
+ unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
+ ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
+ for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
+ unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
+ LiveXMMRegs.push_back(
+ DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
+ }
+ }
+
+ if (IsWin64) {
+ // Get to the caller-allocated home save location. Add 8 to account
+ // for the return address.
+ int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
+ FuncInfo->setRegSaveFrameIndex(
+ MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
+ // Fixup to set vararg frame on shadow area (4 x i64).
+ if (NumIntRegs < 4)
+ FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
+ } else {
+ // For X86-64, if there are vararg parameters that are passed via
+ // registers, then we must store them to their spots on the stack so
+      // they may be loaded by dereferencing the result of va_next.
+ FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
+ FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
+ FuncInfo->setRegSaveFrameIndex(MFI->CreateStackObject(
+ ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
+ }
+
+ // Store the integer parameter registers.
+ SmallVector<SDValue, 8> MemOps;
+ SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
+ getPointerTy(DAG.getDataLayout()));
+ unsigned Offset = FuncInfo->getVarArgsGPOffset();
+ for (SDValue Val : LiveGPRs) {
+ SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
+ RSFIN, DAG.getIntPtrConstant(Offset, dl));
+ SDValue Store =
+ DAG.getStore(Val.getValue(1), dl, Val, FIN,
+ MachinePointerInfo::getFixedStack(
+ DAG.getMachineFunction(),
+ FuncInfo->getRegSaveFrameIndex(), Offset),
+ false, false, 0);
+ MemOps.push_back(Store);
+ Offset += 8;
+ }
+
+ if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
+ // Now store the XMM (fp + vector) parameter registers.
+ SmallVector<SDValue, 12> SaveXMMOps;
+ SaveXMMOps.push_back(Chain);
+ SaveXMMOps.push_back(ALVal);
+ SaveXMMOps.push_back(DAG.getIntPtrConstant(
+ FuncInfo->getRegSaveFrameIndex(), dl));
+ SaveXMMOps.push_back(DAG.getIntPtrConstant(
+ FuncInfo->getVarArgsFPOffset(), dl));
+ SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
+ LiveXMMRegs.end());
+ MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
+ MVT::Other, SaveXMMOps));
+ }
+
+ if (!MemOps.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
+ }
+
+ if (isVarArg && MFI->hasMustTailInVarArgFunc()) {
+ // Find the largest legal vector type.
+ MVT VecVT = MVT::Other;
+ // FIXME: Only some x86_32 calling conventions support AVX512.
+ if (Subtarget->hasAVX512() &&
+ (Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
+ CallConv == CallingConv::Intel_OCL_BI)))
+ VecVT = MVT::v16f32;
+ else if (Subtarget->hasAVX())
+ VecVT = MVT::v8f32;
+ else if (Subtarget->hasSSE2())
+ VecVT = MVT::v4f32;
+
+ // We forward some GPRs and some vector types.
+ SmallVector<MVT, 2> RegParmTypes;
+ MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
+ RegParmTypes.push_back(IntVT);
+ if (VecVT != MVT::Other)
+ RegParmTypes.push_back(VecVT);
+
+ // Compute the set of forwarded registers. The rest are scratch.
+ SmallVectorImpl<ForwardedRegister> &Forwards =
+ FuncInfo->getForwardedMustTailRegParms();
+ CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
+
+ // Conservatively forward AL on x86_64, since it might be used for varargs.
+ if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
+ unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
+ Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
+ }
+
+ // Copy all forwards from physical to virtual registers.
+ for (ForwardedRegister &F : Forwards) {
+ // FIXME: Can we use a less constrained schedule?
+ SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
+ F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT));
+ Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal);
+ }
+ }
+
+ // Some CCs need callee pop.
+ if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
+ MF.getTarget().Options.GuaranteedTailCallOpt)) {
+ FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
+ } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
+ // X86 interrupts must pop the error code if present
+ FuncInfo->setBytesToPopOnReturn(Is64Bit ? 8 : 4);
+ } else {
+ FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
+ // If this is an sret function, the return should pop the hidden pointer.
+ if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
+ !Subtarget->getTargetTriple().isOSMSVCRT() &&
+ argsAreStructReturn(Ins, Subtarget->isTargetMCU()) == StackStructReturn)
+ FuncInfo->setBytesToPopOnReturn(4);
+ }
+
+ if (!Is64Bit) {
+ // RegSaveFrameIndex is X86-64 only.
+ FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
+ if (CallConv == CallingConv::X86_FastCall ||
+ CallConv == CallingConv::X86_ThisCall)
+ // fastcc functions can't have varargs.
+ FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
+ }
+
+ FuncInfo->setArgumentStackSize(StackSize);
+
+ if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
+ EHPersonality Personality = classifyEHPersonality(Fn->getPersonalityFn());
+ if (Personality == EHPersonality::CoreCLR) {
+ assert(Is64Bit);
+ // TODO: Add a mechanism to frame lowering that will allow us to indicate
+ // that we'd prefer this slot be allocated towards the bottom of the frame
+ // (i.e. near the stack pointer after allocating the frame). Every
+ // funclet needs a copy of this slot in its (mostly empty) frame, and the
+ // offset from the bottom of this and each funclet's frame must be the
+ // same, so the size of funclets' (mostly empty) frames is dictated by
+ // how far this slot is from the bottom (since they allocate just enough
+      // space to accommodate holding this slot at the correct offset).
+ int PSPSymFI = MFI->CreateStackObject(8, 8, /*isSS=*/false);
+ EHInfo->PSPSymFrameIdx = PSPSymFI;
+ }
+ }
+
+ return Chain;
+}
+
+SDValue
+X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
+ SDValue StackPtr, SDValue Arg,
+ SDLoc dl, SelectionDAG &DAG,
+ const CCValAssign &VA,
+ ISD::ArgFlagsTy Flags) const {
+ unsigned LocMemOffset = VA.getLocMemOffset();
+ SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
+ PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
+ StackPtr, PtrOff);
+ if (Flags.isByVal())
+ return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
+
+ return DAG.getStore(
+ Chain, dl, Arg, PtrOff,
+ MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset),
+ false, false, 0);
+}
+
+/// Emit a load of the return address if tail call
+/// optimization is performed and it is required.
+SDValue
+X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
+ SDValue &OutRetAddr, SDValue Chain,
+ bool IsTailCall, bool Is64Bit,
+ int FPDiff, SDLoc dl) const {
+ // Adjust the Return address stack slot.
+ EVT VT = getPointerTy(DAG.getDataLayout());
+ OutRetAddr = getReturnAddressFrameIndex(DAG);
+
+ // Load the "old" Return address.
+ OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(),
+ false, false, false, 0);
+ return SDValue(OutRetAddr.getNode(), 1);
+}
+
+/// Emit a store of the return address if tail call
+/// optimization is performed and it is required (FPDiff!=0).
+static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
+ SDValue Chain, SDValue RetAddrFrIdx,
+ EVT PtrVT, unsigned SlotSize,
+ int FPDiff, SDLoc dl) {
+ // Store the return address to the appropriate stack slot.
+ if (!FPDiff) return Chain;
+ // Calculate the new stack slot for the return address.
+ int NewReturnAddrFI =
+ MF.getFrameInfo()->CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
+ false);
+ SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
+ Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
+ MachinePointerInfo::getFixedStack(
+ DAG.getMachineFunction(), NewReturnAddrFI),
+ false, false, 0);
+ return Chain;
+}
+
+/// Returns a vector_shuffle mask for a movs{s|d} or movd
+/// operation of the specified width.
+static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
+ SDValue V2) {
+ unsigned NumElems = VT.getVectorNumElements();
+ SmallVector<int, 8> Mask;
+ Mask.push_back(NumElems);
+ for (unsigned i = 1; i != NumElems; ++i)
+ Mask.push_back(i);
+ return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
+}
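+// Illustration: for v4f32 the mask built above is <4, 1, 2, 3>, i.e. element
+// 0 of the result comes from V2 and elements 1-3 from V1, which is exactly
+// the semantics of MOVSS.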
+
+SDValue
+X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
+ SmallVectorImpl<SDValue> &InVals) const {
+ SelectionDAG &DAG = CLI.DAG;
+ SDLoc &dl = CLI.DL;
+ SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
+ SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
+ SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
+ SDValue Chain = CLI.Chain;
+ SDValue Callee = CLI.Callee;
+ CallingConv::ID CallConv = CLI.CallConv;
+ bool &isTailCall = CLI.IsTailCall;
+ bool isVarArg = CLI.IsVarArg;
+
+ MachineFunction &MF = DAG.getMachineFunction();
+ bool Is64Bit = Subtarget->is64Bit();
+ bool IsWin64 = Subtarget->isCallingConvWin64(CallConv);
+ StructReturnType SR = callIsStructReturn(Outs, Subtarget->isTargetMCU());
+ bool IsSibcall = false;
+ X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
+ auto Attr = MF.getFunction()->getFnAttribute("disable-tail-calls");
+
+ if (CallConv == CallingConv::X86_INTR)
+ report_fatal_error("X86 interrupts may not be called directly");
+
+ if (Attr.getValueAsString() == "true")
+ isTailCall = false;
+
+ if (Subtarget->isPICStyleGOT() &&
+ !MF.getTarget().Options.GuaranteedTailCallOpt) {
+ // If we are using a GOT, disable tail calls to external symbols with
+ // default visibility. Tail calling such a symbol requires using a GOT
+ // relocation, which forces early binding of the symbol. This breaks code
+    // that requires lazy function symbol resolution. Using musttail or
+ // GuaranteedTailCallOpt will override this.
+ GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
+ if (!G || (!G->getGlobal()->hasLocalLinkage() &&
+ G->getGlobal()->hasDefaultVisibility()))
+ isTailCall = false;
+ }
+
+ bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall();
+ if (IsMustTail) {
+ // Force this to be a tail call. The verifier rules are enough to ensure
+ // that we can lower this successfully without moving the return address
+ // around.
+ isTailCall = true;
+ } else if (isTailCall) {
+ // Check if it's really possible to do a tail call.
+ isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
+ isVarArg, SR != NotStructReturn,
+ MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
+ Outs, OutVals, Ins, DAG);
+
+ // Sibcalls are automatically detected tailcalls which do not require
+ // ABI changes.
+ if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
+ IsSibcall = true;
+
+ if (isTailCall)
+ ++NumTailCalls;
+ }
+
+ assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
+ "Var args not supported with calling convention fastcc, ghc or hipe");
+
+ // Analyze operands of the call, assigning locations to each operand.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
+
+ // Allocate shadow area for Win64
+ if (IsWin64)
+ CCInfo.AllocateStack(32, 8);
+
+ CCInfo.AnalyzeCallOperands(Outs, CC_X86);
+
+ // Get a count of how many bytes are to be pushed on the stack.
+ unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
+ if (IsSibcall)
+    // This is a sibcall. The memory operands are already available in the
+    // caller's own caller's stack, so no new stack space is needed.
+ NumBytes = 0;
+ else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
+ canGuaranteeTCO(CallConv))
+ NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
+
+ int FPDiff = 0;
+ if (isTailCall && !IsSibcall && !IsMustTail) {
+ // Lower arguments at fp - stackoffset + fpdiff.
+ unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
+
+ FPDiff = NumBytesCallerPushed - NumBytes;
+
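+    // For example, if the caller pops 16 bytes on return but this call needs
+    // 32 bytes of arguments, FPDiff is -16 and the return address must move
+    // 16 bytes further down the stack.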
+    // Record the delta by which the return address slot moves, but only if
+    // this delta is more negative than any previously recorded delta.
+ if (FPDiff < X86Info->getTCReturnAddrDelta())
+ X86Info->setTCReturnAddrDelta(FPDiff);
+ }
+
+ unsigned NumBytesToPush = NumBytes;
+ unsigned NumBytesToPop = NumBytes;
+
+ // If we have an inalloca argument, all stack space has already been allocated
+  // for us and is right at the top of the stack. We don't support multiple
+ // arguments passed in memory when using inalloca.
+ if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
+ NumBytesToPush = 0;
+ if (!ArgLocs.back().isMemLoc())
+ report_fatal_error("cannot use inalloca attribute on a register "
+ "parameter");
+ if (ArgLocs.back().getLocMemOffset() != 0)
+ report_fatal_error("any parameter with the inalloca attribute must be "
+ "the only memory argument");
+ }
+
+ if (!IsSibcall)
+ Chain = DAG.getCALLSEQ_START(
+ Chain, DAG.getIntPtrConstant(NumBytesToPush, dl, true), dl);
+
+ SDValue RetAddrFrIdx;
+ // Load return address for tail calls.
+ if (isTailCall && FPDiff)
+ Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
+ Is64Bit, FPDiff, dl);
+
+ SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+ SmallVector<SDValue, 8> MemOpChains;
+ SDValue StackPtr;
+
+ // Walk the register/memloc assignments, inserting copies/loads. In the case
+  // of tail call optimization, arguments are handled later.
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ // Skip inalloca arguments, they have already been written.
+ ISD::ArgFlagsTy Flags = Outs[i].Flags;
+ if (Flags.isInAlloca())
+ continue;
+
+ CCValAssign &VA = ArgLocs[i];
+ EVT RegVT = VA.getLocVT();
+ SDValue Arg = OutVals[i];
+ bool isByVal = Flags.isByVal();
+
+ // Promote the value if needed.
+ switch (VA.getLocInfo()) {
+ default: llvm_unreachable("Unknown loc info!");
+ case CCValAssign::Full: break;
+ case CCValAssign::SExt:
+ Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
+ break;
+ case CCValAssign::ZExt:
+ Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
+ break;
+ case CCValAssign::AExt:
+ if (Arg.getValueType().isVector() &&
+ Arg.getValueType().getVectorElementType() == MVT::i1)
+ Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
+ else if (RegVT.is128BitVector()) {
+ // Special case: passing MMX values in XMM registers.
+ Arg = DAG.getBitcast(MVT::i64, Arg);
+ Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
+ Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
+ } else
+ Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
+ break;
+ case CCValAssign::BCvt:
+ Arg = DAG.getBitcast(RegVT, Arg);
+ break;
+ case CCValAssign::Indirect: {
+ // Store the argument.
+ SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
+ int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
+ Chain = DAG.getStore(
+ Chain, dl, Arg, SpillSlot,
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
+ false, false, 0);
+ Arg = SpillSlot;
+ break;
+ }
+ }
+
+ if (VA.isRegLoc()) {
+ RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+ if (isVarArg && IsWin64) {
+        // The Win64 ABI requires that an argument passed in an XMM register
+        // also be copied to the corresponding integer shadow register if the
+        // callee is a varargs function.
+ unsigned ShadowReg = 0;
+ switch (VA.getLocReg()) {
+ case X86::XMM0: ShadowReg = X86::RCX; break;
+ case X86::XMM1: ShadowReg = X86::RDX; break;
+ case X86::XMM2: ShadowReg = X86::R8; break;
+ case X86::XMM3: ShadowReg = X86::R9; break;
+ }
+ if (ShadowReg)
+ RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
+ }
+ } else if (!IsSibcall && (!isTailCall || isByVal)) {
+ assert(VA.isMemLoc());
+ if (!StackPtr.getNode())
+ StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
+ getPointerTy(DAG.getDataLayout()));
+ MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
+ dl, DAG, VA, Flags));
+ }
+ }
+
+ if (!MemOpChains.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
+
+ if (Subtarget->isPICStyleGOT()) {
+    // ELF / PIC requires the GOT pointer to be live in EBX before any
+    // function call made through the PLT.
+ if (!isTailCall) {
+ RegsToPass.push_back(std::make_pair(
+ unsigned(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
+ getPointerTy(DAG.getDataLayout()))));
+ } else {
+      // If we are tail calling and generating PIC/GOT style code, load the
+ // address of the callee into ECX. The value in ecx is used as target of
+ // the tail jump. This is done to circumvent the ebx/callee-saved problem
+ // for tail calls on PIC/GOT architectures. Normally we would just put the
+ // address of GOT into ebx and then call target@PLT. But for tail calls
+ // ebx would be restored (since ebx is callee saved) before jumping to the
+ // target@PLT.
+
+ // Note: The actual moving to ECX is done further down.
+ GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
+ if (G && !G->getGlobal()->hasLocalLinkage() &&
+ G->getGlobal()->hasDefaultVisibility())
+ Callee = LowerGlobalAddress(Callee, DAG);
+ else if (isa<ExternalSymbolSDNode>(Callee))
+ Callee = LowerExternalSymbol(Callee, DAG);
+ }
+ }
+
+ if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
+ // From AMD64 ABI document:
+ // For calls that may call functions that use varargs or stdargs
+ // (prototype-less calls or calls to functions containing ellipsis (...) in
+ // the declaration) %al is used as hidden argument to specify the number
+ // of SSE registers used. The contents of %al do not need to match exactly
+    // the number of registers, but must be an upper bound on the number of SSE
+ // registers used and is in the range 0 - 8 inclusive.
+
+ // Count the number of XMM registers allocated.
+ static const MCPhysReg XMMArgRegs[] = {
+ X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
+ X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
+ };
+ unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
+ assert((Subtarget->hasSSE1() || !NumXMMRegs)
+ && "SSE registers cannot be used when SSE is disabled");
+
+ RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
+ DAG.getConstant(NumXMMRegs, dl,
+ MVT::i8)));
+ }
+
+ if (isVarArg && IsMustTail) {
+ const auto &Forwards = X86Info->getForwardedMustTailRegParms();
+ for (const auto &F : Forwards) {
+ SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
+ RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
+ }
+ }
+
+ // For tail calls lower the arguments to the 'real' stack slots. Sibcalls
+ // don't need this because the eligibility check rejects calls that require
+ // shuffling arguments passed in memory.
+ if (!IsSibcall && isTailCall) {
+ // Force all the incoming stack arguments to be loaded from the stack
+ // before any new outgoing arguments are stored to the stack, because the
+ // outgoing stack slots may alias the incoming argument stack slots, and
+ // the alias isn't otherwise explicit. This is slightly more conservative
+ // than necessary, because it means that each store effectively depends
+ // on every argument instead of just those arguments it would clobber.
+ SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
+
+ SmallVector<SDValue, 8> MemOpChains2;
+ SDValue FIN;
+ int FI = 0;
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+ if (VA.isRegLoc())
+ continue;
+ assert(VA.isMemLoc());
+ SDValue Arg = OutVals[i];
+ ISD::ArgFlagsTy Flags = Outs[i].Flags;
+ // Skip inalloca arguments. They don't require any work.
+ if (Flags.isInAlloca())
+ continue;
+ // Create frame index.
+ int32_t Offset = VA.getLocMemOffset()+FPDiff;
+ uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
+ FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
+ FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
+
+ if (Flags.isByVal()) {
+ // Copy relative to framepointer.
+ SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
+ if (!StackPtr.getNode())
+ StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
+ getPointerTy(DAG.getDataLayout()));
+ Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
+ StackPtr, Source);
+
+ MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
+ ArgChain,
+ Flags, DAG, dl));
+ } else {
+ // Store relative to framepointer.
+ MemOpChains2.push_back(DAG.getStore(
+ ArgChain, dl, Arg, FIN,
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
+ false, false, 0));
+ }
+ }
+
+ if (!MemOpChains2.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
+
+ // Store the return address to the appropriate stack slot.
+ Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
+ getPointerTy(DAG.getDataLayout()),
+ RegInfo->getSlotSize(), FPDiff, dl);
+ }
+
+ // Build a sequence of copy-to-reg nodes chained together with token chain
+ // and flag operands which copy the outgoing args into registers.
+ SDValue InFlag;
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+ Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+ RegsToPass[i].second, InFlag);
+ InFlag = Chain.getValue(1);
+ }
+
+ if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
+ assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
+ // In the 64-bit large code model, we have to make all calls
+ // through a register, since the call instruction's 32-bit
+ // pc-relative offset may not be large enough to hold the whole
+ // address.
+ } else if (Callee->getOpcode() == ISD::GlobalAddress) {
+ // If the callee is a GlobalAddress node (quite common, every direct call
+ // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
+ // it.
+ GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee);
+
+ // We should use extra load for direct calls to dllimported functions in
+ // non-JIT mode.
+ const GlobalValue *GV = G->getGlobal();
+ if (!GV->hasDLLImportStorageClass()) {
+ unsigned char OpFlags = 0;
+ bool ExtraLoad = false;
+ unsigned WrapperKind = ISD::DELETED_NODE;
+
+ // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
+      // external symbols must go through the PLT in PIC mode. If the symbol
+ // has hidden or protected visibility, or if it is static or local, then
+ // we don't need to use the PLT - we can directly call it.
+ if (Subtarget->isTargetELF() &&
+ DAG.getTarget().getRelocationModel() == Reloc::PIC_ &&
+ GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
+ OpFlags = X86II::MO_PLT;
+ } else if (Subtarget->isPICStyleStubAny() &&
+ !GV->isStrongDefinitionForLinker() &&
+ (!Subtarget->getTargetTriple().isMacOSX() ||
+ Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
+ // PC-relative references to external symbols should go through $stub,
+ // unless we're building with the leopard linker or later, which
+ // automatically synthesizes these stubs.
+ OpFlags = X86II::MO_DARWIN_STUB;
+ } else if (Subtarget->isPICStyleRIPRel() && isa<Function>(GV) &&
+ cast<Function>(GV)->hasFnAttribute(Attribute::NonLazyBind)) {
+ // If the function is marked as non-lazy, generate an indirect call
+ // which loads from the GOT directly. This avoids runtime overhead
+ // at the cost of eager binding (and one extra byte of encoding).
+ OpFlags = X86II::MO_GOTPCREL;
+ WrapperKind = X86ISD::WrapperRIP;
+ ExtraLoad = true;
+ }
+
+ Callee = DAG.getTargetGlobalAddress(
+ GV, dl, getPointerTy(DAG.getDataLayout()), G->getOffset(), OpFlags);
+
+ // Add a wrapper if needed.
+ if (WrapperKind != ISD::DELETED_NODE)
+ Callee = DAG.getNode(X86ISD::WrapperRIP, dl,
+ getPointerTy(DAG.getDataLayout()), Callee);
+ // Add extra indirection if needed.
+ if (ExtraLoad)
+ Callee = DAG.getLoad(
+ getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee,
+ MachinePointerInfo::getGOT(DAG.getMachineFunction()), false, false,
+ false, 0);
+ }
+ } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
+ unsigned char OpFlags = 0;
+
+ // On ELF targets, in either X86-64 or X86-32 mode, direct calls to
+ // external symbols should go through the PLT.
+ if (Subtarget->isTargetELF() &&
+ DAG.getTarget().getRelocationModel() == Reloc::PIC_) {
+ OpFlags = X86II::MO_PLT;
+ } else if (Subtarget->isPICStyleStubAny() &&
+ (!Subtarget->getTargetTriple().isMacOSX() ||
+ Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
+ // PC-relative references to external symbols should go through $stub,
+ // unless we're building with the leopard linker or later, which
+ // automatically synthesizes these stubs.
+ OpFlags = X86II::MO_DARWIN_STUB;
+ }
+
+ Callee = DAG.getTargetExternalSymbol(
+ S->getSymbol(), getPointerTy(DAG.getDataLayout()), OpFlags);
+ } else if (Subtarget->isTarget64BitILP32() &&
+ Callee->getValueType(0) == MVT::i32) {
+    // Zero-extend the 32-bit Callee address to a 64-bit one, per the x32 ABI.
+ Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
+ }
+
+ // Returns a chain & a flag for retval copy to use.
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+ SmallVector<SDValue, 8> Ops;
+
+ if (!IsSibcall && isTailCall) {
+ Chain = DAG.getCALLSEQ_END(Chain,
+ DAG.getIntPtrConstant(NumBytesToPop, dl, true),
+ DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
+ InFlag = Chain.getValue(1);
+ }
+
+ Ops.push_back(Chain);
+ Ops.push_back(Callee);
+
+ if (isTailCall)
+ Ops.push_back(DAG.getConstant(FPDiff, dl, MVT::i32));
+
+ // Add argument registers to the end of the list so that they are known live
+ // into the call.
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
+ Ops.push_back(DAG.getRegister(RegsToPass[i].first,
+ RegsToPass[i].second.getValueType()));
+
+ // Add a register mask operand representing the call-preserved registers.
+ const uint32_t *Mask = RegInfo->getCallPreservedMask(MF, CallConv);
+ assert(Mask && "Missing call preserved mask for calling convention");
+
+ // If this is an invoke in a 32-bit function using a funclet-based
+ // personality, assume the function clobbers all registers. If an exception
+ // is thrown, the runtime will not restore CSRs.
+ // FIXME: Model this more precisely so that we can register allocate across
+ // the normal edge and spill and fill across the exceptional edge.
+ if (!Is64Bit && CLI.CS && CLI.CS->isInvoke()) {
+ const Function *CallerFn = MF.getFunction();
+ EHPersonality Pers =
+ CallerFn->hasPersonalityFn()
+ ? classifyEHPersonality(CallerFn->getPersonalityFn())
+ : EHPersonality::Unknown;
+ if (isFuncletEHPersonality(Pers))
+ Mask = RegInfo->getNoPreservedMask();
+ }
+
+ Ops.push_back(DAG.getRegisterMask(Mask));
+
+ if (InFlag.getNode())
+ Ops.push_back(InFlag);
+
+ if (isTailCall) {
+ // We used to do:
+ //// If this is the first return lowered for this function, add the regs
+ //// to the liveout set for the function.
+ // This isn't right, although it's probably harmless on x86; liveouts
+ // should be computed from returns not tail calls. Consider a void
+ // function making a tail call to a function returning int.
+ MF.getFrameInfo()->setHasTailCall();
+ return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
+ }
+
+ Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
+ InFlag = Chain.getValue(1);
+
+ // Create the CALLSEQ_END node.
+ unsigned NumBytesForCalleeToPop;
+ if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
+ DAG.getTarget().Options.GuaranteedTailCallOpt))
+ NumBytesForCalleeToPop = NumBytes; // Callee pops everything
+ else if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
+ !Subtarget->getTargetTriple().isOSMSVCRT() &&
+ SR == StackStructReturn)
+ // If this is a call to a struct-return function, the callee
+ // pops the hidden struct pointer, so we have to push it back.
+ // This is common for Darwin/X86, Linux & Mingw32 targets.
+ // For MSVC Win32 targets, the caller pops the hidden struct pointer.
+ NumBytesForCalleeToPop = 4;
+ else
+ NumBytesForCalleeToPop = 0; // Callee pops nothing.
+
+ // Returns a flag for retval copy to use.
+ if (!IsSibcall) {
+ Chain = DAG.getCALLSEQ_END(Chain,
+ DAG.getIntPtrConstant(NumBytesToPop, dl, true),
+ DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl,
+ true),
+ InFlag, dl);
+ InFlag = Chain.getValue(1);
+ }
+
+ // Handle result values, copying them out of physregs into vregs that we
+ // return.
+ return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
+ Ins, dl, DAG, InVals);
+}
+
+//===----------------------------------------------------------------------===//
+// Fast Calling Convention (tail call) implementation
+//===----------------------------------------------------------------------===//
+
+// Like stdcall, the callee cleans up the arguments, except that ECX is
+// reserved for storing the address of the tail-called function. Only 2
+// registers are free for argument passing (inreg). Tail call optimization is
+// performed provided:
+// * tailcallopt is enabled
+// * caller/callee are fastcc
+// On the X86_64 architecture, with GOT-style position-independent code, only
+// local (within-module) calls are supported at the moment.
+// To keep the stack aligned according to the platform ABI, the function
+// GetAlignedArgumentStackSize ensures that the argument delta is always a
+// multiple of the stack alignment. (Dynamic linkers need this - darwin's dyld
+// for example.)
+// If the tail-called function (the callee) has more arguments than the caller,
+// the caller needs to make sure that there is room to move the RETADDR to.
+// This is achieved by reserving an area the size of the argument delta right
+// after the original RETADDR, but before the saved framepointer or the spilled
+// registers, e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4)
+// stack layout:
+// arg1
+// arg2
+// RETADDR
+// [ new RETADDR
+// move area ]
+// (possible EBP)
+// ESI
+// EDI
+// local1 ..
+
+/// Align the stack size so that, together with the return-address slot, it
+/// meets the alignment requirement; e.g. make it 16n + 12 for a 16-byte
+/// alignment requirement with 4-byte slots.
+unsigned
+X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
+ SelectionDAG& DAG) const {
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
+ const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
+ unsigned StackAlignment = TFI.getStackAlignment();
+ uint64_t AlignMask = StackAlignment - 1;
+ int64_t Offset = StackSize;
+ unsigned SlotSize = RegInfo->getSlotSize();
+  if ((Offset & AlignMask) <= (StackAlignment - SlotSize)) {
+    // The misaligned remainder is at most StackAlignment - SlotSize (e.g. 12),
+    // so just add the difference.
+    Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
+  } else {
+    // Mask out the lower bits and add the stack alignment once, plus
+    // StackAlignment - SlotSize (e.g. 12) bytes.
+    Offset = ((~AlignMask) & Offset) + StackAlignment +
+             (StackAlignment - SlotSize);
+ }
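+  // Either way the result is congruent to StackAlignment - SlotSize modulo
+  // StackAlignment; e.g. with 16-byte alignment and 4-byte slots, a StackSize
+  // of 8 becomes 12 and a StackSize of 13 becomes 28 (both of the form 16n+12).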
+ return Offset;
+}
+
+/// Return true if the given stack call argument is already available in the
+/// same (relative) position of the caller's incoming argument stack.
+static
+bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
+ MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
+ const X86InstrInfo *TII) {
+ unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
+ int FI = INT_MAX;
+ if (Arg.getOpcode() == ISD::CopyFromReg) {
+ unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(VR))
+ return false;
+ MachineInstr *Def = MRI->getVRegDef(VR);
+ if (!Def)
+ return false;
+ if (!Flags.isByVal()) {
+ if (!TII->isLoadFromStackSlot(Def, FI))
+ return false;
+ } else {
+ unsigned Opcode = Def->getOpcode();
+ if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
+ Opcode == X86::LEA64_32r) &&
+ Def->getOperand(1).isFI()) {
+ FI = Def->getOperand(1).getIndex();
+ Bytes = Flags.getByValSize();
+ } else
+ return false;
+ }
+ } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
+ if (Flags.isByVal())
+ // ByVal argument is passed in as a pointer but it's now being
+ // dereferenced. e.g.
+ // define @foo(%struct.X* %A) {
+ // tail call @bar(%struct.X* byval %A)
+ // }
+ return false;
+ SDValue Ptr = Ld->getBasePtr();
+ FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
+ if (!FINode)
+ return false;
+ FI = FINode->getIndex();
+ } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
+ FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
+ FI = FINode->getIndex();
+ Bytes = Flags.getByValSize();
+ } else
+ return false;
+
+ assert(FI != INT_MAX);
+ if (!MFI->isFixedObjectIndex(FI))
+ return false;
+ return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI);
+}
+
+/// Check whether the call is eligible for tail call optimization. Targets
+/// that want to do tail call optimization should implement this function.
+bool X86TargetLowering::IsEligibleForTailCallOptimization(
+ SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
+ bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
+ if (!mayTailCallThisCC(CalleeCC))
+ return false;
+
+ // If -tailcallopt is specified, make fastcc functions tail-callable.
+ MachineFunction &MF = DAG.getMachineFunction();
+ const Function *CallerF = MF.getFunction();
+
+ // If the function return type is x86_fp80 and the callee return type is not,
+ // then the FP_EXTEND of the call result is not a nop. It's not safe to
+ // perform a tailcall optimization here.
+ if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
+ return false;
+
+ CallingConv::ID CallerCC = CallerF->getCallingConv();
+ bool CCMatch = CallerCC == CalleeCC;
+ bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC);
+ bool IsCallerWin64 = Subtarget->isCallingConvWin64(CallerCC);
+
+ // Win64 functions have extra shadow space for argument homing. Don't do the
+ // sibcall if the caller and callee have mismatched expectations for this
+ // space.
+ if (IsCalleeWin64 != IsCallerWin64)
+ return false;
+
+ if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
+ if (canGuaranteeTCO(CalleeCC) && CCMatch)
+ return true;
+ return false;
+ }
+
+ // Look for obvious safe cases to perform tail call optimization that do not
+ // require ABI changes. This is what gcc calls sibcall.
+
+ // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
+ // emit a special epilogue.
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
+ if (RegInfo->needsStackRealignment(MF))
+ return false;
+
+ // Also avoid sibcall optimization if either caller or callee uses struct
+ // return semantics.
+ if (isCalleeStructRet || isCallerStructRet)
+ return false;
+
+ // Do not sibcall optimize vararg calls unless all arguments are passed via
+ // registers.
+ if (isVarArg && !Outs.empty()) {
+ // Optimizing for varargs on Win64 is unlikely to be safe without
+ // additional testing.
+ if (IsCalleeWin64 || IsCallerWin64)
+ return false;
+
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
+ *DAG.getContext());
+
+ CCInfo.AnalyzeCallOperands(Outs, CC_X86);
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
+ if (!ArgLocs[i].isRegLoc())
+ return false;
+ }
+
+ // If the call result is in ST0 / ST1, it needs to be popped off the x87
+ // stack. Therefore, if it's not used by the call it is not safe to optimize
+ // this into a sibcall.
+ bool Unused = false;
+ for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
+ if (!Ins[i].Used) {
+ Unused = true;
+ break;
+ }
+ }
+ if (Unused) {
+ SmallVector<CCValAssign, 16> RVLocs;
+ CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(), RVLocs,
+ *DAG.getContext());
+ CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
+ for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
+ CCValAssign &VA = RVLocs[i];
+ if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
+ return false;
+ }
+ }
+
+ // If the calling conventions do not match, then we'd better make sure the
+ // results are returned in the same way as what the caller expects.
+ if (!CCMatch) {
+ SmallVector<CCValAssign, 16> RVLocs1;
+ CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), RVLocs1,
+ *DAG.getContext());
+ CCInfo1.AnalyzeCallResult(Ins, RetCC_X86);
+
+ SmallVector<CCValAssign, 16> RVLocs2;
+ CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), RVLocs2,
+ *DAG.getContext());
+ CCInfo2.AnalyzeCallResult(Ins, RetCC_X86);
+
+ if (RVLocs1.size() != RVLocs2.size())
+ return false;
+ for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
+ if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
+ return false;
+ if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
+ return false;
+ if (RVLocs1[i].isRegLoc()) {
+ if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
+ return false;
+ } else {
+ if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
+ return false;
+ }
+ }
+ }
+
+ unsigned StackArgsSize = 0;
+
+ // If the callee takes no arguments then go on to check the results of the
+ // call.
+ if (!Outs.empty()) {
+ // Check if stack adjustment is needed. For now, do not do this if any
+ // argument is passed on the stack.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
+ *DAG.getContext());
+
+ // Allocate shadow area for Win64
+ if (IsCalleeWin64)
+ CCInfo.AllocateStack(32, 8);
+
+ CCInfo.AnalyzeCallOperands(Outs, CC_X86);
+ StackArgsSize = CCInfo.getNextStackOffset();
+
+ if (CCInfo.getNextStackOffset()) {
+ // Check if the arguments are already laid out in the right way as
+ // the caller's fixed stack objects.
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ const MachineRegisterInfo *MRI = &MF.getRegInfo();
+ const X86InstrInfo *TII = Subtarget->getInstrInfo();
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+ SDValue Arg = OutVals[i];
+ ISD::ArgFlagsTy Flags = Outs[i].Flags;
+ if (VA.getLocInfo() == CCValAssign::Indirect)
+ return false;
+ if (!VA.isRegLoc()) {
+ if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
+ MFI, MRI, TII))
+ return false;
+ }
+ }
+ }
+
+ // If the tailcall address may be in a register, then make sure it's
+ // possible to register allocate for it. In 32-bit, the call address can
+ // only target EAX, EDX, or ECX since the tail call must be scheduled after
+ // callee-saved registers are restored. These happen to be the same
+ // registers used to pass 'inreg' arguments so watch out for those.
+ if (!Subtarget->is64Bit() &&
+ ((!isa<GlobalAddressSDNode>(Callee) &&
+ !isa<ExternalSymbolSDNode>(Callee)) ||
+ DAG.getTarget().getRelocationModel() == Reloc::PIC_)) {
+ unsigned NumInRegs = 0;
+ // In PIC we need an extra register to formulate the address computation
+ // for the callee.
+ unsigned MaxInRegs =
+ (DAG.getTarget().getRelocationModel() == Reloc::PIC_) ? 2 : 3;
+
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+ if (!VA.isRegLoc())
+ continue;
+ unsigned Reg = VA.getLocReg();
+ switch (Reg) {
+ default: break;
+ case X86::EAX: case X86::EDX: case X86::ECX:
+ if (++NumInRegs == MaxInRegs)
+ return false;
+ break;
+ }
+ }
+ }
+ }
+
+ bool CalleeWillPop =
+ X86::isCalleePop(CalleeCC, Subtarget->is64Bit(), isVarArg,
+ MF.getTarget().Options.GuaranteedTailCallOpt);
+
+ if (unsigned BytesToPop =
+ MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
+ // If we have bytes to pop, the callee must pop them.
+ bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
+ if (!CalleePopMatches)
+ return false;
+ } else if (CalleeWillPop && StackArgsSize > 0) {
+ // If we don't have bytes to pop, make sure the callee doesn't pop any.
+ return false;
+ }
+
+ return true;
+}
+
+FastISel *
+X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
+ const TargetLibraryInfo *libInfo) const {
+ return X86::createFastISel(funcInfo, libInfo);
+}
+
+//===----------------------------------------------------------------------===//
+// Other Lowering Hooks
+//===----------------------------------------------------------------------===//
+
+static bool MayFoldLoad(SDValue Op) {
+ return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
+}
+
+static bool MayFoldIntoStore(SDValue Op) {
+ return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
+}
+
+static bool isTargetShuffle(unsigned Opcode) {
+ switch(Opcode) {
+ default: return false;
+ case X86ISD::BLENDI:
+ case X86ISD::PSHUFB:
+ case X86ISD::PSHUFD:
+ case X86ISD::PSHUFHW:
+ case X86ISD::PSHUFLW:
+ case X86ISD::SHUFP:
+ case X86ISD::PALIGNR:
+ case X86ISD::MOVLHPS:
+ case X86ISD::MOVLHPD:
+ case X86ISD::MOVHLPS:
+ case X86ISD::MOVLPS:
+ case X86ISD::MOVLPD:
+ case X86ISD::MOVSHDUP:
+ case X86ISD::MOVSLDUP:
+ case X86ISD::MOVDDUP:
+ case X86ISD::MOVSS:
+ case X86ISD::MOVSD:
+ case X86ISD::UNPCKL:
+ case X86ISD::UNPCKH:
+ case X86ISD::VPERMILPI:
+ case X86ISD::VPERM2X128:
+ case X86ISD::VPERMI:
+ case X86ISD::VPERMV:
+ case X86ISD::VPERMV3:
+ return true;
+ }
+}
+
+static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, MVT VT,
+ SDValue V1, unsigned TargetMask,
+ SelectionDAG &DAG) {
+ switch(Opc) {
+ default: llvm_unreachable("Unknown x86 shuffle node");
+ case X86ISD::PSHUFD:
+ case X86ISD::PSHUFHW:
+ case X86ISD::PSHUFLW:
+ case X86ISD::VPERMILPI:
+ case X86ISD::VPERMI:
+ return DAG.getNode(Opc, dl, VT, V1,
+ DAG.getConstant(TargetMask, dl, MVT::i8));
+ }
+}
+
+static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, MVT VT,
+ SDValue V1, SDValue V2, SelectionDAG &DAG) {
+ switch(Opc) {
+ default: llvm_unreachable("Unknown x86 shuffle node");
+ case X86ISD::MOVLHPS:
+ case X86ISD::MOVLHPD:
+ case X86ISD::MOVHLPS:
+ case X86ISD::MOVLPS:
+ case X86ISD::MOVLPD:
+ case X86ISD::MOVSS:
+ case X86ISD::MOVSD:
+ case X86ISD::UNPCKL:
+ case X86ISD::UNPCKH:
+ return DAG.getNode(Opc, dl, VT, V1, V2);
+ }
+}
+
+SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
+ X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+ int ReturnAddrIndex = FuncInfo->getRAIndex();
+
+ if (ReturnAddrIndex == 0) {
+ // Set up a frame object for the return address.
+ unsigned SlotSize = RegInfo->getSlotSize();
+ ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize,
+ -(int64_t)SlotSize,
+ false);
+ FuncInfo->setRAIndex(ReturnAddrIndex);
+ }
+
+ return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
+}
+
+bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
+ bool hasSymbolicDisplacement) {
+  // The offset should fit into a 32-bit immediate field.
+ if (!isInt<32>(Offset))
+ return false;
+
+ // If we don't have a symbolic displacement - we don't have any extra
+ // restrictions.
+ if (!hasSymbolicDisplacement)
+ return true;
+
+ // FIXME: Some tweaks might be needed for medium code model.
+ if (M != CodeModel::Small && M != CodeModel::Kernel)
+ return false;
+
+  // For the small code model we assume that the highest object ends 16MB
+  // before the 31-bit boundary. We may also accept pretty large negative
+  // constants knowing that all objects are in the positive half of the
+  // address space.
+ if (M == CodeModel::Small && Offset < 16*1024*1024)
+ return true;
+
+  // For the kernel code model we know that all objects reside in the negative
+  // half of the 32-bit address space. We must not accept negative offsets,
+  // since they may push an address out of that range, but we may accept pretty
+  // large positive ones.
+ if (M == CodeModel::Kernel && Offset >= 0)
+ return true;
+
+ return false;
+}
+
+/// Determines whether the callee is required to pop its own arguments.
+/// Callee pop is necessary to support tail calls.
+bool X86::isCalleePop(CallingConv::ID CallingConv,
+ bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
+ // If GuaranteeTCO is true, we force some calls to be callee pop so that we
+ // can guarantee TCO.
+ if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
+ return true;
+
+ switch (CallingConv) {
+ default:
+ return false;
+ case CallingConv::X86_StdCall:
+ case CallingConv::X86_FastCall:
+ case CallingConv::X86_ThisCall:
+ case CallingConv::X86_VectorCall:
+ return !is64Bit;
+ }
+}
+
+/// \brief Return true if the condition is an unsigned comparison operation.
+static bool isX86CCUnsigned(unsigned X86CC) {
+ switch (X86CC) {
+ default: llvm_unreachable("Invalid integer condition!");
+ case X86::COND_E: return true;
+ case X86::COND_G: return false;
+ case X86::COND_GE: return false;
+ case X86::COND_L: return false;
+ case X86::COND_LE: return false;
+ case X86::COND_NE: return true;
+ case X86::COND_B: return true;
+ case X86::COND_A: return true;
+ case X86::COND_BE: return true;
+ case X86::COND_AE: return true;
+ }
+}
+
+static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
+ switch (SetCCOpcode) {
+ default: llvm_unreachable("Invalid integer condition!");
+ case ISD::SETEQ: return X86::COND_E;
+ case ISD::SETGT: return X86::COND_G;
+ case ISD::SETGE: return X86::COND_GE;
+ case ISD::SETLT: return X86::COND_L;
+ case ISD::SETLE: return X86::COND_LE;
+ case ISD::SETNE: return X86::COND_NE;
+ case ISD::SETULT: return X86::COND_B;
+ case ISD::SETUGT: return X86::COND_A;
+ case ISD::SETULE: return X86::COND_BE;
+ case ISD::SETUGE: return X86::COND_AE;
+ }
+}
+
+/// Do a one-to-one translation of an ISD::CondCode to the X86-specific
+/// condition code, returning the condition code and the LHS/RHS of the
+/// comparison to make.
+static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, SDLoc DL, bool isFP,
+ SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
+ if (!isFP) {
+ if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
+ if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
+ // X > -1 -> X == 0, jump !sign.
+ RHS = DAG.getConstant(0, DL, RHS.getValueType());
+ return X86::COND_NS;
+ }
+ if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
+ // X < 0 -> X == 0, jump on sign.
+ return X86::COND_S;
+ }
+ if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
+ // X < 1 -> X <= 0
+ RHS = DAG.getConstant(0, DL, RHS.getValueType());
+ return X86::COND_LE;
+ }
+ }
+
+ return TranslateIntegerX86CC(SetCCOpcode);
+ }
+
+ // First determine if it is required or is profitable to flip the operands.
+
+ // If LHS is a foldable load, but RHS is not, flip the condition.
+ if (ISD::isNON_EXTLoad(LHS.getNode()) &&
+ !ISD::isNON_EXTLoad(RHS.getNode())) {
+ SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
+ std::swap(LHS, RHS);
+ }
+
+ switch (SetCCOpcode) {
+ default: break;
+ case ISD::SETOLT:
+ case ISD::SETOLE:
+ case ISD::SETUGT:
+ case ISD::SETUGE:
+ std::swap(LHS, RHS);
+ break;
+ }
+
+ // On a floating point condition, the flags are set as follows:
+ // ZF PF CF op
+ // 0 | 0 | 0 | X > Y
+ // 0 | 0 | 1 | X < Y
+ // 1 | 0 | 0 | X == Y
+ // 1 | 1 | 1 | unordered
+ switch (SetCCOpcode) {
+ default: llvm_unreachable("Condcode should be pre-legalized away");
+ case ISD::SETUEQ:
+ case ISD::SETEQ: return X86::COND_E;
+ case ISD::SETOLT: // flipped
+ case ISD::SETOGT:
+ case ISD::SETGT: return X86::COND_A;
+ case ISD::SETOLE: // flipped
+ case ISD::SETOGE:
+ case ISD::SETGE: return X86::COND_AE;
+ case ISD::SETUGT: // flipped
+ case ISD::SETULT:
+ case ISD::SETLT: return X86::COND_B;
+ case ISD::SETUGE: // flipped
+ case ISD::SETULE:
+ case ISD::SETLE: return X86::COND_BE;
+ case ISD::SETONE:
+ case ISD::SETNE: return X86::COND_NE;
+ case ISD::SETUO: return X86::COND_P;
+ case ISD::SETO: return X86::COND_NP;
+ case ISD::SETOEQ:
+ case ISD::SETUNE: return X86::COND_INVALID;
+ }
+}
+
+/// Is there a floating point cmov for the specific X86 condition code?
+/// Current x86 isa includes the following FP cmov instructions:
+/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
+static bool hasFPCMov(unsigned X86CC) {
+ switch (X86CC) {
+ default:
+ return false;
+ case X86::COND_B:
+ case X86::COND_BE:
+ case X86::COND_E:
+ case X86::COND_P:
+ case X86::COND_A:
+ case X86::COND_AE:
+ case X86::COND_NE:
+ case X86::COND_NP:
+ return true;
+ }
+}
+
+/// Returns true if the target can instruction select the
+/// specified FP immediate natively. If false, the legalizer will
+/// materialize the FP immediate as a load from a constant pool.
+bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
+ for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
+ if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
+ return true;
+ }
+ return false;
+}
+
+bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
+ ISD::LoadExtType ExtTy,
+ EVT NewVT) const {
+ // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
+ // relocation target a movq or addq instruction: don't let the load shrink.
+ SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
+ if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
+ if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
+ return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
+ return true;
+}
+
+/// \brief Returns true if it is beneficial to convert a load of a constant
+/// to just the constant itself.
+bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
+ Type *Ty) const {
+ assert(Ty->isIntegerTy());
+
+ unsigned BitSize = Ty->getPrimitiveSizeInBits();
+ if (BitSize == 0 || BitSize > 64)
+ return false;
+ return true;
+}
+
+bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT,
+ unsigned Index) const {
+ if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
+ return false;
+
+ return (Index == 0 || Index == ResVT.getVectorNumElements());
+}
+
+bool X86TargetLowering::isCheapToSpeculateCttz() const {
+ // Speculate cttz only if we can directly use TZCNT.
+ return Subtarget->hasBMI();
+}
+
+bool X86TargetLowering::isCheapToSpeculateCtlz() const {
+ // Speculate ctlz only if we can directly use LZCNT.
+ return Subtarget->hasLZCNT();
+}
+
+/// Return true if every element in Mask, beginning
+/// from position Pos and ending in Pos+Size is undef.
+static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
+ for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
+ if (0 <= Mask[i])
+ return false;
+ return true;
+}
+
+/// Return true if Val is undef or if its value falls within the
+/// specified range [Low, Hi).
+static bool isUndefOrInRange(int Val, int Low, int Hi) {
+ return (Val < 0) || (Val >= Low && Val < Hi);
+}
+
+/// Val is either less than zero (undef) or equal to the specified value.
+static bool isUndefOrEqual(int Val, int CmpVal) {
+ return (Val < 0 || Val == CmpVal);
+}
+
+/// Return true if every element in Mask, beginning
+/// from position Pos and ending in Pos+Size, falls within the specified
+/// sequential range [Low, Low+Size), or is undef.
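+/// For example, Mask <4, -1, 6, 7> matches Pos == 0, Size == 4, Low == 4.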
+static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
+ unsigned Pos, unsigned Size, int Low) {
+ for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
+ if (!isUndefOrEqual(Mask[i], Low))
+ return false;
+ return true;
+}
+
+/// Return true if the specified EXTRACT_SUBVECTOR operand specifies a vector
+/// extract that is suitable for instructions that extract 128- or 256-bit
+/// vectors.
+static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
+ assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
+ if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
+ return false;
+
+ // The index should be aligned on a vecWidth-bit boundary.
+ uint64_t Index =
+ cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
+
+ MVT VT = N->getSimpleValueType(0);
+ unsigned ElSize = VT.getVectorElementType().getSizeInBits();
+ bool Result = (Index * ElSize) % vecWidth == 0;
+
+ return Result;
+}
+
+/// Return true if the specified INSERT_SUBVECTOR
+/// operand specifies a subvector insert that is suitable for inserting
+/// 128- or 256-bit subvectors.
+static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) {
+ assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
+ if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
+ return false;
+ // The index should be aligned on a vecWidth-bit boundary.
+ uint64_t Index =
+ cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
+
+ MVT VT = N->getSimpleValueType(0);
+ unsigned ElSize = VT.getVectorElementType().getSizeInBits();
+ bool Result = (Index * ElSize) % vecWidth == 0;
+
+ return Result;
+}
+
+bool X86::isVINSERT128Index(SDNode *N) {
+ return isVINSERTIndex(N, 128);
+}
+
+bool X86::isVINSERT256Index(SDNode *N) {
+ return isVINSERTIndex(N, 256);
+}
+
+bool X86::isVEXTRACT128Index(SDNode *N) {
+ return isVEXTRACTIndex(N, 128);
+}
+
+bool X86::isVEXTRACT256Index(SDNode *N) {
+ return isVEXTRACTIndex(N, 256);
+}
+
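+/// Compute the VEXTRACT immediate for the given extract index: the number of
+/// complete vecWidth-bit chunks below that index. For example, extracting
+/// element 4 of a v8i32 with vecWidth == 128 gives NumElemsPerChunk == 4 and
+/// an immediate of 1.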
+static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
+ assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
+ assert(isa<ConstantSDNode>(N->getOperand(1).getNode()) &&
+ "Illegal extract subvector for VEXTRACT");
+
+ uint64_t Index =
+ cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
+
+ MVT VecVT = N->getOperand(0).getSimpleValueType();
+ MVT ElVT = VecVT.getVectorElementType();
+
+ unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
+ return Index / NumElemsPerChunk;
+}
+
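+/// Compute the VINSERT immediate for the given insert index: the number of
+/// complete vecWidth-bit chunks below that index. For example, inserting a
+/// subvector at element 8 of a v16i32 with vecWidth == 128 gives
+/// NumElemsPerChunk == 4 and an immediate of 2.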
+static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) {
+ assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
+ assert(isa<ConstantSDNode>(N->getOperand(2).getNode()) &&
+ "Illegal insert subvector for VINSERT");
+
+ uint64_t Index =
+ cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
+
+ MVT VecVT = N->getSimpleValueType(0);
+ MVT ElVT = VecVT.getVectorElementType();
+
+ unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
+ return Index / NumElemsPerChunk;
+}
+
+/// Return the appropriate immediate to extract the specified
+/// EXTRACT_SUBVECTOR index with VEXTRACTF128 / VEXTRACTI128 instructions.
+unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) {
+ return getExtractVEXTRACTImmediate(N, 128);
+}
+
+/// Return the appropriate immediate to extract the specified
+/// EXTRACT_SUBVECTOR index with VEXTRACTF64x4 / VEXTRACTI64x4 instructions.
+unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) {
+ return getExtractVEXTRACTImmediate(N, 256);
+}
+
+/// Return the appropriate immediate to insert at the specified
+/// INSERT_SUBVECTOR index with VINSERTF128 and VINSERTI128 instructions.
+unsigned X86::getInsertVINSERT128Immediate(SDNode *N) {
+ return getInsertVINSERTImmediate(N, 128);
+}
+
+/// Return the appropriate immediate to insert at the specified
+/// INSERT_SUBVECTOR index with VINSERTF64x4 and VINSERTI64x4 instructions.
+unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
+ return getInsertVINSERTImmediate(N, 256);
+}
+
+/// Returns true if Elt is a constant zero or a floating point constant +0.0.
+bool X86::isZeroNode(SDValue Elt) {
+ return isNullConstant(Elt) || isNullFPConstant(Elt);
+}
+
+// Build a vector of constants.
+// Use an UNDEF node if MaskElt == -1.
+// Split 64-bit constants in 32-bit mode.
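+// For example, in 32-bit mode a v2i64 constant <1, 2> is built as the v4i32
+// vector <1, 0, 2, 0> and then bitcast back to v2i64.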
+static SDValue getConstVector(ArrayRef<int> Values, MVT VT,
+ SelectionDAG &DAG,
+ SDLoc dl, bool IsMask = false) {
+
+ SmallVector<SDValue, 32> Ops;
+ bool Split = false;
+
+ MVT ConstVecVT = VT;
+ unsigned NumElts = VT.getVectorNumElements();
+ bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
+ if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
+ ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
+ Split = true;
+ }
+
+ MVT EltVT = ConstVecVT.getVectorElementType();
+ for (unsigned i = 0; i < NumElts; ++i) {
+ bool IsUndef = Values[i] < 0 && IsMask;
+ SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
+ DAG.getConstant(Values[i], dl, EltVT);
+ Ops.push_back(OpNode);
+ if (Split)
+ Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
+ DAG.getConstant(0, dl, EltVT));
+ }
+ SDValue ConstsNode = DAG.getNode(ISD::BUILD_VECTOR, dl, ConstVecVT, Ops);
+ if (Split)
+ ConstsNode = DAG.getBitcast(VT, ConstsNode);
+ return ConstsNode;
+}
+
+/// Returns a vector of specified type with all zero elements.
+static SDValue getZeroVector(MVT VT, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG, SDLoc dl) {
+ assert(VT.isVector() && "Expected a vector type");
+
+ // Always build SSE zero vectors as <4 x i32> bitcasted
+ // to their dest type. This ensures they get CSE'd.
+ SDValue Vec;
+ if (VT.is128BitVector()) { // SSE
+ if (Subtarget->hasSSE2()) { // SSE2
+ SDValue Cst = DAG.getConstant(0, dl, MVT::i32);
+ Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
+ } else { // SSE1
+ SDValue Cst = DAG.getConstantFP(+0.0, dl, MVT::f32);
+ Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
+ }
+ } else if (VT.is256BitVector()) { // AVX
+ if (Subtarget->hasInt256()) { // AVX2
+ SDValue Cst = DAG.getConstant(0, dl, MVT::i32);
+ SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
+ Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
+ } else {
+ // 256-bit logic and arithmetic instructions in AVX are all
+ // floating-point, no support for integer ops. Emit fp zeroed vectors.
+ SDValue Cst = DAG.getConstantFP(+0.0, dl, MVT::f32);
+ SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
+ Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops);
+ }
+ } else if (VT.is512BitVector()) { // AVX-512
+ SDValue Cst = DAG.getConstant(0, dl, MVT::i32);
+ SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
+ Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
+ Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops);
+ } else if (VT.getVectorElementType() == MVT::i1) {
+
+ assert((Subtarget->hasBWI() || VT.getVectorNumElements() <= 16)
+ && "Unexpected vector type");
+ assert((Subtarget->hasVLX() || VT.getVectorNumElements() >= 8)
+ && "Unexpected vector type");
+ SDValue Cst = DAG.getConstant(0, dl, MVT::i1);
+ SmallVector<SDValue, 64> Ops(VT.getVectorNumElements(), Cst);
+ return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
+ } else
+ llvm_unreachable("Unexpected vector type");
+
+ return DAG.getBitcast(VT, Vec);
+}
+
+static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal,
+ SelectionDAG &DAG, SDLoc dl,
+ unsigned vectorWidth) {
+ assert((vectorWidth == 128 || vectorWidth == 256) &&
+ "Unsupported vector width");
+ EVT VT = Vec.getValueType();
+ EVT ElVT = VT.getVectorElementType();
+ unsigned Factor = VT.getSizeInBits()/vectorWidth;
+ EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
+ VT.getVectorNumElements()/Factor);
+
+ // Extract from UNDEF is UNDEF.
+ if (Vec.getOpcode() == ISD::UNDEF)
+ return DAG.getUNDEF(ResultVT);
+
+ // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
+ unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
+ assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
+
+  // This is the index of the first element of the vectorWidth-bit chunk
+  // we want. Since ElemsPerChunk is a power of 2, we just need to clear the
+  // low bits.
+ IdxVal &= ~(ElemsPerChunk - 1);
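+  // For example, extracting 128 bits from a v8i32 gives ElemsPerChunk == 4,
+  // so an IdxVal of 5 is rounded down to 4 (the second 128-bit chunk).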
+
+ // If the input is a buildvector just emit a smaller one.
+ if (Vec.getOpcode() == ISD::BUILD_VECTOR)
+ return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
+ makeArrayRef(Vec->op_begin() + IdxVal, ElemsPerChunk));
+
+ SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
+}
+
+/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
+/// sets things up to match an AVX VEXTRACTF128 / VEXTRACTI128
+/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
+/// instructions or a simple subregister reference. Idx is an index in the
+/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
+/// lowering EXTRACT_VECTOR_ELT operations easier.
+static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
+ SelectionDAG &DAG, SDLoc dl) {
+ assert((Vec.getValueType().is256BitVector() ||
+ Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
+ return ExtractSubVector(Vec, IdxVal, DAG, dl, 128);
+}
+
+/// Generate a DAG to grab 256-bits from a 512-bit vector.
+static SDValue Extract256BitVector(SDValue Vec, unsigned IdxVal,
+ SelectionDAG &DAG, SDLoc dl) {
+ assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
+ return ExtractSubVector(Vec, IdxVal, DAG, dl, 256);
+}
+
+static SDValue InsertSubVector(SDValue Result, SDValue Vec,
+ unsigned IdxVal, SelectionDAG &DAG,
+ SDLoc dl, unsigned vectorWidth) {
+ assert((vectorWidth == 128 || vectorWidth == 256) &&
+ "Unsupported vector width");
+  // Inserting UNDEF just returns Result unchanged.
+ if (Vec.getOpcode() == ISD::UNDEF)
+ return Result;
+ EVT VT = Vec.getValueType();
+ EVT ElVT = VT.getVectorElementType();
+ EVT ResultVT = Result.getValueType();
+
+ // Insert the relevant vectorWidth bits.
+ unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
+ assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
+
+  // This is the index of the first element of the vectorWidth-bit chunk
+  // we want. Since ElemsPerChunk is a power of 2, we just need to clear the
+  // low bits.
+ IdxVal &= ~(ElemsPerChunk - 1);
+
+ SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
+}
+
+/// Generate a DAG to put 128-bits into a vector > 128 bits. This
+/// sets things up to match an AVX VINSERTF128/VINSERTI128 or
+/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
+/// simple superregister reference. Idx is an index in the 128 bits
+/// we want. It need not be aligned to a 128-bit boundary. That makes
+/// lowering INSERT_VECTOR_ELT operations easier.
+static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
+ SelectionDAG &DAG, SDLoc dl) {
+ assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
+
+ // For insertion into the zero index (low half) of a 256-bit vector, it is
+ // more efficient to generate a blend with immediate instead of an insert*128.
+ // We are still creating an INSERT_SUBVECTOR below with an undef node to
+ // extend the subvector to the size of the result vector. Make sure that
+ // we are not recursing on that node by checking for undef here.
+ if (IdxVal == 0 && Result.getValueType().is256BitVector() &&
+ Result.getOpcode() != ISD::UNDEF) {
+ EVT ResultVT = Result.getValueType();
+ SDValue ZeroIndex = DAG.getIntPtrConstant(0, dl);
+ SDValue Undef = DAG.getUNDEF(ResultVT);
+ SDValue Vec256 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Undef,
+ Vec, ZeroIndex);
+
+ // The blend instruction, and therefore its mask, depend on the data type.
+ MVT ScalarType = ResultVT.getVectorElementType().getSimpleVT();
+ if (ScalarType.isFloatingPoint()) {
+ // Choose either vblendps (float) or vblendpd (double).
+ unsigned ScalarSize = ScalarType.getSizeInBits();
+ assert((ScalarSize == 64 || ScalarSize == 32) && "Unknown float type");
+ unsigned MaskVal = (ScalarSize == 64) ? 0x03 : 0x0f;
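+      // The immediate selects the low two of four f64 elements (0x03) or the
+      // low four of eight f32 elements (0x0f) from Vec256.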
+ SDValue Mask = DAG.getConstant(MaskVal, dl, MVT::i8);
+ return DAG.getNode(X86ISD::BLENDI, dl, ResultVT, Result, Vec256, Mask);
+ }
+
+ const X86Subtarget &Subtarget =
+ static_cast<const X86Subtarget &>(DAG.getSubtarget());
+
+ // AVX2 is needed for 256-bit integer blend support.
+ // Integers must be cast to 32-bit because there is only vpblendd;
+ // vpblendw can't be used for this because it has a handicapped mask.
+
+ // If we don't have AVX2, then cast to float. Using a wrong domain blend
+ // is still more efficient than using the wrong domain vinsertf128 that
+ // will be created by InsertSubVector().
+ MVT CastVT = Subtarget.hasAVX2() ? MVT::v8i32 : MVT::v8f32;
+
+ SDValue Mask = DAG.getConstant(0x0f, dl, MVT::i8);
+ Result = DAG.getBitcast(CastVT, Result);
+ Vec256 = DAG.getBitcast(CastVT, Vec256);
+ Vec256 = DAG.getNode(X86ISD::BLENDI, dl, CastVT, Result, Vec256, Mask);
+ return DAG.getBitcast(ResultVT, Vec256);
+ }
+
+ return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
+}
+
+static SDValue Insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
+ SelectionDAG &DAG, SDLoc dl) {
+ assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
+ return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
+}
+
+/// Insert an i1 subvector into an i1 vector.
+static SDValue Insert1BitVector(SDValue Op, SelectionDAG &DAG) {
+
+ SDLoc dl(Op);
+ SDValue Vec = Op.getOperand(0);
+ SDValue SubVec = Op.getOperand(1);
+ SDValue Idx = Op.getOperand(2);
+
+ if (!isa<ConstantSDNode>(Idx))
+ return SDValue();
+
+ unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+ if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
+ return Op;
+
+ MVT OpVT = Op.getSimpleValueType();
+ MVT SubVecVT = SubVec.getSimpleValueType();
+ unsigned NumElems = OpVT.getVectorNumElements();
+ unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
+
+ assert(IdxVal + SubVecNumElems <= NumElems &&
+ IdxVal % SubVecVT.getSizeInBits() == 0 &&
+ "Unexpected index value in INSERT_SUBVECTOR");
+
+ // There are 3 possible cases:
+ // 1. Subvector should be inserted in the lower part (IdxVal == 0)
+ // 2. Subvector should be inserted in the upper part
+ // (IdxVal + SubVecNumElems == NumElems)
+ // 3. Subvector should be inserted in the middle (for example v2i1
+ // to v16i1, index 2)
+
+ SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
+ SDValue Undef = DAG.getUNDEF(OpVT);
+ SDValue WideSubVec =
+ DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef, SubVec, ZeroIdx);
+ if (Vec.isUndef())
+ return DAG.getNode(X86ISD::VSHLI, dl, OpVT, WideSubVec,
+ DAG.getConstant(IdxVal, dl, MVT::i8));
+
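+  // If Vec is known to be all zeros, position WideSubVec with a pair of
+  // zero-filling shifts: left to the top of the register, then right back
+  // down to element position IdxVal.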
+ if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
+ unsigned ShiftLeft = NumElems - SubVecNumElems;
+ unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
+ WideSubVec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, WideSubVec,
+ DAG.getConstant(ShiftLeft, dl, MVT::i8));
+    return ShiftRight ? DAG.getNode(X86ISD::VSRLI, dl, OpVT, WideSubVec,
+                                    DAG.getConstant(ShiftRight, dl, MVT::i8))
+                      : WideSubVec;
+ }
+
+ if (IdxVal == 0) {
+ // Zero lower bits of the Vec
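+    // (e.g. with SubVecNumElems == 2, shifting Vec right and then left by 2
+    // clears its two low bits before the OR with the widened subvector).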
+ SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
+ Vec = DAG.getNode(X86ISD::VSRLI, dl, OpVT, Vec, ShiftBits);
+ Vec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec, ShiftBits);
+ // Merge them together
+ return DAG.getNode(ISD::OR, dl, OpVT, Vec, WideSubVec);
+ }
+
+  // Simple case when we put the subvector in the upper part
+  if (IdxVal + SubVecNumElems == NumElems) {
+    // Move the widened subvector up to its final position.
+    WideSubVec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, WideSubVec,
+                             DAG.getConstant(IdxVal, dl, MVT::i8));
+    // Zero the upper bits of Vec.
+    SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
+    Vec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec, ShiftBits);
+    Vec = DAG.getNode(X86ISD::VSRLI, dl, OpVT, Vec, ShiftBits);
+    return DAG.getNode(ISD::OR, dl, OpVT, Vec, WideSubVec);
+ }
+ // Subvector should be inserted in the middle - use shuffle
+ SmallVector<int, 64> Mask;
+ for (unsigned i = 0; i < NumElems; ++i)
+ Mask.push_back(i >= IdxVal && i < IdxVal + SubVecNumElems ?
+ i : i + NumElems);
+ return DAG.getVectorShuffle(OpVT, dl, WideSubVec, Vec, Mask);
+}
+
+/// Concat two 128-bit vectors into a 256-bit vector using VINSERTF128
+/// instructions. This is used because creating CONCAT_VECTOR nodes of
+/// BUILD_VECTORs returns a larger BUILD_VECTOR while we're trying to lower
+/// large BUILD_VECTORs.
+static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
+ unsigned NumElems, SelectionDAG &DAG,
+ SDLoc dl) {
+ SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
+ return Insert128BitVector(V, V2, NumElems/2, DAG, dl);
+}
+
+static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
+ unsigned NumElems, SelectionDAG &DAG,
+ SDLoc dl) {
+ SDValue V = Insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
+ return Insert256BitVector(V, V2, NumElems/2, DAG, dl);
+}
+
+/// Returns a vector of specified type with all bits set.
+/// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
+/// no AVX2 support, use two <4 x i32> inserted in a <8 x i32> appropriately.
+/// Then bitcast to their original type, ensuring they get CSE'd.
+static SDValue getOnesVector(EVT VT, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG, SDLoc dl) {
+ assert(VT.isVector() && "Expected a vector type");
+
+ SDValue Cst = DAG.getConstant(~0U, dl, MVT::i32);
+ SDValue Vec;
+ if (VT.is512BitVector()) {
+ SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
+ Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
+ Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops);
+ } else if (VT.is256BitVector()) {
+ if (Subtarget->hasInt256()) { // AVX2
+ SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
+ Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
+ } else { // AVX
+ Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
+ Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
+ }
+ } else if (VT.is128BitVector()) {
+ Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
+ } else
+ llvm_unreachable("Unexpected vector type");
+
+ return DAG.getBitcast(VT, Vec);
+}
+
+/// Returns a vector_shuffle node for an unpackl operation.
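+/// For example, for v4i32 this builds the mask <0, 4, 1, 5>, interleaving
+/// the low halves of V1 and V2.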
+static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
+ SDValue V2) {
+ unsigned NumElems = VT.getVectorNumElements();
+ SmallVector<int, 8> Mask;
+ for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
+ Mask.push_back(i);
+ Mask.push_back(i + NumElems);
+ }
+ return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
+}
+
+/// Returns a vector_shuffle node for an unpackh operation.
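+/// For example, for v4i32 this builds the mask <2, 6, 3, 7>, interleaving
+/// the high halves of V1 and V2.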
+static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
+ SDValue V2) {
+ unsigned NumElems = VT.getVectorNumElements();
+ SmallVector<int, 8> Mask;
+ for (unsigned i = 0, Half = NumElems/2; i != Half; ++i) {
+ Mask.push_back(i + Half);
+ Mask.push_back(i + NumElems + Half);
+ }
+ return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
+}
+
+/// Return a vector_shuffle of the specified vector and a zero or undef vector.
+/// This produces a shuffle where the low element of V2 is swizzled into the
+/// zero/undef vector, landing at element Idx.
+/// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
+static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
+ bool IsZero,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ MVT VT = V2.getSimpleValueType();
+ SDValue V1 = IsZero
+ ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
+ unsigned NumElems = VT.getVectorNumElements();
+ SmallVector<int, 16> MaskVec;
+ for (unsigned i = 0; i != NumElems; ++i)
+ // If this is the insertion idx, put the low elt of V2 here.
+ MaskVec.push_back(i == Idx ? NumElems : i);
+ return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, &MaskVec[0]);
+}
+
+/// Calculates the shuffle mask corresponding to the target-specific opcode.
+/// Returns true if the Mask could be calculated. Sets IsUnary to true if the
+/// shuffle uses only one source. Note that this will set IsUnary for shuffles
+/// which use a single input multiple times, and in those cases it will adjust
+/// the mask to only have indices within that single input.
+/// FIXME: Add support for Decode*Mask functions that return SM_SentinelZero.
+static bool getTargetShuffleMask(SDNode *N, MVT VT,
+ SmallVectorImpl<int> &Mask, bool &IsUnary) {
+ unsigned NumElems = VT.getVectorNumElements();
+ SDValue ImmN;
+
+ IsUnary = false;
+ bool IsFakeUnary = false;
+ switch(N->getOpcode()) {
+ case X86ISD::BLENDI:
+ ImmN = N->getOperand(N->getNumOperands()-1);
+ DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ break;
+ case X86ISD::SHUFP:
+ ImmN = N->getOperand(N->getNumOperands()-1);
+ DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
+ break;
+ case X86ISD::UNPCKH:
+ DecodeUNPCKHMask(VT, Mask);
+ IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
+ break;
+ case X86ISD::UNPCKL:
+ DecodeUNPCKLMask(VT, Mask);
+ IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
+ break;
+ case X86ISD::MOVHLPS:
+ DecodeMOVHLPSMask(NumElems, Mask);
+ IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
+ break;
+ case X86ISD::MOVLHPS:
+ DecodeMOVLHPSMask(NumElems, Mask);
+ IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
+ break;
+ case X86ISD::PALIGNR:
+ ImmN = N->getOperand(N->getNumOperands()-1);
+ DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ break;
+ case X86ISD::PSHUFD:
+ case X86ISD::VPERMILPI:
+ ImmN = N->getOperand(N->getNumOperands()-1);
+ DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ IsUnary = true;
+ break;
+ case X86ISD::PSHUFHW:
+ ImmN = N->getOperand(N->getNumOperands()-1);
+ DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ IsUnary = true;
+ break;
+ case X86ISD::PSHUFLW:
+ ImmN = N->getOperand(N->getNumOperands()-1);
+ DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ IsUnary = true;
+ break;
+ case X86ISD::PSHUFB: {
+ IsUnary = true;
+ SDValue MaskNode = N->getOperand(1);
+ while (MaskNode->getOpcode() == ISD::BITCAST)
+ MaskNode = MaskNode->getOperand(0);
+
+ if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) {
+ // If we have a build-vector, then things are easy.
+ MVT VT = MaskNode.getSimpleValueType();
+ assert(VT.isVector() &&
+ "Can't produce a non-vector with a build_vector!");
+ if (!VT.isInteger())
+ return false;
+
+ int NumBytesPerElement = VT.getVectorElementType().getSizeInBits() / 8;
+
+ SmallVector<uint64_t, 32> RawMask;
+ for (int i = 0, e = MaskNode->getNumOperands(); i < e; ++i) {
+ SDValue Op = MaskNode->getOperand(i);
+ if (Op->getOpcode() == ISD::UNDEF) {
+ RawMask.push_back((uint64_t)SM_SentinelUndef);
+ continue;
+ }
+ auto *CN = dyn_cast<ConstantSDNode>(Op.getNode());
+ if (!CN)
+ return false;
+ APInt MaskElement = CN->getAPIntValue();
+
+        // We now have to decode the element, which could be any integer
+        // size, and extract each byte of it.
+ for (int j = 0; j < NumBytesPerElement; ++j) {
+ // Note that this is x86 and so always little endian: the low byte is
+ // the first byte of the mask.
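+          // (e.g. a v4i32 mask element 0x03020100 contributes the bytes
+          // 0x00, 0x01, 0x02, 0x03 to RawMask, in that order).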
+ RawMask.push_back(MaskElement.getLoBits(8).getZExtValue());
+ MaskElement = MaskElement.lshr(8);
+ }
+ }
+ DecodePSHUFBMask(RawMask, Mask);
+ break;
+ }
+
+ auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode);
+ if (!MaskLoad)
+ return false;
+
+ SDValue Ptr = MaskLoad->getBasePtr();
+ if (Ptr->getOpcode() == X86ISD::Wrapper ||
+ Ptr->getOpcode() == X86ISD::WrapperRIP)
+ Ptr = Ptr->getOperand(0);
+
+ auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr);
+ if (!MaskCP || MaskCP->isMachineConstantPoolEntry())
+ return false;
+
+ if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) {
+ DecodePSHUFBMask(C, Mask);
+ break;
+ }
+
+ return false;
+ }
+ case X86ISD::VPERMI:
+ ImmN = N->getOperand(N->getNumOperands()-1);
+ DecodeVPERMMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ IsUnary = true;
+ break;
+ case X86ISD::MOVSS:
+ case X86ISD::MOVSD:
+ DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask);
+ break;
+ case X86ISD::VPERM2X128:
+ ImmN = N->getOperand(N->getNumOperands()-1);
+ DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+    // The mask only contains a negative index if an element is zero.
+ if (std::any_of(Mask.begin(), Mask.end(),
+ [](int M){ return M == SM_SentinelZero; }))
+ return false;
+ break;
+ case X86ISD::MOVSLDUP:
+ DecodeMOVSLDUPMask(VT, Mask);
+ IsUnary = true;
+ break;
+ case X86ISD::MOVSHDUP:
+ DecodeMOVSHDUPMask(VT, Mask);
+ IsUnary = true;
+ break;
+ case X86ISD::MOVDDUP:
+ DecodeMOVDDUPMask(VT, Mask);
+ IsUnary = true;
+ break;
+ case X86ISD::MOVLHPD:
+ case X86ISD::MOVLPD:
+ case X86ISD::MOVLPS:
+ // Not yet implemented
+ return false;
+ case X86ISD::VPERMV: {
+ IsUnary = true;
+ SDValue MaskNode = N->getOperand(0);
+ while (MaskNode->getOpcode() == ISD::BITCAST)
+ MaskNode = MaskNode->getOperand(0);
+
+ unsigned MaskLoBits = Log2_64(VT.getVectorNumElements());
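+    // Only the low log2(NumElts) bits of each mask element are significant
+    // to VPERMV, e.g. the low 3 bits per element for an 8-element permute.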
+ SmallVector<uint64_t, 32> RawMask;
+ if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) {
+ // If we have a build-vector, then things are easy.
+ assert(MaskNode.getSimpleValueType().isInteger() &&
+ MaskNode.getSimpleValueType().getVectorNumElements() ==
+ VT.getVectorNumElements());
+
+ for (unsigned i = 0; i < MaskNode->getNumOperands(); ++i) {
+ SDValue Op = MaskNode->getOperand(i);
+ if (Op->getOpcode() == ISD::UNDEF)
+ RawMask.push_back((uint64_t)SM_SentinelUndef);
+ else if (isa<ConstantSDNode>(Op)) {
+ APInt MaskElement = cast<ConstantSDNode>(Op)->getAPIntValue();
+ RawMask.push_back(MaskElement.getLoBits(MaskLoBits).getZExtValue());
+ } else
+ return false;
+ }
+ DecodeVPERMVMask(RawMask, Mask);
+ break;
+ }
+ if (MaskNode->getOpcode() == X86ISD::VBROADCAST) {
+ unsigned NumEltsInMask = MaskNode->getNumOperands();
+ MaskNode = MaskNode->getOperand(0);
+ if (auto *CN = dyn_cast<ConstantSDNode>(MaskNode)) {
+ APInt MaskEltValue = CN->getAPIntValue();
+ for (unsigned i = 0; i < NumEltsInMask; ++i)
+ RawMask.push_back(MaskEltValue.getLoBits(MaskLoBits).getZExtValue());
+ DecodeVPERMVMask(RawMask, Mask);
+ break;
+ }
+ // It may be a scalar load
+ }
+
+ auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode);
+ if (!MaskLoad)
+ return false;
+
+ SDValue Ptr = MaskLoad->getBasePtr();
+ if (Ptr->getOpcode() == X86ISD::Wrapper ||
+ Ptr->getOpcode() == X86ISD::WrapperRIP)
+ Ptr = Ptr->getOperand(0);
+
+ auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr);
+ if (!MaskCP || MaskCP->isMachineConstantPoolEntry())
+ return false;
+
+ if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) {
+ DecodeVPERMVMask(C, VT, Mask);
+ break;
+ }
+ return false;
+ }
+ case X86ISD::VPERMV3: {
+ IsUnary = false;
+ SDValue MaskNode = N->getOperand(1);
+ while (MaskNode->getOpcode() == ISD::BITCAST)
+      MaskNode = MaskNode->getOperand(0);
+
+ if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) {
+ // If we have a build-vector, then things are easy.
+ assert(MaskNode.getSimpleValueType().isInteger() &&
+ MaskNode.getSimpleValueType().getVectorNumElements() ==
+ VT.getVectorNumElements());
+
+ SmallVector<uint64_t, 32> RawMask;
+ unsigned MaskLoBits = Log2_64(VT.getVectorNumElements()*2);
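+      // VPERMV3 selects from the concatenation of two source vectors, so
+      // each mask element carries one extra significant bit (log2(2*NumElts)).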
+
+ for (unsigned i = 0; i < MaskNode->getNumOperands(); ++i) {
+ SDValue Op = MaskNode->getOperand(i);
+ if (Op->getOpcode() == ISD::UNDEF)
+ RawMask.push_back((uint64_t)SM_SentinelUndef);
+ else {
+ auto *CN = dyn_cast<ConstantSDNode>(Op.getNode());
+ if (!CN)
+ return false;
+ APInt MaskElement = CN->getAPIntValue();
+ RawMask.push_back(MaskElement.getLoBits(MaskLoBits).getZExtValue());
+ }
+ }
+ DecodeVPERMV3Mask(RawMask, Mask);
+ break;
+ }
+
+ auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode);
+ if (!MaskLoad)
+ return false;
+
+ SDValue Ptr = MaskLoad->getBasePtr();
+ if (Ptr->getOpcode() == X86ISD::Wrapper ||
+ Ptr->getOpcode() == X86ISD::WrapperRIP)
+ Ptr = Ptr->getOperand(0);
+
+ auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr);
+ if (!MaskCP || MaskCP->isMachineConstantPoolEntry())
+ return false;
+
+ if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) {
+ DecodeVPERMV3Mask(C, VT, Mask);
+ break;
+ }
+ return false;
+ }
+ default: llvm_unreachable("unknown target shuffle node");
+ }
+
+ // Empty mask indicates the decode failed.
+ if (Mask.empty())
+ return false;
+
+ // If we have a fake unary shuffle, the shuffle mask is spread across two
+ // inputs that are actually the same node. Re-map the mask to always point
+ // into the first input.
+ if (IsFakeUnary)
+ for (int &M : Mask)
+ if (M >= (int)Mask.size())
+ M -= Mask.size();
+
+ return true;
+}
+
+/// Returns the scalar element that will make up the ith
+/// element of the result of the vector shuffle.
+static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
+ unsigned Depth) {
+ if (Depth == 6)
+ return SDValue(); // Limit search depth.
+
+ SDValue V = SDValue(N, 0);
+ EVT VT = V.getValueType();
+ unsigned Opcode = V.getOpcode();
+
+ // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
+ if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
+ int Elt = SV->getMaskElt(Index);
+
+ if (Elt < 0)
+ return DAG.getUNDEF(VT.getVectorElementType());
+
+ unsigned NumElems = VT.getVectorNumElements();
+ SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
+ : SV->getOperand(1);
+ return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
+ }
+
+ // Recurse into target specific vector shuffles to find scalars.
+ if (isTargetShuffle(Opcode)) {
+ MVT ShufVT = V.getSimpleValueType();
+ unsigned NumElems = ShufVT.getVectorNumElements();
+ SmallVector<int, 16> ShuffleMask;
+ bool IsUnary;
+
+ if (!getTargetShuffleMask(N, ShufVT, ShuffleMask, IsUnary))
+ return SDValue();
+
+ int Elt = ShuffleMask[Index];
+ if (Elt < 0)
+ return DAG.getUNDEF(ShufVT.getVectorElementType());
+
+ SDValue NewV = (Elt < (int)NumElems) ? N->getOperand(0)
+ : N->getOperand(1);
+ return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
+ Depth+1);
+ }
+
+ // Actual nodes that may contain scalar elements
+ if (Opcode == ISD::BITCAST) {
+ V = V.getOperand(0);
+ EVT SrcVT = V.getValueType();
+ unsigned NumElems = VT.getVectorNumElements();
+
+ if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
+ return SDValue();
+ }
+
+ if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
+ return (Index == 0) ? V.getOperand(0)
+ : DAG.getUNDEF(VT.getVectorElementType());
+
+ if (V.getOpcode() == ISD::BUILD_VECTOR)
+ return V.getOperand(Index);
+
+ return SDValue();
+}
+
+/// Custom lower build_vector of v16i8.
+static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
+ unsigned NumNonZero, unsigned NumZero,
+ SelectionDAG &DAG,
+ const X86Subtarget* Subtarget,
+ const TargetLowering &TLI) {
+ if (NumNonZero > 8)
+ return SDValue();
+
+ SDLoc dl(Op);
+ SDValue V;
+ bool First = true;
+
+ // SSE4.1 - use PINSRB to insert each byte directly.
+ if (Subtarget->hasSSE41()) {
+ for (unsigned i = 0; i < 16; ++i) {
+ bool isNonZero = (NonZeros & (1 << i)) != 0;
+ if (isNonZero) {
+ if (First) {
+ if (NumZero)
+ V = getZeroVector(MVT::v16i8, Subtarget, DAG, dl);
+ else
+ V = DAG.getUNDEF(MVT::v16i8);
+ First = false;
+ }
+ V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
+ MVT::v16i8, V, Op.getOperand(i),
+ DAG.getIntPtrConstant(i, dl));
+ }
+ }
+
+ return V;
+ }
+
+ // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
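+  // For example, adjacent bytes b0 and b1 become the i16 value
+  // (zext(b1) << 8) | zext(b0), which is inserted into word slot 0 of a
+  // v8i16; the whole vector is bitcast back to v16i8 at the end.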
+ for (unsigned i = 0; i < 16; ++i) {
+ bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
+ if (ThisIsNonZero && First) {
+ if (NumZero)
+ V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
+ else
+ V = DAG.getUNDEF(MVT::v8i16);
+ First = false;
+ }
+
+ if ((i & 1) != 0) {
+ SDValue ThisElt, LastElt;
+ bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
+ if (LastIsNonZero) {
+ LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
+ MVT::i16, Op.getOperand(i-1));
+ }
+ if (ThisIsNonZero) {
+ ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
+ ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
+ ThisElt, DAG.getConstant(8, dl, MVT::i8));
+ if (LastIsNonZero)
+ ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
+ } else
+ ThisElt = LastElt;
+
+ if (ThisElt.getNode())
+ V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
+ DAG.getIntPtrConstant(i/2, dl));
+ }
+ }
+
+ return DAG.getBitcast(MVT::v16i8, V);
+}
+
+/// Custom lower build_vector of v8i16.
+static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
+ unsigned NumNonZero, unsigned NumZero,
+ SelectionDAG &DAG,
+ const X86Subtarget* Subtarget,
+ const TargetLowering &TLI) {
+ if (NumNonZero > 4)
+ return SDValue();
+
+ SDLoc dl(Op);
+ SDValue V;
+ bool First = true;
+ for (unsigned i = 0; i < 8; ++i) {
+ bool isNonZero = (NonZeros & (1 << i)) != 0;
+ if (isNonZero) {
+ if (First) {
+ if (NumZero)
+ V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
+ else
+ V = DAG.getUNDEF(MVT::v8i16);
+ First = false;
+ }
+ V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
+ MVT::v8i16, V, Op.getOperand(i),
+ DAG.getIntPtrConstant(i, dl));
+ }
+ }
+
+ return V;
+}
+
+/// Custom lower build_vector of v4i32 or v4f32.
+static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget,
+ const TargetLowering &TLI) {
+ // Find all zeroable elements.
+ std::bitset<4> Zeroable;
+ for (int i=0; i < 4; ++i) {
+ SDValue Elt = Op->getOperand(i);
+ Zeroable[i] = (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt));
+ }
+ assert(Zeroable.size() - Zeroable.count() > 1 &&
+ "We expect at least two non-zero elements!");
+
+ // We only know how to deal with build_vector nodes where elements are either
+ // zeroable or extract_vector_elt with constant index.
+ SDValue FirstNonZero;
+ unsigned FirstNonZeroIdx;
+ for (unsigned i=0; i < 4; ++i) {
+ if (Zeroable[i])
+ continue;
+ SDValue Elt = Op->getOperand(i);
+ if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ !isa<ConstantSDNode>(Elt.getOperand(1)))
+ return SDValue();
+ // Make sure that this node is extracting from a 128-bit vector.
+ MVT VT = Elt.getOperand(0).getSimpleValueType();
+ if (!VT.is128BitVector())
+ return SDValue();
+ if (!FirstNonZero.getNode()) {
+ FirstNonZero = Elt;
+ FirstNonZeroIdx = i;
+ }
+ }
+
+ assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
+ SDValue V1 = FirstNonZero.getOperand(0);
+ MVT VT = V1.getSimpleValueType();
+
+ // See if this build_vector can be lowered as a blend with zero.
+ SDValue Elt;
+ unsigned EltMaskIdx, EltIdx;
+ int Mask[4];
+ for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
+ if (Zeroable[EltIdx]) {
+ // The zero vector will be on the right hand side.
+ Mask[EltIdx] = EltIdx+4;
+ continue;
+ }
+
+ Elt = Op->getOperand(EltIdx);
+    // By construction, Elt is an EXTRACT_VECTOR_ELT with a constant index.
+ EltMaskIdx = cast<ConstantSDNode>(Elt.getOperand(1))->getZExtValue();
+ if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
+ break;
+ Mask[EltIdx] = EltIdx;
+ }
+
+ if (EltIdx == 4) {
+ // Let the shuffle legalizer deal with blend operations.
+ SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
+ if (V1.getSimpleValueType() != VT)
+ V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), VT, V1);
+ return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, &Mask[0]);
+ }
+
+ // See if we can lower this build_vector to a INSERTPS.
+ if (!Subtarget->hasSSE41())
+ return SDValue();
+
+ SDValue V2 = Elt.getOperand(0);
+ if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
+ V1 = SDValue();
+
+ bool CanFold = true;
+ for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
+ if (Zeroable[i])
+ continue;
+
+ SDValue Current = Op->getOperand(i);
+ SDValue SrcVector = Current->getOperand(0);
+ if (!V1.getNode())
+ V1 = SrcVector;
+ CanFold = SrcVector == V1 &&
+ cast<ConstantSDNode>(Current.getOperand(1))->getZExtValue() == i;
+ }
+
+ if (!CanFold)
+ return SDValue();
+
+ assert(V1.getNode() && "Expected at least two non-zero elements!");
+ if (V1.getSimpleValueType() != MVT::v4f32)
+ V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), MVT::v4f32, V1);
+ if (V2.getSimpleValueType() != MVT::v4f32)
+ V2 = DAG.getNode(ISD::BITCAST, SDLoc(V2), MVT::v4f32, V2);
+
+ // Ok, we can emit an INSERTPS instruction.
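+  // The INSERTPS immediate encodes the source element in bits [7:6], the
+  // destination element in bits [5:4], and a zero mask in bits [3:0]; each
+  // set zero-mask bit clears the corresponding destination lane.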
+ unsigned ZMask = Zeroable.to_ulong();
+
+ unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
+ assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
+ SDLoc DL(Op);
+ SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
+ DAG.getIntPtrConstant(InsertPSMask, DL));
+ return DAG.getBitcast(VT, Result);
+}
+
+/// Return a vector logical shift node.
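+/// The shift is emitted as a whole-register byte shift (VSHLDQ/VSRLDQ) of the
+/// value bitcast to v2i64, so e.g. a 32-bit left shift of a v4i32 becomes a
+/// 4-byte PSLLDQ.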
+static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
+ unsigned NumBits, SelectionDAG &DAG,
+ const TargetLowering &TLI, SDLoc dl) {
+ assert(VT.is128BitVector() && "Unknown type for VShift");
+ MVT ShVT = MVT::v2i64;
+ unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
+ SrcOp = DAG.getBitcast(ShVT, SrcOp);
+ MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(DAG.getDataLayout(), VT);
+ assert(NumBits % 8 == 0 && "Only support byte sized shifts");
+ SDValue ShiftVal = DAG.getConstant(NumBits/8, dl, ScalarShiftTy);
+ return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
+}
+
+static SDValue
+LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) {
+
+ // Check if the scalar load can be widened into a vector load. And if
+ // the address is "base + cst" see if the cst can be "absorbed" into
+ // the shuffle mask.
+ if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
+ SDValue Ptr = LD->getBasePtr();
+ if (!ISD::isNormalLoad(LD) || LD->isVolatile())
+ return SDValue();
+ EVT PVT = LD->getValueType(0);
+ if (PVT != MVT::i32 && PVT != MVT::f32)
+ return SDValue();
+
+ int FI = -1;
+ int64_t Offset = 0;
+ if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
+ FI = FINode->getIndex();
+ Offset = 0;
+ } else if (DAG.isBaseWithConstantOffset(Ptr) &&
+ isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
+ FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
+ Offset = Ptr.getConstantOperandVal(1);
+ Ptr = Ptr.getOperand(0);
+ } else {
+ return SDValue();
+ }
+
+    // FIXME: 256-bit vector instructions don't require strict alignment;
+    // improve this code to support them better.
+ unsigned RequiredAlign = VT.getSizeInBits()/8;
+ SDValue Chain = LD->getChain();
+ // Make sure the stack object alignment is at least 16 or 32.
+ MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+ if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
+ if (MFI->isFixedObjectIndex(FI)) {
+        // Can't change the alignment. FIXME: It's possible to compute the
+        // exact stack offset and reference FI + adjusted offset instead,
+        // if someone *really* cares about this; that's the way to implement it.
+ return SDValue();
+ } else {
+ MFI->setObjectAlignment(FI, RequiredAlign);
+ }
+ }
+
+    // (Offset % 16 or 32) must be a multiple of 4. The address is then
+    // Ptr + (Offset & ~(RequiredAlign - 1)).
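+    // For example, with RequiredAlign == 16 and Offset == 20, the widened
+    // load starts at StartOffset == 16 and element (20 - 16) >> 2 == 1 of
+    // the wide vector is splatted.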
+ if (Offset < 0)
+ return SDValue();
+ if ((Offset % RequiredAlign) & 3)
+ return SDValue();
+ int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1);
+ if (StartOffset) {
+ SDLoc DL(Ptr);
+ Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
+ DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
+ }
+
+ int EltNo = (Offset - StartOffset) >> 2;
+ unsigned NumElems = VT.getVectorNumElements();
+
+ EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
+ SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
+ LD->getPointerInfo().getWithOffset(StartOffset),
+ false, false, false, 0);
+
+ SmallVector<int, 8> Mask(NumElems, EltNo);
+
+ return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]);
+ }
+
+ return SDValue();
+}
+
+/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
+/// elements can be replaced by a single large load which has the same value as
+/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
+///
+/// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a
+///
+/// FIXME: we'd also like to handle the case where the last elements are zero
+/// rather than undef via VZEXT_LOAD, but we do not detect that case today.
+/// There's even a handy isZeroNode for that purpose.
+static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
+ SDLoc &DL, SelectionDAG &DAG,
+ bool isAfterLegalize) {
+ unsigned NumElems = Elts.size();
+
+ LoadSDNode *LDBase = nullptr;
+ unsigned LastLoadedElt = -1U;
+
+ // For each element in the initializer, see if we've found a load or an undef.
+ // If we don't find an initial load element, or later load elements are
+ // non-consecutive, bail out.
+ for (unsigned i = 0; i < NumElems; ++i) {
+ SDValue Elt = Elts[i];
+ // Look through a bitcast.
+ if (Elt.getNode() && Elt.getOpcode() == ISD::BITCAST)
+ Elt = Elt.getOperand(0);
+ if (!Elt.getNode() ||
+ (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
+ return SDValue();
+ if (!LDBase) {
+ if (Elt.getNode()->getOpcode() == ISD::UNDEF)
+ return SDValue();
+ LDBase = cast<LoadSDNode>(Elt.getNode());
+ LastLoadedElt = i;
+ continue;
+ }
+ if (Elt.getOpcode() == ISD::UNDEF)
+ continue;
+
+ LoadSDNode *LD = cast<LoadSDNode>(Elt);
+ EVT LdVT = Elt.getValueType();
+ // Each loaded element must be the correct fractional portion of the
+ // requested vector load.
+ if (LdVT.getSizeInBits() != VT.getSizeInBits() / NumElems)
+ return SDValue();
+ if (!DAG.isConsecutiveLoad(LD, LDBase, LdVT.getSizeInBits() / 8, i))
+ return SDValue();
+ LastLoadedElt = i;
+ }
+
+ // If we have found an entire vector of loads and undefs, then return a large
+ // load of the entire vector width starting at the base pointer. If we found
+ // consecutive loads for the low half, generate a vzext_load node.
+ if (LastLoadedElt == NumElems - 1) {
+ assert(LDBase && "Did not find base load for merging consecutive loads");
+ EVT EltVT = LDBase->getValueType(0);
+ // Ensure that the input vector size for the merged loads matches the
+ // cumulative size of the input elements.
+ if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems)
+ return SDValue();
+
+ if (isAfterLegalize &&
+ !DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, VT))
+ return SDValue();
+
+ SDValue NewLd = SDValue();
+
+ NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
+ LDBase->getPointerInfo(), LDBase->isVolatile(),
+ LDBase->isNonTemporal(), LDBase->isInvariant(),
+ LDBase->getAlignment());
+
+ if (LDBase->hasAnyUseOfValue(1)) {
+ SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
+ SDValue(LDBase, 1),
+ SDValue(NewLd.getNode(), 1));
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
+ DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
+ SDValue(NewLd.getNode(), 1));
+ }
+
+ return NewLd;
+ }
+
+  // TODO: The code below fires only for loading the low v2i32 / v2f32
+  // of a v4i32 / v4f32. It's probably worth generalizing.
+ EVT EltVT = VT.getVectorElementType();
+ if (NumElems == 4 && LastLoadedElt == 1 && (EltVT.getSizeInBits() == 32) &&
+ DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
+ SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
+ SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
+ SDValue ResNode =
+ DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, MVT::i64,
+ LDBase->getPointerInfo(),
+ LDBase->getAlignment(),
+ false/*isVolatile*/, true/*ReadMem*/,
+ false/*WriteMem*/);
+
+ // Make sure the newly-created LOAD is in the same position as LDBase in
+ // terms of dependency. We create a TokenFactor for LDBase and ResNode, and
+ // update uses of LDBase's output chain to use the TokenFactor.
+ if (LDBase->hasAnyUseOfValue(1)) {
+ SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
+ SDValue(LDBase, 1), SDValue(ResNode.getNode(), 1));
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
+ DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
+ SDValue(ResNode.getNode(), 1));
+ }
+
+ return DAG.getBitcast(VT, ResNode);
+ }
+ return SDValue();
+}
+
+/// LowerVectorBroadcast - Attempt to use the vbroadcast instruction
+/// to generate a splat value for the following cases:
+/// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant.
+/// 2. A splat shuffle which uses a scalar_to_vector node which comes from
+/// a scalar load, or a constant.
+/// The VBROADCAST node is returned when a pattern is found,
+/// or SDValue() otherwise.
+static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
+ SelectionDAG &DAG) {
+ // VBROADCAST requires AVX.
+ // TODO: Splats could be generated for non-AVX CPUs using SSE
+ // instructions, but there's less potential gain for only 128-bit vectors.
+ if (!Subtarget->hasAVX())
+ return SDValue();
+
+ MVT VT = Op.getSimpleValueType();
+ SDLoc dl(Op);
+
+ assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
+ "Unsupported vector type for broadcast.");
+
+ SDValue Ld;
+ bool ConstSplatVal;
+
+ switch (Op.getOpcode()) {
+ default:
+ // Unknown pattern found.
+ return SDValue();
+
+ case ISD::BUILD_VECTOR: {
+ auto *BVOp = cast<BuildVectorSDNode>(Op.getNode());
+ BitVector UndefElements;
+ SDValue Splat = BVOp->getSplatValue(&UndefElements);
+
+ // We need a splat of a single value to use broadcast, and it doesn't
+ // make any sense if the value is only in one element of the vector.
+ if (!Splat || (VT.getVectorNumElements() - UndefElements.count()) <= 1)
+ return SDValue();
+
+ Ld = Splat;
+ ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
+ Ld.getOpcode() == ISD::ConstantFP);
+
+ // Make sure that all of the users of a non-constant load are from the
+ // BUILD_VECTOR node.
+ if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
+ return SDValue();
+ break;
+ }
+
+ case ISD::VECTOR_SHUFFLE: {
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+
+ // Shuffles must have a splat mask where the first element is
+ // broadcasted.
+ if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0)
+ return SDValue();
+
+ SDValue Sc = Op.getOperand(0);
+ if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR &&
+ Sc.getOpcode() != ISD::BUILD_VECTOR) {
+
+ if (!Subtarget->hasInt256())
+ return SDValue();
+
+ // Use the register form of the broadcast instruction available on AVX2.
+ if (VT.getSizeInBits() >= 256)
+ Sc = Extract128BitVector(Sc, 0, DAG, dl);
+ return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc);
+ }
+
+ Ld = Sc.getOperand(0);
+ ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
+ Ld.getOpcode() == ISD::ConstantFP);
+
+ // The scalar_to_vector node and the suspected
+ // load node must have exactly one user.
+ // Constants may have multiple users.
+
+    // AVX-512 has a register version of the broadcast.
+ bool hasRegVer = Subtarget->hasAVX512() && VT.is512BitVector() &&
+ Ld.getValueType().getSizeInBits() >= 32;
+ if (!ConstSplatVal && ((!Sc.hasOneUse() || !Ld.hasOneUse()) &&
+ !hasRegVer))
+ return SDValue();
+ break;
+ }
+ }
+
+ unsigned ScalarSize = Ld.getValueType().getSizeInBits();
+ bool IsGE256 = (VT.getSizeInBits() >= 256);
+
+ // When optimizing for size, generate up to 5 extra bytes for a broadcast
+ // instruction to save 8 or more bytes of constant pool data.
+ // TODO: If multiple splats are generated to load the same constant,
+ // it may be detrimental to overall size. There needs to be a way to detect
+ // that condition to know if this is truly a size win.
+ bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize();
+
+ // Handle broadcasting a single constant scalar from the constant pool
+ // into a vector.
+ // On Sandybridge (no AVX2), it is still better to load a constant vector
+ // from the constant pool and not to broadcast it from a scalar.
+ // But override that restriction when optimizing for size.
+ // TODO: Check if splatting is recommended for other AVX-capable CPUs.
+ if (ConstSplatVal && (Subtarget->hasAVX2() || OptForSize)) {
+ EVT CVT = Ld.getValueType();
+ assert(!CVT.isVector() && "Must not broadcast a vector type");
+
+ // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
+ // For size optimization, also splat v2f64 and v2i64, and for size opt
+ // with AVX2, also splat i8 and i16.
+ // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
+ if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
+ (OptForSize && (ScalarSize == 64 || Subtarget->hasAVX2()))) {
+ const Constant *C = nullptr;
+ if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
+ C = CI->getConstantIntValue();
+ else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
+ C = CF->getConstantFPValue();
+
+ assert(C && "Invalid constant type");
+
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SDValue CP =
+ DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
+ unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
+ Ld = DAG.getLoad(
+ CVT, dl, DAG.getEntryNode(), CP,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false,
+ false, false, Alignment);
+
+ return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
+ }
+ }
+
+ bool IsLoad = ISD::isNormalLoad(Ld.getNode());
+
+ // Handle AVX2 in-register broadcasts.
+ if (!IsLoad && Subtarget->hasInt256() &&
+ (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
+ return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
+
+ // The scalar source must be a normal load.
+ if (!IsLoad)
+ return SDValue();
+
+ if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
+ (Subtarget->hasVLX() && ScalarSize == 64))
+ return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
+
+  // The integer check is needed for the 64-bit into 128-bit case, so that it
+  // doesn't match f64, since there is no vbroadcastsd with an xmm destination.
+ if (Subtarget->hasInt256() && Ld.getValueType().isInteger()) {
+ if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
+ return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
+ }
+
+ // Unsupported broadcast.
+ return SDValue();
+}
+
+/// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real
+/// underlying vector and index.
+///
+/// Modifies \p ExtractedFromVec to the real vector and returns the real
+/// index.
+static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
+ SDValue ExtIdx) {
+ int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
+ if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
+ return Idx;
+
+ // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
+ // lowered this:
+ // (extract_vector_elt (v8f32 %vreg1), Constant<6>)
+ // to:
+ // (extract_vector_elt (vector_shuffle<2,u,u,u>
+ // (extract_subvector (v8f32 %vreg0), Constant<4>),
+ // undef)
+ // Constant<0>)
+ // In this case the vector is the extract_subvector expression and the index
+ // is 2, as specified by the shuffle.
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
+ SDValue ShuffleVec = SVOp->getOperand(0);
+ MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
+ assert(ShuffleVecVT.getVectorElementType() ==
+ ExtractedFromVec.getSimpleValueType().getVectorElementType());
+
+ int ShuffleIdx = SVOp->getMaskElt(Idx);
+ if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
+ ExtractedFromVec = ShuffleVec;
+ return ShuffleIdx;
+ }
+ return Idx;
+}
+
+static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
+ MVT VT = Op.getSimpleValueType();
+
+ // Skip if insert_vec_elt is not supported.
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
+ return SDValue();
+
+ SDLoc DL(Op);
+ unsigned NumElems = Op.getNumOperands();
+
+ SDValue VecIn1;
+ SDValue VecIn2;
+ SmallVector<unsigned, 4> InsertIndices;
+ SmallVector<int, 8> Mask(NumElems, -1);
+
+ for (unsigned i = 0; i != NumElems; ++i) {
+ unsigned Opc = Op.getOperand(i).getOpcode();
+
+ if (Opc == ISD::UNDEF)
+ continue;
+
+ if (Opc != ISD::EXTRACT_VECTOR_ELT) {
+      // Quit if more than 1 element needs inserting.
+ if (InsertIndices.size() > 1)
+ return SDValue();
+
+ InsertIndices.push_back(i);
+ continue;
+ }
+
+ SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
+ SDValue ExtIdx = Op.getOperand(i).getOperand(1);
+ // Quit if non-constant index.
+ if (!isa<ConstantSDNode>(ExtIdx))
+ return SDValue();
+ int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
+
+ // Quit if extracted from vector of different type.
+ if (ExtractedFromVec.getValueType() != VT)
+ return SDValue();
+
+ if (!VecIn1.getNode())
+ VecIn1 = ExtractedFromVec;
+ else if (VecIn1 != ExtractedFromVec) {
+ if (!VecIn2.getNode())
+ VecIn2 = ExtractedFromVec;
+ else if (VecIn2 != ExtractedFromVec)
+        // Quit if there are more than 2 vectors to shuffle.
+ return SDValue();
+ }
+
+ if (ExtractedFromVec == VecIn1)
+ Mask[i] = Idx;
+ else if (ExtractedFromVec == VecIn2)
+ Mask[i] = Idx + NumElems;
+ }
+
+ if (!VecIn1.getNode())
+ return SDValue();
+
+ VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
+ SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, &Mask[0]);
+ for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) {
+ unsigned Idx = InsertIndices[i];
+ NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
+ DAG.getIntPtrConstant(Idx, DL));
+ }
+
+ return NV;
+}
+
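+/// Pack a constant vXi1 vector into an integer, with bit i holding element i;
+/// e.g. <i1 1, i1 0, i1 1, i1 1> becomes the i8 constant 0b1101 (13).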
+static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) {
+ assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
+ Op.getScalarValueSizeInBits() == 1 &&
+ "Can not convert non-constant vector");
+ uint64_t Immediate = 0;
+ for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
+ SDValue In = Op.getOperand(idx);
+ if (In.getOpcode() != ISD::UNDEF)
+ Immediate |= cast<ConstantSDNode>(In)->getZExtValue() << idx;
+ }
+ SDLoc dl(Op);
+ MVT VT =
+ MVT::getIntegerVT(std::max((int)Op.getValueType().getSizeInBits(), 8));
+ return DAG.getConstant(Immediate, dl, VT);
+}
+// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
+SDValue
+X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
+
+ MVT VT = Op.getSimpleValueType();
+ assert((VT.getVectorElementType() == MVT::i1) &&
+ "Unexpected type in LowerBUILD_VECTORvXi1!");
+
+ SDLoc dl(Op);
+ if (ISD::isBuildVectorAllZeros(Op.getNode())) {
+ SDValue Cst = DAG.getTargetConstant(0, dl, MVT::i1);
+ SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
+ return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
+ }
+
+ if (ISD::isBuildVectorAllOnes(Op.getNode())) {
+ SDValue Cst = DAG.getTargetConstant(1, dl, MVT::i1);
+ SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
+ return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
+ }
+
+ if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
+ SDValue Imm = ConvertI1VectorToInteger(Op, DAG);
+ if (Imm.getValueSizeInBits() == VT.getSizeInBits())
+ return DAG.getBitcast(VT, Imm);
+ SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
+ DAG.getIntPtrConstant(0, dl));
+ }
+
+ // Vector has one or more non-const elements
+ uint64_t Immediate = 0;
+ SmallVector<unsigned, 16> NonConstIdx;
+ bool IsSplat = true;
+ bool HasConstElts = false;
+ int SplatIdx = -1;
+ for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
+ SDValue In = Op.getOperand(idx);
+ if (In.getOpcode() == ISD::UNDEF)
+ continue;
+ if (!isa<ConstantSDNode>(In))
+ NonConstIdx.push_back(idx);
+ else {
+ Immediate |= cast<ConstantSDNode>(In)->getZExtValue() << idx;
+ HasConstElts = true;
+ }
+ if (SplatIdx == -1)
+ SplatIdx = idx;
+ else if (In != Op.getOperand(SplatIdx))
+ IsSplat = false;
+ }
+
+  // For a splat, use (select i1 splat_elt, all-ones, all-zeroes).
+ if (IsSplat)
+ return DAG.getNode(ISD::SELECT, dl, VT, Op.getOperand(SplatIdx),
+ DAG.getConstant(1, dl, VT),
+ DAG.getConstant(0, dl, VT));
+
+  // Insert the non-constant elements one by one.
+ SDValue DstVec;
+ SDValue Imm;
+ if (Immediate) {
+ MVT ImmVT = MVT::getIntegerVT(std::max((int)VT.getSizeInBits(), 8));
+ Imm = DAG.getConstant(Immediate, dl, ImmVT);
+  } else if (HasConstElts)
+    Imm = DAG.getConstant(0, dl, VT);
+  else
+    Imm = DAG.getUNDEF(VT);
+ if (Imm.getValueSizeInBits() == VT.getSizeInBits())
+ DstVec = DAG.getBitcast(VT, Imm);
+ else {
+ SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
+ DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
+ DAG.getIntPtrConstant(0, dl));
+ }
+
+ for (unsigned i = 0; i < NonConstIdx.size(); ++i) {
+ unsigned InsertIdx = NonConstIdx[i];
+ DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
+ Op.getOperand(InsertIdx),
+ DAG.getIntPtrConstant(InsertIdx, dl));
+ }
+ return DstVec;
+}
+
+/// \brief Return true if \p N implements a horizontal binop and return the
+/// operands for the horizontal binop into V0 and V1.
+///
+/// This is a helper function of LowerToHorizontalOp().
+/// This function checks that the build_vector \p N in input implements a
+/// horizontal operation. Parameter \p Opcode defines the kind of horizontal
+/// operation to match.
+/// For example, if \p Opcode is equal to ISD::ADD, then this function
+/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
+/// is equal to ISD::SUB, then this function checks if this is a horizontal
+/// arithmetic sub.
+///
+/// This function only analyzes elements of \p N whose indices are
+/// in range [BaseIdx, LastIdx).
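+///
+/// For example, the v4f32 build_vector
+///   (fadd A[0], A[1]), (fadd A[2], A[3]), (fadd B[0], B[1]), (fadd B[2], B[3])
+/// matches a horizontal FADD with V0 == A and V1 == B.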
+static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
+ SelectionDAG &DAG,
+ unsigned BaseIdx, unsigned LastIdx,
+ SDValue &V0, SDValue &V1) {
+ EVT VT = N->getValueType(0);
+
+ assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
+ assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
+ "Invalid Vector in input!");
+
+ bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
+ bool CanFold = true;
+ unsigned ExpectedVExtractIdx = BaseIdx;
+ unsigned NumElts = LastIdx - BaseIdx;
+ V0 = DAG.getUNDEF(VT);
+ V1 = DAG.getUNDEF(VT);
+
+ // Check if N implements a horizontal binop.
+ for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
+ SDValue Op = N->getOperand(i + BaseIdx);
+
+ // Skip UNDEFs.
+ if (Op->getOpcode() == ISD::UNDEF) {
+ // Update the expected vector extract index.
+ if (i * 2 == NumElts)
+ ExpectedVExtractIdx = BaseIdx;
+ ExpectedVExtractIdx += 2;
+ continue;
+ }
+
+ CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
+
+ if (!CanFold)
+ break;
+
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+
+ // Try to match the following pattern:
+ // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
+ CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ Op0.getOperand(0) == Op1.getOperand(0) &&
+ isa<ConstantSDNode>(Op0.getOperand(1)) &&
+ isa<ConstantSDNode>(Op1.getOperand(1)));
+ if (!CanFold)
+ break;
+
+ unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
+ unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
+
+ if (i * 2 < NumElts) {
+ if (V0.getOpcode() == ISD::UNDEF) {
+ V0 = Op0.getOperand(0);
+ if (V0.getValueType() != VT)
+ return false;
+ }
+ } else {
+ if (V1.getOpcode() == ISD::UNDEF) {
+ V1 = Op0.getOperand(0);
+ if (V1.getValueType() != VT)
+ return false;
+ }
+ if (i * 2 == NumElts)
+ ExpectedVExtractIdx = BaseIdx;
+ }
+
+ SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
+ if (I0 == ExpectedVExtractIdx)
+ CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
+ else if (IsCommutable && I1 == ExpectedVExtractIdx) {
+ // Try to match the following dag sequence:
+ // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
+ CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
+ } else
+ CanFold = false;
+
+ ExpectedVExtractIdx += 2;
+ }
+
+ return CanFold;
+}
+
+/// \brief Emit a sequence of two 128-bit horizontal add/sub followed by
+/// a concat_vector.
+///
+/// This is a helper function of LowerToHorizontalOp().
+/// This function expects two 256-bit vectors called V0 and V1.
+/// At first, each vector is split into two separate 128-bit vectors.
+/// Then, the resulting 128-bit vectors are used to implement two
+/// horizontal binary operations.
+///
+/// The kind of horizontal binary operation is defined by \p X86Opcode.
+///
+/// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
+/// the two new horizontal binops.
+/// When Mode is set, the first horizontal binop dag node takes as input the
+/// lower 128 bits of V0 and the upper 128 bits of V0. The second horizontal
+/// binop dag node takes as input the lower 128 bits of V1 and the upper
+/// 128 bits of V1.
+/// Example:
+/// HADD V0_LO, V0_HI
+/// HADD V1_LO, V1_HI
+///
+/// Otherwise, the first horizontal binop dag node takes as input the lower
+/// 128 bits of V0 and the lower 128 bits of V1, and the second horizontal
+/// binop dag node takes the upper 128 bits of V0 and the upper 128 bits of V1.
+/// Example:
+/// HADD V0_LO, V1_LO
+/// HADD V0_HI, V1_HI
+///
+/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
+/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
+/// the upper 128-bits of the result.
+static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
+ SDLoc DL, SelectionDAG &DAG,
+ unsigned X86Opcode, bool Mode,
+ bool isUndefLO, bool isUndefHI) {
+ EVT VT = V0.getValueType();
+ assert(VT.is256BitVector() && VT == V1.getValueType() &&
+ "Invalid nodes in input!");
+
+ unsigned NumElts = VT.getVectorNumElements();
+ SDValue V0_LO = Extract128BitVector(V0, 0, DAG, DL);
+ SDValue V0_HI = Extract128BitVector(V0, NumElts/2, DAG, DL);
+ SDValue V1_LO = Extract128BitVector(V1, 0, DAG, DL);
+ SDValue V1_HI = Extract128BitVector(V1, NumElts/2, DAG, DL);
+ EVT NewVT = V0_LO.getValueType();
+
+ SDValue LO = DAG.getUNDEF(NewVT);
+ SDValue HI = DAG.getUNDEF(NewVT);
+
+ if (Mode) {
+ // Don't emit a horizontal binop if the result is expected to be UNDEF.
+ if (!isUndefLO && V0->getOpcode() != ISD::UNDEF)
+ LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
+ if (!isUndefHI && V1->getOpcode() != ISD::UNDEF)
+ HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
+ } else {
+ // Don't emit a horizontal binop if the result is expected to be UNDEF.
+ if (!isUndefLO && (V0_LO->getOpcode() != ISD::UNDEF ||
+ V1_LO->getOpcode() != ISD::UNDEF))
+ LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
+
+ if (!isUndefHI && (V0_HI->getOpcode() != ISD::UNDEF ||
+ V1_HI->getOpcode() != ISD::UNDEF))
+ HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
+ }
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
+}
+
+/// Try to fold a build_vector that performs an 'addsub' to an X86ISD::ADDSUB
+/// node.
+static SDValue LowerToAddSub(const BuildVectorSDNode *BV,
+ const X86Subtarget *Subtarget, SelectionDAG &DAG) {
+ MVT VT = BV->getSimpleValueType(0);
+ if ((!Subtarget->hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
+ (!Subtarget->hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)))
+ return SDValue();
+
+ SDLoc DL(BV);
+ unsigned NumElts = VT.getVectorNumElements();
+ SDValue InVec0 = DAG.getUNDEF(VT);
+ SDValue InVec1 = DAG.getUNDEF(VT);
+
+ assert((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
+ VT == MVT::v2f64) && "build_vector with an invalid type found!");
+
+ // Odd-numbered elements in the input build vector are obtained from
+ // adding two integer/float elements.
+ // Even-numbered elements in the input build vector are obtained from
+ // subtracting two integer/float elements.
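+  // For example, the v4f32 build_vector
+  //   (fsub A[0], B[0]), (fadd A[1], B[1]), (fsub A[2], B[2]), (fadd A[3], B[3])
+  // is matched to (X86ISD::ADDSUB A, B), i.e. a single addsubps.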
+ unsigned ExpectedOpcode = ISD::FSUB;
+ unsigned NextExpectedOpcode = ISD::FADD;
+ bool AddFound = false;
+ bool SubFound = false;
+
+ for (unsigned i = 0, e = NumElts; i != e; ++i) {
+ SDValue Op = BV->getOperand(i);
+
+ // Skip 'undef' values.
+ unsigned Opcode = Op.getOpcode();
+ if (Opcode == ISD::UNDEF) {
+ std::swap(ExpectedOpcode, NextExpectedOpcode);
+ continue;
+ }
+
+ // Early exit if we found an unexpected opcode.
+ if (Opcode != ExpectedOpcode)
+ return SDValue();
+
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+
+ // Try to match the following pattern:
+ // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
+ // Early exit if we cannot match that sequence.
+ if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ !isa<ConstantSDNode>(Op0.getOperand(1)) ||
+ !isa<ConstantSDNode>(Op1.getOperand(1)) ||
+ Op0.getOperand(1) != Op1.getOperand(1))
+ return SDValue();
+
+ unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
+ if (I0 != i)
+ return SDValue();
+
+ // We found a valid add/sub node. Update the information accordingly.
+ if (i & 1)
+ AddFound = true;
+ else
+ SubFound = true;
+
+ // Update InVec0 and InVec1.
+ if (InVec0.getOpcode() == ISD::UNDEF) {
+ InVec0 = Op0.getOperand(0);
+ if (InVec0.getSimpleValueType() != VT)
+ return SDValue();
+ }
+ if (InVec1.getOpcode() == ISD::UNDEF) {
+ InVec1 = Op1.getOperand(0);
+ if (InVec1.getSimpleValueType() != VT)
+ return SDValue();
+ }
+
+    // Make sure that the operands in input to each add/sub node always
+    // come from the same pair of vectors.
+ if (InVec0 != Op0.getOperand(0)) {
+ if (ExpectedOpcode == ISD::FSUB)
+ return SDValue();
+
+ // FADD is commutable. Try to commute the operands
+ // and then test again.
+ std::swap(Op0, Op1);
+ if (InVec0 != Op0.getOperand(0))
+ return SDValue();
+ }
+
+ if (InVec1 != Op1.getOperand(0))
+ return SDValue();
+
+ // Update the pair of expected opcodes.
+ std::swap(ExpectedOpcode, NextExpectedOpcode);
+ }
+
+ // Don't try to fold this build_vector into an ADDSUB if the inputs are undef.
+ if (AddFound && SubFound && InVec0.getOpcode() != ISD::UNDEF &&
+ InVec1.getOpcode() != ISD::UNDEF)
+ return DAG.getNode(X86ISD::ADDSUB, DL, VT, InVec0, InVec1);
+
+ return SDValue();
+}
+
+/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
+static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ MVT VT = BV->getSimpleValueType(0);
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned NumUndefsLO = 0;
+ unsigned NumUndefsHI = 0;
+ unsigned Half = NumElts/2;
+
+ // Count the number of UNDEF operands in the build_vector in input.
+ for (unsigned i = 0, e = Half; i != e; ++i)
+ if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
+ NumUndefsLO++;
+
+ for (unsigned i = Half, e = NumElts; i != e; ++i)
+ if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
+ NumUndefsHI++;
+
+  // Early exit if this is a build_vector of all UNDEFs, or if all the
+  // operands but one are UNDEF.
+ if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
+ return SDValue();
+
+ SDLoc DL(BV);
+ SDValue InVec0, InVec1;
+ if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget->hasSSE3()) {
+ // Try to match an SSE3 float HADD/HSUB.
+ if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
+ return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
+
+ if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
+ return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
+ } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget->hasSSSE3()) {
+ // Try to match an SSSE3 integer HADD/HSUB.
+ if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
+ return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);
+
+ if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
+ return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
+ }
+
+ if (!Subtarget->hasAVX())
+ return SDValue();
+
+ if ((VT == MVT::v8f32 || VT == MVT::v4f64)) {
+ // Try to match an AVX horizontal add/sub of packed single/double
+ // precision floating point values from 256-bit vectors.
+ SDValue InVec2, InVec3;
+ if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
+ isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
+ ((InVec0.getOpcode() == ISD::UNDEF ||
+ InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
+ ((InVec1.getOpcode() == ISD::UNDEF ||
+ InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
+ return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
+
+ if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
+ isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
+ ((InVec0.getOpcode() == ISD::UNDEF ||
+ InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
+ ((InVec1.getOpcode() == ISD::UNDEF ||
+ InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
+ return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
+ } else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
+ // Try to match an AVX2 horizontal add/sub of signed integers.
+ SDValue InVec2, InVec3;
+ unsigned X86Opcode;
+ bool CanFold = true;
+
+ if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
+ isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
+ ((InVec0.getOpcode() == ISD::UNDEF ||
+ InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
+ ((InVec1.getOpcode() == ISD::UNDEF ||
+ InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
+ X86Opcode = X86ISD::HADD;
+ else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
+ isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) &&
+ ((InVec0.getOpcode() == ISD::UNDEF ||
+ InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
+ ((InVec1.getOpcode() == ISD::UNDEF ||
+ InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
+ X86Opcode = X86ISD::HSUB;
+ else
+ CanFold = false;
+
+ if (CanFold) {
+ // Fold this build_vector into a single horizontal add/sub.
+ // Do this only if the target has AVX2.
+ if (Subtarget->hasAVX2())
+ return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);
+
+ // Do not try to expand this build_vector into a pair of horizontal
+ // add/sub if we can emit a pair of scalar add/sub.
+ if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
+ return SDValue();
+
+ // Convert this build_vector into a pair of horizontal binop followed by
+ // a concat vector.
+ bool isUndefLO = NumUndefsLO == Half;
+ bool isUndefHI = NumUndefsHI == Half;
+ return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
+ isUndefLO, isUndefHI);
+ }
+ }
+
+ if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
+ VT == MVT::v16i16) && Subtarget->hasAVX()) {
+ unsigned X86Opcode;
+ if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
+ X86Opcode = X86ISD::HADD;
+ else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
+ X86Opcode = X86ISD::HSUB;
+ else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
+ X86Opcode = X86ISD::FHADD;
+ else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
+ X86Opcode = X86ISD::FHSUB;
+ else
+ return SDValue();
+
+ // Don't try to expand this build_vector into a pair of horizontal add/sub
+ // if we can simply emit a pair of scalar add/sub.
+ if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
+ return SDValue();
+
+ // Convert this build_vector into two horizontal add/sub followed by
+ // a concat vector.
+ bool isUndefLO = NumUndefsLO == Half;
+ bool isUndefHI = NumUndefsHI == Half;
+ return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
+ isUndefLO, isUndefHI);
+ }
+
+ return SDValue();
+}
+
+SDValue
+X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc dl(Op);
+
+ MVT VT = Op.getSimpleValueType();
+ MVT ExtVT = VT.getVectorElementType();
+ unsigned NumElems = Op.getNumOperands();
+
+ // Generate vectors for predicate vectors.
+ if (VT.getVectorElementType() == MVT::i1 && Subtarget->hasAVX512())
+ return LowerBUILD_VECTORvXi1(Op, DAG);
+
+  // Vectors containing all zeros can be matched later by pxor and xorps.
+ if (ISD::isBuildVectorAllZeros(Op.getNode())) {
+ // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
+ // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
+ if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
+ return Op;
+
+ return getZeroVector(VT, Subtarget, DAG, dl);
+ }
+
+ // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
+ // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
+ // vpcmpeqd on 256-bit vectors.
+ if (Subtarget->hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
+ if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasInt256()))
+ return Op;
+
+ if (!VT.is512BitVector())
+ return getOnesVector(VT, Subtarget, DAG, dl);
+ }
+
+ BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
+ if (SDValue AddSub = LowerToAddSub(BV, Subtarget, DAG))
+ return AddSub;
+ if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
+ return HorizontalOp;
+ if (SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG))
+ return Broadcast;
+
+ unsigned EVTBits = ExtVT.getSizeInBits();
+
+ unsigned NumZero = 0;
+ unsigned NumNonZero = 0;
+ uint64_t NonZeros = 0;
+ bool IsAllConstants = true;
+ SmallSet<SDValue, 8> Values;
+ for (unsigned i = 0; i < NumElems; ++i) {
+ SDValue Elt = Op.getOperand(i);
+ if (Elt.getOpcode() == ISD::UNDEF)
+ continue;
+ Values.insert(Elt);
+ if (Elt.getOpcode() != ISD::Constant &&
+ Elt.getOpcode() != ISD::ConstantFP)
+ IsAllConstants = false;
+ if (X86::isZeroNode(Elt))
+ NumZero++;
+ else {
+ assert(i < sizeof(NonZeros) * 8); // Make sure the shift is within range.
+ NonZeros |= ((uint64_t)1 << i);
+ NumNonZero++;
+ }
+ }
+
+  // An all-undef vector: return an UNDEF. All-zero vectors were handled above.
+ if (NumNonZero == 0)
+ return DAG.getUNDEF(VT);
+
+ // Special case for single non-zero, non-undef, element.
+ if (NumNonZero == 1) {
+ unsigned Idx = countTrailingZeros(NonZeros);
+ SDValue Item = Op.getOperand(Idx);
+
+ // If this is an insertion of an i64 value on x86-32, and if the top bits of
+ // the value are obviously zero, truncate the value to i32 and do the
+ // insertion that way. Only do this if the value is non-constant or if the
+ // value is a constant being inserted into element 0. It is cheaper to do
+ // a constant pool load than it is to do a movd + shuffle.
+ if (ExtVT == MVT::i64 && !Subtarget->is64Bit() &&
+ (!IsAllConstants || Idx == 0)) {
+ if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
+ // Handle SSE only.
+ assert(VT == MVT::v2i64 && "Expected an SSE value type!");
+ MVT VecVT = MVT::v4i32;
+
+ // Truncate the value (which may itself be a constant) to i32, and
+ // convert it to a vector with movd (S2V+shuffle to zero extend).
+ Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
+ Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
+ return DAG.getBitcast(VT, getShuffleVectorZeroOrUndef(
+ Item, Idx * 2, true, Subtarget, DAG));
+ }
+ }
+
+ // If we have a constant or non-constant insertion into the low element of
+ // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
+ // the rest of the elements. This will be matched as movd/movq/movss/movsd
+ // depending on what the source datatype is.
+ if (Idx == 0) {
+ if (NumZero == 0)
+ return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
+
+ if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
+ (ExtVT == MVT::i64 && Subtarget->is64Bit())) {
+ if (VT.is512BitVector()) {
+ SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl);
+ return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec,
+ Item, DAG.getIntPtrConstant(0, dl));
+ }
+ assert((VT.is128BitVector() || VT.is256BitVector()) &&
+ "Expected an SSE value type!");
+ Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
+ // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
+ return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
+ }
+
+ // We can't directly insert an i8 or i16 into a vector, so zero extend
+ // it to i32 first.
+ if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
+ Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
+ if (VT.is256BitVector()) {
+ if (Subtarget->hasAVX()) {
+ Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v8i32, Item);
+ Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
+ } else {
+ // Without AVX, we need to extend to a 128-bit vector and then
+ // insert into the 256-bit vector.
+ Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
+ SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl);
+ Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl);
+ }
+ } else {
+ assert(VT.is128BitVector() && "Expected an SSE value type!");
+ Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
+ Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
+ }
+ return DAG.getBitcast(VT, Item);
+ }
+ }
+
+ // Is it a vector logical left shift?
+ if (NumElems == 2 && Idx == 1 &&
+ X86::isZeroNode(Op.getOperand(0)) &&
+ !X86::isZeroNode(Op.getOperand(1))) {
+ unsigned NumBits = VT.getSizeInBits();
+ return getVShift(true, VT,
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
+ VT, Op.getOperand(1)),
+ NumBits/2, DAG, *this, dl);
+ }
+
+ if (IsAllConstants) // Otherwise, it's better to do a constpool load.
+ return SDValue();
+
+ // Otherwise, if this is a vector with i32 or f32 elements, and the element
+ // is a non-constant being inserted into an element other than the low one,
+ // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
+ // movd/movss) to move this into the low element, then shuffle it into
+ // place.
+ if (EVTBits == 32) {
+ Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
+ return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
+ }
+ }
+
+ // Splat is obviously ok. Let legalizer expand it to a shuffle.
+ if (Values.size() == 1) {
+ if (EVTBits == 32) {
+      // Instead of a shuffle like this:
+      // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
+      // check whether it's possible to issue this instead:
+      // shuffle (vload ptr), undef, <1, 1, 1, 1>
+ unsigned Idx = countTrailingZeros(NonZeros);
+ SDValue Item = Op.getOperand(Idx);
+ if (Op.getNode()->isOnlyUserOf(Item.getNode()))
+ return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
+ }
+ return SDValue();
+ }
+
+ // A vector full of immediates; various special cases are already
+ // handled, so this is best done with a single constant-pool load.
+ if (IsAllConstants)
+ return SDValue();
+
+  // For AVX-length vectors, see if we can use a vector load to get all of the
+  // elements; otherwise build the individual 128-bit pieces and use
+  // shuffles to put them in place.
+ if (VT.is256BitVector() || VT.is512BitVector()) {
+ SmallVector<SDValue, 64> V(Op->op_begin(), Op->op_begin() + NumElems);
+
+ // Check for a build vector of consecutive loads.
+ if (SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false))
+ return LD;
+
+ EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
+
+ // Build both the lower and upper subvector.
+ SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
+ makeArrayRef(&V[0], NumElems/2));
+ SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
+ makeArrayRef(&V[NumElems / 2], NumElems/2));
+
+ // Recreate the wider vector with the lower and upper part.
+ if (VT.is256BitVector())
+ return Concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
+ return Concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
+ }
+
+  // Let the legalizer expand 2-wide build_vectors.
+ if (EVTBits == 64) {
+ if (NumNonZero == 1) {
+ // One half is zero or undef.
+ unsigned Idx = countTrailingZeros(NonZeros);
+ SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
+ Op.getOperand(Idx));
+ return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
+ }
+ return SDValue();
+ }
+
+ // If element VT is < 32 bits, convert it to inserts into a zero vector.
+ if (EVTBits == 8 && NumElems == 16)
+ if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero,
+ DAG, Subtarget, *this))
+ return V;
+
+ if (EVTBits == 16 && NumElems == 8)
+ if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero,
+ DAG, Subtarget, *this))
+ return V;
+
+  // If the element VT is 32 bits and there are 4 elements, try to generate
+  // an INSERTPS.
+ if (EVTBits == 32 && NumElems == 4)
+ if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget, *this))
+ return V;
+
+  // If the element VT is 32 bits, turn it into a number of shuffles.
+ SmallVector<SDValue, 8> V(NumElems);
+ if (NumElems == 4 && NumZero > 0) {
+ for (unsigned i = 0; i < 4; ++i) {
+ bool isZero = !(NonZeros & (1ULL << i));
+ if (isZero)
+ V[i] = getZeroVector(VT, Subtarget, DAG, dl);
+ else
+ V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
+ }
+
+ for (unsigned i = 0; i < 2; ++i) {
+ switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
+ default: break;
+ case 0:
+ V[i] = V[i*2]; // Must be a zero vector.
+ break;
+ case 1:
+ V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]);
+ break;
+ case 2:
+ V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]);
+ break;
+ case 3:
+ V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]);
+ break;
+ }
+ }
+
+ bool Reverse1 = (NonZeros & 0x3) == 2;
+ bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
+ int MaskVec[] = {
+ Reverse1 ? 1 : 0,
+ Reverse1 ? 0 : 1,
+ static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
+ static_cast<int>(Reverse2 ? NumElems : NumElems+1)
+ };
+ return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]);
+ }
+
+ if (Values.size() > 1 && VT.is128BitVector()) {
+ // Check for a build vector of consecutive loads.
+ for (unsigned i = 0; i < NumElems; ++i)
+ V[i] = Op.getOperand(i);
+
+ // Check for elements which are consecutive loads.
+ if (SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false))
+ return LD;
+
+    // Check for a build vector that is mostly a shuffle plus a few inserted
+    // elements.
+ if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
+ return Sh;
+
+ // For SSE 4.1, use insertps to put the high elements into the low element.
+ if (Subtarget->hasSSE41()) {
+ SDValue Result;
+ if (Op.getOperand(0).getOpcode() != ISD::UNDEF)
+ Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
+ else
+ Result = DAG.getUNDEF(VT);
+
+ for (unsigned i = 1; i < NumElems; ++i) {
+ if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue;
+ Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
+ Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
+ }
+ return Result;
+ }
+
+    // Otherwise, expand into a number of unpckl*. Start by extending each of
+    // our (non-undef) elements to the full vector width, with the element in
+    // the bottom slot of the vector (which generates no code for SSE).
+ for (unsigned i = 0; i < NumElems; ++i) {
+ if (Op.getOperand(i).getOpcode() != ISD::UNDEF)
+ V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
+ else
+ V[i] = DAG.getUNDEF(VT);
+ }
+
+ // Next, we iteratively mix elements, e.g. for v4f32:
+ // Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
+ // : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
+ // Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
+ unsigned EltStride = NumElems >> 1;
+ while (EltStride != 0) {
+ for (unsigned i = 0; i < EltStride; ++i) {
+ // If V[i+EltStride] is undef and this is the first round of mixing,
+ // then it is safe to just drop this shuffle: V[i] is already in the
+ // right place, the one element (since it's the first round) being
+ // inserted as undef can be dropped. This isn't safe for successive
+ // rounds because they will permute elements within both vectors.
+ if (V[i+EltStride].getOpcode() == ISD::UNDEF &&
+ EltStride == NumElems/2)
+ continue;
+
+ V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]);
+ }
+ EltStride >>= 1;
+ }
+ return V[0];
+ }
+ return SDValue();
+}
+
+// 256-bit AVX can use the vinsertf128 instruction
+// to create 256-bit vectors from two other 128-bit ones.
+static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
+ SDLoc dl(Op);
+ MVT ResVT = Op.getSimpleValueType();
+
+ assert((ResVT.is256BitVector() ||
+ ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
+
+ SDValue V1 = Op.getOperand(0);
+ SDValue V2 = Op.getOperand(1);
+ unsigned NumElems = ResVT.getVectorNumElements();
+ if (ResVT.is256BitVector())
+ return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
+
+ if (Op.getNumOperands() == 4) {
+ MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
+ ResVT.getVectorNumElements()/2);
+ SDValue V3 = Op.getOperand(2);
+ SDValue V4 = Op.getOperand(3);
+    return Concat256BitVectors(
+        Concat128BitVectors(V1, V2, HalfVT, NumElems / 2, DAG, dl),
+        Concat128BitVectors(V3, V4, HalfVT, NumElems / 2, DAG, dl), ResVT,
+        NumElems, DAG, dl);
+ }
+ return Concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
+}
+
+static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
+ const X86Subtarget *Subtarget,
+                                       SelectionDAG &DAG) {
+ SDLoc dl(Op);
+ MVT ResVT = Op.getSimpleValueType();
+ unsigned NumOfOperands = Op.getNumOperands();
+
+ assert(isPowerOf2_32(NumOfOperands) &&
+ "Unexpected number of operands in CONCAT_VECTORS");
+
+ SDValue Undef = DAG.getUNDEF(ResVT);
+ if (NumOfOperands > 2) {
+ // Specialize the cases when all, or all but one, of the operands are undef.
+ unsigned NumOfDefinedOps = 0;
+ unsigned OpIdx = 0;
+ for (unsigned i = 0; i < NumOfOperands; i++)
+ if (!Op.getOperand(i).isUndef()) {
+ NumOfDefinedOps++;
+ OpIdx = i;
+ }
+ if (NumOfDefinedOps == 0)
+ return Undef;
+ if (NumOfDefinedOps == 1) {
+ unsigned SubVecNumElts =
+ Op.getOperand(OpIdx).getValueType().getVectorNumElements();
+ SDValue IdxVal = DAG.getIntPtrConstant(SubVecNumElts * OpIdx, dl);
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef,
+ Op.getOperand(OpIdx), IdxVal);
+ }
+
+ MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
+ ResVT.getVectorNumElements()/2);
+ SmallVector<SDValue, 2> Ops;
+ for (unsigned i = 0; i < NumOfOperands/2; i++)
+ Ops.push_back(Op.getOperand(i));
+ SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
+ Ops.clear();
+ for (unsigned i = NumOfOperands/2; i < NumOfOperands; i++)
+ Ops.push_back(Op.getOperand(i));
+ SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
+ }
+
+ // 2 operands
+ SDValue V1 = Op.getOperand(0);
+ SDValue V2 = Op.getOperand(1);
+ unsigned NumElems = ResVT.getVectorNumElements();
+ assert(V1.getValueType() == V2.getValueType() &&
+ V1.getValueType().getVectorNumElements() == NumElems/2 &&
+ "Unexpected operands in CONCAT_VECTORS");
+
+ if (ResVT.getSizeInBits() >= 16)
+ return Op; // The operation is legal with KUNPCK
+
+ bool IsZeroV1 = ISD::isBuildVectorAllZeros(V1.getNode());
+ bool IsZeroV2 = ISD::isBuildVectorAllZeros(V2.getNode());
+ SDValue ZeroVec = getZeroVector(ResVT, Subtarget, DAG, dl);
+ if (IsZeroV1 && IsZeroV2)
+ return ZeroVec;
+
+ SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
+ if (V2.isUndef())
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx);
+ if (IsZeroV2)
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V1, ZeroIdx);
+
+ SDValue IdxVal = DAG.getIntPtrConstant(NumElems/2, dl);
+ if (V1.isUndef())
+ V2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V2, IdxVal);
+
+ if (IsZeroV1)
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V2, IdxVal);
+
+ V1 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx);
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, V1, V2, IdxVal);
+}
+
+static SDValue LowerCONCAT_VECTORS(SDValue Op,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ MVT VT = Op.getSimpleValueType();
+ if (VT.getVectorElementType() == MVT::i1)
+ return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
+
+ assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
+ (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
+ Op.getNumOperands() == 4)));
+
+ // AVX can use the vinsertf128 instruction to create 256-bit vectors
+ // from two other 128-bit ones.
+
+  // A 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors.
+ return LowerAVXCONCAT_VECTORS(Op, DAG);
+}
+
+//===----------------------------------------------------------------------===//
+// Vector shuffle lowering
+//
+// This is an experimental code path for lowering vector shuffles on x86. It is
+// designed to handle arbitrary vector shuffles and blends, gracefully
+// degrading performance as necessary. It works hard to recognize idiomatic
+// shuffles and lower them to optimal instruction patterns while staying within
+// a framework that allows reasonably efficient handling of all vector shuffle
+// patterns.
+//===----------------------------------------------------------------------===//
+
+/// \brief Tiny helper function to identify a no-op mask.
+///
+/// This is a somewhat boring predicate function. It checks whether the mask
+/// array input, which is assumed to be a single-input shuffle mask of the kind
+/// used by the X86 shuffle instructions (not a fully general
+/// ShuffleVectorSDNode mask), requires any shuffles to occur. Both undef and
+/// an in-place shuffle are no-ops.
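+///
+/// For example, {-1, 1, -1, 3} is a no-op mask, while {1, 0, 2, 3} is not.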
+static bool isNoopShuffleMask(ArrayRef<int> Mask) {
+ for (int i = 0, Size = Mask.size(); i < Size; ++i)
+ if (Mask[i] != -1 && Mask[i] != i)
+ return false;
+ return true;
+}
+
+/// \brief Helper function to classify a mask as a single-input mask.
+///
+/// This isn't a generic single-input test because in the vector shuffle
+/// lowering we canonicalize single inputs to be the first input operand. This
+/// means we can more quickly test for a single input by only checking whether
+/// an input from the second operand exists. We also assume that the size of
+/// the mask corresponds to the size of the input vectors, which isn't true
+/// in the fully general case.
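+///
+/// For example, with 4-element inputs, {0, 2, 3, 1} is a single-input mask,
+/// while {0, 5, 2, 3} is not (index 5 >= 4 references the second operand).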
+static bool isSingleInputShuffleMask(ArrayRef<int> Mask) {
+ for (int M : Mask)
+ if (M >= (int)Mask.size())
+ return false;
+ return true;
+}
+
+/// \brief Test whether there are elements crossing 128-bit lanes in this
+/// shuffle mask.
+///
+/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
+/// and we routinely test for these.
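+///
+/// For example, for v8f32 (two 128-bit lanes of 4 elements each), the mask
+/// {4, 1, 2, 3, 0, 5, 6, 7} crosses lanes: result element 0 pulls from lane 1
+/// (index 4) and result element 4 pulls from lane 0 (index 0).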
+static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
+ int LaneSize = 128 / VT.getScalarSizeInBits();
+ int Size = Mask.size();
+ for (int i = 0; i < Size; ++i)
+ if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
+ return true;
+ return false;
+}
+
+/// \brief Test whether a shuffle mask is equivalent within each 128-bit lane.
+///
+/// This checks a shuffle mask to see if it is performing the same
+/// 128-bit lane-relative shuffle in each 128-bit lane. This trivially implies
+/// that it is also not lane-crossing. It may however involve a blend from the
+/// same lane of a second vector.
+///
+/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
+/// non-trivial to compute in the face of undef lanes. The representation is
+/// *not* suitable for use with existing 128-bit shuffles as it will contain
+/// entries from both V1 and V2 inputs to the wider mask.
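+///
+/// For example, the v8i32 mask {0, 9, 2, 11, 4, 13, 6, 15} repeats the same
+/// in-lane pattern in both lanes and yields RepeatedMask = {0, 9, 2, 11},
+/// where entries greater than or equal to 8 refer to the second input.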
+static bool
+is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
+ SmallVectorImpl<int> &RepeatedMask) {
+ int LaneSize = 128 / VT.getScalarSizeInBits();
+ RepeatedMask.resize(LaneSize, -1);
+ int Size = Mask.size();
+ for (int i = 0; i < Size; ++i) {
+ if (Mask[i] < 0)
+ continue;
+ if ((Mask[i] % Size) / LaneSize != i / LaneSize)
+ // This entry crosses lanes, so there is no way to model this shuffle.
+ return false;
+
+ // Ok, handle the in-lane shuffles by detecting if and when they repeat.
+ if (RepeatedMask[i % LaneSize] == -1)
+ // This is the first non-undef entry in this slot of a 128-bit lane.
+ RepeatedMask[i % LaneSize] =
+ Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + Size;
+ else if (RepeatedMask[i % LaneSize] + (i / LaneSize) * LaneSize != Mask[i])
+ // Found a mismatch with the repeated mask.
+ return false;
+ }
+ return true;
+}
+
+/// \brief Checks whether a shuffle mask is equivalent to an explicit list of
+/// arguments.
+///
+/// This is a fast way to test a shuffle mask against a fixed pattern:
+///
+///   if (isShuffleEquivalent(V1, V2, Mask, {3, 2, 1, 0})) { ... }
+///
+/// It returns true if the mask is exactly as wide as the expected mask, and
+/// each element of the mask is either -1 (signifying undef) or the value given
+/// in the expected mask.
+static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
+ ArrayRef<int> ExpectedMask) {
+ if (Mask.size() != ExpectedMask.size())
+ return false;
+
+ int Size = Mask.size();
+
+ // If the values are build vectors, we can look through them to find
+ // equivalent inputs that make the shuffles equivalent.
+ auto *BV1 = dyn_cast<BuildVectorSDNode>(V1);
+ auto *BV2 = dyn_cast<BuildVectorSDNode>(V2);
+
+ for (int i = 0; i < Size; ++i)
+ if (Mask[i] != -1 && Mask[i] != ExpectedMask[i]) {
+ auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
+ auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2;
+ if (!MaskBV || !ExpectedBV ||
+ MaskBV->getOperand(Mask[i] % Size) !=
+ ExpectedBV->getOperand(ExpectedMask[i] % Size))
+ return false;
+ }
+
+ return true;
+}
+
+/// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
+///
+/// This helper function produces an 8-bit shuffle immediate corresponding to
+/// the ubiquitous shuffle encoding scheme used in x86 instructions for
+/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
+/// example.
+///
+/// NB: We rely heavily on "undef" masks preserving the input lane.
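+///
+/// For example, the mask {3, 1, -1, 0} encodes as 3 | (1 << 2) | (2 << 4) |
+/// (0 << 6) = 0x27, where the undef slot defaults to its input lane (2).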
+static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, SDLoc DL,
+ SelectionDAG &DAG) {
+ assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
+ assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
+ assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
+ assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
+ assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
+
+ unsigned Imm = 0;
+ Imm |= (Mask[0] == -1 ? 0 : Mask[0]) << 0;
+ Imm |= (Mask[1] == -1 ? 1 : Mask[1]) << 2;
+ Imm |= (Mask[2] == -1 ? 2 : Mask[2]) << 4;
+ Imm |= (Mask[3] == -1 ? 3 : Mask[3]) << 6;
+ return DAG.getConstant(Imm, DL, MVT::i8);
+}
+
+/// \brief Compute whether each element of a shuffle is zeroable.
+///
+/// A "zeroable" vector shuffle element is one which can be lowered to zero.
+/// Either it is an undef element in the shuffle mask, the element of the input
+/// referenced is undef, or the element of the input referenced is known to be
+/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
+/// as many lanes with this technique as possible to simplify the remaining
+/// shuffle.
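+///
+/// For example, shuffling an all-zeros V2 into V1 with the mask {0, 5, 1, 7}
+/// yields Zeroable = {false, true, false, true}: indices 5 and 7 read V2.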
+static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
+ SDValue V1, SDValue V2) {
+ SmallBitVector Zeroable(Mask.size(), false);
+
+ while (V1.getOpcode() == ISD::BITCAST)
+ V1 = V1->getOperand(0);
+ while (V2.getOpcode() == ISD::BITCAST)
+ V2 = V2->getOperand(0);
+
+ bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
+ bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
+
+ for (int i = 0, Size = Mask.size(); i < Size; ++i) {
+ int M = Mask[i];
+ // Handle the easy cases.
+ if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
+ Zeroable[i] = true;
+ continue;
+ }
+
+ // If this is an index into a build_vector node (which has the same number
+ // of elements), dig out the input value and use it.
+ SDValue V = M < Size ? V1 : V2;
+ if (V.getOpcode() != ISD::BUILD_VECTOR || Size != (int)V.getNumOperands())
+ continue;
+
+ SDValue Input = V.getOperand(M % Size);
+ // The UNDEF opcode check really should be dead code here, but not quite
+ // worth asserting on (it isn't invalid, just unexpected).
+ if (Input.getOpcode() == ISD::UNDEF || X86::isZeroNode(Input))
+ Zeroable[i] = true;
+ }
+
+ return Zeroable;
+}
+
+// X86 has dedicated unpack instructions that can handle specific blend
+// operations: UNPCKH and UNPCKL.
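+// For example, for v4i32 this matches UNPCKL against the mask {0, 4, 1, 5}
+// and UNPCKH against {2, 6, 3, 7}, interleaving the low or high halves of V1
+// and V2.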
+static SDValue lowerVectorShuffleWithUNPCK(SDLoc DL, MVT VT, ArrayRef<int> Mask,
+ SDValue V1, SDValue V2,
+ SelectionDAG &DAG) {
+ int NumElts = VT.getVectorNumElements();
+ int NumEltsInLane = 128 / VT.getScalarSizeInBits();
+ SmallVector<int, 8> Unpckl;
+ SmallVector<int, 8> Unpckh;
+
+ for (int i = 0; i < NumElts; ++i) {
+ unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
+ int LoPos = (i % NumEltsInLane) / 2 + LaneStart + NumElts * (i % 2);
+ int HiPos = LoPos + NumEltsInLane / 2;
+ Unpckl.push_back(LoPos);
+ Unpckh.push_back(HiPos);
+ }
+
+ if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
+ return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
+ if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
+ return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
+
+ // Commute and try again.
+ ShuffleVectorSDNode::commuteMask(Unpckl);
+ if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
+ return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
+
+ ShuffleVectorSDNode::commuteMask(Unpckh);
+ if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
+ return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
+
+ return SDValue();
+}
+
+/// \brief Try to emit a bitmask instruction for a shuffle.
+///
+/// This handles cases where we can model a blend exactly as a bitmask due to
+/// one of the inputs being zeroable.
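+///
+/// For example, the v4i32 mask {0, 5, 2, 7} with an all-zeros V2 becomes
+/// V1 & <-1, 0, -1, 0>, with elements 1 and 3 falling to zero via the mask.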
+static SDValue lowerVectorShuffleAsBitMask(SDLoc DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
+ MVT EltVT = VT.getVectorElementType();
+ int NumEltBits = EltVT.getSizeInBits();
+ MVT IntEltVT = MVT::getIntegerVT(NumEltBits);
+ SDValue Zero = DAG.getConstant(0, DL, IntEltVT);
+ SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), DL,
+ IntEltVT);
+ if (EltVT.isFloatingPoint()) {
+ Zero = DAG.getBitcast(EltVT, Zero);
+ AllOnes = DAG.getBitcast(EltVT, AllOnes);
+ }
+ SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
+ SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+ SDValue V;
+ for (int i = 0, Size = Mask.size(); i < Size; ++i) {
+ if (Zeroable[i])
+ continue;
+ if (Mask[i] % Size != i)
+ return SDValue(); // Not a blend.
+ if (!V)
+ V = Mask[i] < Size ? V1 : V2;
+ else if (V != (Mask[i] < Size ? V1 : V2))
+ return SDValue(); // Can only let one input through the mask.
+
+ VMaskOps[i] = AllOnes;
+ }
+ if (!V)
+ return SDValue(); // No non-zeroable elements!
+
+ SDValue VMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, VMaskOps);
+ V = DAG.getNode(VT.isFloatingPoint()
+ ? (unsigned) X86ISD::FAND : (unsigned) ISD::AND,
+ DL, VT, V, VMask);
+ return V;
+}
+
+/// \brief Try to emit a blend instruction for a shuffle using bit math.
+///
+/// This is used as a fallback approach when first class blend instructions are
+/// unavailable. Currently it is only suitable for integer vectors, but could
+/// be generalized for floating point vectors if desirable.
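+///
+/// The result is computed as (V1 & M) | (~M & V2) for a constant mask M that,
+/// per element, selects whichever input the shuffle mask names in place.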
+static SDValue lowerVectorShuffleAsBitBlend(SDLoc DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
+ assert(VT.isInteger() && "Only supports integer vector types!");
+ MVT EltVT = VT.getVectorElementType();
+ int NumEltBits = EltVT.getSizeInBits();
+ SDValue Zero = DAG.getConstant(0, DL, EltVT);
+ SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), DL,
+ EltVT);
+ SmallVector<SDValue, 16> MaskOps;
+ for (int i = 0, Size = Mask.size(); i < Size; ++i) {
+ if (Mask[i] != -1 && Mask[i] != i && Mask[i] != i + Size)
+ return SDValue(); // Shuffled input!
+ MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
+ }
+
+ SDValue V1Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, MaskOps);
+ V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
+ // We have to cast V2 around.
+ MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
+ V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::ANDNP, DL, MaskVT,
+ DAG.getBitcast(MaskVT, V1Mask),
+ DAG.getBitcast(MaskVT, V2)));
+ return DAG.getNode(ISD::OR, DL, VT, V1, V2);
+}
+
+/// \brief Try to emit a blend instruction for a shuffle.
+///
+/// This doesn't do any checks for the availability of instructions for blending
+/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
+/// be matched in the backend with the type given. What it does check for is
+/// that the shuffle mask is a blend, or convertible into a blend with zero.
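+///
+/// For example, the v4f32 mask {0, 5, 2, 7} takes elements 1 and 3 from V2
+/// and lowers to a BLENDI with the immediate 0b1010.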
+static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Original,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
+ bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
+ SmallVector<int, 8> Mask(Original.begin(), Original.end());
+ SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+ bool ForceV1Zero = false, ForceV2Zero = false;
+
+ // Attempt to generate the binary blend mask. If an input is zero then
+ // we can use any lane.
+ // TODO: generalize the zero matching to any scalar like isShuffleEquivalent.
+ unsigned BlendMask = 0;
+ for (int i = 0, Size = Mask.size(); i < Size; ++i) {
+ int M = Mask[i];
+ if (M < 0)
+ continue;
+ if (M == i)
+ continue;
+ if (M == i + Size) {
+ BlendMask |= 1u << i;
+ continue;
+ }
+ if (Zeroable[i]) {
+ if (V1IsZero) {
+ ForceV1Zero = true;
+ Mask[i] = i;
+ continue;
+ }
+ if (V2IsZero) {
+ ForceV2Zero = true;
+ BlendMask |= 1u << i;
+ Mask[i] = i + Size;
+ continue;
+ }
+ }
+ return SDValue(); // Shuffled input!
+ }
+
+ // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
+ if (ForceV1Zero)
+ V1 = getZeroVector(VT, Subtarget, DAG, DL);
+ if (ForceV2Zero)
+ V2 = getZeroVector(VT, Subtarget, DAG, DL);
+
+ auto ScaleBlendMask = [](unsigned BlendMask, int Size, int Scale) {
+ unsigned ScaledMask = 0;
+ for (int i = 0; i != Size; ++i)
+ if (BlendMask & (1u << i))
+ for (int j = 0; j != Scale; ++j)
+ ScaledMask |= 1u << (i * Scale + j);
+ return ScaledMask;
+ };
+
+ switch (VT.SimpleTy) {
+ case MVT::v2f64:
+ case MVT::v4f32:
+ case MVT::v4f64:
+ case MVT::v8f32:
+ return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
+ DAG.getConstant(BlendMask, DL, MVT::i8));
+
+ case MVT::v4i64:
+ case MVT::v8i32:
+ assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
+ // FALLTHROUGH
+ case MVT::v2i64:
+ case MVT::v4i32:
+ // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
+ // that instruction.
+ if (Subtarget->hasAVX2()) {
+ // Scale the blend by the number of 32-bit dwords per element.
+ int Scale = VT.getScalarSizeInBits() / 32;
+ BlendMask = ScaleBlendMask(BlendMask, Mask.size(), Scale);
+ MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
+ V1 = DAG.getBitcast(BlendVT, V1);
+ V2 = DAG.getBitcast(BlendVT, V2);
+ return DAG.getBitcast(
+ VT, DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
+ DAG.getConstant(BlendMask, DL, MVT::i8)));
+ }
+ // FALLTHROUGH
+ case MVT::v8i16: {
+ // For integer shuffles we need to expand the mask and cast the inputs to
+ // v8i16s prior to blending.
+ int Scale = 8 / VT.getVectorNumElements();
+ BlendMask = ScaleBlendMask(BlendMask, Mask.size(), Scale);
+ V1 = DAG.getBitcast(MVT::v8i16, V1);
+ V2 = DAG.getBitcast(MVT::v8i16, V2);
+ return DAG.getBitcast(VT,
+ DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
+ DAG.getConstant(BlendMask, DL, MVT::i8)));
+ }
+
+ case MVT::v16i16: {
+ assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
+ SmallVector<int, 8> RepeatedMask;
+ if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
+ // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
+ assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
+ BlendMask = 0;
+ for (int i = 0; i < 8; ++i)
+ if (RepeatedMask[i] >= 16)
+ BlendMask |= 1u << i;
+ return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
+ DAG.getConstant(BlendMask, DL, MVT::i8));
+ }
+ }
+ // FALLTHROUGH
+ case MVT::v16i8:
+ case MVT::v32i8: {
+ assert((VT.is128BitVector() || Subtarget->hasAVX2()) &&
+ "256-bit byte-blends require AVX2 support!");
+
+ // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
+ if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, DAG))
+ return Masked;
+
+ // Scale the blend by the number of bytes per element.
+ int Scale = VT.getScalarSizeInBits() / 8;
+
+ // This form of blend is always done on bytes. Compute the byte vector
+ // type.
+ MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
+
+ // Compute the VSELECT mask. Note that VSELECT is really confusing in the
+ // mix of LLVM's code generator and the x86 backend. We tell the code
+ // generator that boolean values in the elements of an x86 vector register
+ // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
+ // mapping a select to operand #1, and 'false' mapping to operand #2. The
+    // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
+    // of the element (the remaining bits are ignored) and 0 in that high bit
+    // would mean operand #1 while 1 in the high bit would mean operand #2. So
+    // while the LLVM model for boolean values in vector elements gets the
+    // relevant bit set, it is set backwards and over-constrained relative to
+    // x86's actual model.
+ SmallVector<SDValue, 32> VSELECTMask;
+ for (int i = 0, Size = Mask.size(); i < Size; ++i)
+ for (int j = 0; j < Scale; ++j)
+ VSELECTMask.push_back(
+ Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
+ : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
+ MVT::i8));
+
+ V1 = DAG.getBitcast(BlendVT, V1);
+ V2 = DAG.getBitcast(BlendVT, V2);
+ return DAG.getBitcast(VT, DAG.getNode(ISD::VSELECT, DL, BlendVT,
+ DAG.getNode(ISD::BUILD_VECTOR, DL,
+ BlendVT, VSELECTMask),
+ V1, V2));
+ }
+
+ default:
+ llvm_unreachable("Not a supported integer vector type!");
+ }
+}
+
+/// \brief Try to lower as a blend of elements from two inputs followed by
+/// a single-input permutation.
+///
+/// This matches the pattern where we can blend elements from two inputs and
+/// then reduce the shuffle to a single-input permutation.
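+///
+/// For example, the v4i32 mask {2, 7, 0, 5} becomes a blend with the mask
+/// {0, 5, 2, 7} followed by the single-input permute {2, 3, 0, 1}.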
+static SDValue lowerVectorShuffleAsBlendAndPermute(SDLoc DL, MVT VT, SDValue V1,
+ SDValue V2,
+ ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
+ // We build up the blend mask while checking whether a blend is a viable way
+ // to reduce the shuffle.
+ SmallVector<int, 32> BlendMask(Mask.size(), -1);
+ SmallVector<int, 32> PermuteMask(Mask.size(), -1);
+
+ for (int i = 0, Size = Mask.size(); i < Size; ++i) {
+ if (Mask[i] < 0)
+ continue;
+
+ assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
+
+ if (BlendMask[Mask[i] % Size] == -1)
+ BlendMask[Mask[i] % Size] = Mask[i];
+ else if (BlendMask[Mask[i] % Size] != Mask[i])
+ return SDValue(); // Can't blend in the needed input!
+
+ PermuteMask[i] = Mask[i] % Size;
+ }
+
+ SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
+ return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
+}
+
+/// \brief Generic routine to decompose a shuffle and blend into independent
+/// blends and permutes.
+///
+/// This matches the extremely common pattern for handling combined
+/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
+/// operations. It will try to pick the best arrangement of shuffles and
+/// blends.
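+///
+/// For example, the v4i32 mask {5, 0, 7, 2} shuffles V1 by {-1, 0, -1, 2} and
+/// V2 by {1, -1, 3, -1}, then blends the results with the mask {4, 1, 6, 3}.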
+static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(SDLoc DL, MVT VT,
+ SDValue V1,
+ SDValue V2,
+ ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
+ // Shuffle the input elements into the desired positions in V1 and V2 and
+ // blend them together.
+ SmallVector<int, 32> V1Mask(Mask.size(), -1);
+ SmallVector<int, 32> V2Mask(Mask.size(), -1);
+ SmallVector<int, 32> BlendMask(Mask.size(), -1);
+ for (int i = 0, Size = Mask.size(); i < Size; ++i)
+ if (Mask[i] >= 0 && Mask[i] < Size) {
+ V1Mask[i] = Mask[i];
+ BlendMask[i] = i;
+ } else if (Mask[i] >= Size) {
+ V2Mask[i] = Mask[i] - Size;
+ BlendMask[i] = i + Size;
+ }
+
+  // Try to lower with the simpler initial blend strategy unless one of the
+  // input shuffles would be a no-op. We prefer to shuffle the inputs, since
+  // the shuffle may be able to fold a load or provide some other benefit.
+  // However, when we would have to do 2x as many shuffles to achieve this,
+  // blending first is the better strategy.
+ if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask))
+ if (SDValue BlendPerm =
+ lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG))
+ return BlendPerm;
+
+ V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
+ V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
+ return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
+}
+
+/// \brief Try to lower a vector shuffle as a byte rotation.
+///
+/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
+/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
+/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
+/// try to generically lower a vector shuffle through such a pattern. It
+/// does not check for the profitability of lowering either as PALIGNR or
+/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
+/// This matches shuffle vectors that look like:
+///
+/// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
+///
+/// Essentially it concatenates V1 and V2, shifts right by some number of
+/// elements, and takes the low elements as the result. Note that while this is
+/// specified as a *right shift* because x86 is little-endian, it is a *left
+/// rotate* of the vector lanes.
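+///
+/// In the example above the rotation is 3 elements, so for v8i16 this lowers
+/// to a PALIGNR with a byte immediate of 6 (3 elements * 2 bytes per element).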
+static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1,
+ SDValue V2,
+ ArrayRef<int> Mask,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
+
+ int NumElts = Mask.size();
+ int NumLanes = VT.getSizeInBits() / 128;
+ int NumLaneElts = NumElts / NumLanes;
+
+ // We need to detect various ways of spelling a rotation:
+ // [11, 12, 13, 14, 15, 0, 1, 2]
+ // [-1, 12, 13, 14, -1, -1, 1, -1]
+ // [-1, -1, -1, -1, -1, -1, 1, 2]
+ // [ 3, 4, 5, 6, 7, 8, 9, 10]
+ // [-1, 4, 5, 6, -1, -1, 9, -1]
+ // [-1, 4, 5, 6, -1, -1, -1, -1]
+ int Rotation = 0;
+ SDValue Lo, Hi;
+ for (int l = 0; l < NumElts; l += NumLaneElts) {
+ for (int i = 0; i < NumLaneElts; ++i) {
+ if (Mask[l + i] == -1)
+ continue;
+ assert(Mask[l + i] >= 0 && "Only -1 is a valid negative mask element!");
+
+ // Get the mod-Size index and lane correct it.
+ int LaneIdx = (Mask[l + i] % NumElts) - l;
+ // Make sure it was in this lane.
+ if (LaneIdx < 0 || LaneIdx >= NumLaneElts)
+ return SDValue();
+
+ // Determine where a rotated vector would have started.
+ int StartIdx = i - LaneIdx;
+ if (StartIdx == 0)
+ // The identity rotation isn't interesting, stop.
+ return SDValue();
+
+ // If we found the tail of a vector the rotation must be the missing
+ // front. If we found the head of a vector, it must be how much of the
+ // head.
+ int CandidateRotation = StartIdx < 0 ? -StartIdx : NumLaneElts - StartIdx;
+
+ if (Rotation == 0)
+ Rotation = CandidateRotation;
+ else if (Rotation != CandidateRotation)
+ // The rotations don't match, so we can't match this mask.
+ return SDValue();
+
+ // Compute which value this mask is pointing at.
+ SDValue MaskV = Mask[l + i] < NumElts ? V1 : V2;
+
+ // Compute which of the two target values this index should be assigned
+ // to. This reflects whether the high elements are remaining or the low
+ // elements are remaining.
+ SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
+
+ // Either set up this value if we've not encountered it before, or check
+ // that it remains consistent.
+ if (!TargetV)
+ TargetV = MaskV;
+ else if (TargetV != MaskV)
+ // This may be a rotation, but it pulls from the inputs in some
+ // unsupported interleaving.
+ return SDValue();
+ }
+ }
+
+ // Check that we successfully analyzed the mask, and normalize the results.
+ assert(Rotation != 0 && "Failed to locate a viable rotation!");
+ assert((Lo || Hi) && "Failed to find a rotated input vector!");
+ if (!Lo)
+ Lo = Hi;
+ else if (!Hi)
+ Hi = Lo;
+
+ // The actual rotate instruction rotates bytes, so we need to scale the
+ // rotation based on how many bytes are in the vector lane.
+ int Scale = 16 / NumLaneElts;
+
+ // SSSE3 targets can use the palignr instruction.
+ if (Subtarget->hasSSSE3()) {
+ // Cast the inputs to i8 vector of correct length to match PALIGNR.
+ MVT AlignVT = MVT::getVectorVT(MVT::i8, 16 * NumLanes);
+ Lo = DAG.getBitcast(AlignVT, Lo);
+ Hi = DAG.getBitcast(AlignVT, Hi);
+
+ return DAG.getBitcast(
+ VT, DAG.getNode(X86ISD::PALIGNR, DL, AlignVT, Lo, Hi,
+ DAG.getConstant(Rotation * Scale, DL, MVT::i8)));
+ }
+
+ assert(VT.is128BitVector() &&
+ "Rotate-based lowering only supports 128-bit lowering!");
+ assert(Mask.size() <= 16 &&
+ "Can shuffle at most 16 bytes in a 128-bit vector!");
+
+ // Default SSE2 implementation
+ int LoByteShift = 16 - Rotation * Scale;
+ int HiByteShift = Rotation * Scale;
+
+ // Cast the inputs to v2i64 to match PSLLDQ/PSRLDQ.
+ Lo = DAG.getBitcast(MVT::v2i64, Lo);
+ Hi = DAG.getBitcast(MVT::v2i64, Hi);
+
+ SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, Lo,
+ DAG.getConstant(LoByteShift, DL, MVT::i8));
+ SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, Hi,
+ DAG.getConstant(HiByteShift, DL, MVT::i8));
+ return DAG.getBitcast(VT,
+ DAG.getNode(ISD::OR, DL, MVT::v2i64, LoShift, HiShift));
+}
+
+/// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros).
+///
+/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
+/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
+/// matches elements from one of the input vectors shuffled to the left or
+/// right with zeroable elements 'shifted in'. It handles both the strictly
+/// bit-wise element shifts and the byte shift across an entire 128-bit double
+/// quad word lane.
+///
+/// PSLL : (little-endian) left bit shift.
+/// [ zz, 0, zz, 2 ]
+/// [ -1, 4, zz, -1 ]
+/// PSRL : (little-endian) right bit shift.
+/// [ 1, zz, 3, zz]
+/// [ -1, -1, 7, zz]
+/// PSLLDQ : (little-endian) left byte shift
+/// [ zz, 0, 1, 2, 3, 4, 5, 6]
+/// [ zz, zz, -1, -1, 2, 3, 4, -1]
+/// [ zz, zz, zz, zz, zz, zz, -1, 1]
+/// PSRLDQ : (little-endian) right byte shift
+/// [ 5, 6, 7, zz, zz, zz, zz, zz]
+/// [ -1, 5, 6, 7, zz, zz, zz, zz]
+/// [ 1, 2, -1, -1, -1, -1, zz, zz]
+static SDValue lowerVectorShuffleAsShift(SDLoc DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
+ SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+
+ int Size = Mask.size();
+ assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
+
+ auto CheckZeros = [&](int Shift, int Scale, bool Left) {
+ for (int i = 0; i < Size; i += Scale)
+ for (int j = 0; j < Shift; ++j)
+ if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
+ return false;
+
+ return true;
+ };
+
+ auto MatchShift = [&](int Shift, int Scale, bool Left, SDValue V) {
+ for (int i = 0; i != Size; i += Scale) {
+ unsigned Pos = Left ? i + Shift : i;
+ unsigned Low = Left ? i : i + Shift;
+ unsigned Len = Scale - Shift;
+ if (!isSequentialOrUndefInRange(Mask, Pos, Len,
+ Low + (V == V1 ? 0 : Size)))
+ return SDValue();
+ }
+
+ int ShiftEltBits = VT.getScalarSizeInBits() * Scale;
+ bool ByteShift = ShiftEltBits > 64;
+ unsigned OpCode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
+ : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
+ int ShiftAmt = Shift * VT.getScalarSizeInBits() / (ByteShift ? 8 : 1);
+
+ // Normalize the scale for byte shifts to still produce an i64 element
+ // type.
+ Scale = ByteShift ? Scale / 2 : Scale;
+
+ // We need to round trip through the appropriate type for the shift.
+ MVT ShiftSVT = MVT::getIntegerVT(VT.getScalarSizeInBits() * Scale);
+ MVT ShiftVT = MVT::getVectorVT(ShiftSVT, Size / Scale);
+ assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
+ "Illegal integer vector type");
+ V = DAG.getBitcast(ShiftVT, V);
+
+ V = DAG.getNode(OpCode, DL, ShiftVT, V,
+ DAG.getConstant(ShiftAmt, DL, MVT::i8));
+ return DAG.getBitcast(VT, V);
+ };
+
+ // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
+ // keep doubling the size of the integer elements up to that. We can
+ // then shift the elements of the integer vector by whole multiples of
+ // their width within the elements of the larger integer vector. Test each
+ // multiple to see if we can find a match with the moved element indices
+ // and that the shifted in elements are all zeroable.
+ for (int Scale = 2; Scale * VT.getScalarSizeInBits() <= 128; Scale *= 2)
+ for (int Shift = 1; Shift != Scale; ++Shift)
+ for (bool Left : {true, false})
+ if (CheckZeros(Shift, Scale, Left))
+ for (SDValue V : {V1, V2})
+ if (SDValue Match = MatchShift(Shift, Scale, Left, V))
+ return Match;
+
+  // No match found.
+ return SDValue();
+}
+
+/// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
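+///
+/// For example, with an all-zeros V2 the v8i16 mask {1, 2, 3, 8, -1, -1, -1,
+/// -1} (upper half undef) extracts Len = 3 elements from V1 starting at
+/// Idx = 1, i.e. an EXTRQI with BitLen = 48 and BitIdx = 16.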
+static SDValue lowerVectorShuffleWithSSE4A(SDLoc DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
+ SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+ assert(!Zeroable.all() && "Fully zeroable shuffle mask");
+
+ int Size = Mask.size();
+ int HalfSize = Size / 2;
+ assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
+
+ // Upper half must be undefined.
+ if (!isUndefInRange(Mask, HalfSize, HalfSize))
+ return SDValue();
+
+ // EXTRQ: Extract Len elements from lower half of source, starting at Idx.
+ // Remainder of lower half result is zero and upper half is all undef.
+ auto LowerAsEXTRQ = [&]() {
+ // Determine the extraction length from the part of the
+ // lower half that isn't zeroable.
+ int Len = HalfSize;
+ for (; Len > 0; --Len)
+ if (!Zeroable[Len - 1])
+ break;
+ assert(Len > 0 && "Zeroable shuffle mask");
+
+ // Attempt to match first Len sequential elements from the lower half.
+ SDValue Src;
+ int Idx = -1;
+ for (int i = 0; i != Len; ++i) {
+ int M = Mask[i];
+ if (M < 0)
+ continue;
+ SDValue &V = (M < Size ? V1 : V2);
+ M = M % Size;
+
+ // The extracted elements must start at a valid index and all mask
+ // elements must be in the lower half.
+ if (i > M || M >= HalfSize)
+ return SDValue();
+
+ if (Idx < 0 || (Src == V && Idx == (M - i))) {
+ Src = V;
+ Idx = M - i;
+ continue;
+ }
+ return SDValue();
+ }
+
+ if (Idx < 0)
+ return SDValue();
+
+ assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
+ int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
+ int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
+ return DAG.getNode(X86ISD::EXTRQI, DL, VT, Src,
+ DAG.getConstant(BitLen, DL, MVT::i8),
+ DAG.getConstant(BitIdx, DL, MVT::i8));
+ };
+
+ if (SDValue ExtrQ = LowerAsEXTRQ())
+ return ExtrQ;
+
+ // INSERTQ: Extract lowest Len elements from lower half of second source and
+ // insert over first source, starting at Idx.
+ // { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
+ auto LowerAsInsertQ = [&]() {
+ for (int Idx = 0; Idx != HalfSize; ++Idx) {
+ SDValue Base;
+
+ // Attempt to match first source from mask before insertion point.
+ if (isUndefInRange(Mask, 0, Idx)) {
+ /* EMPTY */
+ } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
+ Base = V1;
+ } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
+ Base = V2;
+ } else {
+ continue;
+ }
+
+ // Extend the extraction length looking to match both the insertion of
+ // the second source and the remaining elements of the first.
+ for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
+ SDValue Insert;
+ int Len = Hi - Idx;
+
+ // Match insertion.
+ if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
+ Insert = V1;
+ } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
+ Insert = V2;
+ } else {
+ continue;
+ }
+
+ // Match the remaining elements of the lower half.
+ if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
+ /* EMPTY */
+ } else if ((!Base || (Base == V1)) &&
+ isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
+ Base = V1;
+ } else if ((!Base || (Base == V2)) &&
+ isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
+ Size + Hi)) {
+ Base = V2;
+ } else {
+ continue;
+ }
+
+        // We may not have a base (first source); this can safely be undefined.
+ if (!Base)
+ Base = DAG.getUNDEF(VT);
+
+ int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
+ int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
+ return DAG.getNode(X86ISD::INSERTQI, DL, VT, Base, Insert,
+ DAG.getConstant(BitLen, DL, MVT::i8),
+ DAG.getConstant(BitIdx, DL, MVT::i8));
+ }
+ }
+
+ return SDValue();
+ };
+
+ if (SDValue InsertQ = LowerAsInsertQ())
+ return InsertQ;
+
+ return SDValue();
+}
+
+/// \brief Lower a vector shuffle as a zero or any extension.
+///
+/// Given a specific number of elements, element bit width, and extension
+/// stride, produce either a zero or any extension based on the available
+/// features of the subtarget. The extended elements are consecutive and
+/// can start from an offset element index in the input; to avoid excess
+/// shuffling, the offset must either be in the bottom lane or at the start
+/// of a higher lane. All extended elements must be from the same lane.
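+///
+/// On SSE4.1 targets this lowers to a single VZEXT (PMOVZX-style) node;
+/// otherwise it falls back to PSHUF* tricks, SSE4A EXTRQI, PSHUFB, or a
+/// chain of unpacks against zero (or undef for any-extends).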
+static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
+ SDLoc DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
+ ArrayRef<int> Mask, const X86Subtarget *Subtarget, SelectionDAG &DAG) {
+ assert(Scale > 1 && "Need a scale to extend.");
+ int EltBits = VT.getScalarSizeInBits();
+ int NumElements = VT.getVectorNumElements();
+ int NumEltsPerLane = 128 / EltBits;
+ int OffsetLane = Offset / NumEltsPerLane;
+ assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
+ "Only 8, 16, and 32 bit elements can be extended.");
+ assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
+ assert(0 <= Offset && "Extension offset must be positive.");
+ assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
+ "Extension offset must be in the first lane or start an upper lane.");
+
+ // Check that an index is in same lane as the base offset.
+ auto SafeOffset = [&](int Idx) {
+ return OffsetLane == (Idx / NumEltsPerLane);
+ };
+
+ // Shift along an input so that the offset base moves to the first element.
+ auto ShuffleOffset = [&](SDValue V) {
+ if (!Offset)
+ return V;
+
+ SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
+ for (int i = 0; i * Scale < NumElements; ++i) {
+ int SrcIdx = i + Offset;
+ ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
+ }
+ return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
+ };
+
+ // Found a valid zext mask! Try various lowering strategies based on the
+ // input type and available ISA extensions.
+ if (Subtarget->hasSSE41()) {
+    // Not worth offsetting 128-bit vectors if scale == 2; a pattern using
+    // PUNPCK will catch this in a later shuffle match.
+ if (Offset && Scale == 2 && VT.is128BitVector())
+ return SDValue();
+ MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
+ NumElements / Scale);
+ InputV = DAG.getNode(X86ISD::VZEXT, DL, ExtVT, ShuffleOffset(InputV));
+ return DAG.getBitcast(VT, InputV);
+ }
+
+ assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
+
+ // For any extends we can cheat for larger element sizes and use shuffle
+ // instructions that can fold with a load and/or copy.
+ if (AnyExt && EltBits == 32) {
+ int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
+ -1};
+ return DAG.getBitcast(
+ VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
+ DAG.getBitcast(MVT::v4i32, InputV),
+ getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
+ }
+ if (AnyExt && EltBits == 16 && Scale > 2) {
+ int PSHUFDMask[4] = {Offset / 2, -1,
+ SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
+ InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
+ DAG.getBitcast(MVT::v4i32, InputV),
+ getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
+ int PSHUFWMask[4] = {1, -1, -1, -1};
+ unsigned OddEvenOp = (Offset & 1 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW);
+ return DAG.getBitcast(
+ VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
+ DAG.getBitcast(MVT::v8i16, InputV),
+ getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
+ }
+
+ // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
+ // to 64-bits.
+ if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget->hasSSE4A()) {
+ assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
+ assert(VT.is128BitVector() && "Unexpected vector width!");
+
+ int LoIdx = Offset * EltBits;
+ SDValue Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64,
+ DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
+ DAG.getConstant(EltBits, DL, MVT::i8),
+ DAG.getConstant(LoIdx, DL, MVT::i8)));
+
+ if (isUndefInRange(Mask, NumElements / 2, NumElements / 2) ||
+ !SafeOffset(Offset + 1))
+ return DAG.getNode(ISD::BITCAST, DL, VT, Lo);
+
+ int HiIdx = (Offset + 1) * EltBits;
+ SDValue Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64,
+ DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
+ DAG.getConstant(EltBits, DL, MVT::i8),
+ DAG.getConstant(HiIdx, DL, MVT::i8)));
+ return DAG.getNode(ISD::BITCAST, DL, VT,
+ DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
+ }
+
+ // If this would require more than 2 unpack instructions to expand, use
+ // pshufb when available. We can only use more than 2 unpack instructions
+ // when zero extending i8 elements which also makes it easier to use pshufb.
+ if (Scale > 4 && EltBits == 8 && Subtarget->hasSSSE3()) {
+ assert(NumElements == 16 && "Unexpected byte vector width!");
+ SDValue PSHUFBMask[16];
+ for (int i = 0; i < 16; ++i) {
+ int Idx = Offset + (i / Scale);
+ PSHUFBMask[i] = DAG.getConstant(
+ (i % Scale == 0 && SafeOffset(Idx)) ? Idx : 0x80, DL, MVT::i8);
+ }
+ InputV = DAG.getBitcast(MVT::v16i8, InputV);
+ return DAG.getBitcast(VT,
+ DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
+ DAG.getNode(ISD::BUILD_VECTOR, DL,
+ MVT::v16i8, PSHUFBMask)));
+ }
+
+ // If we are extending from an offset, ensure we start on a boundary that
+ // we can unpack from.
+ int AlignToUnpack = Offset % (NumElements / Scale);
+ if (AlignToUnpack) {
+ SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
+ for (int i = AlignToUnpack; i < NumElements; ++i)
+ ShMask[i - AlignToUnpack] = i;
+ InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
+ Offset -= AlignToUnpack;
+ }
+
+ // Otherwise emit a sequence of unpacks.
+ do {
+ unsigned UnpackLoHi = X86ISD::UNPCKL;
+ if (Offset >= (NumElements / 2)) {
+ UnpackLoHi = X86ISD::UNPCKH;
+ Offset -= (NumElements / 2);
+ }
+
+ MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
+ SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
+ : getZeroVector(InputVT, Subtarget, DAG, DL);
+ InputV = DAG.getBitcast(InputVT, InputV);
+ InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
+ Scale /= 2;
+ EltBits *= 2;
+ NumElements /= 2;
+ } while (Scale > 1);
+ return DAG.getBitcast(VT, InputV);
+}
+
+/// \brief Try to lower a vector shuffle as a zero extension on any microarch.
+///
+/// This routine will try to do everything in its power to cleverly lower
+/// a shuffle which happens to match the pattern of a zero extend. It doesn't
+/// check for the profitability of this lowering, it tries to aggressively
+/// match this pattern. It will use all of the micro-architectural details it
+/// can to emit an efficient lowering. It handles both blends with all-zero
+/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
+/// masking out later).
+///
+/// The reason we have dedicated lowering for zext-style shuffles is that they
+/// are both incredibly common and often quite performance sensitive.
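+///
+/// For example, with an all-zeros V2 the v8i16 mask {0, 8, 1, 8, 2, 8, 3, 8}
+/// matches a Scale == 2 zero extension of the low four elements of V1.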
+static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
+ SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
+ const X86Subtarget *Subtarget, SelectionDAG &DAG) {
+ SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+
+ int Bits = VT.getSizeInBits();
+ int NumLanes = Bits / 128;
+ int NumElements = VT.getVectorNumElements();
+ int NumEltsPerLane = NumElements / NumLanes;
+ assert(VT.getScalarSizeInBits() <= 32 &&
+ "Exceeds 32-bit integer zero extension limit");
+ assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
+
+ // Define a helper function to check a particular ext-scale and lower to it if
+ // valid.
+ auto Lower = [&](int Scale) -> SDValue {
+ SDValue InputV;
+ bool AnyExt = true;
+ int Offset = 0;
+ int Matches = 0;
+ for (int i = 0; i < NumElements; ++i) {
+ int M = Mask[i];
+ if (M == -1)
+ continue; // Valid anywhere but doesn't tell us anything.
+ if (i % Scale != 0) {
+        // Each of the extended elements needs to be zeroable.
+ if (!Zeroable[i])
+ return SDValue();
+
+        // We are no longer in the anyext case.
+ AnyExt = false;
+ continue;
+ }
+
+ // Each of the base elements needs to be consecutive indices into the
+ // same input vector.
+ SDValue V = M < NumElements ? V1 : V2;
+ M = M % NumElements;
+ if (!InputV) {
+ InputV = V;
+ Offset = M - (i / Scale);
+ } else if (InputV != V)
+ return SDValue(); // Flip-flopping inputs.
+
+ // Offset must start in the lowest 128-bit lane or at the start of an
+ // upper lane.
+ // FIXME: Is it ever worth allowing a negative base offset?
+ if (!((0 <= Offset && Offset < NumEltsPerLane) ||
+ (Offset % NumEltsPerLane) == 0))
+ return SDValue();
+
+ // If we are offsetting, all referenced entries must come from the same
+ // lane.
+ if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
+ return SDValue();
+
+ if ((M % NumElements) != (Offset + (i / Scale)))
+ return SDValue(); // Non-consecutive strided elements.
+ Matches++;
+ }
+
+ // If we fail to find an input, we have a zero-shuffle which should always
+ // have already been handled.
+ // FIXME: Maybe handle this here in case during blending we end up with one?
+ if (!InputV)
+ return SDValue();
+
+    // If we are offsetting, don't extend if we only match a single input; we
+    // can always do better by using a basic PSHUF or PUNPCK.
+ if (Offset != 0 && Matches < 2)
+ return SDValue();
+
+ return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
+ DL, VT, Scale, Offset, AnyExt, InputV, Mask, Subtarget, DAG);
+ };
+
+ // The widest scale possible for extending is to a 64-bit integer.
+ assert(Bits % 64 == 0 &&
+ "The number of bits in a vector must be divisible by 64 on x86!");
+ int NumExtElements = Bits / 64;
+
+ // Each iteration, try extending the elements half as much, but into twice as
+ // many elements.
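+  // For a v16i8 shuffle this tries i8->i64 (Scale 8), then i8->i32 (Scale 4),
+  // then i8->i16 (Scale 2).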
+ for (; NumExtElements < NumElements; NumExtElements *= 2) {
+ assert(NumElements % NumExtElements == 0 &&
+ "The input vector size must be divisible by the extended size.");
+ if (SDValue V = Lower(NumElements / NumExtElements))
+ return V;
+ }
+
+ // General extends failed, but 128-bit vectors may be able to use MOVQ.
+ if (Bits != 128)
+ return SDValue();
+
+ // Returns one of the source operands if the shuffle can be reduced to a
+ // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
+ auto CanZExtLowHalf = [&]() {
+ for (int i = NumElements / 2; i != NumElements; ++i)
+ if (!Zeroable[i])
+ return SDValue();
+ if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
+ return V1;
+ if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
+ return V2;
+ return SDValue();
+ };
+
+ if (SDValue V = CanZExtLowHalf()) {
+ V = DAG.getBitcast(MVT::v2i64, V);
+ V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
+ return DAG.getBitcast(VT, V);
+ }
+
+ // No viable ext lowering found.
+ return SDValue();
+}
+
+/// \brief Try to get a scalar value for a specific element of a vector.
+///
+/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
+static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
+ SelectionDAG &DAG) {
+ MVT VT = V.getSimpleValueType();
+ MVT EltVT = VT.getVectorElementType();
+ while (V.getOpcode() == ISD::BITCAST)
+ V = V.getOperand(0);
+ // If the bitcasts shift the element size, we can't extract an equivalent
+ // element from it.
+ MVT NewVT = V.getSimpleValueType();
+  if (!NewVT.isVector() ||
+      NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
+ return SDValue();
+
+ if (V.getOpcode() == ISD::BUILD_VECTOR ||
+ (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
+ // Ensure the scalar operand is the same size as the destination.
+ // FIXME: Add support for scalar truncation where possible.
+ SDValue S = V.getOperand(Idx);
+ if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
+ return DAG.getNode(ISD::BITCAST, SDLoc(V), EltVT, S);
+ }
+
+ return SDValue();
+}
+
+/// \brief Helper to test for a load that can be folded with x86 shuffles.
+///
+/// This is particularly important because the set of instructions varies
+/// significantly based on whether the operand is a load or not.
+static bool isShuffleFoldableLoad(SDValue V) {
+ while (V.getOpcode() == ISD::BITCAST)
+ V = V.getOperand(0);
+
+ return ISD::isNON_EXTLoad(V.getNode());
+}
+
+/// \brief Try to lower insertion of a single element into a zero vector.
+///
+/// This is a common pattern that we have especially efficient patterns to lower
+/// across all subtarget feature sets.
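+///
+/// For example, given the v4f32 mask <4, 1, 2, 3>, the low element comes from
+/// V2 and the rest from V1; without SSE4.1 blends this lowers to MOVSS.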
+static SDValue lowerVectorShuffleAsElementInsertion(
+ SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
+ const X86Subtarget *Subtarget, SelectionDAG &DAG) {
+ SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+ MVT ExtVT = VT;
+ MVT EltVT = VT.getVectorElementType();
+
+ int V2Index = std::find_if(Mask.begin(), Mask.end(),
+ [&Mask](int M) { return M >= (int)Mask.size(); }) -
+ Mask.begin();
+ bool IsV1Zeroable = true;
+ for (int i = 0, Size = Mask.size(); i < Size; ++i)
+ if (i != V2Index && !Zeroable[i]) {
+ IsV1Zeroable = false;
+ break;
+ }
+
+ // Check for a single input from a SCALAR_TO_VECTOR node.
+ // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
+ // all the smarts here sunk into that routine. However, the current
+ // lowering of BUILD_VECTOR makes that nearly impossible until the old
+ // vector shuffle lowering is dead.
+ SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
+ DAG);
+ if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
+ // We need to zext the scalar if it is smaller than an i32.
+ V2S = DAG.getBitcast(EltVT, V2S);
+ if (EltVT == MVT::i8 || EltVT == MVT::i16) {
+ // Using zext to expand a narrow element won't work for non-zero
+ // insertions.
+ if (!IsV1Zeroable)
+ return SDValue();
+
+ // Zero-extend directly to i32.
+ ExtVT = MVT::v4i32;
+ V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
+ }
+ V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
+ } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
+ EltVT == MVT::i16) {
+ // Either not inserting from the low element of the input or the input
+ // element size is too small to use VZEXT_MOVL to clear the high bits.
+ return SDValue();
+ }
+
+ if (!IsV1Zeroable) {
+ // If V1 can't be treated as a zero vector we have fewer options to lower
+ // this. We can't support integer vectors or non-zero targets cheaply, and
+ // the V1 elements can't be permuted in any way.
+ assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
+ if (!VT.isFloatingPoint() || V2Index != 0)
+ return SDValue();
+ SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
+ V1Mask[V2Index] = -1;
+ if (!isNoopShuffleMask(V1Mask))
+ return SDValue();
+ // This is essentially a special case blend operation, but if we have
+ // general purpose blend operations, they are always faster. Bail and let
+ // the rest of the lowering handle these as blends.
+ if (Subtarget->hasSSE41())
+ return SDValue();
+
+ // Otherwise, use MOVSD or MOVSS.
+ assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
+ "Only two types of floating point element types to handle!");
+ return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
+ ExtVT, V1, V2);
+ }
+
+ // This lowering only works for the low element with floating point vectors.
+ if (VT.isFloatingPoint() && V2Index != 0)
+ return SDValue();
+
+ V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
+ if (ExtVT != VT)
+ V2 = DAG.getBitcast(VT, V2);
+
+ if (V2Index != 0) {
+ // If we have 4 or fewer lanes we can cheaply shuffle the element into
+ // the desired position. Otherwise it is more efficient to do a vector
+ // shift left. We know that we can do a vector shift left because all
+ // the inputs are zero.
+ if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
+ SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
+ V2Shuffle[V2Index] = 0;
+ V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
+ } else {
+ V2 = DAG.getBitcast(MVT::v2i64, V2);
+ V2 = DAG.getNode(
+ X86ISD::VSHLDQ, DL, MVT::v2i64, V2,
+ DAG.getConstant(V2Index * EltVT.getSizeInBits() / 8, DL,
+ DAG.getTargetLoweringInfo().getScalarShiftAmountTy(
+ DAG.getDataLayout(), VT)));
+ V2 = DAG.getBitcast(VT, V2);
+ }
+ }
+ return V2;
+}
+
+/// \brief Try to lower broadcast of a single - truncated - integer element,
+/// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
+///
+/// This assumes we have AVX2.
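+///
+/// For example, to broadcast i16 element 1 of a v4i32 \p V0, we take scalar
+/// operand 0 of \p V0, shift it right by 16 bits, truncate to i16, and
+/// broadcast the result.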
+static SDValue lowerVectorShuffleAsTruncBroadcast(SDLoc DL, MVT VT, SDValue V0,
+ int BroadcastIdx,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ assert(Subtarget->hasAVX2() &&
+ "We can only lower integer broadcasts with AVX2!");
+
+ EVT EltVT = VT.getVectorElementType();
+ EVT V0VT = V0.getValueType();
+
+ assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
+ assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
+
+ EVT V0EltVT = V0VT.getVectorElementType();
+ if (!V0EltVT.isInteger())
+ return SDValue();
+
+ const unsigned EltSize = EltVT.getSizeInBits();
+ const unsigned V0EltSize = V0EltVT.getSizeInBits();
+
+ // This is only a truncation if the original element type is larger.
+ if (V0EltSize <= EltSize)
+ return SDValue();
+
+ assert(((V0EltSize % EltSize) == 0) &&
+ "Scalar type sizes must all be powers of 2 on x86!");
+
+ const unsigned V0Opc = V0.getOpcode();
+ const unsigned Scale = V0EltSize / EltSize;
+ const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
+
+ if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
+ V0Opc != ISD::BUILD_VECTOR)
+ return SDValue();
+
+ SDValue Scalar = V0.getOperand(V0BroadcastIdx);
+
+ // If we're extracting non-least-significant bits, shift so we can truncate.
+ // Hopefully, we can fold away the trunc/srl/load into the broadcast.
+ // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
+ // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
+ if (const int OffsetIdx = BroadcastIdx % Scale)
+ Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
+                         DAG.getConstant(OffsetIdx * EltSize, DL,
+                                         Scalar.getValueType()));
+
+ return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
+ DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
+}
+
+/// \brief Try to lower broadcast of a single element.
+///
+/// For convenience, this code also bundles all of the subtarget feature set
+/// filtering. While a little annoying to re-dispatch on type here, there isn't
+/// a convenient way to factor it out.
+/// FIXME: This is very similar to LowerVectorBroadcast - can we merge them?
+static SDValue lowerVectorShuffleAsBroadcast(SDLoc DL, MVT VT, SDValue V,
+ ArrayRef<int> Mask,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ if (!Subtarget->hasAVX())
+ return SDValue();
+ if (VT.isInteger() && !Subtarget->hasAVX2())
+ return SDValue();
+
+ // Check that the mask is a broadcast.
+ int BroadcastIdx = -1;
+ for (int M : Mask)
+ if (M >= 0 && BroadcastIdx == -1)
+ BroadcastIdx = M;
+ else if (M >= 0 && M != BroadcastIdx)
+ return SDValue();
+
+ assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
+ "a sorted mask where the broadcast "
+ "comes from V1.");
+
+ // Go up the chain of (vector) values to find a scalar load that we can
+ // combine with the broadcast.
+ for (;;) {
+ switch (V.getOpcode()) {
+ case ISD::CONCAT_VECTORS: {
+ int OperandSize = Mask.size() / V.getNumOperands();
+ V = V.getOperand(BroadcastIdx / OperandSize);
+ BroadcastIdx %= OperandSize;
+ continue;
+ }
+
+ case ISD::INSERT_SUBVECTOR: {
+ SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
+ auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
+ if (!ConstantIdx)
+ break;
+
+ int BeginIdx = (int)ConstantIdx->getZExtValue();
+ int EndIdx =
+ BeginIdx + (int)VInner.getSimpleValueType().getVectorNumElements();
+ if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) {
+ BroadcastIdx -= BeginIdx;
+ V = VInner;
+ } else {
+ V = VOuter;
+ }
+ continue;
+ }
+ }
+ break;
+ }
+
+ // Check if this is a broadcast of a scalar. We special case lowering
+ // for scalars so that we can more effectively fold with loads.
+ // First, look through bitcast: if the original value has a larger element
+ // type than the shuffle, the broadcast element is in essence truncated.
+ // Make that explicit to ease folding.
+ if (V.getOpcode() == ISD::BITCAST && VT.isInteger())
+ if (SDValue TruncBroadcast = lowerVectorShuffleAsTruncBroadcast(
+ DL, VT, V.getOperand(0), BroadcastIdx, Subtarget, DAG))
+ return TruncBroadcast;
+
+ // Also check the simpler case, where we can directly reuse the scalar.
+ if (V.getOpcode() == ISD::BUILD_VECTOR ||
+ (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
+ V = V.getOperand(BroadcastIdx);
+
+ // If the scalar isn't a load, we can't broadcast from it in AVX1.
+ // Only AVX2 has register broadcasts.
+ if (!Subtarget->hasAVX2() && !isShuffleFoldableLoad(V))
+ return SDValue();
+ } else if (MayFoldLoad(V) && !cast<LoadSDNode>(V)->isVolatile()) {
+ // If we are broadcasting a load that is only used by the shuffle
+ // then we can reduce the vector load to the broadcasted scalar load.
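+    // For example, broadcasting element 2 of a loaded v4f32 becomes a single
+    // f32 load from the base address plus 8 bytes.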
+ LoadSDNode *Ld = cast<LoadSDNode>(V);
+ SDValue BaseAddr = Ld->getOperand(1);
+ EVT AddrVT = BaseAddr.getValueType();
+ EVT SVT = VT.getScalarType();
+ unsigned Offset = BroadcastIdx * SVT.getStoreSize();
+ SDValue NewAddr = DAG.getNode(
+ ISD::ADD, DL, AddrVT, BaseAddr,
+ DAG.getConstant(Offset, DL, AddrVT));
+ V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
+ DAG.getMachineFunction().getMachineMemOperand(
+ Ld->getMemOperand(), Offset, SVT.getStoreSize()));
+ } else if (BroadcastIdx != 0 || !Subtarget->hasAVX2()) {
+ // We can't broadcast from a vector register without AVX2, and we can only
+ // broadcast from the zero-element of a vector register.
+ return SDValue();
+ }
+
+ return DAG.getNode(X86ISD::VBROADCAST, DL, VT, V);
+}
+
+// Check for whether we can use INSERTPS to perform the shuffle. We only use
+// INSERTPS when the V1 elements are already in the correct locations
+// because otherwise we can just always use two SHUFPS instructions which
+// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
+// perform INSERTPS if a single V1 element is out of place and all V2
+// elements are zeroable.
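+// The INSERTPS immediate encodes the V2 source lane in bits [7:6], the
+// destination lane in bits [5:4], and the zero mask in bits [3:0]. For
+// example, inserting V2[2] into lane 1 while zeroing lane 3 encodes as 0x98.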
+static SDValue lowerVectorShuffleAsInsertPS(SDValue Op, SDValue V1, SDValue V2,
+ ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
+ assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!");
+ assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
+ assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
+
+ SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+
+ unsigned ZMask = 0;
+ int V1DstIndex = -1;
+ int V2DstIndex = -1;
+ bool V1UsedInPlace = false;
+
+ for (int i = 0; i < 4; ++i) {
+ // Synthesize a zero mask from the zeroable elements (includes undefs).
+ if (Zeroable[i]) {
+ ZMask |= 1 << i;
+ continue;
+ }
+
+ // Flag if we use any V1 inputs in place.
+ if (i == Mask[i]) {
+ V1UsedInPlace = true;
+ continue;
+ }
+
+ // We can only insert a single non-zeroable element.
+ if (V1DstIndex != -1 || V2DstIndex != -1)
+ return SDValue();
+
+ if (Mask[i] < 4) {
+ // V1 input out of place for insertion.
+ V1DstIndex = i;
+ } else {
+ // V2 input for insertion.
+ V2DstIndex = i;
+ }
+ }
+
+ // Don't bother if we have no (non-zeroable) element for insertion.
+ if (V1DstIndex == -1 && V2DstIndex == -1)
+ return SDValue();
+
+ // Determine element insertion src/dst indices. The src index is from the
+ // start of the inserted vector, not the start of the concatenated vector.
+ unsigned V2SrcIndex = 0;
+ if (V1DstIndex != -1) {
+ // If we have a V1 input out of place, we use V1 as the V2 element insertion
+ // and don't use the original V2 at all.
+ V2SrcIndex = Mask[V1DstIndex];
+ V2DstIndex = V1DstIndex;
+ V2 = V1;
+ } else {
+ V2SrcIndex = Mask[V2DstIndex] - 4;
+ }
+
+ // If no V1 inputs are used in place, then the result is created only from
+ // the zero mask and the V2 insertion - so remove V1 dependency.
+ if (!V1UsedInPlace)
+ V1 = DAG.getUNDEF(MVT::v4f32);
+
+ unsigned InsertPSMask = V2SrcIndex << 6 | V2DstIndex << 4 | ZMask;
+ assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
+
+ // Insert the V2 element into the desired position.
+ SDLoc DL(Op);
+ return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
+ DAG.getConstant(InsertPSMask, DL, MVT::i8));
+}
+
+/// \brief Try to lower a shuffle as a permute of the inputs followed by an
+/// UNPCK instruction.
+///
+/// This specifically targets cases where we end up with alternating between
+/// the two inputs, and so can permute them into something that feeds a single
+/// UNPCK instruction. Note that this routine only targets integer vectors
+/// because for floating point vectors we have a generalized SHUFPS lowering
+/// strategy that handles everything that doesn't *exactly* match an unpack,
+/// making this clever lowering unnecessary.
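+///
+/// For example, the v8i16 mask <0, 8, 2, 10, 4, 12, 6, 14> alternates the two
+/// inputs; permuting each input's even elements into its low half first lets
+/// a single UNPCKLWD produce the entire result.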
+static SDValue lowerVectorShuffleAsPermuteAndUnpack(SDLoc DL, MVT VT,
+ SDValue V1, SDValue V2,
+ ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
+ assert(!VT.isFloatingPoint() &&
+ "This routine only supports integer vectors.");
+ assert(!isSingleInputShuffleMask(Mask) &&
+ "This routine should only be used when blending two inputs.");
+ assert(Mask.size() >= 2 && "Single element masks are invalid.");
+
+ int Size = Mask.size();
+
+ int NumLoInputs = std::count_if(Mask.begin(), Mask.end(), [Size](int M) {
+ return M >= 0 && M % Size < Size / 2;
+ });
+ int NumHiInputs = std::count_if(
+ Mask.begin(), Mask.end(), [Size](int M) { return M % Size >= Size / 2; });
+
+ bool UnpackLo = NumLoInputs >= NumHiInputs;
+
+ auto TryUnpack = [&](MVT UnpackVT, int Scale) {
+ SmallVector<int, 32> V1Mask(Mask.size(), -1);
+ SmallVector<int, 32> V2Mask(Mask.size(), -1);
+
+ for (int i = 0; i < Size; ++i) {
+ if (Mask[i] < 0)
+ continue;
+
+ // Each element of the unpack contains Scale elements from this mask.
+ int UnpackIdx = i / Scale;
+
+ // We only handle the case where V1 feeds the first slots of the unpack.
+ // We rely on canonicalization to ensure this is the case.
+ if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
+ return SDValue();
+
+ // Setup the mask for this input. The indexing is tricky as we have to
+ // handle the unpack stride.
+ SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
+ VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
+ Mask[i] % Size;
+ }
+
+ // If we will have to shuffle both inputs to use the unpack, check whether
+ // we can just unpack first and shuffle the result. If so, skip this unpack.
+ if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
+ !isNoopShuffleMask(V2Mask))
+ return SDValue();
+
+ // Shuffle the inputs into place.
+ V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
+ V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
+
+ // Cast the inputs to the type we will use to unpack them.
+ V1 = DAG.getBitcast(UnpackVT, V1);
+ V2 = DAG.getBitcast(UnpackVT, V2);
+
+ // Unpack the inputs and cast the result back to the desired type.
+ return DAG.getBitcast(
+ VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
+ UnpackVT, V1, V2));
+ };
+
+ // We try each unpack from the largest to the smallest to try and find one
+ // that fits this mask.
+ int OrigNumElements = VT.getVectorNumElements();
+ int OrigScalarSize = VT.getScalarSizeInBits();
+ for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2) {
+ int Scale = ScalarSize / OrigScalarSize;
+ int NumElements = OrigNumElements / Scale;
+ MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), NumElements);
+ if (SDValue Unpack = TryUnpack(UnpackVT, Scale))
+ return Unpack;
+ }
+
+ // If none of the unpack-rooted lowerings worked (or were profitable) try an
+ // initial unpack.
+ if (NumLoInputs == 0 || NumHiInputs == 0) {
+ assert((NumLoInputs > 0 || NumHiInputs > 0) &&
+ "We have to have *some* inputs!");
+ int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
+
+ // FIXME: We could consider the total complexity of the permute of each
+ // possible unpacking. Or at the least we should consider how many
+ // half-crossings are created.
+ // FIXME: We could consider commuting the unpacks.
+
+ SmallVector<int, 32> PermMask;
+ PermMask.assign(Size, -1);
+ for (int i = 0; i < Size; ++i) {
+ if (Mask[i] < 0)
+ continue;
+
+ assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
+
+ PermMask[i] =
+ 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
+ }
+ return DAG.getVectorShuffle(
+ VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
+ DL, VT, V1, V2),
+ DAG.getUNDEF(VT), PermMask);
+ }
+
+ return SDValue();
+}
+
+/// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
+///
+/// This is the basis function for the 2-lane 64-bit shuffles as we have full
+/// support for floating point shuffles but not integer shuffles. These
+/// instructions will incur a domain crossing penalty on some chips though so
+/// it is better to avoid lowering through this for integer vectors where
+/// possible.
+static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ SDLoc DL(Op);
+ assert(Op.getSimpleValueType() == MVT::v2f64 && "Bad shuffle type!");
+ assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+ ArrayRef<int> Mask = SVOp->getMask();
+ assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
+
+ if (isSingleInputShuffleMask(Mask)) {
+ // Use low duplicate instructions for masks that match their pattern.
+ if (Subtarget->hasSSE3())
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 0}))
+ return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, V1);
+
+    // Straight shuffle of a single input vector. Simulate this by using the
+    // single input as both of the "inputs" to this instruction.
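+    // For example, the mask <1, 1> yields SHUFPDMask 0b11, duplicating the
+    // high element.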
+ unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
+
+ if (Subtarget->hasAVX()) {
+ // If we have AVX, we can use VPERMILPS which will allow folding a load
+ // into the shuffle.
+ return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
+ DAG.getConstant(SHUFPDMask, DL, MVT::i8));
+ }
+
+ return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V1,
+ DAG.getConstant(SHUFPDMask, DL, MVT::i8));
+ }
+ assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!");
+ assert(Mask[1] >= 2 && "Non-canonicalized blend!");
+
+ // If we have a single input, insert that into V1 if we can do so cheaply.
+ if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
+ if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
+ DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG))
+ return Insertion;
+ // Try inverting the insertion since for v2 masks it is easy to do and we
+ // can't reliably sort the mask one way or the other.
+ int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
+ Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
+ if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
+ DL, MVT::v2f64, V2, V1, InverseMask, Subtarget, DAG))
+ return Insertion;
+ }
+
+ // Try to use one of the special instruction patterns to handle two common
+ // blend patterns if a zero-blend above didn't work.
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
+ isShuffleEquivalent(V1, V2, Mask, {1, 3}))
+ if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
+ // We can either use a special instruction to load over the low double or
+ // to move just the low double.
+ return DAG.getNode(
+ isShuffleFoldableLoad(V1S) ? X86ISD::MOVLPD : X86ISD::MOVSD,
+ DL, MVT::v2f64, V2,
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
+
+ if (Subtarget->hasSSE41())
+ if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
+ Subtarget, DAG))
+ return Blend;
+
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (SDValue V =
+ lowerVectorShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
+ return V;
+
+ unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
+ return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
+ DAG.getConstant(SHUFPDMask, DL, MVT::i8));
+}
+
+/// \brief Handle lowering of 2-lane 64-bit integer shuffles.
+///
+/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
+/// the integer unit to minimize domain crossing penalties. However, for blends
+/// it falls back to the floating point shuffle operation with appropriate bit
+/// casting.
+static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ SDLoc DL(Op);
+ assert(Op.getSimpleValueType() == MVT::v2i64 && "Bad shuffle type!");
+ assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+ ArrayRef<int> Mask = SVOp->getMask();
+ assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
+
+ if (isSingleInputShuffleMask(Mask)) {
+ // Check for being able to broadcast a single element.
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v2i64, V1,
+ Mask, Subtarget, DAG))
+ return Broadcast;
+
+ // Straight shuffle of a single input vector. For everything from SSE2
+ // onward this has a single fast instruction with no scary immediates.
+ // We have to map the mask as it is actually a v4i32 shuffle instruction.
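+    // For example, the v2i64 mask <1, 1> becomes the v4i32 PSHUFD mask
+    // <2, 3, 2, 3>.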
+ V1 = DAG.getBitcast(MVT::v4i32, V1);
+ int WidenedMask[4] = {
+ std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
+ std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
+ return DAG.getBitcast(
+ MVT::v2i64,
+ DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
+ getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
+ }
+ assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
+ assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
+ assert(Mask[0] < 2 && "We sort V1 to be the first input.");
+ assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
+
+  // If we have a blend of two PACKUS operations and the blend aligns with the
+  // low and high halves, we can just merge the PACKUS operations. This is
+  // particularly important as it lets us merge shuffles that this routine
+  // itself creates.
+ auto GetPackNode = [](SDValue V) {
+ while (V.getOpcode() == ISD::BITCAST)
+ V = V.getOperand(0);
+
+ return V.getOpcode() == X86ISD::PACKUS ? V : SDValue();
+ };
+ if (SDValue V1Pack = GetPackNode(V1))
+ if (SDValue V2Pack = GetPackNode(V2))
+ return DAG.getBitcast(MVT::v2i64,
+ DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8,
+ Mask[0] == 0 ? V1Pack.getOperand(0)
+ : V1Pack.getOperand(1),
+ Mask[1] == 2 ? V2Pack.getOperand(0)
+ : V2Pack.getOperand(1)));
+
+ // Try to use shift instructions.
+ if (SDValue Shift =
+ lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask, DAG))
+ return Shift;
+
+ // When loading a scalar and then shuffling it into a vector we can often do
+ // the insertion cheaply.
+ if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
+ DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
+ return Insertion;
+ // Try inverting the insertion since for v2 masks it is easy to do and we
+ // can't reliably sort the mask one way or the other.
+ int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
+ if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
+ DL, MVT::v2i64, V2, V1, InverseMask, Subtarget, DAG))
+ return Insertion;
+
+ // We have different paths for blend lowering, but they all must use the
+ // *exact* same predicate.
+ bool IsBlendSupported = Subtarget->hasSSE41();
+ if (IsBlendSupported)
+ if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
+ Subtarget, DAG))
+ return Blend;
+
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (SDValue V =
+ lowerVectorShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
+ return V;
+
+ // Try to use byte rotation instructions.
+  // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
+ if (Subtarget->hasSSSE3())
+ if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
+ DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
+ return Rotate;
+
+ // If we have direct support for blends, we should lower by decomposing into
+ // a permute. That will be faster than the domain cross.
+ if (IsBlendSupported)
+ return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2,
+ Mask, DAG);
+
+ // We implement this with SHUFPD which is pretty lame because it will likely
+ // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
+  // However, all the alternatives still cost more cycles and newer chips don't
+ // have this problem. It would be really nice if x86 had better shuffles here.
+ V1 = DAG.getBitcast(MVT::v2f64, V1);
+ V2 = DAG.getBitcast(MVT::v2f64, V2);
+ return DAG.getBitcast(MVT::v2i64,
+ DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
+}
+
+/// \brief Test whether this can be lowered with a single SHUFPS instruction.
+///
+/// This is used to disable more specialized lowerings when the shufps lowering
+/// will happen to be efficient.
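+///
+/// For example, <0, 1, 4, 5> needs only one SHUFPS: the low half reads V1 and
+/// the high half reads V2. <0, 4, 1, 5> mixes inputs within a half, so it
+/// does not.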
+static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
+ // This routine only handles 128-bit shufps.
+ assert(Mask.size() == 4 && "Unsupported mask size!");
+
+ // To lower with a single SHUFPS we need to have the low half and high half
+ // each requiring a single input.
+ if (Mask[0] != -1 && Mask[1] != -1 && (Mask[0] < 4) != (Mask[1] < 4))
+ return false;
+ if (Mask[2] != -1 && Mask[3] != -1 && (Mask[2] < 4) != (Mask[3] < 4))
+ return false;
+
+ return true;
+}
+
+/// \brief Lower a vector shuffle using the SHUFPS instruction.
+///
+/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
+/// It makes no assumptions about whether this is the *best* lowering, it simply
+/// uses it.
+static SDValue lowerVectorShuffleWithSHUFPS(SDLoc DL, MVT VT,
+ ArrayRef<int> Mask, SDValue V1,
+ SDValue V2, SelectionDAG &DAG) {
+ SDValue LowV = V1, HighV = V2;
+ int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};
+
+ int NumV2Elements =
+ std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
+
+ if (NumV2Elements == 1) {
+ int V2Index =
+ std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) -
+ Mask.begin();
+
+ // Compute the index adjacent to V2Index and in the same half by toggling
+ // the low bit.
+ int V2AdjIndex = V2Index ^ 1;
+
+ if (Mask[V2AdjIndex] == -1) {
+ // Handles all the cases where we have a single V2 element and an undef.
+ // This will only ever happen in the high lanes because we commute the
+ // vector otherwise.
+ if (V2Index < 2)
+ std::swap(LowV, HighV);
+ NewMask[V2Index] -= 4;
+ } else {
+ // Handle the case where the V2 element ends up adjacent to a V1 element.
+ // To make this work, blend them together as the first step.
+ int V1Index = V2AdjIndex;
+ int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
+ V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
+ getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
+
+ // Now proceed to reconstruct the final blend as we have the necessary
+ // high or low half formed.
+ if (V2Index < 2) {
+ LowV = V2;
+ HighV = V1;
+ } else {
+ HighV = V2;
+ }
+ NewMask[V1Index] = 2; // We put the V1 element in V2[2].
+ NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
+ }
+ } else if (NumV2Elements == 2) {
+ if (Mask[0] < 4 && Mask[1] < 4) {
+ // Handle the easy case where we have V1 in the low lanes and V2 in the
+ // high lanes.
+ NewMask[2] -= 4;
+ NewMask[3] -= 4;
+ } else if (Mask[2] < 4 && Mask[3] < 4) {
+ // We also handle the reversed case because this utility may get called
+ // when we detect a SHUFPS pattern but can't easily commute the shuffle to
+ // arrange things in the right direction.
+ NewMask[0] -= 4;
+ NewMask[1] -= 4;
+ HighV = V1;
+ LowV = V2;
+ } else {
+ // We have a mixture of V1 and V2 in both low and high lanes. Rather than
+ // trying to place elements directly, just blend them and set up the final
+ // shuffle to place them.
+
+ // The first two blend mask elements are for V1, the second two are for
+ // V2.
+ int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
+ Mask[2] < 4 ? Mask[2] : Mask[3],
+ (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
+ (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
+ V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
+ getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
+
+ // Now we do a normal shuffle of V1 by giving V1 as both operands to
+ // a blend.
+ LowV = HighV = V1;
+ NewMask[0] = Mask[0] < 4 ? 0 : 2;
+ NewMask[1] = Mask[0] < 4 ? 2 : 0;
+ NewMask[2] = Mask[2] < 4 ? 1 : 3;
+ NewMask[3] = Mask[2] < 4 ? 3 : 1;
+ }
+ }
+ return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
+ getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
+}
+
+/// \brief Lower 4-lane 32-bit floating point shuffles.
+///
+/// Uses instructions exclusively from the floating point unit to minimize
+/// domain crossing penalties, as these are sufficient to implement all v4f32
+/// shuffles.
+static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ SDLoc DL(Op);
+ assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!");
+ assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+ ArrayRef<int> Mask = SVOp->getMask();
+ assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
+
+ int NumV2Elements =
+ std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
+
+ if (NumV2Elements == 0) {
+ // Check for being able to broadcast a single element.
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4f32, V1,
+ Mask, Subtarget, DAG))
+ return Broadcast;
+
+ // Use even/odd duplicate instructions for masks that match their pattern.
+ if (Subtarget->hasSSE3()) {
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
+ return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
+ if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3}))
+ return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
+ }
+
+ if (Subtarget->hasAVX()) {
+ // If we have AVX, we can use VPERMILPS which will allow folding a load
+ // into the shuffle.
+ return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
+ getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
+ }
+
+ // Otherwise, use a straight shuffle of a single input vector. We pass the
+ // input vector to both operands to simulate this with a SHUFPS.
+ return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
+ getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
+ }
+
+ // There are special ways we can lower some single-element blends. However, we
+ // have custom ways we can lower more complex single-element blends below that
+ // we defer to if both this and BLENDPS fail to match, so restrict this to
+ // when the V2 input is targeting element 0 of the mask -- that is the fast
+ // case here.
+ if (NumV2Elements == 1 && Mask[0] >= 4)
+ if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v4f32, V1, V2,
+ Mask, Subtarget, DAG))
+ return V;
+
+ if (Subtarget->hasSSE41()) {
+ if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
+ Subtarget, DAG))
+ return Blend;
+
+ // Use INSERTPS if we can complete the shuffle efficiently.
+ if (SDValue V = lowerVectorShuffleAsInsertPS(Op, V1, V2, Mask, DAG))
+ return V;
+
+ if (!isSingleSHUFPSMask(Mask))
+ if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute(
+ DL, MVT::v4f32, V1, V2, Mask, DAG))
+ return BlendPerm;
+ }
+
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (SDValue V =
+ lowerVectorShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
+ return V;
+
+ // Otherwise fall back to a SHUFPS lowering strategy.
+ return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
+}
+
+/// \brief Lower 4-lane i32 vector shuffles.
+///
+/// We try to handle these with integer-domain shuffles where we can, but for
+/// blends we use the floating point domain blend instructions.
+static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ SDLoc DL(Op);
+ assert(Op.getSimpleValueType() == MVT::v4i32 && "Bad shuffle type!");
+ assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+ ArrayRef<int> Mask = SVOp->getMask();
+ assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
+
+ // Whenever we can lower this as a zext, that instruction is strictly faster
+ // than any alternative. It also allows us to fold memory operands into the
+ // shuffle in many cases.
+ if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2,
+ Mask, Subtarget, DAG))
+ return ZExt;
+
+ int NumV2Elements =
+ std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
+
+ if (NumV2Elements == 0) {
+ // Check for being able to broadcast a single element.
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i32, V1,
+ Mask, Subtarget, DAG))
+ return Broadcast;
+
+ // Straight shuffle of a single input vector. For everything from SSE2
+ // onward this has a single fast instruction with no scary immediates.
+ // We coerce the shuffle pattern to be compatible with UNPCK instructions
+ // but we aren't actually going to use the UNPCK instruction because doing
+ // so prevents folding a load into this instruction or making a copy.
+ const int UnpackLoMask[] = {0, 0, 1, 1};
+ const int UnpackHiMask[] = {2, 2, 3, 3};
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 1, 1}))
+ Mask = UnpackLoMask;
+ else if (isShuffleEquivalent(V1, V2, Mask, {2, 2, 3, 3}))
+ Mask = UnpackHiMask;
+
+ return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
+ getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
+ }
+
+ // Try to use shift instructions.
+ if (SDValue Shift =
+ lowerVectorShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, DAG))
+ return Shift;
+
+ // There are special ways we can lower some single-element blends.
+ if (NumV2Elements == 1)
+ if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v4i32, V1, V2,
+ Mask, Subtarget, DAG))
+ return V;
+
+ // We have different paths for blend lowering, but they all must use the
+ // *exact* same predicate.
+ bool IsBlendSupported = Subtarget->hasSSE41();
+ if (IsBlendSupported)
+ if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
+ Subtarget, DAG))
+ return Blend;
+
+ if (SDValue Masked =
+ lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask, DAG))
+ return Masked;
+
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (SDValue V =
+ lowerVectorShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
+ return V;
+
+ // Try to use byte rotation instructions.
+  // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
+ if (Subtarget->hasSSSE3())
+ if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
+ DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
+ return Rotate;
+
+ // If we have direct support for blends, we should lower by decomposing into
+ // a permute. That will be faster than the domain cross.
+ if (IsBlendSupported)
+ return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2,
+ Mask, DAG);
+
+ // Try to lower by permuting the inputs into an unpack instruction.
+ if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1,
+ V2, Mask, DAG))
+ return Unpack;
+
+ // We implement this with SHUFPS because it can blend from two vectors.
+ // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
+  // up the inputs, bypassing domain shift penalties that we would incur if we
+ // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
+ // relevant.
+ return DAG.getBitcast(
+ MVT::v4i32,
+ DAG.getVectorShuffle(MVT::v4f32, DL, DAG.getBitcast(MVT::v4f32, V1),
+ DAG.getBitcast(MVT::v4f32, V2), Mask));
+}
+
+/// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
+/// shuffle lowering, and the most complex part.
+///
+/// The lowering strategy is to try to form pairs of input lanes which are
+/// targeted at the same half of the final vector, and then use a dword shuffle
+/// to place them onto the right half, and finally unpack the paired lanes into
+/// their final position.
+///
+/// The exact breakdown of how to form these dword pairs and align them on the
+/// correct sides is really tricky. See the comments within the function for
+/// more of the details.
+///
+/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
+/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
+/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
+/// vector, form the analogous 128-bit 8-element Mask.
+static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
+ SDLoc DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
+ const X86Subtarget *Subtarget, SelectionDAG &DAG) {
+ assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
+ MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
+
+  assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
+ MutableArrayRef<int> LoMask = Mask.slice(0, 4);
+ MutableArrayRef<int> HiMask = Mask.slice(4, 4);
+
+ SmallVector<int, 4> LoInputs;
+ std::copy_if(LoMask.begin(), LoMask.end(), std::back_inserter(LoInputs),
+ [](int M) { return M >= 0; });
+ std::sort(LoInputs.begin(), LoInputs.end());
+ LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
+ SmallVector<int, 4> HiInputs;
+ std::copy_if(HiMask.begin(), HiMask.end(), std::back_inserter(HiInputs),
+ [](int M) { return M >= 0; });
+ std::sort(HiInputs.begin(), HiInputs.end());
+ HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
+ int NumLToL =
+ std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin();
+ int NumHToL = LoInputs.size() - NumLToL;
+ int NumLToH =
+ std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin();
+ int NumHToH = HiInputs.size() - NumLToH;
+ MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
+ MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
+ MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
+ MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
+
+ // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
+ // such inputs we can swap two of the dwords across the half mark and end up
+ // with <=2 inputs to each half in each half. Once there, we can fall through
+ // to the generic code below. For example:
+ //
+ // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
+ // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
+ //
+  // However, in some very rare cases we have a 1-into-3 or 3-into-1 on one
+  // half and an existing 2-into-2 on the other half. In this case we may have
+  // to pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
+  // 1-into-3, which could cause us to cycle endlessly fixing each side in
+  // turn.
+ // Fortunately, we don't have to handle anything but a 2-into-2 pattern
+ // because any other situation (including a 3-into-1 or 1-into-3 in the other
+ // half than the one we target for fixing) will be fixed when we re-enter this
+ // path. We will also combine away any sequence of PSHUFD instructions that
+ // result into a single instruction. Here is an example of the tricky case:
+ //
+ // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
+ // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
+ //
+ // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
+ //
+ // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
+ // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
+ //
+ // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
+ // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
+ //
+ // The result is fine to be handled by the generic logic.
+ auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
+ ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
+ int AOffset, int BOffset) {
+ assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
+ "Must call this with A having 3 or 1 inputs from the A half.");
+ assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
+ "Must call this with B having 1 or 3 inputs from the B half.");
+ assert(AToAInputs.size() + BToAInputs.size() == 4 &&
+ "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
+
+ bool ThreeAInputs = AToAInputs.size() == 3;
+
+    // Compute the index of the dword with only one word among the three
+    // inputs in a half by taking the sum of the half with three inputs and
+    // subtracting the sum of the actual three inputs. The difference is the
+    // remaining slot.
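+    // For example, if the three low-half inputs are {0, 1, 3}, then 0+1+2+3
+    // minus (0+1+3) leaves word 2, so dword 1 is the dword with only a
+    // single input.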
+ int ADWord, BDWord;
+ int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
+ int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
+ int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
+ ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
+ int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
+ int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
+ int TripleNonInputIdx =
+        TripleInputSum -
+        std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
+ TripleDWord = TripleNonInputIdx / 2;
+
+ // We use xor with one to compute the adjacent DWord to whichever one the
+ // OneInput is in.
+ OneInputDWord = (OneInput / 2) ^ 1;
+
+ // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
+ // and BToA inputs. If there is also such a problem with the BToB and AToB
+ // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
+ // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
+ // is essential that we don't *create* a 3<-1 as then we might oscillate.
+ if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
+      // Compute how many inputs will be flipped by swapping these DWords. We
+      // need to balance this to ensure we don't form a 3-1 shuffle in the
+      // other half.
+ int NumFlippedAToBInputs =
+ std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
+ std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
+ int NumFlippedBToBInputs =
+ std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
+ std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
+ if ((NumFlippedAToBInputs == 1 &&
+ (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
+ (NumFlippedBToBInputs == 1 &&
+ (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
+ // We choose whether to fix the A half or B half based on whether that
+ // half has zero flipped inputs. At zero, we may not be able to fix it
+ // with that half. We also bias towards fixing the B half because that
+ // will more commonly be the high half, and we have to bias one way.
+ auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
+ ArrayRef<int> Inputs) {
+ int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
+ bool IsFixIdxInput = std::find(Inputs.begin(), Inputs.end(),
+ PinnedIdx ^ 1) != Inputs.end();
+ // Determine whether the free index is in the flipped dword or the
+ // unflipped dword based on where the pinned index is. We use this bit
+ // in an xor to conditionally select the adjacent dword.
+ int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
+ bool IsFixFreeIdxInput = std::find(Inputs.begin(), Inputs.end(),
+ FixFreeIdx) != Inputs.end();
+ if (IsFixIdxInput == IsFixFreeIdxInput)
+ FixFreeIdx += 1;
+ IsFixFreeIdxInput = std::find(Inputs.begin(), Inputs.end(),
+ FixFreeIdx) != Inputs.end();
+ assert(IsFixIdxInput != IsFixFreeIdxInput &&
+ "We need to be changing the number of flipped inputs!");
+ int PSHUFHalfMask[] = {0, 1, 2, 3};
+ std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
+ V = DAG.getNode(FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
+ MVT::v8i16, V,
+ getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
+
+ for (int &M : Mask)
+ if (M != -1 && M == FixIdx)
+ M = FixFreeIdx;
+ else if (M != -1 && M == FixFreeIdx)
+ M = FixIdx;
+ };
+ if (NumFlippedBToBInputs != 0) {
+ int BPinnedIdx =
+ BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
+ FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
+ } else {
+ assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
+ int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
+ FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
+ }
+ }
+ }
+
+ int PSHUFDMask[] = {0, 1, 2, 3};
+ PSHUFDMask[ADWord] = BDWord;
+ PSHUFDMask[BDWord] = ADWord;
+ V = DAG.getBitcast(
+ VT,
+ DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
+ getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
+
+ // Adjust the mask to match the new locations of A and B.
+ for (int &M : Mask)
+ if (M != -1 && M/2 == ADWord)
+ M = 2 * BDWord + M % 2;
+ else if (M != -1 && M/2 == BDWord)
+ M = 2 * ADWord + M % 2;
+
+ // Recurse back into this routine to re-compute state now that this isn't
+ // a 3 and 1 problem.
+ return lowerV8I16GeneralSingleInputVectorShuffle(DL, VT, V, Mask, Subtarget,
+ DAG);
+ };
+ if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
+ return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
+ else if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
+ return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
+
+ // At this point there are at most two inputs to the low and high halves from
+ // each half. That means the inputs can always be grouped into dwords and
+ // those dwords can then be moved to the correct half with a dword shuffle.
+ // We use at most one low and one high word shuffle to collect these paired
+ // inputs into dwords, and finally a dword shuffle to place them.
+ int PSHUFLMask[4] = {-1, -1, -1, -1};
+ int PSHUFHMask[4] = {-1, -1, -1, -1};
+ int PSHUFDMask[4] = {-1, -1, -1, -1};
+
+ // First fix the masks for all the inputs that are staying in their
+ // original halves. This will then dictate the targets of the cross-half
+ // shuffles.
+ auto fixInPlaceInputs =
+ [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
+ MutableArrayRef<int> SourceHalfMask,
+ MutableArrayRef<int> HalfMask, int HalfOffset) {
+ if (InPlaceInputs.empty())
+ return;
+ if (InPlaceInputs.size() == 1) {
+ SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
+ InPlaceInputs[0] - HalfOffset;
+ PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
+ return;
+ }
+ if (IncomingInputs.empty()) {
+ // Just fix all of the in place inputs.
+ for (int Input : InPlaceInputs) {
+ SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
+ PSHUFDMask[Input / 2] = Input / 2;
+ }
+ return;
+ }
+
+ assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
+ SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
+ InPlaceInputs[0] - HalfOffset;
+ // Put the second input next to the first so that they are packed into
+ // a dword. We find the adjacent index by toggling the low bit.
+ int AdjIndex = InPlaceInputs[0] ^ 1;
+ SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
+        std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1],
+                     AdjIndex);
+ PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
+ };
+ fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
+ fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
+
+ // Now gather the cross-half inputs and place them into a free dword of
+ // their target half.
+ // FIXME: This operation could almost certainly be simplified dramatically to
+ // look more like the 3-1 fixing operation.
+ auto moveInputsToRightHalf = [&PSHUFDMask](
+ MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
+ MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
+ MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
+ int DestOffset) {
+ auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
+ return SourceHalfMask[Word] != -1 && SourceHalfMask[Word] != Word;
+ };
+ auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
+ int Word) {
+ int LowWord = Word & ~1;
+ int HighWord = Word | 1;
+ return isWordClobbered(SourceHalfMask, LowWord) ||
+ isWordClobbered(SourceHalfMask, HighWord);
+ };
+
+ if (IncomingInputs.empty())
+ return;
+
+ if (ExistingInputs.empty()) {
+ // Map any dwords with inputs from them into the right half.
+ for (int Input : IncomingInputs) {
+ // If the source half mask maps over the inputs, turn those into
+ // swaps and use the swapped lane.
+ if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
+ if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] == -1) {
+ SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
+ Input - SourceOffset;
+ // We have to swap the uses in our half mask in one sweep.
+ for (int &M : HalfMask)
+ if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
+ M = Input;
+ else if (M == Input)
+ M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
+ } else {
+ assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
+ Input - SourceOffset &&
+ "Previous placement doesn't match!");
+ }
+ // Note that this correctly re-maps both when we do a swap and when
+ // we observe the other side of the swap above. We rely on that to
+ // avoid swapping the members of the input list directly.
+ Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
+ }
+
+ // Map the input's dword into the correct half.
+ if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] == -1)
+ PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
+ else
+ assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
+ Input / 2 &&
+ "Previous placement doesn't match!");
+ }
+
+ // And just directly shift any other-half mask elements to be same-half
+ // as we will have mirrored the dword containing the element into the
+ // same position within that half.
+ for (int &M : HalfMask)
+ if (M >= SourceOffset && M < SourceOffset + 4) {
+ M = M - SourceOffset + DestOffset;
+ assert(M >= 0 && "This should never wrap below zero!");
+ }
+ return;
+ }
+
+ // Ensure we have the input in a viable dword of its current half. This
+ // is particularly tricky because the original position may be clobbered
+ // by inputs being moved and *staying* in that half.
+ if (IncomingInputs.size() == 1) {
+ if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
+ int InputFixed = std::find(std::begin(SourceHalfMask),
+ std::end(SourceHalfMask), -1) -
+ std::begin(SourceHalfMask) + SourceOffset;
+ SourceHalfMask[InputFixed - SourceOffset] =
+ IncomingInputs[0] - SourceOffset;
+ std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
+ InputFixed);
+ IncomingInputs[0] = InputFixed;
+ }
+ } else if (IncomingInputs.size() == 2) {
+ if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
+ isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
+ // We have two non-adjacent or clobbered inputs we need to extract from
+ // the source half. To do this, we need to map them into some adjacent
+ // dword slot in the source mask.
+ int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
+ IncomingInputs[1] - SourceOffset};
+
+ // If there is a free slot in the source half mask adjacent to one of
+ // the inputs, place the other input in it. We use (Index XOR 1) to
+ // compute an adjacent index.
+ if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
+ SourceHalfMask[InputsFixed[0] ^ 1] == -1) {
+ SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
+ SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
+ InputsFixed[1] = InputsFixed[0] ^ 1;
+ } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
+ SourceHalfMask[InputsFixed[1] ^ 1] == -1) {
+ SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
+ SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
+ InputsFixed[0] = InputsFixed[1] ^ 1;
+ } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] == -1 &&
+ SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] == -1) {
+ // The two inputs are in the same DWord but it is clobbered and the
+ // adjacent DWord isn't used at all. Move both inputs to the free
+ // slot.
+ SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
+ SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
+ InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
+ InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
+ } else {
+ // The only way we hit this point is if there is no clobbering
+ // (because there are no off-half inputs to this half) and there is no
+ // free slot adjacent to one of the inputs. In this case, we have to
+ // swap an input with a non-input.
+ for (int i = 0; i < 4; ++i)
+ assert((SourceHalfMask[i] == -1 || SourceHalfMask[i] == i) &&
+ "We can't handle any clobbers here!");
+ assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
+ "Cannot have adjacent inputs here!");
+
+ SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
+ SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
+
+ // We also have to update the final source mask in this case because
+ // it may need to undo the above swap.
+ for (int &M : FinalSourceHalfMask)
+ if (M == (InputsFixed[0] ^ 1) + SourceOffset)
+ M = InputsFixed[1] + SourceOffset;
+ else if (M == InputsFixed[1] + SourceOffset)
+ M = (InputsFixed[0] ^ 1) + SourceOffset;
+
+ InputsFixed[1] = InputsFixed[0] ^ 1;
+ }
+
+ // Point everything at the fixed inputs.
+ for (int &M : HalfMask)
+ if (M == IncomingInputs[0])
+ M = InputsFixed[0] + SourceOffset;
+ else if (M == IncomingInputs[1])
+ M = InputsFixed[1] + SourceOffset;
+
+ IncomingInputs[0] = InputsFixed[0] + SourceOffset;
+ IncomingInputs[1] = InputsFixed[1] + SourceOffset;
+ }
+ } else {
+ llvm_unreachable("Unhandled input size!");
+ }
+
+ // Now hoist the DWord down to the right half.
+ int FreeDWord = (PSHUFDMask[DestOffset / 2] == -1 ? 0 : 1) + DestOffset / 2;
+ assert(PSHUFDMask[FreeDWord] == -1 && "DWord not free");
+ PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
+ for (int &M : HalfMask)
+ for (int Input : IncomingInputs)
+ if (M == Input)
+ M = FreeDWord * 2 + Input % 2;
+ };
+ moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
+ /*SourceOffset*/ 4, /*DestOffset*/ 0);
+ moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
+ /*SourceOffset*/ 0, /*DestOffset*/ 4);
+
+ // Now enact all the shuffles we've computed to move the inputs into their
+ // target half.
+ if (!isNoopShuffleMask(PSHUFLMask))
+ V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
+ getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
+ if (!isNoopShuffleMask(PSHUFHMask))
+ V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
+ getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
+ if (!isNoopShuffleMask(PSHUFDMask))
+ V = DAG.getBitcast(
+ VT,
+ DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
+ getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
+
+ // At this point, each half should contain all its inputs, and we can then
+ // just shuffle them into their final position.
+ assert(std::count_if(LoMask.begin(), LoMask.end(),
+ [](int M) { return M >= 4; }) == 0 &&
+ "Failed to lift all the high half inputs to the low mask!");
+ assert(std::count_if(HiMask.begin(), HiMask.end(),
+ [](int M) { return M >= 0 && M < 4; }) == 0 &&
+ "Failed to lift all the low half inputs to the high mask!");
+
+ // Do a half shuffle for the low mask.
+ if (!isNoopShuffleMask(LoMask))
+ V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
+ getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
+
+ // Do a half shuffle with the high mask after shifting its values down.
+ for (int &M : HiMask)
+ if (M >= 0)
+ M -= 4;
+ if (!isNoopShuffleMask(HiMask))
+ V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
+ getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
+
+ return V;
+}
+
+/// \brief Helper to form a PSHUFB-based shuffle+blend.
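+///
+/// As an illustrative example, for a v8i16 shuffle (Size = 8, Scale = 2) a
+/// mask entry of 9 (element 1 of V2) expands to byte indices {2, 3} in the
+/// V2 PSHUFB mask and to 0x80 (zero the byte) in the V1 PSHUFB mask.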
+static SDValue lowerVectorShuffleAsPSHUFB(SDLoc DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ SelectionDAG &DAG, bool &V1InUse,
+ bool &V2InUse) {
+ SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+ SDValue V1Mask[16];
+ SDValue V2Mask[16];
+ V1InUse = false;
+ V2InUse = false;
+
+ int Size = Mask.size();
+ int Scale = 16 / Size;
+ for (int i = 0; i < 16; ++i) {
+ if (Mask[i / Scale] == -1) {
+ V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8);
+ } else {
+ const int ZeroMask = 0x80;
+ int V1Idx = Mask[i / Scale] < Size ? Mask[i / Scale] * Scale + i % Scale
+ : ZeroMask;
+ int V2Idx = Mask[i / Scale] < Size
+ ? ZeroMask
+ : (Mask[i / Scale] - Size) * Scale + i % Scale;
+ if (Zeroable[i / Scale])
+ V1Idx = V2Idx = ZeroMask;
+ V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
+ V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
+ V1InUse |= (ZeroMask != V1Idx);
+ V2InUse |= (ZeroMask != V2Idx);
+ }
+ }
+
+ if (V1InUse)
+ V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
+ DAG.getBitcast(MVT::v16i8, V1),
+ DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V1Mask));
+ if (V2InUse)
+ V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
+ DAG.getBitcast(MVT::v16i8, V2),
+ DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V2Mask));
+
+ // If we need shuffled inputs from both, blend the two.
+ SDValue V;
+ if (V1InUse && V2InUse)
+ V = DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
+ else
+ V = V1InUse ? V1 : V2;
+
+ // Cast the result back to the correct type.
+ return DAG.getBitcast(VT, V);
+}
+
+/// \brief Generic lowering of 8-lane i16 shuffles.
+///
+/// This handles both single-input shuffles and combined shuffle/blends with
+/// two inputs. The single input shuffles are immediately delegated to
+/// a dedicated lowering routine.
+///
+/// The blends are lowered in one of three fundamental ways. If there are few
+/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
+/// of the input is significantly cheaper when lowered as an interleaving of
+/// the two inputs, try to interleave them. Otherwise, blend the low and high
+/// halves of the inputs separately (making them have relatively few inputs)
+/// and then concatenate them.
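+///
+/// For example, the two-input mask <0, 8, 1, 9, 2, 10, 3, 11> is an exact
+/// interleaving of the low halves and maps directly to a single PUNPCKLWD.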
+static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ SDLoc DL(Op);
+ assert(Op.getSimpleValueType() == MVT::v8i16 && "Bad shuffle type!");
+ assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+ ArrayRef<int> OrigMask = SVOp->getMask();
+ int MaskStorage[8] = {OrigMask[0], OrigMask[1], OrigMask[2], OrigMask[3],
+ OrigMask[4], OrigMask[5], OrigMask[6], OrigMask[7]};
+ MutableArrayRef<int> Mask(MaskStorage);
+
+ assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
+
+ // Whenever we can lower this as a zext, that instruction is strictly faster
+ // than any alternative.
+ if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
+ DL, MVT::v8i16, V1, V2, OrigMask, Subtarget, DAG))
+ return ZExt;
+
+ auto isV1 = [](int M) { return M >= 0 && M < 8; };
+ (void)isV1;
+ auto isV2 = [](int M) { return M >= 8; };
+
+ int NumV2Inputs = std::count_if(Mask.begin(), Mask.end(), isV2);
+
+ if (NumV2Inputs == 0) {
+ // Check for being able to broadcast a single element.
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i16, V1,
+ Mask, Subtarget, DAG))
+ return Broadcast;
+
+ // Try to use shift instructions.
+ if (SDValue Shift =
+ lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask, DAG))
+ return Shift;
+
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (SDValue V =
+ lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
+ return V;
+
+ // Try to use byte rotation instructions.
+ if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V1,
+ Mask, Subtarget, DAG))
+ return Rotate;
+
+ return lowerV8I16GeneralSingleInputVectorShuffle(DL, MVT::v8i16, V1, Mask,
+ Subtarget, DAG);
+ }
+
+ assert(std::any_of(Mask.begin(), Mask.end(), isV1) &&
+ "All single-input shuffles should be canonicalized to be V1-input "
+ "shuffles.");
+
+ // Try to use shift instructions.
+ if (SDValue Shift =
+ lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, DAG))
+ return Shift;
+
+ // See if we can use SSE4A Extraction / Insertion.
+ if (Subtarget->hasSSE4A())
+    if (SDValue V =
+            lowerVectorShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask, DAG))
+ return V;
+
+ // There are special ways we can lower some single-element blends.
+ if (NumV2Inputs == 1)
+ if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v8i16, V1, V2,
+ Mask, Subtarget, DAG))
+ return V;
+
+ // We have different paths for blend lowering, but they all must use the
+ // *exact* same predicate.
+ bool IsBlendSupported = Subtarget->hasSSE41();
+ if (IsBlendSupported)
+ if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
+ Subtarget, DAG))
+ return Blend;
+
+ if (SDValue Masked =
+ lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask, DAG))
+ return Masked;
+
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (SDValue V =
+ lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
+ return V;
+
+ // Try to use byte rotation instructions.
+ if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
+ DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
+ return Rotate;
+
+ if (SDValue BitBlend =
+ lowerVectorShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
+ return BitBlend;
+
+ if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1,
+ V2, Mask, DAG))
+ return Unpack;
+
+ // If we can't directly blend but can use PSHUFB, that will be better as it
+ // can both shuffle and set up the inefficient blend.
+ if (!IsBlendSupported && Subtarget->hasSSSE3()) {
+ bool V1InUse, V2InUse;
+ return lowerVectorShuffleAsPSHUFB(DL, MVT::v8i16, V1, V2, Mask, DAG,
+ V1InUse, V2InUse);
+ }
+
+ // We can always bit-blend if we have to so the fallback strategy is to
+ // decompose into single-input permutes and blends.
+ return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2,
+ Mask, DAG);
+}
+
+/// \brief Check whether a compaction lowering can be done by dropping even
+/// elements and compute how many times even elements must be dropped.
+///
+/// This handles shuffles which take every (2^N)th element, i.e. shuffles
+/// whose stride is a power of two. Example shuffle masks:
+///
+/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
+/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
+/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
+/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
+/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
+///
+/// Any of these lanes can of course be undef.
+///
+/// This routine only supports N <= 3.
+/// FIXME: Evaluate whether either AVX or AVX-512 has any opportunities here
+/// for larger N.
+///
+/// \returns N above, or the number of times even elements must be dropped if
+/// there is such a number. Otherwise returns zero.
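+///
+/// For example, the first N = 1 mask above is single-input, so the modulus is
+/// 16 and entry i = 9 must equal (9 * 2) % 16 = 2, which it does.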
+static int canLowerByDroppingEvenElements(ArrayRef<int> Mask) {
+ // Figure out whether we're looping over two inputs or just one.
+ bool IsSingleInput = isSingleInputShuffleMask(Mask);
+
+ // The modulus for the shuffle vector entries is based on whether this is
+ // a single input or not.
+ int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
+ assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
+ "We should only be called with masks with a power-of-2 size!");
+
+ uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
+
+ // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
+ // and 2^3 simultaneously. This is because we may have ambiguity with
+ // partially undef inputs.
+ bool ViableForN[3] = {true, true, true};
+
+ for (int i = 0, e = Mask.size(); i < e; ++i) {
+ // Ignore undef lanes, we'll optimistically collapse them to the pattern we
+ // want.
+ if (Mask[i] == -1)
+ continue;
+
+ bool IsAnyViable = false;
+ for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
+ if (ViableForN[j]) {
+ uint64_t N = j + 1;
+
+ // The shuffle mask must be equal to (i * 2^N) % M.
+ if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
+ IsAnyViable = true;
+ else
+ ViableForN[j] = false;
+ }
+ // Early exit if we exhaust the possible powers of two.
+ if (!IsAnyViable)
+ break;
+ }
+
+ for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
+ if (ViableForN[j])
+ return j + 1;
+
+ // Return 0 as there is no viable power of two.
+ return 0;
+}
+
+/// \brief Generic lowering of v16i8 shuffles.
+///
+/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
+/// detect any complexity reducing interleaving. If that doesn't help, it uses
+/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
+/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
+/// back together.
+static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ SDLoc DL(Op);
+ assert(Op.getSimpleValueType() == MVT::v16i8 && "Bad shuffle type!");
+ assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+ ArrayRef<int> Mask = SVOp->getMask();
+ assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
+
+ // Try to use shift instructions.
+ if (SDValue Shift =
+ lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask, DAG))
+ return Shift;
+
+ // Try to use byte rotation instructions.
+ if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
+ DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
+ return Rotate;
+
+ // Try to use a zext lowering.
+ if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
+ DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
+ return ZExt;
+
+ // See if we can use SSE4A Extraction / Insertion.
+ if (Subtarget->hasSSE4A())
+    if (SDValue V =
+            lowerVectorShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask, DAG))
+ return V;
+
+ int NumV2Elements =
+ std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 16; });
+
+ // For single-input shuffles, there are some nicer lowering tricks we can use.
+ if (NumV2Elements == 0) {
+ // Check for being able to broadcast a single element.
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i8, V1,
+ Mask, Subtarget, DAG))
+ return Broadcast;
+
+ // Check whether we can widen this to an i16 shuffle by duplicating bytes.
+ // Notably, this handles splat and partial-splat shuffles more efficiently.
+ // However, it only makes sense if the pre-duplication shuffle simplifies
+ // things significantly. Currently, this means we need to be able to
+ // express the pre-duplication shuffle as an i16 shuffle.
+ //
+ // FIXME: We should check for other patterns which can be widened into an
+ // i16 shuffle as well.
+ auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
+ for (int i = 0; i < 16; i += 2)
+ if (Mask[i] != -1 && Mask[i + 1] != -1 && Mask[i] != Mask[i + 1])
+ return false;
+
+ return true;
+ };
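+    // For example, a mask such as <5, 5, 13, 13, ...> can be widened, since
+    // each even/odd pair of bytes requests a single source byte.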
+ auto tryToWidenViaDuplication = [&]() -> SDValue {
+ if (!canWidenViaDuplication(Mask))
+ return SDValue();
+ SmallVector<int, 4> LoInputs;
+ std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(LoInputs),
+ [](int M) { return M >= 0 && M < 8; });
+ std::sort(LoInputs.begin(), LoInputs.end());
+ LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
+ LoInputs.end());
+ SmallVector<int, 4> HiInputs;
+ std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(HiInputs),
+ [](int M) { return M >= 8; });
+ std::sort(HiInputs.begin(), HiInputs.end());
+ HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
+ HiInputs.end());
+
+ bool TargetLo = LoInputs.size() >= HiInputs.size();
+ ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
+ ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
+
+ int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
+ SmallDenseMap<int, int, 8> LaneMap;
+ for (int I : InPlaceInputs) {
+ PreDupI16Shuffle[I/2] = I/2;
+ LaneMap[I] = I;
+ }
+ int j = TargetLo ? 0 : 4, je = j + 4;
+ for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
+        // Check whether slot j already maps this input's word. This happens
+        // when two mask bytes come from the same word and we have already
+        // placed the first of them.
+ if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
+ // If we haven't yet mapped the input, search for a slot into which
+ // we can map it.
+ while (j < je && PreDupI16Shuffle[j] != -1)
+ ++j;
+
+ if (j == je)
+            // We can't place the inputs into a single half with a simple
+            // i16 shuffle, so bail.
+ return SDValue();
+
+ // Map this input with the i16 shuffle.
+ PreDupI16Shuffle[j] = MovingInputs[i] / 2;
+ }
+
+ // Update the lane map based on the mapping we ended up with.
+ LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
+ }
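+      // Illustrative trace: if the only inputs are bytes 0 and 12 and we
+      // target the low half, word 0 stays in slot 0, word 6 (bytes 12/13)
+      // moves to slot 1, and LaneMap[12] becomes 2.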
+ V1 = DAG.getBitcast(
+ MVT::v16i8,
+ DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
+ DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
+
+ // Unpack the bytes to form the i16s that will be shuffled into place.
+ V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
+ MVT::v16i8, V1, V1);
+
+ int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
+ for (int i = 0; i < 16; ++i)
+ if (Mask[i] != -1) {
+ int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
+ assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
+ if (PostDupI16Shuffle[i / 2] == -1)
+ PostDupI16Shuffle[i / 2] = MappedMask;
+ else
+ assert(PostDupI16Shuffle[i / 2] == MappedMask &&
+                 "Conflicting entries in the original shuffle!");
+ }
+ return DAG.getBitcast(
+ MVT::v16i8,
+ DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
+ DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
+ };
+ if (SDValue V = tryToWidenViaDuplication())
+ return V;
+ }
+
+ if (SDValue Masked =
+ lowerVectorShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask, DAG))
+ return Masked;
+
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (SDValue V =
+ lowerVectorShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
+ return V;
+
+ // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
+ // with PSHUFB. It is important to do this before we attempt to generate any
+ // blends but after all of the single-input lowerings. If the single input
+ // lowerings can find an instruction sequence that is faster than a PSHUFB, we
+ // want to preserve that and we can DAG combine any longer sequences into
+ // a PSHUFB in the end. But once we start blending from multiple inputs,
+ // the complexity of DAG combining bad patterns back into PSHUFB is too high,
+ // and there are *very* few patterns that would actually be faster than the
+ // PSHUFB approach because of its ability to zero lanes.
+ //
+ // FIXME: The only exceptions to the above are blends which are exact
+ // interleavings with direct instructions supporting them. We currently don't
+ // handle those well here.
+ if (Subtarget->hasSSSE3()) {
+ bool V1InUse = false;
+ bool V2InUse = false;
+
+ SDValue PSHUFB = lowerVectorShuffleAsPSHUFB(DL, MVT::v16i8, V1, V2, Mask,
+ DAG, V1InUse, V2InUse);
+
+ // If both V1 and V2 are in use and we can use a direct blend or an unpack,
+ // do so. This avoids using them to handle blends-with-zero which is
+ // important as a single pshufb is significantly faster for that.
+ if (V1InUse && V2InUse) {
+ if (Subtarget->hasSSE41())
+ if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i8, V1, V2,
+ Mask, Subtarget, DAG))
+ return Blend;
+
+      // We can use an unpack to do the blending rather than an OR in some
+      // cases. Even though the OR may be (very marginally) more efficient, we
+      // prefer this lowering because there are common cases where part of
+ // the complexity of the shuffles goes away when we do the final blend as
+ // an unpack.
+ // FIXME: It might be worth trying to detect if the unpack-feeding
+ // shuffles will both be pshufb, in which case we shouldn't bother with
+ // this.
+ if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
+ DL, MVT::v16i8, V1, V2, Mask, DAG))
+ return Unpack;
+ }
+
+ return PSHUFB;
+ }
+
+ // There are special ways we can lower some single-element blends.
+ if (NumV2Elements == 1)
+ if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v16i8, V1, V2,
+ Mask, Subtarget, DAG))
+ return V;
+
+ if (SDValue BitBlend =
+ lowerVectorShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
+ return BitBlend;
+
+ // Check whether a compaction lowering can be done. This handles shuffles
+  // which take every (2^N)th element for some N. See the helper function for
+ // details.
+ //
+ // We special case these as they can be particularly efficiently handled with
+  // the PACKUSWB instruction on x86 and they show up in common patterns of
+ // rearranging bytes to truncate wide elements.
+ if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask)) {
+    // NumEvenDrops is the log2 of the element stride. Another way of
+ // thinking about it is that we need to drop the even elements this many
+ // times to get the original input.
+ bool IsSingleInput = isSingleInputShuffleMask(Mask);
+
+ // First we need to zero all the dropped bytes.
+ assert(NumEvenDrops <= 3 &&
+ "No support for dropping even elements more than 3 times.");
+ // We use the mask type to pick which bytes are preserved based on how many
+ // elements are dropped.
+ MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 };
+ SDValue ByteClearMask = DAG.getBitcast(
+ MVT::v16i8, DAG.getConstant(0xFF, DL, MaskVTs[NumEvenDrops - 1]));
+ V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
+ if (!IsSingleInput)
+ V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);
+
+ // Now pack things back together.
+ V1 = DAG.getBitcast(MVT::v8i16, V1);
+ V2 = IsSingleInput ? V1 : DAG.getBitcast(MVT::v8i16, V2);
+ SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);
+ for (int i = 1; i < NumEvenDrops; ++i) {
+ Result = DAG.getBitcast(MVT::v8i16, Result);
+ Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
+ }
+
+ return Result;
+ }
+
+ // Handle multi-input cases by blending single-input shuffles.
+ if (NumV2Elements > 0)
+ return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2,
+ Mask, DAG);
+
+ // The fallback path for single-input shuffles widens this into two v8i16
+ // vectors with unpacks, shuffles those, and then pulls them back together
+ // with a pack.
+ SDValue V = V1;
+
+ int LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
+ int HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
+ for (int i = 0; i < 16; ++i)
+ if (Mask[i] >= 0)
+ (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
+
+ SDValue Zero = getZeroVector(MVT::v8i16, Subtarget, DAG, DL);
+
+ SDValue VLoHalf, VHiHalf;
+ // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
+ // them out and avoid using UNPCK{L,H} to extract the elements of V as
+ // i16s.
+ if (std::none_of(std::begin(LoBlendMask), std::end(LoBlendMask),
+ [](int M) { return M >= 0 && M % 2 == 1; }) &&
+ std::none_of(std::begin(HiBlendMask), std::end(HiBlendMask),
+ [](int M) { return M >= 0 && M % 2 == 1; })) {
+ // Use a mask to drop the high bytes.
+ VLoHalf = DAG.getBitcast(MVT::v8i16, V);
+ VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
+ DAG.getConstant(0x00FF, DL, MVT::v8i16));
+
+ // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
+ VHiHalf = DAG.getUNDEF(MVT::v8i16);
+
+ // Squash the masks to point directly into VLoHalf.
+ for (int &M : LoBlendMask)
+ if (M >= 0)
+ M /= 2;
+ for (int &M : HiBlendMask)
+ if (M >= 0)
+ M /= 2;
+ } else {
+    // Otherwise just unpack the low half of V into VLoHalf and the high half
+    // into VHiHalf so that we can blend them as i16s.
+ VLoHalf = DAG.getBitcast(
+ MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
+ VHiHalf = DAG.getBitcast(
+ MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
+ }
+
+  SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf,
+                                     LoBlendMask);
+  SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf,
+                                     HiBlendMask);
+
+ return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
+}
+
+/// \brief Dispatching routine to lower various 128-bit x86 vector shuffles.
+///
+/// This routine breaks down the specific type of 128-bit shuffle and
+/// dispatches to the lowering routines accordingly.
+static SDValue lower128BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+ MVT VT, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ switch (VT.SimpleTy) {
+ case MVT::v2i64:
+ return lowerV2I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
+ case MVT::v2f64:
+ return lowerV2F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
+ case MVT::v4i32:
+ return lowerV4I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
+ case MVT::v4f32:
+ return lowerV4F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
+ case MVT::v8i16:
+ return lowerV8I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
+ case MVT::v16i8:
+ return lowerV16I8VectorShuffle(Op, V1, V2, Subtarget, DAG);
+
+ default:
+ llvm_unreachable("Unimplemented!");
+ }
+}
+
+/// \brief Helper function to test whether a shuffle mask could be
+/// simplified by widening the elements being shuffled.
+///
+/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
+/// leaves it in an unspecified state.
+///
+/// NOTE: This must handle normal vector shuffle masks and *target* vector
+/// shuffle masks. The latter have the special property of a '-2' representing
+/// a zero-ed lane of a vector.
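+///
+/// For example, <0, 1, 6, 7> widens to <0, 3> and <-1, 3, 4, 5> widens to
+/// <1, 2>, while <1, 2, 4, 5> cannot be widened because its first pair is
+/// misaligned.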
+static bool canWidenShuffleElements(ArrayRef<int> Mask,
+ SmallVectorImpl<int> &WidenedMask) {
+ for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
+    // If both elements are undef, it's trivial.
+ if (Mask[i] == SM_SentinelUndef && Mask[i + 1] == SM_SentinelUndef) {
+ WidenedMask.push_back(SM_SentinelUndef);
+ continue;
+ }
+
+ // Check for an undef mask and a mask value properly aligned to fit with
+ // a pair of values. If we find such a case, use the non-undef mask's value.
+    if (Mask[i] == SM_SentinelUndef && Mask[i + 1] >= 0 &&
+        Mask[i + 1] % 2 == 1) {
+ WidenedMask.push_back(Mask[i + 1] / 2);
+ continue;
+ }
+ if (Mask[i + 1] == SM_SentinelUndef && Mask[i] >= 0 && Mask[i] % 2 == 0) {
+ WidenedMask.push_back(Mask[i] / 2);
+ continue;
+ }
+
+ // When zeroing, we need to spread the zeroing across both lanes to widen.
+ if (Mask[i] == SM_SentinelZero || Mask[i + 1] == SM_SentinelZero) {
+ if ((Mask[i] == SM_SentinelZero || Mask[i] == SM_SentinelUndef) &&
+ (Mask[i + 1] == SM_SentinelZero || Mask[i + 1] == SM_SentinelUndef)) {
+ WidenedMask.push_back(SM_SentinelZero);
+ continue;
+ }
+ return false;
+ }
+
+ // Finally check if the two mask values are adjacent and aligned with
+ // a pair.
+    if (Mask[i] != SM_SentinelUndef && Mask[i] % 2 == 0 &&
+        Mask[i] + 1 == Mask[i + 1]) {
+ WidenedMask.push_back(Mask[i] / 2);
+ continue;
+ }
+
+ // Otherwise we can't safely widen the elements used in this shuffle.
+ return false;
+ }
+ assert(WidenedMask.size() == Mask.size() / 2 &&
+ "Incorrect size of mask after widening the elements!");
+
+ return true;
+}
+
+/// \brief Generic routine to split vector shuffle into half-sized shuffles.
+///
+/// This routine just extracts two subvectors, shuffles them independently, and
+/// then concatenates them back together. This should work effectively with all
+/// AVX vector shuffle types.
+static SDValue splitAndLowerVectorShuffle(SDLoc DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
+ assert(VT.getSizeInBits() >= 256 &&
+ "Only for 256-bit or wider vector shuffles!");
+ assert(V1.getSimpleValueType() == VT && "Bad operand type!");
+ assert(V2.getSimpleValueType() == VT && "Bad operand type!");
+
+ ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
+ ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
+
+ int NumElements = VT.getVectorNumElements();
+ int SplitNumElements = NumElements / 2;
+ MVT ScalarVT = VT.getVectorElementType();
+ MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2);
+
+ // Rather than splitting build-vectors, just build two narrower build
+ // vectors. This helps shuffling with splats and zeros.
+ auto SplitVector = [&](SDValue V) {
+ while (V.getOpcode() == ISD::BITCAST)
+ V = V->getOperand(0);
+
+ MVT OrigVT = V.getSimpleValueType();
+ int OrigNumElements = OrigVT.getVectorNumElements();
+ int OrigSplitNumElements = OrigNumElements / 2;
+ MVT OrigScalarVT = OrigVT.getVectorElementType();
+ MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2);
+
+ SDValue LoV, HiV;
+
+ auto *BV = dyn_cast<BuildVectorSDNode>(V);
+ if (!BV) {
+ LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
+ DAG.getIntPtrConstant(0, DL));
+ HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
+ DAG.getIntPtrConstant(OrigSplitNumElements, DL));
+ } else {
+ SmallVector<SDValue, 16> LoOps, HiOps;
+ for (int i = 0; i < OrigSplitNumElements; ++i) {
+ LoOps.push_back(BV->getOperand(i));
+ HiOps.push_back(BV->getOperand(i + OrigSplitNumElements));
+ }
+ LoV = DAG.getNode(ISD::BUILD_VECTOR, DL, OrigSplitVT, LoOps);
+ HiV = DAG.getNode(ISD::BUILD_VECTOR, DL, OrigSplitVT, HiOps);
+ }
+ return std::make_pair(DAG.getBitcast(SplitVT, LoV),
+ DAG.getBitcast(SplitVT, HiV));
+ };
+
+ SDValue LoV1, HiV1, LoV2, HiV2;
+ std::tie(LoV1, HiV1) = SplitVector(V1);
+ std::tie(LoV2, HiV2) = SplitVector(V2);
+
+ // Now create two 4-way blends of these half-width vectors.
+ auto HalfBlend = [&](ArrayRef<int> HalfMask) {
+ bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
+ SmallVector<int, 32> V1BlendMask, V2BlendMask, BlendMask;
+ for (int i = 0; i < SplitNumElements; ++i) {
+ int M = HalfMask[i];
+ if (M >= NumElements) {
+ if (M >= NumElements + SplitNumElements)
+ UseHiV2 = true;
+ else
+ UseLoV2 = true;
+ V2BlendMask.push_back(M - NumElements);
+ V1BlendMask.push_back(-1);
+ BlendMask.push_back(SplitNumElements + i);
+ } else if (M >= 0) {
+ if (M >= SplitNumElements)
+ UseHiV1 = true;
+ else
+ UseLoV1 = true;
+ V2BlendMask.push_back(-1);
+ V1BlendMask.push_back(M);
+ BlendMask.push_back(i);
+ } else {
+ V2BlendMask.push_back(-1);
+ V1BlendMask.push_back(-1);
+ BlendMask.push_back(-1);
+ }
+ }
+
+ // Because the lowering happens after all combining takes place, we need to
+ // manually combine these blend masks as much as possible so that we create
+ // a minimal number of high-level vector shuffle nodes.
+
+ // First try just blending the halves of V1 or V2.
+ if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
+ return DAG.getUNDEF(SplitVT);
+ if (!UseLoV2 && !UseHiV2)
+ return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
+ if (!UseLoV1 && !UseHiV1)
+ return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
+
+ SDValue V1Blend, V2Blend;
+ if (UseLoV1 && UseHiV1) {
+ V1Blend =
+ DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
+ } else {
+      // We only use half of V1 so map the usage down into the final blend
+      // mask.
+ V1Blend = UseLoV1 ? LoV1 : HiV1;
+ for (int i = 0; i < SplitNumElements; ++i)
+ if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
+ BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
+ }
+ if (UseLoV2 && UseHiV2) {
+ V2Blend =
+ DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
+ } else {
+      // We only use half of V2 so map the usage down into the final blend
+      // mask.
+ V2Blend = UseLoV2 ? LoV2 : HiV2;
+ for (int i = 0; i < SplitNumElements; ++i)
+ if (BlendMask[i] >= SplitNumElements)
+ BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
+ }
+ return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
+ };
+ SDValue Lo = HalfBlend(LoMask);
+ SDValue Hi = HalfBlend(HiMask);
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
+}
+
+/// \brief Either split a vector in halves or decompose the shuffles and the
+/// blend.
+///
+/// This is provided as a good fallback for many lowerings of non-single-input
+/// shuffles with more than one 128-bit lane. In those cases, we want to select
+/// between splitting the shuffle into 128-bit components and stitching those
+/// back together vs. extracting the single-input shuffles and blending those
+/// results.
+static SDValue lowerVectorShuffleAsSplitOrBlend(SDLoc DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
+ assert(!isSingleInputShuffleMask(Mask) && "This routine must not be used to "
+ "lower single-input shuffles as it "
+ "could then recurse on itself.");
+ int Size = Mask.size();
+
+ // If this can be modeled as a broadcast of two elements followed by a blend,
+ // prefer that lowering. This is especially important because broadcasts can
+ // often fold with memory operands.
+ auto DoBothBroadcast = [&] {
+ int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
+ for (int M : Mask)
+ if (M >= Size) {
+ if (V2BroadcastIdx == -1)
+ V2BroadcastIdx = M - Size;
+ else if (M - Size != V2BroadcastIdx)
+ return false;
+ } else if (M >= 0) {
+ if (V1BroadcastIdx == -1)
+ V1BroadcastIdx = M;
+ else if (M != V1BroadcastIdx)
+ return false;
+ }
+ return true;
+ };
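+  // For example, the v4f64 mask <1, 5, 1, 5> broadcasts element 1 of each
+  // input and blends the results, so it takes the decomposed path below.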
+ if (DoBothBroadcast())
+ return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
+ DAG);
+
+ // If the inputs all stem from a single 128-bit lane of each input, then we
+ // split them rather than blending because the split will decompose to
+ // unusually few instructions.
+ int LaneCount = VT.getSizeInBits() / 128;
+ int LaneSize = Size / LaneCount;
+ SmallBitVector LaneInputs[2];
+ LaneInputs[0].resize(LaneCount, false);
+ LaneInputs[1].resize(LaneCount, false);
+ for (int i = 0; i < Size; ++i)
+ if (Mask[i] >= 0)
+ LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
+ if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
+ return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
+
+ // Otherwise, just fall back to decomposed shuffles and a blend. This requires
+ // that the decomposed single-input shuffles don't end up here.
+ return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
+}
+
+/// \brief Lower a vector shuffle crossing multiple 128-bit lanes as
+/// a permutation and blend of those lanes.
+///
+/// This essentially blends the out-of-lane inputs to each lane into the lane
+/// from a permuted copy of the vector. This lowering strategy results in four
+/// instructions in the worst case for a single-input cross lane shuffle which
+/// is lower than any other fully general cross-lane shuffle strategy I'm aware
+/// of. Special cases for each particular shuffle pattern should be handled
+/// prior to trying this lowering.
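+///
+/// For example, the single-input v4f64 mask <2, 1, 0, 3> is lowered by
+/// flipping the two 128-bit halves with VPERM2X128 and then applying the
+/// in-lane blend mask <4, 1, 6, 3>, where elements 4-7 select the flipped
+/// copy.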
+static SDValue lowerVectorShuffleAsLanePermuteAndBlend(SDLoc DL, MVT VT,
+ SDValue V1, SDValue V2,
+ ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
+ // FIXME: This should probably be generalized for 512-bit vectors as well.
+ assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
+ int LaneSize = Mask.size() / 2;
+
+ // If there are only inputs from one 128-bit lane, splitting will in fact be
+ // less expensive. The flags track whether the given lane contains an element
+ // that crosses to another lane.
+ bool LaneCrossing[2] = {false, false};
+ for (int i = 0, Size = Mask.size(); i < Size; ++i)
+ if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
+ LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
+ if (!LaneCrossing[0] || !LaneCrossing[1])
+ return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
+
+ if (isSingleInputShuffleMask(Mask)) {
+ SmallVector<int, 32> FlippedBlendMask;
+ for (int i = 0, Size = Mask.size(); i < Size; ++i)
+ FlippedBlendMask.push_back(
+ Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize)
+ ? Mask[i]
+ : Mask[i] % LaneSize +
+ (i / LaneSize) * LaneSize + Size));
+
+ // Flip the vector, and blend the results which should now be in-lane. The
+ // VPERM2X128 mask uses the low 2 bits for the low source and bits 4 and
+ // 5 for the high source. The value 3 selects the high half of source 2 and
+ // the value 2 selects the low half of source 2. We only use source 2 to
+ // allow folding it into a memory operand.
+ unsigned PERMMask = 3 | 2 << 4;
+ SDValue Flipped = DAG.getNode(X86ISD::VPERM2X128, DL, VT, DAG.getUNDEF(VT),
+ V1, DAG.getConstant(PERMMask, DL, MVT::i8));
+ return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);
+ }
+
+ // This now reduces to two single-input shuffles of V1 and V2 which at worst
+ // will be handled by the above logic and a blend of the results, much like
+ // other patterns in AVX.
+ return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
+}
+
+/// \brief Handle lowering 2-lane 128-bit shuffles.
+static SDValue lowerV2X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+  // TODO: If minimizing size and one of the inputs is a zero vector and the
+  // zero vector has only one use, we could use a VPERM2X128 to save the
+ // instruction bytes needed to explicitly generate the zero vector.
+
+ // Blends are faster and handle all the non-lane-crossing cases.
+ if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
+ Subtarget, DAG))
+ return Blend;
+
+ bool IsV1Zero = ISD::isBuildVectorAllZeros(V1.getNode());
+ bool IsV2Zero = ISD::isBuildVectorAllZeros(V2.getNode());
+
+ // If either input operand is a zero vector, use VPERM2X128 because its mask
+ // allows us to replace the zero input with an implicit zero.
+ if (!IsV1Zero && !IsV2Zero) {
+ // Check for patterns which can be matched with a single insert of a 128-bit
+ // subvector.
+ bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1});
+ if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) {
+ MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
+ VT.getVectorNumElements() / 2);
+ SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
+ DAG.getIntPtrConstant(0, DL));
+ SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
+ OnlyUsesV1 ? V1 : V2,
+ DAG.getIntPtrConstant(0, DL));
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
+ }
+ }
+
+ // Otherwise form a 128-bit permutation. After accounting for undefs,
+ // convert the 64-bit shuffle mask selection values into 128-bit
+ // selection bits by dividing the indexes by 2 and shifting into positions
+ // defined by a vperm2*128 instruction's immediate control byte.
+
+ // The immediate permute control byte looks like this:
+ // [1:0] - select 128 bits from sources for low half of destination
+ // [2] - ignore
+ // [3] - zero low half of destination
+ // [5:4] - select 128 bits from sources for high half of destination
+ // [6] - ignore
+ // [7] - zero high half of destination
+
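+  // For example, the mask <2, 3, 4, 5> of a v4i64 shuffle gives MaskLO = 2
+  // and MaskHI = 4, so PermMask = (2 / 2) | (4 / 2) << 4 = 0x21: the high
+  // half of V1 feeds the low half of the result and the low half of V2 feeds
+  // the high half.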
+ int MaskLO = Mask[0];
+ if (MaskLO == SM_SentinelUndef)
+ MaskLO = Mask[1] == SM_SentinelUndef ? 0 : Mask[1];
+
+ int MaskHI = Mask[2];
+ if (MaskHI == SM_SentinelUndef)
+ MaskHI = Mask[3] == SM_SentinelUndef ? 0 : Mask[3];
+
+ unsigned PermMask = MaskLO / 2 | (MaskHI / 2) << 4;
+
+ // If either input is a zero vector, replace it with an undef input.
+ // Shuffle mask values < 4 are selecting elements of V1.
+ // Shuffle mask values >= 4 are selecting elements of V2.
+ // Adjust each half of the permute mask by clearing the half that was
+ // selecting the zero vector and setting the zero mask bit.
+ if (IsV1Zero) {
+ V1 = DAG.getUNDEF(VT);
+ if (MaskLO < 4)
+ PermMask = (PermMask & 0xf0) | 0x08;
+ if (MaskHI < 4)
+ PermMask = (PermMask & 0x0f) | 0x80;
+ }
+ if (IsV2Zero) {
+ V2 = DAG.getUNDEF(VT);
+ if (MaskLO >= 4)
+ PermMask = (PermMask & 0xf0) | 0x08;
+ if (MaskHI >= 4)
+ PermMask = (PermMask & 0x0f) | 0x80;
+ }
+
+ return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
+ DAG.getConstant(PermMask, DL, MVT::i8));
+}
+
+/// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then
+/// shuffling each lane.
+///
+/// This will only succeed when the result of fixing the 128-bit lanes results
+/// in a single-input non-lane-crossing shuffle with a repeating shuffle mask
+/// in each 128-bit lane. This handles many cases where we can quickly blend
+/// away the lane crosses early and then use simpler shuffles within each lane.
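+///
+/// For example, the v8f32 mask <4, 5, 6, 7, 8, 9, 10, 11> first becomes the
+/// v4i64 lane shuffle <2, 3, 4, 5>, after which the remaining in-lane shuffle
+/// is a no-op.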
+///
+/// FIXME: It might be worthwhile at some point to support this without
+/// requiring the 128-bit lane-relative shuffles to be repeating, but currently
+/// in x86 only floating point has interesting non-repeating shuffles, and even
+/// those are still *marginally* more expensive.
+static SDValue lowerVectorShuffleByMerging128BitLanes(
+ SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
+ const X86Subtarget *Subtarget, SelectionDAG &DAG) {
+ assert(!isSingleInputShuffleMask(Mask) &&
+ "This is only useful with multiple inputs.");
+
+ int Size = Mask.size();
+ int LaneSize = 128 / VT.getScalarSizeInBits();
+ int NumLanes = Size / LaneSize;
+ assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles.");
+
+ // See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also
+ // check whether the in-128-bit lane shuffles share a repeating pattern.
+ SmallVector<int, 4> Lanes;
+ Lanes.resize(NumLanes, -1);
+ SmallVector<int, 4> InLaneMask;
+ InLaneMask.resize(LaneSize, -1);
+ for (int i = 0; i < Size; ++i) {
+ if (Mask[i] < 0)
+ continue;
+
+ int j = i / LaneSize;
+
+ if (Lanes[j] < 0) {
+ // First entry we've seen for this lane.
+ Lanes[j] = Mask[i] / LaneSize;
+ } else if (Lanes[j] != Mask[i] / LaneSize) {
+ // This doesn't match the lane selected previously!
+ return SDValue();
+ }
+
+ // Check that within each lane we have a consistent shuffle mask.
+ int k = i % LaneSize;
+ if (InLaneMask[k] < 0) {
+ InLaneMask[k] = Mask[i] % LaneSize;
+ } else if (InLaneMask[k] != Mask[i] % LaneSize) {
+ // This doesn't fit a repeating in-lane mask.
+ return SDValue();
+ }
+ }
+
+ // First shuffle the lanes into place.
+ MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64,
+ VT.getSizeInBits() / 64);
+ SmallVector<int, 8> LaneMask;
+ LaneMask.resize(NumLanes * 2, -1);
+ for (int i = 0; i < NumLanes; ++i)
+ if (Lanes[i] >= 0) {
+ LaneMask[2 * i + 0] = 2*Lanes[i] + 0;
+ LaneMask[2 * i + 1] = 2*Lanes[i] + 1;
+ }
+
+ V1 = DAG.getBitcast(LaneVT, V1);
+ V2 = DAG.getBitcast(LaneVT, V2);
+ SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask);
+
+ // Cast it back to the type we actually want.
+ LaneShuffle = DAG.getBitcast(VT, LaneShuffle);
+
+ // Now do a simple shuffle that isn't lane crossing.
+ SmallVector<int, 8> NewMask;
+ NewMask.resize(Size, -1);
+ for (int i = 0; i < Size; ++i)
+ if (Mask[i] >= 0)
+ NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize;
+ assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) &&
+ "Must not introduce lane crosses at this point!");
+
+ return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);
+}
+
+/// Lower shuffles where an entire half of a 256-bit vector is UNDEF.
+/// This allows for fast cases such as subvector extraction/insertion
+/// or shuffling smaller vector types which can lower more efficiently.
+static SDValue lowerVectorShuffleWithUndefHalf(SDLoc DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ assert(VT.getSizeInBits() == 256 && "Expected 256-bit vector");
+
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned HalfNumElts = NumElts / 2;
+ MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts);
+
+ bool UndefLower = isUndefInRange(Mask, 0, HalfNumElts);
+ bool UndefUpper = isUndefInRange(Mask, HalfNumElts, HalfNumElts);
+ if (!UndefLower && !UndefUpper)
+ return SDValue();
+
+ // Upper half is undef and lower half is whole upper subvector.
+ // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
+ if (UndefUpper &&
+ isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
+ SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
+ DAG.getIntPtrConstant(HalfNumElts, DL));
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
+ DAG.getIntPtrConstant(0, DL));
+ }
+
+ // Lower half is undef and upper half is whole lower subvector.
+ // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
+ if (UndefLower &&
+ isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
+ SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
+ DAG.getIntPtrConstant(0, DL));
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
+ DAG.getIntPtrConstant(HalfNumElts, DL));
+ }
+
+ // AVX2 supports efficient immediate 64-bit element cross-lane shuffles.
+ if (UndefLower && Subtarget->hasAVX2() &&
+ (VT == MVT::v4f64 || VT == MVT::v4i64))
+ return SDValue();
+
+ // If the shuffle only uses the lower halves of the input operands,
+ // then extract them and perform the 'half' shuffle at half width.
+ // e.g. vector_shuffle <X, X, X, X, u, u, u, u> or <X, X, u, u>
+ int HalfIdx1 = -1, HalfIdx2 = -1;
+ SmallVector<int, 8> HalfMask;
+ unsigned Offset = UndefLower ? HalfNumElts : 0;
+ for (unsigned i = 0; i != HalfNumElts; ++i) {
+ int M = Mask[i + Offset];
+ if (M < 0) {
+ HalfMask.push_back(M);
+ continue;
+ }
+
+ // Determine which of the 4 half vectors this element is from.
+ // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
+ int HalfIdx = M / HalfNumElts;
+
+ // Only shuffle using the lower halves of the inputs.
+ // TODO: Investigate usefulness of shuffling with upper halves.
+ if (HalfIdx != 0 && HalfIdx != 2)
+ return SDValue();
+
+ // Determine the element index into its half vector source.
+ int HalfElt = M % HalfNumElts;
+
+ // We can shuffle with up to 2 half vectors, set the new 'half'
+ // shuffle mask accordingly.
+ if (-1 == HalfIdx1 || HalfIdx1 == HalfIdx) {
+ HalfMask.push_back(HalfElt);
+ HalfIdx1 = HalfIdx;
+ continue;
+ }
+ if (-1 == HalfIdx2 || HalfIdx2 == HalfIdx) {
+ HalfMask.push_back(HalfElt + HalfNumElts);
+ HalfIdx2 = HalfIdx;
+ continue;
+ }
+
+ // Too many half vectors referenced.
+ return SDValue();
+ }
+ assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
+
+ auto GetHalfVector = [&](int HalfIdx) {
+ if (HalfIdx < 0)
+ return DAG.getUNDEF(HalfVT);
+ SDValue V = (HalfIdx < 2 ? V1 : V2);
+ HalfIdx = (HalfIdx % 2) * HalfNumElts;
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
+ DAG.getIntPtrConstant(HalfIdx, DL));
+ };
+
+ SDValue Half1 = GetHalfVector(HalfIdx1);
+ SDValue Half2 = GetHalfVector(HalfIdx2);
+ SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
+ DAG.getIntPtrConstant(Offset, DL));
+}
+
+/// \brief Test whether the specified input (0 or 1) is in-place blended by the
+/// given mask.
+///
+/// This returns true if the elements from a particular input are already in
+/// the slot required by the given mask and require no permutation.
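+///
+/// For example, input 0 is in place for the mask <0, 5, 2, 7>: its elements
+/// (0 and 2) already sit in slots 0 and 2.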
+static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
+ assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
+ int Size = Mask.size();
+ for (int i = 0; i < Size; ++i)
+ if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
+ return false;
+
+ return true;
+}
+
+static SDValue lowerVectorShuffleWithSHUFPD(SDLoc DL, MVT VT,
+ ArrayRef<int> Mask, SDValue V1,
+ SDValue V2, SelectionDAG &DAG) {
+
+ // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
+  // Mask for V4F64: 0/1, 4/5, 2/3, 6/7, ..
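+  // For example, the v4f64 mask <0, 5, 2, 7> fits the non-commuted pattern
+  // and yields Immediate = 0b1010, since bit i holds Mask[i] % 2.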
+ assert(VT.getScalarSizeInBits() == 64 && "Unexpected data type for VSHUFPD");
+ int NumElts = VT.getVectorNumElements();
+ bool ShufpdMask = true;
+ bool CommutableMask = true;
+ unsigned Immediate = 0;
+ for (int i = 0; i < NumElts; ++i) {
+ if (Mask[i] < 0)
+ continue;
+ int Val = (i & 6) + NumElts * (i & 1);
+ int CommutVal = (i & 0xe) + NumElts * ((i & 1)^1);
+ if (Mask[i] < Val || Mask[i] > Val + 1)
+ ShufpdMask = false;
+ if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
+ CommutableMask = false;
+ Immediate |= (Mask[i] % 2) << i;
+ }
+ if (ShufpdMask)
+ return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
+ DAG.getConstant(Immediate, DL, MVT::i8));
+ if (CommutableMask)
+ return DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
+ DAG.getConstant(Immediate, DL, MVT::i8));
+ return SDValue();
+}
+
+/// \brief Handle lowering of 4-lane 64-bit floating point shuffles.
+///
+/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
+/// isn't available.
+static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ SDLoc DL(Op);
+ assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+ ArrayRef<int> Mask = SVOp->getMask();
+ assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
+
+ SmallVector<int, 4> WidenedMask;
+ if (canWidenShuffleElements(Mask, WidenedMask))
+ return lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask, Subtarget,
+ DAG);
+
+ if (isSingleInputShuffleMask(Mask)) {
+ // Check for being able to broadcast a single element.
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4f64, V1,
+ Mask, Subtarget, DAG))
+ return Broadcast;
+
+ // Use low duplicate instructions for masks that match their pattern.
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
+ return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
+
+ if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
+      // Non-half-crossing single input shuffles can be lowered with an
+ // interleaved permutation.
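+      // For example, the in-lane swap <1, 0, 3, 2> produces the immediate
+      // 0b0101: bits 0 and 2 are set because Mask[0] == 1 and Mask[2] == 3.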
+ unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
+ ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
+ return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
+ DAG.getConstant(VPERMILPMask, DL, MVT::i8));
+ }
+
+ // With AVX2 we have direct support for this permutation.
+ if (Subtarget->hasAVX2())
+ return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
+ getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
+
+ // Otherwise, fall back.
+ return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask,
+ DAG);
+ }
+
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (SDValue V =
+ lowerVectorShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
+ return V;
+
+ if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
+ Subtarget, DAG))
+ return Blend;
+
+ // Check if the blend happens to exactly fit that of SHUFPD.
+ if (SDValue Op =
+ lowerVectorShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG))
+ return Op;
+
+ // Try to simplify this by merging 128-bit lanes to enable a lane-based
+  // shuffle. However, if we have AVX2 and either input is already in place,
+  // we can shuffle the other input across lanes in a single instruction, so
+  // skip this pattern.
+ if (!(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
+ isShuffleMaskInputInPlace(1, Mask))))
+ if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
+ DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
+ return Result;
+
+  // If we have AVX2 then we always want to lower with a blend because at v4 we
+ // can fully permute the elements.
+ if (Subtarget->hasAVX2())
+ return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
+ Mask, DAG);
+
+ // Otherwise fall back on generic lowering.
+ return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG);
+}
+
+/// \brief Handle lowering of 4-lane 64-bit integer shuffles.
+///
+/// This routine is only called when we have AVX2 and thus a reasonable
+/// instruction set for v4i64 shuffling.
+static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ SDLoc DL(Op);
+ assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+ ArrayRef<int> Mask = SVOp->getMask();
+ assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
+ assert(Subtarget->hasAVX2() && "We can only lower v4i64 with AVX2!");
+
+ SmallVector<int, 4> WidenedMask;
+ if (canWidenShuffleElements(Mask, WidenedMask))
+ return lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask, Subtarget,
+ DAG);
+
+ if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
+ Subtarget, DAG))
+ return Blend;
+
+ // Check for being able to broadcast a single element.
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i64, V1,
+ Mask, Subtarget, DAG))
+ return Broadcast;
+
+  // When the shuffle is mirrored between the 128-bit lanes of the vector, we
+  // can use lower latency instructions that will operate on both 128-bit
+  // lanes.
+ SmallVector<int, 2> RepeatedMask;
+ if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
+ if (isSingleInputShuffleMask(Mask)) {
+ int PSHUFDMask[] = {-1, -1, -1, -1};
+ for (int i = 0; i < 2; ++i)
+ if (RepeatedMask[i] >= 0) {
+ PSHUFDMask[2 * i] = 2 * RepeatedMask[i];
+ PSHUFDMask[2 * i + 1] = 2 * RepeatedMask[i] + 1;
+ }
+ return DAG.getBitcast(
+ MVT::v4i64,
+ DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
+ DAG.getBitcast(MVT::v8i32, V1),
+ getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
+ }
+ }
+
+ // AVX2 provides a direct instruction for permuting a single input across
+ // lanes.
+ if (isSingleInputShuffleMask(Mask))
+ return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
+ getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
+
+ // Try to use shift instructions.
+ if (SDValue Shift =
+ lowerVectorShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, DAG))
+ return Shift;
+
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (SDValue V =
+ lowerVectorShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
+ return V;
+
+ // Try to simplify this by merging 128-bit lanes to enable a lane-based
+  // shuffle. However, if we have AVX2 and either input is already in place,
+  // we can shuffle the other input across lanes in a single instruction, so
+  // skip this pattern.
+ if (!(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
+ isShuffleMaskInputInPlace(1, Mask))))
+ if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
+ DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
+ return Result;
+
+ // Otherwise fall back on generic blend lowering.
+ return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2,
+ Mask, DAG);
+}
+
+/// \brief Handle lowering of 8-lane 32-bit floating point shuffles.
+///
+/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
+/// isn't available.
+static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ SDLoc DL(Op);
+ assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+ ArrayRef<int> Mask = SVOp->getMask();
+ assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
+
+ if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
+ Subtarget, DAG))
+ return Blend;
+
+ // Check for being able to broadcast a single element.
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8f32, V1,
+ Mask, Subtarget, DAG))
+ return Broadcast;
+
+ // If the shuffle mask is repeated in each 128-bit lane, we have many more
+ // options to efficiently lower the shuffle.
+ SmallVector<int, 4> RepeatedMask;
+ if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
+ assert(RepeatedMask.size() == 4 &&
+ "Repeated masks must be half the mask width!");
+
+ // Use even/odd duplicate instructions for masks that match their pattern.
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6}))
+ return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
+ if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3, 5, 5, 7, 7}))
+ return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
+
+ if (isSingleInputShuffleMask(Mask))
+ return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
+ getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
+
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (SDValue V =
+ lowerVectorShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
+ return V;
+
+ // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
+ // have already handled any direct blends. We also need to squash the
+ // repeated mask into a simulated v4f32 mask.
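+ // For example, a repeated mask of <0, 8, 1, 9> (where 8 and 9 pick lanes of
+ // V2, offset by the v8f32 width) is squashed to the v4f32 mask <0, 4, 1, 5>.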
+ for (int i = 0; i < 4; ++i)
+ if (RepeatedMask[i] >= 8)
+ RepeatedMask[i] -= 4;
+ return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
+ }
+
+ // If we have a single input shuffle with different shuffle patterns in the
+ // two 128-bit lanes use the variable mask to VPERMILPS.
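+ // VPERMILPS with a variable control only uses bits [1:0] of each lane's
+ // index and always selects within the source lane, so the unadjusted 0..7
+ // indices are safe here for the non-lane-crossing case.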
+ if (isSingleInputShuffleMask(Mask)) {
+ SDValue VPermMask[8];
+ for (int i = 0; i < 8; ++i)
+ VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32)
+ : DAG.getConstant(Mask[i], DL, MVT::i32);
+ if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
+ return DAG.getNode(
+ X86ISD::VPERMILPV, DL, MVT::v8f32, V1,
+ DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask));
+
+ if (Subtarget->hasAVX2())
+ return DAG.getNode(
+ X86ISD::VPERMV, DL, MVT::v8f32,
+ DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask), V1);
+
+ // Otherwise, fall back.
+ return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
+ DAG);
+ }
+
+ // Try to simplify this by merging 128-bit lanes to enable a lane-based
+ // shuffle.
+ if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
+ DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
+ return Result;
+
+ // If we have AVX2 then we always want to lower with a blend because at v8 we
+ // can fully permute the elements.
+ if (Subtarget->hasAVX2())
+ return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
+ Mask, DAG);
+
+ // Otherwise fall back on generic lowering.
+ return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG);
+}
+
+/// \brief Handle lowering of 8-lane 32-bit integer shuffles.
+///
+/// This routine is only called when we have AVX2 and thus a reasonable
+/// instruction set for v8i32 shuffling.
+static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ SDLoc DL(Op);
+ assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+ ArrayRef<int> Mask = SVOp->getMask();
+ assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
+ assert(Subtarget->hasAVX2() && "We can only lower v8i32 with AVX2!");
+
+ // Whenever we can lower this as a zext, that instruction is strictly faster
+ // than any alternative. It also allows us to fold memory operands into the
+ // shuffle in many cases.
+ if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2,
+ Mask, Subtarget, DAG))
+ return ZExt;
+
+ if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
+ Subtarget, DAG))
+ return Blend;
+
+ // Check for being able to broadcast a single element.
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i32, V1,
+ Mask, Subtarget, DAG))
+ return Broadcast;
+
+ // If the shuffle mask is repeated in each 128-bit lane we can use more
+ // efficient instructions that mirror the shuffles across the two 128-bit
+ // lanes.
+ SmallVector<int, 4> RepeatedMask;
+ if (is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask)) {
+ assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
+ if (isSingleInputShuffleMask(Mask))
+ return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
+ getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
+
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (SDValue V =
+ lowerVectorShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
+ return V;
+ }
+
+ // Try to use shift instructions.
+ if (SDValue Shift =
+ lowerVectorShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, DAG))
+ return Shift;
+
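+ // Try to use byte rotation instructions.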
+ if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
+ DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
+ return Rotate;
+
+ // If the shuffle patterns aren't repeated but it is a single input, directly
+ // generate a cross-lane VPERMD instruction.
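+ // For example, the mask <7, 6, 5, 4, 3, 2, 1, 0> fully reverses the vector
+ // with a single VPERMD and a constant index vector.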
+ if (isSingleInputShuffleMask(Mask)) {
+ SDValue VPermMask[8];
+ for (int i = 0; i < 8; ++i)
+ VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32)
+ : DAG.getConstant(Mask[i], DL, MVT::i32);
+ return DAG.getNode(
+ X86ISD::VPERMV, DL, MVT::v8i32,
+ DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask), V1);
+ }
+
+ // Try to simplify this by merging 128-bit lanes to enable a lane-based
+ // shuffle.
+ if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
+ DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
+ return Result;
+
+ // Otherwise fall back on generic blend lowering.
+ return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2,
+ Mask, DAG);
+}
+
+/// \brief Handle lowering of 16-lane 16-bit integer shuffles.
+///
+/// This routine is only called when we have AVX2 and thus a reasonable
+/// instruction set for v16i16 shuffling.
+static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ SDLoc DL(Op);
+ assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+ ArrayRef<int> Mask = SVOp->getMask();
+ assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
+ assert(Subtarget->hasAVX2() && "We can only lower v16i16 with AVX2!");
+
+ // Whenever we can lower this as a zext, that instruction is strictly faster
+ // than any alternative. It also allows us to fold memory operands into the
+ // shuffle in many cases.
+ if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v16i16, V1, V2,
+ Mask, Subtarget, DAG))
+ return ZExt;
+
+ // Check for being able to broadcast a single element.
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i16, V1,
+ Mask, Subtarget, DAG))
+ return Broadcast;
+
+ if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
+ Subtarget, DAG))
+ return Blend;
+
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (SDValue V =
+ lowerVectorShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
+ return V;
+
+ // Try to use shift instructions.
+ if (SDValue Shift =
+ lowerVectorShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask, DAG))
+ return Shift;
+
+ // Try to use byte rotation instructions.
+ if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
+ DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
+ return Rotate;
+
+ if (isSingleInputShuffleMask(Mask)) {
+ // There are no generalized cross-lane shuffle operations available on i16
+ // element types.
+ if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
+ return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
+ Mask, DAG);
+
+ SmallVector<int, 8> RepeatedMask;
+ if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
+ // As this is a single-input shuffle, the repeated mask should be
+ // a strictly valid v8i16 mask that we can pass through to the v8i16
+ // lowering to handle even the v16 case.
+ return lowerV8I16GeneralSingleInputVectorShuffle(
+ DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
+ }
+
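+ // Otherwise, emulate the shuffle with PSHUFB: each in-lane i16 index M
+ // expands to the byte indices 2*M and 2*M+1, e.g. an i16 index of 3 becomes
+ // byte indices 6 and 7.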
+ SDValue PSHUFBMask[32];
+ for (int i = 0; i < 16; ++i) {
+ if (Mask[i] == -1) {
+ PSHUFBMask[2 * i] = PSHUFBMask[2 * i + 1] = DAG.getUNDEF(MVT::i8);
+ continue;
+ }
+
+ int M = i < 8 ? Mask[i] : Mask[i] - 8;
+ assert(M >= 0 && M < 8 && "Invalid single-input mask!");
+ PSHUFBMask[2 * i] = DAG.getConstant(2 * M, DL, MVT::i8);
+ PSHUFBMask[2 * i + 1] = DAG.getConstant(2 * M + 1, DL, MVT::i8);
+ }
+ return DAG.getBitcast(MVT::v16i16,
+ DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8,
+ DAG.getBitcast(MVT::v32i8, V1),
+ DAG.getNode(ISD::BUILD_VECTOR, DL,
+ MVT::v32i8, PSHUFBMask)));
+ }
+
+ // Try to simplify this by merging 128-bit lanes to enable a lane-based
+ // shuffle.
+ if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
+ DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
+ return Result;
+
+ // Otherwise fall back on generic lowering.
+ return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG);
+}
+
+/// \brief Handle lowering of 32-lane 8-bit integer shuffles.
+///
+/// This routine is only called when we have AVX2 and thus a reasonable
+/// instruction set for v32i8 shuffling.
+static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ SDLoc DL(Op);
+ assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+ ArrayRef<int> Mask = SVOp->getMask();
+ assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
+ assert(Subtarget->hasAVX2() && "We can only lower v32i8 with AVX2!");
+
+ // Whenever we can lower this as a zext, that instruction is strictly faster
+ // than any alternative. It also allows us to fold memory operands into the
+ // shuffle in many cases.
+ if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2,
+ Mask, Subtarget, DAG))
+ return ZExt;
+
+ // Check for being able to broadcast a single element.
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v32i8, V1,
+ Mask, Subtarget, DAG))
+ return Broadcast;
+
+ if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
+ Subtarget, DAG))
+ return Blend;
+
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (SDValue V =
+ lowerVectorShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
+ return V;
+
+ // Try to use shift instructions.
+ if (SDValue Shift =
+ lowerVectorShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, DAG))
+ return Shift;
+
+ // Try to use byte rotation instructions.
+ if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
+ DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
+ return Rotate;
+
+ if (isSingleInputShuffleMask(Mask)) {
+ // There are no generalized cross-lane shuffle operations available on i8
+ // element types.
+ if (is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
+ return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2,
+ Mask, DAG);
+
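+ // PSHUFB only indexes within each 128-bit lane, so indices into the high
+ // half are rebased by subtracting 16 before they are encoded.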
+ SDValue PSHUFBMask[32];
+ for (int i = 0; i < 32; ++i)
+ PSHUFBMask[i] =
+ Mask[i] < 0
+ ? DAG.getUNDEF(MVT::i8)
+ : DAG.getConstant(Mask[i] < 16 ? Mask[i] : Mask[i] - 16, DL,
+ MVT::i8);
+
+ return DAG.getNode(
+ X86ISD::PSHUFB, DL, MVT::v32i8, V1,
+ DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask));
+ }
+
+ // Try to simplify this by merging 128-bit lanes to enable a lane-based
+ // shuffle.
+ if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
+ DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
+ return Result;
+
+ // Otherwise fall back on generic lowering.
+ return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG);
+}
+
+/// \brief High-level routine to lower various 256-bit x86 vector shuffles.
+///
+/// This routine either breaks down the specific type of a 256-bit x86 vector
+/// shuffle or splits it into two 128-bit shuffles and fuses the results back
+/// together based on the available instructions.
+static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+ MVT VT, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ SDLoc DL(Op);
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+ ArrayRef<int> Mask = SVOp->getMask();
+
+ // If we have a single input to the zero element, insert that into V1 if we
+ // can do so cheaply.
+ int NumElts = VT.getVectorNumElements();
+ int NumV2Elements = std::count_if(Mask.begin(), Mask.end(), [NumElts](int M) {
+ return M >= NumElts;
+ });
+
+ if (NumV2Elements == 1 && Mask[0] >= NumElts)
+ if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
+ DL, VT, V1, V2, Mask, Subtarget, DAG))
+ return Insertion;
+
+ // Handle special cases where the lower or upper half is UNDEF.
+ if (SDValue V =
+ lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
+ return V;
+
+ // There is a really nice hard cut-over between AVX1 and AVX2 that means we
+ // can check for those subtargets here and avoid much of the subtarget
+ // querying in the per-vector-type lowering routines. With AVX1 we have
+ // essentially *zero* ability to manipulate a 256-bit vector with integer
+ // types. Since we'll use floating point types there eventually, just
+ // immediately cast everything to a float and operate entirely in that domain.
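+ // For example, on AVX1 a v4i64 shuffle is bitcast to v4f64, shuffled in
+ // the floating point domain, and bitcast back to v4i64.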
+ if (VT.isInteger() && !Subtarget->hasAVX2()) {
+ int ElementBits = VT.getScalarSizeInBits();
+ if (ElementBits < 32)
+ // No floating point type available, decompose into 128-bit vectors.
+ return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
+
+ MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
+ VT.getVectorNumElements());
+ V1 = DAG.getBitcast(FpVT, V1);
+ V2 = DAG.getBitcast(FpVT, V2);
+ return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
+ }
+
+ switch (VT.SimpleTy) {
+ case MVT::v4f64:
+ return lowerV4F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
+ case MVT::v4i64:
+ return lowerV4I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
+ case MVT::v8f32:
+ return lowerV8F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
+ case MVT::v8i32:
+ return lowerV8I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
+ case MVT::v16i16:
+ return lowerV16I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
+ case MVT::v32i8:
+ return lowerV32I8VectorShuffle(Op, V1, V2, Subtarget, DAG);
+
+ default:
+ llvm_unreachable("Not a valid 256-bit x86 vector type!");
+ }
+}
+
+/// \brief Try to lower a vector shuffle as 128-bit shuffles.
+static SDValue lowerV4X128VectorShuffle(SDLoc DL, MVT VT,
+ ArrayRef<int> Mask,
+ SDValue V1, SDValue V2,
+ SelectionDAG &DAG) {
+ assert(VT.getScalarSizeInBits() == 64 &&
+ "Unexpected element type size for 128bit shuffle.");
+
+ // Handling a 256-bit vector requires VLX, and lowerV2X128VectorShuffle()
+ // is most probably a better solution for that case.
+ assert(VT.is512BitVector() && "Unexpected vector size for 128bit shuffle.");
+
+ SmallVector<int, 4> WidenedMask;
+ if (!canWidenShuffleElements(Mask, WidenedMask))
+ return SDValue();
+
+ // Form a 128-bit permutation.
+ // Convert the 64-bit shuffle mask selection values into 128-bit selection
+ // bits defined by a vshuf64x2 instruction's immediate control byte.
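+ // For example, the identity widened mask <0, 1, 2, 3> uses two control bits
+ // per 128-bit chunk and encodes as (0 << 0) | (1 << 2) | (2 << 4) | (3 << 6)
+ // == 0xE4.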
+ unsigned PermMask = 0, Imm = 0;
+ unsigned ControlBitsNum = WidenedMask.size() / 2;
+
+ for (int i = 0, Size = WidenedMask.size(); i < Size; ++i) {
+ if (WidenedMask[i] == SM_SentinelZero)
+ return SDValue();
+
+ // Use first element in place of undef mask.
+ Imm = (WidenedMask[i] == SM_SentinelUndef) ? 0 : WidenedMask[i];
+ PermMask |= (Imm % WidenedMask.size()) << (i * ControlBitsNum);
+ }
+
+ return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
+ DAG.getConstant(PermMask, DL, MVT::i8));
+}
+
+static SDValue lowerVectorShuffleWithPERMV(SDLoc DL, MVT VT,
+ ArrayRef<int> Mask, SDValue V1,
+ SDValue V2, SelectionDAG &DAG) {
+
+ assert(VT.getScalarSizeInBits() >= 16 && "Unexpected data type for PERMV");
+
+ MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
+ MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements());
+
+ SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true);
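+ // A single-input shuffle needs only VPERMV (mask, src); two inputs use
+ // VPERMV3 (src1, mask, src2), where mask indices >= NumElts select from V2.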
+ if (isSingleInputShuffleMask(Mask))
+ return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1);
+
+ return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2);
+}
+
+/// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
+static SDValue lowerV8F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ SDLoc DL(Op);
+ assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+ ArrayRef<int> Mask = SVOp->getMask();
+ assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
+
+ if (SDValue Shuf128 =
+ lowerV4X128VectorShuffle(DL, MVT::v8f64, Mask, V1, V2, DAG))
+ return Shuf128;
+
+ if (SDValue Unpck =
+ lowerVectorShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
+ return Unpck;
+
+ return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
+}
+
+/// \brief Handle lowering of 16-lane 32-bit floating point shuffles.
+static SDValue lowerV16F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ SDLoc DL(Op);
+ assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+ ArrayRef<int> Mask = SVOp->getMask();
+ assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
+
+ if (SDValue Unpck =
+ lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
+ return Unpck;
+
+ return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG);
+}
+
+/// \brief Handle lowering of 8-lane 64-bit integer shuffles.
+static SDValue lowerV8I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ SDLoc DL(Op);
+ assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+ ArrayRef<int> Mask = SVOp->getMask();
+ assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
+
+ if (SDValue Shuf128 =
+ lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, V1, V2, DAG))
+ return Shuf128;
+
+ if (SDValue Unpck =
+ lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
+ return Unpck;
+
+ return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);
+}
+
+/// \brief Handle lowering of 16-lane 32-bit integer shuffles.
+static SDValue lowerV16I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ SDLoc DL(Op);
+ assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+ ArrayRef<int> Mask = SVOp->getMask();
+ assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
+
+ if (SDValue Unpck =
+ lowerVectorShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
+ return Unpck;
+
+ return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
+}
+
+/// \brief Handle lowering of 32-lane 16-bit integer shuffles.
+static SDValue lowerV32I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ SDLoc DL(Op);
+ assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+ ArrayRef<int> Mask = SVOp->getMask();
+ assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
+ assert(Subtarget->hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
+
+ return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG);
+}
+
+/// \brief Handle lowering of 64-lane 8-bit integer shuffles.
+static SDValue lowerV64I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ SDLoc DL(Op);
+ assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+ ArrayRef<int> Mask = SVOp->getMask();
+ assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
+ assert(Subtarget->hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
+
+ // FIXME: Implement direct support for this type!
+ return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
+}
+
+/// \brief High-level routine to lower various 512-bit x86 vector shuffles.
+///
+/// This routine either breaks down the specific type of a 512-bit x86 vector
+/// shuffle or splits it into two 256-bit shuffles and fuses the results back
+/// together based on the available instructions.
+static SDValue lower512BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+ MVT VT, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ SDLoc DL(Op);
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+ ArrayRef<int> Mask = SVOp->getMask();
+ assert(Subtarget->hasAVX512() &&
+ "Cannot lower 512-bit vectors w/ basic ISA!");
+
+ // Check for being able to broadcast a single element.
+ if (SDValue Broadcast =
+ lowerVectorShuffleAsBroadcast(DL, VT, V1, Mask, Subtarget, DAG))
+ return Broadcast;
+
+ // Dispatch to each element type for lowering. If we don't have support for
+ // specific element type shuffles at 512 bits, immediately split them and
+ // lower them. Each lowering routine of a given type is allowed to assume that
+ // the requisite ISA extensions for that element type are available.
+ switch (VT.SimpleTy) {
+ case MVT::v8f64:
+ return lowerV8F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
+ case MVT::v16f32:
+ return lowerV16F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
+ case MVT::v8i64:
+ return lowerV8I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
+ case MVT::v16i32:
+ return lowerV16I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
+ case MVT::v32i16:
+ if (Subtarget->hasBWI())
+ return lowerV32I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
+ break;
+ case MVT::v64i8:
+ if (Subtarget->hasBWI())
+ return lowerV64I8VectorShuffle(Op, V1, V2, Subtarget, DAG);
+ break;
+
+ default:
+ llvm_unreachable("Not a valid 512-bit x86 vector type!");
+ }
+
+ // Otherwise fall back on splitting.
+ return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
+}
+
+// Lower vXi1 vector shuffles.
+// There is no dedicated instruction on AVX-512 that shuffles mask registers.
+// The only way to shuffle bits is to sign-extend the mask vector to a SIMD
+// vector, shuffle, and then truncate it back.
+static SDValue lower1BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+ MVT VT, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ SDLoc DL(Op);
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+ ArrayRef<int> Mask = SVOp->getMask();
+ assert(Subtarget->hasAVX512() &&
+ "Cannot lower 512-bit vectors w/o basic ISA!");
+ MVT ExtVT;
+ switch (VT.SimpleTy) {
+ default:
+ llvm_unreachable("Expected a vector of i1 elements");
+ case MVT::v2i1:
+ ExtVT = MVT::v2i64;
+ break;
+ case MVT::v4i1:
+ ExtVT = MVT::v4i32;
+ break;
+ case MVT::v8i1:
+ ExtVT = MVT::v8i64; // Take 512-bit type, more shuffles on KNL
+ break;
+ case MVT::v16i1:
+ ExtVT = MVT::v16i32;
+ break;
+ case MVT::v32i1:
+ ExtVT = MVT::v32i16;
+ break;
+ case MVT::v64i1:
+ ExtVT = MVT::v64i8;
+ break;
+ }
+
+ if (ISD::isBuildVectorAllZeros(V1.getNode()))
+ V1 = getZeroVector(ExtVT, Subtarget, DAG, DL);
+ else if (ISD::isBuildVectorAllOnes(V1.getNode()))
+ V1 = getOnesVector(ExtVT, Subtarget, DAG, DL);
+ else
+ V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
+
+ if (V2.isUndef())
+ V2 = DAG.getUNDEF(ExtVT);
+ else if (ISD::isBuildVectorAllZeros(V2.getNode()))
+ V2 = getZeroVector(ExtVT, Subtarget, DAG, DL);
+ else if (ISD::isBuildVectorAllOnes(V2.getNode()))
+ V2 = getOnesVector(ExtVT, Subtarget, DAG, DL);
+ else
+ V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
+ return DAG.getNode(ISD::TRUNCATE, DL, VT,
+ DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask));
+}
+
+/// \brief Top-level lowering for x86 vector shuffles.
+///
+/// This handles decomposition, canonicalization, and lowering of all x86
+/// vector shuffles. Most of the specific lowering strategies are encapsulated
+/// above in helper routines. The canonicalization attempts to widen shuffles
+/// to involve fewer lanes of wider elements, consolidate symmetric patterns
+/// s.t. only one of the two inputs needs to be tested, etc.
+static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+ ArrayRef<int> Mask = SVOp->getMask();
+ SDValue V1 = Op.getOperand(0);
+ SDValue V2 = Op.getOperand(1);
+ MVT VT = Op.getSimpleValueType();
+ int NumElements = VT.getVectorNumElements();
+ SDLoc dl(Op);
+ bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
+
+ assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
+ "Can't lower MMX shuffles");
+
+ bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
+ bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
+ if (V1IsUndef && V2IsUndef)
+ return DAG.getUNDEF(VT);
+
+ // When we create a shuffle node we put the UNDEF node to second operand,
+ // but in some cases the first operand may be transformed to UNDEF.
+ // In this case we should just commute the node.
+ if (V1IsUndef)
+ return DAG.getCommutedVectorShuffle(*SVOp);
+
+ // Check for non-undef masks pointing at an undef vector and make the masks
+ // undef as well. This makes it easier to match the shuffle based solely on
+ // the mask.
+ if (V2IsUndef)
+ for (int M : Mask)
+ if (M >= NumElements) {
+ SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
+ for (int &M : NewMask)
+ if (M >= NumElements)
+ M = -1;
+ return DAG.getVectorShuffle(VT, dl, V1, V2, NewMask);
+ }
+
+ // We actually see shuffles that are entirely re-arrangements of a set of
+ // zero inputs. This mostly happens while decomposing complex shuffles into
+ // simple ones. Directly lower these as a buildvector of zeros.
+ SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+ if (Zeroable.all())
+ return getZeroVector(VT, Subtarget, DAG, dl);
+
+ // Try to collapse shuffles into using a vector type with fewer elements but
+ // wider element types. We cap this to not form integers or floating point
+ // elements wider than 64 bits, but it might be interesting to form i128
+ // integers to handle flipping the low and high halves of AVX 256-bit vectors.
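+ // For example, a v4i32 mask of <0, 1, 4, 5> widens to the v2i64 mask <0, 2>.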
+ SmallVector<int, 16> WidenedMask;
+ if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
+ canWidenShuffleElements(Mask, WidenedMask)) {
+ MVT NewEltVT = VT.isFloatingPoint()
+ ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
+ : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
+ MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
+ // Make sure that the new vector type is legal. For example, v2f64 isn't
+ // legal on SSE1.
+ if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
+ V1 = DAG.getBitcast(NewVT, V1);
+ V2 = DAG.getBitcast(NewVT, V2);
+ return DAG.getBitcast(
+ VT, DAG.getVectorShuffle(NewVT, dl, V1, V2, WidenedMask));
+ }
+ }
+
+ int NumV1Elements = 0, NumUndefElements = 0, NumV2Elements = 0;
+ for (int M : SVOp->getMask())
+ if (M < 0)
+ ++NumUndefElements;
+ else if (M < NumElements)
+ ++NumV1Elements;
+ else
+ ++NumV2Elements;
+
+ // Commute the shuffle as needed such that more elements come from V1 than
+ // V2. This allows us to match the shuffle pattern strictly on how many
+ // elements come from V1 without handling the symmetric cases.
+ if (NumV2Elements > NumV1Elements)
+ return DAG.getCommutedVectorShuffle(*SVOp);
+
+ // When the number of V1 and V2 elements are the same, try to minimize the
+ // number of uses of V2 in the low half of the vector. When that is tied,
+ // ensure that the sum of indices for V1 is equal to or lower than the sum of
+ // indices for V2. When those are equal, try to ensure that the number of odd
+ // indices for V1 is lower than the number of odd indices for V2.
+ if (NumV1Elements == NumV2Elements) {
+ int LowV1Elements = 0, LowV2Elements = 0;
+ for (int M : SVOp->getMask().slice(0, NumElements / 2))
+ if (M >= NumElements)
+ ++LowV2Elements;
+ else if (M >= 0)
+ ++LowV1Elements;
+ if (LowV2Elements > LowV1Elements) {
+ return DAG.getCommutedVectorShuffle(*SVOp);
+ } else if (LowV2Elements == LowV1Elements) {
+ int SumV1Indices = 0, SumV2Indices = 0;
+ for (int i = 0, Size = SVOp->getMask().size(); i < Size; ++i)
+ if (SVOp->getMask()[i] >= NumElements)
+ SumV2Indices += i;
+ else if (SVOp->getMask()[i] >= 0)
+ SumV1Indices += i;
+ if (SumV2Indices < SumV1Indices) {
+ return DAG.getCommutedVectorShuffle(*SVOp);
+ } else if (SumV2Indices == SumV1Indices) {
+ int NumV1OddIndices = 0, NumV2OddIndices = 0;
+ for (int i = 0, Size = SVOp->getMask().size(); i < Size; ++i)
+ if (SVOp->getMask()[i] >= NumElements)
+ NumV2OddIndices += i % 2;
+ else if (SVOp->getMask()[i] >= 0)
+ NumV1OddIndices += i % 2;
+ if (NumV2OddIndices < NumV1OddIndices)
+ return DAG.getCommutedVectorShuffle(*SVOp);
+ }
+ }
+ }
+
+ // For each vector width, delegate to a specialized lowering routine.
+ if (VT.is128BitVector())
+ return lower128BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
+
+ if (VT.is256BitVector())
+ return lower256BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
+
+ if (VT.is512BitVector())
+ return lower512BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
+
+ if (Is1BitVector)
+ return lower1BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
+ llvm_unreachable("Unimplemented!");
+}
+
+// This function assumes its argument is a BUILD_VECTOR of constants or
+// undef SDNodes. i.e: ISD::isBuildVectorOfConstantSDNodes(BuildVector) is
+// true.
+static bool BUILD_VECTORtoBlendMask(BuildVectorSDNode *BuildVector,
+ unsigned &MaskValue) {
+ MaskValue = 0;
+ unsigned NumElems = BuildVector->getNumOperands();
+
+ // There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
+ // We don't handle the >2 lanes case right now.
+ unsigned NumLanes = (NumElems - 1) / 8 + 1;
+ if (NumLanes > 2)
+ return false;
+
+ unsigned NumElemsInLane = NumElems / NumLanes;
+
+ // Blend for v16i16 should be symmetric for both lanes.
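+ // For example, a v4i32 condition of <~0, 0, 0, ~0> selects the first
+ // operand in elements 0 and 3 and encodes as MaskValue == 0b0110.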
+ for (unsigned i = 0; i < NumElemsInLane; ++i) {
+ SDValue EltCond = BuildVector->getOperand(i);
+ SDValue SndLaneEltCond =
+ (NumLanes == 2) ? BuildVector->getOperand(i + NumElemsInLane) : EltCond;
+
+ int Lane1Cond = -1, Lane2Cond = -1;
+ if (isa<ConstantSDNode>(EltCond))
+ Lane1Cond = !isNullConstant(EltCond);
+ if (isa<ConstantSDNode>(SndLaneEltCond))
+ Lane2Cond = !isNullConstant(SndLaneEltCond);
+
+ unsigned LaneMask = 0;
+ if (Lane1Cond == Lane2Cond || Lane2Cond < 0)
+ // Lane1Cond != 0 means we want the first argument.
+ // Lane1Cond == 0 means we want the second argument.
+ // The encoding of this argument is 0 for the first argument and 1
+ // for the second. Therefore, invert the condition.
+ LaneMask = !Lane1Cond << i;
+ else if (Lane1Cond < 0)
+ LaneMask = !Lane2Cond << i;
+ else
+ return false;
+
+ MaskValue |= LaneMask;
+ if (NumLanes == 2)
+ MaskValue |= LaneMask << NumElemsInLane;
+ }
+ return true;
+}
+
+/// \brief Try to lower a VSELECT instruction to a vector shuffle.
+static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ SDValue Cond = Op.getOperand(0);
+ SDValue LHS = Op.getOperand(1);
+ SDValue RHS = Op.getOperand(2);
+ SDLoc dl(Op);
+ MVT VT = Op.getSimpleValueType();
+
+ if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
+ return SDValue();
+ auto *CondBV = cast<BuildVectorSDNode>(Cond);
+
+ // Only non-legal VSELECTs reach this lowering; convert those into generic
+ // shuffles and re-use the shuffle lowering path for blends.
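+ // For example, (vselect <-1, 0, 0, -1>, LHS, RHS) becomes the shuffle mask
+ // <0, 5, 6, 3>: true lanes pick LHS elements, false lanes pick RHS elements
+ // offset by the vector width.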
+ SmallVector<int, 32> Mask;
+ for (int i = 0, Size = VT.getVectorNumElements(); i < Size; ++i) {
+ SDValue CondElt = CondBV->getOperand(i);
+ Mask.push_back(
+ isa<ConstantSDNode>(CondElt) ? i + (isNullConstant(CondElt) ? Size : 0)
+ : -1);
+ }
+ return DAG.getVectorShuffle(VT, dl, LHS, RHS, Mask);
+}
+
+SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
+ // A vselect where all conditions and data are constants can be optimized into
+ // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
+ if (ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(0).getNode()) &&
+ ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(1).getNode()) &&
+ ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode()))
+ return SDValue();
+
+ // Try to lower this to a blend-style vector shuffle. This can handle all
+ // constant condition cases.
+ if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
+ return BlendOp;
+
+ // Variable blends are only legal from SSE4.1 onward.
+ if (!Subtarget->hasSSE41())
+ return SDValue();
+
+ // Only some types will be legal on some subtargets. If we can emit a legal
+ // VSELECT-matching blend, return Op; if we need to expand, return a null
+ // value.
+ switch (Op.getSimpleValueType().SimpleTy) {
+ default:
+ // Most of the vector types have blends past SSE4.1.
+ return Op;
+
+ case MVT::v32i8:
+ // The byte blends for AVX vectors were introduced only in AVX2.
+ if (Subtarget->hasAVX2())
+ return Op;
+
+ return SDValue();
+
+ case MVT::v8i16:
+ case MVT::v16i16:
+ // AVX-512 BWI and VLX features support VSELECT with i16 elements.
+ if (Subtarget->hasBWI() && Subtarget->hasVLX())
+ return Op;
+
+ // FIXME: We should custom lower this by fixing the condition and using i8
+ // blends.
+ return SDValue();
+ }
+}
+
+static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
+ MVT VT = Op.getSimpleValueType();
+ SDLoc dl(Op);
+
+ if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
+ return SDValue();
+
+ if (VT.getSizeInBits() == 8) {
+ SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
+ Op.getOperand(0), Op.getOperand(1));
+ SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
+ DAG.getValueType(VT));
+ return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
+ }
+
+ if (VT.getSizeInBits() == 16) {
+ // If Idx is 0, it's cheaper to do a move instead of a pextrw.
+ if (isNullConstant(Op.getOperand(1)))
+ return DAG.getNode(
+ ISD::TRUNCATE, dl, MVT::i16,
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
+ DAG.getBitcast(MVT::v4i32, Op.getOperand(0)),
+ Op.getOperand(1)));
+ SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
+ Op.getOperand(0), Op.getOperand(1));
+ SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
+ DAG.getValueType(VT));
+ return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
+ }
+
+ if (VT == MVT::f32) {
+ // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
+ // the result back to FR32 register. It's only worth matching if the
+ // result has a single use which is a store or a bitcast to i32. And in
+ // the case of a store, it's not worth it if the index is a constant 0,
+ // because a MOVSSmr can be used instead, which is smaller and faster.
+ if (!Op.hasOneUse())
+ return SDValue();
+ SDNode *User = *Op.getNode()->use_begin();
+ if ((User->getOpcode() != ISD::STORE ||
+ isNullConstant(Op.getOperand(1))) &&
+ (User->getOpcode() != ISD::BITCAST ||
+ User->getValueType(0) != MVT::i32))
+ return SDValue();
+ SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
+ DAG.getBitcast(MVT::v4i32, Op.getOperand(0)),
+ Op.getOperand(1));
+ return DAG.getBitcast(MVT::f32, Extract);
+ }
+
+ if (VT == MVT::i32 || VT == MVT::i64) {
+ // ExtractPS/pextrq works with constant index.
+ if (isa<ConstantSDNode>(Op.getOperand(1)))
+ return Op;
+ }
+ return SDValue();
+}
+
+/// Extract one bit from a mask vector, like v16i1 or v8i1.
+/// AVX-512 feature.
+SDValue
+X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const {
+ SDValue Vec = Op.getOperand(0);
+ SDLoc dl(Vec);
+ MVT VecVT = Vec.getSimpleValueType();
+ SDValue Idx = Op.getOperand(1);
+ MVT EltVT = Op.getSimpleValueType();
+
+ assert((EltVT == MVT::i1) && "Unexpected operands in ExtractBitFromMaskVector");
+ assert((VecVT.getVectorNumElements() <= 16 || Subtarget->hasBWI()) &&
+ "Unexpected vector type in ExtractBitFromMaskVector");
+
+ // A variable index can't be handled in mask registers;
+ // extend the vector to VR512.
+ if (!isa<ConstantSDNode>(Idx)) {
+ MVT ExtVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32);
+ SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Vec);
+ SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
+ ExtVT.getVectorElementType(), Ext, Idx);
+ return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
+ }
+
+ unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+ const TargetRegisterClass* rc = getRegClassFor(VecVT);
+ if (!Subtarget->hasDQI() && (VecVT.getVectorNumElements() <= 8))
+ rc = getRegClassFor(MVT::v16i1);
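+ // Isolate the requested bit with two shifts. For example, for bit 3 of a
+ // v16i1 mask held in a 16-bit register, shift left by 15 - 3 == 12 to place
+ // it at the MSB, then logical-shift right by 15 to land it in bit 0.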
+ unsigned MaxShift = rc->getSize()*8 - 1;
+ Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
+ DAG.getConstant(MaxShift - IdxVal, dl, MVT::i8));
+ Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec,
+ DAG.getConstant(MaxShift, dl, MVT::i8));
+ return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i1, Vec,
+ DAG.getIntPtrConstant(0, dl));
+}
+
+SDValue
+X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc dl(Op);
+ SDValue Vec = Op.getOperand(0);
+ MVT VecVT = Vec.getSimpleValueType();
+ SDValue Idx = Op.getOperand(1);
+
+ if (Op.getSimpleValueType() == MVT::i1)
+ return ExtractBitFromMaskVector(Op, DAG);
+
+ if (!isa<ConstantSDNode>(Idx)) {
+ if (VecVT.is512BitVector() ||
+ (VecVT.is256BitVector() && Subtarget->hasInt256() &&
+ VecVT.getVectorElementType().getSizeInBits() == 32)) {
+
+ MVT MaskEltVT =
+ MVT::getIntegerVT(VecVT.getVectorElementType().getSizeInBits());
+ MVT MaskVT = MVT::getVectorVT(MaskEltVT, VecVT.getSizeInBits() /
+ MaskEltVT.getSizeInBits());
+
+ Idx = DAG.getZExtOrTrunc(Idx, dl, MaskEltVT);
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
+ SDValue Mask = DAG.getNode(X86ISD::VINSERT, dl, MaskVT,
+ getZeroVector(MaskVT, Subtarget, DAG, dl), Idx,
+ DAG.getConstant(0, dl, PtrVT));
+ SDValue Perm = DAG.getNode(X86ISD::VPERMV, dl, VecVT, Mask, Vec);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Perm,
+ DAG.getConstant(0, dl, PtrVT));
+ }
+ return SDValue();
+ }
+
+ // If this is a 256-bit vector result, first extract the 128-bit vector and
+ // then extract the element from the 128-bit vector.
+ if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
+
+ unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+ // Get the 128-bit vector.
+ Vec = Extract128BitVector(Vec, IdxVal, DAG, dl);
+ MVT EltVT = VecVT.getVectorElementType();
+
+ unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
+ assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
+
+ // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
+ // this can be done with a mask.
+ IdxVal &= ElemsPerChunk - 1;
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
+ DAG.getConstant(IdxVal, dl, MVT::i32));
+ }
+
+ assert(VecVT.is128BitVector() && "Unexpected vector length");
+
+ if (Subtarget->hasSSE41())
+ if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
+ return Res;
+
+ MVT VT = Op.getSimpleValueType();
+ // TODO: handle v16i8.
+ if (VT.getSizeInBits() == 16) {
+ SDValue Vec = Op.getOperand(0);
+ if (isNullConstant(Op.getOperand(1)))
+ return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
+ DAG.getBitcast(MVT::v4i32, Vec),
+ Op.getOperand(1)));
+ // Transform it so it matches pextrw, which produces a 32-bit result.
+ MVT EltVT = MVT::i32;
+ SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT,
+ Op.getOperand(0), Op.getOperand(1));
+ SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract,
+ DAG.getValueType(VT));
+ return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
+ }
+
+ if (VT.getSizeInBits() == 32) {
+ unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+ if (Idx == 0)
+ return Op;
+
+ // SHUFPS the element to the lowest double word, then movss.
+ int Mask[4] = { static_cast<int>(Idx), -1, -1, -1 };
+ MVT VVT = Op.getOperand(0).getSimpleValueType();
+ SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
+ DAG.getUNDEF(VVT), Mask);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
+ DAG.getIntPtrConstant(0, dl));
+ }
+
+ if (VT.getSizeInBits() == 64) {
+ // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
+ // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
+ // to match extract_elt for f64.
+ if (isNullConstant(Op.getOperand(1)))
+ return Op;
+
+ // UNPCKHPD the element to the lowest double word, then movsd.
+ // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
+ // to a f64mem, the whole operation is folded into a single MOVHPDmr.
+ int Mask[2] = { 1, -1 };
+ MVT VVT = Op.getOperand(0).getSimpleValueType();
+ SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
+ DAG.getUNDEF(VVT), Mask);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
+ DAG.getIntPtrConstant(0, dl));
+ }
+
+ return SDValue();
+}
+
+/// Insert one bit into a mask vector, like v16i1 or v8i1.
+/// AVX-512 feature.
+SDValue
+X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc dl(Op);
+ SDValue Vec = Op.getOperand(0);
+ SDValue Elt = Op.getOperand(1);
+ SDValue Idx = Op.getOperand(2);
+ MVT VecVT = Vec.getSimpleValueType();
+
+ if (!isa<ConstantSDNode>(Idx)) {
+ // Non-constant index: extend the source and destination,
+ // insert the element, and then truncate the result.
+ MVT ExtVecVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32);
+ MVT ExtEltVT = (VecVT == MVT::v8i1 ? MVT::i64 : MVT::i32);
+ SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
+ DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec),
+ DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx);
+ return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
+ }
+
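+ // For a constant index, build the bit in a vector and merge it in: e.g. for
+ // index 5, SCALAR_TO_VECTOR places the bit at position 0, VSHLI moves it to
+ // position 5, and the final OR combines it with the existing mask.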
+ unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+ SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt);
+ if (IdxVal)
+ EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
+ DAG.getConstant(IdxVal, dl, MVT::i8));
+ if (Vec.getOpcode() == ISD::UNDEF)
+ return EltInVec;
+ return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
+}
+
+SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
+ SelectionDAG &DAG) const {
+ MVT VT = Op.getSimpleValueType();
+ MVT EltVT = VT.getVectorElementType();
+
+ if (EltVT == MVT::i1)
+ return InsertBitToMaskVector(Op, DAG);
+
+ SDLoc dl(Op);
+ SDValue N0 = Op.getOperand(0);
+ SDValue N1 = Op.getOperand(1);
+ SDValue N2 = Op.getOperand(2);
+ if (!isa<ConstantSDNode>(N2))
+ return SDValue();
+ auto *N2C = cast<ConstantSDNode>(N2);
+ unsigned IdxVal = N2C->getZExtValue();
+
+ // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
+ // into that, and then insert the subvector back into the result.
+ if (VT.is256BitVector() || VT.is512BitVector()) {
+ // With a 256-bit vector, we can insert into the zero element efficiently
+ // using a blend if we have AVX or AVX2 and the right data type.
+ if (VT.is256BitVector() && IdxVal == 0) {
+ // TODO: It is worthwhile to cast integer to floating point and back
+ // and incur a domain crossing penalty if that's what we'll end up
+ // doing anyway after extracting to a 128-bit vector.
+ if ((Subtarget->hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
+ (Subtarget->hasAVX2() && EltVT == MVT::i32)) {
+ SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
+ N2 = DAG.getIntPtrConstant(1, dl);
+ return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, N2);
+ }
+ }
+
+ // Get the desired 128-bit vector chunk.
+ SDValue V = Extract128BitVector(N0, IdxVal, DAG, dl);
+
+ // Insert the element into the desired chunk.
+ unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
+ assert(isPowerOf2_32(NumEltsIn128));
+ // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
+ unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
+
+ V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
+ DAG.getConstant(IdxIn128, dl, MVT::i32));
+
+ // Insert the changed part back into the bigger vector.
+ return Insert128BitVector(N0, V, IdxVal, DAG, dl);
+ }
+ assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
+
+ if (Subtarget->hasSSE41()) {
+ if (EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) {
+ unsigned Opc;
+ if (VT == MVT::v8i16) {
+ Opc = X86ISD::PINSRW;
+ } else {
+ assert(VT == MVT::v16i8);
+ Opc = X86ISD::PINSRB;
+ }
+
+ // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
+ // argument.
+ if (N1.getValueType() != MVT::i32)
+ N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
+ if (N2.getValueType() != MVT::i32)
+ N2 = DAG.getIntPtrConstant(IdxVal, dl);
+ return DAG.getNode(Opc, dl, VT, N0, N1, N2);
+ }
+
+ if (EltVT == MVT::f32) {
+ // Bits [7:6] of the constant are the source select. This will always be
+ // zero here. The DAG Combiner may combine an extract_elt index into
+ // these bits. For example (insert (extract, 3), 2) could be matched by
+ // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
+ // Bits [5:4] of the constant are the destination select. This is the
+ // value of the incoming immediate.
+ // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
+ // combine either bitwise AND or insert of float 0.0 to set these bits.
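+ // For example, an insertion into element 2 with no zeroing uses the
+ // immediate (2 << 4) == 0x20.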
+
+ bool MinSize = DAG.getMachineFunction().getFunction()->optForMinSize();
+ if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
+ // If this is an insertion of 32-bits into the low 32-bits of
+ // a vector, we prefer to generate a blend with immediate rather
+ // than an insertps. Blends are simpler operations in hardware and so
+ // will always have equal or better performance than insertps.
+ // But if optimizing for size and there's a load folding opportunity,
+ // generate insertps because blendps does not have a 32-bit memory
+ // operand form.
+ N2 = DAG.getIntPtrConstant(1, dl);
+ N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
+ return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, N2);
+ }
+ N2 = DAG.getIntPtrConstant(IdxVal << 4, dl);
+ // Create this as a scalar-to-vector node.
+ N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
+ return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
+ }
+
+ if (EltVT == MVT::i32 || EltVT == MVT::i64) {
+ // PINSR* works with constant index.
+ return Op;
+ }
+ }
+
+ if (EltVT == MVT::i8)
+ return SDValue();
+
+ if (EltVT.getSizeInBits() == 16) {
+ // Transform it so it matches pinsrw, which expects a 16-bit value in a GR32
+ // as its second argument.
+ if (N1.getValueType() != MVT::i32)
+ N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
+ if (N2.getValueType() != MVT::i32)
+ N2 = DAG.getIntPtrConstant(IdxVal, dl);
+ return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2);
+ }
+ return SDValue();
+}
+
+static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
+ SDLoc dl(Op);
+ MVT OpVT = Op.getSimpleValueType();
+
+ // If this is a 256-bit vector result, first insert into a 128-bit
+ // vector and then insert into the 256-bit vector.
+ if (!OpVT.is128BitVector()) {
+ // Insert into a 128-bit vector.
+ unsigned SizeFactor = OpVT.getSizeInBits()/128;
+ MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
+ OpVT.getVectorNumElements() / SizeFactor);
+
+ Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
+
+ // Insert the 128-bit vector.
+ return Insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
+ }
+
+ if (OpVT == MVT::v1i64 &&
+ Op.getOperand(0).getValueType() == MVT::i64)
+ return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0));
+
+ SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
+ assert(OpVT.is128BitVector() && "Expected an SSE type!");
+ return DAG.getBitcast(
+ OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
+}
+
+// Lower a node with an EXTRACT_SUBVECTOR opcode. This may result in
+// a simple subregister reference or explicit instructions to grab
+// the upper bits of a vector.
+static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ SDLoc dl(Op);
+ SDValue In = Op.getOperand(0);
+ SDValue Idx = Op.getOperand(1);
+ unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+ MVT ResVT = Op.getSimpleValueType();
+ MVT InVT = In.getSimpleValueType();
+
+ if (Subtarget->hasFp256()) {
+ if (ResVT.is128BitVector() &&
+ (InVT.is256BitVector() || InVT.is512BitVector()) &&
+ isa<ConstantSDNode>(Idx)) {
+ return Extract128BitVector(In, IdxVal, DAG, dl);
+ }
+ if (ResVT.is256BitVector() && InVT.is512BitVector() &&
+ isa<ConstantSDNode>(Idx)) {
+ return Extract256BitVector(In, IdxVal, DAG, dl);
+ }
+ }
+ return SDValue();
+}
+
+// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
+// simple superregister reference or explicit instructions to insert
+// the upper bits of a vector.
+static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ if (!Subtarget->hasAVX())
+ return SDValue();
+
+ SDLoc dl(Op);
+ SDValue Vec = Op.getOperand(0);
+ SDValue SubVec = Op.getOperand(1);
+ SDValue Idx = Op.getOperand(2);
+
+ if (!isa<ConstantSDNode>(Idx))
+ return SDValue();
+
+ unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+ MVT OpVT = Op.getSimpleValueType();
+ MVT SubVecVT = SubVec.getSimpleValueType();
+
+ // Fold two 16-byte subvector loads into one 32-byte load:
+ // (insert_subvector (insert_subvector undef, (load addr), 0),
+ // (load addr + 16), Elts/2)
+ // --> load32 addr
+ if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
+ Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
+ OpVT.is256BitVector() && SubVecVT.is128BitVector()) {
+ auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2));
+ if (Idx2 && Idx2->getZExtValue() == 0) {
+ SDValue SubVec2 = Vec.getOperand(1);
+ // If needed, look through a bitcast to get to the load.
+ if (SubVec2.getNode() && SubVec2.getOpcode() == ISD::BITCAST)
+ SubVec2 = SubVec2.getOperand(0);
+
+ if (auto *FirstLd = dyn_cast<LoadSDNode>(SubVec2)) {
+ bool Fast;
+ unsigned Alignment = FirstLd->getAlignment();
+ unsigned AS = FirstLd->getAddressSpace();
+ const X86TargetLowering *TLI = Subtarget->getTargetLowering();
+ if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
+ OpVT, AS, Alignment, &Fast) && Fast) {
+ SDValue Ops[] = { SubVec2, SubVec };
+ if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false))
+ return Ld;
+ }
+ }
+ }
+ }
+
+ if ((OpVT.is256BitVector() || OpVT.is512BitVector()) &&
+ SubVecVT.is128BitVector())
+ return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
+
+ if (OpVT.is512BitVector() && SubVecVT.is256BitVector())
+ return Insert256BitVector(Vec, SubVec, IdxVal, DAG, dl);
+
+ if (OpVT.getVectorElementType() == MVT::i1)
+ return Insert1BitVector(Op, DAG);
+
+ return SDValue();
+}
+
+// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
+// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
+// one of the above-mentioned nodes. It has to be wrapped because otherwise
+// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
+// be used to form an addressing mode. These wrapped nodes will be selected
+// into MOV32ri.
+SDValue
+X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
+ ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
+
+ // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
+ // global base reg.
+ unsigned char OpFlag = 0;
+ unsigned WrapperKind = X86ISD::Wrapper;
+ CodeModel::Model M = DAG.getTarget().getCodeModel();
+
+ if (Subtarget->isPICStyleRIPRel() &&
+ (M == CodeModel::Small || M == CodeModel::Kernel))
+ WrapperKind = X86ISD::WrapperRIP;
+ else if (Subtarget->isPICStyleGOT())
+ OpFlag = X86II::MO_GOTOFF;
+ else if (Subtarget->isPICStyleStubPIC())
+ OpFlag = X86II::MO_PIC_BASE_OFFSET;
+
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
+ SDValue Result = DAG.getTargetConstantPool(
+ CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), OpFlag);
+ SDLoc DL(CP);
+ Result = DAG.getNode(WrapperKind, DL, PtrVT, Result);
+ // With PIC, the address is actually $g + Offset.
+ if (OpFlag) {
+ Result =
+ DAG.getNode(ISD::ADD, DL, PtrVT,
+ DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
+ }
+
+ return Result;
+}
+
+SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
+ JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
+
+ // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
+ // global base reg.
+ unsigned char OpFlag = 0;
+ unsigned WrapperKind = X86ISD::Wrapper;
+ CodeModel::Model M = DAG.getTarget().getCodeModel();
+
+ if (Subtarget->isPICStyleRIPRel() &&
+ (M == CodeModel::Small || M == CodeModel::Kernel))
+ WrapperKind = X86ISD::WrapperRIP;
+ else if (Subtarget->isPICStyleGOT())
+ OpFlag = X86II::MO_GOTOFF;
+ else if (Subtarget->isPICStyleStubPIC())
+ OpFlag = X86II::MO_PIC_BASE_OFFSET;
+
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
+ SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
+ SDLoc DL(JT);
+ Result = DAG.getNode(WrapperKind, DL, PtrVT, Result);
+
+ // With PIC, the address is actually $g + Offset.
+ if (OpFlag)
+ Result =
+ DAG.getNode(ISD::ADD, DL, PtrVT,
+ DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
+
+ return Result;
+}
+
+SDValue
+X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
+ const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
+
+ // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
+ // global base reg.
+ unsigned char OpFlag = 0;
+ unsigned WrapperKind = X86ISD::Wrapper;
+ CodeModel::Model M = DAG.getTarget().getCodeModel();
+
+ if (Subtarget->isPICStyleRIPRel() &&
+ (M == CodeModel::Small || M == CodeModel::Kernel)) {
+ if (Subtarget->isTargetDarwin() || Subtarget->isTargetELF())
+ OpFlag = X86II::MO_GOTPCREL;
+ WrapperKind = X86ISD::WrapperRIP;
+ } else if (Subtarget->isPICStyleGOT()) {
+ OpFlag = X86II::MO_GOT;
+ } else if (Subtarget->isPICStyleStubPIC()) {
+ OpFlag = X86II::MO_DARWIN_NONLAZY_PIC_BASE;
+ } else if (Subtarget->isPICStyleStubNoDynamic()) {
+ OpFlag = X86II::MO_DARWIN_NONLAZY;
+ }
+
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
+ SDValue Result = DAG.getTargetExternalSymbol(Sym, PtrVT, OpFlag);
+
+ SDLoc DL(Op);
+ Result = DAG.getNode(WrapperKind, DL, PtrVT, Result);
+
+ // With PIC, the address is actually $g + Offset.
+ if (DAG.getTarget().getRelocationModel() == Reloc::PIC_ &&
+ !Subtarget->is64Bit()) {
+ Result =
+ DAG.getNode(ISD::ADD, DL, PtrVT,
+ DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
+ }
+
+ // For symbols that require a load from a stub to get the address, emit the
+ // load.
+ if (isGlobalStubReference(OpFlag))
+ Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
+ MachinePointerInfo::getGOT(DAG.getMachineFunction()),
+ false, false, false, 0);
+
+ return Result;
+}
+
+SDValue
+X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
+ // Create the TargetBlockAddress node.
+ unsigned char OpFlags =
+ Subtarget->ClassifyBlockAddressReference();
+ CodeModel::Model M = DAG.getTarget().getCodeModel();
+ const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
+ int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
+ SDLoc dl(Op);
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
+ SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
+
+ if (Subtarget->isPICStyleRIPRel() &&
+ (M == CodeModel::Small || M == CodeModel::Kernel))
+ Result = DAG.getNode(X86ISD::WrapperRIP, dl, PtrVT, Result);
+ else
+ Result = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, Result);
+
+ // With PIC, the address is actually $g + Offset.
+ if (isGlobalRelativeToPICBase(OpFlags)) {
+ Result = DAG.getNode(ISD::ADD, dl, PtrVT,
+ DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
+ }
+
+ return Result;
+}
+
+SDValue
+X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, SDLoc dl,
+ int64_t Offset, SelectionDAG &DAG) const {
+ // Create the TargetGlobalAddress node, folding in the constant
+ // offset if it is legal.
+ unsigned char OpFlags =
+ Subtarget->ClassifyGlobalReference(GV, DAG.getTarget());
+ CodeModel::Model M = DAG.getTarget().getCodeModel();
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
+ SDValue Result;
+ if (OpFlags == X86II::MO_NO_FLAG &&
+ X86::isOffsetSuitableForCodeModel(Offset, M)) {
+ // A direct static reference to a global.
+ Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset);
+ Offset = 0;
+ } else {
+ Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, OpFlags);
+ }
+
+ if (Subtarget->isPICStyleRIPRel() &&
+ (M == CodeModel::Small || M == CodeModel::Kernel))
+ Result = DAG.getNode(X86ISD::WrapperRIP, dl, PtrVT, Result);
+ else
+ Result = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, Result);
+
+ // With PIC, the address is actually $g + Offset.
+ if (isGlobalRelativeToPICBase(OpFlags)) {
+ Result = DAG.getNode(ISD::ADD, dl, PtrVT,
+ DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
+ }
+
+ // For globals that require a load from a stub to get the address, emit the
+ // load.
+ if (isGlobalStubReference(OpFlags))
+ Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
+ MachinePointerInfo::getGOT(DAG.getMachineFunction()),
+ false, false, false, 0);
+
+ // If there was a non-zero offset that we didn't fold, create an explicit
+ // addition for it.
+ if (Offset != 0)
+ Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
+ DAG.getConstant(Offset, dl, PtrVT));
+
+ return Result;
+}
+
+SDValue
+X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
+ const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+ int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
+ return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG);
+}
+
+static SDValue
+GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
+ SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
+ unsigned char OperandFlags, bool LocalDynamic = false) {
+ MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDLoc dl(GA);
+ SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
+ GA->getValueType(0),
+ GA->getOffset(),
+ OperandFlags);
+
+ X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
+ : X86ISD::TLSADDR;
+
+ if (InFlag) {
+ SDValue Ops[] = { Chain, TGA, *InFlag };
+ Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
+ } else {
+ SDValue Ops[] = { Chain, TGA };
+ Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
+ }
+
+ // TLSADDR will be codegen'ed as a call. Inform MFI that the function has calls.
+ MFI->setAdjustsStack(true);
+ MFI->setHasCalls(true);
+
+ SDValue Flag = Chain.getValue(1);
+ return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
+}
+
+// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32-bit
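+// Roughly (a sketch of the usual IA-32 ELF sequence, not a guaranteed
+// encoding):
+//   leal x@TLSGD(,%ebx,1), %eax
+//   call ___tls_get_addr@PLT
+// with the result returned in %eax, matching the GetTLSADDR call below.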
+static SDValue
+LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
+ const EVT PtrVT) {
+ SDValue InFlag;
+ SDLoc dl(GA); // ? The function entry point might be a better debug location.
+ SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
+ DAG.getNode(X86ISD::GlobalBaseReg,
+ SDLoc(), PtrVT), InFlag);
+ InFlag = Chain.getValue(1);
+
+ return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
+}
+
+// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64-bit
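+// Roughly (a sketch of the usual x86-64 ELF sequence, including the padding
+// bytes that allow linker relaxation):
+//   .byte 0x66; leaq x@TLSGD(%rip), %rdi
+//   .word 0x6666; rex64; call __tls_get_addr@PLT
+// with the result returned in %rax.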
+static SDValue
+LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
+ const EVT PtrVT) {
+ return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
+ X86::RAX, X86II::MO_TLSGD);
+}
+
+static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
+ SelectionDAG &DAG,
+ const EVT PtrVT,
+ bool is64Bit) {
+ SDLoc dl(GA);
+
+ // Get the start address of the TLS block for this module.
+ X86MachineFunctionInfo* MFI = DAG.getMachineFunction()
+ .getInfo<X86MachineFunctionInfo>();
+ MFI->incNumLocalDynamicTLSAccesses();
+
+ SDValue Base;
+ if (is64Bit) {
+ Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
+ X86II::MO_TLSLD, /*LocalDynamic=*/true);
+ } else {
+ SDValue InFlag;
+ SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
+ DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
+ InFlag = Chain.getValue(1);
+ Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
+ X86II::MO_TLSLDM, /*LocalDynamic=*/true);
+ }
+
+ // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
+ // of Base.
+
+ // Build x@dtpoff.
+ unsigned char OperandFlags = X86II::MO_DTPOFF;
+ unsigned WrapperKind = X86ISD::Wrapper;
+ SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
+ GA->getValueType(0),
+ GA->getOffset(), OperandFlags);
+ SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
+
+ // Add x@dtpoff with the base.
+ return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
+}
+
+// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
+static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
+ const EVT PtrVT, TLSModel::Model model,
+ bool is64Bit, bool isPIC) {
+ SDLoc dl(GA);
+
+ // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
+ Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
+ is64Bit ? 257 : 256));
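+ // (In the X86 backend, address space 256 selects the %gs segment and 257
+ // selects %fs, which is how the segment override is encoded here.)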
+
+ SDValue ThreadPointer =
+ DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
+ MachinePointerInfo(Ptr), false, false, false, 0);
+
+ unsigned char OperandFlags = 0;
+ // Most TLS accesses are not RIP relative, even on x86-64. One exception is
+ // initial-exec.
+ unsigned WrapperKind = X86ISD::Wrapper;
+ if (model == TLSModel::LocalExec) {
+ OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
+ } else if (model == TLSModel::InitialExec) {
+ if (is64Bit) {
+ OperandFlags = X86II::MO_GOTTPOFF;
+ WrapperKind = X86ISD::WrapperRIP;
+ } else {
+ OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
+ }
+ } else {
+ llvm_unreachable("Unexpected model");
+ }
+
+ // emit "addl x@ntpoff,%eax" (local exec)
+ // or "addl x@indntpoff,%eax" (initial exec)
+ // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
+ SDValue TGA =
+ DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
+ GA->getOffset(), OperandFlags);
+ SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
+
+ if (model == TLSModel::InitialExec) {
+ if (isPIC && !is64Bit) {
+ Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
+ DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
+ Offset);
+ }
+
+ Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
+ MachinePointerInfo::getGOT(DAG.getMachineFunction()),
+ false, false, false, 0);
+ }
+
+ // The address of the thread local variable is the add of the thread
+ // pointer with the offset of the variable.
+ return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
+}
+
+SDValue
+X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
+
+ GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
+
+ // Cygwin uses emutls.
+ // FIXME: Generic EmulatedTLS may also be needed for X86-Android.
+ if (Subtarget->isTargetWindowsCygwin())
+ return LowerToTLSEmulatedModel(GA, DAG);
+
+ const GlobalValue *GV = GA->getGlobal();
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
+
+ if (Subtarget->isTargetELF()) {
+ if (DAG.getTarget().Options.EmulatedTLS)
+ return LowerToTLSEmulatedModel(GA, DAG);
+ TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
+ switch (model) {
+ case TLSModel::GeneralDynamic:
+ if (Subtarget->is64Bit())
+ return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
+ return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
+ case TLSModel::LocalDynamic:
+ return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT,
+ Subtarget->is64Bit());
+ case TLSModel::InitialExec:
+ case TLSModel::LocalExec:
+ return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget->is64Bit(),
+ DAG.getTarget().getRelocationModel() ==
+ Reloc::PIC_);
+ }
+ llvm_unreachable("Unknown TLS model.");
+ }
+
+ if (Subtarget->isTargetDarwin()) {
+ // Darwin only has one model of TLS. Lower to that.
+ unsigned char OpFlag = 0;
+ unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ?
+ X86ISD::WrapperRIP : X86ISD::Wrapper;
+
+ // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
+ // global base reg.
+ bool PIC32 = (DAG.getTarget().getRelocationModel() == Reloc::PIC_) &&
+ !Subtarget->is64Bit();
+ if (PIC32)
+ OpFlag = X86II::MO_TLVP_PIC_BASE;
+ else
+ OpFlag = X86II::MO_TLVP;
+ SDLoc DL(Op);
+ SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
+ GA->getValueType(0),
+ GA->getOffset(), OpFlag);
+ SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
+
+ // With PIC32, the address is actually $g + Offset.
+ if (PIC32)
+ Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
+ DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
+ Offset);
+
+ // Lowering the machine ISD node will make sure everything is in the right
+ // location.
+ SDValue Chain = DAG.getEntryNode();
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue Args[] = { Chain, Offset };
+ Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
+
+ // TLSCALL will be codegen'ed as a call. Inform MFI that the function has calls.
+ MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+ MFI->setAdjustsStack(true);
+
+ // And our return value (tls address) is in the standard call return value
+ // location.
+ unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
+ return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
+ }
+
+ if (Subtarget->isTargetKnownWindowsMSVC() ||
+ Subtarget->isTargetWindowsGNU()) {
+ // Just use the implicit TLS architecture.
+ // We need to generate something similar to:
+ // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
+ // ; from TEB
+ // mov ecx, dword [rel _tls_index]; Load index (from C runtime)
+ // mov rcx, qword [rdx+rcx*8]
+ // mov eax, .tls$:tlsvar
+ // [rax+rcx] contains the address
+ // Windows 64bit: gs:0x58
+ // Windows 32bit: fs:__tls_array
+
+ SDLoc dl(GA);
+ SDValue Chain = DAG.getEntryNode();
+
+ // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
+ // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
+ // use its literal value of 0x2C.
+ Value *Ptr = Constant::getNullValue(Subtarget->is64Bit()
+ ? Type::getInt8PtrTy(*DAG.getContext(),
+ 256)
+ : Type::getInt32PtrTy(*DAG.getContext(),
+ 257));
+
+ SDValue TlsArray = Subtarget->is64Bit()
+ ? DAG.getIntPtrConstant(0x58, dl)
+ : (Subtarget->isTargetWindowsGNU()
+ ? DAG.getIntPtrConstant(0x2C, dl)
+ : DAG.getExternalSymbol("_tls_array", PtrVT));
+
+ SDValue ThreadPointer =
+ DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr), false,
+ false, false, 0);
+
+ SDValue res;
+ if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
+ res = ThreadPointer;
+ } else {
+ // Load the _tls_index variable
+ SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
+ if (Subtarget->is64Bit())
+ IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
+ MachinePointerInfo(), MVT::i32, false, false,
+ false, 0);
+ else
+ IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo(), false,
+ false, false, 0);
+
+ auto &DL = DAG.getDataLayout();
+ SDValue Scale =
+ DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, PtrVT);
+ IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
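+ // Scale is log2 of the pointer size (2 for 32-bit, 3 for 64-bit), so the
+ // shift computes _tls_index * sizeof(void*), the byte offset of this
+ // module's slot in the TLS pointer array loaded above.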
+
+ res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
+ }
+
+ res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo(), false, false,
+ false, 0);
+
+ // Get the offset of the variable from the start of the .tls section.
+ SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
+ GA->getValueType(0),
+ GA->getOffset(), X86II::MO_SECREL);
+ SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
+
+ // The address of the thread local variable is the add of the thread
+ // pointer with the offset of the variable.
+ return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
+ }
+
+ llvm_unreachable("TLS not implemented for this target.");
+}
+
+/// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values
+/// and take a 2 x i32 value to shift plus a shift amount.
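+/// For example (a sketch of the intended semantics), SHL_PARTS with 32-bit
+/// parts computes:
+///   if ((Amt & 32) == 0) { Hi = SHLD(Hi, Lo, Amt); Lo = Lo << Amt; }
+///   else                 { Hi = Lo << (Amt & 31);  Lo = 0;         }
+/// and the CMOVs emitted below select between the two cases at run time.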
+static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
+ assert(Op.getNumOperands() == 3 && "Not a double-shift!");
+ MVT VT = Op.getSimpleValueType();
+ unsigned VTBits = VT.getSizeInBits();
+ SDLoc dl(Op);
+ bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
+ SDValue ShOpLo = Op.getOperand(0);
+ SDValue ShOpHi = Op.getOperand(1);
+ SDValue ShAmt = Op.getOperand(2);
+ // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the
+ // generic ISD nodes do not. Insert an AND to be safe; it's optimized away
+ // during isel.
+ SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
+ DAG.getConstant(VTBits - 1, dl, MVT::i8));
+ SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
+ DAG.getConstant(VTBits - 1, dl, MVT::i8))
+ : DAG.getConstant(0, dl, VT);
+
+ SDValue Tmp2, Tmp3;
+ if (Op.getOpcode() == ISD::SHL_PARTS) {
+ Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
+ Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
+ } else {
+ Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
+ Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
+ }
+
+ // If the shift amount is greater than or equal to the width of a part we can't
+ // rely on the results of shld/shrd. Insert a test and select the appropriate
+ // values for large shift amounts.
+ SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
+ DAG.getConstant(VTBits, dl, MVT::i8));
+ SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
+ AndNode, DAG.getConstant(0, dl, MVT::i8));
+
+ SDValue Hi, Lo;
+ SDValue CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
+ SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
+ SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };
+
+ if (Op.getOpcode() == ISD::SHL_PARTS) {
+ Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
+ Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
+ } else {
+ Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
+ Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
+ }
+
+ SDValue Ops[2] = { Lo, Hi };
+ return DAG.getMergeValues(Ops, dl);
+}
+
+SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDValue Src = Op.getOperand(0);
+ MVT SrcVT = Src.getSimpleValueType();
+ MVT VT = Op.getSimpleValueType();
+ SDLoc dl(Op);
+
+ if (SrcVT.isVector()) {
+ if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
+ return DAG.getNode(X86ISD::CVTDQ2PD, dl, VT,
+ DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
+ DAG.getUNDEF(SrcVT)));
+ }
+ if (SrcVT.getVectorElementType() == MVT::i1) {
+ MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
+ return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
+ DAG.getNode(ISD::SIGN_EXTEND, dl, IntegerVT, Src));
+ }
+ return SDValue();
+ }
+
+ assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
+ "Unknown SINT_TO_FP to lower!");
+
+ // These are really Legal; return the operand so the caller accepts it as
+ // Legal.
+ if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
+ return Op;
+ if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
+ Subtarget->is64Bit()) {
+ return Op;
+ }
+
+ unsigned Size = SrcVT.getSizeInBits()/8;
+ MachineFunction &MF = DAG.getMachineFunction();
+ auto PtrVT = getPointerTy(MF.getDataLayout());
+ int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false);
+ SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
+ SDValue Chain = DAG.getStore(
+ DAG.getEntryNode(), dl, Op.getOperand(0), StackSlot,
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), false,
+ false, 0);
+ return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
+}
+
+SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
+ SDValue StackSlot,
+ SelectionDAG &DAG) const {
+ // Build the FILD
+ SDLoc DL(Op);
+ SDVTList Tys;
+ bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
+ if (useSSE)
+ Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
+ else
+ Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
+
+ unsigned ByteSize = SrcVT.getSizeInBits()/8;
+
+ FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
+ MachineMemOperand *MMO;
+ if (FI) {
+ int SSFI = FI->getIndex();
+ MMO = DAG.getMachineFunction().getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
+ MachineMemOperand::MOLoad, ByteSize, ByteSize);
+ } else {
+ MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
+ StackSlot = StackSlot.getOperand(1);
+ }
+ SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
+ SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
+ X86ISD::FILD, DL,
+ Tys, Ops, SrcVT, MMO);
+
+ if (useSSE) {
+ Chain = Result.getValue(1);
+ SDValue InFlag = Result.getValue(2);
+
+ // FIXME: Currently the FST is flagged to the FILD_FLAG. This
+ // shouldn't be necessary except that RFP cannot be live across
+ // multiple blocks. When stackifier is fixed, they can be uncoupled.
+ MachineFunction &MF = DAG.getMachineFunction();
+ unsigned SSFISize = Op.getValueType().getSizeInBits()/8;
+ int SSFI = MF.getFrameInfo()->CreateStackObject(SSFISize, SSFISize, false);
+ auto PtrVT = getPointerTy(MF.getDataLayout());
+ SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
+ Tys = DAG.getVTList(MVT::Other);
+ SDValue Ops[] = {
+ Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
+ };
+ MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
+ MachineMemOperand::MOStore, SSFISize, SSFISize);
+
+ Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
+ Ops, Op.getValueType(), MMO);
+ Result = DAG.getLoad(
+ Op.getValueType(), DL, Chain, StackSlot,
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
+ false, false, false, 0);
+ }
+
+ return Result;
+}
+
+// LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion.
+SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
+ SelectionDAG &DAG) const {
+ // This algorithm is not obvious. Here is what we're trying to output:
+ /*
+ movq %rax, %xmm0
+ punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
+ subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
+ #ifdef __SSE3__
+ haddpd %xmm0, %xmm0
+ #else
+ pshufd $0x4e, %xmm0, %xmm1
+ addpd %xmm1, %xmm0
+ #endif
+ */
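+ // Why the magic constants work (a sketch): pairing the low 32 bits of the
+ // input with 0x43300000 forms the double 2^52 + lo, and pairing the high
+ // 32 bits with 0x45300000 forms 2^84 + hi * 2^32. Subtracting c1 =
+ // { 2^52, 2^84 } leaves exactly { lo, hi * 2^32 }, so the final horizontal
+ // add produces lo + hi * 2^32, i.e. the original value, with one rounding.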
+
+ SDLoc dl(Op);
+ LLVMContext *Context = DAG.getContext();
+
+ // Build some magic constants.
+ static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
+ Constant *C0 = ConstantDataVector::get(*Context, CV0);
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
+ SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, 16);
+
+ SmallVector<Constant*,2> CV1;
+ CV1.push_back(
+ ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
+ APInt(64, 0x4330000000000000ULL))));
+ CV1.push_back(
+ ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
+ APInt(64, 0x4530000000000000ULL))));
+ Constant *C1 = ConstantVector::get(CV1);
+ SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16);
+
+ // Load the 64-bit value into an XMM register.
+ SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
+ Op.getOperand(0));
+ SDValue CLod0 =
+ DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
+ false, false, false, 16);
+ SDValue Unpck1 =
+ getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
+
+ SDValue CLod1 =
+ DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
+ false, false, false, 16);
+ SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
+ // TODO: Are there any fast-math-flags to propagate here?
+ SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
+ SDValue Result;
+
+ if (Subtarget->hasSSE3()) {
+ // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
+ Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
+ } else {
+ SDValue S2F = DAG.getBitcast(MVT::v4i32, Sub);
+ SDValue Shuffle = getTargetShuffleNode(X86ISD::PSHUFD, dl, MVT::v4i32,
+ S2F, 0x4E, DAG);
+ Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
+ DAG.getBitcast(MVT::v2f64, Shuffle), Sub);
+ }
+
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
+ DAG.getIntPtrConstant(0, dl));
+}
+
+// LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion.
+SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc dl(Op);
+ // FP constant to bias correct the final result.
+ SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,
+ MVT::f64);
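+ // 0x4330000000000000 is 2^52. OR'ing a 32-bit value into the mantissa of
+ // 2^52 yields the double 2^52 + x exactly, so subtracting the bias below
+ // recovers x converted to double.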
+
+ // Load the 32-bit value into an XMM register.
+ SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
+ Op.getOperand(0));
+
+ // Zero out the upper parts of the register.
+ Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
+
+ Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
+ DAG.getBitcast(MVT::v2f64, Load),
+ DAG.getIntPtrConstant(0, dl));
+
+ // Or the load with the bias.
+ SDValue Or = DAG.getNode(
+ ISD::OR, dl, MVT::v2i64,
+ DAG.getBitcast(MVT::v2i64,
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Load)),
+ DAG.getBitcast(MVT::v2i64,
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
+ Or =
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
+ DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
+
+ // Subtract the bias.
+ // TODO: Are there any fast-math-flags to propagate here?
+ SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
+
+ // Handle final rounding.
+ MVT DestVT = Op.getSimpleValueType();
+
+ if (DestVT.bitsLT(MVT::f64))
+ return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
+ DAG.getIntPtrConstant(0, dl));
+ if (DestVT.bitsGT(MVT::f64))
+ return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
+
+ // Handle final rounding.
+ return Sub;
+}
+
+static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ // The algorithm is the following:
+ // #ifdef __SSE4_1__
+ // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
+ // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
+ // (uint4) 0x53000000, 0xaa);
+ // #else
+ // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
+ // uint4 hi = (v >> 16) | (uint4) 0x53000000;
+ // #endif
+ // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
+ // return (float4) lo + fhi;
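+ // A sketch of why this works: as float bit patterns, 0x4b000000 is 2^23
+ // and 0x53000000 is 2^39, so lo is the float 2^23 + (v & 0xffff) and hi is
+ // 2^39 + (v >> 16) * 2^16. Subtracting (0x1.0p39f + 0x1.0p23f) and summing
+ // cancels both biases, leaving (v & 0xffff) + (v >> 16) * 2^16 == v.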
+
+ // We shouldn't use it when unsafe-fp-math is enabled though: we might later
+ // reassociate the two FADDs, and if we do that, the algorithm fails
+ // spectacularly (PR24512).
+ // FIXME: If we ever have some kind of Machine FMF, this should be marked
+ // as non-fast and always be enabled. Why isn't SDAG FMF enough? Because
+ // there's also the MachineCombiner reassociations happening on Machine IR.
+ if (DAG.getTarget().Options.UnsafeFPMath)
+ return SDValue();
+
+ SDLoc DL(Op);
+ SDValue V = Op->getOperand(0);
+ MVT VecIntVT = V.getSimpleValueType();
+ bool Is128 = VecIntVT == MVT::v4i32;
+ MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
+ // If we convert to something other than the supported type, e.g., to v4f64,
+ // abort early.
+ if (VecFloatVT != Op->getSimpleValueType(0))
+ return SDValue();
+
+ unsigned NumElts = VecIntVT.getVectorNumElements();
+ assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
+ "Unsupported custom type");
+ assert(NumElts <= 8 && "The size of the constant array must be fixed");
+
+ // In the #ifdef/#else code, we have in common:
+ // - The vector of constants:
+ // -- 0x4b000000
+ // -- 0x53000000
+ // - A shift:
+ // -- v >> 16
+
+ // Create the splat vector for 0x4b000000.
+ SDValue CstLow = DAG.getConstant(0x4b000000, DL, MVT::i32);
+ SDValue CstLowArray[] = {CstLow, CstLow, CstLow, CstLow,
+ CstLow, CstLow, CstLow, CstLow};
+ SDValue VecCstLow = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT,
+ makeArrayRef(&CstLowArray[0], NumElts));
+ // Create the splat vector for 0x53000000.
+ SDValue CstHigh = DAG.getConstant(0x53000000, DL, MVT::i32);
+ SDValue CstHighArray[] = {CstHigh, CstHigh, CstHigh, CstHigh,
+ CstHigh, CstHigh, CstHigh, CstHigh};
+ SDValue VecCstHigh = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT,
+ makeArrayRef(&CstHighArray[0], NumElts));
+
+ // Create the right shift.
+ SDValue CstShift = DAG.getConstant(16, DL, MVT::i32);
+ SDValue CstShiftArray[] = {CstShift, CstShift, CstShift, CstShift,
+ CstShift, CstShift, CstShift, CstShift};
+ SDValue VecCstShift = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT,
+ makeArrayRef(&CstShiftArray[0], NumElts));
+ SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
+
+ SDValue Low, High;
+ if (Subtarget.hasSSE41()) {
+ MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
+ // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
+ SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
+ SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
+ // Low will be bitcasted right away, so do not bother bitcasting back to its
+ // original type.
+ Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
+ VecCstLowBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
+ // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
+ // (uint4) 0x53000000, 0xaa);
+ SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
+ SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
+ // High will be bitcasted right away, so do not bother bitcasting back to
+ // its original type.
+ High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
+ VecCstHighBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
+ } else {
+ SDValue CstMask = DAG.getConstant(0xffff, DL, MVT::i32);
+ SDValue VecCstMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT, CstMask,
+ CstMask, CstMask, CstMask);
+ // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
+ SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
+ Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
+
+ // uint4 hi = (v >> 16) | (uint4) 0x53000000;
+ High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
+ }
+
+ // Create the vector constant for -(0x1.0p39f + 0x1.0p23f).
+ SDValue CstFAdd = DAG.getConstantFP(
+ APFloat(APFloat::IEEEsingle, APInt(32, 0xD3000080)), DL, MVT::f32);
+ SDValue CstFAddArray[] = {CstFAdd, CstFAdd, CstFAdd, CstFAdd,
+ CstFAdd, CstFAdd, CstFAdd, CstFAdd};
+ SDValue VecCstFAdd = DAG.getNode(ISD::BUILD_VECTOR, DL, VecFloatVT,
+ makeArrayRef(&CstFAddArray[0], NumElts));
+
+ // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
+ SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
+ // TODO: Are there any fast-math-flags to propagate here?
+ SDValue FHigh =
+ DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd);
+ // return (float4) lo + fhi;
+ SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
+ return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
+}
+
+SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDValue N0 = Op.getOperand(0);
+ MVT SVT = N0.getSimpleValueType();
+ SDLoc dl(Op);
+
+ switch (SVT.SimpleTy) {
+ default:
+ llvm_unreachable("Custom UINT_TO_FP is not supported!");
+ case MVT::v4i8:
+ case MVT::v4i16:
+ case MVT::v8i8:
+ case MVT::v8i16: {
+ MVT NVT = MVT::getVectorVT(MVT::i32, SVT.getVectorNumElements());
+ return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
+ DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0));
+ }
+ case MVT::v4i32:
+ case MVT::v8i32:
+ return lowerUINT_TO_FP_vXi32(Op, DAG, *Subtarget);
+ case MVT::v16i8:
+ case MVT::v16i16:
+ assert(Subtarget->hasAVX512());
+ return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
+ DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, N0));
+ }
+}
+
+SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDValue N0 = Op.getOperand(0);
+ SDLoc dl(Op);
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
+
+ if (Op.getSimpleValueType().isVector())
+ return lowerUINT_TO_FP_vec(Op, DAG);
+
+ // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
+ // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
+ // the optimization here.
+ if (DAG.SignBitIsZero(N0))
+ return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);
+
+ MVT SrcVT = N0.getSimpleValueType();
+ MVT DstVT = Op.getSimpleValueType();
+
+ if (Subtarget->hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
+ (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget->is64Bit()))) {
+ // Conversions from unsigned i32 to f32/f64 are legal,
+ // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
+ return Op;
+ }
+
+ if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
+ return LowerUINT_TO_FP_i64(Op, DAG);
+ if (SrcVT == MVT::i32 && X86ScalarSSEf64)
+ return LowerUINT_TO_FP_i32(Op, DAG);
+ if (Subtarget->is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
+ return SDValue();
+
+ // Make a 64-bit buffer, and use it to build an FILD.
+ SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
+ if (SrcVT == MVT::i32) {
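+ // Store the i32 value in the low word and zero in the high word; the
+ // 64-bit FILD then sees the zero-extended (hence non-negative) value, so
+ // no sign fixup is needed on this path.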
+ SDValue WordOff = DAG.getConstant(4, dl, PtrVT);
+ SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl, PtrVT, StackSlot, WordOff);
+ SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
+ StackSlot, MachinePointerInfo(),
+ false, false, 0);
+ SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
+ OffsetSlot, MachinePointerInfo(),
+ false, false, 0);
+ SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
+ return Fild;
+ }
+
+ assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
+ SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
+ StackSlot, MachinePointerInfo(),
+ false, false, 0);
+ // For i64 source, we need to add the appropriate power of 2 if the input
+ // was negative. This is the same as the optimization in
+ // DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
+ // we must be careful to do the computation in x87 extended precision, not
+ // in SSE. (The generic code can't know it's OK to do this, or how to.)
+ int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
+ MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
+ MachineMemOperand::MOLoad, 8, 8);
+
+ SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
+ SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
+ SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
+ MVT::i64, MMO);
+
+ APInt FF(32, 0x5F800000ULL);
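+ // 0x5F800000 is 2^64 as an IEEE single. FILD interpreted the stored bits
+ // as a signed i64, so when the sign bit was set the result is 2^64 too
+ // small; the fudge factor selected below adds it back.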
+
+ // Check whether the sign bit is set.
+ SDValue SignSet = DAG.getSetCC(
+ dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
+ Op.getOperand(0), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
+
+ // Build a 64-bit pair (0, FF) in the constant pool, with FF in the lo bits.
+ SDValue FudgePtr = DAG.getConstantPool(
+ ConstantInt::get(*DAG.getContext(), FF.zext(64)), PtrVT);
+
+ // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
+ SDValue Zero = DAG.getIntPtrConstant(0, dl);
+ SDValue Four = DAG.getIntPtrConstant(4, dl);
+ SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet,
+ Zero, Four);
+ FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
+
+ // Load the value out, extending it from f32 to f80.
+ // FIXME: Avoid the extend by constructing the right constant pool?
+ SDValue Fudge = DAG.getExtLoad(
+ ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), FudgePtr,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
+ false, false, false, 4);
+ // Extend everything to 80 bits to force it to be done on x87.
+ // TODO: Are there any fast-math-flags to propagate here?
+ SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
+ return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
+ DAG.getIntPtrConstant(0, dl));
+}
+
+// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
+// is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
+// just return an <SDValue(), SDValue()> pair.
+// Otherwise it is assumed to be a conversion from one of f32, f64 or f80
+// to i16, i32 or i64, and we lower it to a legal sequence.
+// If lowered to the final integer result we return a <result, SDValue()> pair.
+// Otherwise we lower it to a sequence ending with a FIST, return a
+// <FIST, StackSlot> pair, and the caller is responsible for loading
+// the final integer result from StackSlot.
+std::pair<SDValue,SDValue>
+X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
+ bool IsSigned, bool IsReplace) const {
+ SDLoc DL(Op);
+
+ EVT DstTy = Op.getValueType();
+ EVT TheVT = Op.getOperand(0).getValueType();
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
+
+ if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
+ // f16 must be promoted before using the lowering in this routine.
+ // fp128 does not use this lowering.
+ return std::make_pair(SDValue(), SDValue());
+ }
+
+ // If using FIST to compute an unsigned i64, we'll need some fixup
+ // to handle values above the maximum signed i64. A FIST is always
+ // used for the 32-bit subtarget, and also for f80 on a 64-bit target.
+ bool UnsignedFixup = !IsSigned &&
+ DstTy == MVT::i64 &&
+ (!Subtarget->is64Bit() ||
+ !isScalarFPTypeInSSEReg(TheVT));
+
+ if (!IsSigned && DstTy != MVT::i64 && !Subtarget->hasAVX512()) {
+ // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
+ // The low 32 bits of the fist result will have the correct uint32 result.
+ assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
+ DstTy = MVT::i64;
+ }
+
+ assert(DstTy.getSimpleVT() <= MVT::i64 &&
+ DstTy.getSimpleVT() >= MVT::i16 &&
+ "Unknown FP_TO_INT to lower!");
+
+ // These are really Legal.
+ if (DstTy == MVT::i32 &&
+ isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
+ return std::make_pair(SDValue(), SDValue());
+ if (Subtarget->is64Bit() &&
+ DstTy == MVT::i64 &&
+ isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
+ return std::make_pair(SDValue(), SDValue());
+
+ // We lower FP->int64 into FISTP64 followed by a load from a temporary
+ // stack slot.
+ MachineFunction &MF = DAG.getMachineFunction();
+ unsigned MemSize = DstTy.getSizeInBits()/8;
+ int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
+ SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
+
+ unsigned Opc;
+ switch (DstTy.getSimpleVT().SimpleTy) {
+ default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
+ case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
+ case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
+ case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
+ }
+
+ SDValue Chain = DAG.getEntryNode();
+ SDValue Value = Op.getOperand(0);
+ SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
+
+ if (UnsignedFixup) {
+ //
+ // Conversion to unsigned i64 is implemented with a select,
+ // depending on whether the source value fits in the range
+ // of a signed i64. Let Thresh be the FP equivalent of
+ // 0x8000000000000000ULL.
+ //
+ // Adjust i32 = (Value < Thresh) ? 0 : 0x80000000;
+ // FistSrc = (Value < Thresh) ? Value : (Value - Thresh);
+ // Fist-to-mem64 FistSrc
+ // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
+ // to XOR'ing the high 32 bits with Adjust.
+ //
+ // Being a power of 2, Thresh is exactly representable in all FP formats.
+ // For X87 we'd like to use the smallest FP type for this constant, but
+ // for DAG type consistency we have to match the FP operand type.
+
+ APFloat Thresh(APFloat::IEEEsingle, APInt(32, 0x5f000000));
+ LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
+ bool LosesInfo = false;
+ if (TheVT == MVT::f64)
+ // The rounding mode is irrelevant as the conversion should be exact.
+ Status = Thresh.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven,
+ &LosesInfo);
+ else if (TheVT == MVT::f80)
+ Status = Thresh.convert(APFloat::x87DoubleExtended,
+ APFloat::rmNearestTiesToEven, &LosesInfo);
+
+ assert(Status == APFloat::opOK && !LosesInfo &&
+ "FP conversion should have been exact");
+
+ SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
+
+ SDValue Cmp = DAG.getSetCC(DL,
+ getSetCCResultType(DAG.getDataLayout(),
+ *DAG.getContext(), TheVT),
+ Value, ThreshVal, ISD::SETLT);
+ Adjust = DAG.getSelect(DL, MVT::i32, Cmp,
+ DAG.getConstant(0, DL, MVT::i32),
+ DAG.getConstant(0x80000000, DL, MVT::i32));
+ SDValue Sub = DAG.getNode(ISD::FSUB, DL, TheVT, Value, ThreshVal);
+ Cmp = DAG.getSetCC(DL, getSetCCResultType(DAG.getDataLayout(),
+ *DAG.getContext(), TheVT),
+ Value, ThreshVal, ISD::SETLT);
+ Value = DAG.getSelect(DL, TheVT, Cmp, Value, Sub);
+ }
+
+ // FIXME: This causes a redundant load/store if the SSE-class value is
+ // already in memory, such as when it is on the call stack.
+ if (isScalarFPTypeInSSEReg(TheVT)) {
+ assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
+ Chain = DAG.getStore(Chain, DL, Value, StackSlot,
+ MachinePointerInfo::getFixedStack(MF, SSFI), false,
+ false, 0);
+ SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
+ SDValue Ops[] = {
+ Chain, StackSlot, DAG.getValueType(TheVT)
+ };
+
+ MachineMemOperand *MMO =
+ MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
+ MachineMemOperand::MOLoad, MemSize, MemSize);
+ Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO);
+ Chain = Value.getValue(1);
+ SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
+ StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
+ }
+
+ MachineMemOperand *MMO =
+ MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
+ MachineMemOperand::MOStore, MemSize, MemSize);
+
+ if (UnsignedFixup) {
+
+ // Insert the FIST, load its result as two i32's,
+ // and XOR the high i32 with Adjust.
+
+ SDValue FistOps[] = { Chain, Value, StackSlot };
+ SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
+ FistOps, DstTy, MMO);
+
+ SDValue Low32 = DAG.getLoad(MVT::i32, DL, FIST, StackSlot,
+ MachinePointerInfo(),
+ false, false, false, 0);
+ SDValue HighAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackSlot,
+ DAG.getConstant(4, DL, PtrVT));
+
+ SDValue High32 = DAG.getLoad(MVT::i32, DL, FIST, HighAddr,
+ MachinePointerInfo(),
+ false, false, false, 0);
+ High32 = DAG.getNode(ISD::XOR, DL, MVT::i32, High32, Adjust);
+
+ if (Subtarget->is64Bit()) {
+ // Join High32 and Low32 into a 64-bit result.
+ // (High32 << 32) | Low32
+ Low32 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Low32);
+ High32 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, High32);
+ High32 = DAG.getNode(ISD::SHL, DL, MVT::i64, High32,
+ DAG.getConstant(32, DL, MVT::i8));
+ SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i64, High32, Low32);
+ return std::make_pair(Result, SDValue());
+ }
+
+ SDValue ResultOps[] = { Low32, High32 };
+
+ SDValue pair = IsReplace
+ ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResultOps)
+ : DAG.getMergeValues(ResultOps, DL);
+ return std::make_pair(pair, SDValue());
+ } else {
+ // Build the FP_TO_INT*_IN_MEM
+ SDValue Ops[] = { Chain, Value, StackSlot };
+ SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
+ Ops, DstTy, MMO);
+ return std::make_pair(FIST, StackSlot);
+ }
+}
+
+static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ MVT VT = Op->getSimpleValueType(0);
+ SDValue In = Op->getOperand(0);
+ MVT InVT = In.getSimpleValueType();
+ SDLoc dl(Op);
+
+ if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
+ return DAG.getNode(ISD::ZERO_EXTEND, dl, VT, In);
+
+ // Optimize vectors in AVX mode:
+ //
+ // v8i16 -> v8i32
+ // Use vpunpcklwd for 4 lower elements v8i16 -> v4i32.
+ // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
+ // Concat upper and lower parts.
+ //
+ // v4i32 -> v4i64
+ // Use vpunpckldq for 4 lower elements v4i32 -> v2i64.
+ // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
+ // Concat upper and lower parts.
+ //
+
+ if (((VT != MVT::v16i16) || (InVT != MVT::v16i8)) &&
+ ((VT != MVT::v8i32) || (InVT != MVT::v8i16)) &&
+ ((VT != MVT::v4i64) || (InVT != MVT::v4i32)))
+ return SDValue();
+
+ if (Subtarget->hasInt256())
+ return DAG.getNode(X86ISD::VZEXT, dl, VT, In);
+
+ SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl);
+ SDValue Undef = DAG.getUNDEF(InVT);
+ bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND;
+ SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
+ SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
+
+ MVT HVT = MVT::getVectorVT(VT.getVectorElementType(),
+ VT.getVectorNumElements()/2);
+
+ OpLo = DAG.getBitcast(HVT, OpLo);
+ OpHi = DAG.getBitcast(HVT, OpHi);
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
+}
+
+static SDValue LowerZERO_EXTEND_AVX512(SDValue Op,
+ const X86Subtarget *Subtarget, SelectionDAG &DAG) {
+ MVT VT = Op->getSimpleValueType(0);
+ SDValue In = Op->getOperand(0);
+ MVT InVT = In.getSimpleValueType();
+ SDLoc DL(Op);
+ unsigned int NumElts = VT.getVectorNumElements();
+ if (NumElts != 8 && NumElts != 16 && !Subtarget->hasBWI())
+ return SDValue();
+
+ if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1)
+ return DAG.getNode(X86ISD::VZEXT, DL, VT, In);
+
+ assert(InVT.getVectorElementType() == MVT::i1);
+ MVT ExtVT = NumElts == 8 ? MVT::v8i64 : MVT::v16i32;
+ SDValue One =
+ DAG.getConstant(APInt(ExtVT.getScalarSizeInBits(), 1), DL, ExtVT);
+ SDValue Zero =
+ DAG.getConstant(APInt::getNullValue(ExtVT.getScalarSizeInBits()), DL, ExtVT);
+
+ SDValue V = DAG.getNode(ISD::VSELECT, DL, ExtVT, In, One, Zero);
+ if (VT.is512BitVector())
+ return V;
+ return DAG.getNode(X86ISD::VTRUNC, DL, VT, V);
+}
+
+static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ if (Subtarget->hasFp256())
+ if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
+ return Res;
+
+ return SDValue();
+}
+
+static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ SDLoc DL(Op);
+ MVT VT = Op.getSimpleValueType();
+ SDValue In = Op.getOperand(0);
+ MVT SVT = In.getSimpleValueType();
+
+ if (VT.is512BitVector() || SVT.getVectorElementType() == MVT::i1)
+ return LowerZERO_EXTEND_AVX512(Op, Subtarget, DAG);
+
+ if (Subtarget->hasFp256())
+ if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
+ return Res;
+
+ assert(!VT.is256BitVector() || !SVT.is128BitVector() ||
+ VT.getVectorNumElements() != SVT.getVectorNumElements());
+ return SDValue();
+}
+
+static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+
+ SDLoc DL(Op);
+ MVT VT = Op.getSimpleValueType();
+ SDValue In = Op.getOperand(0);
+ MVT InVT = In.getSimpleValueType();
+
+ assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
+
+ // Shift the LSB to the MSB and use VPMOVB2M (SKX).
+ unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
+ if ((InVT.is512BitVector() && InVT.getScalarSizeInBits() <= 16 &&
+ Subtarget->hasBWI()) || // legal, will go to VPMOVB2M, VPMOVW2M
+ ((InVT.is256BitVector() || InVT.is128BitVector()) &&
+ InVT.getScalarSizeInBits() <= 16 && Subtarget->hasBWI() &&
+ Subtarget->hasVLX())) { // legal, will go to VPMOVB2M, VPMOVW2M
+ // Shifting packed bytes is not supported natively; bitcast to packed words.
+ MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
+ SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, ExtVT,
+ DAG.getBitcast(ExtVT, In),
+ DAG.getConstant(ShiftInx, DL, ExtVT));
+ ShiftNode = DAG.getBitcast(InVT, ShiftNode);
+ return DAG.getNode(X86ISD::CVT2MASK, DL, VT, ShiftNode);
+ }
+ if ((InVT.is512BitVector() && InVT.getScalarSizeInBits() >= 32 &&
+ Subtarget->hasDQI()) || // legal, will go to VPMOVD2M, VPMOVQ2M
+ ((InVT.is256BitVector() || InVT.is128BitVector()) &&
+ InVT.getScalarSizeInBits() >= 32 && Subtarget->hasDQI() &&
+ Subtarget->hasVLX())) { // legal, will go to VPMOVD2M, VPMOVQ2M
+
+ SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, InVT, In,
+ DAG.getConstant(ShiftInx, DL, InVT));
+ return DAG.getNode(X86ISD::CVT2MASK, DL, VT, ShiftNode);
+ }
+
+ // Shift the LSB to the MSB, extend if necessary, and use TESTM.
+ unsigned NumElts = InVT.getVectorNumElements();
+ if (InVT.getSizeInBits() < 512 &&
+ (InVT.getScalarType() == MVT::i8 || InVT.getScalarType() == MVT::i16 ||
+ !Subtarget->hasVLX())) {
+ assert((NumElts == 8 || NumElts == 16) && "Unexpected vector type.");
+
+ // TESTD/Q should be used (when BW is supported, CVT2MASK is used above),
+ // so the vector should be extended to packed dword/qword.
+ MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
+ In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
+ InVT = ExtVT;
+ ShiftInx = InVT.getScalarSizeInBits() - 1;
+ }
+
+ SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, InVT, In,
+ DAG.getConstant(ShiftInx, DL, InVT));
+ return DAG.getNode(X86ISD::TESTM, DL, VT, ShiftNode, ShiftNode);
+}
+
+SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ MVT VT = Op.getSimpleValueType();
+ SDValue In = Op.getOperand(0);
+ MVT InVT = In.getSimpleValueType();
+
+ if (VT == MVT::i1) {
+ assert((InVT.isInteger() && (InVT.getSizeInBits() <= 64)) &&
+ "Invalid scalar TRUNCATE operation");
+ if (InVT.getSizeInBits() >= 32)
+ return SDValue();
+ In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In);
+ return DAG.getNode(ISD::TRUNCATE, DL, VT, In);
+ }
+ assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
+ "Invalid TRUNCATE operation");
+
+ if (VT.getVectorElementType() == MVT::i1)
+ return LowerTruncateVecI1(Op, DAG, Subtarget);
+
+ // vpmovqb/w/d, vpmovdb/w, vpmovwb
+ if (Subtarget->hasAVX512()) {
+ // Word to byte truncation is only legal under BWI; otherwise go via dword.
+ if (InVT == MVT::v16i16 && !Subtarget->hasBWI()) // v16i16 -> v16i8
+ return DAG.getNode(X86ISD::VTRUNC, DL, VT,
+ DAG.getNode(X86ISD::VSEXT, DL, MVT::v16i32, In));
+ return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);
+ }
+ if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
+ // On AVX2, v4i64 -> v4i32 becomes VPERMD.
+ if (Subtarget->hasInt256()) {
+ static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
+ In = DAG.getBitcast(MVT::v8i32, In);
+ In = DAG.getVectorShuffle(MVT::v8i32, DL, In, DAG.getUNDEF(MVT::v8i32),
+ ShufMask);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
+ DAG.getIntPtrConstant(0, DL));
+ }
+
+ SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
+ DAG.getIntPtrConstant(0, DL));
+ SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
+ DAG.getIntPtrConstant(2, DL));
+ OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
+ OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
+ static const int ShufMask[] = {0, 2, 4, 6};
+ return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
+ }
+
+ if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
+ // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
+ if (Subtarget->hasInt256()) {
+ In = DAG.getBitcast(MVT::v32i8, In);
+
+ SmallVector<SDValue,32> pshufbMask;
+ for (unsigned i = 0; i < 2; ++i) {
+ pshufbMask.push_back(DAG.getConstant(0x0, DL, MVT::i8));
+ pshufbMask.push_back(DAG.getConstant(0x1, DL, MVT::i8));
+ pshufbMask.push_back(DAG.getConstant(0x4, DL, MVT::i8));
+ pshufbMask.push_back(DAG.getConstant(0x5, DL, MVT::i8));
+ pshufbMask.push_back(DAG.getConstant(0x8, DL, MVT::i8));
+ pshufbMask.push_back(DAG.getConstant(0x9, DL, MVT::i8));
+ pshufbMask.push_back(DAG.getConstant(0xc, DL, MVT::i8));
+ pshufbMask.push_back(DAG.getConstant(0xd, DL, MVT::i8));
+ for (unsigned j = 0; j < 8; ++j)
+ pshufbMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
+ }
+ SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, pshufbMask);
+ In = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, In, BV);
+ In = DAG.getBitcast(MVT::v4i64, In);
+
+ static const int ShufMask[] = {0, 2, -1, -1};
+ In = DAG.getVectorShuffle(MVT::v4i64, DL, In, DAG.getUNDEF(MVT::v4i64),
+ &ShufMask[0]);
+ In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
+ DAG.getIntPtrConstant(0, DL));
+ return DAG.getBitcast(VT, In);
+ }
+
+ SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
+ DAG.getIntPtrConstant(0, DL));
+
+ SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
+ DAG.getIntPtrConstant(4, DL));
+
+ OpLo = DAG.getBitcast(MVT::v16i8, OpLo);
+ OpHi = DAG.getBitcast(MVT::v16i8, OpHi);
+
+ // The PSHUFB mask:
+ static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13,
+ -1, -1, -1, -1, -1, -1, -1, -1};
+
+ SDValue Undef = DAG.getUNDEF(MVT::v16i8);
+ OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, Undef, ShufMask1);
+ OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, Undef, ShufMask1);
+
+ OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
+ OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
+
+ // The MOVLHPS Mask:
+ static const int ShufMask2[] = {0, 1, 4, 5};
+ SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
+ return DAG.getBitcast(MVT::v8i16, res);
+ }
+
+ // Handle truncation of V256 to V128 using shuffles.
+ if (!VT.is128BitVector() || !InVT.is256BitVector())
+ return SDValue();
+
+ assert(Subtarget->hasFp256() && "256-bit vector without AVX!");
+
+ unsigned NumElems = VT.getVectorNumElements();
+ MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2);
+
+ SmallVector<int, 16> MaskVec(NumElems * 2, -1);
+ // Prepare truncation shuffle mask
+ for (unsigned i = 0; i != NumElems; ++i)
+ MaskVec[i] = i * 2;
+ SDValue V = DAG.getVectorShuffle(NVT, DL, DAG.getBitcast(NVT, In),
+ DAG.getUNDEF(NVT), &MaskVec[0]);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
+ DAG.getIntPtrConstant(0, DL));
+}
+
+SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op,
+ SelectionDAG &DAG) const {
+ assert(!Op.getSimpleValueType().isVector());
+
+ std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
+ /*IsSigned=*/ true, /*IsReplace=*/ false);
+ SDValue FIST = Vals.first, StackSlot = Vals.second;
+ // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
+ if (!FIST.getNode())
+ return Op;
+
+ if (StackSlot.getNode())
+ // Load the result.
+ return DAG.getLoad(Op.getValueType(), SDLoc(Op),
+ FIST, StackSlot, MachinePointerInfo(),
+ false, false, false, 0);
+
+ // The node is the result.
+ return FIST;
+}
+
+SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op,
+ SelectionDAG &DAG) const {
+ std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
+ /*IsSigned=*/ false, /*IsReplace=*/ false);
+ SDValue FIST = Vals.first, StackSlot = Vals.second;
+ // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
+ if (!FIST.getNode())
+ return Op;
+
+ if (StackSlot.getNode())
+ // Load the result.
+ return DAG.getLoad(Op.getValueType(), SDLoc(Op),
+ FIST, StackSlot, MachinePointerInfo(),
+ false, false, false, 0);
+
+ // The node is the result.
+ return FIST;
+}
+
+static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
+ SDLoc DL(Op);
+ MVT VT = Op.getSimpleValueType();
+ SDValue In = Op.getOperand(0);
+ MVT SVT = In.getSimpleValueType();
+
+ assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
+
+ return DAG.getNode(X86ISD::VFPEXT, DL, VT,
+ DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
+ In, DAG.getUNDEF(SVT)));
+}
+
+/// The only differences between FABS and FNEG are the mask and the logic op.
+/// FNEG also has a folding opportunity for FNEG(FABS(x)).
+static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
+ assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
+ "Wrong opcode for lowering FABS or FNEG.");
+
+ bool IsFABS = (Op.getOpcode() == ISD::FABS);
+
+ // If this is a FABS and it has an FNEG user, bail out to fold the combination
+ // into an FNABS. We'll lower the FABS after that if it is still in use.
+ if (IsFABS)
+ for (SDNode *User : Op->uses())
+ if (User->getOpcode() == ISD::FNEG)
+ return Op;
+
+ SDLoc dl(Op);
+ MVT VT = Op.getSimpleValueType();
+
+ bool IsF128 = (VT == MVT::f128);
+
+ // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
+ // decide if we should generate a 16-byte constant mask when we only need 4 or
+ // 8 bytes for the scalar case.
+
+ MVT LogicVT;
+ MVT EltVT;
+ unsigned NumElts;
+
+ if (VT.isVector()) {
+ LogicVT = VT;
+ EltVT = VT.getVectorElementType();
+ NumElts = VT.getVectorNumElements();
+ } else if (IsF128) {
+ // SSE instructions are used for optimized f128 logical operations.
+ LogicVT = MVT::f128;
+ EltVT = VT;
+ NumElts = 1;
+ } else {
+ // There are no scalar bitwise logical SSE/AVX instructions, so we
+ // generate a 16-byte vector constant and logic op even for the scalar case.
+ // Using a 16-byte mask allows folding the load of the mask with
+ // the logic op, saving ~4 bytes of code size.
+ LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
+ EltVT = VT;
+ NumElts = (VT == MVT::f64) ? 2 : 4;
+ }
+
+ unsigned EltBits = EltVT.getSizeInBits();
+ LLVMContext *Context = DAG.getContext();
+ // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
+ APInt MaskElt =
+ IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignBit(EltBits);
+ Constant *C = ConstantInt::get(*Context, MaskElt);
+ C = ConstantVector::getSplat(NumElts, C);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
+ unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
+ SDValue Mask =
+ DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
+ false, false, false, Alignment);
+
+ SDValue Op0 = Op.getOperand(0);
+ bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
+ unsigned LogicOp =
+ IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
+ SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
+
+ if (VT.isVector() || IsF128)
+ return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
+
+ // For the scalar case extend to a 128-bit vector, perform the logic op,
+ // and extract the scalar result back out.
+ Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
+ SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
+ DAG.getIntPtrConstant(0, dl));
+}
+
+static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ LLVMContext *Context = DAG.getContext();
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+ SDLoc dl(Op);
+ MVT VT = Op.getSimpleValueType();
+ MVT SrcVT = Op1.getSimpleValueType();
+ bool IsF128 = (VT == MVT::f128);
+
+ // If second operand is smaller, extend it first.
+ if (SrcVT.bitsLT(VT)) {
+ Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1);
+ SrcVT = VT;
+ }
+ // And if it is bigger, shrink it first.
+ if (SrcVT.bitsGT(VT)) {
+ Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1, dl));
+ SrcVT = VT;
+ }
+
+ // At this point the operands and the result should have the same
+ // type, and that won't be f80 since that is not custom lowered.
+ assert((VT == MVT::f64 || VT == MVT::f32 || IsF128) &&
+ "Unexpected type in LowerFCOPYSIGN");
+
+ const fltSemantics &Sem =
+ VT == MVT::f64 ? APFloat::IEEEdouble :
+ (IsF128 ? APFloat::IEEEquad : APFloat::IEEEsingle);
+ const unsigned SizeInBits = VT.getSizeInBits();
+
+ SmallVector<Constant *, 4> CV(
+ VT == MVT::f64 ? 2 : (IsF128 ? 1 : 4),
+ ConstantFP::get(*Context, APFloat(Sem, APInt(SizeInBits, 0))));
+
+ // First, clear all bits but the sign bit from the second operand (sign).
+ CV[0] = ConstantFP::get(*Context,
+ APFloat(Sem, APInt::getHighBitsSet(SizeInBits, 1)));
+ Constant *C = ConstantVector::get(CV);
+ auto PtrVT = TLI.getPointerTy(DAG.getDataLayout());
+ SDValue CPIdx = DAG.getConstantPool(C, PtrVT, 16);
+
+ // Perform all logic operations as 16-byte vectors because there are no
+ // scalar FP logic instructions in SSE. This allows load folding of the
+ // constants into the logic instructions.
+ MVT LogicVT = (VT == MVT::f64) ? MVT::v2f64 : (IsF128 ? MVT::f128 : MVT::v4f32);
+ SDValue Mask1 =
+ DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
+ false, false, false, 16);
+ if (!IsF128)
+ Op1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op1);
+ SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Op1, Mask1);
+
+ // Next, clear the sign bit from the first operand (magnitude).
+ // If it's a constant, we can clear it here.
+ if (ConstantFPSDNode *Op0CN = dyn_cast<ConstantFPSDNode>(Op0)) {
+ APFloat APF = Op0CN->getValueAPF();
+ // If the magnitude is a positive zero, the sign bit alone is enough.
+ if (APF.isPosZero())
+ return IsF128 ? SignBit :
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, SignBit,
+ DAG.getIntPtrConstant(0, dl));
+ APF.clearSign();
+ CV[0] = ConstantFP::get(*Context, APF);
+ } else {
+ CV[0] = ConstantFP::get(
+ *Context,
+ APFloat(Sem, APInt::getLowBitsSet(SizeInBits, SizeInBits - 1)));
+ }
+ C = ConstantVector::get(CV);
+ CPIdx = DAG.getConstantPool(C, PtrVT, 16);
+ SDValue Val =
+ DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
+ false, false, false, 16);
+ // If the magnitude operand wasn't a constant, we need to AND out the sign.
+ if (!isa<ConstantFPSDNode>(Op0)) {
+ if (!IsF128)
+ Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op0);
+ Val = DAG.getNode(X86ISD::FAND, dl, LogicVT, Op0, Val);
+ }
+ // OR the magnitude value with the sign bit.
+ Val = DAG.getNode(X86ISD::FOR, dl, LogicVT, Val, SignBit);
+ return IsF128 ? Val :
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, Val,
+ DAG.getIntPtrConstant(0, dl));
+}
+
+static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
+ SDValue N0 = Op.getOperand(0);
+ SDLoc dl(Op);
+ MVT VT = Op.getSimpleValueType();
+
+ // Lower ISD::FGETSIGN to (AND (X86ISD::FGETSIGNx86 ...) 1).
+ SDValue xFGETSIGN = DAG.getNode(X86ISD::FGETSIGNx86, dl, VT, N0,
+ DAG.getConstant(1, dl, VT));
+ return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, dl, VT));
+}
+
+// Check whether an OR'd tree is PTEST-able.
+static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
+
+ if (!Subtarget->hasSSE41())
+ return SDValue();
+
+ if (!Op->hasOneUse())
+ return SDValue();
+
+ SDNode *N = Op.getNode();
+ SDLoc DL(N);
+
+ SmallVector<SDValue, 8> Opnds;
+ DenseMap<SDValue, unsigned> VecInMap;
+ SmallVector<SDValue, 8> VecIns;
+ EVT VT = MVT::Other;
+
+ // Recognize a special case where a vector is cast into a wide integer to
+ // test all 0s.
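+ // e.g. an i128 == 0 test of a bitcast v2i64 value arrives here as
+ // (or (extractelt V, 0), (extractelt V, 1)).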
+ Opnds.push_back(N->getOperand(0));
+ Opnds.push_back(N->getOperand(1));
+
+ for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
+ SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
+ // Traverse all OR'd operands breadth-first.
+ if (I->getOpcode() == ISD::OR) {
+ Opnds.push_back(I->getOperand(0));
+ Opnds.push_back(I->getOperand(1));
+ // Re-evaluate the number of nodes to be traversed.
+ e += 2; // 2 more nodes (LHS and RHS) are pushed.
+ continue;
+ }
+
+ // Quit if the operand is not an EXTRACT_VECTOR_ELT.
+ if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+ return SDValue();
+
+ // Quit if the index is not a constant.
+ SDValue Idx = I->getOperand(1);
+ if (!isa<ConstantSDNode>(Idx))
+ return SDValue();
+
+ SDValue ExtractedFromVec = I->getOperand(0);
+ DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
+ if (M == VecInMap.end()) {
+ VT = ExtractedFromVec.getValueType();
+ // Quit if not 128/256-bit vector.
+ if (!VT.is128BitVector() && !VT.is256BitVector())
+ return SDValue();
+ // Quit if not the same type.
+ if (VecInMap.begin() != VecInMap.end() &&
+ VT != VecInMap.begin()->first.getValueType())
+ return SDValue();
+ M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
+ VecIns.push_back(ExtractedFromVec);
+ }
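+ // Record which lane of the source vector this extract reads.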
+ M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
+ }
+
+ assert((VT.is128BitVector() || VT.is256BitVector()) &&
+ "Not extracted from 128-/256-bit vector.");
+
+ unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;
+
+ for (DenseMap<SDValue, unsigned>::const_iterator
+ I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
+ // Quit if not all elements are used.
+ if (I->second != FullMask)
+ return SDValue();
+ }
+
+ MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
+
+ // Cast all vectors into TestVT for PTEST.
+ for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
+ VecIns[i] = DAG.getBitcast(TestVT, VecIns[i]);
+
+ // If more than one full vector is evaluated, OR them together before the PTEST.
+ for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
+ // Each iteration will OR 2 nodes and append the result until there is only
+ // 1 node left, i.e. the final OR'd value of all vectors.
+ SDValue LHS = VecIns[Slot];
+ SDValue RHS = VecIns[Slot + 1];
+ VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
+ }
+
+ return DAG.getNode(X86ISD::PTEST, DL, MVT::i32,
+ VecIns.back(), VecIns.back());
+}
+
+/// \brief Return true if \c Op has a use that doesn't just read flags.
+static bool hasNonFlagsUse(SDValue Op) {
+ for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
+ ++UI) {
+ SDNode *User = *UI;
+ unsigned UOpNo = UI.getOperandNo();
+ if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
+ // Look past the truncate.
+ UOpNo = User->use_begin().getOperandNo();
+ User = *User->use_begin();
+ }
+
+ if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
+ !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
+ return true;
+ }
+ return false;
+}
+
+/// Emit nodes that will be selected as "test Op0,Op0", or something
+/// equivalent.
+SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl,
+ SelectionDAG &DAG) const {
+ if (Op.getValueType() == MVT::i1) {
+ SDValue ExtOp = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, Op);
+ return DAG.getNode(X86ISD::CMP, dl, MVT::i32, ExtOp,
+ DAG.getConstant(0, dl, MVT::i8));
+ }
+ // CF and OF aren't always set the way we want. Determine which
+ // of these we need.
+ bool NeedCF = false;
+ bool NeedOF = false;
+ switch (X86CC) {
+ default: break;
+ case X86::COND_A: case X86::COND_AE:
+ case X86::COND_B: case X86::COND_BE:
+ NeedCF = true;
+ break;
+ case X86::COND_G: case X86::COND_GE:
+ case X86::COND_L: case X86::COND_LE:
+ case X86::COND_O: case X86::COND_NO: {
+ // Check if we really need to set the Overflow flag. If NoSignedWrap is
+ // present, it is not needed.
+ switch (Op->getOpcode()) {
+ case ISD::ADD:
+ case ISD::SUB:
+ case ISD::MUL:
+ case ISD::SHL: {
+ const auto *BinNode = cast<BinaryWithFlagsSDNode>(Op.getNode());
+ if (BinNode->Flags.hasNoSignedWrap())
+ break;
+ }
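+ // Fall through: without the no-signed-wrap flag we must honor OF.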
+ default:
+ NeedOF = true;
+ break;
+ }
+ break;
+ }
+ }
+ // See if we can use the EFLAGS value from the operand instead of
+ // doing a separate TEST. TEST always sets OF and CF to 0, so unless
+ // we prove that the arithmetic won't overflow, we can't use OF or CF.
+ if (Op.getResNo() != 0 || NeedOF || NeedCF) {
+ // Emit a CMP with 0, which is the TEST pattern.
+ return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
+ DAG.getConstant(0, dl, Op.getValueType()));
+ }
+ unsigned Opcode = 0;
+ unsigned NumOperands = 0;
+
+ // Truncate operations may prevent the merge of the SETCC instruction
+ // and the arithmetic instruction before it. Attempt to truncate the operands
+ // of the arithmetic instruction and use a reduced bit-width instruction.
+ bool NeedTruncation = false;
+ SDValue ArithOp = Op;
+ if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
+ SDValue Arith = Op->getOperand(0);
+ // Both the trunc and the arithmetic op need to have one user each.
+ if (Arith->hasOneUse())
+ switch (Arith.getOpcode()) {
+ default: break;
+ case ISD::ADD:
+ case ISD::SUB:
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR: {
+ NeedTruncation = true;
+ ArithOp = Arith;
+ }
+ }
+ }
+
+ // NOTICE: In the code below we use ArithOp to hold the arithmetic operation,
+ // which may sit behind a truncate. When we check for possible users we use
+ // the variable 'Op', the non-truncated value.
+ switch (ArithOp.getOpcode()) {
+ case ISD::ADD:
+ // Due to an isel shortcoming, be conservative if this add is likely to be
+ // selected as part of a load-modify-store instruction. When the root node
+ // in a match is a store, isel doesn't know how to remap non-chain non-flag
+ // uses of other nodes in the match, such as the ADD in this case. This
+ // leads to the ADD being left around and reselected, with the result being
+ // two adds in the output. Alas, even if none of our users are stores, that
+ // doesn't prove we're O.K. Ergo, if we have any parents that aren't
+ // CopyToReg or SETCC, eschew INC/DEC. A better fix seems to require
+ // climbing the DAG back to the root, and it doesn't seem to be worth the
+ // effort.
+ for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
+ UE = Op.getNode()->use_end(); UI != UE; ++UI)
+ if (UI->getOpcode() != ISD::CopyToReg &&
+ UI->getOpcode() != ISD::SETCC &&
+ UI->getOpcode() != ISD::STORE)
+ goto default_case;
+
+ if (ConstantSDNode *C =
+ dyn_cast<ConstantSDNode>(ArithOp.getNode()->getOperand(1))) {
+ // An add of one will be selected as an INC.
+ if (C->isOne() && !Subtarget->slowIncDec()) {
+ Opcode = X86ISD::INC;
+ NumOperands = 1;
+ break;
+ }
+
+ // An add of negative one (subtract of one) will be selected as a DEC.
+ if (C->isAllOnesValue() && !Subtarget->slowIncDec()) {
+ Opcode = X86ISD::DEC;
+ NumOperands = 1;
+ break;
+ }
+ }
+
+ // Otherwise use a regular EFLAGS-setting add.
+ Opcode = X86ISD::ADD;
+ NumOperands = 2;
+ break;
+ case ISD::SHL:
+ case ISD::SRL:
+ // If we have a constant logical shift that's only used in a comparison
+ // against zero turn it into an equivalent AND. This allows turning it into
+ // a TEST instruction later.
+ if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) && Op->hasOneUse() &&
+ isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) {
+ EVT VT = Op.getValueType();
+ unsigned BitWidth = VT.getSizeInBits();
+ unsigned ShAmt = Op->getConstantOperandVal(1);
+ if (ShAmt >= BitWidth) // Avoid undefined shifts.
+ break;
+ APInt Mask = ArithOp.getOpcode() == ISD::SRL
+ ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)
+ : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
+ if (!Mask.isSignedIntN(32)) // Avoid large immediates.
+ break;
+ SDValue New = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0),
+ DAG.getConstant(Mask, dl, VT));
+ DAG.ReplaceAllUsesWith(Op, New);
+ Op = New;
+ }
+ break;
+
+ case ISD::AND:
+ // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
+ // because a TEST instruction will be better.
+ if (!hasNonFlagsUse(Op))
+ break;
+ // FALL THROUGH
+ case ISD::SUB:
+ case ISD::OR:
+ case ISD::XOR:
+ // Due to the ISEL shortcoming noted above, be conservative if this op is
+ // likely to be selected as part of a load-modify-store instruction.
+ for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
+ UE = Op.getNode()->use_end(); UI != UE; ++UI)
+ if (UI->getOpcode() == ISD::STORE)
+ goto default_case;
+
+ // Otherwise use a regular EFLAGS-setting instruction.
+ switch (ArithOp.getOpcode()) {
+ default: llvm_unreachable("unexpected operator!");
+ case ISD::SUB: Opcode = X86ISD::SUB; break;
+ case ISD::XOR: Opcode = X86ISD::XOR; break;
+ case ISD::AND: Opcode = X86ISD::AND; break;
+ case ISD::OR: {
+ if (!NeedTruncation && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
+ SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG);
+ if (EFLAGS.getNode())
+ return EFLAGS;
+ }
+ Opcode = X86ISD::OR;
+ break;
+ }
+ }
+
+ NumOperands = 2;
+ break;
+ case X86ISD::ADD:
+ case X86ISD::SUB:
+ case X86ISD::INC:
+ case X86ISD::DEC:
+ case X86ISD::OR:
+ case X86ISD::XOR:
+ case X86ISD::AND:
+ return SDValue(Op.getNode(), 1);
+ default:
+ default_case:
+ break;
+ }
+
+ // If we found that truncation is beneficial, perform the truncation and
+ // update 'Op'.
+ if (NeedTruncation) {
+ EVT VT = Op.getValueType();
+ SDValue WideVal = Op->getOperand(0);
+ EVT WideVT = WideVal.getValueType();
+ unsigned ConvertedOp = 0;
+ // Use a target machine opcode to prevent further DAGCombine
+ // optimizations that may separate the arithmetic operations
+ // from the setcc node.
+ switch (WideVal.getOpcode()) {
+ default: break;
+ case ISD::ADD: ConvertedOp = X86ISD::ADD; break;
+ case ISD::SUB: ConvertedOp = X86ISD::SUB; break;
+ case ISD::AND: ConvertedOp = X86ISD::AND; break;
+ case ISD::OR: ConvertedOp = X86ISD::OR; break;
+ case ISD::XOR: ConvertedOp = X86ISD::XOR; break;
+ }
+
+ if (ConvertedOp) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
+ SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
+ SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
+ Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1);
+ }
+ }
+ }
+
+ if (Opcode == 0)
+ // Emit a CMP with 0, which is the TEST pattern.
+ return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
+ DAG.getConstant(0, dl, Op.getValueType()));
+
+ SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
+ SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
+
+ SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
+ DAG.ReplaceAllUsesWith(Op, New);
+ return SDValue(New.getNode(), 1);
+}
+
+/// Emit nodes that will be selected as "cmp Op0,Op1", or something
+/// equivalent.
+SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
+ SDLoc dl, SelectionDAG &DAG) const {
+ if (isNullConstant(Op1))
+ return EmitTest(Op0, X86CC, dl, DAG);
+
+ assert(!(isa<ConstantSDNode>(Op1) && Op0.getValueType() == MVT::i1) &&
+ "Unexpected comparison operation for MVT::i1 operands");
+
+ if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
+ Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
+ // Do the comparison at i32 if it's smaller, except on Atom.
+ // This avoids subregister aliasing issues. Keep the smaller reference
+ // if we're optimizing for size, however, as that'll allow better folding
+ // of memory operations.
+ if (Op0.getValueType() != MVT::i32 && Op0.getValueType() != MVT::i64 &&
+ !DAG.getMachineFunction().getFunction()->optForMinSize() &&
+ !Subtarget->isAtom()) {
+ unsigned ExtendOp =
+ isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
+ Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0);
+ Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1);
+ }
+ // Use SUB instead of CMP to enable CSE between SUB and CMP.
+ SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
+ SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs,
+ Op0, Op1);
+ return SDValue(Sub.getNode(), 1);
+ }
+ return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
+}
+
+/// Convert a comparison if required by the subtarget.
+SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
+ SelectionDAG &DAG) const {
+ // If the subtarget does not support the FUCOMI instruction, floating-point
+ // comparisons have to be converted.
+ if (Subtarget->hasCMov() ||
+ Cmp.getOpcode() != X86ISD::CMP ||
+ !Cmp.getOperand(0).getValueType().isFloatingPoint() ||
+ !Cmp.getOperand(1).getValueType().isFloatingPoint())
+ return Cmp;
+
+ // The instruction selector will select an FUCOM instruction instead of
+ // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
+ // build an SDNode sequence that transfers the result from FPSW into EFLAGS:
+ // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
+ SDLoc dl(Cmp);
+ SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
+ SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
+ SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
+ DAG.getConstant(8, dl, MVT::i8));
+ SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
+
+ // Some 64-bit targets lack SAHF support, but they do support FCOMI.
+ assert(Subtarget->hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?");
+ return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
+}
+
+/// The minimum architected relative accuracy is 2^-12. We need one
+/// Newton-Raphson step to have a good float result (24 bits of precision).
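+/// Each refinement step computes x_{n+1} = x_n * (1.5 - 0.5 * a * x_n * x_n),
+/// the two-constant Newton-Raphson iteration for 1/sqrt(a).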
+SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op,
+ DAGCombinerInfo &DCI,
+ unsigned &RefinementSteps,
+ bool &UseOneConstNR) const {
+ EVT VT = Op.getValueType();
+ const char *RecipOp;
+
+ // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
+ // TODO: Add support for AVX512 (v16f32).
+ // It is likely not profitable to do this for f64 because a double-precision
+ // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
+ // instructions: convert to single, rsqrtss, convert back to double, refine
+ // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
+ // along with FMA, this could be a throughput win.
+ if (VT == MVT::f32 && Subtarget->hasSSE1())
+ RecipOp = "sqrtf";
+ else if ((VT == MVT::v4f32 && Subtarget->hasSSE1()) ||
+ (VT == MVT::v8f32 && Subtarget->hasAVX()))
+ RecipOp = "vec-sqrtf";
+ else
+ return SDValue();
+
+ TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals;
+ if (!Recips.isEnabled(RecipOp))
+ return SDValue();
+
+ RefinementSteps = Recips.getRefinementSteps(RecipOp);
+ UseOneConstNR = false;
+ return DCI.DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
+}
+
+/// The minimum architected relative accuracy is 2^-12. We need one
+/// Newton-Raphson step to have a good float result (24 bits of precision).
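+/// Each refinement step computes x_{n+1} = x_n * (2.0 - a * x_n), the
+/// Newton-Raphson iteration for 1/a.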
+SDValue X86TargetLowering::getRecipEstimate(SDValue Op,
+ DAGCombinerInfo &DCI,
+ unsigned &RefinementSteps) const {
+ EVT VT = Op.getValueType();
+ const char *RecipOp;
+
+ // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
+ // TODO: Add support for AVX512 (v16f32).
+ // It is likely not profitable to do this for f64 because a double-precision
+ // reciprocal estimate with refinement on x86 prior to FMA requires
+ // 15 instructions: convert to single, rcpss, convert back to double, refine
+ // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
+ // along with FMA, this could be a throughput win.
+ if (VT == MVT::f32 && Subtarget->hasSSE1())
+ RecipOp = "divf";
+ else if ((VT == MVT::v4f32 && Subtarget->hasSSE1()) ||
+ (VT == MVT::v8f32 && Subtarget->hasAVX()))
+ RecipOp = "vec-divf";
+ else
+ return SDValue();
+
+ TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals;
+ if (!Recips.isEnabled(RecipOp))
+ return SDValue();
+
+ RefinementSteps = Recips.getRefinementSteps(RecipOp);
+ return DCI.DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
+}
+
+/// If we have at least two divisions that use the same divisor, convert to
+/// multiplication by a reciprocal. This may need to be adjusted for a given
+/// CPU if a division's cost is not at least twice the cost of a multiplication.
+/// This is because we still need one division to calculate the reciprocal and
+/// then we need two multiplies by that reciprocal as replacements for the
+/// original divisions.
+unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
+ return 2;
+}
+
+/// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT
+/// node if possible.
+SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
+ SDLoc dl, SelectionDAG &DAG) const {
+ SDValue Op0 = And.getOperand(0);
+ SDValue Op1 = And.getOperand(1);
+ if (Op0.getOpcode() == ISD::TRUNCATE)
+ Op0 = Op0.getOperand(0);
+ if (Op1.getOpcode() == ISD::TRUNCATE)
+ Op1 = Op1.getOperand(0);
+
+ SDValue LHS, RHS;
+ if (Op1.getOpcode() == ISD::SHL)
+ std::swap(Op0, Op1);
+ if (Op0.getOpcode() == ISD::SHL) {
+ if (isOneConstant(Op0.getOperand(0))) {
+ // If we looked past a truncate, check that it's only truncating away
+ // known zeros.
+ unsigned BitWidth = Op0.getValueSizeInBits();
+ unsigned AndBitWidth = And.getValueSizeInBits();
+ if (BitWidth > AndBitWidth) {
+ APInt Zeros, Ones;
+ DAG.computeKnownBits(Op0, Zeros, Ones);
+ if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth)
+ return SDValue();
+ }
+ LHS = Op1;
+ RHS = Op0.getOperand(1);
+ }
+ } else if (Op1.getOpcode() == ISD::Constant) {
+ ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
+ uint64_t AndRHSVal = AndRHS->getZExtValue();
+ SDValue AndLHS = Op0;
+
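+ // (x >> n) & 1 --> BT(x, n)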
+ if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
+ LHS = AndLHS.getOperand(0);
+ RHS = AndLHS.getOperand(1);
+ }
+
+ // Use BT if the immediate can't be encoded in a TEST instruction.
+ if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) {
+ LHS = AndLHS;
+ RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl, LHS.getValueType());
+ }
+ }
+
+ if (LHS.getNode()) {
+ // If LHS is i8, promote it to i32 with any_extend. There is no i8 BT
+ // instruction. Since the shift amount is in-range-or-undefined, we know
+ // that doing a bittest on the i32 value is ok. We extend to i32 because
+ // the encoding for the i16 version is larger than the i32 version.
+ // Also promote i16 to i32 for performance / code size reasons.
+ if (LHS.getValueType() == MVT::i8 ||
+ LHS.getValueType() == MVT::i16)
+ LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);
+
+ // If the operand types disagree, extend the shift amount to match. Since
+ // BT ignores high bits (like shifts) we can use anyextend.
+ if (LHS.getValueType() != RHS.getValueType())
+ RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS);
+
+ SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS);
+ X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
+ return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
+ DAG.getConstant(Cond, dl, MVT::i8), BT);
+ }
+
+ return SDValue();
+}
+
+/// \brief Turns an ISD::CondCode into a value suitable for SSE floating-point
+/// mask CMPs.
+static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
+ SDValue &Op1) {
+ unsigned SSECC;
+ bool Swap = false;
+
+ // SSE Condition code mapping:
+ // 0 - EQ
+ // 1 - LT
+ // 2 - LE
+ // 3 - UNORD
+ // 4 - NEQ
+ // 5 - NLT
+ // 6 - NLE
+ // 7 - ORD
+ switch (SetCCOpcode) {
+ default: llvm_unreachable("Unexpected SETCC condition");
+ case ISD::SETOEQ:
+ case ISD::SETEQ: SSECC = 0; break;
+ case ISD::SETOGT:
+ case ISD::SETGT: Swap = true; // Fallthrough
+ case ISD::SETLT:
+ case ISD::SETOLT: SSECC = 1; break;
+ case ISD::SETOGE:
+ case ISD::SETGE: Swap = true; // Fallthrough
+ case ISD::SETLE:
+ case ISD::SETOLE: SSECC = 2; break;
+ case ISD::SETUO: SSECC = 3; break;
+ case ISD::SETUNE:
+ case ISD::SETNE: SSECC = 4; break;
+ case ISD::SETULE: Swap = true; // Fallthrough
+ case ISD::SETUGE: SSECC = 5; break;
+ case ISD::SETULT: Swap = true; // Fallthrough
+ case ISD::SETUGT: SSECC = 6; break;
+ case ISD::SETO: SSECC = 7; break;
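+ // SETUEQ and SETONE have no single SSE predicate; 8 is a sentinel that
+ // tells the caller to emit two comparisons and combine them.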
+ case ISD::SETUEQ:
+ case ISD::SETONE: SSECC = 8; break;
+ }
+ if (Swap)
+ std::swap(Op0, Op1);
+
+ return SSECC;
+}
+
+// Lower256IntVSETCC - Break a 256-bit integer VSETCC into two new 128-bit
+// ones and then concatenate the result back.
+static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
+ MVT VT = Op.getSimpleValueType();
+
+ assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
+ "Unsupported value type for operation");
+
+ unsigned NumElems = VT.getVectorNumElements();
+ SDLoc dl(Op);
+ SDValue CC = Op.getOperand(2);
+
+ // Extract the LHS vectors
+ SDValue LHS = Op.getOperand(0);
+ SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
+ SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
+
+ // Extract the RHS vectors
+ SDValue RHS = Op.getOperand(1);
+ SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl);
+ SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl);
+
+ // Issue the operation on the smaller types and concatenate the result back
+ MVT EltVT = VT.getVectorElementType();
+ MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
+ DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
+ DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
+}
+
+static SDValue LowerBoolVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+ SDValue CC = Op.getOperand(2);
+ MVT VT = Op.getSimpleValueType();
+ SDLoc dl(Op);
+
+ assert(Op0.getSimpleValueType().getVectorElementType() == MVT::i1 &&
+ "Unexpected type for boolean compare operation");
+ ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
+ SDValue NotOp0 = DAG.getNode(ISD::XOR, dl, VT, Op0,
+ DAG.getConstant(-1, dl, VT));
+ SDValue NotOp1 = DAG.getNode(ISD::XOR, dl, VT, Op1,
+ DAG.getConstant(-1, dl, VT));
+ switch (SetCCOpcode) {
+ default: llvm_unreachable("Unexpected SETCC condition");
+ case ISD::SETEQ:
+ // (x == y) -> ~(x ^ y)
+ return DAG.getNode(ISD::XOR, dl, VT,
+ DAG.getNode(ISD::XOR, dl, VT, Op0, Op1),
+ DAG.getConstant(-1, dl, VT));
+ case ISD::SETNE:
+ // (x != y) -> (x ^ y)
+ return DAG.getNode(ISD::XOR, dl, VT, Op0, Op1);
+ case ISD::SETUGT:
+ case ISD::SETGT:
+ // (x > y) -> (x & ~y)
+ return DAG.getNode(ISD::AND, dl, VT, Op0, NotOp1);
+ case ISD::SETULT:
+ case ISD::SETLT:
+ // (x < y) -> (~x & y)
+ return DAG.getNode(ISD::AND, dl, VT, NotOp0, Op1);
+ case ISD::SETULE:
+ case ISD::SETLE:
+ // (x <= y) -> (~x | y)
+ return DAG.getNode(ISD::OR, dl, VT, NotOp0, Op1);
+ case ISD::SETUGE:
+ case ISD::SETGE:
+ // (x >= y) -> (x | ~y)
+ return DAG.getNode(ISD::OR, dl, VT, Op0, NotOp1);
+ }
+}
+
+static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+ SDValue CC = Op.getOperand(2);
+ MVT VT = Op.getSimpleValueType();
+ SDLoc dl(Op);
+
+ assert(Op0.getSimpleValueType().getVectorElementType().getSizeInBits() >= 8 &&
+ Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
+ "Cannot set masked compare for this operation");
+
+ ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
+ unsigned Opc = 0;
+ bool Unsigned = false;
+ bool Swap = false;
+ unsigned SSECC;
+ switch (SetCCOpcode) {
+ default: llvm_unreachable("Unexpected SETCC condition");
+ case ISD::SETNE: SSECC = 4; break;
+ case ISD::SETEQ: Opc = X86ISD::PCMPEQM; break;
+ case ISD::SETUGT: SSECC = 6; Unsigned = true; break;
+ case ISD::SETLT: Swap = true; //fall-through
+ case ISD::SETGT: Opc = X86ISD::PCMPGTM; break;
+ case ISD::SETULT: SSECC = 1; Unsigned = true; break;
+ case ISD::SETUGE: SSECC = 5; Unsigned = true; break; //NLT
+ case ISD::SETGE: Swap = true; SSECC = 2; break; // LE + swap
+ case ISD::SETULE: Unsigned = true; //fall-through
+ case ISD::SETLE: SSECC = 2; break;
+ }
+
+ if (Swap)
+ std::swap(Op0, Op1);
+ if (Opc)
+ return DAG.getNode(Opc, dl, VT, Op0, Op1);
+ Opc = Unsigned ? X86ISD::CMPMU : X86ISD::CMPM;
+ return DAG.getNode(Opc, dl, VT, Op0, Op1,
+ DAG.getConstant(SSECC, dl, MVT::i8));
+}
+
+/// \brief Try to turn a VSETULT into a VSETULE by modifying its second
+/// operand \p Op1. If non-trivial (for example because it's not constant)
+/// return an empty value.
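+/// For example, x <u <2, 4> becomes x <=u <1, 3>; a zero element would
+/// underflow, so we bail out in that case.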
+static SDValue ChangeVSETULTtoVSETULE(SDLoc dl, SDValue Op1, SelectionDAG &DAG)
+{
+ BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1.getNode());
+ if (!BV)
+ return SDValue();
+
+ MVT VT = Op1.getSimpleValueType();
+ MVT EVT = VT.getVectorElementType();
+ unsigned n = VT.getVectorNumElements();
+ SmallVector<SDValue, 8> ULTOp1;
+
+ for (unsigned i = 0; i < n; ++i) {
+ ConstantSDNode *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
+ if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EVT)
+ return SDValue();
+
+ // Avoid underflow.
+ APInt Val = Elt->getAPIntValue();
+ if (Val == 0)
+ return SDValue();
+
+ ULTOp1.push_back(DAG.getConstant(Val - 1, dl, EVT));
+ }
+
+ return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, ULTOp1);
+}
+
+static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+ SDValue CC = Op.getOperand(2);
+ MVT VT = Op.getSimpleValueType();
+ ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
+ bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
+ SDLoc dl(Op);
+
+ if (isFP) {
+#ifndef NDEBUG
+ MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
+ assert(EltVT == MVT::f32 || EltVT == MVT::f64);
+#endif
+
+ unsigned SSECC = translateX86FSETCC(SetCCOpcode, Op0, Op1);
+ unsigned Opc = X86ISD::CMPP;
+ if (Subtarget->hasAVX512() && VT.getVectorElementType() == MVT::i1) {
+ assert(VT.getVectorNumElements() <= 16);
+ Opc = X86ISD::CMPM;
+ }
+ // In the two special cases we can't handle, emit two comparisons.
+ if (SSECC == 8) {
+ unsigned CC0, CC1;
+ unsigned CombineOpc;
+ if (SetCCOpcode == ISD::SETUEQ) {
+ CC0 = 3; CC1 = 0; CombineOpc = ISD::OR;
+ } else {
+ assert(SetCCOpcode == ISD::SETONE);
+ CC0 = 7; CC1 = 4; CombineOpc = ISD::AND;
+ }
+
+ SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
+ DAG.getConstant(CC0, dl, MVT::i8));
+ SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
+ DAG.getConstant(CC1, dl, MVT::i8));
+ return DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
+ }
+ // Handle all other FP comparisons here.
+ return DAG.getNode(Opc, dl, VT, Op0, Op1,
+ DAG.getConstant(SSECC, dl, MVT::i8));
+ }
+
+ MVT VTOp0 = Op0.getSimpleValueType();
+ assert(VTOp0 == Op1.getSimpleValueType() &&
+ "Expected operands with same type!");
+ assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
+ "Invalid number of packed elements for source and destination!");
+
+ if (VT.is128BitVector() && VTOp0.is256BitVector()) {
+ // On non-AVX512 targets, a vector of MVT::i1 is promoted by the type
+ // legalizer to a wider vector type. In the case of 'vsetcc' nodes, the
+ // legalizer first checks whether the first operand of the setcc has
+ // a legal type. If so, then it promotes the return type to that same type.
+ // Otherwise, the return type is promoted to the 'next legal type' which,
+ // for a vector of MVT::i1 is always a 128-bit integer vector type.
+ //
+ // We reach this code only if the following two conditions are met:
+ // 1. Both return type and operand type have been promoted to wider types
+ // by the type legalizer.
+ // 2. The original operand type has been promoted to a 256-bit vector.
+ //
+ // Note that condition 2. only applies for AVX targets.
+ SDValue NewOp = DAG.getSetCC(dl, VTOp0, Op0, Op1, SetCCOpcode);
+ return DAG.getZExtOrTrunc(NewOp, dl, VT);
+ }
+
+ // The non-AVX512 code below works under the assumption that source and
+ // destination types are the same.
+ assert((Subtarget->hasAVX512() || (VT == VTOp0)) &&
+ "Value types for source and destination must be the same!");
+
+ // Break 256-bit integer vector compare into smaller ones.
+ if (VT.is256BitVector() && !Subtarget->hasInt256())
+ return Lower256IntVSETCC(Op, DAG);
+
+ MVT OpVT = Op1.getSimpleValueType();
+ if (OpVT.getVectorElementType() == MVT::i1)
+ return LowerBoolVSETCC_AVX512(Op, DAG);
+
+ bool MaskResult = (VT.getVectorElementType() == MVT::i1);
+ if (Subtarget->hasAVX512()) {
+ if (Op1.getSimpleValueType().is512BitVector() ||
+ (Subtarget->hasBWI() && Subtarget->hasVLX()) ||
+ (MaskResult && OpVT.getVectorElementType().getSizeInBits() >= 32))
+ return LowerIntVSETCC_AVX512(Op, DAG, Subtarget);
+
+ // In the AVX-512 architecture, setcc returns a mask with i1 elements,
+ // but KNL has no compare instructions for i8 and i16 elements.
+ // 512-bit operands are not a concern here; those types are illegal.
+ if (MaskResult &&
+ (OpVT.getVectorElementType().getSizeInBits() < 32 &&
+ OpVT.getVectorElementType().getSizeInBits() >= 8))
+ return DAG.getNode(ISD::TRUNCATE, dl, VT,
+ DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC));
+ }
+
+ // Lower using XOP integer comparisons.
+ if ((VT == MVT::v16i8 || VT == MVT::v8i16 ||
+ VT == MVT::v4i32 || VT == MVT::v2i64) && Subtarget->hasXOP()) {
+ // Translate compare code to XOP PCOM compare mode.
+ unsigned CmpMode = 0;
+ switch (SetCCOpcode) {
+ default: llvm_unreachable("Unexpected SETCC condition");
+ case ISD::SETULT:
+ case ISD::SETLT: CmpMode = 0x00; break;
+ case ISD::SETULE:
+ case ISD::SETLE: CmpMode = 0x01; break;
+ case ISD::SETUGT:
+ case ISD::SETGT: CmpMode = 0x02; break;
+ case ISD::SETUGE:
+ case ISD::SETGE: CmpMode = 0x03; break;
+ case ISD::SETEQ: CmpMode = 0x04; break;
+ case ISD::SETNE: CmpMode = 0x05; break;
+ }
+
+ // Are we comparing unsigned or signed integers?
+ unsigned Opc = ISD::isUnsignedIntSetCC(SetCCOpcode)
+ ? X86ISD::VPCOMU : X86ISD::VPCOM;
+
+ return DAG.getNode(Opc, dl, VT, Op0, Op1,
+ DAG.getConstant(CmpMode, dl, MVT::i8));
+ }
+
+ // We are handling one of the integer comparisons here. Since SSE only has
+ // GT and EQ comparisons for integer, swapping operands and multiple
+ // operations may be required for some comparisons.
+ unsigned Opc;
+ bool Swap = false, Invert = false, FlipSigns = false, MinMax = false;
+ bool Subus = false;
+
+ switch (SetCCOpcode) {
+ default: llvm_unreachable("Unexpected SETCC condition");
+ case ISD::SETNE: Invert = true;
+ case ISD::SETEQ: Opc = X86ISD::PCMPEQ; break;
+ case ISD::SETLT: Swap = true;
+ case ISD::SETGT: Opc = X86ISD::PCMPGT; break;
+ case ISD::SETGE: Swap = true;
+ case ISD::SETLE: Opc = X86ISD::PCMPGT;
+ Invert = true; break;
+ case ISD::SETULT: Swap = true;
+ case ISD::SETUGT: Opc = X86ISD::PCMPGT;
+ FlipSigns = true; break;
+ case ISD::SETUGE: Swap = true;
+ case ISD::SETULE: Opc = X86ISD::PCMPGT;
+ FlipSigns = true; Invert = true; break;
+ }
+
+ // Special case: Use min/max operations for SETULE/SETUGE
+ MVT VET = VT.getVectorElementType();
+ bool hasMinMax =
+ (Subtarget->hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32))
+ || (Subtarget->hasSSE2() && (VET == MVT::i8));
+
+ if (hasMinMax) {
+ switch (SetCCOpcode) {
+ default: break;
+ case ISD::SETULE: Opc = ISD::UMIN; MinMax = true; break;
+ case ISD::SETUGE: Opc = ISD::UMAX; MinMax = true; break;
+ }
+
+ if (MinMax) { Swap = false; Invert = false; FlipSigns = false; }
+ }
+
+ bool hasSubus = Subtarget->hasSSE2() && (VET == MVT::i8 || VET == MVT::i16);
+ if (!MinMax && hasSubus) {
+ // As another special case, use PSUBUS[BW] when it's profitable. E.g. for
+ // Op0 u<= Op1:
+ // t = psubus Op0, Op1
+ // pcmpeq t, <0..0>
+ switch (SetCCOpcode) {
+ default: break;
+ case ISD::SETULT: {
+ // If the comparison is against a constant we can turn this into a
+ // setule. With psubus, setule does not require a swap. This is
+ // beneficial because the constant in a register is no longer
+ // clobbered as the destination, so it can be hoisted out of a loop.
+ // Only do this pre-AVX, since the AVX vpcmp* forms are non-destructive.
+ if (Subtarget->hasAVX())
+ break;
+ SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG);
+ if (ULEOp1.getNode()) {
+ Op1 = ULEOp1;
+ Subus = true; Invert = false; Swap = false;
+ }
+ break;
+ }
+ // Psubus is better than flip-sign because it requires no inversion.
+ case ISD::SETUGE: Subus = true; Invert = false; Swap = true; break;
+ case ISD::SETULE: Subus = true; Invert = false; Swap = false; break;
+ }
+
+ if (Subus) {
+ Opc = X86ISD::SUBUS;
+ FlipSigns = false;
+ }
+ }
+
+ if (Swap)
+ std::swap(Op0, Op1);
+
+ // Check that the operation in question is available (most are plain SSE2,
+ // but PCMPGTQ and PCMPEQQ have different requirements).
+ if (VT == MVT::v2i64) {
+ if (Opc == X86ISD::PCMPGT && !Subtarget->hasSSE42()) {
+ assert(Subtarget->hasSSE2() && "Don't know how to lower!");
+
+ // First cast everything to the right type.
+ Op0 = DAG.getBitcast(MVT::v4i32, Op0);
+ Op1 = DAG.getBitcast(MVT::v4i32, Op1);
+
+ // Since SSE has no unsigned integer comparisons, we need to flip the sign
+ // bits of the inputs before performing those operations. The lower
+ // compare is always unsigned.
+ SDValue SB;
+ if (FlipSigns) {
+ SB = DAG.getConstant(0x80000000U, dl, MVT::v4i32);
+ } else {
+ SDValue Sign = DAG.getConstant(0x80000000U, dl, MVT::i32);
+ SDValue Zero = DAG.getConstant(0x00000000U, dl, MVT::i32);
+ SB = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
+ Sign, Zero, Sign, Zero);
+ }
+ Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB);
+ Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB);
+
+ // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
+ SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
+ SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
+
+ // Create masks for only the low parts/high parts of the 64 bit integers.
+ static const int MaskHi[] = { 1, 1, 3, 3 };
+ static const int MaskLo[] = { 0, 0, 2, 2 };
+ SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
+ SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
+ SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
+
+ SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
+ Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
+
+ if (Invert)
+ Result = DAG.getNOT(dl, Result, MVT::v4i32);
+
+ return DAG.getBitcast(VT, Result);
+ }
+
+ if (Opc == X86ISD::PCMPEQ && !Subtarget->hasSSE41()) {
+ // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
+ // pcmpeqd + pshufd + pand.
+ assert(Subtarget->hasSSE2() && !FlipSigns && "Don't know how to lower!");
+
+ // First cast everything to the right type.
+ Op0 = DAG.getBitcast(MVT::v4i32, Op0);
+ Op1 = DAG.getBitcast(MVT::v4i32, Op1);
+
+ // Do the compare.
+ SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
+
+ // Make sure the lower and upper halves are both all-ones.
+ static const int Mask[] = { 1, 0, 3, 2 };
+ SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
+ Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
+
+ if (Invert)
+ Result = DAG.getNOT(dl, Result, MVT::v4i32);
+
+ return DAG.getBitcast(VT, Result);
+ }
+ }
+
+ // Since SSE has no unsigned integer comparisons, we need to flip the sign
+ // bits of the inputs before performing those operations.
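+ // e.g. x >u y == (x ^ SignBit) >s (y ^ SignBit).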
+ if (FlipSigns) {
+ MVT EltVT = VT.getVectorElementType();
+ SDValue SB = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), dl,
+ VT);
+ Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SB);
+ Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SB);
+ }
+
+ SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
+
+ // If the logical-not of the result is required, perform that now.
+ if (Invert)
+ Result = DAG.getNOT(dl, Result, VT);
+
+ if (MinMax)
+ Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
+
+ if (Subus)
+ Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
+ getZeroVector(VT, Subtarget, DAG, dl));
+
+ return Result;
+}
+
+SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
+
+ MVT VT = Op.getSimpleValueType();
+
+ if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
+
+ assert(((!Subtarget->hasAVX512() && VT == MVT::i8) || (VT == MVT::i1))
+ && "SetCC type must be 8-bit or 1-bit integer");
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+ SDLoc dl(Op);
+ ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+
+ // Optimize to BT if possible.
+ // Lower (X & (1 << N)) == 0 to BT(X, N).
+ // Lower ((X >>u N) & 1) != 0 to BT(X, N).
+ // Lower ((X >>s N) & 1) != 0 to BT(X, N).
+ if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() &&
+ isNullConstant(Op1) &&
+ (CC == ISD::SETEQ || CC == ISD::SETNE)) {
+ if (SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG)) {
+ if (VT == MVT::i1)
+ return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewSetCC);
+ return NewSetCC;
+ }
+ }
+
+ // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of
+ // these.
+ if ((isOneConstant(Op1) || isNullConstant(Op1)) &&
+ (CC == ISD::SETEQ || CC == ISD::SETNE)) {
+
+ // If the input is a setcc, then reuse the input setcc or use a new one with
+ // the inverted condition.
+ if (Op0.getOpcode() == X86ISD::SETCC) {
+ X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
+ bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
+ if (!Invert)
+ return Op0;
+
+ CCode = X86::GetOppositeBranchCondition(CCode);
+ SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
+ DAG.getConstant(CCode, dl, MVT::i8),
+ Op0.getOperand(1));
+ if (VT == MVT::i1)
+ return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
+ return SetCC;
+ }
+ }
+ if ((Op0.getValueType() == MVT::i1) && isOneConstant(Op1) &&
+ (CC == ISD::SETEQ || CC == ISD::SETNE)) {
+
+ ISD::CondCode NewCC = ISD::getSetCCInverse(CC, true);
+ return DAG.getSetCC(dl, VT, Op0, DAG.getConstant(0, dl, MVT::i1), NewCC);
+ }
+
+ bool isFP = Op1.getSimpleValueType().isFloatingPoint();
+ unsigned X86CC = TranslateX86CC(CC, dl, isFP, Op0, Op1, DAG);
+ if (X86CC == X86::COND_INVALID)
+ return SDValue();
+
+ SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG);
+ EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
+ SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
+ DAG.getConstant(X86CC, dl, MVT::i8), EFLAGS);
+ if (VT == MVT::i1)
+ return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
+ return SetCC;
+}
+
+SDValue X86TargetLowering::LowerSETCCE(SDValue Op, SelectionDAG &DAG) const {
+ SDValue LHS = Op.getOperand(0);
+ SDValue RHS = Op.getOperand(1);
+ SDValue Carry = Op.getOperand(2);
+ SDValue Cond = Op.getOperand(3);
+ SDLoc DL(Op);
+
+ assert(LHS.getSimpleValueType().isInteger() && "SETCCE is integer only.");
+ X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
+
+ assert(Carry.getOpcode() != ISD::CARRY_FALSE);
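+ // Compute LHS - RHS - Carry with SBB; the requested condition is then
+ // read from the EFLAGS result (value 1) of that node.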
+ SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
+ SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry);
+ return DAG.getNode(X86ISD::SETCC, DL, Op.getValueType(),
+ DAG.getConstant(CC, DL, MVT::i8), Cmp.getValue(1));
+}
+
+// isX86LogicalCmp - Return true if the opcode is an X86 logical comparison.
+static bool isX86LogicalCmp(SDValue Op) {
+ unsigned Opc = Op.getNode()->getOpcode();
+ if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
+ Opc == X86ISD::SAHF)
+ return true;
+ if (Op.getResNo() == 1 &&
+ (Opc == X86ISD::ADD ||
+ Opc == X86ISD::SUB ||
+ Opc == X86ISD::ADC ||
+ Opc == X86ISD::SBB ||
+ Opc == X86ISD::SMUL ||
+ Opc == X86ISD::UMUL ||
+ Opc == X86ISD::INC ||
+ Opc == X86ISD::DEC ||
+ Opc == X86ISD::OR ||
+ Opc == X86ISD::XOR ||
+ Opc == X86ISD::AND))
+ return true;
+
+ if (Op.getResNo() == 2 && Opc == X86ISD::UMUL)
+ return true;
+
+ return false;
+}
+
+static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
+ if (V.getOpcode() != ISD::TRUNCATE)
+ return false;
+
+ SDValue VOp0 = V.getOperand(0);
+ unsigned InBits = VOp0.getValueSizeInBits();
+ unsigned Bits = V.getValueSizeInBits();
+ return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
+}
+
+SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
+ bool addTest = true;
+ SDValue Cond = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+ SDValue Op2 = Op.getOperand(2);
+ SDLoc DL(Op);
+ MVT VT = Op1.getSimpleValueType();
+ SDValue CC;
+
+ // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
+ // are available or VBLENDV if AVX is available.
+ // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
+ if (Cond.getOpcode() == ISD::SETCC &&
+ ((Subtarget->hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) ||
+ (Subtarget->hasSSE1() && VT == MVT::f32)) &&
+ VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
+ SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
+ int SSECC = translateX86FSETCC(
+ cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);
+
+ if (SSECC != 8) {
+ if (Subtarget->hasAVX512()) {
+ SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CondOp0, CondOp1,
+ DAG.getConstant(SSECC, DL, MVT::i8));
+ return DAG.getNode(X86ISD::SELECT, DL, VT, Cmp, Op1, Op2);
+ }
+
+ SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
+ DAG.getConstant(SSECC, DL, MVT::i8));
+
+ // If we have AVX, we can use a variable vector select (VBLENDV) instead
+ // of 3 logic instructions for size savings and potentially speed.
+ // Unfortunately, there is no scalar form of VBLENDV.
+
+ // If either operand is a constant, don't try this. We can expect to
+ // optimize away at least one of the logic instructions later in that
+ // case, so that sequence would be faster than a variable blend.
+
+ // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
+ // uses XMM0 as the selection register. That may need just as many
+ // instructions as the AND/ANDN/OR sequence due to register moves, so
+ // don't bother.
+
+ if (Subtarget->hasAVX() &&
+ !isa<ConstantFPSDNode>(Op1) && !isa<ConstantFPSDNode>(Op2)) {
+
+ // Convert to vectors, do a VSELECT, and convert back to scalar.
+ // All of the conversions should be optimized away.
+
+ MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
+ SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
+ SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
+ SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
+
+ MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
+ VCmp = DAG.getBitcast(VCmpVT, VCmp);
+
+ SDValue VSel = DAG.getNode(ISD::VSELECT, DL, VecVT, VCmp, VOp1, VOp2);
+
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
+ VSel, DAG.getIntPtrConstant(0, DL));
+ }
+ SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
+ SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
+ return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
+ }
+ }
+
+ if (VT.isVector() && VT.getVectorElementType() == MVT::i1) {
+ SDValue Op1Scalar;
+ if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode()))
+ Op1Scalar = ConvertI1VectorToInteger(Op1, DAG);
+ else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0))
+ Op1Scalar = Op1.getOperand(0);
+ SDValue Op2Scalar;
+ if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode()))
+ Op2Scalar = ConvertI1VectorToInteger(Op2, DAG);
+ else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0))
+ Op2Scalar = Op2.getOperand(0);
+ if (Op1Scalar.getNode() && Op2Scalar.getNode()) {
+ SDValue newSelect = DAG.getNode(ISD::SELECT, DL,
+ Op1Scalar.getValueType(),
+ Cond, Op1Scalar, Op2Scalar);
+ if (newSelect.getValueSizeInBits() == VT.getSizeInBits())
+ return DAG.getBitcast(VT, newSelect);
+ SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec,
+ DAG.getIntPtrConstant(0, DL));
+ }
+ }
+
+ if (VT == MVT::v4i1 || VT == MVT::v2i1) {
+ SDValue zeroConst = DAG.getIntPtrConstant(0, DL);
+ Op1 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
+ DAG.getUNDEF(MVT::v8i1), Op1, zeroConst);
+ Op2 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
+ DAG.getUNDEF(MVT::v8i1), Op2, zeroConst);
+ SDValue newSelect = DAG.getNode(ISD::SELECT, DL, MVT::v8i1,
+ Cond, Op1, Op2);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, newSelect, zeroConst);
+ }
+
+ if (Cond.getOpcode() == ISD::SETCC) {
+ SDValue NewCond = LowerSETCC(Cond, DAG);
+ if (NewCond.getNode())
+ Cond = NewCond;
+ }
+
+ // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
+ // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
+ // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
+ // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
+ if (Cond.getOpcode() == X86ISD::SETCC &&
+ Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
+ isNullConstant(Cond.getOperand(1).getOperand(1))) {
+ SDValue Cmp = Cond.getOperand(1);
+
+ unsigned CondCode = cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
+
+ if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
+ (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
+ SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
+
+ SDValue CmpOp0 = Cmp.getOperand(0);
+ // Apply further optimizations for special cases
+ // (select (x != 0), -1, 0) -> neg & sbb
+ // (select (x == 0), 0, -1) -> neg & sbb
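+ // (NEG x sets CF exactly when x != 0; SETCC_CARRY then smears CF across
+ // the whole register via SBB.)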
+ if (isNullConstant(Y) &&
+ (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
+ SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
+ SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs,
+ DAG.getConstant(0, DL,
+ CmpOp0.getValueType()),
+ CmpOp0);
+ SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
+ DAG.getConstant(X86::COND_B, DL, MVT::i8),
+ SDValue(Neg.getNode(), 1));
+ return Res;
+ }
+
+ Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
+ CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType()));
+ Cmp = ConvertCmpIfNecessary(Cmp, DAG);
+
+ SDValue Res = // Res = 0 or -1.
+ DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
+ DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp);
+
+ if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E))
+ Res = DAG.getNOT(DL, Res, Res.getValueType());
+
+ if (!isNullConstant(Op2))
+ Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
+ return Res;
+ }
+ }
+
+ // Look past (and (setcc_carry (cmp ...)), 1).
+ if (Cond.getOpcode() == ISD::AND &&
+ Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
+ isOneConstant(Cond.getOperand(1)))
+ Cond = Cond.getOperand(0);
+
+ // If condition flag is set by a X86ISD::CMP, then use it as the condition
+ // setting operand in place of the X86ISD::SETCC.
+ unsigned CondOpcode = Cond.getOpcode();
+ if (CondOpcode == X86ISD::SETCC ||
+ CondOpcode == X86ISD::SETCC_CARRY) {
+ CC = Cond.getOperand(0);
+
+ SDValue Cmp = Cond.getOperand(1);
+ unsigned Opc = Cmp.getOpcode();
+ MVT VT = Op.getSimpleValueType();
+
+ bool IllegalFPCMov = false;
+ if (VT.isFloatingPoint() && !VT.isVector() &&
+ !isScalarFPTypeInSSEReg(VT)) // FPStack?
+ IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
+
+ if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
+ Opc == X86ISD::BT) { // FIXME
+ Cond = Cmp;
+ addTest = false;
+ }
+ } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
+ CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
+ ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
+ Cond.getOperand(0).getValueType() != MVT::i8)) {
+ SDValue LHS = Cond.getOperand(0);
+ SDValue RHS = Cond.getOperand(1);
+ unsigned X86Opcode;
+ unsigned X86Cond;
+ SDVTList VTs;
+ switch (CondOpcode) {
+ case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
+ case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
+ case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
+ case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
+ case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
+ case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
+ default: llvm_unreachable("unexpected overflowing operator");
+ }
+ if (CondOpcode == ISD::UMULO)
+ VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
+ MVT::i32);
+ else
+ VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
+
+ SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS);
+
+ if (CondOpcode == ISD::UMULO)
+ Cond = X86Op.getValue(2);
+ else
+ Cond = X86Op.getValue(1);
+
+ CC = DAG.getConstant(X86Cond, DL, MVT::i8);
+ addTest = false;
+ }
+
+ if (addTest) {
+ // Look past the truncate if the high bits are known zero.
+ if (isTruncWithZeroHighBitsInput(Cond, DAG))
+ Cond = Cond.getOperand(0);
+
+ // We know the result of AND is compared against zero. Try to match
+ // it to BT.
+ if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
+ if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG)) {
+ CC = NewSetCC.getOperand(0);
+ Cond = NewSetCC.getOperand(1);
+ addTest = false;
+ }
+ }
+ }
+
+ if (addTest) {
+ CC = DAG.getConstant(X86::COND_NE, DL, MVT::i8);
+ Cond = EmitTest(Cond, X86::COND_NE, DL, DAG);
+ }
+
+ // a < b ? -1 : 0 -> RES = setcc_carry
+ // a < b ? 0 : -1 -> RES = ~setcc_carry
+ // a >= b ? -1 : 0 -> RES = ~setcc_carry
+ // a >= b ? 0 : -1 -> RES = setcc_carry
+ if (Cond.getOpcode() == X86ISD::SUB) {
+ Cond = ConvertCmpIfNecessary(Cond, DAG);
+ unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
+
+ if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
+ (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
+ (isNullConstant(Op1) || isNullConstant(Op2))) {
+ SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
+ DAG.getConstant(X86::COND_B, DL, MVT::i8),
+ Cond);
+ if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
+ return DAG.getNOT(DL, Res, Res.getValueType());
+ return Res;
+ }
+ }
+
+ // X86 doesn't have an i8 cmov. If both operands are the result of a truncate
+ // widen the cmov and push the truncate through. This avoids introducing a new
+ // branch during isel and doesn't add any extensions.
+ if (Op.getValueType() == MVT::i8 &&
+ Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
+ SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
+ if (T1.getValueType() == T2.getValueType() &&
+ // Blacklist CopyFromReg to avoid partial register stalls.
+ T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode() != ISD::CopyFromReg) {
+ SDVTList VTs = DAG.getVTList(T1.getValueType(), MVT::Glue);
+ SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VTs, T2, T1, CC, Cond);
+ return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
+ }
+ }
+
+ // X86ISD::CMOV yields its second value operand (here Op1) when the
+ // condition is true and its first (here Op2) otherwise.
+ SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
+ SDValue Ops[] = { Op2, Op1, CC, Cond };
+ return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops);
+}
+
+static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ MVT VT = Op->getSimpleValueType(0);
+ SDValue In = Op->getOperand(0);
+ MVT InVT = In.getSimpleValueType();
+ MVT VTElt = VT.getVectorElementType();
+ MVT InVTElt = InVT.getVectorElementType();
+ SDLoc dl(Op);
+
+ // SKX processor
+ if ((InVTElt == MVT::i1) &&
+ (((Subtarget->hasBWI() && Subtarget->hasVLX() &&
+ VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() <= 16)) ||
+
+ ((Subtarget->hasBWI() && VT.is512BitVector() &&
+ VTElt.getSizeInBits() <= 16)) ||
+
+ ((Subtarget->hasDQI() && Subtarget->hasVLX() &&
+ VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() >= 32)) ||
+
+ ((Subtarget->hasDQI() && VT.is512BitVector() &&
+ VTElt.getSizeInBits() >= 32))))
+ return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
+
+ unsigned int NumElts = VT.getVectorNumElements();
+
+ if (NumElts != 8 && NumElts != 16 && !Subtarget->hasBWI())
+ return SDValue();
+
+ if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1) {
+ if (In.getOpcode() == X86ISD::VSEXT || In.getOpcode() == X86ISD::VZEXT)
+ return DAG.getNode(In.getOpcode(), dl, VT, In.getOperand(0));
+ return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
+ }
+
+ assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected vector type");
+ MVT ExtVT = NumElts == 8 ? MVT::v8i64 : MVT::v16i32;
+ SDValue NegOne =
+ DAG.getConstant(APInt::getAllOnesValue(ExtVT.getScalarSizeInBits()), dl,
+ ExtVT);
+ SDValue Zero =
+ DAG.getConstant(APInt::getNullValue(ExtVT.getScalarSizeInBits()), dl, ExtVT);
+
+ SDValue V = DAG.getNode(ISD::VSELECT, dl, ExtVT, In, NegOne, Zero);
+ if (VT.is512BitVector())
+ return V;
+ return DAG.getNode(X86ISD::VTRUNC, dl, VT, V);
+}
+
+static SDValue LowerSIGN_EXTEND_VECTOR_INREG(SDValue Op,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ SDValue In = Op->getOperand(0);
+ MVT VT = Op->getSimpleValueType(0);
+ MVT InVT = In.getSimpleValueType();
+ assert(VT.getSizeInBits() == InVT.getSizeInBits());
+
+ MVT InSVT = InVT.getVectorElementType();
+ assert(VT.getVectorElementType().getSizeInBits() > InSVT.getSizeInBits());
+
+ if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16)
+ return SDValue();
+ if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
+ return SDValue();
+
+ SDLoc dl(Op);
+
+ // SSE41 targets can use the pmovsx* instructions directly.
+ if (Subtarget->hasSSE41())
+ return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
+
+ // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
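+ // Each UNPCKL places the source elements in the high halves of wider lanes,
+ // so the arithmetic shift below fills the low halves with sign bits.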
+ SDValue Curr = In;
+ MVT CurrVT = InVT;
+
+ // As SRAI is only available on i16/i32 types, we expand only up to i32
+ // and handle i64 separately.
+ while (CurrVT != VT && CurrVT.getVectorElementType() != MVT::i32) {
+ Curr = DAG.getNode(X86ISD::UNPCKL, dl, CurrVT, DAG.getUNDEF(CurrVT), Curr);
+ MVT CurrSVT = MVT::getIntegerVT(CurrVT.getScalarSizeInBits() * 2);
+ CurrVT = MVT::getVectorVT(CurrSVT, CurrVT.getVectorNumElements() / 2);
+ Curr = DAG.getBitcast(CurrVT, Curr);
+ }
+
+ SDValue SignExt = Curr;
+ if (CurrVT != InVT) {
+ unsigned SignExtShift =
+ CurrVT.getVectorElementType().getSizeInBits() - InSVT.getSizeInBits();
+ SignExt = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
+ DAG.getConstant(SignExtShift, dl, MVT::i8));
+ }
+
+ if (CurrVT == VT)
+ return SignExt;
+
+ if (VT == MVT::v2i64 && CurrVT == MVT::v4i32) {
+ SDValue Sign = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
+ DAG.getConstant(31, dl, MVT::i8));
+ SDValue Ext = DAG.getVectorShuffle(CurrVT, dl, SignExt, Sign, {0, 4, 1, 5});
+ return DAG.getBitcast(VT, Ext);
+ }
+
+ return SDValue();
+}
+
+static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ MVT VT = Op->getSimpleValueType(0);
+ SDValue In = Op->getOperand(0);
+ MVT InVT = In.getSimpleValueType();
+ SDLoc dl(Op);
+
+ if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
+ return LowerSIGN_EXTEND_AVX512(Op, Subtarget, DAG);
+
+ if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
+ (VT != MVT::v8i32 || InVT != MVT::v8i16) &&
+ (VT != MVT::v16i16 || InVT != MVT::v16i8))
+ return SDValue();
+
+ if (Subtarget->hasInt256())
+ return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
+
+ // Optimize vectors in AVX mode:
+ // sign-extend v8i16 to v8i32 and v4i32 to v4i64.
+ //
+ // Divide the input vector into two parts;
+ // for v4i32 the shuffle masks will be { 0, 1, -1, -1 } and { 2, 3, -1, -1 }.
+ // Use the vpmovsx instruction to extend v4i32 -> v2i64 and v8i16 -> v4i32,
+ // then concatenate the vectors back to the original VT.
+
+ unsigned NumElems = InVT.getVectorNumElements();
+ SDValue Undef = DAG.getUNDEF(InVT);
+
+ SmallVector<int,8> ShufMask1(NumElems, -1);
+ for (unsigned i = 0; i != NumElems/2; ++i)
+ ShufMask1[i] = i;
+
+ SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask1[0]);
+
+ SmallVector<int,8> ShufMask2(NumElems, -1);
+ for (unsigned i = 0; i != NumElems/2; ++i)
+ ShufMask2[i] = i + NumElems/2;
+
+ SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask2[0]);
+
+ MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(),
+ VT.getVectorNumElements()/2);
+
+ OpLo = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpLo);
+ OpHi = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpHi);
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
+}
+
+// Lower vector extended loads using a shuffle. If SSSE3 is not available we
+// may emit an illegal shuffle but the expansion is still better than scalar
+// code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise
+// we'll emit a shuffle and an arithmetic shift.
+// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
+// TODO: It is possible to support ZExt by zeroing the undef values during
+// the shuffle phase or after the shuffle.
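+// As a sketch of what this produces: a v4i8 sextload to v4i32 becomes a
+// single i32 scalar load, a scalar_to_vector, and (with SSE4.1) a pmovsxbd;
+// without SSE4.1 the pmovsxbd is replaced by the unpack+shift expansion
+// performed by LowerSIGN_EXTEND_VECTOR_INREG above.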
+static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ MVT RegVT = Op.getSimpleValueType();
+ assert(RegVT.isVector() && "We only custom lower vector sext loads.");
+ assert(RegVT.isInteger() &&
+ "We only custom lower integer vector sext loads.");
+
+ // Nothing useful we can do without SSE2 shuffles.
+ assert(Subtarget->hasSSE2() && "We only custom lower sext loads with SSE2.");
+
+ LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
+ SDLoc dl(Ld);
+ EVT MemVT = Ld->getMemoryVT();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ unsigned RegSz = RegVT.getSizeInBits();
+
+ ISD::LoadExtType Ext = Ld->getExtensionType();
+
+  assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD) &&
+         "Only anyext and sext are currently implemented.");
+ assert(MemVT != RegVT && "Cannot extend to the same type");
+ assert(MemVT.isVector() && "Must load a vector from memory");
+
+ unsigned NumElems = RegVT.getVectorNumElements();
+ unsigned MemSz = MemVT.getSizeInBits();
+ assert(RegSz > MemSz && "Register size must be greater than the mem size");
+
+ if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget->hasInt256()) {
+ // The only way in which we have a legal 256-bit vector result but not the
+ // integer 256-bit operations needed to directly lower a sextload is if we
+ // have AVX1 but not AVX2. In that case, we can always emit a sextload to
+ // a 128-bit vector and a normal sign_extend to 256-bits that should get
+ // correctly legalized. We do this late to allow the canonical form of
+ // sextload to persist throughout the rest of the DAG combiner -- it wants
+ // to fold together any extensions it can, and so will fuse a sign_extend
+ // of an sextload into a sextload targeting a wider value.
+ SDValue Load;
+ if (MemSz == 128) {
+ // Just switch this to a normal load.
+ assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, "
+ "it must be a legal 128-bit vector "
+ "type!");
+ Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(),
+ Ld->getPointerInfo(), Ld->isVolatile(), Ld->isNonTemporal(),
+ Ld->isInvariant(), Ld->getAlignment());
+ } else {
+ assert(MemSz < 128 &&
+ "Can't extend a type wider than 128 bits to a 256 bit vector!");
+ // Do an sext load to a 128-bit vector type. We want to use the same
+ // number of elements, but elements half as wide. This will end up being
+ // recursively lowered by this routine, but will succeed as we definitely
+ // have all the necessary features if we're using AVX1.
+ EVT HalfEltVT =
+ EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2);
+ EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems);
+ Load =
+ DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(),
+ Ld->getPointerInfo(), MemVT, Ld->isVolatile(),
+ Ld->isNonTemporal(), Ld->isInvariant(),
+ Ld->getAlignment());
+ }
+
+ // Replace chain users with the new chain.
+ assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
+
+ // Finally, do a normal sign-extend to the desired register.
+ return DAG.getSExtOrTrunc(Load, dl, RegVT);
+ }
+
+ // All sizes must be a power of two.
+ assert(isPowerOf2_32(RegSz * MemSz * NumElems) &&
+ "Non-power-of-two elements are not custom lowered!");
+
+ // Attempt to load the original value using scalar loads.
+ // Find the largest scalar type that divides the total loaded size.
+ MVT SclrLoadTy = MVT::i8;
+ for (MVT Tp : MVT::integer_valuetypes()) {
+ if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
+ SclrLoadTy = Tp;
+ }
+ }
+
+  // On 32-bit systems we can't load 64-bit integers directly; try using an
+  // f64 load and bitcasting instead.
+ if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
+ (64 <= MemSz))
+ SclrLoadTy = MVT::f64;
+
+ // Calculate the number of scalar loads that we need to perform
+ // in order to load our vector from memory.
+ unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
+
+ assert((Ext != ISD::SEXTLOAD || NumLoads == 1) &&
+ "Can only lower sext loads with a single scalar load!");
+
+  unsigned LoadRegSize = RegSz;
+  if (Ext == ISD::SEXTLOAD && RegSz >= 256)
+    LoadRegSize = 128;
+
+ // Represent our vector as a sequence of elements which are the
+ // largest scalar that we can load.
+ EVT LoadUnitVecVT = EVT::getVectorVT(
+      *DAG.getContext(), SclrLoadTy, LoadRegSize / SclrLoadTy.getSizeInBits());
+
+  // Represent the data using the same element type that is stored in
+  // memory. In practice, we "widen" MemVT.
+  EVT WideVecVT =
+      EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
+                      LoadRegSize / MemVT.getScalarSizeInBits());
+
+ assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
+ "Invalid vector type");
+
+ // We can't shuffle using an illegal type.
+ assert(TLI.isTypeLegal(WideVecVT) &&
+ "We only lower types that form legal widened vector types");
+
+ SmallVector<SDValue, 8> Chains;
+ SDValue Ptr = Ld->getBasePtr();
+ SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, dl,
+ TLI.getPointerTy(DAG.getDataLayout()));
+ SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
+
+ for (unsigned i = 0; i < NumLoads; ++i) {
+ // Perform a single load.
+ SDValue ScalarLoad =
+ DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
+ Ld->isVolatile(), Ld->isNonTemporal(), Ld->isInvariant(),
+ Ld->getAlignment());
+ Chains.push_back(ScalarLoad.getValue(1));
+ // Create the first element type using SCALAR_TO_VECTOR in order to avoid
+ // another round of DAGCombining.
+ if (i == 0)
+ Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
+ else
+ Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
+ ScalarLoad, DAG.getIntPtrConstant(i, dl));
+
+ Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
+ }
+
+ SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
+
+ // Bitcast the loaded value to a vector of the original element type, in
+ // the size of the target vector type.
+ SDValue SlicedVec = DAG.getBitcast(WideVecVT, Res);
+ unsigned SizeRatio = RegSz / MemSz;
+
+ if (Ext == ISD::SEXTLOAD) {
+ // If we have SSE4.1, we can directly emit a VSEXT node.
+ if (Subtarget->hasSSE41()) {
+ SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
+ return Sext;
+ }
+
+ // Otherwise we'll use SIGN_EXTEND_VECTOR_INREG to sign extend the lowest
+ // lanes.
+ assert(TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND_VECTOR_INREG, RegVT) &&
+ "We can't implement a sext load without SIGN_EXTEND_VECTOR_INREG!");
+
+ SDValue Shuff = DAG.getSignExtendVectorInReg(SlicedVec, dl, RegVT);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
+ return Shuff;
+ }
+
+ // Redistribute the loaded elements into the different locations.
+ SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
+ for (unsigned i = 0; i != NumElems; ++i)
+ ShuffleVec[i * SizeRatio] = i;
+
+ SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
+ DAG.getUNDEF(WideVecVT), &ShuffleVec[0]);
+
+ // Bitcast to the requested type.
+ Shuff = DAG.getBitcast(RegVT, Shuff);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
+ return Shuff;
+}
+
+// isAndOrOfSetCCs - Return true if node is an ISD::AND or
+// ISD::OR of two X86ISD::SETCC nodes, each of which has no other use apart
+// from the AND / OR.
+static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
+ Opc = Op.getOpcode();
+ if (Opc != ISD::OR && Opc != ISD::AND)
+ return false;
+ return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
+ Op.getOperand(0).hasOneUse() &&
+ Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
+ Op.getOperand(1).hasOneUse());
+}
+
+// isXor1OfSetCC - Return true if node is an ISD::XOR of an X86ISD::SETCC and
+// the constant 1, and that the SETCC node has a single use.
+static bool isXor1OfSetCC(SDValue Op) {
+ if (Op.getOpcode() != ISD::XOR)
+ return false;
+ if (isOneConstant(Op.getOperand(1)))
+ return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
+ Op.getOperand(0).hasOneUse();
+ return false;
+}
+
+SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
+ bool addTest = true;
+ SDValue Chain = Op.getOperand(0);
+ SDValue Cond = Op.getOperand(1);
+ SDValue Dest = Op.getOperand(2);
+ SDLoc dl(Op);
+ SDValue CC;
+ bool Inverted = false;
+
+ if (Cond.getOpcode() == ISD::SETCC) {
+ // Check for setcc([su]{add,sub,mul}o == 0).
+ if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
+ isNullConstant(Cond.getOperand(1)) &&
+ Cond.getOperand(0).getResNo() == 1 &&
+ (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
+ Cond.getOperand(0).getOpcode() == ISD::UADDO ||
+ Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
+ Cond.getOperand(0).getOpcode() == ISD::USUBO ||
+ Cond.getOperand(0).getOpcode() == ISD::SMULO ||
+ Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
+ Inverted = true;
+ Cond = Cond.getOperand(0);
+ } else {
+ SDValue NewCond = LowerSETCC(Cond, DAG);
+ if (NewCond.getNode())
+ Cond = NewCond;
+ }
+ }
+#if 0
+ // FIXME: LowerXALUO doesn't handle these!!
+ else if (Cond.getOpcode() == X86ISD::ADD ||
+ Cond.getOpcode() == X86ISD::SUB ||
+ Cond.getOpcode() == X86ISD::SMUL ||
+ Cond.getOpcode() == X86ISD::UMUL)
+ Cond = LowerXALUO(Cond, DAG);
+#endif
+
+  // Look past (and (setcc_carry (cmp ...)), 1).
+ if (Cond.getOpcode() == ISD::AND &&
+ Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
+ isOneConstant(Cond.getOperand(1)))
+ Cond = Cond.getOperand(0);
+
+  // If the condition flag is set by an X86ISD::CMP, then use it as the
+  // condition-setting operand in place of the X86ISD::SETCC.
+ unsigned CondOpcode = Cond.getOpcode();
+ if (CondOpcode == X86ISD::SETCC ||
+ CondOpcode == X86ISD::SETCC_CARRY) {
+ CC = Cond.getOperand(0);
+
+ SDValue Cmp = Cond.getOperand(1);
+ unsigned Opc = Cmp.getOpcode();
+ // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
+ if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
+ Cond = Cmp;
+ addTest = false;
+ } else {
+ switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
+ default: break;
+ case X86::COND_O:
+ case X86::COND_B:
+ // These can only come from an arithmetic instruction with overflow,
+ // e.g. SADDO, UADDO.
+ Cond = Cond.getNode()->getOperand(1);
+ addTest = false;
+ break;
+ }
+ }
+ }
+ CondOpcode = Cond.getOpcode();
+ if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
+ CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
+ ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
+ Cond.getOperand(0).getValueType() != MVT::i8)) {
+ SDValue LHS = Cond.getOperand(0);
+ SDValue RHS = Cond.getOperand(1);
+ unsigned X86Opcode;
+ unsigned X86Cond;
+ SDVTList VTs;
+ // Keep this in sync with LowerXALUO, otherwise we might create redundant
+ // instructions that can't be removed afterwards (i.e. X86ISD::ADD and
+ // X86ISD::INC).
+ switch (CondOpcode) {
+ case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
+ case ISD::SADDO:
+ if (isOneConstant(RHS)) {
+ X86Opcode = X86ISD::INC; X86Cond = X86::COND_O;
+ break;
+ }
+ X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
+ case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
+ case ISD::SSUBO:
+ if (isOneConstant(RHS)) {
+ X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O;
+ break;
+ }
+ X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
+ case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
+ case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
+ default: llvm_unreachable("unexpected overflowing operator");
+ }
+ if (Inverted)
+ X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond);
+ if (CondOpcode == ISD::UMULO)
+ VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
+ MVT::i32);
+ else
+ VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
+
+ SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS);
+
+ if (CondOpcode == ISD::UMULO)
+ Cond = X86Op.getValue(2);
+ else
+ Cond = X86Op.getValue(1);
+
+ CC = DAG.getConstant(X86Cond, dl, MVT::i8);
+ addTest = false;
+ } else {
+ unsigned CondOpc;
+ if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
+ SDValue Cmp = Cond.getOperand(0).getOperand(1);
+ if (CondOpc == ISD::OR) {
+ // Also, recognize the pattern generated by an FCMP_UNE. We can emit
+ // two branches instead of an explicit OR instruction with a
+ // separate test.
+ if (Cmp == Cond.getOperand(1).getOperand(1) &&
+ isX86LogicalCmp(Cmp)) {
+ CC = Cond.getOperand(0).getOperand(0);
+ Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
+ Chain, Dest, CC, Cmp);
+ CC = Cond.getOperand(1).getOperand(0);
+ Cond = Cmp;
+ addTest = false;
+ }
+ } else { // ISD::AND
+ // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
+ // two branches instead of an explicit AND instruction with a
+ // separate test. However, we only do this if this block doesn't
+ // have a fall-through edge, because this requires an explicit
+ // jmp when the condition is false.
+ if (Cmp == Cond.getOperand(1).getOperand(1) &&
+ isX86LogicalCmp(Cmp) &&
+ Op.getNode()->hasOneUse()) {
+ X86::CondCode CCode =
+ (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
+ CCode = X86::GetOppositeBranchCondition(CCode);
+ CC = DAG.getConstant(CCode, dl, MVT::i8);
+ SDNode *User = *Op.getNode()->use_begin();
+ // Look for an unconditional branch following this conditional branch.
+ // We need this because we need to reverse the successors in order
+ // to implement FCMP_OEQ.
+ if (User->getOpcode() == ISD::BR) {
+ SDValue FalseBB = User->getOperand(1);
+ SDNode *NewBR =
+ DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
+ assert(NewBR == User);
+ (void)NewBR;
+ Dest = FalseBB;
+
+ Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
+ Chain, Dest, CC, Cmp);
+ X86::CondCode CCode =
+ (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
+ CCode = X86::GetOppositeBranchCondition(CCode);
+ CC = DAG.getConstant(CCode, dl, MVT::i8);
+ Cond = Cmp;
+ addTest = false;
+ }
+ }
+ }
+ } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
+      // Recognize xorb (setcc), 1 patterns. The xor inverts the condition.
+      // These should be transformed by the DAG combiner, except when the
+      // condition is set by an arithmetic-with-overflow node.
+ X86::CondCode CCode =
+ (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
+ CCode = X86::GetOppositeBranchCondition(CCode);
+ CC = DAG.getConstant(CCode, dl, MVT::i8);
+ Cond = Cond.getOperand(0).getOperand(1);
+ addTest = false;
+ } else if (Cond.getOpcode() == ISD::SETCC &&
+ cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
+ // For FCMP_OEQ, we can emit
+ // two branches instead of an explicit AND instruction with a
+ // separate test. However, we only do this if this block doesn't
+ // have a fall-through edge, because this requires an explicit
+ // jmp when the condition is false.
+ if (Op.getNode()->hasOneUse()) {
+ SDNode *User = *Op.getNode()->use_begin();
+ // Look for an unconditional branch following this conditional branch.
+ // We need this because we need to reverse the successors in order
+ // to implement FCMP_OEQ.
+ if (User->getOpcode() == ISD::BR) {
+ SDValue FalseBB = User->getOperand(1);
+ SDNode *NewBR =
+ DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
+ assert(NewBR == User);
+ (void)NewBR;
+ Dest = FalseBB;
+
+ SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
+ Cond.getOperand(0), Cond.getOperand(1));
+ Cmp = ConvertCmpIfNecessary(Cmp, DAG);
+ CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
+ Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
+ Chain, Dest, CC, Cmp);
+ CC = DAG.getConstant(X86::COND_P, dl, MVT::i8);
+ Cond = Cmp;
+ addTest = false;
+ }
+ }
+ } else if (Cond.getOpcode() == ISD::SETCC &&
+ cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
+ // For FCMP_UNE, we can emit
+ // two branches instead of an explicit AND instruction with a
+ // separate test. However, we only do this if this block doesn't
+ // have a fall-through edge, because this requires an explicit
+ // jmp when the condition is false.
+ if (Op.getNode()->hasOneUse()) {
+ SDNode *User = *Op.getNode()->use_begin();
+ // Look for an unconditional branch following this conditional branch.
+ // We need this because we need to reverse the successors in order
+ // to implement FCMP_UNE.
+ if (User->getOpcode() == ISD::BR) {
+ SDValue FalseBB = User->getOperand(1);
+ SDNode *NewBR =
+ DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
+ assert(NewBR == User);
+ (void)NewBR;
+
+ SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
+ Cond.getOperand(0), Cond.getOperand(1));
+ Cmp = ConvertCmpIfNecessary(Cmp, DAG);
+ CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
+ Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
+ Chain, Dest, CC, Cmp);
+ CC = DAG.getConstant(X86::COND_NP, dl, MVT::i8);
+ Cond = Cmp;
+ addTest = false;
+ Dest = FalseBB;
+ }
+ }
+ }
+ }
+
+ if (addTest) {
+    // Look past the truncate if the high bits are known zero.
+ if (isTruncWithZeroHighBitsInput(Cond, DAG))
+ Cond = Cond.getOperand(0);
+
+ // We know the result of AND is compared against zero. Try to match
+ // it to BT.
+ if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
+ if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG)) {
+ CC = NewSetCC.getOperand(0);
+ Cond = NewSetCC.getOperand(1);
+ addTest = false;
+ }
+ }
+ }
+
+ if (addTest) {
+ X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
+ CC = DAG.getConstant(X86Cond, dl, MVT::i8);
+ Cond = EmitTest(Cond, X86Cond, dl, DAG);
+ }
+ Cond = ConvertCmpIfNecessary(Cond, DAG);
+ return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
+ Chain, Dest, CC, Cond);
+}
+
+// Lower dynamic stack allocation to an _alloca call for Cygwin/MinGW targets.
+// Calls to _alloca are needed to probe the stack when allocating more than 4K
+// bytes in one go. Touching the stack at 4K increments is necessary to ensure
+// that the guard pages used by the OS virtual memory manager are allocated in
+// the correct sequence.
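+// For example, with a dynamic size on Win64 this roughly becomes: copy the
+// size into RAX, call the stack probe function (via X86ISD::WIN_ALLOCA), then
+// read the adjusted RSP back as the result.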
+SDValue
+X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
+ SelectionDAG &DAG) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ bool SplitStack = MF.shouldSplitStack();
+ bool Lower = (Subtarget->isOSWindows() && !Subtarget->isTargetMachO()) ||
+ SplitStack;
+ SDLoc dl(Op);
+
+ // Get the inputs.
+ SDNode *Node = Op.getNode();
+ SDValue Chain = Op.getOperand(0);
+ SDValue Size = Op.getOperand(1);
+ unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
+ EVT VT = Node->getValueType(0);
+
+ // Chain the dynamic stack allocation so that it doesn't modify the stack
+ // pointer when other instructions are using the stack.
+ Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, dl, true), dl);
+
+ bool Is64Bit = Subtarget->is64Bit();
+ MVT SPTy = getPointerTy(DAG.getDataLayout());
+
+ SDValue Result;
+ if (!Lower) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
+ assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
+ " not tell us which reg is the stack pointer!");
+ EVT VT = Node->getValueType(0);
+ SDValue Tmp3 = Node->getOperand(2);
+
+ SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
+ Chain = SP.getValue(1);
+ unsigned Align = cast<ConstantSDNode>(Tmp3)->getZExtValue();
+ const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
+ unsigned StackAlign = TFI.getStackAlignment();
+ Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
+ if (Align > StackAlign)
+ Result = DAG.getNode(ISD::AND, dl, VT, Result,
+ DAG.getConstant(-(uint64_t)Align, dl, VT));
+ Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
+ } else if (SplitStack) {
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ if (Is64Bit) {
+      // The 64-bit implementation of segmented stacks needs to clobber both
+      // r10 and r11. This makes it impossible to use segmented stacks together
+      // with nested parameters.
+ const Function *F = MF.getFunction();
+
+ for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
+ I != E; ++I)
+ if (I->hasNestAttr())
+ report_fatal_error("Cannot use segmented stacks with functions that "
+ "have nested arguments.");
+ }
+
+ const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
+ unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
+ Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
+ Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
+ DAG.getRegister(Vreg, SPTy));
+ } else {
+ SDValue Flag;
+ const unsigned Reg = (Subtarget->isTarget64BitLP64() ? X86::RAX : X86::EAX);
+
+ Chain = DAG.getCopyToReg(Chain, dl, Reg, Size, Flag);
+ Flag = Chain.getValue(1);
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+
+ Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag);
+
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
+ unsigned SPReg = RegInfo->getStackRegister();
+ SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
+ Chain = SP.getValue(1);
+
+ if (Align) {
+ SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
+ DAG.getConstant(-(uint64_t)Align, dl, VT));
+ Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
+ }
+
+ Result = SP;
+ }
+
+ Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
+ DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
+
+ SDValue Ops[2] = {Result, Chain};
+ return DAG.getMergeValues(Ops, dl);
+}
+
+SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ auto PtrVT = getPointerTy(MF.getDataLayout());
+ X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+
+ const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+ SDLoc DL(Op);
+
+ if (!Subtarget->is64Bit() ||
+ Subtarget->isCallingConvWin64(MF.getFunction()->getCallingConv())) {
+ // vastart just stores the address of the VarArgsFrameIndex slot into the
+ // memory location argument.
+ SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
+ return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
+ MachinePointerInfo(SV), false, false, 0);
+ }
+
+  // __va_list_tag:
+  //   gp_offset         (ranges over 0 .. 6 * 8, one 8-byte slot per GPR)
+  //   fp_offset         (ranges over 48 .. 48 + 8 * 16, one 16-byte slot per XMM)
+  //   overflow_arg_area (points to parameters coming in memory)
+  //   reg_save_area
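+  //
+  // In C terms, the SysV x86-64 va_list element is roughly:
+  //   typedef struct {
+  //     unsigned int gp_offset;
+  //     unsigned int fp_offset;
+  //     void *overflow_arg_area;
+  //     void *reg_save_area;
+  //   } __va_list_tag;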
+ SmallVector<SDValue, 8> MemOps;
+ SDValue FIN = Op.getOperand(1);
+ // Store gp_offset
+ SDValue Store = DAG.getStore(Op.getOperand(0), DL,
+ DAG.getConstant(FuncInfo->getVarArgsGPOffset(),
+ DL, MVT::i32),
+ FIN, MachinePointerInfo(SV), false, false, 0);
+ MemOps.push_back(Store);
+
+ // Store fp_offset
+ FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
+ Store = DAG.getStore(Op.getOperand(0), DL,
+ DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL,
+ MVT::i32),
+ FIN, MachinePointerInfo(SV, 4), false, false, 0);
+ MemOps.push_back(Store);
+
+ // Store ptr to overflow_arg_area
+ FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
+ SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
+ Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN,
+ MachinePointerInfo(SV, 8),
+ false, false, 0);
+ MemOps.push_back(Store);
+
+ // Store ptr to reg_save_area.
+ FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
+ Subtarget->isTarget64BitLP64() ? 8 : 4, DL));
+ SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
+ Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN, MachinePointerInfo(
+ SV, Subtarget->isTarget64BitLP64() ? 16 : 12), false, false, 0);
+ MemOps.push_back(Store);
+ return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
+}
+
+SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
+ assert(Subtarget->is64Bit() &&
+ "LowerVAARG only handles 64-bit va_arg!");
+ assert(Op.getNode()->getNumOperands() == 4);
+
+ MachineFunction &MF = DAG.getMachineFunction();
+ if (Subtarget->isCallingConvWin64(MF.getFunction()->getCallingConv()))
+ // The Win64 ABI uses char* instead of a structure.
+ return DAG.expandVAArg(Op.getNode());
+
+ SDValue Chain = Op.getOperand(0);
+ SDValue SrcPtr = Op.getOperand(1);
+ const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+ unsigned Align = Op.getConstantOperandVal(3);
+ SDLoc dl(Op);
+
+ EVT ArgVT = Op.getNode()->getValueType(0);
+ Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
+ uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
+ uint8_t ArgMode;
+
+ // Decide which area this value should be read from.
+ // TODO: Implement the AMD64 ABI in its entirety. This simple
+ // selection mechanism works only for the basic types.
+ if (ArgVT == MVT::f80) {
+ llvm_unreachable("va_arg for f80 not yet implemented");
+ } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
+ ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
+ } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
+ ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
+ } else {
+ llvm_unreachable("Unhandled argument type in LowerVAARG");
+ }
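+
+  // For reference, the gp_offset path that the VAARG_64 pseudo implements is
+  // roughly the ABI's algorithm (a sketch):
+  //   if (ap->gp_offset < 48) {
+  //     addr = ap->reg_save_area + ap->gp_offset;
+  //     ap->gp_offset += 8;
+  //   } else {
+  //     addr = ap->overflow_arg_area;
+  //     ap->overflow_arg_area += 8;
+  //   }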
+
+ if (ArgMode == 2) {
+ // Sanity Check: Make sure using fp_offset makes sense.
+ assert(!Subtarget->useSoftFloat() &&
+ !(MF.getFunction()->hasFnAttribute(Attribute::NoImplicitFloat)) &&
+ Subtarget->hasSSE1());
+ }
+
+  // Insert a VAARG_64 node into the DAG.
+  // VAARG_64 returns two values: the variable argument address and the chain.
+ SDValue InstOps[] = {Chain, SrcPtr, DAG.getConstant(ArgSize, dl, MVT::i32),
+ DAG.getConstant(ArgMode, dl, MVT::i8),
+ DAG.getConstant(Align, dl, MVT::i32)};
+ SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
+ SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl,
+ VTs, InstOps, MVT::i64,
+ MachinePointerInfo(SV),
+ /*Align=*/0,
+ /*Volatile=*/false,
+ /*ReadMem=*/true,
+ /*WriteMem=*/true);
+ Chain = VAARG.getValue(1);
+
+ // Load the next argument and return it
+ return DAG.getLoad(ArgVT, dl,
+ Chain,
+ VAARG,
+ MachinePointerInfo(),
+ false, false, false, 0);
+}
+
+static SDValue LowerVACOPY(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
+ // where a va_list is still an i8*.
+ assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!");
+ if (Subtarget->isCallingConvWin64(
+ DAG.getMachineFunction().getFunction()->getCallingConv()))
+ // Probably a Win64 va_copy.
+ return DAG.expandVACopy(Op.getNode());
+
+ SDValue Chain = Op.getOperand(0);
+ SDValue DstPtr = Op.getOperand(1);
+ SDValue SrcPtr = Op.getOperand(2);
+ const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
+ const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
+ SDLoc DL(Op);
+
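+  // The copied size below is sizeof(__va_list_tag), i.e. 24 bytes on LP64:
+  // two i32 offsets plus two 8-byte pointers.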
+ return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
+ DAG.getIntPtrConstant(24, DL), 8, /*isVolatile*/false,
+ false, false,
+ MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
+}
+
+// getTargetVShiftByConstNode - Handle vector element shifts where the shift
+// amount is a constant. Takes the immediate version of the shift opcode as
+// input.
+static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, MVT VT,
+ SDValue SrcOp, uint64_t ShiftAmt,
+ SelectionDAG &DAG) {
+ MVT ElementType = VT.getVectorElementType();
+
+ // Fold this packed shift into its first operand if ShiftAmt is 0.
+ if (ShiftAmt == 0)
+ return SrcOp;
+
+ // Check for ShiftAmt >= element width
+ if (ShiftAmt >= ElementType.getSizeInBits()) {
+ if (Opc == X86ISD::VSRAI)
+ ShiftAmt = ElementType.getSizeInBits() - 1;
+ else
+ return DAG.getConstant(0, dl, VT);
+ }
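+  // e.g. shifting an i32 lane right by 32 or more yields 0 for logical
+  // shifts, while the arithmetic shift amount is clamped to 31 so every lane
+  // becomes its replicated sign bit.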
+
+  assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI ||
+          Opc == X86ISD::VSRAI) &&
+         "Unknown target vector shift-by-constant node");
+
+  // Fold this packed vector shift into a build vector if SrcOp is a
+  // vector of Constants or UNDEFs and SrcOp's value type is the same as VT.
+ if (VT == SrcOp.getSimpleValueType() &&
+ ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
+ SmallVector<SDValue, 8> Elts;
+ unsigned NumElts = SrcOp->getNumOperands();
+ ConstantSDNode *ND;
+
+    switch (Opc) {
+    default: llvm_unreachable("Unknown target vector shift-by-constant node");
+ case X86ISD::VSHLI:
+ for (unsigned i=0; i!=NumElts; ++i) {
+ SDValue CurrentOp = SrcOp->getOperand(i);
+ if (CurrentOp->getOpcode() == ISD::UNDEF) {
+ Elts.push_back(CurrentOp);
+ continue;
+ }
+ ND = cast<ConstantSDNode>(CurrentOp);
+ const APInt &C = ND->getAPIntValue();
+ Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType));
+ }
+ break;
+ case X86ISD::VSRLI:
+ for (unsigned i=0; i!=NumElts; ++i) {
+ SDValue CurrentOp = SrcOp->getOperand(i);
+ if (CurrentOp->getOpcode() == ISD::UNDEF) {
+ Elts.push_back(CurrentOp);
+ continue;
+ }
+ ND = cast<ConstantSDNode>(CurrentOp);
+ const APInt &C = ND->getAPIntValue();
+ Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType));
+ }
+ break;
+ case X86ISD::VSRAI:
+ for (unsigned i=0; i!=NumElts; ++i) {
+ SDValue CurrentOp = SrcOp->getOperand(i);
+ if (CurrentOp->getOpcode() == ISD::UNDEF) {
+ Elts.push_back(CurrentOp);
+ continue;
+ }
+ ND = cast<ConstantSDNode>(CurrentOp);
+ const APInt &C = ND->getAPIntValue();
+ Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType));
+ }
+ break;
+ }
+
+ return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Elts);
+ }
+
+ return DAG.getNode(Opc, dl, VT, SrcOp,
+ DAG.getConstant(ShiftAmt, dl, MVT::i8));
+}
+
+// getTargetVShiftNode - Handle vector element shifts where the shift amount
+// may or may not be a constant. Takes the immediate version of the shift
+// opcode as input.
+static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT,
+ SDValue SrcOp, SDValue ShAmt,
+ SelectionDAG &DAG) {
+ MVT SVT = ShAmt.getSimpleValueType();
+ assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");
+
+ // Catch shift-by-constant.
+ if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
+ return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
+ CShAmt->getZExtValue(), DAG);
+
+ // Change opcode to non-immediate version
+ switch (Opc) {
+ default: llvm_unreachable("Unknown target vector shift node");
+ case X86ISD::VSHLI: Opc = X86ISD::VSHL; break;
+ case X86ISD::VSRLI: Opc = X86ISD::VSRL; break;
+ case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;
+ }
+
+ const X86Subtarget &Subtarget =
+ static_cast<const X86Subtarget &>(DAG.getSubtarget());
+ if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
+ ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) {
+ // Let the shuffle legalizer expand this shift amount node.
+ SDValue Op0 = ShAmt.getOperand(0);
+ Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(Op0), MVT::v8i16, Op0);
+ ShAmt = getShuffleVectorZeroOrUndef(Op0, 0, true, &Subtarget, DAG);
+ } else {
+    // Need to build a vector containing the shift amount.
+    // SSE/AVX packed shifts only use the lower 64 bits of the shift count.
+ SmallVector<SDValue, 4> ShOps;
+ ShOps.push_back(ShAmt);
+ if (SVT == MVT::i32) {
+ ShOps.push_back(DAG.getConstant(0, dl, SVT));
+ ShOps.push_back(DAG.getUNDEF(SVT));
+ }
+ ShOps.push_back(DAG.getUNDEF(SVT));
+
+ MVT BVT = SVT == MVT::i32 ? MVT::v4i32 : MVT::v2i64;
+ ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, BVT, ShOps);
+ }
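+  // e.g. for an i32 amount this builds <ShAmt, 0, undef, undef> as a v4i32,
+  // so the low 64 bits seen by the instruction are the zero-extended amount.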
+
+ // The return type has to be a 128-bit type with the same element
+ // type as the input type.
+ MVT EltVT = VT.getVectorElementType();
+ MVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());
+
+ ShAmt = DAG.getBitcast(ShVT, ShAmt);
+ return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
+}
+
+/// \brief Return \p Mask with the necessary casting or extending
+/// according to \p MaskVT when lowering masking intrinsics.
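+/// e.g. (a sketch) an i8 mask used with a v4i1 MaskVT is bitcast to v8i1 and
+/// the low 4 lanes are extracted with EXTRACT_SUBVECTOR.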
+static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG, SDLoc dl) {
+
+ if (MaskVT.bitsGT(Mask.getSimpleValueType())) {
+ // Mask should be extended
+ Mask = DAG.getNode(ISD::ANY_EXTEND, dl,
+ MVT::getIntegerVT(MaskVT.getSizeInBits()), Mask);
+ }
+
+ if (Mask.getSimpleValueType() == MVT::i64 && Subtarget->is32Bit()) {
+ if (MaskVT == MVT::v64i1) {
+ assert(Subtarget->hasBWI() && "Expected AVX512BW target!");
+      // In 32-bit mode a bitcast of i64 is illegal; split the mask instead.
+ SDValue Lo, Hi;
+ Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
+ DAG.getConstant(0, dl, MVT::i32));
+ Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
+ DAG.getConstant(1, dl, MVT::i32));
+
+ Lo = DAG.getBitcast(MVT::v32i1, Lo);
+ Hi = DAG.getBitcast(MVT::v32i1, Hi);
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
+ } else {
+      // MaskVT requires fewer than 64 bits. Truncate the mask (this should
+      // always succeed), then bitcast.
+ MVT TruncVT = MVT::getIntegerVT(MaskVT.getSizeInBits());
+ return DAG.getBitcast(MaskVT,
+ DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Mask));
+ }
+
+ } else {
+ MVT BitcastVT = MVT::getVectorVT(MVT::i1,
+ Mask.getSimpleValueType().getSizeInBits());
+    // When MaskVT is v2i1 or v4i1, the low 2 or 4 elements are
+    // extracted by EXTRACT_SUBVECTOR.
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
+ DAG.getBitcast(BitcastVT, Mask),
+ DAG.getIntPtrConstant(0, dl));
+ }
+}
+
+/// \brief Return (and \p Op, \p Mask) for compare instructions or
+/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
+/// necessary casting or extending for \p Mask when lowering masking intrinsics
+static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
+ SDValue PreservedSrc,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ MVT VT = Op.getSimpleValueType();
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
+ unsigned OpcodeSelect = ISD::VSELECT;
+ SDLoc dl(Op);
+
+ if (isAllOnesConstant(Mask))
+ return Op;
+
+ SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
+
+ switch (Op.getOpcode()) {
+ default: break;
+ case X86ISD::PCMPEQM:
+ case X86ISD::PCMPGTM:
+ case X86ISD::CMPM:
+ case X86ISD::CMPMU:
+ return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
+ case X86ISD::VFPCLASS:
+ case X86ISD::VFPCLASSS:
+ return DAG.getNode(ISD::OR, dl, VT, Op, VMask);
+ case X86ISD::VTRUNC:
+ case X86ISD::VTRUNCS:
+ case X86ISD::VTRUNCUS:
+      // We can't use ISD::VSELECT here because it is not always "Legal"
+      // for the destination type. For example, vpmovqb requires only AVX512,
+      // while a vselect that operates on byte elements requires BWI.
+ OpcodeSelect = X86ISD::SELECT;
+ break;
+ }
+ if (PreservedSrc.getOpcode() == ISD::UNDEF)
+ PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
+ return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
+}
+
+/// \brief Creates an SDNode for a predicated scalar operation.
+/// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
+/// The mask arrives as MVT::i8 and should be truncated to MVT::i1 while
+/// lowering masking intrinsics.
+/// The main difference between ScalarMaskingNode and VectorMaskingNode is the
+/// use of "X86select" instead of "vselect": we just can't create a "vselect"
+/// node for a scalar instruction.
+static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
+ SDValue PreservedSrc,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ if (isAllOnesConstant(Mask))
+ return Op;
+
+ MVT VT = Op.getSimpleValueType();
+ SDLoc dl(Op);
+ // The mask should be of type MVT::i1
+ SDValue IMask = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Mask);
+
+ if (Op.getOpcode() == X86ISD::FSETCC)
+ return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
+ if (Op.getOpcode() == X86ISD::VFPCLASS ||
+ Op.getOpcode() == X86ISD::VFPCLASSS)
+ return DAG.getNode(ISD::OR, dl, VT, Op, IMask);
+
+ if (PreservedSrc.getOpcode() == ISD::UNDEF)
+ PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
+ return DAG.getNode(X86ISD::SELECT, dl, VT, IMask, Op, PreservedSrc);
+}
+
+static int getSEHRegistrationNodeSize(const Function *Fn) {
+ if (!Fn->hasPersonalityFn())
+ report_fatal_error(
+ "querying registration node size for function without personality");
+ // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
+ // WinEHStatePass for the full struct definition.
+ switch (classifyEHPersonality(Fn->getPersonalityFn())) {
+ case EHPersonality::MSVC_X86SEH: return 24;
+ case EHPersonality::MSVC_CXX: return 16;
+ default: break;
+ }
+ report_fatal_error(
+ "can only recover FP for 32-bit MSVC EH personality functions");
+}
+
+/// When the MSVC runtime transfers control to us, either to an outlined
+/// function or when returning to a parent frame after catching an exception, we
+/// recover the parent frame pointer by doing arithmetic on the incoming EBP.
+/// Here's the math:
+/// RegNodeBase = EntryEBP - RegNodeSize
+/// ParentFP = RegNodeBase - ParentFrameOffset
+/// Subtracting RegNodeSize takes us to the offset of the registration node, and
+/// subtracting the offset (negative on x86) takes us back to the parent FP.
+static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
+ SDValue EntryEBP) {
+ MachineFunction &MF = DAG.getMachineFunction();
+ SDLoc dl;
+
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
+
+ // It's possible that the parent function no longer has a personality function
+ // if the exceptional code was optimized away, in which case we just return
+ // the incoming EBP.
+ if (!Fn->hasPersonalityFn())
+ return EntryEBP;
+
+ // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
+ // registration, or the .set_setframe offset.
+ MCSymbol *OffsetSym =
+ MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
+ GlobalValue::getRealLinkageName(Fn->getName()));
+ SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
+ SDValue ParentFrameOffset =
+ DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
+
+ // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
+ // prologue to RBP in the parent function.
+ const X86Subtarget &Subtarget =
+ static_cast<const X86Subtarget &>(DAG.getSubtarget());
+ if (Subtarget.is64Bit())
+ return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
+
+ int RegNodeSize = getSEHRegistrationNodeSize(Fn);
+ // RegNodeBase = EntryEBP - RegNodeSize
+ // ParentFP = RegNodeBase - ParentFrameOffset
+ SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
+ DAG.getConstant(RegNodeSize, dl, PtrVT));
+ return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
+}
+
+static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ SDLoc dl(Op);
+ unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ MVT VT = Op.getSimpleValueType();
+ const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
+ if (IntrData) {
+ switch(IntrData->Type) {
+ case INTR_TYPE_1OP:
+ return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));
+ case INTR_TYPE_2OP:
+ return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
+ Op.getOperand(2));
+ case INTR_TYPE_2OP_IMM8:
+ return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
+ DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(2)));
+ case INTR_TYPE_3OP:
+ return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
+ Op.getOperand(2), Op.getOperand(3));
+ case INTR_TYPE_4OP:
+ return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
+ Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
+ case INTR_TYPE_1OP_MASK_RM: {
+ SDValue Src = Op.getOperand(1);
+ SDValue PassThru = Op.getOperand(2);
+ SDValue Mask = Op.getOperand(3);
+ SDValue RoundingMode;
+      // We always add a rounding mode to the node.
+      // If the rounding mode is not specified, we add the
+      // "current direction" mode.
+ if (Op.getNumOperands() == 4)
+ RoundingMode =
+ DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
+ else
+ RoundingMode = Op.getOperand(4);
+ unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
+ if (IntrWithRoundingModeOpcode != 0)
+ if (cast<ConstantSDNode>(RoundingMode)->getZExtValue() !=
+ X86::STATIC_ROUNDING::CUR_DIRECTION)
+ return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
+ dl, Op.getValueType(), Src, RoundingMode),
+ Mask, PassThru, Subtarget, DAG);
+ return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
+ RoundingMode),
+ Mask, PassThru, Subtarget, DAG);
+ }
+ case INTR_TYPE_1OP_MASK: {
+ SDValue Src = Op.getOperand(1);
+ SDValue PassThru = Op.getOperand(2);
+ SDValue Mask = Op.getOperand(3);
+      // We add a rounding mode to the node when:
+      // - an RM opcode is specified, and
+      // - RM is not "current direction".
+ unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
+ if (IntrWithRoundingModeOpcode != 0) {
+ SDValue Rnd = Op.getOperand(4);
+ unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
+ if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) {
+ return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
+ dl, Op.getValueType(),
+ Src, Rnd),
+ Mask, PassThru, Subtarget, DAG);
+ }
+ }
+ return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
+ Mask, PassThru, Subtarget, DAG);
+ }
+ case INTR_TYPE_SCALAR_MASK: {
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Src2 = Op.getOperand(2);
+ SDValue passThru = Op.getOperand(3);
+ SDValue Mask = Op.getOperand(4);
+ return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2),
+ Mask, passThru, Subtarget, DAG);
+ }
+ case INTR_TYPE_SCALAR_MASK_RM: {
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Src2 = Op.getOperand(2);
+ SDValue Src0 = Op.getOperand(3);
+ SDValue Mask = Op.getOperand(4);
+      // There are two kinds of intrinsics in this group:
+      // (1) with suppress-all-exceptions (sae) or a rounding mode: 6 operands;
+      // (2) with both a rounding mode and sae: 7 operands.
+ if (Op.getNumOperands() == 6) {
+ SDValue Sae = Op.getOperand(5);
+ unsigned Opc = IntrData->Opc1 ? IntrData->Opc1 : IntrData->Opc0;
+ return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2,
+ Sae),
+ Mask, Src0, Subtarget, DAG);
+ }
+ assert(Op.getNumOperands() == 7 && "Unexpected intrinsic form");
+ SDValue RoundingMode = Op.getOperand(5);
+ SDValue Sae = Op.getOperand(6);
+ return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
+ RoundingMode, Sae),
+ Mask, Src0, Subtarget, DAG);
+ }
+ case INTR_TYPE_2OP_MASK:
+ case INTR_TYPE_2OP_IMM8_MASK: {
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Src2 = Op.getOperand(2);
+ SDValue PassThru = Op.getOperand(3);
+ SDValue Mask = Op.getOperand(4);
+
+ if (IntrData->Type == INTR_TYPE_2OP_IMM8_MASK)
+ Src2 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src2);
+
+ // We specify 2 possible opcodes for intrinsics with rounding modes.
+ // First, we check if the intrinsic may have non-default rounding mode,
+ // (IntrData->Opc1 != 0), then we check the rounding mode operand.
+ unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
+ if (IntrWithRoundingModeOpcode != 0) {
+ SDValue Rnd = Op.getOperand(5);
+ unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
+ if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) {
+ return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
+ dl, Op.getValueType(),
+ Src1, Src2, Rnd),
+ Mask, PassThru, Subtarget, DAG);
+ }
+ }
+ // TODO: Intrinsics should have fast-math-flags to propagate.
+ return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src1,Src2),
+ Mask, PassThru, Subtarget, DAG);
+ }
+ case INTR_TYPE_2OP_MASK_RM: {
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Src2 = Op.getOperand(2);
+ SDValue PassThru = Op.getOperand(3);
+ SDValue Mask = Op.getOperand(4);
+      // We specify two possible modes for intrinsics: with and without a
+      // rounding mode.
+      // First, we check if the intrinsic has a rounding mode (6 operands);
+      // if not, we set the rounding mode to "current".
+ SDValue Rnd;
+ if (Op.getNumOperands() == 6)
+ Rnd = Op.getOperand(5);
+ else
+ Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
+ return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
+ Src1, Src2, Rnd),
+ Mask, PassThru, Subtarget, DAG);
+ }
+ case INTR_TYPE_3OP_SCALAR_MASK_RM: {
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Src2 = Op.getOperand(2);
+ SDValue Src3 = Op.getOperand(3);
+ SDValue PassThru = Op.getOperand(4);
+ SDValue Mask = Op.getOperand(5);
+ SDValue Sae = Op.getOperand(6);
+
+ return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
+ Src2, Src3, Sae),
+ Mask, PassThru, Subtarget, DAG);
+ }
+ case INTR_TYPE_3OP_MASK_RM: {
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Src2 = Op.getOperand(2);
+ SDValue Imm = Op.getOperand(3);
+ SDValue PassThru = Op.getOperand(4);
+ SDValue Mask = Op.getOperand(5);
+      // We specify two possible modes for intrinsics: with and without a
+      // rounding mode.
+      // First, we check if the intrinsic has a rounding mode (7 operands);
+      // if not, we set the rounding mode to "current".
+ SDValue Rnd;
+ if (Op.getNumOperands() == 7)
+ Rnd = Op.getOperand(6);
+ else
+ Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
+ return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
+ Src1, Src2, Imm, Rnd),
+ Mask, PassThru, Subtarget, DAG);
+ }
+ case INTR_TYPE_3OP_IMM8_MASK:
+ case INTR_TYPE_3OP_MASK:
+ case INSERT_SUBVEC: {
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Src2 = Op.getOperand(2);
+ SDValue Src3 = Op.getOperand(3);
+ SDValue PassThru = Op.getOperand(4);
+ SDValue Mask = Op.getOperand(5);
+
+ if (IntrData->Type == INTR_TYPE_3OP_IMM8_MASK)
+ Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3);
+ else if (IntrData->Type == INSERT_SUBVEC) {
+ // imm should be adapted to ISD::INSERT_SUBVECTOR behavior
+ assert(isa<ConstantSDNode>(Src3) && "Expected a ConstantSDNode here!");
+ unsigned Imm = cast<ConstantSDNode>(Src3)->getZExtValue();
+ Imm *= Src2.getSimpleValueType().getVectorNumElements();
+ Src3 = DAG.getTargetConstant(Imm, dl, MVT::i32);
+ }
+
+ // We specify 2 possible opcodes for intrinsics with rounding modes.
+ // First, we check if the intrinsic may have non-default rounding mode,
+ // (IntrData->Opc1 != 0), then we check the rounding mode operand.
+ unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
+ if (IntrWithRoundingModeOpcode != 0) {
+ SDValue Rnd = Op.getOperand(6);
+ unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
+ if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) {
+ return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
+ dl, Op.getValueType(),
+ Src1, Src2, Src3, Rnd),
+ Mask, PassThru, Subtarget, DAG);
+ }
+ }
+ return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
+ Src1, Src2, Src3),
+ Mask, PassThru, Subtarget, DAG);
+ }
+ case VPERM_3OP_MASKZ:
+ case VPERM_3OP_MASK:{
+ // Src2 is the PassThru
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Src2 = Op.getOperand(2);
+ SDValue Src3 = Op.getOperand(3);
+ SDValue Mask = Op.getOperand(4);
+ MVT VT = Op.getSimpleValueType();
+ SDValue PassThru = SDValue();
+
+ // set PassThru element
+ if (IntrData->Type == VPERM_3OP_MASKZ)
+ PassThru = getZeroVector(VT, Subtarget, DAG, dl);
+ else
+ PassThru = DAG.getBitcast(VT, Src2);
+
+ // Swap Src1 and Src2 in the node creation
+ return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
+ dl, Op.getValueType(),
+ Src2, Src1, Src3),
+ Mask, PassThru, Subtarget, DAG);
+ }
+ case FMA_OP_MASK3:
+ case FMA_OP_MASKZ:
+ case FMA_OP_MASK: {
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Src2 = Op.getOperand(2);
+ SDValue Src3 = Op.getOperand(3);
+ SDValue Mask = Op.getOperand(4);
+ MVT VT = Op.getSimpleValueType();
+ SDValue PassThru = SDValue();
+
+ // set PassThru element
+ if (IntrData->Type == FMA_OP_MASKZ)
+ PassThru = getZeroVector(VT, Subtarget, DAG, dl);
+ else if (IntrData->Type == FMA_OP_MASK3)
+ PassThru = Src3;
+ else
+ PassThru = Src1;
+
+ // We specify 2 possible opcodes for intrinsics with rounding modes.
+ // First, we check if the intrinsic may have non-default rounding mode,
+ // (IntrData->Opc1 != 0), then we check the rounding mode operand.
+ unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
+ if (IntrWithRoundingModeOpcode != 0) {
+ SDValue Rnd = Op.getOperand(5);
+ if (cast<ConstantSDNode>(Rnd)->getZExtValue() !=
+ X86::STATIC_ROUNDING::CUR_DIRECTION)
+ return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
+ dl, Op.getValueType(),
+ Src1, Src2, Src3, Rnd),
+ Mask, PassThru, Subtarget, DAG);
+ }
+ return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
+ dl, Op.getValueType(),
+ Src1, Src2, Src3),
+ Mask, PassThru, Subtarget, DAG);
+ }
+ case TERLOG_OP_MASK:
+ case TERLOG_OP_MASKZ: {
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Src2 = Op.getOperand(2);
+ SDValue Src3 = Op.getOperand(3);
+ SDValue Src4 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(4));
+ SDValue Mask = Op.getOperand(5);
+ MVT VT = Op.getSimpleValueType();
+ SDValue PassThru = Src1;
+ // Set PassThru element.
+ if (IntrData->Type == TERLOG_OP_MASKZ)
+ PassThru = getZeroVector(VT, Subtarget, DAG, dl);
+
+ return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
+ Src1, Src2, Src3, Src4),
+ Mask, PassThru, Subtarget, DAG);
+ }
+ case FPCLASS: {
+ // FPclass intrinsics with mask
+ SDValue Src1 = Op.getOperand(1);
+ MVT VT = Src1.getSimpleValueType();
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
+ SDValue Imm = Op.getOperand(2);
+ SDValue Mask = Op.getOperand(3);
+ MVT BitcastVT = MVT::getVectorVT(MVT::i1,
+ Mask.getSimpleValueType().getSizeInBits());
+ SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Imm);
+ SDValue FPclassMask = getVectorMaskingNode(FPclass, Mask,
+ DAG.getTargetConstant(0, dl, MaskVT),
+ Subtarget, DAG);
+ SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
+ DAG.getUNDEF(BitcastVT), FPclassMask,
+ DAG.getIntPtrConstant(0, dl));
+ return DAG.getBitcast(Op.getValueType(), Res);
+ }
+ case FPCLASSS: {
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Imm = Op.getOperand(2);
+ SDValue Mask = Op.getOperand(3);
+ SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::i1, Src1, Imm);
+ SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask,
+ DAG.getTargetConstant(0, dl, MVT::i1), Subtarget, DAG);
+ return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i8, FPclassMask);
+ }
+ case CMP_MASK:
+ case CMP_MASK_CC: {
+ // Comparison intrinsics with masks.
+ // Example of transformation:
+ // (i8 (int_x86_avx512_mask_pcmpeq_q_128
+ // (v2i64 %a), (v2i64 %b), (i8 %mask))) ->
+ // (i8 (bitcast
+ // (v8i1 (insert_subvector undef,
+ // (v2i1 (and (PCMPEQM %a, %b),
+ // (extract_subvector
+ // (v8i1 (bitcast %mask)), 0))), 0))))
+ MVT VT = Op.getOperand(1).getSimpleValueType();
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
+ SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3);
+ MVT BitcastVT = MVT::getVectorVT(MVT::i1,
+ Mask.getSimpleValueType().getSizeInBits());
+ SDValue Cmp;
+ if (IntrData->Type == CMP_MASK_CC) {
+ SDValue CC = Op.getOperand(3);
+ CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC);
+ // We specify 2 possible opcodes for intrinsics with rounding modes.
+ // First, we check if the intrinsic may have non-default rounding mode,
+ // (IntrData->Opc1 != 0), then we check the rounding mode operand.
+ if (IntrData->Opc1 != 0) {
+ SDValue Rnd = Op.getOperand(5);
+ if (cast<ConstantSDNode>(Rnd)->getZExtValue() !=
+ X86::STATIC_ROUNDING::CUR_DIRECTION)
+ Cmp = DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
+ Op.getOperand(2), CC, Rnd);
+ }
+      // Default rounding mode.
+      if (!Cmp.getNode())
+ Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
+ Op.getOperand(2), CC);
+
+ } else {
+ assert(IntrData->Type == CMP_MASK && "Unexpected intrinsic type!");
+ Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
+ Op.getOperand(2));
+ }
+ SDValue CmpMask = getVectorMaskingNode(Cmp, Mask,
+ DAG.getTargetConstant(0, dl,
+ MaskVT),
+ Subtarget, DAG);
+ SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
+ DAG.getUNDEF(BitcastVT), CmpMask,
+ DAG.getIntPtrConstant(0, dl));
+ return DAG.getBitcast(Op.getValueType(), Res);
+ }
+ case CMP_MASK_SCALAR_CC: {
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Src2 = Op.getOperand(2);
+ SDValue CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(3));
+ SDValue Mask = Op.getOperand(4);
+
+ SDValue Cmp;
+ if (IntrData->Opc1 != 0) {
+ SDValue Rnd = Op.getOperand(5);
+ if (cast<ConstantSDNode>(Rnd)->getZExtValue() !=
+ X86::STATIC_ROUNDING::CUR_DIRECTION)
+ Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::i1, Src1, Src2, CC, Rnd);
+ }
+    // Default rounding mode.
+    if (!Cmp.getNode())
+ Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::i1, Src1, Src2, CC);
+
+ SDValue CmpMask = getScalarMaskingNode(Cmp, Mask,
+ DAG.getTargetConstant(0, dl,
+ MVT::i1),
+ Subtarget, DAG);
+
+ return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i8,
+ DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, CmpMask),
+ DAG.getValueType(MVT::i1));
+ }
+ case COMI: { // Comparison intrinsics
+ ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
+ SDValue LHS = Op.getOperand(1);
+ SDValue RHS = Op.getOperand(2);
+ unsigned X86CC = TranslateX86CC(CC, dl, true, LHS, RHS, DAG);
+ assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!");
+ SDValue Cond = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
+ SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
+ DAG.getConstant(X86CC, dl, MVT::i8), Cond);
+ return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
+ }
+ case COMI_RM: { // Comparison intrinsics with Sae
+ SDValue LHS = Op.getOperand(1);
+ SDValue RHS = Op.getOperand(2);
+ SDValue CC = Op.getOperand(3);
+ SDValue Sae = Op.getOperand(4);
+ auto ComiType = TranslateX86ConstCondToX86CC(CC);
+ // choose between ordered and unordered (comi/ucomi)
+ unsigned comiOp = std::get<0>(ComiType) ? IntrData->Opc0 : IntrData->Opc1;
+ SDValue Cond;
+ if (cast<ConstantSDNode>(Sae)->getZExtValue() !=
+ X86::STATIC_ROUNDING::CUR_DIRECTION)
+ Cond = DAG.getNode(comiOp, dl, MVT::i32, LHS, RHS, Sae);
+ else
+ Cond = DAG.getNode(comiOp, dl, MVT::i32, LHS, RHS);
+ SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
+ DAG.getConstant(std::get<1>(ComiType), dl, MVT::i8), Cond);
+ return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
+ }
+ case VSHIFT:
+ return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
+ Op.getOperand(1), Op.getOperand(2), DAG);
+ case VSHIFT_MASK:
+ return getVectorMaskingNode(getTargetVShiftNode(IntrData->Opc0, dl,
+ Op.getSimpleValueType(),
+ Op.getOperand(1),
+ Op.getOperand(2), DAG),
+ Op.getOperand(4), Op.getOperand(3), Subtarget,
+ DAG);
+ case COMPRESS_EXPAND_IN_REG: {
+ SDValue Mask = Op.getOperand(3);
+ SDValue DataToCompress = Op.getOperand(1);
+ SDValue PassThru = Op.getOperand(2);
+ if (isAllOnesConstant(Mask)) // return data as is
+ return Op.getOperand(1);
+
+ return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
+ DataToCompress),
+ Mask, PassThru, Subtarget, DAG);
+ }
+ case BROADCASTM: {
+ SDValue Mask = Op.getOperand(1);
+ MVT MaskVT = MVT::getVectorVT(MVT::i1,
+ Mask.getSimpleValueType().getSizeInBits());
+ Mask = DAG.getBitcast(MaskVT, Mask);
+ return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Mask);
+ }
+ case BLEND: {
+ SDValue Mask = Op.getOperand(3);
+ MVT VT = Op.getSimpleValueType();
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
+ SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
+ return DAG.getNode(IntrData->Opc0, dl, VT, VMask, Op.getOperand(1),
+ Op.getOperand(2));
+ }
+ case KUNPCK: {
+ MVT VT = Op.getSimpleValueType();
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()/2);
+
+ SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl);
+ SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl);
+ // Arguments should be swapped.
+ SDValue Res = DAG.getNode(IntrData->Opc0, dl,
+ MVT::getVectorVT(MVT::i1, VT.getSizeInBits()),
+ Src2, Src1);
+ return DAG.getBitcast(VT, Res);
+ }
+ case CONVERT_TO_MASK: {
+ MVT SrcVT = Op.getOperand(1).getSimpleValueType();
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
+ MVT BitcastVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits());
+
+ SDValue CvtMask = DAG.getNode(IntrData->Opc0, dl, MaskVT,
+ Op.getOperand(1));
+ SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
+ DAG.getUNDEF(BitcastVT), CvtMask,
+ DAG.getIntPtrConstant(0, dl));
+ return DAG.getBitcast(Op.getValueType(), Res);
+ }
+ case CONVERT_MASK_TO_VEC: {
+ SDValue Mask = Op.getOperand(1);
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
+ SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
+ return DAG.getNode(IntrData->Opc0, dl, VT, VMask);
+ }
+ case BRCST_SUBVEC_TO_VEC: {
+ SDValue Src = Op.getOperand(1);
+ SDValue Passthru = Op.getOperand(2);
+ SDValue Mask = Op.getOperand(3);
+ EVT resVT = Passthru.getValueType();
+ SDValue subVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, resVT,
+ DAG.getUNDEF(resVT), Src,
+ DAG.getIntPtrConstant(0, dl));
+ SDValue immVal;
+ if (Src.getSimpleValueType().is256BitVector() && resVT.is512BitVector())
+ immVal = DAG.getConstant(0x44, dl, MVT::i8);
+ else
+ immVal = DAG.getConstant(0, dl, MVT::i8);
+ return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
+ subVec, subVec, immVal),
+ Mask, Passthru, Subtarget, DAG);
+ }
+ default:
+ break;
+ }
+ }
+
+ switch (IntNo) {
+ default: return SDValue(); // Don't custom lower most intrinsics.
+
+ case Intrinsic::x86_avx2_permd:
+ case Intrinsic::x86_avx2_permps:
+ // Operands intentionally swapped. Mask is last operand to intrinsic,
+ // but second operand for node/instruction.
+ return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),
+ Op.getOperand(2), Op.getOperand(1));
+
+  // ptest and testp intrinsics. The intrinsics these come from are designed to
+  // return an integer value, not just an instruction, so lower them to the
+  // ptest or testp pattern and a setcc for the result.
+ case Intrinsic::x86_sse41_ptestz:
+ case Intrinsic::x86_sse41_ptestc:
+ case Intrinsic::x86_sse41_ptestnzc:
+ case Intrinsic::x86_avx_ptestz_256:
+ case Intrinsic::x86_avx_ptestc_256:
+ case Intrinsic::x86_avx_ptestnzc_256:
+ case Intrinsic::x86_avx_vtestz_ps:
+ case Intrinsic::x86_avx_vtestc_ps:
+ case Intrinsic::x86_avx_vtestnzc_ps:
+ case Intrinsic::x86_avx_vtestz_pd:
+ case Intrinsic::x86_avx_vtestc_pd:
+ case Intrinsic::x86_avx_vtestnzc_pd:
+ case Intrinsic::x86_avx_vtestz_ps_256:
+ case Intrinsic::x86_avx_vtestc_ps_256:
+ case Intrinsic::x86_avx_vtestnzc_ps_256:
+ case Intrinsic::x86_avx_vtestz_pd_256:
+ case Intrinsic::x86_avx_vtestc_pd_256:
+ case Intrinsic::x86_avx_vtestnzc_pd_256: {
+ bool IsTestPacked = false;
+ unsigned X86CC;
+ switch (IntNo) {
+ default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
+ case Intrinsic::x86_avx_vtestz_ps:
+ case Intrinsic::x86_avx_vtestz_pd:
+ case Intrinsic::x86_avx_vtestz_ps_256:
+ case Intrinsic::x86_avx_vtestz_pd_256:
+ IsTestPacked = true; // Fallthrough
+ case Intrinsic::x86_sse41_ptestz:
+ case Intrinsic::x86_avx_ptestz_256:
+ // ZF = 1
+ X86CC = X86::COND_E;
+ break;
+ case Intrinsic::x86_avx_vtestc_ps:
+ case Intrinsic::x86_avx_vtestc_pd:
+ case Intrinsic::x86_avx_vtestc_ps_256:
+ case Intrinsic::x86_avx_vtestc_pd_256:
+ IsTestPacked = true; // Fallthrough
+ case Intrinsic::x86_sse41_ptestc:
+ case Intrinsic::x86_avx_ptestc_256:
+ // CF = 1
+ X86CC = X86::COND_B;
+ break;
+ case Intrinsic::x86_avx_vtestnzc_ps:
+ case Intrinsic::x86_avx_vtestnzc_pd:
+ case Intrinsic::x86_avx_vtestnzc_ps_256:
+ case Intrinsic::x86_avx_vtestnzc_pd_256:
+ IsTestPacked = true; // Fallthrough
+ case Intrinsic::x86_sse41_ptestnzc:
+ case Intrinsic::x86_avx_ptestnzc_256:
+ // ZF and CF = 0
+ X86CC = X86::COND_A;
+ break;
+ }
+
+ SDValue LHS = Op.getOperand(1);
+ SDValue RHS = Op.getOperand(2);
+ unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST;
+ SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
+ SDValue CC = DAG.getConstant(X86CC, dl, MVT::i8);
+ SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
+ return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
+ }
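+  // Worked example for the ptest lowering above (illustrative):
+  // _mm_testz_si128(a, b) arrives here as x86_sse41_ptestz; PTEST sets ZF
+  // iff (a & b) == 0, so the setcc uses X86::COND_E and the final value is
+  // zext(ZF) as an i32 0/1.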
+ case Intrinsic::x86_avx512_kortestz_w:
+ case Intrinsic::x86_avx512_kortestc_w: {
+    unsigned X86CC = (IntNo == Intrinsic::x86_avx512_kortestz_w) ? X86::COND_E
+                                                                 : X86::COND_B;
+ SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
+ SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
+ SDValue CC = DAG.getConstant(X86CC, dl, MVT::i8);
+ SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
+ SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i1, CC, Test);
+ return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
+ }
+
+ case Intrinsic::x86_sse42_pcmpistria128:
+ case Intrinsic::x86_sse42_pcmpestria128:
+ case Intrinsic::x86_sse42_pcmpistric128:
+ case Intrinsic::x86_sse42_pcmpestric128:
+ case Intrinsic::x86_sse42_pcmpistrio128:
+ case Intrinsic::x86_sse42_pcmpestrio128:
+ case Intrinsic::x86_sse42_pcmpistris128:
+ case Intrinsic::x86_sse42_pcmpestris128:
+ case Intrinsic::x86_sse42_pcmpistriz128:
+ case Intrinsic::x86_sse42_pcmpestriz128: {
+ unsigned Opcode;
+ unsigned X86CC;
+ switch (IntNo) {
+ default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
+ case Intrinsic::x86_sse42_pcmpistria128:
+ Opcode = X86ISD::PCMPISTRI;
+ X86CC = X86::COND_A;
+ break;
+ case Intrinsic::x86_sse42_pcmpestria128:
+ Opcode = X86ISD::PCMPESTRI;
+ X86CC = X86::COND_A;
+ break;
+ case Intrinsic::x86_sse42_pcmpistric128:
+ Opcode = X86ISD::PCMPISTRI;
+ X86CC = X86::COND_B;
+ break;
+ case Intrinsic::x86_sse42_pcmpestric128:
+ Opcode = X86ISD::PCMPESTRI;
+ X86CC = X86::COND_B;
+ break;
+ case Intrinsic::x86_sse42_pcmpistrio128:
+ Opcode = X86ISD::PCMPISTRI;
+ X86CC = X86::COND_O;
+ break;
+ case Intrinsic::x86_sse42_pcmpestrio128:
+ Opcode = X86ISD::PCMPESTRI;
+ X86CC = X86::COND_O;
+ break;
+ case Intrinsic::x86_sse42_pcmpistris128:
+ Opcode = X86ISD::PCMPISTRI;
+ X86CC = X86::COND_S;
+ break;
+ case Intrinsic::x86_sse42_pcmpestris128:
+ Opcode = X86ISD::PCMPESTRI;
+ X86CC = X86::COND_S;
+ break;
+ case Intrinsic::x86_sse42_pcmpistriz128:
+ Opcode = X86ISD::PCMPISTRI;
+ X86CC = X86::COND_E;
+ break;
+ case Intrinsic::x86_sse42_pcmpestriz128:
+ Opcode = X86ISD::PCMPESTRI;
+ X86CC = X86::COND_E;
+ break;
+ }
+ SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
+ SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
+ SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps);
+ SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
+ DAG.getConstant(X86CC, dl, MVT::i8),
+ SDValue(PCMP.getNode(), 1));
+ return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
+ }
+
+ case Intrinsic::x86_sse42_pcmpistri128:
+ case Intrinsic::x86_sse42_pcmpestri128: {
+ unsigned Opcode;
+ if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
+ Opcode = X86ISD::PCMPISTRI;
+ else
+ Opcode = X86ISD::PCMPESTRI;
+
+ SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
+ SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
+ return DAG.getNode(Opcode, dl, VTs, NewOps);
+ }
+
+ case Intrinsic::x86_seh_lsda: {
+ // Compute the symbol for the LSDA. We know it'll get emitted later.
+ MachineFunction &MF = DAG.getMachineFunction();
+ SDValue Op1 = Op.getOperand(1);
+ auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
+ MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
+ GlobalValue::getRealLinkageName(Fn->getName()));
+
+ // Generate a simple absolute symbol reference. This intrinsic is only
+ // supported on 32-bit Windows, which isn't PIC.
+ SDValue Result = DAG.getMCSymbol(LSDASym, VT);
+ return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
+ }
+
+ case Intrinsic::x86_seh_recoverfp: {
+ SDValue FnOp = Op.getOperand(1);
+ SDValue IncomingFPOp = Op.getOperand(2);
+ GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
+ auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
+ if (!Fn)
+ report_fatal_error(
+ "llvm.x86.seh.recoverfp must take a function as the first argument");
+ return recoverFramePointer(DAG, Fn, IncomingFPOp);
+ }
+
+ case Intrinsic::localaddress: {
+ // Returns one of the stack, base, or frame pointer registers, depending on
+ // which is used to reference local variables.
+ MachineFunction &MF = DAG.getMachineFunction();
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
+ unsigned Reg;
+ if (RegInfo->hasBasePointer(MF))
+ Reg = RegInfo->getBaseRegister();
+ else // This function handles the SP or FP case.
+ Reg = RegInfo->getPtrSizedFrameRegister(MF);
+ return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
+ }
+ }
+}
+
+static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
+ SDValue Src, SDValue Mask, SDValue Base,
+ SDValue Index, SDValue ScaleOp, SDValue Chain,
+                             const X86Subtarget *Subtarget) {
+ SDLoc dl(Op);
+ auto *C = cast<ConstantSDNode>(ScaleOp);
+ SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
+ MVT MaskVT = MVT::getVectorVT(MVT::i1,
+ Index.getSimpleValueType().getVectorNumElements());
+ SDValue MaskInReg;
+ ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
+ if (MaskC)
+ MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), dl, MaskVT);
+ else {
+ MVT BitcastVT = MVT::getVectorVT(MVT::i1,
+ Mask.getSimpleValueType().getSizeInBits());
+
+    // When MaskVT is v2i1 or v4i1, the low 2 or 4 elements are
+    // extracted with EXTRACT_SUBVECTOR.
+ MaskInReg = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
+ DAG.getBitcast(BitcastVT, Mask),
+ DAG.getIntPtrConstant(0, dl));
+ }
+ SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
+ SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
+ SDValue Segment = DAG.getRegister(0, MVT::i32);
+ if (Src.getOpcode() == ISD::UNDEF)
+ Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
+ SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
+ SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
+ SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
+ return DAG.getMergeValues(RetOps, dl);
+}
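+// Operand-order sketch for the machine node built above (an illustration,
+// not additional lowering): for a v8i32 gather with scale 4, Ops becomes
+//   {Src, MaskInReg, Base, 4, Index, /*Disp=*/0, /*Segment=*/noreg, Chain}
+// i.e. the memory operand is the usual base + index*scale + 0 SIB form;
+// result 0 is the gathered vector and result 2 is the output chain, while
+// an undef source is first replaced by a zero vector.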
+
+static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
+ SDValue Src, SDValue Mask, SDValue Base,
+ SDValue Index, SDValue ScaleOp, SDValue Chain) {
+ SDLoc dl(Op);
+ auto *C = cast<ConstantSDNode>(ScaleOp);
+ SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
+ SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
+ SDValue Segment = DAG.getRegister(0, MVT::i32);
+ MVT MaskVT = MVT::getVectorVT(MVT::i1,
+ Index.getSimpleValueType().getVectorNumElements());
+ SDValue MaskInReg;
+ ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
+ if (MaskC)
+ MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), dl, MaskVT);
+ else {
+ MVT BitcastVT = MVT::getVectorVT(MVT::i1,
+ Mask.getSimpleValueType().getSizeInBits());
+
+    // When MaskVT is v2i1 or v4i1, the low 2 or 4 elements are
+    // extracted with EXTRACT_SUBVECTOR.
+ MaskInReg = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
+ DAG.getBitcast(BitcastVT, Mask),
+ DAG.getIntPtrConstant(0, dl));
+ }
+ SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
+ SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain};
+ SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
+ return SDValue(Res, 1);
+}
+
+static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
+ SDValue Mask, SDValue Base, SDValue Index,
+ SDValue ScaleOp, SDValue Chain) {
+ SDLoc dl(Op);
+ auto *C = cast<ConstantSDNode>(ScaleOp);
+ SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
+ SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
+ SDValue Segment = DAG.getRegister(0, MVT::i32);
+ MVT MaskVT =
+ MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
+ SDValue MaskInReg;
+ ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
+ if (MaskC)
+ MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), dl, MaskVT);
+ else
+ MaskInReg = DAG.getBitcast(MaskVT, Mask);
+ SDValue Ops[] = {MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
+ SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
+ return SDValue(Res, 0);
+}
+
+// getReadPerformanceCounter - Handles the lowering of builtin intrinsics that
+// read performance monitor counters (x86_rdpmc).
+static void getReadPerformanceCounter(SDNode *N, SDLoc DL,
+ SelectionDAG &DAG, const X86Subtarget *Subtarget,
+ SmallVectorImpl<SDValue> &Results) {
+ assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
+ SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue LO, HI;
+
+ // The ECX register is used to select the index of the performance counter
+ // to read.
+ SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX,
+ N->getOperand(2));
+ SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain);
+
+ // Reads the content of a 64-bit performance counter and returns it in the
+ // registers EDX:EAX.
+ if (Subtarget->is64Bit()) {
+ LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
+ HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
+ LO.getValue(2));
+ } else {
+ LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
+ HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
+ LO.getValue(2));
+ }
+ Chain = HI.getValue(1);
+
+ if (Subtarget->is64Bit()) {
+ // The EAX register is loaded with the low-order 32 bits. The EDX register
+ // is loaded with the supported high-order bits of the counter.
+ SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
+ DAG.getConstant(32, DL, MVT::i8));
+ Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
+ Results.push_back(Chain);
+ return;
+ }
+
+ // Use a buildpair to merge the two 32-bit values into a 64-bit one.
+ SDValue Ops[] = { LO, HI };
+ SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
+ Results.push_back(Pair);
+ Results.push_back(Chain);
+}
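+// Worked example (sketch): rdpmc with ECX = counter index leaves the counter
+// in EDX:EAX. On x86-64 the result is assembled as (HI << 32) | LO from
+// RAX/RDX, whose upper 32 bits rdpmc clears; on 32-bit it is
+// BUILD_PAIR(LO, HI), a single i64.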
+
+// getReadTimeStampCounter - Handles the lowering of builtin intrinsics that
+// read the time stamp counter (x86_rdtsc and x86_rdtscp). This function is
+// also used to custom lower READCYCLECOUNTER nodes.
+static void getReadTimeStampCounter(SDNode *N, SDLoc DL, unsigned Opcode,
+ SelectionDAG &DAG, const X86Subtarget *Subtarget,
+ SmallVectorImpl<SDValue> &Results) {
+ SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0));
+ SDValue LO, HI;
+
+ // The processor's time-stamp counter (a 64-bit MSR) is stored into the
+ // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
+ // and the EAX register is loaded with the low-order 32 bits.
+ if (Subtarget->is64Bit()) {
+ LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
+ HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
+ LO.getValue(2));
+ } else {
+ LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
+ HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
+ LO.getValue(2));
+ }
+ SDValue Chain = HI.getValue(1);
+
+ if (Opcode == X86ISD::RDTSCP_DAG) {
+ assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
+
+    // The RDTSCP instruction loads the IA32_TSC_AUX MSR (address C000_0103H)
+    // into the ECX register. Add 'ecx' explicitly to the chain.
+ SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32,
+ HI.getValue(2));
+ // Explicitly store the content of ECX at the location passed in input
+ // to the 'rdtscp' intrinsic.
+ Chain = DAG.getStore(ecx.getValue(1), DL, ecx, N->getOperand(2),
+ MachinePointerInfo(), false, false, 0);
+ }
+
+ if (Subtarget->is64Bit()) {
+ // The EDX register is loaded with the high-order 32 bits of the MSR, and
+ // the EAX register is loaded with the low-order 32 bits.
+ SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
+ DAG.getConstant(32, DL, MVT::i8));
+ Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
+ Results.push_back(Chain);
+ return;
+ }
+
+ // Use a buildpair to merge the two 32-bit values into a 64-bit one.
+ SDValue Ops[] = { LO, HI };
+ SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
+ Results.push_back(Pair);
+ Results.push_back(Chain);
+}
+
+static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ SmallVector<SDValue, 2> Results;
+ SDLoc DL(Op);
+ getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget,
+ Results);
+ return DAG.getMergeValues(Results, DL);
+}
+
+static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
+ MachineFunction &MF = DAG.getMachineFunction();
+ SDValue Chain = Op.getOperand(0);
+ SDValue RegNode = Op.getOperand(2);
+ WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
+ if (!EHInfo)
+ report_fatal_error("EH registrations only live in functions using WinEH");
+
+ // Cast the operand to an alloca, and remember the frame index.
+ auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
+ if (!FINode)
+ report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
+ EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
+
+ // Return the chain operand without making any DAG nodes.
+ return Chain;
+}
+
+/// \brief Lower intrinsics for TRUNCATE_TO_MEM case
+/// return truncate Store/MaskedStore Node
+static SDValue LowerINTRINSIC_TRUNCATE_TO_MEM(const SDValue & Op,
+ SelectionDAG &DAG,
+ MVT ElementType) {
+ SDLoc dl(Op);
+ SDValue Mask = Op.getOperand(4);
+ SDValue DataToTruncate = Op.getOperand(3);
+ SDValue Addr = Op.getOperand(2);
+ SDValue Chain = Op.getOperand(0);
+
+ MVT VT = DataToTruncate.getSimpleValueType();
+ MVT SVT = MVT::getVectorVT(ElementType, VT.getVectorNumElements());
+
+ if (isAllOnesConstant(Mask)) // return just a truncate store
+ return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr,
+ MachinePointerInfo(), SVT, false, false,
+ SVT.getScalarSizeInBits()/8);
+
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
+ MVT BitcastVT = MVT::getVectorVT(MVT::i1,
+ Mask.getSimpleValueType().getSizeInBits());
+  // When MaskVT is v2i1 or v4i1, the low 2 or 4 elements are
+  // extracted with EXTRACT_SUBVECTOR.
+ SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
+ DAG.getBitcast(BitcastVT, Mask),
+ DAG.getIntPtrConstant(0, dl));
+
+ MachineMemOperand *MMO = DAG.getMachineFunction().
+ getMachineMemOperand(MachinePointerInfo(),
+ MachineMemOperand::MOStore, SVT.getStoreSize(),
+ SVT.getScalarSizeInBits()/8);
+
+ return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr,
+ VMask, SVT, MMO, true);
+}
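+// Example of the two paths above (illustrative): for a v8i64-to-v8i8
+// truncating store, SVT is v8i8; an all-ones mask emits a plain truncstore of
+// the v8i64 data as v8i8, while a live mask emits a masked truncating store
+// (the trailing 'true') guarded by a v8i1 mask extracted from the incoming
+// GPR mask value.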
+
+static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+
+ const IntrinsicData* IntrData = getIntrinsicWithChain(IntNo);
+ if (!IntrData) {
+ if (IntNo == llvm::Intrinsic::x86_seh_ehregnode)
+ return MarkEHRegistrationNode(Op, DAG);
+ if (IntNo == llvm::Intrinsic::x86_flags_read_u32 ||
+ IntNo == llvm::Intrinsic::x86_flags_read_u64 ||
+ IntNo == llvm::Intrinsic::x86_flags_write_u32 ||
+ IntNo == llvm::Intrinsic::x86_flags_write_u64) {
+ // We need a frame pointer because this will get lowered to a PUSH/POP
+ // sequence.
+ MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+ MFI->setHasOpaqueSPAdjustment(true);
+ // Don't do anything here, we will expand these intrinsics out later
+ // during ExpandISelPseudos in EmitInstrWithCustomInserter.
+ return SDValue();
+ }
+ return SDValue();
+ }
+
+ SDLoc dl(Op);
+ switch(IntrData->Type) {
+ default: llvm_unreachable("Unknown Intrinsic Type");
+ case RDSEED:
+ case RDRAND: {
+ // Emit the node with the right value type.
+ SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other);
+ SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
+
+ // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
+    // Otherwise return the value from Rand, which is always 0, cast to i32.
+ SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
+ DAG.getConstant(1, dl, Op->getValueType(1)),
+ DAG.getConstant(X86::COND_B, dl, MVT::i32),
+ SDValue(Result.getNode(), 1) };
+ SDValue isValid = DAG.getNode(X86ISD::CMOV, dl,
+ DAG.getVTList(Op->getValueType(1), MVT::Glue),
+ Ops);
+
+ // Return { result, isValid, chain }.
+ return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
+ SDValue(Result.getNode(), 2));
+ }
+ case GATHER: {
+    // gather(v1, mask, index, base, scale);
+ SDValue Chain = Op.getOperand(0);
+ SDValue Src = Op.getOperand(2);
+ SDValue Base = Op.getOperand(3);
+ SDValue Index = Op.getOperand(4);
+ SDValue Mask = Op.getOperand(5);
+ SDValue Scale = Op.getOperand(6);
+ return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale,
+ Chain, Subtarget);
+ }
+ case SCATTER: {
+    // scatter(base, mask, index, v1, scale);
+ SDValue Chain = Op.getOperand(0);
+ SDValue Base = Op.getOperand(2);
+ SDValue Mask = Op.getOperand(3);
+ SDValue Index = Op.getOperand(4);
+ SDValue Src = Op.getOperand(5);
+ SDValue Scale = Op.getOperand(6);
+ return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
+ Scale, Chain);
+ }
+ case PREFETCH: {
+ SDValue Hint = Op.getOperand(6);
+ unsigned HintVal = cast<ConstantSDNode>(Hint)->getZExtValue();
+ assert(HintVal < 2 && "Wrong prefetch hint in intrinsic: should be 0 or 1");
+ unsigned Opcode = (HintVal ? IntrData->Opc1 : IntrData->Opc0);
+ SDValue Chain = Op.getOperand(0);
+ SDValue Mask = Op.getOperand(2);
+ SDValue Index = Op.getOperand(3);
+ SDValue Base = Op.getOperand(4);
+ SDValue Scale = Op.getOperand(5);
+ return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain);
+ }
+ // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
+ case RDTSC: {
+ SmallVector<SDValue, 2> Results;
+ getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
+ Results);
+ return DAG.getMergeValues(Results, dl);
+ }
+ // Read Performance Monitoring Counters.
+ case RDPMC: {
+ SmallVector<SDValue, 2> Results;
+ getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results);
+ return DAG.getMergeValues(Results, dl);
+ }
+ // XTEST intrinsics.
+ case XTEST: {
+ SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
+ SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
+ SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
+ DAG.getConstant(X86::COND_NE, dl, MVT::i8),
+ InTrans);
+ SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
+ return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
+ Ret, SDValue(InTrans.getNode(), 1));
+ }
+ // ADC/ADCX/SBB
+ case ADX: {
+ SmallVector<SDValue, 2> Results;
+ SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
+ SDVTList VTs = DAG.getVTList(Op.getOperand(3)->getValueType(0), MVT::Other);
+ SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2),
+ DAG.getConstant(-1, dl, MVT::i8));
+ SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3),
+ Op.getOperand(4), GenCF.getValue(1));
+ SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0),
+ Op.getOperand(5), MachinePointerInfo(),
+ false, false, 0);
+ SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
+ DAG.getConstant(X86::COND_B, dl, MVT::i8),
+ Res.getValue(1));
+ Results.push_back(SetCC);
+ Results.push_back(Store);
+ return DAG.getMergeValues(Results, dl);
+ }
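+  // Carry-in materialization above, traced (a sketch): the i8 carry-in C is
+  // added to -1 (0xFF), so EFLAGS.CF = (C + 255 > 255) = (C != 0); the
+  // ADC/SBB node (IntrData->Opc0) then consumes that CF via glue, and the
+  // carry-out is read back with a setb on the result's flags.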
+ case COMPRESS_TO_MEM: {
+ SDLoc dl(Op);
+ SDValue Mask = Op.getOperand(4);
+ SDValue DataToCompress = Op.getOperand(3);
+ SDValue Addr = Op.getOperand(2);
+ SDValue Chain = Op.getOperand(0);
+
+ MVT VT = DataToCompress.getSimpleValueType();
+ if (isAllOnesConstant(Mask)) // return just a store
+ return DAG.getStore(Chain, dl, DataToCompress, Addr,
+ MachinePointerInfo(), false, false,
+ VT.getScalarSizeInBits()/8);
+
+ SDValue Compressed =
+ getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress),
+ Mask, DAG.getUNDEF(VT), Subtarget, DAG);
+ return DAG.getStore(Chain, dl, Compressed, Addr,
+ MachinePointerInfo(), false, false,
+ VT.getScalarSizeInBits()/8);
+ }
+ case TRUNCATE_TO_MEM_VI8:
+ return LowerINTRINSIC_TRUNCATE_TO_MEM(Op, DAG, MVT::i8);
+ case TRUNCATE_TO_MEM_VI16:
+ return LowerINTRINSIC_TRUNCATE_TO_MEM(Op, DAG, MVT::i16);
+ case TRUNCATE_TO_MEM_VI32:
+ return LowerINTRINSIC_TRUNCATE_TO_MEM(Op, DAG, MVT::i32);
+ case EXPAND_FROM_MEM: {
+ SDLoc dl(Op);
+ SDValue Mask = Op.getOperand(4);
+ SDValue PassThru = Op.getOperand(3);
+ SDValue Addr = Op.getOperand(2);
+ SDValue Chain = Op.getOperand(0);
+ MVT VT = Op.getSimpleValueType();
+
+ if (isAllOnesConstant(Mask)) // return just a load
+ return DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(), false, false,
+ false, VT.getScalarSizeInBits()/8);
+
+ SDValue DataToExpand = DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(),
+ false, false, false,
+ VT.getScalarSizeInBits()/8);
+
+ SDValue Results[] = {
+ getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, DataToExpand),
+ Mask, PassThru, Subtarget, DAG), Chain};
+ return DAG.getMergeValues(Results, dl);
+ }
+ }
+}
+
+SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
+ SelectionDAG &DAG) const {
+ MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+ MFI->setReturnAddressIsTaken(true);
+
+ if (verifyReturnAddressArgumentIsConstant(Op, DAG))
+ return SDValue();
+
+ unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ SDLoc dl(Op);
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
+
+ if (Depth > 0) {
+ SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
+ SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
+ return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
+ DAG.getNode(ISD::ADD, dl, PtrVT,
+ FrameAddr, Offset),
+ MachinePointerInfo(), false, false, false, 0);
+ }
+
+ // Just load the return address.
+ SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
+ return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
+ RetAddrFI, MachinePointerInfo(), false, false, false, 0);
+}
+
+SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
+ EVT VT = Op.getValueType();
+
+ MFI->setFrameAddressIsTaken(true);
+
+ if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
+    // Depth > 0 makes no sense on targets that use Windows unwind codes. It
+    // is not possible to crawl up the stack without consulting the unwind
+    // codes at the same time.
+ int FrameAddrIndex = FuncInfo->getFAIndex();
+ if (!FrameAddrIndex) {
+ // Set up a frame object for the return address.
+ unsigned SlotSize = RegInfo->getSlotSize();
+ FrameAddrIndex = MF.getFrameInfo()->CreateFixedObject(
+ SlotSize, /*Offset=*/0, /*IsImmutable=*/false);
+ FuncInfo->setFAIndex(FrameAddrIndex);
+ }
+ return DAG.getFrameIndex(FrameAddrIndex, VT);
+ }
+
+ unsigned FrameReg =
+ RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
+ SDLoc dl(Op); // FIXME probably not meaningful
+ unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
+ (FrameReg == X86::EBP && VT == MVT::i32)) &&
+ "Invalid Frame Register!");
+ SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
+ while (Depth--)
+ FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
+ MachinePointerInfo(),
+ false, false, false, 0);
+ return FrameAddr;
+}
+
+// FIXME? Maybe this could be a TableGen attribute on some registers and
+// this table could be generated automatically from RegInfo.
+unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT,
+ SelectionDAG &DAG) const {
+ const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
+ const MachineFunction &MF = DAG.getMachineFunction();
+
+ unsigned Reg = StringSwitch<unsigned>(RegName)
+ .Case("esp", X86::ESP)
+ .Case("rsp", X86::RSP)
+ .Case("ebp", X86::EBP)
+ .Case("rbp", X86::RBP)
+ .Default(0);
+
+ if (Reg == X86::EBP || Reg == X86::RBP) {
+ if (!TFI.hasFP(MF))
+ report_fatal_error("register " + StringRef(RegName) +
+ " is allocatable: function has no frame pointer");
+#ifndef NDEBUG
+ else {
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
+ unsigned FrameReg =
+ RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
+ assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
+ "Invalid Frame Register!");
+ }
+#endif
+ }
+
+ if (Reg)
+ return Reg;
+
+ report_fatal_error("Invalid register name global variable");
+}
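+// Usage sketch (assumed IR, shown for illustration only):
+//   declare i64 @llvm.read_register.i64(metadata)
+//   %sp = call i64 @llvm.read_register.i64(metadata !0)
+//   !0 = !{!"rsp"}
+// maps "rsp" to X86::RSP here; asking for "rbp" in a function without a frame
+// pointer hits the report_fatal_error path above.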
+
+SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
+ SelectionDAG &DAG) const {
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
+ return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
+}
+
+unsigned X86TargetLowering::getExceptionPointerRegister(
+ const Constant *PersonalityFn) const {
+ if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
+ return Subtarget->isTarget64BitLP64() ? X86::RDX : X86::EDX;
+
+ return Subtarget->isTarget64BitLP64() ? X86::RAX : X86::EAX;
+}
+
+unsigned X86TargetLowering::getExceptionSelectorRegister(
+ const Constant *PersonalityFn) const {
+ // Funclet personalities don't use selectors (the runtime does the selection).
+ assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)));
+ return Subtarget->isTarget64BitLP64() ? X86::RDX : X86::EDX;
+}
+
+SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
+ SDValue Chain = Op.getOperand(0);
+ SDValue Offset = Op.getOperand(1);
+ SDValue Handler = Op.getOperand(2);
+ SDLoc dl (Op);
+
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
+ unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
+ assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
+ (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
+ "Invalid Frame Register!");
+ SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
+ unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
+
+ SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
+ DAG.getIntPtrConstant(RegInfo->getSlotSize(),
+ dl));
+ StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
+ Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(),
+ false, false, 0);
+ Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
+
+ return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
+ DAG.getRegister(StoreAddrReg, PtrVT));
+}
+
+SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
+ DAG.getVTList(MVT::i32, MVT::Other),
+ Op.getOperand(0), Op.getOperand(1));
+}
+
+SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
+ Op.getOperand(0), Op.getOperand(1));
+}
+
+static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
+ return Op.getOperand(0);
+}
+
+SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDValue Root = Op.getOperand(0);
+ SDValue Trmp = Op.getOperand(1); // trampoline
+ SDValue FPtr = Op.getOperand(2); // nested function
+ SDValue Nest = Op.getOperand(3); // 'nest' parameter value
+ SDLoc dl (Op);
+
+ const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
+ const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
+
+ if (Subtarget->is64Bit()) {
+ SDValue OutChains[6];
+
+ // Large code-model.
+ const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
+ const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
+
+ const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
+ const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
+
+ const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
+
+ // Load the pointer to the nested function into R11.
+ unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
+ SDValue Addr = Trmp;
+ OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
+ Addr, MachinePointerInfo(TrmpAddr),
+ false, false, 0);
+
+ Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
+ DAG.getConstant(2, dl, MVT::i64));
+ OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
+ MachinePointerInfo(TrmpAddr, 2),
+ false, false, 2);
+
+ // Load the 'nest' parameter value into R10.
+ // R10 is specified in X86CallingConv.td
+ OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
+ Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
+ DAG.getConstant(10, dl, MVT::i64));
+ OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
+ Addr, MachinePointerInfo(TrmpAddr, 10),
+ false, false, 0);
+
+ Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
+ DAG.getConstant(12, dl, MVT::i64));
+ OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
+ MachinePointerInfo(TrmpAddr, 12),
+ false, false, 2);
+
+ // Jump to the nested function.
+ OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
+ Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
+ DAG.getConstant(20, dl, MVT::i64));
+ OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
+ Addr, MachinePointerInfo(TrmpAddr, 20),
+ false, false, 0);
+
+ unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
+ Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
+ DAG.getConstant(22, dl, MVT::i64));
+ OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
+ Addr, MachinePointerInfo(TrmpAddr, 22),
+ false, false, 0);
+
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
+ } else {
+ const Function *Func =
+ cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
+ CallingConv::ID CC = Func->getCallingConv();
+ unsigned NestReg;
+
+ switch (CC) {
+ default:
+ llvm_unreachable("Unsupported calling convention");
+ case CallingConv::C:
+ case CallingConv::X86_StdCall: {
+ // Pass 'nest' parameter in ECX.
+ // Must be kept in sync with X86CallingConv.td
+ NestReg = X86::ECX;
+
+ // Check that ECX wasn't needed by an 'inreg' parameter.
+ FunctionType *FTy = Func->getFunctionType();
+ const AttributeSet &Attrs = Func->getAttributes();
+
+ if (!Attrs.isEmpty() && !Func->isVarArg()) {
+ unsigned InRegCount = 0;
+ unsigned Idx = 1;
+
+ for (FunctionType::param_iterator I = FTy->param_begin(),
+ E = FTy->param_end(); I != E; ++I, ++Idx)
+ if (Attrs.hasAttribute(Idx, Attribute::InReg)) {
+ auto &DL = DAG.getDataLayout();
+ // FIXME: should only count parameters that are lowered to integers.
+ InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
+ }
+
+ if (InRegCount > 2) {
+ report_fatal_error("Nest register in use - reduce number of inreg"
+ " parameters!");
+ }
+ }
+ break;
+ }
+ case CallingConv::X86_FastCall:
+ case CallingConv::X86_ThisCall:
+ case CallingConv::Fast:
+ // Pass 'nest' parameter in EAX.
+ // Must be kept in sync with X86CallingConv.td
+ NestReg = X86::EAX;
+ break;
+ }
+
+ SDValue OutChains[4];
+ SDValue Addr, Disp;
+
+ Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
+ DAG.getConstant(10, dl, MVT::i32));
+ Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
+
+ // This is storing the opcode for MOV32ri.
+ const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
+ const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
+ OutChains[0] = DAG.getStore(Root, dl,
+ DAG.getConstant(MOV32ri|N86Reg, dl, MVT::i8),
+ Trmp, MachinePointerInfo(TrmpAddr),
+ false, false, 0);
+
+ Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
+ DAG.getConstant(1, dl, MVT::i32));
+ OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
+ MachinePointerInfo(TrmpAddr, 1),
+ false, false, 1);
+
+ const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
+ Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
+ DAG.getConstant(5, dl, MVT::i32));
+ OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8),
+ Addr, MachinePointerInfo(TrmpAddr, 5),
+ false, false, 1);
+
+ Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
+ DAG.getConstant(6, dl, MVT::i32));
+ OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
+ MachinePointerInfo(TrmpAddr, 6),
+ false, false, 1);
+
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
+ }
+}
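+// Byte layout produced by the 64-bit path above (derived from the stores,
+// shown for illustration; offsets are relative to Trmp):
+//    0: 49 BB <FPtr:8>   movabsq $FPtr, %r11
+//   10: 49 BA <Nest:8>   movabsq $Nest, %r10
+//   20: 49 FF E3         jmpq *%r11   (REX_WB = 0x49 prefixes each opcode)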
+
+SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
+ SelectionDAG &DAG) const {
+ /*
+   The rounding mode is in bits 11:10 of the FP control word (FPCW), and has
+   the following settings:
+ 00 Round to nearest
+ 01 Round to -inf
+ 10 Round to +inf
+ 11 Round to 0
+
+ FLT_ROUNDS, on the other hand, expects the following:
+ -1 Undefined
+ 0 Round to 0
+ 1 Round to nearest
+ 2 Round to +inf
+ 3 Round to -inf
+
+ To perform the conversion, we do:
+    (((((FPCW & 0x800) >> 11) | ((FPCW & 0x400) >> 9)) + 1) & 3)
+ */
+
+ MachineFunction &MF = DAG.getMachineFunction();
+ const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
+ unsigned StackAlignment = TFI.getStackAlignment();
+ MVT VT = Op.getSimpleValueType();
+ SDLoc DL(Op);
+
+ // Save FP Control Word to stack slot
+ int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false);
+ SDValue StackSlot =
+ DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
+
+ MachineMemOperand *MMO =
+ MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
+ MachineMemOperand::MOStore, 2, 2);
+
+ SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
+ SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
+ DAG.getVTList(MVT::Other),
+ Ops, MVT::i16, MMO);
+
+ // Load FP Control Word from stack slot
+ SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot,
+ MachinePointerInfo(), false, false, false, 0);
+
+ // Transform as necessary
+ SDValue CWD1 =
+ DAG.getNode(ISD::SRL, DL, MVT::i16,
+ DAG.getNode(ISD::AND, DL, MVT::i16,
+ CWD, DAG.getConstant(0x800, DL, MVT::i16)),
+ DAG.getConstant(11, DL, MVT::i8));
+ SDValue CWD2 =
+ DAG.getNode(ISD::SRL, DL, MVT::i16,
+ DAG.getNode(ISD::AND, DL, MVT::i16,
+ CWD, DAG.getConstant(0x400, DL, MVT::i16)),
+ DAG.getConstant(9, DL, MVT::i8));
+
+ SDValue RetVal =
+ DAG.getNode(ISD::AND, DL, MVT::i16,
+ DAG.getNode(ISD::ADD, DL, MVT::i16,
+ DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
+ DAG.getConstant(1, DL, MVT::i16)),
+ DAG.getConstant(3, DL, MVT::i16));
+
+ return DAG.getNode((VT.getSizeInBits() < 16 ?
+ ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
+}
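+// Worked instance of the conversion formula above: FPCW bits 11:10 = 01
+// (x87 round toward -inf) gives ((0 >> 11) | (0x400 >> 9)) = 2, then
+// (2 + 1) & 3 = 3, which is FLT_ROUNDS "round to -inf"; bits 11:10 = 11
+// (round to 0) gives (1 | 2) + 1 = 4, masked to 0, matching FLT_ROUNDS 0.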
+
+/// \brief Lower a vector CTLZ using the natively supported vector CTLZ
+/// instruction.
+//
+// 1. i32/i64 128/256-bit vectors (native support requires VLX) are widened
+//    to 512-bit vectors.
+// 2. i8/i16 vectors are implemented using the dword LZCNT vector instruction
+//    ( sub(trunc(lzcnt(zext32(x)))) ). If zext32(x) is illegal,
+//    split the vector, perform the operation on its Lo and Hi parts and
+//    concatenate the results.
+static SDValue LowerVectorCTLZ_AVX512(SDValue Op, SelectionDAG &DAG) {
+ SDLoc dl(Op);
+ MVT VT = Op.getSimpleValueType();
+ MVT EltVT = VT.getVectorElementType();
+ unsigned NumElems = VT.getVectorNumElements();
+
+ if (EltVT == MVT::i64 || EltVT == MVT::i32) {
+ // Extend to 512 bit vector.
+ assert((VT.is256BitVector() || VT.is128BitVector()) &&
+ "Unsupported value type for operation");
+
+ MVT NewVT = MVT::getVectorVT(EltVT, 512 / VT.getScalarSizeInBits());
+ SDValue Vec512 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NewVT,
+ DAG.getUNDEF(NewVT),
+ Op.getOperand(0),
+ DAG.getIntPtrConstant(0, dl));
+ SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Vec512);
+
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CtlzNode,
+ DAG.getIntPtrConstant(0, dl));
+ }
+
+ assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
+ "Unsupported element type");
+
+ if (16 < NumElems) {
+    // Split the vector; its Lo and Hi parts will be handled in the next
+    // iteration.
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = DAG.SplitVector(Op.getOperand(0), dl);
+ MVT OutVT = MVT::getVectorVT(EltVT, NumElems/2);
+
+ Lo = DAG.getNode(Op.getOpcode(), dl, OutVT, Lo);
+ Hi = DAG.getNode(Op.getOpcode(), dl, OutVT, Hi);
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
+ }
+
+ MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
+
+ assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
+ "Unsupported value type for operation");
+
+  // Use the natively supported vector instruction vplzcntd.
+ Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
+ SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
+ SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
+ SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
+
+ return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
+}
+
+static SDValue LowerCTLZ(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ MVT VT = Op.getSimpleValueType();
+ MVT OpVT = VT;
+ unsigned NumBits = VT.getSizeInBits();
+ SDLoc dl(Op);
+
+ if (VT.isVector() && Subtarget->hasAVX512())
+ return LowerVectorCTLZ_AVX512(Op, DAG);
+
+ Op = Op.getOperand(0);
+ if (VT == MVT::i8) {
+    // Zero-extend to i32 since there is no i8 bsr.
+ OpVT = MVT::i32;
+ Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
+ }
+
+ // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
+ SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
+ Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
+
+  // If src is zero (i.e. bsr sets ZF), select 2*NumBits-1 so that the final
+  // XOR with NumBits-1 yields NumBits.
+ SDValue Ops[] = {
+ Op,
+ DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
+ DAG.getConstant(X86::COND_E, dl, MVT::i8),
+ Op.getValue(1)
+ };
+ Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
+
+ // Finally xor with NumBits-1.
+ Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
+ DAG.getConstant(NumBits - 1, dl, OpVT));
+
+ if (VT == MVT::i8)
+ Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
+ return Op;
+}
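+// Trace of the scalar path (illustrative): ctlz i8 1 zero-extends to i32,
+// BSR yields 0 (ZF clear, so CMOV keeps the BSR result), and
+// 0 ^ (NumBits-1 = 7) = 7; ctlz i8 0 sets ZF, CMOV selects 2*NumBits-1 = 15,
+// and 15 ^ 7 = 8 = NumBits.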
+
+static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ MVT VT = Op.getSimpleValueType();
+ EVT OpVT = VT;
+ unsigned NumBits = VT.getSizeInBits();
+ SDLoc dl(Op);
+
+ Op = Op.getOperand(0);
+ if (VT == MVT::i8) {
+    // Zero-extend to i32 since there is no i8 bsr.
+ OpVT = MVT::i32;
+ Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
+ }
+
+ // Issue a bsr (scan bits in reverse).
+ SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
+ Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
+
+ // And xor with NumBits-1.
+ Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
+ DAG.getConstant(NumBits - 1, dl, OpVT));
+
+ if (VT == MVT::i8)
+ Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
+ return Op;
+}
+
+static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
+ MVT VT = Op.getSimpleValueType();
+ unsigned NumBits = VT.getScalarSizeInBits();
+ SDLoc dl(Op);
+
+ if (VT.isVector()) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+ SDValue N0 = Op.getOperand(0);
+ SDValue Zero = DAG.getConstant(0, dl, VT);
+
+ // lsb(x) = (x & -x)
+ SDValue LSB = DAG.getNode(ISD::AND, dl, VT, N0,
+ DAG.getNode(ISD::SUB, dl, VT, Zero, N0));
+
+ // cttz_undef(x) = (width - 1) - ctlz(lsb)
+ if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF &&
+ TLI.isOperationLegal(ISD::CTLZ, VT)) {
+ SDValue WidthMinusOne = DAG.getConstant(NumBits - 1, dl, VT);
+ return DAG.getNode(ISD::SUB, dl, VT, WidthMinusOne,
+ DAG.getNode(ISD::CTLZ, dl, VT, LSB));
+ }
+
+ // cttz(x) = ctpop(lsb - 1)
+ SDValue One = DAG.getConstant(1, dl, VT);
+ return DAG.getNode(ISD::CTPOP, dl, VT,
+ DAG.getNode(ISD::SUB, dl, VT, LSB, One));
+ }
+
+ assert(Op.getOpcode() == ISD::CTTZ &&
+ "Only scalar CTTZ requires custom lowering");
+
+ // Issue a bsf (scan bits forward) which also sets EFLAGS.
+ SDVTList VTs = DAG.getVTList(VT, MVT::i32);
+ Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op.getOperand(0));
+
+ // If src is zero (i.e. bsf sets ZF), returns NumBits.
+ SDValue Ops[] = {
+ Op,
+ DAG.getConstant(NumBits, dl, VT),
+ DAG.getConstant(X86::COND_E, dl, MVT::i8),
+ Op.getValue(1)
+ };
+ return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
+}
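+// Vector-path identities above, instantiated (illustrative): x = 0b01100
+// gives lsb = x & -x = 0b00100; cttz(x) = ctpop(lsb - 1) = ctpop(0b00011) = 2,
+// and for i32 lanes cttz_undef(x) = 31 - ctlz(lsb) = 31 - 29 = 2.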
+
+// Lower256IntArith - Break a 256-bit integer operation into two new 128-bit
+// ones, and then concatenate the results back together.
+static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
+ MVT VT = Op.getSimpleValueType();
+
+ assert(VT.is256BitVector() && VT.isInteger() &&
+ "Unsupported value type for operation");
+
+ unsigned NumElems = VT.getVectorNumElements();
+ SDLoc dl(Op);
+
+ // Extract the LHS vectors
+ SDValue LHS = Op.getOperand(0);
+ SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
+ SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
+
+ // Extract the RHS vectors
+ SDValue RHS = Op.getOperand(1);
+ SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl);
+ SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl);
+
+ MVT EltVT = VT.getVectorElementType();
+ MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
+ DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
+ DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
+}
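+// E.g. (sketch): an add of two v8i32 values becomes two v4i32 adds on the
+// extracted 128-bit halves, re-joined with CONCAT_VECTORS.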
+
+static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) {
+ if (Op.getValueType() == MVT::i1)
+ return DAG.getNode(ISD::XOR, SDLoc(Op), Op.getValueType(),
+ Op.getOperand(0), Op.getOperand(1));
+ assert(Op.getSimpleValueType().is256BitVector() &&
+ Op.getSimpleValueType().isInteger() &&
+ "Only handle AVX 256-bit vector integer operation");
+ return Lower256IntArith(Op, DAG);
+}
+
+static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) {
+ if (Op.getValueType() == MVT::i1)
+ return DAG.getNode(ISD::XOR, SDLoc(Op), Op.getValueType(),
+ Op.getOperand(0), Op.getOperand(1));
+ assert(Op.getSimpleValueType().is256BitVector() &&
+ Op.getSimpleValueType().isInteger() &&
+ "Only handle AVX 256-bit vector integer operation");
+ return Lower256IntArith(Op, DAG);
+}
+
+static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
+ assert(Op.getSimpleValueType().is256BitVector() &&
+ Op.getSimpleValueType().isInteger() &&
+ "Only handle AVX 256-bit vector integer operation");
+ return Lower256IntArith(Op, DAG);
+}
+
+static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ SDLoc dl(Op);
+ MVT VT = Op.getSimpleValueType();
+
+ if (VT == MVT::i1)
+ return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1));
+
+ // Decompose 256-bit ops into smaller 128-bit ops.
+ if (VT.is256BitVector() && !Subtarget->hasInt256())
+ return Lower256IntArith(Op, DAG);
+
+ SDValue A = Op.getOperand(0);
+ SDValue B = Op.getOperand(1);
+
+ // Lower v16i8/v32i8 mul as promotion to v8i16/v16i16 vector
+ // pairs, multiply and truncate.
+ if (VT == MVT::v16i8 || VT == MVT::v32i8) {
+ if (Subtarget->hasInt256()) {
+ if (VT == MVT::v32i8) {
+ MVT SubVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() / 2);
+ SDValue Lo = DAG.getIntPtrConstant(0, dl);
+ SDValue Hi = DAG.getIntPtrConstant(VT.getVectorNumElements() / 2, dl);
+ SDValue ALo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, A, Lo);
+ SDValue BLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, B, Lo);
+ SDValue AHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, A, Hi);
+ SDValue BHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, B, Hi);
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
+ DAG.getNode(ISD::MUL, dl, SubVT, ALo, BLo),
+ DAG.getNode(ISD::MUL, dl, SubVT, AHi, BHi));
+ }
+
+ MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
+ return DAG.getNode(
+ ISD::TRUNCATE, dl, VT,
+ DAG.getNode(ISD::MUL, dl, ExVT,
+ DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, A),
+ DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, B)));
+ }
+
+ assert(VT == MVT::v16i8 &&
+ "Pre-AVX2 support only supports v16i8 multiplication");
+ MVT ExVT = MVT::v8i16;
+
+ // Extract the lo parts and sign extend to i16
+ SDValue ALo, BLo;
+ if (Subtarget->hasSSE41()) {
+ ALo = DAG.getNode(X86ISD::VSEXT, dl, ExVT, A);
+ BLo = DAG.getNode(X86ISD::VSEXT, dl, ExVT, B);
+ } else {
+ const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
+ -1, 4, -1, 5, -1, 6, -1, 7};
+ ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
+ BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
+ ALo = DAG.getBitcast(ExVT, ALo);
+ BLo = DAG.getBitcast(ExVT, BLo);
+ ALo = DAG.getNode(ISD::SRA, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
+ BLo = DAG.getNode(ISD::SRA, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
+ }
+
+ // Extract the hi parts and sign extend to i16
+ SDValue AHi, BHi;
+ if (Subtarget->hasSSE41()) {
+ const int ShufMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
+ -1, -1, -1, -1, -1, -1, -1, -1};
+ AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
+ BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
+ AHi = DAG.getNode(X86ISD::VSEXT, dl, ExVT, AHi);
+ BHi = DAG.getNode(X86ISD::VSEXT, dl, ExVT, BHi);
+ } else {
+ const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11,
+ -1, 12, -1, 13, -1, 14, -1, 15};
+ AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
+ BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
+ AHi = DAG.getBitcast(ExVT, AHi);
+ BHi = DAG.getBitcast(ExVT, BHi);
+ AHi = DAG.getNode(ISD::SRA, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
+ BHi = DAG.getNode(ISD::SRA, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
+ }
+
+    // Multiply, mask the lower 8 bits of the lo/hi results, and pack.
+ SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
+ SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
+ RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT));
+ RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT));
+ return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
+ }
+
+ // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
+ if (VT == MVT::v4i32) {
+ assert(Subtarget->hasSSE2() && !Subtarget->hasSSE41() &&
+ "Should not custom lower when pmuldq is available!");
+
+ // Extract the odd parts.
+ static const int UnpackMask[] = { 1, -1, 3, -1 };
+ SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
+ SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
+
+ // Multiply the even parts.
+ SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B);
+ // Now multiply odd parts.
+ SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds);
+
+ Evens = DAG.getBitcast(VT, Evens);
+ Odds = DAG.getBitcast(VT, Odds);
+
+ // Merge the two vectors back together with a shuffle. This expands into 2
+ // shuffles.
+ static const int ShufMask[] = { 0, 4, 2, 6 };
+ return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
+ }
+
+ assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
+ "Only know how to lower V2I64/V4I64/V8I64 multiply");
+
+ // Ahi = psrlqi(a, 32);
+ // Bhi = psrlqi(b, 32);
+ //
+ // AloBlo = pmuludq(a, b);
+ // AloBhi = pmuludq(a, Bhi);
+ // AhiBlo = pmuludq(Ahi, b);
+
+ // AloBhi = psllqi(AloBhi, 32);
+ // AhiBlo = psllqi(AhiBlo, 32);
+ // return AloBlo + AloBhi + AhiBlo;
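+  // (AhiBhi is not needed: its contribution would be shifted left by 64 bits
+  // and therefore wraps to zero in the i64 lanes.)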
+
+ SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
+ SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
+
+ SDValue AhiBlo = Ahi;
+ SDValue AloBhi = Bhi;
+ // Bit cast to 32-bit vectors for MULUDQ
+ MVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 :
+ (VT == MVT::v4i64) ? MVT::v8i32 : MVT::v16i32;
+ A = DAG.getBitcast(MulVT, A);
+ B = DAG.getBitcast(MulVT, B);
+ Ahi = DAG.getBitcast(MulVT, Ahi);
+ Bhi = DAG.getBitcast(MulVT, Bhi);
+
+ SDValue AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
+  // After shifting right by constant values the result may be all zeros.
+ if (!ISD::isBuildVectorAllZeros(Ahi.getNode())) {
+ AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
+ AhiBlo = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AhiBlo, 32, DAG);
+ }
+ if (!ISD::isBuildVectorAllZeros(Bhi.getNode())) {
+ AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
+ AloBhi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AloBhi, 32, DAG);
+ }
+
+ SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi);
+ return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
+}
+
+SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op,
+                                             SelectionDAG &DAG) const {
+ assert(Subtarget->isTargetWin64() && "Unexpected target");
+ EVT VT = Op.getValueType();
+ assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
+ "Unexpected return type for lowering");
+
+ RTLIB::Libcall LC;
+ bool isSigned;
+ switch (Op->getOpcode()) {
+ default: llvm_unreachable("Unexpected request for libcall!");
+ case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
+ case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
+ case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
+ case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
+ case ISD::SDIVREM: isSigned = true; LC = RTLIB::SDIVREM_I128; break;
+ case ISD::UDIVREM: isSigned = false; LC = RTLIB::UDIVREM_I128; break;
+ }
+
+ SDLoc dl(Op);
+ SDValue InChain = DAG.getEntryNode();
+
+ TargetLowering::ArgListTy Args;
+ TargetLowering::ArgListEntry Entry;
+ for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
+ EVT ArgVT = Op->getOperand(i).getValueType();
+ assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
+ "Unexpected argument type for lowering");
+ SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
+ Entry.Node = StackPtr;
+    InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr,
+                           MachinePointerInfo(), false, false, 16);
+    Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
+    Entry.Ty = PointerType::get(ArgTy, 0);
+ Entry.isSExt = false;
+ Entry.isZExt = false;
+ Args.push_back(Entry);
+ }
+
+ SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
+ getPointerTy(DAG.getDataLayout()));
+
+ TargetLowering::CallLoweringInfo CLI(DAG);
+ CLI.setDebugLoc(dl).setChain(InChain)
+ .setCallee(getLibcallCallingConv(LC),
+ static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()),
+ Callee, std::move(Args), 0)
+ .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);
+
+ std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
+ return DAG.getBitcast(VT, CallInfo.first);
+}
+
+static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
+ MVT VT = Op0.getSimpleValueType();
+ SDLoc dl(Op);
+
+ assert((VT == MVT::v4i32 && Subtarget->hasSSE2()) ||
+ (VT == MVT::v8i32 && Subtarget->hasInt256()));
+
+  // PMULxD operations multiply each even value (starting at 0) of LHS with
+  // the related value of RHS and produce a widened result.
+  // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
+  // => <2 x i64> <ae|cg>
+  //
+  // In other words, to have all the results, we need to perform two PMULxD:
+  // 1. one with the even values.
+  // 2. one with the odd values.
+  // To achieve #2, we need to place the odd values at even positions.
+  //
+  // Place the odd values at even positions (basically, shift all values one
+  // step to the left):
+ const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1};
+ // <a|b|c|d> => <b|undef|d|undef>
+ SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0, Mask);
+ // <e|f|g|h> => <f|undef|h|undef>
+ SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1, Mask);
+
+ // Emit two multiplies, one for the lower 2 ints and one for the higher 2
+ // ints.
+ MVT MulVT = VT == MVT::v4i32 ? MVT::v2i64 : MVT::v4i64;
+ bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
+ unsigned Opcode =
+ (!IsSigned || !Subtarget->hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
+ // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
+ // => <2 x i64> <ae|cg>
+ SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
+ // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
+ // => <2 x i64> <bf|dh>
+ SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1));
+
+ // Shuffle it back into the right order.
+ SDValue Highs, Lows;
+ if (VT == MVT::v8i32) {
+ const int HighMask[] = {1, 9, 3, 11, 5, 13, 7, 15};
+ Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
+ const int LowMask[] = {0, 8, 2, 10, 4, 12, 6, 14};
+ Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
+ } else {
+ const int HighMask[] = {1, 5, 3, 7};
+ Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
+ const int LowMask[] = {0, 4, 2, 6};
+ Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
+ }
+
+  // If we have a signed multiply but no PMULDQ, fix up the high parts of an
+  // unsigned multiply using the identity
+  //   hi_s(a,b) = hi_u(a,b) - (a < 0 ? b : 0) - (b < 0 ? a : 0).
+ if (IsSigned && !Subtarget->hasSSE41()) {
+ SDValue ShAmt = DAG.getConstant(
+ 31, dl,
+ DAG.getTargetLoweringInfo().getShiftAmountTy(VT, DAG.getDataLayout()));
+ SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
+ DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
+ SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
+ DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0);
+
+ SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
+ Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
+ }
+
+ // The first result of MUL_LOHI is actually the low value, followed by the
+ // high value.
+ SDValue Ops[] = {Lows, Highs};
+ return DAG.getMergeValues(Ops, dl);
+}
+
+// Return true if the required (according to Opcode) shift-imm form is
+// natively supported by the Subtarget.
+static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget *Subtarget,
+ unsigned Opcode) {
+ if (VT.getScalarSizeInBits() < 16)
+ return false;
+
+ if (VT.is512BitVector() &&
+ (VT.getScalarSizeInBits() > 16 || Subtarget->hasBWI()))
+ return true;
+
+ bool LShift = VT.is128BitVector() ||
+ (VT.is256BitVector() && Subtarget->hasInt256());
+
+ bool AShift = LShift && (Subtarget->hasVLX() ||
+ (VT != MVT::v2i64 && VT != MVT::v4i64));
+ return (Opcode == ISD::SRA) ? AShift : LShift;
+}
+
+// The shift amount is a variable, but it is the same for all vector lanes.
+// These instructions are defined together with the shift-immediate forms.
+static
+bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget *Subtarget,
+ unsigned Opcode) {
+ return SupportedVectorShiftWithImm(VT, Subtarget, Opcode);
+}
+
+// Return true if the required (according to Opcode) variable-shift form is
+// natively supported by the Subtarget.
+static bool SupportedVectorVarShift(MVT VT, const X86Subtarget *Subtarget,
+ unsigned Opcode) {
+ if (!Subtarget->hasInt256() || VT.getScalarSizeInBits() < 16)
+ return false;
+
+ // vXi16 supported only on AVX-512, BWI
+ if (VT.getScalarSizeInBits() == 16 && !Subtarget->hasBWI())
+ return false;
+
+ if (VT.is512BitVector() || Subtarget->hasVLX())
+ return true;
+
+ bool LShift = VT.is128BitVector() || VT.is256BitVector();
+ bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
+ return (Opcode == ISD::SRA) ? AShift : LShift;
+}
+
+static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ MVT VT = Op.getSimpleValueType();
+ SDLoc dl(Op);
+ SDValue R = Op.getOperand(0);
+ SDValue Amt = Op.getOperand(1);
+
+ unsigned X86Opc = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
+ (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
+
+ auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
+ assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
+ MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
+ SDValue Ex = DAG.getBitcast(ExVT, R);
+
+ if (ShiftAmt >= 32) {
+ // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
+ SDValue Upper =
+ getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
+ SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
+ ShiftAmt - 32, DAG);
+ if (VT == MVT::v2i64)
+ Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
+ if (VT == MVT::v4i64)
+ Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
+ {9, 1, 11, 3, 13, 5, 15, 7});
+ } else {
+ // SRA upper i32, SHL whole i64 and select lower i32.
+ SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
+ ShiftAmt, DAG);
+ SDValue Lower =
+ getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
+ Lower = DAG.getBitcast(ExVT, Lower);
+ if (VT == MVT::v2i64)
+ Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
+ if (VT == MVT::v4i64)
+ Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
+ {8, 1, 10, 3, 12, 5, 14, 7});
+ }
+ return DAG.getBitcast(VT, Ex);
+ };
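+  // A scalar model of the lambda above (illustrative sketch only): for
+  // ShiftAmt >= 32, the result is the upper 32-bit half shifted
+  // arithmetically by (ShiftAmt - 32), with a splat of its sign bit as the
+  // new upper half:
+  //   int64_t SRA64(int64_t V, unsigned Amt) { // assumes 32 <= Amt < 64
+  //     int32_t Hi = int32_t(V >> 32);
+  //     uint32_t Lo = uint32_t(Hi >> (Amt - 32));  // SRA of the upper half
+  //     uint32_t Sign = uint32_t(Hi >> 31);        // sign splat
+  //     return int64_t((uint64_t(Sign) << 32) | Lo);
+  //   }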
+
+ // Optimize shl/srl/sra with constant shift amount.
+ if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
+ if (auto *ShiftConst = BVAmt->getConstantSplatNode()) {
+ uint64_t ShiftAmt = ShiftConst->getZExtValue();
+
+ if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
+ return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
+
+ // i64 SRA needs to be performed as partial shifts.
+ if ((VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64)) &&
+ Op.getOpcode() == ISD::SRA && !Subtarget->hasXOP())
+ return ArithmeticShiftRight64(ShiftAmt);
+
+ if (VT == MVT::v16i8 ||
+ (Subtarget->hasInt256() && VT == MVT::v32i8) ||
+ VT == MVT::v64i8) {
+ unsigned NumElts = VT.getVectorNumElements();
+ MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
+
+ // Simple i8 add case
+ if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1)
+ return DAG.getNode(ISD::ADD, dl, VT, R, R);
+
+ // ashr(R, 7) === cmp_slt(R, 0)
+ if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
+ SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
+ return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
+ }
+
+ // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
+ if (VT == MVT::v16i8 && Subtarget->hasXOP())
+ return SDValue();
+
+ if (Op.getOpcode() == ISD::SHL) {
+ // Make a large shift.
+ SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT,
+ R, ShiftAmt, DAG);
+ SHL = DAG.getBitcast(VT, SHL);
+ // Zero out the rightmost bits.
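+          // E.g. for ShiftAmt == 3 each byte is ANDed with 0xF8, clearing
+          // the low bits that spilled in from the neighbouring byte's shift.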
+ return DAG.getNode(ISD::AND, dl, VT, SHL,
+ DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, VT));
+ }
+ if (Op.getOpcode() == ISD::SRL) {
+ // Make a large shift.
+ SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT,
+ R, ShiftAmt, DAG);
+ SRL = DAG.getBitcast(VT, SRL);
+ // Zero out the leftmost bits.
+ return DAG.getNode(ISD::AND, dl, VT, SRL,
+ DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, VT));
+ }
+ if (Op.getOpcode() == ISD::SRA) {
+ // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
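+          // Scalar sketch of this identity (illustrative): with
+          // M = 0x80 >> Amt, ((uint8_t(X) >> Amt) ^ M) - M sign-extends the
+          // logically-shifted value from bit (7 - Amt), which equals
+          // int8_t(X) >> Amt.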
+ SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
+
+ SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
+ Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
+ Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
+ return Res;
+ }
+ llvm_unreachable("Unknown shift opcode.");
+ }
+ }
+ }
+
+ // Special case in 32-bit mode, where i64 is expanded into high and low parts.
+ if (!Subtarget->is64Bit() && !Subtarget->hasXOP() &&
+ (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64))) {
+
+ // Peek through any splat that was introduced for i64 shift vectorization.
+ int SplatIndex = -1;
+ if (ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt.getNode()))
+ if (SVN->isSplat()) {
+ SplatIndex = SVN->getSplatIndex();
+ Amt = Amt.getOperand(0);
+ assert(SplatIndex < (int)VT.getVectorNumElements() &&
+ "Splat shuffle referencing second operand");
+ }
+
+ if (Amt.getOpcode() != ISD::BITCAST ||
+ Amt.getOperand(0).getOpcode() != ISD::BUILD_VECTOR)
+ return SDValue();
+
+ Amt = Amt.getOperand(0);
+ unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
+ VT.getVectorNumElements();
+ unsigned RatioInLog2 = Log2_32_Ceil(Ratio);
+ uint64_t ShiftAmt = 0;
+ unsigned BaseOp = (SplatIndex < 0 ? 0 : SplatIndex * Ratio);
+ for (unsigned i = 0; i != Ratio; ++i) {
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + BaseOp));
+ if (!C)
+ return SDValue();
+ // 6 == Log2(64)
+ ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2)));
+ }
+
+ // Check remaining shift amounts (if not a splat).
+ if (SplatIndex < 0) {
+ for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
+ uint64_t ShAmt = 0;
+ for (unsigned j = 0; j != Ratio; ++j) {
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + j));
+ if (!C)
+ return SDValue();
+ // 6 == Log2(64)
+ ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2)));
+ }
+ if (ShAmt != ShiftAmt)
+ return SDValue();
+ }
+ }
+
+ if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
+ return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
+
+ if (Op.getOpcode() == ISD::SRA)
+ return ArithmeticShiftRight64(ShiftAmt);
+ }
+
+ return SDValue();
+}
+
+static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
+ const X86Subtarget* Subtarget) {
+ MVT VT = Op.getSimpleValueType();
+ SDLoc dl(Op);
+ SDValue R = Op.getOperand(0);
+ SDValue Amt = Op.getOperand(1);
+
+ unsigned X86OpcI = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
+ (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
+
+ unsigned X86OpcV = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHL :
+ (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRL : X86ISD::VSRA;
+
+ if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode())) {
+ SDValue BaseShAmt;
+ MVT EltVT = VT.getVectorElementType();
+
+ if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Amt)) {
+ // Check if this build_vector node is doing a splat.
+ // If so, then set BaseShAmt equal to the splat value.
+ BaseShAmt = BV->getSplatValue();
+ if (BaseShAmt && BaseShAmt.getOpcode() == ISD::UNDEF)
+ BaseShAmt = SDValue();
+ } else {
+ if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR)
+ Amt = Amt.getOperand(0);
+
+ ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt);
+ if (SVN && SVN->isSplat()) {
+ unsigned SplatIdx = (unsigned)SVN->getSplatIndex();
+ SDValue InVec = Amt.getOperand(0);
+ if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
+ assert((SplatIdx < InVec.getSimpleValueType().getVectorNumElements()) &&
+ "Unexpected shuffle index found!");
+ BaseShAmt = InVec.getOperand(SplatIdx);
+ } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
+ if (ConstantSDNode *C =
+ dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
+ if (C->getZExtValue() == SplatIdx)
+ BaseShAmt = InVec.getOperand(1);
+ }
+ }
+
+ if (!BaseShAmt)
+ // Avoid introducing an extract element from a shuffle.
+ BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InVec,
+ DAG.getIntPtrConstant(SplatIdx, dl));
+ }
+ }
+
+ if (BaseShAmt.getNode()) {
+ assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
+ if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
+ BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
+ else if (EltVT.bitsLT(MVT::i32))
+ BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
+
+ return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, DAG);
+ }
+ }
+
+ // Special case in 32-bit mode, where i64 is expanded into high and low parts.
+ if (!Subtarget->is64Bit() && VT == MVT::v2i64 &&
+ Amt.getOpcode() == ISD::BITCAST &&
+ Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
+ Amt = Amt.getOperand(0);
+ unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
+ VT.getVectorNumElements();
+ std::vector<SDValue> Vals(Ratio);
+ for (unsigned i = 0; i != Ratio; ++i)
+ Vals[i] = Amt.getOperand(i);
+ for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
+ for (unsigned j = 0; j != Ratio; ++j)
+ if (Vals[j] != Amt.getOperand(i + j))
+ return SDValue();
+ }
+
+ if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode()))
+ return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1));
+ }
+ return SDValue();
+}
+
+static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
+ SelectionDAG &DAG) {
+ MVT VT = Op.getSimpleValueType();
+ SDLoc dl(Op);
+ SDValue R = Op.getOperand(0);
+ SDValue Amt = Op.getOperand(1);
+
+ assert(VT.isVector() && "Custom lowering only for vector shifts!");
+ assert(Subtarget->hasSSE2() && "Only custom lower when we have SSE2!");
+
+ if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget))
+ return V;
+
+ if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))
+ return V;
+
+ if (SupportedVectorVarShift(VT, Subtarget, Op.getOpcode()))
+ return Op;
+
+ // XOP has 128-bit variable logical/arithmetic shifts.
+ // +ve/-ve Amt = shift left/right.
+ if (Subtarget->hasXOP() &&
+ (VT == MVT::v2i64 || VT == MVT::v4i32 ||
+ VT == MVT::v8i16 || VT == MVT::v16i8)) {
+ if (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SRA) {
+ SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
+ Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
+ }
+ if (Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL)
+ return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
+ if (Op.getOpcode() == ISD::SRA)
+ return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
+ }
+
+  // v2i64 vector logical shifts can efficiently avoid scalarization - do the
+ // shifts per-lane and then shuffle the partial results back together.
+ if (VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) {
+ // Splat the shift amounts so the scalar shifts above will catch it.
+ SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
+ SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
+ SDValue R0 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt0);
+ SDValue R1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt1);
+ return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
+ }
+
+ // i64 vector arithmetic shift can be emulated with the transform:
+ // M = lshr(SIGN_BIT, Amt)
+ // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
+ if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget->hasInt256())) &&
+ Op.getOpcode() == ISD::SRA) {
+ SDValue S = DAG.getConstant(APInt::getSignBit(64), dl, VT);
+ SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
+ R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
+ R = DAG.getNode(ISD::XOR, dl, VT, R, M);
+ R = DAG.getNode(ISD::SUB, dl, VT, R, M);
+ return R;
+ }
+
+ // If possible, lower this packed shift into a vector multiply instead of
+ // expanding it into a sequence of scalar shifts.
+ // Do this only if the vector shift count is a constant build_vector.
+ if (Op.getOpcode() == ISD::SHL &&
+ (VT == MVT::v8i16 || VT == MVT::v4i32 ||
+ (Subtarget->hasInt256() && VT == MVT::v16i16)) &&
+ ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
+ SmallVector<SDValue, 8> Elts;
+ MVT SVT = VT.getVectorElementType();
+ unsigned SVTBits = SVT.getSizeInBits();
+ APInt One(SVTBits, 1);
+ unsigned NumElems = VT.getVectorNumElements();
+
+    for (unsigned i = 0; i != NumElems; ++i) {
+ SDValue Op = Amt->getOperand(i);
+ if (Op->getOpcode() == ISD::UNDEF) {
+ Elts.push_back(Op);
+ continue;
+ }
+
+ ConstantSDNode *ND = cast<ConstantSDNode>(Op);
+ APInt C(SVTBits, ND->getAPIntValue().getZExtValue());
+ uint64_t ShAmt = C.getZExtValue();
+ if (ShAmt >= SVTBits) {
+ Elts.push_back(DAG.getUNDEF(SVT));
+ continue;
+ }
+ Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT));
+ }
+ SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Elts);
+ return DAG.getNode(ISD::MUL, dl, VT, R, BV);
+ }
+
+ // Lower SHL with variable shift amount.
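+  // The sequence below builds the IEEE-754 single-precision bit pattern of
+  // 2^Amt per lane: (Amt << 23) places Amt in the exponent field, adding
+  // 0x3f800000 (the bits of 1.0f) applies the exponent bias, and FP_TO_SINT
+  // then yields the integer value 2^Amt, so the final MUL performs the
+  // variable left shift (for in-range shift amounts).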
+ if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
+ Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
+
+ Op = DAG.getNode(ISD::ADD, dl, VT, Op,
+ DAG.getConstant(0x3f800000U, dl, VT));
+ Op = DAG.getBitcast(MVT::v4f32, Op);
+ Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
+ return DAG.getNode(ISD::MUL, dl, VT, Op, R);
+ }
+
+ // If possible, lower this shift as a sequence of two shifts by
+ // constant plus a MOVSS/MOVSD instead of scalarizing it.
+ // Example:
+ // (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
+ //
+ // Could be rewritten as:
+ // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
+ //
+ // The advantage is that the two shifts from the example would be
+ // lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing
+ // the vector shift into four scalar shifts plus four pairs of vector
+ // insert/extract.
+ if ((VT == MVT::v8i16 || VT == MVT::v4i32) &&
+ ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
+ unsigned TargetOpcode = X86ISD::MOVSS;
+ bool CanBeSimplified;
+ // The splat value for the first packed shift (the 'X' from the example).
+ SDValue Amt1 = Amt->getOperand(0);
+ // The splat value for the second packed shift (the 'Y' from the example).
+ SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) :
+ Amt->getOperand(2);
+
+ // See if it is possible to replace this node with a sequence of
+    // two shifts followed by a MOVSS/MOVSD.
+ if (VT == MVT::v4i32) {
+ // Check if it is legal to use a MOVSS.
+ CanBeSimplified = Amt2 == Amt->getOperand(2) &&
+ Amt2 == Amt->getOperand(3);
+ if (!CanBeSimplified) {
+ // Otherwise, check if we can still simplify this node using a MOVSD.
+ CanBeSimplified = Amt1 == Amt->getOperand(1) &&
+ Amt->getOperand(2) == Amt->getOperand(3);
+ TargetOpcode = X86ISD::MOVSD;
+ Amt2 = Amt->getOperand(2);
+ }
+ } else {
+ // Do similar checks for the case where the machine value type
+ // is MVT::v8i16.
+ CanBeSimplified = Amt1 == Amt->getOperand(1);
+      for (unsigned i = 3; i != 8 && CanBeSimplified; ++i)
+ CanBeSimplified = Amt2 == Amt->getOperand(i);
+
+ if (!CanBeSimplified) {
+ TargetOpcode = X86ISD::MOVSD;
+ CanBeSimplified = true;
+ Amt2 = Amt->getOperand(4);
+        for (unsigned i = 0; i != 4 && CanBeSimplified; ++i)
+          CanBeSimplified = Amt1 == Amt->getOperand(i);
+        for (unsigned j = 4; j != 8 && CanBeSimplified; ++j)
+          CanBeSimplified = Amt2 == Amt->getOperand(j);
+ }
+ }
+
+ if (CanBeSimplified && isa<ConstantSDNode>(Amt1) &&
+ isa<ConstantSDNode>(Amt2)) {
+ // Replace this node with two shifts followed by a MOVSS/MOVSD.
+ MVT CastVT = MVT::v4i32;
+ SDValue Splat1 =
+ DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), dl, VT);
+ SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
+ SDValue Splat2 =
+ DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), dl, VT);
+ SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
+ if (TargetOpcode == X86ISD::MOVSD)
+ CastVT = MVT::v2i64;
+ SDValue BitCast1 = DAG.getBitcast(CastVT, Shift1);
+ SDValue BitCast2 = DAG.getBitcast(CastVT, Shift2);
+ SDValue Result = getTargetShuffleNode(TargetOpcode, dl, CastVT, BitCast2,
+ BitCast1, DAG);
+ return DAG.getBitcast(VT, Result);
+ }
+ }
+
+  // v4i32 non-uniform shifts.
+  // If the shift amount is constant, we can shift each lane using the SSE2
+ // immediate shifts, else we need to zero-extend each lane to the lower i64
+ // and shift using the SSE2 variable shifts.
+ // The separate results can then be blended together.
+ if (VT == MVT::v4i32) {
+ unsigned Opc = Op.getOpcode();
+ SDValue Amt0, Amt1, Amt2, Amt3;
+ if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
+ Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
+ Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
+ Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
+ Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
+ } else {
+ // ISD::SHL is handled above but we include it here for completeness.
+ switch (Opc) {
+ default:
+ llvm_unreachable("Unknown target vector shift node");
+ case ISD::SHL:
+ Opc = X86ISD::VSHL;
+ break;
+ case ISD::SRL:
+ Opc = X86ISD::VSRL;
+ break;
+ case ISD::SRA:
+ Opc = X86ISD::VSRA;
+ break;
+ }
+      // The SSE2 shifts use the lower i64 as the shift amount for all lanes,
+      // and the upper i64 is ignored. These shuffle masks optimally
+      // zero-extend each lane on SSE2/SSE41/AVX targets.
+ SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
+ Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
+ Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
+ Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
+ Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
+ }
+
+ SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
+ SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
+ SDValue R2 = DAG.getNode(Opc, dl, VT, R, Amt2);
+ SDValue R3 = DAG.getNode(Opc, dl, VT, R, Amt3);
+ SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
+ SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
+ return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
+ }
+
+ if (VT == MVT::v16i8 ||
+ (VT == MVT::v32i8 && Subtarget->hasInt256() && !Subtarget->hasXOP())) {
+ MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
+ unsigned ShiftOpcode = Op->getOpcode();
+
+ auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
+      // On SSE41 targets we make use of the fact that VSELECT lowers
+      // to PBLENDVB, which selects bytes based just on the sign bit.
+ if (Subtarget->hasSSE41()) {
+ V0 = DAG.getBitcast(VT, V0);
+ V1 = DAG.getBitcast(VT, V1);
+ Sel = DAG.getBitcast(VT, Sel);
+ return DAG.getBitcast(SelVT,
+ DAG.getNode(ISD::VSELECT, dl, VT, Sel, V0, V1));
+ }
+ // On pre-SSE41 targets we test for the sign bit by comparing to
+ // zero - a negative value will set all bits of the lanes to true
+ // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
+ SDValue Z = getZeroVector(SelVT, Subtarget, DAG, dl);
+ SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
+ return DAG.getNode(ISD::VSELECT, dl, SelVT, C, V0, V1);
+ };
+
+ // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
+ // We can safely do this using i16 shifts as we're only interested in
+ // the 3 lower bits of each byte.
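+    // A scalar model of the select ladder below (illustrative sketch): after
+    // the << 5, bit 2 of the shift amount sits in each byte's sign bit, so
+    //   for (int Step = 4; Step >= 1; Step /= 2) {
+    //     if (A & 0x80) R = Shift(R, Step); // PBLENDVB keys off the MSB;
+    //     A <<= 1;                          // Shift = the shl/srl in flight
+    //   }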
+ Amt = DAG.getBitcast(ExtVT, Amt);
+ Amt = DAG.getNode(ISD::SHL, dl, ExtVT, Amt, DAG.getConstant(5, dl, ExtVT));
+ Amt = DAG.getBitcast(VT, Amt);
+
+ if (Op->getOpcode() == ISD::SHL || Op->getOpcode() == ISD::SRL) {
+ // r = VSELECT(r, shift(r, 4), a);
+ SDValue M =
+ DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
+ R = SignBitSelect(VT, Amt, M, R);
+
+ // a += a
+ Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
+
+ // r = VSELECT(r, shift(r, 2), a);
+ M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
+ R = SignBitSelect(VT, Amt, M, R);
+
+ // a += a
+ Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
+
+ // return VSELECT(r, shift(r, 1), a);
+ M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
+ R = SignBitSelect(VT, Amt, M, R);
+ return R;
+ }
+
+ if (Op->getOpcode() == ISD::SRA) {
+      // For SRA we need to unpack each byte to the higher byte of an i16
+      // vector so we can correctly sign extend. We don't care what happens
+      // to the lower byte.
+ SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), Amt);
+ SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), Amt);
+ SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), R);
+ SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), R);
+ ALo = DAG.getBitcast(ExtVT, ALo);
+ AHi = DAG.getBitcast(ExtVT, AHi);
+ RLo = DAG.getBitcast(ExtVT, RLo);
+ RHi = DAG.getBitcast(ExtVT, RHi);
+
+ // r = VSELECT(r, shift(r, 4), a);
+ SDValue MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
+ DAG.getConstant(4, dl, ExtVT));
+ SDValue MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
+ DAG.getConstant(4, dl, ExtVT));
+ RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
+ RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
+
+ // a += a
+ ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
+ AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
+
+ // r = VSELECT(r, shift(r, 2), a);
+ MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
+ DAG.getConstant(2, dl, ExtVT));
+ MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
+ DAG.getConstant(2, dl, ExtVT));
+ RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
+ RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
+
+ // a += a
+ ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
+ AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
+
+ // r = VSELECT(r, shift(r, 1), a);
+ MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
+ DAG.getConstant(1, dl, ExtVT));
+ MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
+ DAG.getConstant(1, dl, ExtVT));
+ RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
+ RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
+
+      // Logical shift the result back to the lower byte, leaving a zero
+      // upper byte, meaning that we can safely pack with PACKUSWB.
+ RLo =
+ DAG.getNode(ISD::SRL, dl, ExtVT, RLo, DAG.getConstant(8, dl, ExtVT));
+ RHi =
+ DAG.getNode(ISD::SRL, dl, ExtVT, RHi, DAG.getConstant(8, dl, ExtVT));
+ return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
+ }
+ }
+
+ // It's worth extending once and using the v8i32 shifts for 16-bit types, but
+ // the extra overheads to get from v16i8 to v8i32 make the existing SSE
+ // solution better.
+ if (Subtarget->hasInt256() && VT == MVT::v8i16) {
+ MVT ExtVT = MVT::v8i32;
+ unsigned ExtOpc =
+ Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+ R = DAG.getNode(ExtOpc, dl, ExtVT, R);
+ Amt = DAG.getNode(ISD::ANY_EXTEND, dl, ExtVT, Amt);
+ return DAG.getNode(ISD::TRUNCATE, dl, VT,
+ DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt));
+ }
+
+ if (Subtarget->hasInt256() && !Subtarget->hasXOP() && VT == MVT::v16i16) {
+ MVT ExtVT = MVT::v8i32;
+ SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
+ SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Amt, Z);
+ SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Amt, Z);
+ SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, R, R);
+ SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, R, R);
+ ALo = DAG.getBitcast(ExtVT, ALo);
+ AHi = DAG.getBitcast(ExtVT, AHi);
+ RLo = DAG.getBitcast(ExtVT, RLo);
+ RHi = DAG.getBitcast(ExtVT, RHi);
+ SDValue Lo = DAG.getNode(Op.getOpcode(), dl, ExtVT, RLo, ALo);
+ SDValue Hi = DAG.getNode(Op.getOpcode(), dl, ExtVT, RHi, AHi);
+ Lo = DAG.getNode(ISD::SRL, dl, ExtVT, Lo, DAG.getConstant(16, dl, ExtVT));
+ Hi = DAG.getNode(ISD::SRL, dl, ExtVT, Hi, DAG.getConstant(16, dl, ExtVT));
+ return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
+ }
+
+ if (VT == MVT::v8i16) {
+ unsigned ShiftOpcode = Op->getOpcode();
+
+ auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
+      // On SSE41 targets we make use of the fact that VSELECT lowers
+      // to PBLENDVB, which selects bytes based just on the sign bit.
+ if (Subtarget->hasSSE41()) {
+ MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
+ V0 = DAG.getBitcast(ExtVT, V0);
+ V1 = DAG.getBitcast(ExtVT, V1);
+ Sel = DAG.getBitcast(ExtVT, Sel);
+ return DAG.getBitcast(
+ VT, DAG.getNode(ISD::VSELECT, dl, ExtVT, Sel, V0, V1));
+ }
+ // On pre-SSE41 targets we splat the sign bit - a negative value will
+ // set all bits of the lanes to true and VSELECT uses that in
+ // its OR(AND(V0,C),AND(V1,~C)) lowering.
+ SDValue C =
+ DAG.getNode(ISD::SRA, dl, VT, Sel, DAG.getConstant(15, dl, VT));
+ return DAG.getNode(ISD::VSELECT, dl, VT, C, V0, V1);
+ };
+
+ // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
+ if (Subtarget->hasSSE41()) {
+ // On SSE41 targets we need to replicate the shift mask in both
+ // bytes for PBLENDVB.
+ Amt = DAG.getNode(
+ ISD::OR, dl, VT,
+ DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(4, dl, VT)),
+ DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT)));
+ } else {
+ Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT));
+ }
+
+ // r = VSELECT(r, shift(r, 8), a);
+ SDValue M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(8, dl, VT));
+ R = SignBitSelect(Amt, M, R);
+
+ // a += a
+ Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
+
+ // r = VSELECT(r, shift(r, 4), a);
+ M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
+ R = SignBitSelect(Amt, M, R);
+
+ // a += a
+ Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
+
+ // r = VSELECT(r, shift(r, 2), a);
+ M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
+ R = SignBitSelect(Amt, M, R);
+
+ // a += a
+ Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
+
+ // return VSELECT(r, shift(r, 1), a);
+ M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
+ R = SignBitSelect(Amt, M, R);
+ return R;
+ }
+
+ // Decompose 256-bit shifts into smaller 128-bit shifts.
+ if (VT.is256BitVector()) {
+ unsigned NumElems = VT.getVectorNumElements();
+ MVT EltVT = VT.getVectorElementType();
+ MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
+
+ // Extract the two vectors
+ SDValue V1 = Extract128BitVector(R, 0, DAG, dl);
+ SDValue V2 = Extract128BitVector(R, NumElems/2, DAG, dl);
+
+ // Recreate the shift amount vectors
+ SDValue Amt1, Amt2;
+ if (Amt.getOpcode() == ISD::BUILD_VECTOR) {
+ // Constant shift amount
+ SmallVector<SDValue, 8> Ops(Amt->op_begin(), Amt->op_begin() + NumElems);
+ ArrayRef<SDValue> Amt1Csts = makeArrayRef(Ops).slice(0, NumElems / 2);
+ ArrayRef<SDValue> Amt2Csts = makeArrayRef(Ops).slice(NumElems / 2);
+
+ Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt1Csts);
+ Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt2Csts);
+ } else {
+ // Variable shift amount
+ Amt1 = Extract128BitVector(Amt, 0, DAG, dl);
+ Amt2 = Extract128BitVector(Amt, NumElems/2, DAG, dl);
+ }
+
+ // Issue new vector shifts for the smaller types
+ V1 = DAG.getNode(Op.getOpcode(), dl, NewVT, V1, Amt1);
+ V2 = DAG.getNode(Op.getOpcode(), dl, NewVT, V2, Amt2);
+
+ // Concatenate the result back
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, V1, V2);
+ }
+
+ return SDValue();
+}
+
+static SDValue LowerRotate(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ MVT VT = Op.getSimpleValueType();
+ SDLoc DL(Op);
+ SDValue R = Op.getOperand(0);
+ SDValue Amt = Op.getOperand(1);
+
+ assert(VT.isVector() && "Custom lowering only for vector rotates!");
+ assert(Subtarget->hasXOP() && "XOP support required for vector rotates!");
+ assert((Op.getOpcode() == ISD::ROTL) && "Only ROTL supported");
+
+ // XOP has 128-bit vector variable + immediate rotates.
+ // +ve/-ve Amt = rotate left/right.
+
+ // Split 256-bit integers.
+ if (VT.is256BitVector())
+ return Lower256IntArith(Op, DAG);
+
+ assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
+
+ // Attempt to rotate by immediate.
+ if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
+ if (auto *RotateConst = BVAmt->getConstantSplatNode()) {
+ uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue();
+ assert(RotateAmt < VT.getScalarSizeInBits() && "Rotation out of range");
+ return DAG.getNode(X86ISD::VPROTI, DL, VT, R,
+ DAG.getConstant(RotateAmt, DL, MVT::i8));
+ }
+ }
+
+ // Use general rotate by variable (per-element).
+ return DAG.getNode(X86ISD::VPROT, DL, VT, R, Amt);
+}
+
+static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
+ // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
+ // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
+ // looks for this combo and may remove the "setcc" instruction if the "setcc"
+ // has only one use.
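+  // For example, a call to @llvm.sadd.with.overflow.i32 arrives here as an
+  // ISD::SADDO node and is lowered to an X86ISD::ADD whose EFLAGS result
+  // feeds a SETCC on X86::COND_O.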
+ SDNode *N = Op.getNode();
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ unsigned BaseOp = 0;
+ unsigned Cond = 0;
+ SDLoc DL(Op);
+ switch (Op.getOpcode()) {
+ default: llvm_unreachable("Unknown ovf instruction!");
+ case ISD::SADDO:
+    // An add of one will be selected as an INC. Note that INC doesn't
+    // set CF, so we can't do this for UADDO.
+ if (isOneConstant(RHS)) {
+ BaseOp = X86ISD::INC;
+ Cond = X86::COND_O;
+ break;
+ }
+ BaseOp = X86ISD::ADD;
+ Cond = X86::COND_O;
+ break;
+ case ISD::UADDO:
+ BaseOp = X86ISD::ADD;
+ Cond = X86::COND_B;
+ break;
+ case ISD::SSUBO:
+ // A subtract of one will be selected as a DEC. Note that DEC doesn't
+ // set CF, so we can't do this for USUBO.
+ if (isOneConstant(RHS)) {
+ BaseOp = X86ISD::DEC;
+ Cond = X86::COND_O;
+ break;
+ }
+ BaseOp = X86ISD::SUB;
+ Cond = X86::COND_O;
+ break;
+ case ISD::USUBO:
+ BaseOp = X86ISD::SUB;
+ Cond = X86::COND_B;
+ break;
+ case ISD::SMULO:
+ BaseOp = N->getValueType(0) == MVT::i8 ? X86ISD::SMUL8 : X86ISD::SMUL;
+ Cond = X86::COND_O;
+ break;
+ case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
+ if (N->getValueType(0) == MVT::i8) {
+ BaseOp = X86ISD::UMUL8;
+ Cond = X86::COND_O;
+ break;
+ }
+ SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0),
+ MVT::i32);
+ SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS);
+
+ SDValue SetCC =
+ DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
+ DAG.getConstant(X86::COND_O, DL, MVT::i32),
+ SDValue(Sum.getNode(), 2));
+
+ return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
+ }
+ }
+
+ // Also sets EFLAGS.
+ SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
+ SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
+
+ SDValue SetCC =
+ DAG.getNode(X86ISD::SETCC, DL, N->getValueType(1),
+ DAG.getConstant(Cond, DL, MVT::i32),
+ SDValue(Sum.getNode(), 1));
+
+ return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
+}
+
+/// Returns true if the operand type is exactly twice the native width, and
+/// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
+/// Used to know whether to use cmpxchg8/16b when expanding atomic operations
+/// (otherwise we leave them alone to become __sync_fetch_and_... calls).
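+/// For example, on 32-bit x86 an atomic i64 access (twice the 32-bit native
+/// width) is expanded with cmpxchg8b, and on 64-bit an atomic i128 access
+/// with cmpxchg16b when the subtarget provides it.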
+bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
+ unsigned OpWidth = MemType->getPrimitiveSizeInBits();
+
+ if (OpWidth == 64)
+ return !Subtarget->is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b
+ else if (OpWidth == 128)
+ return Subtarget->hasCmpxchg16b();
+ else
+ return false;
+}
+
+bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
+ return needsCmpXchgNb(SI->getValueOperand()->getType());
+}
+
+// Note: this turns large loads into lock cmpxchg8b/16b.
+// FIXME: On 32-bit x86, fild/movq might be faster than lock cmpxchg8b.
+TargetLowering::AtomicExpansionKind
+X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
+ auto PTy = cast<PointerType>(LI->getPointerOperand()->getType());
+ return needsCmpXchgNb(PTy->getElementType()) ? AtomicExpansionKind::CmpXChg
+ : AtomicExpansionKind::None;
+}
+
+TargetLowering::AtomicExpansionKind
+X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
+ unsigned NativeWidth = Subtarget->is64Bit() ? 64 : 32;
+ Type *MemType = AI->getType();
+
+ // If the operand is too big, we must see if cmpxchg8/16b is available
+ // and default to library calls otherwise.
+ if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
+ return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
+ : AtomicExpansionKind::None;
+ }
+
+ AtomicRMWInst::BinOp Op = AI->getOperation();
+ switch (Op) {
+ default:
+ llvm_unreachable("Unknown atomic operation");
+ case AtomicRMWInst::Xchg:
+ case AtomicRMWInst::Add:
+ case AtomicRMWInst::Sub:
+ // It's better to use xadd, xsub or xchg for these in all cases.
+ return AtomicExpansionKind::None;
+ case AtomicRMWInst::Or:
+ case AtomicRMWInst::And:
+ case AtomicRMWInst::Xor:
+ // If the atomicrmw's result isn't actually used, we can just add a "lock"
+ // prefix to a normal instruction for these operations.
+ return !AI->use_empty() ? AtomicExpansionKind::CmpXChg
+ : AtomicExpansionKind::None;
+ case AtomicRMWInst::Nand:
+ case AtomicRMWInst::Max:
+ case AtomicRMWInst::Min:
+ case AtomicRMWInst::UMax:
+ case AtomicRMWInst::UMin:
+ // These always require a non-trivial set of data operations on x86. We must
+ // use a cmpxchg loop.
+ return AtomicExpansionKind::CmpXChg;
+ }
+}
+
+static bool hasMFENCE(const X86Subtarget& Subtarget) {
+ // Use mfence if we have SSE2 or we're on x86-64 (even if we asked for
+ // no-sse2). There isn't any reason to disable it if the target processor
+ // supports it.
+ return Subtarget.hasSSE2() || Subtarget.is64Bit();
+}
+
+LoadInst *
+X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
+ unsigned NativeWidth = Subtarget->is64Bit() ? 64 : 32;
+ Type *MemType = AI->getType();
+ // Accesses larger than the native width are turned into cmpxchg/libcalls, so
+ // there is no benefit in turning such RMWs into loads, and it is actually
+  // harmful as it introduces an mfence.
+ if (MemType->getPrimitiveSizeInBits() > NativeWidth)
+ return nullptr;
+
+ auto Builder = IRBuilder<>(AI);
+ Module *M = Builder.GetInsertBlock()->getParent()->getParent();
+ auto SynchScope = AI->getSynchScope();
+ // We must restrict the ordering to avoid generating loads with Release or
+ // ReleaseAcquire orderings.
+ auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
+ auto Ptr = AI->getPointerOperand();
+
+ // Before the load we need a fence. Here is an example lifted from
+ // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
+ // is required:
+ // Thread 0:
+ // x.store(1, relaxed);
+ // r1 = y.fetch_add(0, release);
+ // Thread 1:
+ // y.fetch_add(42, acquire);
+ // r2 = x.load(relaxed);
+  // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
+  // lowered to just a load without a fence. An mfence flushes the store
+  // buffer, making the optimization clearly correct.
+ // FIXME: it is required if isAtLeastRelease(Order) but it is not clear
+ // otherwise, we might be able to be more aggressive on relaxed idempotent
+ // rmw. In practice, they do not look useful, so we don't try to be
+ // especially clever.
+ if (SynchScope == SingleThread)
+ // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
+ // the IR level, so we must wrap it in an intrinsic.
+ return nullptr;
+
+ if (!hasMFENCE(*Subtarget))
+ // FIXME: it might make sense to use a locked operation here but on a
+ // different cache-line to prevent cache-line bouncing. In practice it
+ // is probably a small win, and x86 processors without mfence are rare
+ // enough that we do not bother.
+ return nullptr;
+
+ Function *MFence =
+ llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
+ Builder.CreateCall(MFence, {});
+
+ // Finally we can emit the atomic load.
+ LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr,
+ AI->getType()->getPrimitiveSizeInBits());
+ Loaded->setAtomic(Order, SynchScope);
+ AI->replaceAllUsesWith(Loaded);
+ AI->eraseFromParent();
+ return Loaded;
+}
+
+static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ SDLoc dl(Op);
+ AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
+ cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
+ SynchronizationScope FenceScope = static_cast<SynchronizationScope>(
+ cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
+
+ // The only fence that needs an instruction is a sequentially-consistent
+ // cross-thread fence.
+ if (FenceOrdering == SequentiallyConsistent && FenceScope == CrossThread) {
+ if (hasMFENCE(*Subtarget))
+ return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
+
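+    // Without MFENCE, fall back to a locked "or dword ptr [esp], 0"; any
+    // LOCK-prefixed read-modify-write acts as a full memory barrier on x86.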
+ SDValue Chain = Op.getOperand(0);
+ SDValue Zero = DAG.getConstant(0, dl, MVT::i32);
+ SDValue Ops[] = {
+ DAG.getRegister(X86::ESP, MVT::i32), // Base
+ DAG.getTargetConstant(1, dl, MVT::i8), // Scale
+ DAG.getRegister(0, MVT::i32), // Index
+ DAG.getTargetConstant(0, dl, MVT::i32), // Disp
+ DAG.getRegister(0, MVT::i32), // Segment.
+ Zero,
+ Chain
+ };
+ SDNode *Res = DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops);
+ return SDValue(Res, 0);
+ }
+
+ // MEMBARRIER is a compiler barrier; it codegens to a no-op.
+ return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
+}
+
+static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ MVT T = Op.getSimpleValueType();
+ SDLoc DL(Op);
+ unsigned Reg = 0;
+ unsigned size = 0;
+ switch(T.SimpleTy) {
+ default: llvm_unreachable("Invalid value type!");
+ case MVT::i8: Reg = X86::AL; size = 1; break;
+ case MVT::i16: Reg = X86::AX; size = 2; break;
+ case MVT::i32: Reg = X86::EAX; size = 4; break;
+ case MVT::i64:
+ assert(Subtarget->is64Bit() && "Node not type legal!");
+ Reg = X86::RAX; size = 8;
+ break;
+ }
+ SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
+ Op.getOperand(2), SDValue());
+ SDValue Ops[] = { cpIn.getValue(0),
+ Op.getOperand(1),
+ Op.getOperand(3),
+ DAG.getTargetConstant(size, DL, MVT::i8),
+ cpIn.getValue(1) };
+ SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
+ MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
+ SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
+ Ops, T, MMO);
+
+ SDValue cpOut =
+ DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
+ SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
+ MVT::i32, cpOut.getValue(2));
+ SDValue Success = DAG.getNode(X86ISD::SETCC, DL, Op->getValueType(1),
+ DAG.getConstant(X86::COND_E, DL, MVT::i8),
+ EFLAGS);
+
+ DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut);
+ DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
+ DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1));
+ return SDValue();
+}
+
+static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ MVT SrcVT = Op.getOperand(0).getSimpleValueType();
+ MVT DstVT = Op.getSimpleValueType();
+
+ if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) {
+ assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
+ if (DstVT != MVT::f64)
+ // This conversion needs to be expanded.
+ return SDValue();
+
+ SDValue InVec = Op->getOperand(0);
+ SDLoc dl(Op);
+ unsigned NumElts = SrcVT.getVectorNumElements();
+ MVT SVT = SrcVT.getVectorElementType();
+
+    // Widen the input vector in the case of MVT::v2i32.
+ // Example: from MVT::v2i32 to MVT::v4i32.
+ SmallVector<SDValue, 16> Elts;
+ for (unsigned i = 0, e = NumElts; i != e; ++i)
+ Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, InVec,
+ DAG.getIntPtrConstant(i, dl)));
+
+ // Explicitly mark the extra elements as Undef.
+ Elts.append(NumElts, DAG.getUNDEF(SVT));
+
+ EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
+ SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Elts);
+ SDValue ToV2F64 = DAG.getBitcast(MVT::v2f64, BV);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64,
+ DAG.getIntPtrConstant(0, dl));
+ }
+
+ assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() &&
+ Subtarget->hasMMX() && "Unexpected custom BITCAST");
+ assert((DstVT == MVT::i64 ||
+         (DstVT.isVector() && DstVT.getSizeInBits() == 64)) &&
+ "Unexpected custom BITCAST");
+ // i64 <=> MMX conversions are Legal.
+  if (SrcVT == MVT::i64 && DstVT.isVector())
+ return Op;
+  if (DstVT == MVT::i64 && SrcVT.isVector())
+ return Op;
+ // MMX <=> MMX conversions are Legal.
+ if (SrcVT.isVector() && DstVT.isVector())
+ return Op;
+ // All other conversions need to be expanded.
+ return SDValue();
+}
+
+/// Compute the horizontal sum of bytes in V for the elements of VT.
+///
+/// Requires V to be a byte vector and VT to be an integer vector type with
+/// wider elements than V's type. The width of the elements of VT determines
+/// how many bytes of V are summed horizontally to produce each element of the
+/// result.
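+///
+/// For example, for VT == v2i64 a single PSADBW of V against zero leaves the
+/// horizontal byte sum of each 8-byte half zero-extended in the corresponding
+/// i64 lane, which is already the desired result.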
+static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ SDLoc DL(V);
+ MVT ByteVecVT = V.getSimpleValueType();
+ MVT EltVT = VT.getVectorElementType();
+ int NumElts = VT.getVectorNumElements();
+ assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
+ "Expected value to have byte element type.");
+ assert(EltVT != MVT::i8 &&
+ "Horizontal byte sum only makes sense for wider elements!");
+ unsigned VecSize = VT.getSizeInBits();
+ assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
+
+  // The PSADBW instruction horizontally adds all bytes and leaves the result
+  // in i64 chunks, thus directly computing the pop count for v2i64 and v4i64.
+ if (EltVT == MVT::i64) {
+ SDValue Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
+ MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
+ V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
+ return DAG.getBitcast(VT, V);
+ }
+
+ if (EltVT == MVT::i32) {
+ // We unpack the low half and high half into i32s interleaved with zeros so
+ // that we can use PSADBW to horizontally sum them. The most useful part of
+ // this is that it lines up the results of two PSADBW instructions to be
+ // two v2i64 vectors which concatenated are the 4 population counts. We can
+ // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
+ SDValue Zeros = getZeroVector(VT, Subtarget, DAG, DL);
+ SDValue Low = DAG.getNode(X86ISD::UNPCKL, DL, VT, V, Zeros);
+ SDValue High = DAG.getNode(X86ISD::UNPCKH, DL, VT, V, Zeros);
+
+ // Do the horizontal sums into two v2i64s.
+ Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
+ MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
+ Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
+ DAG.getBitcast(ByteVecVT, Low), Zeros);
+ High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
+ DAG.getBitcast(ByteVecVT, High), Zeros);
+
+ // Merge them together.
+ MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
+ V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
+ DAG.getBitcast(ShortVecVT, Low),
+ DAG.getBitcast(ShortVecVT, High));
+
+ return DAG.getBitcast(VT, V);
+ }
+
+ // The only element type left is i16.
+ assert(EltVT == MVT::i16 && "Unknown how to handle type");
+
+  // To obtain the pop count for each i16 element starting from the pop count
+  // for i8 elements, shift the i16s left by 8, sum as i8s, and then shift the
+  // i16s right by 8. It is important to shift as i16s because an i8 vector
+  // shift isn't directly supported.
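+  // Scalar model for one i16 lane (illustrative): with V = (HiCnt << 8) |
+  // LoCnt, uint16_t((V + (V << 8)) >> 8) == HiCnt + LoCnt; both counts are
+  // at most 8, so the byte-wise adds can never carry between lanes.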
+ SmallVector<SDValue, 16> Shifters(NumElts, DAG.getConstant(8, DL, EltVT));
+ SDValue Shifter = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Shifters);
+ SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), Shifter);
+ V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
+ DAG.getBitcast(ByteVecVT, V));
+ return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), Shifter);
+}
+
+static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, SDLoc DL,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ MVT VT = Op.getSimpleValueType();
+ MVT EltVT = VT.getVectorElementType();
+ unsigned VecSize = VT.getSizeInBits();
+
+ // Implement a lookup table in register by using an algorithm based on:
+ // http://wm.ite.pl/articles/sse-popcount.html
+ //
+  // The general idea is that every lower byte nibble in the input vector is
+  // an index into an in-register pre-computed pop count table. We then split
+  // up the input vector into two new ones: (1) a vector with only the
+  // shifted-right higher nibbles for each byte and (2) a vector with the
+  // lower nibbles (and masked out higher ones) for each byte. PSHUFB is used
+  // separately with both to index the in-register table. Next, both are
+  // added and the result is an i8 vector where each element contains the pop
+  // count for the input byte.
+ //
+ // To obtain the pop count for elements != i8, we follow up with the same
+ // approach and use additional tricks as described below.
+ //
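+  // A scalar rendition of the per-byte step (illustrative sketch only):
+  //   static const uint8_t LUT[16] = {0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4};
+  //   uint8_t PopCnt8(uint8_t B) { return LUT[B >> 4] + LUT[B & 0xF]; }
+  //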
+ const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
+ /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
+ /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
+ /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
+
+ int NumByteElts = VecSize / 8;
+ MVT ByteVecVT = MVT::getVectorVT(MVT::i8, NumByteElts);
+ SDValue In = DAG.getBitcast(ByteVecVT, Op);
+ SmallVector<SDValue, 16> LUTVec;
+ for (int i = 0; i < NumByteElts; ++i)
+ LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
+ SDValue InRegLUT = DAG.getNode(ISD::BUILD_VECTOR, DL, ByteVecVT, LUTVec);
+ SmallVector<SDValue, 16> Mask0F(NumByteElts,
+ DAG.getConstant(0x0F, DL, MVT::i8));
+ SDValue M0F = DAG.getNode(ISD::BUILD_VECTOR, DL, ByteVecVT, Mask0F);
+
+ // High nibbles
+ SmallVector<SDValue, 16> Four(NumByteElts, DAG.getConstant(4, DL, MVT::i8));
+ SDValue FourV = DAG.getNode(ISD::BUILD_VECTOR, DL, ByteVecVT, Four);
+ SDValue HighNibbles = DAG.getNode(ISD::SRL, DL, ByteVecVT, In, FourV);
+
+ // Low nibbles
+ SDValue LowNibbles = DAG.getNode(ISD::AND, DL, ByteVecVT, In, M0F);
+
+  // The input vector is used as the shuffle mask that indexes elements into
+  // the LUT. After counting low and high nibbles, add the vectors to obtain
+  // the final pop count per i8 element.
+ SDValue HighPopCnt =
+ DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, HighNibbles);
+ SDValue LowPopCnt =
+ DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, LowNibbles);
+ SDValue PopCnt = DAG.getNode(ISD::ADD, DL, ByteVecVT, HighPopCnt, LowPopCnt);
+
+ if (EltVT == MVT::i8)
+ return PopCnt;
+
+ return LowerHorizontalByteSum(PopCnt, VT, Subtarget, DAG);
+}
+
+static SDValue LowerVectorCTPOPBitmath(SDValue Op, SDLoc DL,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ MVT VT = Op.getSimpleValueType();
+ assert(VT.is128BitVector() &&
+ "Only 128-bit vector bitmath lowering supported.");
+
+ int VecSize = VT.getSizeInBits();
+ MVT EltVT = VT.getVectorElementType();
+ int Len = EltVT.getSizeInBits();
+
+ // This is the vectorized version of the "best" algorithm from
+ // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
+ // with a minor tweak to use a series of adds + shifts instead of vector
+ // multiplications. Implemented for all integer vector types. We only use
+  // this when we don't have SSSE3, which allows a LUT-based lowering that is
+ // much faster, even faster than using native popcnt instructions.
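+  //
+  // The scalar equivalent per 32-bit element (illustrative; the closing
+  // multiply of the classic version is what the add/shift byte sums below
+  // replace):
+  //   uint32_t PopCnt32(uint32_t V) {
+  //     V = V - ((V >> 1) & 0x55555555);
+  //     V = (V & 0x33333333) + ((V >> 2) & 0x33333333);
+  //     V = (V + (V >> 4)) & 0x0F0F0F0F; // byte-wise pop counts
+  //     return (V * 0x01010101) >> 24;   // horizontal byte sum
+  //   }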
+
+ auto GetShift = [&](unsigned OpCode, SDValue V, int Shifter) {
+ MVT VT = V.getSimpleValueType();
+ SmallVector<SDValue, 32> Shifters(
+ VT.getVectorNumElements(),
+ DAG.getConstant(Shifter, DL, VT.getVectorElementType()));
+ return DAG.getNode(OpCode, DL, VT, V,
+ DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Shifters));
+ };
+ auto GetMask = [&](SDValue V, APInt Mask) {
+ MVT VT = V.getSimpleValueType();
+ SmallVector<SDValue, 32> Masks(
+ VT.getVectorNumElements(),
+ DAG.getConstant(Mask, DL, VT.getVectorElementType()));
+ return DAG.getNode(ISD::AND, DL, VT, V,
+ DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Masks));
+ };
+
+ // We don't want to incur the implicit masks required to SRL vNi8 vectors on
+ // x86, so set the SRL type to have elements at least i16 wide. This is
+  // correct because all of our SRLs are followed immediately by a mask anyway
+ // that handles any bits that sneak into the high bits of the byte elements.
+ MVT SrlVT = Len > 8 ? VT : MVT::getVectorVT(MVT::i16, VecSize / 16);
+
+ SDValue V = Op;
+
+ // v = v - ((v >> 1) & 0x55555555...)
+ SDValue Srl =
+ DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 1));
+ SDValue And = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x55)));
+ V = DAG.getNode(ISD::SUB, DL, VT, V, And);
+
+ // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
+ SDValue AndLHS = GetMask(V, APInt::getSplat(Len, APInt(8, 0x33)));
+ Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 2));
+ SDValue AndRHS = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x33)));
+ V = DAG.getNode(ISD::ADD, DL, VT, AndLHS, AndRHS);
+
+ // v = (v + (v >> 4)) & 0x0F0F0F0F...
+ Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 4));
+ SDValue Add = DAG.getNode(ISD::ADD, DL, VT, V, Srl);
+ V = GetMask(Add, APInt::getSplat(Len, APInt(8, 0x0F)));
+
+ // At this point, V contains the byte-wise population count, and we are
+ // merely doing a horizontal sum if necessary to get the wider element
+ // counts.
+ if (EltVT == MVT::i8)
+ return V;
+
+ return LowerHorizontalByteSum(
+ DAG.getBitcast(MVT::getVectorVT(MVT::i8, VecSize / 8), V), VT, Subtarget,
+ DAG);
+}
+
+static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ MVT VT = Op.getSimpleValueType();
+ // FIXME: Need to add AVX-512 support here!
+ assert((VT.is256BitVector() || VT.is128BitVector()) &&
+ "Unknown CTPOP type to handle");
+ SDLoc DL(Op.getNode());
+ SDValue Op0 = Op.getOperand(0);
+
+ if (!Subtarget->hasSSSE3()) {
+ // We can't use the fast LUT approach, so fall back on vectorized bitmath.
+ assert(VT.is128BitVector() && "Only 128-bit vectors supported in SSE!");
+ return LowerVectorCTPOPBitmath(Op0, DL, Subtarget, DAG);
+ }
+
+ if (VT.is256BitVector() && !Subtarget->hasInt256()) {
+ unsigned NumElems = VT.getVectorNumElements();
+
+ // Extract each 128-bit vector, compute pop count and concat the result.
+ SDValue LHS = Extract128BitVector(Op0, 0, DAG, DL);
+ SDValue RHS = Extract128BitVector(Op0, NumElems/2, DAG, DL);
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
+ LowerVectorCTPOPInRegLUT(LHS, DL, Subtarget, DAG),
+ LowerVectorCTPOPInRegLUT(RHS, DL, Subtarget, DAG));
+ }
+
+ return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
+}
+
+static SDValue LowerCTPOP(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ assert(Op.getSimpleValueType().isVector() &&
+ "We only do custom lowering for vector population count.");
+ return LowerVectorCTPOP(Op, Subtarget, DAG);
+}
+
+static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) {
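+  // x86 has XADD for atomic fetch-and-add but no fetch-and-subtract form
+  // that returns the old value, so lower ATOMIC_LOAD_SUB as an
+  // ATOMIC_LOAD_ADD of the negated operand.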
+ SDNode *Node = Op.getNode();
+ SDLoc dl(Node);
+ EVT T = Node->getValueType(0);
+ SDValue negOp = DAG.getNode(ISD::SUB, dl, T,
+ DAG.getConstant(0, dl, T), Node->getOperand(2));
+ return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl,
+ cast<AtomicSDNode>(Node)->getMemoryVT(),
+ Node->getOperand(0),
+ Node->getOperand(1), negOp,
+ cast<AtomicSDNode>(Node)->getMemOperand(),
+ cast<AtomicSDNode>(Node)->getOrdering(),
+ cast<AtomicSDNode>(Node)->getSynchScope());
+}
+
+static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
+ SDNode *Node = Op.getNode();
+ SDLoc dl(Node);
+ EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
+
+ // Convert seq_cst store -> xchg
+ // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
+ // FIXME: On 32-bit, store -> fist or movq would be more efficient
+ // (The only way to get a 16-byte store is cmpxchg16b)
+ // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
+ if (cast<AtomicSDNode>(Node)->getOrdering() == SequentiallyConsistent ||
+ !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
+ SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
+ cast<AtomicSDNode>(Node)->getMemoryVT(),
+ Node->getOperand(0),
+ Node->getOperand(1), Node->getOperand(2),
+ cast<AtomicSDNode>(Node)->getMemOperand(),
+ cast<AtomicSDNode>(Node)->getOrdering(),
+ cast<AtomicSDNode>(Node)->getSynchScope());
+ return Swap.getValue(1);
+ }
+ // Other atomic stores have a simple pattern.
+ return Op;
+}
+
+static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
+ MVT VT = Op.getNode()->getSimpleValueType(0);
+
+ // Let legalize expand this if it isn't a legal type yet.
+ if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
+ return SDValue();
+
+ SDVTList VTs = DAG.getVTList(VT, MVT::i32);
+
+ unsigned Opc;
+ bool ExtraOp = false;
+ switch (Op.getOpcode()) {
+ default: llvm_unreachable("Invalid code");
+ case ISD::ADDC: Opc = X86ISD::ADD; break;
+ case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break;
+ case ISD::SUBC: Opc = X86ISD::SUB; break;
+ case ISD::SUBE: Opc = X86ISD::SBB; ExtraOp = true; break;
+ }
+
+ if (!ExtraOp)
+ return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
+ Op.getOperand(1));
+ return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
+ Op.getOperand(1), Op.getOperand(2));
+}
+
+static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ assert(Subtarget->isTargetDarwin() && Subtarget->is64Bit());
+
+ // For MacOSX, we want to call an alternative entry point: __sincos_stret,
+ // which returns the values as { float, float } (in XMM0) or
+ // { double, double } (which is returned in XMM0, XMM1).
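+  // A single call thus replaces a separate sin/cos pair; for f32 the two
+  // results are extracted from xmm0 below, while for f64 the
+  // { double, double } return is used as-is.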
+ SDLoc dl(Op);
+ SDValue Arg = Op.getOperand(0);
+ EVT ArgVT = Arg.getValueType();
+ Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
+
+ TargetLowering::ArgListTy Args;
+ TargetLowering::ArgListEntry Entry;
+
+ Entry.Node = Arg;
+ Entry.Ty = ArgTy;
+ Entry.isSExt = false;
+ Entry.isZExt = false;
+ Args.push_back(Entry);
+
+ bool isF64 = ArgVT == MVT::f64;
+ // Only optimize x86_64 for now. i386 is a bit messy. For f32,
+ // the small struct {f32, f32} is returned in (eax, edx). For f64,
+ // the results are returned via SRet in memory.
+ const char *LibcallName = isF64 ? "__sincos_stret" : "__sincosf_stret";
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SDValue Callee =
+ DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
+
+ Type *RetTy = isF64
+ ? (Type*)StructType::get(ArgTy, ArgTy, nullptr)
+ : (Type*)VectorType::get(ArgTy, 4);
+
+ TargetLowering::CallLoweringInfo CLI(DAG);
+ CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
+ .setCallee(CallingConv::C, RetTy, Callee, std::move(Args), 0);
+
+ std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
+
+ if (isF64)
+ // Returned in xmm0 and xmm1.
+ return CallResult.first;
+
+ // Returned in bits 0:31 and 32:64 xmm0.
+ SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
+ CallResult.first, DAG.getIntPtrConstant(0, dl));
+ SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
+ CallResult.first, DAG.getIntPtrConstant(1, dl));
+ SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
+ return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
+}
+
+/// Widen a vector input to a vector of NVT. The
+/// input vector must have the same element type as NVT.
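+/// For example, widening v2i32 to v4i32 places the input at offset 0 of an
+/// undef (or zero, if FillWithZeroes) v4i32 via INSERT_SUBVECTOR.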
+static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
+ bool FillWithZeroes = false) {
+ // Check if InOp already has the right width.
+ MVT InVT = InOp.getSimpleValueType();
+ if (InVT == NVT)
+ return InOp;
+
+ if (InOp.isUndef())
+ return DAG.getUNDEF(NVT);
+
+ assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
+ "input and widen element type must match");
+
+ unsigned InNumElts = InVT.getVectorNumElements();
+ unsigned WidenNumElts = NVT.getVectorNumElements();
+ assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
+ "Unexpected request for vector widening");
+
+ EVT EltVT = NVT.getVectorElementType();
+
+ SDLoc dl(InOp);
+ if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
+ InOp.getNumOperands() == 2) {
+ SDValue N1 = InOp.getOperand(1);
+ if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
+ N1.isUndef()) {
+ InOp = InOp.getOperand(0);
+ InVT = InOp.getSimpleValueType();
+ InNumElts = InVT.getVectorNumElements();
+ }
+ }
+ if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
+ ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
+ SmallVector<SDValue, 16> Ops;
+ for (unsigned i = 0; i < InNumElts; ++i)
+ Ops.push_back(InOp.getOperand(i));
+
+ SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
+ DAG.getUNDEF(EltVT);
+ for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
+ Ops.push_back(FillVal);
+ return DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, Ops);
+ }
+ SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
+ DAG.getUNDEF(NVT);
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
+ InOp, DAG.getIntPtrConstant(0, dl));
+}
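+
+// Example of what ExtendToType above produces: widening a v2i32 value X to
+// v4i32 with FillWithZeroes = true conceptually yields
+//
+//   BUILD_VECTOR <X0, X1, 0, 0>            // constant build_vector input
+//   INSERT_SUBVECTOR zero_v4i32, X, 0      // any other input
+//
+// With FillWithZeroes = false the padding lanes are undef instead of zero.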
+
+static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ assert(Subtarget->hasAVX512() &&
+ "MGATHER/MSCATTER are supported on AVX-512 arch only");
+
+  // X86 scatter kills the mask register, so its type should be added to
+  // the list of return values.
+ // If the "scatter" has 2 return values, it is already handled.
+ if (Op.getNode()->getNumValues() == 2)
+ return Op;
+
+ MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
+ SDValue Src = N->getValue();
+ MVT VT = Src.getSimpleValueType();
+ assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
+ SDLoc dl(Op);
+
+ SDValue NewScatter;
+ SDValue Index = N->getIndex();
+ SDValue Mask = N->getMask();
+ SDValue Chain = N->getChain();
+ SDValue BasePtr = N->getBasePtr();
+ MVT MemVT = N->getMemoryVT().getSimpleVT();
+ MVT IndexVT = Index.getSimpleValueType();
+ MVT MaskVT = Mask.getSimpleValueType();
+
+ if (MemVT.getScalarSizeInBits() < VT.getScalarSizeInBits()) {
+ // The v2i32 value was promoted to v2i64.
+ // Now we "redo" the type legalizer's work and widen the original
+ // v2i32 value to v4i32. The original v2i32 is retrieved from v2i64
+ // with a shuffle.
+ assert((MemVT == MVT::v2i32 && VT == MVT::v2i64) &&
+ "Unexpected memory type");
+ int ShuffleMask[] = {0, 2, -1, -1};
+ Src = DAG.getVectorShuffle(MVT::v4i32, dl, DAG.getBitcast(MVT::v4i32, Src),
+ DAG.getUNDEF(MVT::v4i32), ShuffleMask);
+ // Now we have 4 elements instead of 2.
+ // Expand the index.
+ MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), 4);
+ Index = ExtendToType(Index, NewIndexVT, DAG);
+
+    // Expand the mask with zeroes.
+    // The mask may be <2 x i64> or <2 x i1> at this point.
+ assert((MaskVT == MVT::v2i1 || MaskVT == MVT::v2i64) &&
+ "Unexpected mask type");
+ MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), 4);
+ Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
+ VT = MVT::v4i32;
+ }
+
+ unsigned NumElts = VT.getVectorNumElements();
+ if (!Subtarget->hasVLX() && !VT.is512BitVector() &&
+ !Index.getSimpleValueType().is512BitVector()) {
+    // AVX512F supports only 512-bit vectors, so either the data or the
+    // index must be 512 bits wide. If both the index and the data are
+    // currently 256-bit but the vector contains 8 elements, just
+    // sign-extend the index.
+ if (IndexVT == MVT::v8i32)
+ // Just extend index
+ Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
+ else {
+      // The minimal number of elements in a scatter is 8.
+ NumElts = 8;
+ // Index
+ MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
+ // Use original index here, do not modify the index twice
+ Index = ExtendToType(N->getIndex(), NewIndexVT, DAG);
+ if (IndexVT.getScalarType() == MVT::i32)
+ Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
+
+ // Mask
+      // At this point the mask operand has been promoted.
+ assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
+ MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
+ // Use the original mask here, do not modify the mask twice
+ Mask = ExtendToType(N->getMask(), ExtMaskVT, DAG, true);
+
+ // The value that should be stored
+ MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
+ Src = ExtendToType(Src, NewVT, DAG);
+ }
+ }
+  // If the mask is "wide" at this point, truncate it to an i1 vector.
+ MVT BitMaskVT = MVT::getVectorVT(MVT::i1, NumElts);
+ Mask = DAG.getNode(ISD::TRUNCATE, dl, BitMaskVT, Mask);
+
+  // The mask is killed by the scatter, so add it to the returned values.
+ SDVTList VTs = DAG.getVTList(BitMaskVT, MVT::Other);
+ SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index};
+ NewScatter = DAG.getMaskedScatter(VTs, N->getMemoryVT(), dl, Ops,
+ N->getMemOperand());
+ DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
+ return SDValue(NewScatter.getNode(), 0);
+}
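+
+// Worked example for the widening above (AVX512F without VLX): a v4i32
+// scatter with a v4i64 index is padded to 8 elements: the data becomes
+// v8i32, the index v8i64, and the mask is widened with zero lanes before
+// being truncated to v8i1, so the four added lanes are inactive and store
+// nothing. LowerMGATHER below applies the same scheme to gathers.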
+
+static SDValue LowerMLOAD(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+
+ MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
+ MVT VT = Op.getSimpleValueType();
+ SDValue Mask = N->getMask();
+ SDLoc dl(Op);
+
+ if (Subtarget->hasAVX512() && !Subtarget->hasVLX() &&
+ !VT.is512BitVector() && Mask.getValueType() == MVT::v8i1) {
+    // This operation is legal for targets with VLX, but without
+    // VLX the vector should be widened to 512 bits.
+ unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
+ MVT WideDataVT = MVT::getVectorVT(VT.getScalarType(), NumEltsInWideVec);
+ MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
+ SDValue Src0 = N->getSrc0();
+ Src0 = ExtendToType(Src0, WideDataVT, DAG);
+ Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
+ SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(),
+ N->getBasePtr(), Mask, Src0,
+ N->getMemoryVT(), N->getMemOperand(),
+ N->getExtensionType());
+
+    SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
+                                  NewLoad.getValue(0),
+                                  DAG.getIntPtrConstant(0, dl));
+    SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
+ return DAG.getMergeValues(RetOps, dl);
+ }
+ return Op;
+}
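+
+// Example of the widening above: on an AVX512F-only target a masked load
+// of v8f32 with a v8i1 mask is rebuilt as a v16f32 load with a v16i1 mask
+// whose top eight lanes are zero, so those lanes load nothing; the
+// original value is then the low v8f32 subvector of the wide result.
+// LowerMSTORE below mirrors this for masked stores.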
+
+static SDValue LowerMSTORE(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
+ SDValue DataToStore = N->getValue();
+ MVT VT = DataToStore.getSimpleValueType();
+ SDValue Mask = N->getMask();
+ SDLoc dl(Op);
+
+ if (Subtarget->hasAVX512() && !Subtarget->hasVLX() &&
+ !VT.is512BitVector() && Mask.getValueType() == MVT::v8i1) {
+    // This operation is legal for targets with VLX, but without
+    // VLX the vector should be widened to 512 bits.
+ unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
+ MVT WideDataVT = MVT::getVectorVT(VT.getScalarType(), NumEltsInWideVec);
+ MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
+ DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
+ Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
+ return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
+ Mask, N->getMemoryVT(), N->getMemOperand(),
+ N->isTruncatingStore());
+ }
+ return Op;
+}
+
+static SDValue LowerMGATHER(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ assert(Subtarget->hasAVX512() &&
+ "MGATHER/MSCATTER are supported on AVX-512 arch only");
+
+ MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
+ SDLoc dl(Op);
+ MVT VT = Op.getSimpleValueType();
+ SDValue Index = N->getIndex();
+ SDValue Mask = N->getMask();
+ SDValue Src0 = N->getValue();
+ MVT IndexVT = Index.getSimpleValueType();
+ MVT MaskVT = Mask.getSimpleValueType();
+
+ unsigned NumElts = VT.getVectorNumElements();
+ assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
+
+ if (!Subtarget->hasVLX() && !VT.is512BitVector() &&
+ !Index.getSimpleValueType().is512BitVector()) {
+    // AVX512F supports only 512-bit vectors, so either the data or the
+    // index must be 512 bits wide. If both the index and the data are
+    // currently 256-bit but the vector contains 8 elements, just
+    // sign-extend the index.
+ if (NumElts == 8) {
+ Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
+ SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
+ N->getOperand(3), Index };
+ DAG.UpdateNodeOperands(N, Ops);
+ return Op;
+ }
+
+    // The minimal number of elements in a gather is 8.
+ NumElts = 8;
+ // Index
+ MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
+ Index = ExtendToType(Index, NewIndexVT, DAG);
+ if (IndexVT.getScalarType() == MVT::i32)
+ Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
+
+ // Mask
+ MVT MaskBitVT = MVT::getVectorVT(MVT::i1, NumElts);
+    // At this point the mask operand has been promoted.
+ assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
+ MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
+ Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
+ Mask = DAG.getNode(ISD::TRUNCATE, dl, MaskBitVT, Mask);
+
+ // The pass-thru value
+ MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
+ Src0 = ExtendToType(Src0, NewVT, DAG);
+
+ SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
+ SDValue NewGather = DAG.getMaskedGather(DAG.getVTList(NewVT, MVT::Other),
+ N->getMemoryVT(), dl, Ops,
+ N->getMemOperand());
+    SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
+                                  NewGather.getValue(0),
+                                  DAG.getIntPtrConstant(0, dl));
+    SDValue RetOps[] = {Extract, NewGather.getValue(1)};
+ return DAG.getMergeValues(RetOps, dl);
+ }
+ return Op;
+}
+
+SDValue X86TargetLowering::LowerGC_TRANSITION_START(SDValue Op,
+ SelectionDAG &DAG) const {
+ // TODO: Eventually, the lowering of these nodes should be informed by or
+ // deferred to the GC strategy for the function in which they appear. For
+ // now, however, they must be lowered to something. Since they are logically
+ // no-ops in the case of a null GC strategy (or a GC strategy which does not
+ // require special handling for these nodes), lower them as literal NOOPs for
+ // the time being.
+ SmallVector<SDValue, 2> Ops;
+
+ Ops.push_back(Op.getOperand(0));
+ if (Op->getGluedNode())
+ Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
+
+ SDLoc OpDL(Op);
+ SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
+  SDValue NOOP(DAG.getMachineNode(X86::NOOP, OpDL, VTs, Ops), 0);
+
+ return NOOP;
+}
+
+SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op,
+ SelectionDAG &DAG) const {
+ // TODO: Eventually, the lowering of these nodes should be informed by or
+ // deferred to the GC strategy for the function in which they appear. For
+ // now, however, they must be lowered to something. Since they are logically
+ // no-ops in the case of a null GC strategy (or a GC strategy which does not
+ // require special handling for these nodes), lower them as literal NOOPs for
+ // the time being.
+ SmallVector<SDValue, 2> Ops;
+
+ Ops.push_back(Op.getOperand(0));
+ if (Op->getGluedNode())
+ Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
+
+ SDLoc OpDL(Op);
+ SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
+  SDValue NOOP(DAG.getMachineNode(X86::NOOP, OpDL, VTs, Ops), 0);
+
+ return NOOP;
+}
+
+/// LowerOperation - Provide custom lowering hooks for some operations.
+///
+SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
+ switch (Op.getOpcode()) {
+ default: llvm_unreachable("Should not custom lower this!");
+ case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
+ case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
+ return LowerCMP_SWAP(Op, Subtarget, DAG);
+ case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
+  case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op, DAG);
+  case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG);
+ case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
+ case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
+ case ISD::VECTOR_SHUFFLE: return lowerVectorShuffle(Op, Subtarget, DAG);
+ case ISD::VSELECT: return LowerVSELECT(Op, DAG);
+ case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
+ case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
+ case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
+ case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
+ case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG);
+ case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
+ case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
+ case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
+ case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
+ case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
+ case ISD::SHL_PARTS:
+ case ISD::SRA_PARTS:
+ case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
+ case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
+ case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
+ case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
+ case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
+ case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
+ case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
+ case ISD::SIGN_EXTEND_VECTOR_INREG:
+ return LowerSIGN_EXTEND_VECTOR_INREG(Op, Subtarget, DAG);
+ case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);
+ case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG);
+ case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
+ case ISD::LOAD: return LowerExtendedLoad(Op, Subtarget, DAG);
+ case ISD::FABS:
+ case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
+ case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
+ case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
+ case ISD::SETCC: return LowerSETCC(Op, DAG);
+ case ISD::SETCCE: return LowerSETCCE(Op, DAG);
+ case ISD::SELECT: return LowerSELECT(Op, DAG);
+ case ISD::BRCOND: return LowerBRCOND(Op, DAG);
+ case ISD::JumpTable: return LowerJumpTable(Op, DAG);
+ case ISD::VASTART: return LowerVASTART(Op, DAG);
+ case ISD::VAARG: return LowerVAARG(Op, DAG);
+ case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
+ case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, Subtarget, DAG);
+ case ISD::INTRINSIC_VOID:
+ case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
+ case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
+ case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
+ case ISD::FRAME_TO_ARGS_OFFSET:
+ return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
+ case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
+ case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
+ case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
+ case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
+ case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
+ case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
+ case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
+ case ISD::CTLZ: return LowerCTLZ(Op, Subtarget, DAG);
+ case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ_ZERO_UNDEF(Op, Subtarget, DAG);
+ case ISD::CTTZ:
+ case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, DAG);
+ case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
+ case ISD::UMUL_LOHI:
+ case ISD::SMUL_LOHI: return LowerMUL_LOHI(Op, Subtarget, DAG);
+ case ISD::ROTL: return LowerRotate(Op, Subtarget, DAG);
+ case ISD::SRA:
+ case ISD::SRL:
+ case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
+ case ISD::SADDO:
+ case ISD::UADDO:
+ case ISD::SSUBO:
+ case ISD::USUBO:
+ case ISD::SMULO:
+ case ISD::UMULO: return LowerXALUO(Op, DAG);
+ case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
+ case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
+ case ISD::ADDC:
+ case ISD::ADDE:
+ case ISD::SUBC:
+ case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
+ case ISD::ADD: return LowerADD(Op, DAG);
+ case ISD::SUB: return LowerSUB(Op, DAG);
+ case ISD::SMAX:
+ case ISD::SMIN:
+ case ISD::UMAX:
+ case ISD::UMIN: return LowerMINMAX(Op, DAG);
+ case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
+ case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
+ case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
+ case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
+ case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
+ case ISD::GC_TRANSITION_START:
+ return LowerGC_TRANSITION_START(Op, DAG);
+ case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION_END(Op, DAG);
+ }
+}
+
+/// ReplaceNodeResults - Replace a node with an illegal result type
+/// with a new node built out of custom code.
+void X86TargetLowering::ReplaceNodeResults(SDNode *N,
+ SmallVectorImpl<SDValue>&Results,
+ SelectionDAG &DAG) const {
+ SDLoc dl(N);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ switch (N->getOpcode()) {
+ default:
+ llvm_unreachable("Do not know how to custom type legalize this operation!");
+ case X86ISD::AVG: {
+ // Legalize types for X86ISD::AVG by expanding vectors.
+ assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
+
+ auto InVT = N->getValueType(0);
+ auto InVTSize = InVT.getSizeInBits();
+ const unsigned RegSize =
+ (InVTSize > 128) ? ((InVTSize > 256) ? 512 : 256) : 128;
+    assert((Subtarget->hasAVX512() || RegSize < 512) &&
+           "512-bit vector requires AVX512");
+    assert((Subtarget->hasAVX2() || RegSize < 256) &&
+           "256-bit vector requires AVX2");
+
+ auto ElemVT = InVT.getVectorElementType();
+ auto RegVT = EVT::getVectorVT(*DAG.getContext(), ElemVT,
+ RegSize / ElemVT.getSizeInBits());
+ assert(RegSize % InVT.getSizeInBits() == 0);
+ unsigned NumConcat = RegSize / InVT.getSizeInBits();
+
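+    // For example, a v4i16 AVG (a 64-bit input) picks RegSize = 128, so
+    // NumConcat = 2: each operand is concatenated with one undef v4i16 to
+    // form a v8i16, AVG runs on the full 128-bit type, and the low v4i16
+    // subvector is extracted below.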
+ SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
+ Ops[0] = N->getOperand(0);
+ SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
+ Ops[0] = N->getOperand(1);
+ SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
+
+ SDValue Res = DAG.getNode(X86ISD::AVG, dl, RegVT, InVec0, InVec1);
+ Results.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InVT, Res,
+ DAG.getIntPtrConstant(0, dl)));
+ return;
+ }
+ // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
+ case X86ISD::FMINC:
+ case X86ISD::FMIN:
+ case X86ISD::FMAXC:
+ case X86ISD::FMAX: {
+ EVT VT = N->getValueType(0);
+ assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
+ SDValue UNDEF = DAG.getUNDEF(VT);
+ SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
+ N->getOperand(0), UNDEF);
+ SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
+ N->getOperand(1), UNDEF);
+ Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
+ return;
+ }
+ case ISD::SIGN_EXTEND_INREG:
+ case ISD::ADDC:
+ case ISD::ADDE:
+ case ISD::SUBC:
+ case ISD::SUBE:
+ // We don't want to expand or promote these.
+ return;
+ case ISD::SDIV:
+ case ISD::UDIV:
+ case ISD::SREM:
+ case ISD::UREM:
+ case ISD::SDIVREM:
+ case ISD::UDIVREM: {
+ SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
+ Results.push_back(V);
+ return;
+ }
+ case ISD::FP_TO_SINT:
+ case ISD::FP_TO_UINT: {
+ bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
+
+ std::pair<SDValue,SDValue> Vals =
+ FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
+ SDValue FIST = Vals.first, StackSlot = Vals.second;
+ if (FIST.getNode()) {
+ EVT VT = N->getValueType(0);
+ // Return a load from the stack slot.
+ if (StackSlot.getNode())
+ Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot,
+ MachinePointerInfo(),
+ false, false, false, 0));
+ else
+ Results.push_back(FIST);
+ }
+ return;
+ }
+ case ISD::UINT_TO_FP: {
+ assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
+ if (N->getOperand(0).getValueType() != MVT::v2i32 ||
+ N->getValueType(0) != MVT::v2f32)
+ return;
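+    // The bit pattern 0x4330000000000000 is the double 2^52. OR-ing a
+    // zero-extended 32-bit integer x into its mantissa bits produces the
+    // double 2^52 + x exactly, so subtracting VBias (2^52) below recovers
+    // x as an f64, which is then rounded to f32.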
+ SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64,
+ N->getOperand(0));
+ SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,
+ MVT::f64);
+ SDValue VBias = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2f64, Bias, Bias);
+ SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
+ DAG.getBitcast(MVT::v2i64, VBias));
+ Or = DAG.getBitcast(MVT::v2f64, Or);
+ // TODO: Are there any fast-math-flags to propagate here?
+ SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
+ Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
+ return;
+ }
+ case ISD::FP_ROUND: {
+ if (!TLI.isTypeLegal(N->getOperand(0).getValueType()))
+ return;
+ SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
+ Results.push_back(V);
+ return;
+ }
+ case ISD::FP_EXTEND: {
+ // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
+ // No other ValueType for FP_EXTEND should reach this point.
+ assert(N->getValueType(0) == MVT::v2f32 &&
+ "Do not know how to legalize this Node");
+ return;
+ }
+ case ISD::INTRINSIC_W_CHAIN: {
+ unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+ switch (IntNo) {
+ default : llvm_unreachable("Do not know how to custom type "
+ "legalize this intrinsic operation!");
+ case Intrinsic::x86_rdtsc:
+ return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
+ Results);
+ case Intrinsic::x86_rdtscp:
+ return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget,
+ Results);
+ case Intrinsic::x86_rdpmc:
+ return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results);
+ }
+ }
+ case ISD::INTRINSIC_WO_CHAIN: {
+ if (SDValue V = LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), Subtarget, DAG))
+ Results.push_back(V);
+ return;
+ }
+ case ISD::READCYCLECOUNTER: {
+ return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
+ Results);
+ }
+ case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
+ EVT T = N->getValueType(0);
+ assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
+ bool Regs64bit = T == MVT::i128;
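+    // cmpxchg8b/cmpxchg16b convention: the expected value is passed in
+    // EDX:EAX (RDX:RAX for the 16-byte form), the replacement value in
+    // ECX:EBX (RCX:RBX), ZF reports success, and on failure the current
+    // memory value is returned in EDX:EAX (RDX:RAX). The copies below
+    // wire the two halves of the i64/i128 operands into those registers.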
+ MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
+ SDValue cpInL, cpInH;
+ cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
+ DAG.getConstant(0, dl, HalfT));
+ cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
+ DAG.getConstant(1, dl, HalfT));
+ cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
+ Regs64bit ? X86::RAX : X86::EAX,
+ cpInL, SDValue());
+ cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
+ Regs64bit ? X86::RDX : X86::EDX,
+ cpInH, cpInL.getValue(1));
+ SDValue swapInL, swapInH;
+ swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
+ DAG.getConstant(0, dl, HalfT));
+ swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
+ DAG.getConstant(1, dl, HalfT));
+ swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl,
+ Regs64bit ? X86::RBX : X86::EBX,
+ swapInL, cpInH.getValue(1));
+ swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl,
+ Regs64bit ? X86::RCX : X86::ECX,
+ swapInH, swapInL.getValue(1));
+ SDValue Ops[] = { swapInH.getValue(0),
+ N->getOperand(1),
+ swapInH.getValue(1) };
+ SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
+ MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
+ unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_DAG :
+ X86ISD::LCMPXCHG8_DAG;
+ SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
+ SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
+ Regs64bit ? X86::RAX : X86::EAX,
+ HalfT, Result.getValue(1));
+ SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
+ Regs64bit ? X86::RDX : X86::EDX,
+ HalfT, cpOutL.getValue(2));
+ SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
+
+ SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
+ MVT::i32, cpOutH.getValue(2));
+ SDValue Success =
+ DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
+ DAG.getConstant(X86::COND_E, dl, MVT::i8), EFLAGS);
+ Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
+
+ Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
+ Results.push_back(Success);
+ Results.push_back(EFLAGS.getValue(1));
+ return;
+ }
+ case ISD::ATOMIC_SWAP:
+ case ISD::ATOMIC_LOAD_ADD:
+ case ISD::ATOMIC_LOAD_SUB:
+ case ISD::ATOMIC_LOAD_AND:
+ case ISD::ATOMIC_LOAD_OR:
+ case ISD::ATOMIC_LOAD_XOR:
+ case ISD::ATOMIC_LOAD_NAND:
+ case ISD::ATOMIC_LOAD_MIN:
+ case ISD::ATOMIC_LOAD_MAX:
+ case ISD::ATOMIC_LOAD_UMIN:
+ case ISD::ATOMIC_LOAD_UMAX:
+ case ISD::ATOMIC_LOAD: {
+ // Delegate to generic TypeLegalization. Situations we can really handle
+ // should have already been dealt with by AtomicExpandPass.cpp.
+ break;
+ }
+ case ISD::BITCAST: {
+ assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
+ EVT DstVT = N->getValueType(0);
+ EVT SrcVT = N->getOperand(0)->getValueType(0);
+
+ if (SrcVT != MVT::f64 ||
+ (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8))
+ return;
+
+ unsigned NumElts = DstVT.getVectorNumElements();
+ EVT SVT = DstVT.getVectorElementType();
+ EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
+ SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
+ MVT::v2f64, N->getOperand(0));
+ SDValue ToVecInt = DAG.getBitcast(WiderVT, Expanded);
+
+ if (ExperimentalVectorWideningLegalization) {
+ // If we are legalizing vectors by widening, we already have the desired
+ // legal vector type, just return it.
+ Results.push_back(ToVecInt);
+ return;
+ }
+
+ SmallVector<SDValue, 8> Elts;
+ for (unsigned i = 0, e = NumElts; i != e; ++i)
+ Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT,
+ ToVecInt, DAG.getIntPtrConstant(i, dl)));
+
+ Results.push_back(DAG.getNode(ISD::BUILD_VECTOR, dl, DstVT, Elts));
+ }
+ }
+}
+
+const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
+ switch ((X86ISD::NodeType)Opcode) {
+ case X86ISD::FIRST_NUMBER: break;
+ case X86ISD::BSF: return "X86ISD::BSF";
+ case X86ISD::BSR: return "X86ISD::BSR";
+ case X86ISD::SHLD: return "X86ISD::SHLD";
+ case X86ISD::SHRD: return "X86ISD::SHRD";
+ case X86ISD::FAND: return "X86ISD::FAND";
+ case X86ISD::FANDN: return "X86ISD::FANDN";
+ case X86ISD::FOR: return "X86ISD::FOR";
+ case X86ISD::FXOR: return "X86ISD::FXOR";
+ case X86ISD::FILD: return "X86ISD::FILD";
+ case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG";
+ case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
+ case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
+ case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
+ case X86ISD::FLD: return "X86ISD::FLD";
+ case X86ISD::FST: return "X86ISD::FST";
+ case X86ISD::CALL: return "X86ISD::CALL";
+ case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG";
+ case X86ISD::RDTSCP_DAG: return "X86ISD::RDTSCP_DAG";
+ case X86ISD::RDPMC_DAG: return "X86ISD::RDPMC_DAG";
+ case X86ISD::BT: return "X86ISD::BT";
+ case X86ISD::CMP: return "X86ISD::CMP";
+ case X86ISD::COMI: return "X86ISD::COMI";
+ case X86ISD::UCOMI: return "X86ISD::UCOMI";
+ case X86ISD::CMPM: return "X86ISD::CMPM";
+ case X86ISD::CMPMU: return "X86ISD::CMPMU";
+ case X86ISD::CMPM_RND: return "X86ISD::CMPM_RND";
+ case X86ISD::SETCC: return "X86ISD::SETCC";
+ case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY";
+ case X86ISD::FSETCC: return "X86ISD::FSETCC";
+ case X86ISD::FGETSIGNx86: return "X86ISD::FGETSIGNx86";
+ case X86ISD::CMOV: return "X86ISD::CMOV";
+ case X86ISD::BRCOND: return "X86ISD::BRCOND";
+ case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG";
+ case X86ISD::IRET: return "X86ISD::IRET";
+ case X86ISD::REP_STOS: return "X86ISD::REP_STOS";
+ case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS";
+ case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg";
+ case X86ISD::Wrapper: return "X86ISD::Wrapper";
+ case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP";
+ case X86ISD::MOVDQ2Q: return "X86ISD::MOVDQ2Q";
+ case X86ISD::MMX_MOVD2W: return "X86ISD::MMX_MOVD2W";
+ case X86ISD::MMX_MOVW2D: return "X86ISD::MMX_MOVW2D";
+ case X86ISD::PEXTRB: return "X86ISD::PEXTRB";
+ case X86ISD::PEXTRW: return "X86ISD::PEXTRW";
+ case X86ISD::INSERTPS: return "X86ISD::INSERTPS";
+ case X86ISD::PINSRB: return "X86ISD::PINSRB";
+ case X86ISD::PINSRW: return "X86ISD::PINSRW";
+ case X86ISD::MMX_PINSRW: return "X86ISD::MMX_PINSRW";
+ case X86ISD::PSHUFB: return "X86ISD::PSHUFB";
+ case X86ISD::ANDNP: return "X86ISD::ANDNP";
+ case X86ISD::PSIGN: return "X86ISD::PSIGN";
+ case X86ISD::BLENDI: return "X86ISD::BLENDI";
+ case X86ISD::SHRUNKBLEND: return "X86ISD::SHRUNKBLEND";
+ case X86ISD::ADDUS: return "X86ISD::ADDUS";
+ case X86ISD::SUBUS: return "X86ISD::SUBUS";
+ case X86ISD::HADD: return "X86ISD::HADD";
+ case X86ISD::HSUB: return "X86ISD::HSUB";
+ case X86ISD::FHADD: return "X86ISD::FHADD";
+ case X86ISD::FHSUB: return "X86ISD::FHSUB";
+ case X86ISD::ABS: return "X86ISD::ABS";
+ case X86ISD::CONFLICT: return "X86ISD::CONFLICT";
+ case X86ISD::FMAX: return "X86ISD::FMAX";
+ case X86ISD::FMAX_RND: return "X86ISD::FMAX_RND";
+ case X86ISD::FMIN: return "X86ISD::FMIN";
+ case X86ISD::FMIN_RND: return "X86ISD::FMIN_RND";
+ case X86ISD::FMAXC: return "X86ISD::FMAXC";
+ case X86ISD::FMINC: return "X86ISD::FMINC";
+ case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
+ case X86ISD::FRCP: return "X86ISD::FRCP";
+ case X86ISD::EXTRQI: return "X86ISD::EXTRQI";
+ case X86ISD::INSERTQI: return "X86ISD::INSERTQI";
+ case X86ISD::TLSADDR: return "X86ISD::TLSADDR";
+ case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR";
+ case X86ISD::TLSCALL: return "X86ISD::TLSCALL";
+ case X86ISD::EH_SJLJ_SETJMP: return "X86ISD::EH_SJLJ_SETJMP";
+ case X86ISD::EH_SJLJ_LONGJMP: return "X86ISD::EH_SJLJ_LONGJMP";
+ case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN";
+ case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN";
+ case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m";
+ case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r";
+ case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG";
+ case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG";
+ case X86ISD::LCMPXCHG16_DAG: return "X86ISD::LCMPXCHG16_DAG";
+ case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL";
+ case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD";
+ case X86ISD::VZEXT: return "X86ISD::VZEXT";
+ case X86ISD::VSEXT: return "X86ISD::VSEXT";
+ case X86ISD::VTRUNC: return "X86ISD::VTRUNC";
+ case X86ISD::VTRUNCS: return "X86ISD::VTRUNCS";
+ case X86ISD::VTRUNCUS: return "X86ISD::VTRUNCUS";
+ case X86ISD::VINSERT: return "X86ISD::VINSERT";
+ case X86ISD::VFPEXT: return "X86ISD::VFPEXT";
+ case X86ISD::VFPROUND: return "X86ISD::VFPROUND";
+ case X86ISD::CVTDQ2PD: return "X86ISD::CVTDQ2PD";
+ case X86ISD::CVTUDQ2PD: return "X86ISD::CVTUDQ2PD";
+ case X86ISD::CVT2MASK: return "X86ISD::CVT2MASK";
+ case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ";
+ case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ";
+ case X86ISD::VSHL: return "X86ISD::VSHL";
+ case X86ISD::VSRL: return "X86ISD::VSRL";
+ case X86ISD::VSRA: return "X86ISD::VSRA";
+ case X86ISD::VSHLI: return "X86ISD::VSHLI";
+ case X86ISD::VSRLI: return "X86ISD::VSRLI";
+ case X86ISD::VSRAI: return "X86ISD::VSRAI";
+ case X86ISD::CMPP: return "X86ISD::CMPP";
+ case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ";
+ case X86ISD::PCMPGT: return "X86ISD::PCMPGT";
+ case X86ISD::PCMPEQM: return "X86ISD::PCMPEQM";
+ case X86ISD::PCMPGTM: return "X86ISD::PCMPGTM";
+ case X86ISD::ADD: return "X86ISD::ADD";
+ case X86ISD::SUB: return "X86ISD::SUB";
+ case X86ISD::ADC: return "X86ISD::ADC";
+ case X86ISD::SBB: return "X86ISD::SBB";
+ case X86ISD::SMUL: return "X86ISD::SMUL";
+ case X86ISD::UMUL: return "X86ISD::UMUL";
+ case X86ISD::SMUL8: return "X86ISD::SMUL8";
+ case X86ISD::UMUL8: return "X86ISD::UMUL8";
+ case X86ISD::SDIVREM8_SEXT_HREG: return "X86ISD::SDIVREM8_SEXT_HREG";
+ case X86ISD::UDIVREM8_ZEXT_HREG: return "X86ISD::UDIVREM8_ZEXT_HREG";
+ case X86ISD::INC: return "X86ISD::INC";
+ case X86ISD::DEC: return "X86ISD::DEC";
+ case X86ISD::OR: return "X86ISD::OR";
+ case X86ISD::XOR: return "X86ISD::XOR";
+ case X86ISD::AND: return "X86ISD::AND";
+ case X86ISD::BEXTR: return "X86ISD::BEXTR";
+ case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM";
+ case X86ISD::PTEST: return "X86ISD::PTEST";
+ case X86ISD::TESTP: return "X86ISD::TESTP";
+ case X86ISD::TESTM: return "X86ISD::TESTM";
+ case X86ISD::TESTNM: return "X86ISD::TESTNM";
+ case X86ISD::KORTEST: return "X86ISD::KORTEST";
+ case X86ISD::KTEST: return "X86ISD::KTEST";
+ case X86ISD::PACKSS: return "X86ISD::PACKSS";
+ case X86ISD::PACKUS: return "X86ISD::PACKUS";
+ case X86ISD::PALIGNR: return "X86ISD::PALIGNR";
+ case X86ISD::VALIGN: return "X86ISD::VALIGN";
+ case X86ISD::PSHUFD: return "X86ISD::PSHUFD";
+ case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW";
+ case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW";
+ case X86ISD::SHUFP: return "X86ISD::SHUFP";
+ case X86ISD::SHUF128: return "X86ISD::SHUF128";
+ case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS";
+ case X86ISD::MOVLHPD: return "X86ISD::MOVLHPD";
+ case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS";
+ case X86ISD::MOVLPS: return "X86ISD::MOVLPS";
+ case X86ISD::MOVLPD: return "X86ISD::MOVLPD";
+ case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP";
+ case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP";
+ case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP";
+ case X86ISD::MOVSD: return "X86ISD::MOVSD";
+ case X86ISD::MOVSS: return "X86ISD::MOVSS";
+ case X86ISD::UNPCKL: return "X86ISD::UNPCKL";
+ case X86ISD::UNPCKH: return "X86ISD::UNPCKH";
+ case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST";
+ case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM";
+ case X86ISD::SUBV_BROADCAST: return "X86ISD::SUBV_BROADCAST";
+ case X86ISD::VEXTRACT: return "X86ISD::VEXTRACT";
+ case X86ISD::VPERMILPV: return "X86ISD::VPERMILPV";
+ case X86ISD::VPERMILPI: return "X86ISD::VPERMILPI";
+ case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128";
+ case X86ISD::VPERMV: return "X86ISD::VPERMV";
+ case X86ISD::VPERMV3: return "X86ISD::VPERMV3";
+ case X86ISD::VPERMIV3: return "X86ISD::VPERMIV3";
+ case X86ISD::VPERMI: return "X86ISD::VPERMI";
+ case X86ISD::VPTERNLOG: return "X86ISD::VPTERNLOG";
+ case X86ISD::VFIXUPIMM: return "X86ISD::VFIXUPIMM";
+ case X86ISD::VRANGE: return "X86ISD::VRANGE";
+ case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ";
+ case X86ISD::PMULDQ: return "X86ISD::PMULDQ";
+ case X86ISD::PSADBW: return "X86ISD::PSADBW";
+ case X86ISD::DBPSADBW: return "X86ISD::DBPSADBW";
+ case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
+ case X86ISD::VAARG_64: return "X86ISD::VAARG_64";
+ case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA";
+ case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER";
+ case X86ISD::MFENCE: return "X86ISD::MFENCE";
+ case X86ISD::SFENCE: return "X86ISD::SFENCE";
+ case X86ISD::LFENCE: return "X86ISD::LFENCE";
+ case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA";
+ case X86ISD::SAHF: return "X86ISD::SAHF";
+ case X86ISD::RDRAND: return "X86ISD::RDRAND";
+ case X86ISD::RDSEED: return "X86ISD::RDSEED";
+ case X86ISD::VPMADDUBSW: return "X86ISD::VPMADDUBSW";
+ case X86ISD::VPMADDWD: return "X86ISD::VPMADDWD";
+ case X86ISD::VPROT: return "X86ISD::VPROT";
+ case X86ISD::VPROTI: return "X86ISD::VPROTI";
+ case X86ISD::VPSHA: return "X86ISD::VPSHA";
+ case X86ISD::VPSHL: return "X86ISD::VPSHL";
+ case X86ISD::VPCOM: return "X86ISD::VPCOM";
+ case X86ISD::VPCOMU: return "X86ISD::VPCOMU";
+ case X86ISD::FMADD: return "X86ISD::FMADD";
+ case X86ISD::FMSUB: return "X86ISD::FMSUB";
+ case X86ISD::FNMADD: return "X86ISD::FNMADD";
+ case X86ISD::FNMSUB: return "X86ISD::FNMSUB";
+ case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB";
+ case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD";
+ case X86ISD::FMADD_RND: return "X86ISD::FMADD_RND";
+ case X86ISD::FNMADD_RND: return "X86ISD::FNMADD_RND";
+ case X86ISD::FMSUB_RND: return "X86ISD::FMSUB_RND";
+ case X86ISD::FNMSUB_RND: return "X86ISD::FNMSUB_RND";
+ case X86ISD::FMADDSUB_RND: return "X86ISD::FMADDSUB_RND";
+ case X86ISD::FMSUBADD_RND: return "X86ISD::FMSUBADD_RND";
+ case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE";
+ case X86ISD::VREDUCE: return "X86ISD::VREDUCE";
+ case X86ISD::VGETMANT: return "X86ISD::VGETMANT";
+ case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI";
+ case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI";
+ case X86ISD::XTEST: return "X86ISD::XTEST";
+ case X86ISD::COMPRESS: return "X86ISD::COMPRESS";
+ case X86ISD::EXPAND: return "X86ISD::EXPAND";
+ case X86ISD::SELECT: return "X86ISD::SELECT";
+ case X86ISD::ADDSUB: return "X86ISD::ADDSUB";
+ case X86ISD::RCP28: return "X86ISD::RCP28";
+ case X86ISD::EXP2: return "X86ISD::EXP2";
+ case X86ISD::RSQRT28: return "X86ISD::RSQRT28";
+ case X86ISD::FADD_RND: return "X86ISD::FADD_RND";
+ case X86ISD::FSUB_RND: return "X86ISD::FSUB_RND";
+ case X86ISD::FMUL_RND: return "X86ISD::FMUL_RND";
+ case X86ISD::FDIV_RND: return "X86ISD::FDIV_RND";
+ case X86ISD::FSQRT_RND: return "X86ISD::FSQRT_RND";
+ case X86ISD::FGETEXP_RND: return "X86ISD::FGETEXP_RND";
+ case X86ISD::SCALEF: return "X86ISD::SCALEF";
+ case X86ISD::ADDS: return "X86ISD::ADDS";
+ case X86ISD::SUBS: return "X86ISD::SUBS";
+ case X86ISD::AVG: return "X86ISD::AVG";
+ case X86ISD::MULHRS: return "X86ISD::MULHRS";
+ case X86ISD::SINT_TO_FP_RND: return "X86ISD::SINT_TO_FP_RND";
+ case X86ISD::UINT_TO_FP_RND: return "X86ISD::UINT_TO_FP_RND";
+ case X86ISD::FP_TO_SINT_RND: return "X86ISD::FP_TO_SINT_RND";
+ case X86ISD::FP_TO_UINT_RND: return "X86ISD::FP_TO_UINT_RND";
+ case X86ISD::VFPCLASS: return "X86ISD::VFPCLASS";
+ case X86ISD::VFPCLASSS: return "X86ISD::VFPCLASSS";
+ }
+ return nullptr;
+}
+
+// isLegalAddressingMode - Return true if the addressing mode represented
+// by AM is legal for this target, for a load/store of the specified type.
+bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
+ const AddrMode &AM, Type *Ty,
+ unsigned AS) const {
+ // X86 supports extremely general addressing modes.
+ CodeModel::Model M = getTargetMachine().getCodeModel();
+ Reloc::Model R = getTargetMachine().getRelocationModel();
+
+ // X86 allows a sign-extended 32-bit immediate field as a displacement.
+ if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
+ return false;
+
+ if (AM.BaseGV) {
+ unsigned GVFlags =
+ Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine());
+
+ // If a reference to this global requires an extra load, we can't fold it.
+ if (isGlobalStubReference(GVFlags))
+ return false;
+
+ // If BaseGV requires a register for the PIC base, we cannot also have a
+ // BaseReg specified.
+ if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
+ return false;
+
+ // If lower 4G is not available, then we must use rip-relative addressing.
+ if ((M != CodeModel::Small || R != Reloc::Static) &&
+ Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1))
+ return false;
+ }
+
+ switch (AM.Scale) {
+ case 0:
+ case 1:
+ case 2:
+ case 4:
+ case 8:
+ // These scales always work.
+ break;
+ case 3:
+ case 5:
+ case 9:
+ // These scales are formed with basereg+scalereg. Only accept if there is
+ // no basereg yet.
+ if (AM.HasBaseReg)
+ return false;
+ break;
+ default: // Other stuff never works.
+ return false;
+ }
+
+ return true;
+}
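+
+// For reference, the general mode being validated above is
+//
+//   [BaseReg + Scale * IndexReg + Disp32 (+ GV)]
+//
+// Scales 3, 5 and 9 are only accepted when no base register is present
+// because they are synthesized by reusing the index as the base, e.g.
+//
+//   leaq (%rax,%rax,4), %rcx    // rcx = 5 * rax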
+
+bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
+ unsigned Bits = Ty->getScalarSizeInBits();
+
+  // 8-bit shifts are always expensive, and versions with a scalar amount
+  // aren't noticeably cheaper than those without.
+ if (Bits == 8)
+ return false;
+
+  // On AVX2 there are new vpsllv[dq] instructions (and other shifts) that
+  // make variable shifts just as cheap as scalar ones.
+ if (Subtarget->hasInt256() && (Bits == 32 || Bits == 64))
+ return false;
+
+ // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
+ // fully general vector.
+ return true;
+}
+
+bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
+ if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
+ return false;
+ unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
+ unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
+ return NumBits1 > NumBits2;
+}
+
+bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
+ if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
+ return false;
+
+ if (!isTypeLegal(EVT::getEVT(Ty1)))
+ return false;
+
+ assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
+
+ // Assuming the caller doesn't have a zeroext or signext return parameter,
+ // truncation all the way down to i1 is valid.
+ return true;
+}
+
+bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
+ return isInt<32>(Imm);
+}
+
+bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
+ // Can also use sub to handle negated immediates.
+ return isInt<32>(Imm);
+}
+
+bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
+ if (!VT1.isInteger() || !VT2.isInteger())
+ return false;
+ unsigned NumBits1 = VT1.getSizeInBits();
+ unsigned NumBits2 = VT2.getSizeInBits();
+ return NumBits1 > NumBits2;
+}
+
+bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
+ // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
+ return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit();
+}
+
+bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
+ // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
+ return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit();
+}
+
+bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
+ EVT VT1 = Val.getValueType();
+ if (isZExtFree(VT1, VT2))
+ return true;
+
+ if (Val.getOpcode() != ISD::LOAD)
+ return false;
+
+ if (!VT1.isSimple() || !VT1.isInteger() ||
+ !VT2.isSimple() || !VT2.isInteger())
+ return false;
+
+ switch (VT1.getSimpleVT().SimpleTy) {
+ default: break;
+ case MVT::i8:
+ case MVT::i16:
+ case MVT::i32:
+ // X86 has 8, 16, and 32-bit zero-extending loads.
+ return true;
+ }
+
+ return false;
+}
+
+bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const { return true; }
+
+bool
+X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
+ if (!Subtarget->hasAnyFMA())
+ return false;
+
+ VT = VT.getScalarType();
+
+ if (!VT.isSimple())
+ return false;
+
+ switch (VT.getSimpleVT().SimpleTy) {
+ case MVT::f32:
+ case MVT::f64:
+ return true;
+ default:
+ break;
+ }
+
+ return false;
+}
+
+bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
+ // i16 instructions are longer (0x66 prefix) and potentially slower.
+ return !(VT1 == MVT::i32 && VT2 == MVT::i16);
+}
+
+/// isShuffleMaskLegal - Targets can use this to indicate that they only
+/// support *some* VECTOR_SHUFFLE operations, those with specific masks.
+/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
+/// are assumed to be legal.
+bool
+X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
+ EVT VT) const {
+ if (!VT.isSimple())
+ return false;
+
+ // Not for i1 vectors
+ if (VT.getSimpleVT().getScalarType() == MVT::i1)
+ return false;
+
+ // Very little shuffling can be done for 64-bit vectors right now.
+ if (VT.getSimpleVT().getSizeInBits() == 64)
+ return false;
+
+ // We only care that the types being shuffled are legal. The lowering can
+ // handle any possible shuffle mask that results.
+ return isTypeLegal(VT.getSimpleVT());
+}
+
+bool
+X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
+ EVT VT) const {
+  // Just delegate to the generic legality; clear masks aren't special.
+ return isShuffleMaskLegal(Mask, VT);
+}
+
+//===----------------------------------------------------------------------===//
+// X86 Scheduler Hooks
+//===----------------------------------------------------------------------===//
+
+/// Utility function to emit xbegin specifying the start of an RTM region.
+static MachineBasicBlock *EmitXBegin(MachineInstr *MI, MachineBasicBlock *MBB,
+ const TargetInstrInfo *TII) {
+ DebugLoc DL = MI->getDebugLoc();
+
+ const BasicBlock *BB = MBB->getBasicBlock();
+ MachineFunction::iterator I = ++MBB->getIterator();
+
+ // For the v = xbegin(), we generate
+ //
+ // thisMBB:
+ // xbegin sinkMBB
+ //
+ // mainMBB:
+ // eax = -1
+ //
+ // sinkMBB:
+ // v = eax
+
+ MachineBasicBlock *thisMBB = MBB;
+ MachineFunction *MF = MBB->getParent();
+ MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
+ MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
+ MF->insert(I, mainMBB);
+ MF->insert(I, sinkMBB);
+
+ // Transfer the remainder of BB and its successor edges to sinkMBB.
+ sinkMBB->splice(sinkMBB->begin(), MBB,
+ std::next(MachineBasicBlock::iterator(MI)), MBB->end());
+ sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
+
+ // thisMBB:
+ // xbegin sinkMBB
+ // # fallthrough to mainMBB
+  //  # abort path to sinkMBB
+ BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(sinkMBB);
+ thisMBB->addSuccessor(mainMBB);
+ thisMBB->addSuccessor(sinkMBB);
+
+ // mainMBB:
+ // EAX = -1
+ BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), X86::EAX).addImm(-1);
+ mainMBB->addSuccessor(sinkMBB);
+
+ // sinkMBB:
+ // EAX is live into the sinkMBB
+ sinkMBB->addLiveIn(X86::EAX);
+ BuildMI(*sinkMBB, sinkMBB->begin(), DL,
+ TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
+ .addReg(X86::EAX);
+
+ MI->eraseFromParent();
+ return sinkMBB;
+}
+
+// FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
+// or XMM0_V32I8 in AVX, all of this code can be replaced with that
+// in the .td file.
+static MachineBasicBlock *EmitPCMPSTRM(MachineInstr *MI, MachineBasicBlock *BB,
+ const TargetInstrInfo *TII) {
+ unsigned Opc;
+ switch (MI->getOpcode()) {
+ default: llvm_unreachable("illegal opcode!");
+ case X86::PCMPISTRM128REG: Opc = X86::PCMPISTRM128rr; break;
+ case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break;
+ case X86::PCMPISTRM128MEM: Opc = X86::PCMPISTRM128rm; break;
+ case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break;
+ case X86::PCMPESTRM128REG: Opc = X86::PCMPESTRM128rr; break;
+ case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break;
+ case X86::PCMPESTRM128MEM: Opc = X86::PCMPESTRM128rm; break;
+ case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break;
+ }
+
+ DebugLoc dl = MI->getDebugLoc();
+ MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
+
+ unsigned NumArgs = MI->getNumOperands();
+ for (unsigned i = 1; i < NumArgs; ++i) {
+ MachineOperand &Op = MI->getOperand(i);
+ if (!(Op.isReg() && Op.isImplicit()))
+ MIB.addOperand(Op);
+ }
+ if (MI->hasOneMemOperand())
+ MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+
+ BuildMI(*BB, MI, dl,
+ TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
+ .addReg(X86::XMM0);
+
+ MI->eraseFromParent();
+ return BB;
+}
+
+// FIXME: Custom handling because TableGen doesn't support multiple implicit
+// defs in an instruction pattern
+static MachineBasicBlock *EmitPCMPSTRI(MachineInstr *MI, MachineBasicBlock *BB,
+ const TargetInstrInfo *TII) {
+ unsigned Opc;
+ switch (MI->getOpcode()) {
+ default: llvm_unreachable("illegal opcode!");
+ case X86::PCMPISTRIREG: Opc = X86::PCMPISTRIrr; break;
+ case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break;
+ case X86::PCMPISTRIMEM: Opc = X86::PCMPISTRIrm; break;
+ case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break;
+ case X86::PCMPESTRIREG: Opc = X86::PCMPESTRIrr; break;
+ case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break;
+ case X86::PCMPESTRIMEM: Opc = X86::PCMPESTRIrm; break;
+ case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break;
+ }
+
+ DebugLoc dl = MI->getDebugLoc();
+ MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
+
+  unsigned NumArgs = MI->getNumOperands(); // result operand skipped below
+ for (unsigned i = 1; i < NumArgs; ++i) {
+ MachineOperand &Op = MI->getOperand(i);
+ if (!(Op.isReg() && Op.isImplicit()))
+ MIB.addOperand(Op);
+ }
+ if (MI->hasOneMemOperand())
+ MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+
+ BuildMI(*BB, MI, dl,
+ TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
+ .addReg(X86::ECX);
+
+ MI->eraseFromParent();
+ return BB;
+}
+
+static MachineBasicBlock *EmitWRPKRU(MachineInstr *MI, MachineBasicBlock *BB,
+ const X86Subtarget *Subtarget) {
+ DebugLoc dl = MI->getDebugLoc();
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+
+ // insert input VAL into EAX
+ BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX)
+ .addReg(MI->getOperand(0).getReg());
+ // insert zero to ECX
+ BuildMI(*BB, MI, dl, TII->get(X86::XOR32rr), X86::ECX)
+ .addReg(X86::ECX)
+ .addReg(X86::ECX);
+ // insert zero to EDX
+ BuildMI(*BB, MI, dl, TII->get(X86::XOR32rr), X86::EDX)
+ .addReg(X86::EDX)
+ .addReg(X86::EDX);
+ // insert WRPKRU instruction
+ BuildMI(*BB, MI, dl, TII->get(X86::WRPKRUr));
+
+ MI->eraseFromParent(); // The pseudo is gone now.
+ return BB;
+}
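+
+// The sequence emitted above amounts to (with the input value in EAX):
+//
+//   xorl %ecx, %ecx
+//   xorl %edx, %edx
+//   wrpkru              // writes PKRU from EAX; ECX and EDX must be zero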
+
+static MachineBasicBlock *EmitRDPKRU(MachineInstr *MI, MachineBasicBlock *BB,
+ const X86Subtarget *Subtarget) {
+ DebugLoc dl = MI->getDebugLoc();
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+
+ // insert zero to ECX
+ BuildMI(*BB, MI, dl, TII->get(X86::XOR32rr), X86::ECX)
+ .addReg(X86::ECX)
+ .addReg(X86::ECX);
+ // insert RDPKRU instruction
+ BuildMI(*BB, MI, dl, TII->get(X86::RDPKRUr));
+ BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
+ .addReg(X86::EAX);
+
+ MI->eraseFromParent(); // The pseudo is gone now.
+ return BB;
+}
+
+static MachineBasicBlock *EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB,
+ const X86Subtarget *Subtarget) {
+ DebugLoc dl = MI->getDebugLoc();
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+ // Address into RAX/EAX, other two args into ECX, EDX.
+ unsigned MemOpc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r;
+ unsigned MemReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
+ MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
+ for (int i = 0; i < X86::AddrNumOperands; ++i)
+ MIB.addOperand(MI->getOperand(i));
+
+ unsigned ValOps = X86::AddrNumOperands;
+ BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
+ .addReg(MI->getOperand(ValOps).getReg());
+ BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
+ .addReg(MI->getOperand(ValOps+1).getReg());
+
+  // The MONITOR instruction itself takes no explicit operands; its inputs
+  // are read implicitly from the registers set up above.
+ BuildMI(*BB, MI, dl, TII->get(X86::MONITORrrr));
+
+ MI->eraseFromParent(); // The pseudo is gone now.
+ return BB;
+}
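+
+// The sequence emitted above amounts to:
+//
+//   leaq mem, %rax      // leal/%eax on 32-bit targets
+//   movl ext, %ecx
+//   movl hints, %edx
+//   monitor             // address in RAX/EAX, extensions in ECX, hints in EDX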
+
+MachineBasicBlock *
+X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr *MI,
+ MachineBasicBlock *MBB) const {
+ // Emit va_arg instruction on X86-64.
+
+ // Operands to this pseudo-instruction:
+ // 0 ) Output : destination address (reg)
+ // 1-5) Input : va_list address (addr, i64mem)
+ // 6 ) ArgSize : Size (in bytes) of vararg type
+ // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
+ // 8 ) Align : Alignment of type
+ // 9 ) EFLAGS (implicit-def)
+
+ assert(MI->getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
+ static_assert(X86::AddrNumOperands == 5,
+ "VAARG_64 assumes 5 address operands");
+
+ unsigned DestReg = MI->getOperand(0).getReg();
+ MachineOperand &Base = MI->getOperand(1);
+ MachineOperand &Scale = MI->getOperand(2);
+ MachineOperand &Index = MI->getOperand(3);
+ MachineOperand &Disp = MI->getOperand(4);
+ MachineOperand &Segment = MI->getOperand(5);
+ unsigned ArgSize = MI->getOperand(6).getImm();
+ unsigned ArgMode = MI->getOperand(7).getImm();
+ unsigned Align = MI->getOperand(8).getImm();
+
+ // Memory Reference
+ assert(MI->hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
+ MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
+ MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
+
+ // Machine Information
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+ MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+ const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
+ const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
+ DebugLoc DL = MI->getDebugLoc();
+
+ // struct va_list {
+ // i32 gp_offset
+ // i32 fp_offset
+ // i64 overflow_area (address)
+ // i64 reg_save_area (address)
+ // }
+ // sizeof(va_list) = 24
+ // alignment(va_list) = 8
+
+ unsigned TotalNumIntRegs = 6;
+ unsigned TotalNumXMMRegs = 8;
+ bool UseGPOffset = (ArgMode == 1);
+ bool UseFPOffset = (ArgMode == 2);
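+  // For ArgMode == 2 the bound computed below is 6 * 8 + 8 * 16 = 176:
+  // 48 bytes of GP register save area followed by 128 bytes of XMM save
+  // area in the SysV x86-64 reg_save_area layout.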
+ unsigned MaxOffset = TotalNumIntRegs * 8 +
+ (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
+
+  // Align ArgSize to a multiple of 8.
+ unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
+ bool NeedsAlign = (Align > 8);
+
+ MachineBasicBlock *thisMBB = MBB;
+ MachineBasicBlock *overflowMBB;
+ MachineBasicBlock *offsetMBB;
+ MachineBasicBlock *endMBB;
+
+ unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB
+ unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB
+ unsigned OffsetReg = 0;
+
+ if (!UseGPOffset && !UseFPOffset) {
+    // If we only pull from the overflow region, we don't need a branch;
+    // control flow is left unchanged.
+ OffsetDestReg = 0; // unused
+ OverflowDestReg = DestReg;
+
+ offsetMBB = nullptr;
+ overflowMBB = thisMBB;
+ endMBB = thisMBB;
+ } else {
+ // First emit code to check if gp_offset (or fp_offset) is below the bound.
+ // If so, pull the argument from reg_save_area. (branch to offsetMBB)
+ // If not, pull from overflow_area. (branch to overflowMBB)
+ //
+ // thisMBB
+ // | .
+ // | .
+ // offsetMBB overflowMBB
+ // | .
+ // | .
+ // endMBB
+
+ // Registers for the PHI in endMBB
+ OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
+ OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
+
+ const BasicBlock *LLVM_BB = MBB->getBasicBlock();
+ MachineFunction *MF = MBB->getParent();
+ overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+
+ MachineFunction::iterator MBBIter = ++MBB->getIterator();
+
+ // Insert the new basic blocks
+ MF->insert(MBBIter, offsetMBB);
+ MF->insert(MBBIter, overflowMBB);
+ MF->insert(MBBIter, endMBB);
+
+ // Transfer the remainder of MBB and its successor edges to endMBB.
+ endMBB->splice(endMBB->begin(), thisMBB,
+ std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
+ endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
+
+ // Make offsetMBB and overflowMBB successors of thisMBB
+ thisMBB->addSuccessor(offsetMBB);
+ thisMBB->addSuccessor(overflowMBB);
+
+ // endMBB is a successor of both offsetMBB and overflowMBB
+ offsetMBB->addSuccessor(endMBB);
+ overflowMBB->addSuccessor(endMBB);
+
+ // Load the offset value into a register
+ OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
+ BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
+ .addOperand(Base)
+ .addOperand(Scale)
+ .addOperand(Index)
+ .addDisp(Disp, UseFPOffset ? 4 : 0)
+ .addOperand(Segment)
+ .setMemRefs(MMOBegin, MMOEnd);
+
+ // Check if there is enough room left to pull this argument.
+ BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
+ .addReg(OffsetReg)
+ .addImm(MaxOffset + 8 - ArgSizeA8);
+
+ // Branch to "overflowMBB" if offset >= max
+ // Fall through to "offsetMBB" otherwise
+ BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
+ .addMBB(overflowMBB);
+ }
+
+ // In offsetMBB, emit code to use the reg_save_area.
+ if (offsetMBB) {
+ assert(OffsetReg != 0);
+
+ // Read the reg_save_area address.
+ unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
+ BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
+ .addOperand(Base)
+ .addOperand(Scale)
+ .addOperand(Index)
+ .addDisp(Disp, 16)
+ .addOperand(Segment)
+ .setMemRefs(MMOBegin, MMOEnd);
+
+ // Zero-extend the offset
+ unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
+ BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
+ .addImm(0)
+ .addReg(OffsetReg)
+ .addImm(X86::sub_32bit);
+
+ // Add the offset to the reg_save_area to get the final address.
+ BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
+ .addReg(OffsetReg64)
+ .addReg(RegSaveReg);
+
+ // Compute the offset for the next argument
+ unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
+ BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
+ .addReg(OffsetReg)
+ .addImm(UseFPOffset ? 16 : 8);
+
+ // Store it back into the va_list.
+ BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
+ .addOperand(Base)
+ .addOperand(Scale)
+ .addOperand(Index)
+ .addDisp(Disp, UseFPOffset ? 4 : 0)
+ .addOperand(Segment)
+ .addReg(NextOffsetReg)
+ .setMemRefs(MMOBegin, MMOEnd);
+
+ // Jump to endMBB
+ BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
+ .addMBB(endMBB);
+ }
+
+ //
+ // Emit code to use overflow area
+ //
+
+ // Load the overflow_area address into a register.
+ unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
+ BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
+ .addOperand(Base)
+ .addOperand(Scale)
+ .addOperand(Index)
+ .addDisp(Disp, 8)
+ .addOperand(Segment)
+ .setMemRefs(MMOBegin, MMOEnd);
+
+ // If we need to align it, do so. Otherwise, just copy the address
+ // to OverflowDestReg.
+ if (NeedsAlign) {
+ // Align the overflow address
+ assert((Align & (Align-1)) == 0 && "Alignment must be a power of 2");
+ unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);
+
+ // aligned_addr = (addr + (align-1)) & ~(align-1)
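+    // e.g. addr = 0x1004, align = 16: (0x1004 + 15) & ~15 = 0x1010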
+ BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
+ .addReg(OverflowAddrReg)
+ .addImm(Align-1);
+
+ BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
+ .addReg(TmpReg)
+ .addImm(~(uint64_t)(Align-1));
+ } else {
+ BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
+ .addReg(OverflowAddrReg);
+ }
+
+ // Compute the next overflow address after this argument.
+ // (the overflow address should be kept 8-byte aligned)
+ unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
+ BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
+ .addReg(OverflowDestReg)
+ .addImm(ArgSizeA8);
+
+ // Store the new overflow address.
+ BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
+ .addOperand(Base)
+ .addOperand(Scale)
+ .addOperand(Index)
+ .addDisp(Disp, 8)
+ .addOperand(Segment)
+ .addReg(NextAddrReg)
+ .setMemRefs(MMOBegin, MMOEnd);
+
+ // If we branched, emit the PHI to the front of endMBB.
+ if (offsetMBB) {
+ BuildMI(*endMBB, endMBB->begin(), DL,
+ TII->get(X86::PHI), DestReg)
+ .addReg(OffsetDestReg).addMBB(offsetMBB)
+ .addReg(OverflowDestReg).addMBB(overflowMBB);
+ }
+
+ // Erase the pseudo instruction
+ MI->eraseFromParent();
+
+ return endMBB;
+}
+
+MachineBasicBlock *
+X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
+ MachineInstr *MI,
+ MachineBasicBlock *MBB) const {
+ // Emit code to save XMM registers to the stack. The ABI says that the
+ // number of registers to save is given in %al, so it's theoretically
+  // possible to do an indirect jump trick to avoid saving all of them;
+  // however, this code takes a simpler approach and just executes all
+ // of the stores if %al is non-zero. It's less code, and it's probably
+ // easier on the hardware branch predictor, and stores aren't all that
+ // expensive anyway.
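+  //
+  // In the common (non-Win64) case the emitted code is, roughly:
+  //
+  //   testb %al, %al
+  //   je .Lend                  # no vector args were passed
+  //   movaps %xmm0, off+0(FrameIdx)
+  //   movaps %xmm1, off+16(FrameIdx)
+  //   ...
+  // .Lend:
+  //
+  // (register and label names illustrative)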
+
+ // Create the new basic blocks. One block contains all the XMM stores,
+ // and one block is the final destination regardless of whether any
+ // stores were performed.
+ const BasicBlock *LLVM_BB = MBB->getBasicBlock();
+ MachineFunction *F = MBB->getParent();
+ MachineFunction::iterator MBBIter = ++MBB->getIterator();
+ MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
+ F->insert(MBBIter, XMMSaveMBB);
+ F->insert(MBBIter, EndMBB);
+
+ // Transfer the remainder of MBB and its successor edges to EndMBB.
+ EndMBB->splice(EndMBB->begin(), MBB,
+ std::next(MachineBasicBlock::iterator(MI)), MBB->end());
+ EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
+
+ // The original block will now fall through to the XMM save block.
+ MBB->addSuccessor(XMMSaveMBB);
+ // The XMMSaveMBB will fall through to the end block.
+ XMMSaveMBB->addSuccessor(EndMBB);
+
+ // Now add the instructions.
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+ DebugLoc DL = MI->getDebugLoc();
+
+ unsigned CountReg = MI->getOperand(0).getReg();
+ int64_t RegSaveFrameIndex = MI->getOperand(1).getImm();
+ int64_t VarArgsFPOffset = MI->getOperand(2).getImm();
+
+ if (!Subtarget->isCallingConvWin64(F->getFunction()->getCallingConv())) {
+ // If %al is 0, branch around the XMM save block.
+ BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
+ BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB);
+ MBB->addSuccessor(EndMBB);
+ }
+
+ // Make sure the last operand is EFLAGS, which gets clobbered by the branch
+ // that was just emitted, but clearly shouldn't be "saved".
+ assert((MI->getNumOperands() <= 3 ||
+ !MI->getOperand(MI->getNumOperands() - 1).isReg() ||
+ MI->getOperand(MI->getNumOperands() - 1).getReg() == X86::EFLAGS)
+ && "Expected last argument to be EFLAGS");
+ unsigned MOVOpc = Subtarget->hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr;
+ // In the XMM save block, save all the XMM argument registers.
+ for (int i = 3, e = MI->getNumOperands() - 1; i != e; ++i) {
+ int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
+ MachineMemOperand *MMO = F->getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset),
+ MachineMemOperand::MOStore,
+ /*Size=*/16, /*Align=*/16);
+ BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
+ .addFrameIndex(RegSaveFrameIndex)
+ .addImm(/*Scale=*/1)
+ .addReg(/*IndexReg=*/0)
+ .addImm(/*Disp=*/Offset)
+ .addReg(/*Segment=*/0)
+ .addReg(MI->getOperand(i).getReg())
+ .addMemOperand(MMO);
+ }
+
+ MI->eraseFromParent(); // The pseudo instruction is gone now.
+
+ return EndMBB;
+}
+
+// The EFLAGS operand of SelectItr might be missing a kill marker
+// because there were multiple uses of EFLAGS, and ISel didn't know
+// which to mark. Figure out whether SelectItr should have had a
+// kill marker, and set it if it should. Returns the correct kill
+// marker value.
+static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
+ MachineBasicBlock* BB,
+ const TargetRegisterInfo* TRI) {
+ // Scan forward through BB for a use/def of EFLAGS.
+ MachineBasicBlock::iterator miI(std::next(SelectItr));
+ for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
+ const MachineInstr& mi = *miI;
+ if (mi.readsRegister(X86::EFLAGS))
+ return false;
+ if (mi.definesRegister(X86::EFLAGS))
+ break; // Should have kill-flag - update below.
+ }
+
+ // If we hit the end of the block, check whether EFLAGS is live into a
+ // successor.
+ if (miI == BB->end()) {
+ for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
+ sEnd = BB->succ_end();
+ sItr != sEnd; ++sItr) {
+ MachineBasicBlock* succ = *sItr;
+ if (succ->isLiveIn(X86::EFLAGS))
+ return false;
+ }
+ }
+
+ // We found a def, or hit the end of the basic block and EFLAGS wasn't live
+ // out. SelectMI should have a kill flag on EFLAGS.
+ SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
+ return true;
+}
+
+// Return true if it is OK for this CMOV pseudo-opcode to be cascaded
+// together with other CMOV pseudo-opcodes into a single basic block with a
+// conditional jump around it.
+static bool isCMOVPseudo(MachineInstr *MI) {
+ switch (MI->getOpcode()) {
+ case X86::CMOV_FR32:
+ case X86::CMOV_FR64:
+ case X86::CMOV_GR8:
+ case X86::CMOV_GR16:
+ case X86::CMOV_GR32:
+ case X86::CMOV_RFP32:
+ case X86::CMOV_RFP64:
+ case X86::CMOV_RFP80:
+ case X86::CMOV_V2F64:
+ case X86::CMOV_V2I64:
+ case X86::CMOV_V4F32:
+ case X86::CMOV_V4F64:
+ case X86::CMOV_V4I64:
+ case X86::CMOV_V16F32:
+ case X86::CMOV_V8F32:
+ case X86::CMOV_V8F64:
+ case X86::CMOV_V8I64:
+ case X86::CMOV_V8I1:
+ case X86::CMOV_V16I1:
+ case X86::CMOV_V32I1:
+ case X86::CMOV_V64I1:
+ return true;
+
+ default:
+ return false;
+ }
+}
+
+MachineBasicBlock *
+X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
+ MachineBasicBlock *BB) const {
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+ DebugLoc DL = MI->getDebugLoc();
+
+ // To "insert" a SELECT_CC instruction, we actually have to insert the
+ // diamond control-flow pattern. The incoming instruction knows the
+ // destination vreg to set, the condition code register to branch on, the
+ // true/false values to select between, and a branch opcode to use.
+ const BasicBlock *LLVM_BB = BB->getBasicBlock();
+ MachineFunction::iterator It = ++BB->getIterator();
+
+ // thisMBB:
+ // ...
+ // TrueVal = ...
+ // cmpTY ccX, r1, r2
+ // bCC copy1MBB
+ // fallthrough --> copy0MBB
+ MachineBasicBlock *thisMBB = BB;
+ MachineFunction *F = BB->getParent();
+
+ // This code lowers all pseudo-CMOV instructions. Generally it lowers these
+ // as described above, by inserting a BB, and then making a PHI at the join
+ // point to select the true and false operands of the CMOV in the PHI.
+ //
+ // The code also handles two different cases of multiple CMOV opcodes
+ // in a row.
+ //
+ // Case 1:
+  // In this case, there are multiple CMOVs in a row, all of which are based on
+ // the same condition setting (or the exact opposite condition setting).
+ // In this case we can lower all the CMOVs using a single inserted BB, and
+ // then make a number of PHIs at the join point to model the CMOVs. The only
+  // trickiness here is that in a case like:
+ //
+ // t2 = CMOV cond1 t1, f1
+ // t3 = CMOV cond1 t2, f2
+ //
+ // when rewriting this into PHIs, we have to perform some renaming on the
+ // temps since you cannot have a PHI operand refer to a PHI result earlier
+ // in the same block. The "simple" but wrong lowering would be:
+ //
+ // t2 = PHI t1(BB1), f1(BB2)
+ // t3 = PHI t2(BB1), f2(BB2)
+ //
+ // but clearly t2 is not defined in BB1, so that is incorrect. The proper
+ // renaming is to note that on the path through BB1, t2 is really just a
+ // copy of t1, and do that renaming, properly generating:
+ //
+ // t2 = PHI t1(BB1), f1(BB2)
+ // t3 = PHI t1(BB1), f2(BB2)
+ //
+  // In case 2, we lower cascaded CMOVs such as
+ //
+ // (CMOV (CMOV F, T, cc1), T, cc2)
+ //
+  // to two successive branches. To detect this, we look for another CMOV as
+  // the following instruction.
+ //
+ // Without this, we would add a PHI between the two jumps, which ends up
+ // creating a few copies all around. For instance, for
+ //
+ // (sitofp (zext (fcmp une)))
+ //
+ // we would generate:
+ //
+ // ucomiss %xmm1, %xmm0
+ // movss <1.0f>, %xmm0
+ // movaps %xmm0, %xmm1
+ // jne .LBB5_2
+ // xorps %xmm1, %xmm1
+ // .LBB5_2:
+ // jp .LBB5_4
+ // movaps %xmm1, %xmm0
+ // .LBB5_4:
+ // retq
+ //
+ // because this custom-inserter would have generated:
+ //
+ // A
+ // | \
+ // | B
+ // | /
+ // C
+ // | \
+ // | D
+ // | /
+ // E
+ //
+ // A: X = ...; Y = ...
+ // B: empty
+ // C: Z = PHI [X, A], [Y, B]
+ // D: empty
+ // E: PHI [X, C], [Z, D]
+ //
+ // If we lower both CMOVs in a single step, we can instead generate:
+ //
+ // A
+ // | \
+ // | C
+ // | /|
+ // |/ |
+ // | |
+ // | D
+ // | /
+ // E
+ //
+ // A: X = ...; Y = ...
+ // D: empty
+ // E: PHI [X, A], [X, C], [Y, D]
+ //
+ // Which, in our sitofp/fcmp example, gives us something like:
+ //
+ // ucomiss %xmm1, %xmm0
+ // movss <1.0f>, %xmm0
+ // jne .LBB5_4
+ // jp .LBB5_4
+ // xorps %xmm0, %xmm0
+ // .LBB5_4:
+ // retq
+ //
+ MachineInstr *CascadedCMOV = nullptr;
+ MachineInstr *LastCMOV = MI;
+ X86::CondCode CC = X86::CondCode(MI->getOperand(3).getImm());
+ X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
+ MachineBasicBlock::iterator NextMIIt =
+ std::next(MachineBasicBlock::iterator(MI));
+
+  // First check for case 1, where there are multiple CMOVs with the same
+  // condition. Of the two cases of multiple CMOV lowerings, case 1 reduces
+  // the number of jumps the most.
+
+ if (isCMOVPseudo(MI)) {
+    // See if we have a string of CMOVs with the same condition.
+ while (NextMIIt != BB->end() &&
+ isCMOVPseudo(NextMIIt) &&
+ (NextMIIt->getOperand(3).getImm() == CC ||
+ NextMIIt->getOperand(3).getImm() == OppCC)) {
+ LastCMOV = &*NextMIIt;
+ ++NextMIIt;
+ }
+ }
+
+  // Now check for case 2, but only if we didn't already find case 1, as
+  // indicated by LastCMOV == MI.
+ if (LastCMOV == MI &&
+ NextMIIt != BB->end() && NextMIIt->getOpcode() == MI->getOpcode() &&
+ NextMIIt->getOperand(2).getReg() == MI->getOperand(2).getReg() &&
+ NextMIIt->getOperand(1).getReg() == MI->getOperand(0).getReg()) {
+ CascadedCMOV = &*NextMIIt;
+ }
+
+ MachineBasicBlock *jcc1MBB = nullptr;
+
+ // If we have a cascaded CMOV, we lower it to two successive branches to
+ // the same block. EFLAGS is used by both, so mark it as live in the second.
+ if (CascadedCMOV) {
+ jcc1MBB = F->CreateMachineBasicBlock(LLVM_BB);
+ F->insert(It, jcc1MBB);
+ jcc1MBB->addLiveIn(X86::EFLAGS);
+ }
+
+ MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
+ F->insert(It, copy0MBB);
+ F->insert(It, sinkMBB);
+
+ // If the EFLAGS register isn't dead in the terminator, then claim that it's
+ // live into the sink and copy blocks.
+ const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
+
+ MachineInstr *LastEFLAGSUser = CascadedCMOV ? CascadedCMOV : LastCMOV;
+ if (!LastEFLAGSUser->killsRegister(X86::EFLAGS) &&
+ !checkAndUpdateEFLAGSKill(LastEFLAGSUser, BB, TRI)) {
+ copy0MBB->addLiveIn(X86::EFLAGS);
+ sinkMBB->addLiveIn(X86::EFLAGS);
+ }
+
+ // Transfer the remainder of BB and its successor edges to sinkMBB.
+ sinkMBB->splice(sinkMBB->begin(), BB,
+ std::next(MachineBasicBlock::iterator(LastCMOV)), BB->end());
+ sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+ // Add the true and fallthrough blocks as its successors.
+ if (CascadedCMOV) {
+ // The fallthrough block may be jcc1MBB, if we have a cascaded CMOV.
+ BB->addSuccessor(jcc1MBB);
+
+    // In that case, jcc1MBB will itself fall through to copy0MBB, or
+    // jump to the sinkMBB.
+ jcc1MBB->addSuccessor(copy0MBB);
+ jcc1MBB->addSuccessor(sinkMBB);
+ } else {
+ BB->addSuccessor(copy0MBB);
+ }
+
+ // The true block target of the first (or only) branch is always sinkMBB.
+ BB->addSuccessor(sinkMBB);
+
+ // Create the conditional branch instruction.
+ unsigned Opc = X86::GetCondBranchFromCond(CC);
+ BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);
+
+ if (CascadedCMOV) {
+ unsigned Opc2 = X86::GetCondBranchFromCond(
+ (X86::CondCode)CascadedCMOV->getOperand(3).getImm());
+ BuildMI(jcc1MBB, DL, TII->get(Opc2)).addMBB(sinkMBB);
+ }
+
+ // copy0MBB:
+ // %FalseValue = ...
+ // # fallthrough to sinkMBB
+ copy0MBB->addSuccessor(sinkMBB);
+
+ // sinkMBB:
+ // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
+ // ...
+ MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
+ MachineBasicBlock::iterator MIItEnd =
+ std::next(MachineBasicBlock::iterator(LastCMOV));
+ MachineBasicBlock::iterator SinkInsertionPoint = sinkMBB->begin();
+ DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
+ MachineInstrBuilder MIB;
+
+ // As we are creating the PHIs, we have to be careful if there is more than
+ // one. Later CMOVs may reference the results of earlier CMOVs, but later
+ // PHIs have to reference the individual true/false inputs from earlier PHIs.
+  // That also means that PHI construction must work forward from earlier to
+  // later, and that the code must maintain a mapping from each earlier PHI's
+  // destination register to the registers that went into that PHI.
+
+ for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
+ unsigned DestReg = MIIt->getOperand(0).getReg();
+ unsigned Op1Reg = MIIt->getOperand(1).getReg();
+ unsigned Op2Reg = MIIt->getOperand(2).getReg();
+
+ // If this CMOV we are generating is the opposite condition from
+ // the jump we generated, then we have to swap the operands for the
+ // PHI that is going to be generated.
+ if (MIIt->getOperand(3).getImm() == OppCC)
+ std::swap(Op1Reg, Op2Reg);
+
+ if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end())
+ Op1Reg = RegRewriteTable[Op1Reg].first;
+
+ if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end())
+ Op2Reg = RegRewriteTable[Op2Reg].second;
+
+ MIB = BuildMI(*sinkMBB, SinkInsertionPoint, DL,
+ TII->get(X86::PHI), DestReg)
+ .addReg(Op1Reg).addMBB(copy0MBB)
+ .addReg(Op2Reg).addMBB(thisMBB);
+
+ // Add this PHI to the rewrite table.
+ RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
+ }
+
+ // If we have a cascaded CMOV, the second Jcc provides the same incoming
+ // value as the first Jcc (the True operand of the SELECT_CC/CMOV nodes).
+ if (CascadedCMOV) {
+ MIB.addReg(MI->getOperand(2).getReg()).addMBB(jcc1MBB);
+ // Copy the PHI result to the register defined by the second CMOV.
+ BuildMI(*sinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())),
+ DL, TII->get(TargetOpcode::COPY),
+ CascadedCMOV->getOperand(0).getReg())
+ .addReg(MI->getOperand(0).getReg());
+ CascadedCMOV->eraseFromParent();
+ }
+
+ // Now remove the CMOV(s).
+ for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; )
+ (MIIt++)->eraseFromParent();
+
+ return sinkMBB;
+}
+
+MachineBasicBlock *
+X86TargetLowering::EmitLoweredAtomicFP(MachineInstr *MI,
+ MachineBasicBlock *BB) const {
+ // Combine the following atomic floating-point modification pattern:
+ // a.store(reg OP a.load(acquire), release)
+  // Transform it into:
+ // OPss (%gpr), %xmm
+ // movss %xmm, (%gpr)
+ // Or sd equivalent for 64-bit operations.
+ unsigned MOp, FOp;
+ switch (MI->getOpcode()) {
+ default: llvm_unreachable("unexpected instr type for EmitLoweredAtomicFP");
+ case X86::RELEASE_FADD32mr: MOp = X86::MOVSSmr; FOp = X86::ADDSSrm; break;
+ case X86::RELEASE_FADD64mr: MOp = X86::MOVSDmr; FOp = X86::ADDSDrm; break;
+ }
+ const X86InstrInfo *TII = Subtarget->getInstrInfo();
+ DebugLoc DL = MI->getDebugLoc();
+ MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+ MachineOperand MSrc = MI->getOperand(0);
+ unsigned VSrc = MI->getOperand(5).getReg();
+ const MachineOperand &Disp = MI->getOperand(3);
+ MachineOperand ZeroDisp = MachineOperand::CreateImm(0);
+ bool hasDisp = Disp.isGlobal() || Disp.isImm();
+ if (hasDisp && MSrc.isReg())
+ MSrc.setIsKill(false);
+ MachineInstrBuilder MIM = BuildMI(*BB, MI, DL, TII->get(MOp))
+ .addOperand(/*Base=*/MSrc)
+ .addImm(/*Scale=*/1)
+ .addReg(/*Index=*/0)
+ .addDisp(hasDisp ? Disp : ZeroDisp, /*off=*/0)
+ .addReg(0);
+ MachineInstr *MIO = BuildMI(*BB, (MachineInstr *)MIM, DL, TII->get(FOp),
+ MRI.createVirtualRegister(MRI.getRegClass(VSrc)))
+ .addReg(VSrc)
+ .addOperand(/*Base=*/MSrc)
+ .addImm(/*Scale=*/1)
+ .addReg(/*Index=*/0)
+ .addDisp(hasDisp ? Disp : ZeroDisp, /*off=*/0)
+ .addReg(/*Segment=*/0);
+ MIM.addReg(MIO->getOperand(0).getReg(), RegState::Kill);
+ MI->eraseFromParent(); // The pseudo instruction is gone now.
+ return BB;
+}
+
+MachineBasicBlock *
+X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI,
+ MachineBasicBlock *BB) const {
+ MachineFunction *MF = BB->getParent();
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+ DebugLoc DL = MI->getDebugLoc();
+ const BasicBlock *LLVM_BB = BB->getBasicBlock();
+
+ assert(MF->shouldSplitStack());
+
+ const bool Is64Bit = Subtarget->is64Bit();
+ const bool IsLP64 = Subtarget->isTarget64BitLP64();
+
+ const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
+ const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
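+  // These offsets locate the per-thread stack limit slot used by LLVM's
+  // segmented-stack scheme: %fs:0x70 on x86-64 LP64, %fs:0x40 on x32, and
+  // %gs:0x30 on 32-bit x86.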
+
+ // BB:
+ // ... [Till the alloca]
+ // If stacklet is not large enough, jump to mallocMBB
+ //
+ // bumpMBB:
+ // Allocate by subtracting from RSP
+ // Jump to continueMBB
+ //
+ // mallocMBB:
+ // Allocate by call to runtime
+ //
+ // continueMBB:
+ // ...
+ // [rest of original BB]
+ //
+
+ MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ const TargetRegisterClass *AddrRegClass =
+ getRegClassFor(getPointerTy(MF->getDataLayout()));
+
+ unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
+ bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
+ tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
+ SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
+ sizeVReg = MI->getOperand(1).getReg(),
+ physSPReg = IsLP64 || Subtarget->isTargetNaCl64() ? X86::RSP : X86::ESP;
+
+ MachineFunction::iterator MBBIter = ++BB->getIterator();
+
+ MF->insert(MBBIter, bumpMBB);
+ MF->insert(MBBIter, mallocMBB);
+ MF->insert(MBBIter, continueMBB);
+
+ continueMBB->splice(continueMBB->begin(), BB,
+ std::next(MachineBasicBlock::iterator(MI)), BB->end());
+ continueMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+  // Add code to the main basic block to check if the stack limit has been
+  // hit, and if so, jump to mallocMBB; otherwise fall through to bumpMBB.
+ BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
+ BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
+ .addReg(tmpSPVReg).addReg(sizeVReg);
+ BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
+ .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
+ .addReg(SPLimitVReg);
+ BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB);
+
+ // bumpMBB simply decreases the stack pointer, since we know the current
+ // stacklet has enough space.
+ BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
+ .addReg(SPLimitVReg);
+ BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
+ .addReg(SPLimitVReg);
+ BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
+
+  // mallocMBB calls into a routine in libgcc to allocate more space from
+  // the heap.
+ const uint32_t *RegMask =
+ Subtarget->getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
+ if (IsLP64) {
+ BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
+ .addReg(sizeVReg);
+ BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
+ .addExternalSymbol("__morestack_allocate_stack_space")
+ .addRegMask(RegMask)
+ .addReg(X86::RDI, RegState::Implicit)
+ .addReg(X86::RAX, RegState::ImplicitDefine);
+ } else if (Is64Bit) {
+ BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
+ .addReg(sizeVReg);
+ BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
+ .addExternalSymbol("__morestack_allocate_stack_space")
+ .addRegMask(RegMask)
+ .addReg(X86::EDI, RegState::Implicit)
+ .addReg(X86::EAX, RegState::ImplicitDefine);
+ } else {
+ BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
+ .addImm(12);
+ BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
+ BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
+ .addExternalSymbol("__morestack_allocate_stack_space")
+ .addRegMask(RegMask)
+ .addReg(X86::EAX, RegState::ImplicitDefine);
+ }
+
+ if (!Is64Bit)
+ BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
+ .addImm(16);
+
+ BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
+ .addReg(IsLP64 ? X86::RAX : X86::EAX);
+ BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
+
+ // Set up the CFG correctly.
+ BB->addSuccessor(bumpMBB);
+ BB->addSuccessor(mallocMBB);
+ mallocMBB->addSuccessor(continueMBB);
+ bumpMBB->addSuccessor(continueMBB);
+
+ // Take care of the PHI nodes.
+ BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
+ MI->getOperand(0).getReg())
+ .addReg(mallocPtrVReg).addMBB(mallocMBB)
+ .addReg(bumpSPPtrVReg).addMBB(bumpMBB);
+
+ // Delete the original pseudo instruction.
+ MI->eraseFromParent();
+
+ // And we're done.
+ return continueMBB;
+}
+
+MachineBasicBlock *
+X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI,
+ MachineBasicBlock *BB) const {
+ assert(!Subtarget->isTargetMachO());
+ DebugLoc DL = MI->getDebugLoc();
+ MachineInstr *ResumeMI = Subtarget->getFrameLowering()->emitStackProbe(
+ *BB->getParent(), *BB, MI, DL, false);
+ MachineBasicBlock *ResumeBB = ResumeMI->getParent();
+ MI->eraseFromParent(); // The pseudo instruction is gone now.
+ return ResumeBB;
+}
+
+MachineBasicBlock *
+X86TargetLowering::EmitLoweredCatchRet(MachineInstr *MI,
+ MachineBasicBlock *BB) const {
+ MachineFunction *MF = BB->getParent();
+ const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
+ MachineBasicBlock *TargetMBB = MI->getOperand(0).getMBB();
+ DebugLoc DL = MI->getDebugLoc();
+
+ assert(!isAsynchronousEHPersonality(
+ classifyEHPersonality(MF->getFunction()->getPersonalityFn())) &&
+ "SEH does not use catchret!");
+
+ // Only 32-bit EH needs to worry about manually restoring stack pointers.
+ if (!Subtarget->is32Bit())
+ return BB;
+
+ // C++ EH creates a new target block to hold the restore code, and wires up
+ // the new block to the return destination with a normal JMP_4.
+ MachineBasicBlock *RestoreMBB =
+ MF->CreateMachineBasicBlock(BB->getBasicBlock());
+ assert(BB->succ_size() == 1);
+ MF->insert(std::next(BB->getIterator()), RestoreMBB);
+ RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
+ BB->addSuccessor(RestoreMBB);
+ MI->getOperand(0).setMBB(RestoreMBB);
+
+ auto RestoreMBBI = RestoreMBB->begin();
+ BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::EH_RESTORE));
+ BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB);
+ return BB;
+}
+
+MachineBasicBlock *
+X86TargetLowering::EmitLoweredCatchPad(MachineInstr *MI,
+ MachineBasicBlock *BB) const {
+ MachineFunction *MF = BB->getParent();
+ const Constant *PerFn = MF->getFunction()->getPersonalityFn();
+ bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality(PerFn));
+ // Only 32-bit SEH requires special handling for catchpad.
+ if (IsSEH && Subtarget->is32Bit()) {
+ const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
+ DebugLoc DL = MI->getDebugLoc();
+ BuildMI(*BB, MI, DL, TII.get(X86::EH_RESTORE));
+ }
+ MI->eraseFromParent();
+ return BB;
+}
+
+MachineBasicBlock *
+X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI,
+ MachineBasicBlock *BB) const {
+  // This is pretty easy. We take the value that we received from our load
+  // of the relocation, stick it in either RDI (x86-64) or EAX (x86-32), and
+  // make an indirect call. The return value will then be in the normal
+  // return register.
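+  //
+  // On x86-64, for example, the emitted sequence is roughly:
+  //
+  //   movq _var@TLVP(%rip), %rdi
+  //   callq *(%rdi)
+  //
+  // (The exact base register and relocation depend on the target flags on
+  // the global operand below.)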
+ MachineFunction *F = BB->getParent();
+ const X86InstrInfo *TII = Subtarget->getInstrInfo();
+ DebugLoc DL = MI->getDebugLoc();
+
+ assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?");
+ assert(MI->getOperand(3).isGlobal() && "This should be a global");
+
+ // Get a register mask for the lowered call.
+ // FIXME: The 32-bit calls have non-standard calling conventions. Use a
+ // proper register mask.
+ const uint32_t *RegMask =
+ Subtarget->is64Bit() ?
+ Subtarget->getRegisterInfo()->getDarwinTLSCallPreservedMask() :
+ Subtarget->getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
+ if (Subtarget->is64Bit()) {
+ MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
+ TII->get(X86::MOV64rm), X86::RDI)
+ .addReg(X86::RIP)
+ .addImm(0).addReg(0)
+ .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
+ MI->getOperand(3).getTargetFlags())
+ .addReg(0);
+ MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
+ addDirectMem(MIB, X86::RDI);
+ MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
+ } else if (F->getTarget().getRelocationModel() != Reloc::PIC_) {
+ MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
+ TII->get(X86::MOV32rm), X86::EAX)
+ .addReg(0)
+ .addImm(0).addReg(0)
+ .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
+ MI->getOperand(3).getTargetFlags())
+ .addReg(0);
+ MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
+ addDirectMem(MIB, X86::EAX);
+ MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
+ } else {
+ MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
+ TII->get(X86::MOV32rm), X86::EAX)
+ .addReg(TII->getGlobalBaseReg(F))
+ .addImm(0).addReg(0)
+ .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
+ MI->getOperand(3).getTargetFlags())
+ .addReg(0);
+ MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
+ addDirectMem(MIB, X86::EAX);
+ MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
+ }
+
+ MI->eraseFromParent(); // The pseudo instruction is gone now.
+ return BB;
+}
+
+MachineBasicBlock *
+X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
+ MachineBasicBlock *MBB) const {
+ DebugLoc DL = MI->getDebugLoc();
+ MachineFunction *MF = MBB->getParent();
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+
+ const BasicBlock *BB = MBB->getBasicBlock();
+ MachineFunction::iterator I = ++MBB->getIterator();
+
+ // Memory Reference
+ MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
+ MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
+
+ unsigned DstReg;
+ unsigned MemOpndSlot = 0;
+
+ unsigned CurOp = 0;
+
+ DstReg = MI->getOperand(CurOp++).getReg();
+ const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
+ assert(RC->hasType(MVT::i32) && "Invalid destination!");
+ unsigned mainDstReg = MRI.createVirtualRegister(RC);
+ unsigned restoreDstReg = MRI.createVirtualRegister(RC);
+
+ MemOpndSlot = CurOp;
+
+ MVT PVT = getPointerTy(MF->getDataLayout());
+ assert((PVT == MVT::i64 || PVT == MVT::i32) &&
+ "Invalid Pointer Size!");
+
+ // For v = setjmp(buf), we generate
+ //
+ // thisMBB:
+ // buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
+ // SjLjSetup restoreMBB
+ //
+ // mainMBB:
+ // v_main = 0
+ //
+ // sinkMBB:
+ // v = phi(main, restore)
+ //
+ // restoreMBB:
+ // if base pointer being used, load it from frame
+ // v_restore = 1
+
+ MachineBasicBlock *thisMBB = MBB;
+ MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
+ MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
+ MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
+ MF->insert(I, mainMBB);
+ MF->insert(I, sinkMBB);
+ MF->push_back(restoreMBB);
+ restoreMBB->setHasAddressTaken();
+
+ MachineInstrBuilder MIB;
+
+ // Transfer the remainder of BB and its successor edges to sinkMBB.
+ sinkMBB->splice(sinkMBB->begin(), MBB,
+ std::next(MachineBasicBlock::iterator(MI)), MBB->end());
+ sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
+
+ // thisMBB:
+ unsigned PtrStoreOpc = 0;
+ unsigned LabelReg = 0;
+ const int64_t LabelOffset = 1 * PVT.getStoreSize();
+ Reloc::Model RM = MF->getTarget().getRelocationModel();
+ bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
+ (RM == Reloc::Static || RM == Reloc::DynamicNoPIC);
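+  // Under the small code model with static relocations, the address of
+  // restoreMBB fits in a 32-bit immediate and can be stored directly;
+  // otherwise it must first be materialized into a register with a LEA.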
+
+ // Prepare IP either in reg or imm.
+ if (!UseImmLabel) {
+ PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
+ const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
+ LabelReg = MRI.createVirtualRegister(PtrRC);
+ if (Subtarget->is64Bit()) {
+ MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
+ .addReg(X86::RIP)
+ .addImm(0)
+ .addReg(0)
+ .addMBB(restoreMBB)
+ .addReg(0);
+ } else {
+ const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
+ MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
+ .addReg(XII->getGlobalBaseReg(MF))
+ .addImm(0)
+ .addReg(0)
+ .addMBB(restoreMBB, Subtarget->ClassifyBlockAddressReference())
+ .addReg(0);
+ }
+ } else
+ PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
+ // Store IP
+ MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
+ for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
+ if (i == X86::AddrDisp)
+ MIB.addDisp(MI->getOperand(MemOpndSlot + i), LabelOffset);
+ else
+ MIB.addOperand(MI->getOperand(MemOpndSlot + i));
+ }
+ if (!UseImmLabel)
+ MIB.addReg(LabelReg);
+ else
+ MIB.addMBB(restoreMBB);
+ MIB.setMemRefs(MMOBegin, MMOEnd);
+ // Setup
+ MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
+ .addMBB(restoreMBB);
+
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
+ MIB.addRegMask(RegInfo->getNoPreservedMask());
+ thisMBB->addSuccessor(mainMBB);
+ thisMBB->addSuccessor(restoreMBB);
+
+ // mainMBB:
+  //  v_main = 0
+ BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
+ mainMBB->addSuccessor(sinkMBB);
+
+ // sinkMBB:
+ BuildMI(*sinkMBB, sinkMBB->begin(), DL,
+ TII->get(X86::PHI), DstReg)
+ .addReg(mainDstReg).addMBB(mainMBB)
+ .addReg(restoreDstReg).addMBB(restoreMBB);
+
+ // restoreMBB:
+ if (RegInfo->hasBasePointer(*MF)) {
+ const bool Uses64BitFramePtr =
+ Subtarget->isTarget64BitLP64() || Subtarget->isTargetNaCl64();
+ X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
+ X86FI->setRestoreBasePointer(MF);
+ unsigned FramePtr = RegInfo->getFrameRegister(*MF);
+ unsigned BasePtr = RegInfo->getBaseRegister();
+ unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
+ addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
+ FramePtr, true, X86FI->getRestoreBasePointerOffset())
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+ BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
+ BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
+ restoreMBB->addSuccessor(sinkMBB);
+
+ MI->eraseFromParent();
+ return sinkMBB;
+}
+
+MachineBasicBlock *
+X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
+ MachineBasicBlock *MBB) const {
+ DebugLoc DL = MI->getDebugLoc();
+ MachineFunction *MF = MBB->getParent();
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+
+ // Memory Reference
+ MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
+ MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
+
+ MVT PVT = getPointerTy(MF->getDataLayout());
+ assert((PVT == MVT::i64 || PVT == MVT::i32) &&
+ "Invalid Pointer Size!");
+
+ const TargetRegisterClass *RC =
+ (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
+ unsigned Tmp = MRI.createVirtualRegister(RC);
+  // Since FP is only updated here but NOT referenced, it's treated as a GPR.
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
+ unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
+ unsigned SP = RegInfo->getStackRegister();
+
+ MachineInstrBuilder MIB;
+
+ const int64_t LabelOffset = 1 * PVT.getStoreSize();
+ const int64_t SPOffset = 2 * PVT.getStoreSize();
+
+ unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
+ unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
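+  // The buffer layout assumed here (pointer-sized slots): slot 0 holds FP,
+  // slot 1 (LabelOffset) holds the resume IP stored by emitEHSjLjSetJmp, and
+  // slot 2 (SPOffset) holds SP. On x86-64 the code below is therefore,
+  // roughly:
+  //   movq 0(buf), %rbp
+  //   movq 8(buf), %tmp
+  //   movq 16(buf), %rsp
+  //   jmpq *%tmp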
+
+ // Reload FP
+ MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP);
+ for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
+ MIB.addOperand(MI->getOperand(i));
+ MIB.setMemRefs(MMOBegin, MMOEnd);
+ // Reload IP
+ MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
+ for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
+ if (i == X86::AddrDisp)
+ MIB.addDisp(MI->getOperand(i), LabelOffset);
+ else
+ MIB.addOperand(MI->getOperand(i));
+ }
+ MIB.setMemRefs(MMOBegin, MMOEnd);
+ // Reload SP
+ MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP);
+ for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
+ if (i == X86::AddrDisp)
+ MIB.addDisp(MI->getOperand(i), SPOffset);
+ else
+ MIB.addOperand(MI->getOperand(i));
+ }
+ MIB.setMemRefs(MMOBegin, MMOEnd);
+ // Jump
+ BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
+
+ MI->eraseFromParent();
+ return MBB;
+}
+
+// Replace 213-type (isel default) FMA3 instructions with 231-type for
+// accumulator loops. Writing back to the accumulator allows the coalescer
+// to remove extra copies in the loop.
+// FIXME: Do this on AVX512. We don't support 231 variants yet (PR23937).
+MachineBasicBlock *
+X86TargetLowering::emitFMA3Instr(MachineInstr *MI,
+ MachineBasicBlock *MBB) const {
+ MachineOperand &AddendOp = MI->getOperand(3);
+
+ // Bail out early if the addend isn't a register - we can't switch these.
+ if (!AddendOp.isReg())
+ return MBB;
+
+ MachineFunction &MF = *MBB->getParent();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ // Check whether the addend is defined by a PHI:
+ assert(MRI.hasOneDef(AddendOp.getReg()) && "Multiple defs in SSA?");
+ MachineInstr &AddendDef = *MRI.def_instr_begin(AddendOp.getReg());
+ if (!AddendDef.isPHI())
+ return MBB;
+
+ // Look for the following pattern:
+ // loop:
+ // %addend = phi [%entry, 0], [%loop, %result]
+ // ...
+ // %result<tied1> = FMA213 %m2<tied0>, %m1, %addend
+
+ // Replace with:
+ // loop:
+ // %addend = phi [%entry, 0], [%loop, %result]
+ // ...
+ // %result<tied1> = FMA231 %addend<tied0>, %m1, %m2
+
+ for (unsigned i = 1, e = AddendDef.getNumOperands(); i < e; i += 2) {
+ assert(AddendDef.getOperand(i).isReg());
+ MachineOperand PHISrcOp = AddendDef.getOperand(i);
+ MachineInstr &PHISrcInst = *MRI.def_instr_begin(PHISrcOp.getReg());
+ if (&PHISrcInst == MI) {
+ // Found a matching instruction.
+ unsigned NewFMAOpc = 0;
+ switch (MI->getOpcode()) {
+ case X86::VFMADDPDr213r: NewFMAOpc = X86::VFMADDPDr231r; break;
+ case X86::VFMADDPSr213r: NewFMAOpc = X86::VFMADDPSr231r; break;
+ case X86::VFMADDSDr213r: NewFMAOpc = X86::VFMADDSDr231r; break;
+ case X86::VFMADDSSr213r: NewFMAOpc = X86::VFMADDSSr231r; break;
+ case X86::VFMSUBPDr213r: NewFMAOpc = X86::VFMSUBPDr231r; break;
+ case X86::VFMSUBPSr213r: NewFMAOpc = X86::VFMSUBPSr231r; break;
+ case X86::VFMSUBSDr213r: NewFMAOpc = X86::VFMSUBSDr231r; break;
+ case X86::VFMSUBSSr213r: NewFMAOpc = X86::VFMSUBSSr231r; break;
+ case X86::VFNMADDPDr213r: NewFMAOpc = X86::VFNMADDPDr231r; break;
+ case X86::VFNMADDPSr213r: NewFMAOpc = X86::VFNMADDPSr231r; break;
+ case X86::VFNMADDSDr213r: NewFMAOpc = X86::VFNMADDSDr231r; break;
+ case X86::VFNMADDSSr213r: NewFMAOpc = X86::VFNMADDSSr231r; break;
+ case X86::VFNMSUBPDr213r: NewFMAOpc = X86::VFNMSUBPDr231r; break;
+ case X86::VFNMSUBPSr213r: NewFMAOpc = X86::VFNMSUBPSr231r; break;
+ case X86::VFNMSUBSDr213r: NewFMAOpc = X86::VFNMSUBSDr231r; break;
+ case X86::VFNMSUBSSr213r: NewFMAOpc = X86::VFNMSUBSSr231r; break;
+ case X86::VFMADDSUBPDr213r: NewFMAOpc = X86::VFMADDSUBPDr231r; break;
+ case X86::VFMADDSUBPSr213r: NewFMAOpc = X86::VFMADDSUBPSr231r; break;
+ case X86::VFMSUBADDPDr213r: NewFMAOpc = X86::VFMSUBADDPDr231r; break;
+ case X86::VFMSUBADDPSr213r: NewFMAOpc = X86::VFMSUBADDPSr231r; break;
+
+ case X86::VFMADDPDr213rY: NewFMAOpc = X86::VFMADDPDr231rY; break;
+ case X86::VFMADDPSr213rY: NewFMAOpc = X86::VFMADDPSr231rY; break;
+ case X86::VFMSUBPDr213rY: NewFMAOpc = X86::VFMSUBPDr231rY; break;
+ case X86::VFMSUBPSr213rY: NewFMAOpc = X86::VFMSUBPSr231rY; break;
+ case X86::VFNMADDPDr213rY: NewFMAOpc = X86::VFNMADDPDr231rY; break;
+ case X86::VFNMADDPSr213rY: NewFMAOpc = X86::VFNMADDPSr231rY; break;
+ case X86::VFNMSUBPDr213rY: NewFMAOpc = X86::VFNMSUBPDr231rY; break;
+ case X86::VFNMSUBPSr213rY: NewFMAOpc = X86::VFNMSUBPSr231rY; break;
+ case X86::VFMADDSUBPDr213rY: NewFMAOpc = X86::VFMADDSUBPDr231rY; break;
+ case X86::VFMADDSUBPSr213rY: NewFMAOpc = X86::VFMADDSUBPSr231rY; break;
+ case X86::VFMSUBADDPDr213rY: NewFMAOpc = X86::VFMSUBADDPDr231rY; break;
+ case X86::VFMSUBADDPSr213rY: NewFMAOpc = X86::VFMSUBADDPSr231rY; break;
+ default: llvm_unreachable("Unrecognized FMA variant.");
+ }
+
+ const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
+ MachineInstrBuilder MIB =
+ BuildMI(MF, MI->getDebugLoc(), TII.get(NewFMAOpc))
+ .addOperand(MI->getOperand(0))
+ .addOperand(MI->getOperand(3))
+ .addOperand(MI->getOperand(2))
+ .addOperand(MI->getOperand(1));
+ MBB->insert(MachineBasicBlock::iterator(MI), MIB);
+ MI->eraseFromParent();
+ }
+ }
+
+ return MBB;
+}
+
+MachineBasicBlock *
+X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
+ MachineBasicBlock *BB) const {
+ switch (MI->getOpcode()) {
+ default: llvm_unreachable("Unexpected instr type to insert");
+ case X86::TAILJMPd64:
+ case X86::TAILJMPr64:
+ case X86::TAILJMPm64:
+ case X86::TAILJMPd64_REX:
+ case X86::TAILJMPr64_REX:
+ case X86::TAILJMPm64_REX:
+ llvm_unreachable("TAILJMP64 would not be touched here.");
+ case X86::TCRETURNdi64:
+ case X86::TCRETURNri64:
+ case X86::TCRETURNmi64:
+ return BB;
+ case X86::WIN_ALLOCA:
+ return EmitLoweredWinAlloca(MI, BB);
+ case X86::CATCHRET:
+ return EmitLoweredCatchRet(MI, BB);
+ case X86::CATCHPAD:
+ return EmitLoweredCatchPad(MI, BB);
+ case X86::SEG_ALLOCA_32:
+ case X86::SEG_ALLOCA_64:
+ return EmitLoweredSegAlloca(MI, BB);
+ case X86::TLSCall_32:
+ case X86::TLSCall_64:
+ return EmitLoweredTLSCall(MI, BB);
+ case X86::CMOV_FR32:
+ case X86::CMOV_FR64:
+ case X86::CMOV_FR128:
+ case X86::CMOV_GR8:
+ case X86::CMOV_GR16:
+ case X86::CMOV_GR32:
+ case X86::CMOV_RFP32:
+ case X86::CMOV_RFP64:
+ case X86::CMOV_RFP80:
+ case X86::CMOV_V2F64:
+ case X86::CMOV_V2I64:
+ case X86::CMOV_V4F32:
+ case X86::CMOV_V4F64:
+ case X86::CMOV_V4I64:
+ case X86::CMOV_V16F32:
+ case X86::CMOV_V8F32:
+ case X86::CMOV_V8F64:
+ case X86::CMOV_V8I64:
+ case X86::CMOV_V8I1:
+ case X86::CMOV_V16I1:
+ case X86::CMOV_V32I1:
+ case X86::CMOV_V64I1:
+ return EmitLoweredSelect(MI, BB);
+
+ case X86::RDFLAGS32:
+ case X86::RDFLAGS64: {
+ DebugLoc DL = MI->getDebugLoc();
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+ unsigned PushF =
+ MI->getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
+ unsigned Pop =
+ MI->getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
+ BuildMI(*BB, MI, DL, TII->get(PushF));
+ BuildMI(*BB, MI, DL, TII->get(Pop), MI->getOperand(0).getReg());
+
+ MI->eraseFromParent(); // The pseudo is gone now.
+ return BB;
+ }
+
+ case X86::WRFLAGS32:
+ case X86::WRFLAGS64: {
+ DebugLoc DL = MI->getDebugLoc();
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+ unsigned Push =
+ MI->getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;
+ unsigned PopF =
+ MI->getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64;
+ BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI->getOperand(0).getReg());
+ BuildMI(*BB, MI, DL, TII->get(PopF));
+
+ MI->eraseFromParent(); // The pseudo is gone now.
+ return BB;
+ }
+
+ case X86::RELEASE_FADD32mr:
+ case X86::RELEASE_FADD64mr:
+ return EmitLoweredAtomicFP(MI, BB);
+
+ case X86::FP32_TO_INT16_IN_MEM:
+ case X86::FP32_TO_INT32_IN_MEM:
+ case X86::FP32_TO_INT64_IN_MEM:
+ case X86::FP64_TO_INT16_IN_MEM:
+ case X86::FP64_TO_INT32_IN_MEM:
+ case X86::FP64_TO_INT64_IN_MEM:
+ case X86::FP80_TO_INT16_IN_MEM:
+ case X86::FP80_TO_INT32_IN_MEM:
+ case X86::FP80_TO_INT64_IN_MEM: {
+ MachineFunction *F = BB->getParent();
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+ DebugLoc DL = MI->getDebugLoc();
+
+ // Change the floating point control register to use "round towards zero"
+ // mode when truncating to an integer value.
+ int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false);
+ addFrameReference(BuildMI(*BB, MI, DL,
+ TII->get(X86::FNSTCW16m)), CWFrameIdx);
+
+    // Load the old value of the control word...
+ unsigned OldCW =
+ F->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
+ addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
+ CWFrameIdx);
+
+    // Set the rounding mode to round-towards-zero (RC bits 11:10 = 11b)...
+ addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
+ .addImm(0xC7F);
+
+ // Reload the modified control word now...
+ addFrameReference(BuildMI(*BB, MI, DL,
+ TII->get(X86::FLDCW16m)), CWFrameIdx);
+
+    // Restore the memory image of the control word to its original value
+ addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
+ .addReg(OldCW);
+
+ // Get the X86 opcode to use.
+ unsigned Opc;
+ switch (MI->getOpcode()) {
+ default: llvm_unreachable("illegal opcode!");
+ case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
+ case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
+ case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
+ case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
+ case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
+ case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
+ case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
+ case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
+ case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
+ }
+
+ X86AddressMode AM;
+ MachineOperand &Op = MI->getOperand(0);
+ if (Op.isReg()) {
+ AM.BaseType = X86AddressMode::RegBase;
+ AM.Base.Reg = Op.getReg();
+ } else {
+ AM.BaseType = X86AddressMode::FrameIndexBase;
+ AM.Base.FrameIndex = Op.getIndex();
+ }
+ Op = MI->getOperand(1);
+ if (Op.isImm())
+ AM.Scale = Op.getImm();
+ Op = MI->getOperand(2);
+ if (Op.isImm())
+ AM.IndexReg = Op.getImm();
+ Op = MI->getOperand(3);
+ if (Op.isGlobal()) {
+ AM.GV = Op.getGlobal();
+ } else {
+ AM.Disp = Op.getImm();
+ }
+ addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
+ .addReg(MI->getOperand(X86::AddrNumOperands).getReg());
+
+ // Reload the original control word now.
+ addFrameReference(BuildMI(*BB, MI, DL,
+ TII->get(X86::FLDCW16m)), CWFrameIdx);
+
+ MI->eraseFromParent(); // The pseudo instruction is gone now.
+ return BB;
+ }
+ // String/text processing lowering.
+ case X86::PCMPISTRM128REG:
+ case X86::VPCMPISTRM128REG:
+ case X86::PCMPISTRM128MEM:
+ case X86::VPCMPISTRM128MEM:
+ case X86::PCMPESTRM128REG:
+ case X86::VPCMPESTRM128REG:
+ case X86::PCMPESTRM128MEM:
+ case X86::VPCMPESTRM128MEM:
+ assert(Subtarget->hasSSE42() &&
+ "Target must have SSE4.2 or AVX features enabled");
+ return EmitPCMPSTRM(MI, BB, Subtarget->getInstrInfo());
+
+ // String/text processing lowering.
+ case X86::PCMPISTRIREG:
+ case X86::VPCMPISTRIREG:
+ case X86::PCMPISTRIMEM:
+ case X86::VPCMPISTRIMEM:
+ case X86::PCMPESTRIREG:
+ case X86::VPCMPESTRIREG:
+ case X86::PCMPESTRIMEM:
+ case X86::VPCMPESTRIMEM:
+ assert(Subtarget->hasSSE42() &&
+ "Target must have SSE4.2 or AVX features enabled");
+ return EmitPCMPSTRI(MI, BB, Subtarget->getInstrInfo());
+
+ // Thread synchronization.
+ case X86::MONITOR:
+ return EmitMonitor(MI, BB, Subtarget);
+ // PKU feature
+ case X86::WRPKRU:
+ return EmitWRPKRU(MI, BB, Subtarget);
+ case X86::RDPKRU:
+ return EmitRDPKRU(MI, BB, Subtarget);
+ // xbegin
+ case X86::XBEGIN:
+ return EmitXBegin(MI, BB, Subtarget->getInstrInfo());
+
+ case X86::VASTART_SAVE_XMM_REGS:
+ return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
+
+ case X86::VAARG_64:
+ return EmitVAARG64WithCustomInserter(MI, BB);
+
+ case X86::EH_SjLj_SetJmp32:
+ case X86::EH_SjLj_SetJmp64:
+ return emitEHSjLjSetJmp(MI, BB);
+
+ case X86::EH_SjLj_LongJmp32:
+ case X86::EH_SjLj_LongJmp64:
+ return emitEHSjLjLongJmp(MI, BB);
+
+ case TargetOpcode::STATEPOINT:
+ // As an implementation detail, STATEPOINT shares the STACKMAP format at
+ // this point in the process. We diverge later.
+ return emitPatchPoint(MI, BB);
+
+ case TargetOpcode::STACKMAP:
+ case TargetOpcode::PATCHPOINT:
+ return emitPatchPoint(MI, BB);
+
+ case X86::VFMADDPDr213r:
+ case X86::VFMADDPSr213r:
+ case X86::VFMADDSDr213r:
+ case X86::VFMADDSSr213r:
+ case X86::VFMSUBPDr213r:
+ case X86::VFMSUBPSr213r:
+ case X86::VFMSUBSDr213r:
+ case X86::VFMSUBSSr213r:
+ case X86::VFNMADDPDr213r:
+ case X86::VFNMADDPSr213r:
+ case X86::VFNMADDSDr213r:
+ case X86::VFNMADDSSr213r:
+ case X86::VFNMSUBPDr213r:
+ case X86::VFNMSUBPSr213r:
+ case X86::VFNMSUBSDr213r:
+ case X86::VFNMSUBSSr213r:
+ case X86::VFMADDSUBPDr213r:
+ case X86::VFMADDSUBPSr213r:
+ case X86::VFMSUBADDPDr213r:
+ case X86::VFMSUBADDPSr213r:
+ case X86::VFMADDPDr213rY:
+ case X86::VFMADDPSr213rY:
+ case X86::VFMSUBPDr213rY:
+ case X86::VFMSUBPSr213rY:
+ case X86::VFNMADDPDr213rY:
+ case X86::VFNMADDPSr213rY:
+ case X86::VFNMSUBPDr213rY:
+ case X86::VFNMSUBPSr213rY:
+ case X86::VFMADDSUBPDr213rY:
+ case X86::VFMADDSUBPSr213rY:
+ case X86::VFMSUBADDPDr213rY:
+ case X86::VFMSUBADDPSr213rY:
+ return emitFMA3Instr(MI, BB);
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// X86 Optimization Hooks
+//===----------------------------------------------------------------------===//
+
+void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
+ APInt &KnownZero,
+ APInt &KnownOne,
+ const SelectionDAG &DAG,
+ unsigned Depth) const {
+ unsigned BitWidth = KnownZero.getBitWidth();
+ unsigned Opc = Op.getOpcode();
+ assert((Opc >= ISD::BUILTIN_OP_END ||
+ Opc == ISD::INTRINSIC_WO_CHAIN ||
+ Opc == ISD::INTRINSIC_W_CHAIN ||
+ Opc == ISD::INTRINSIC_VOID) &&
+ "Should use MaskedValueIsZero if you don't know whether Op"
+ " is a target node!");
+
+ KnownZero = KnownOne = APInt(BitWidth, 0); // Don't know anything.
+ switch (Opc) {
+ default: break;
+ case X86ISD::ADD:
+ case X86ISD::SUB:
+ case X86ISD::ADC:
+ case X86ISD::SBB:
+ case X86ISD::SMUL:
+ case X86ISD::UMUL:
+ case X86ISD::INC:
+ case X86ISD::DEC:
+ case X86ISD::OR:
+ case X86ISD::XOR:
+ case X86ISD::AND:
+ // These nodes' second result is a boolean.
+ if (Op.getResNo() == 0)
+ break;
+ // Fallthrough
+ case X86ISD::SETCC:
+ KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
+ break;
+ case ISD::INTRINSIC_WO_CHAIN: {
+ unsigned IntId = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ unsigned NumLoBits = 0;
+ switch (IntId) {
+ default: break;
+ case Intrinsic::x86_sse_movmsk_ps:
+ case Intrinsic::x86_avx_movmsk_ps_256:
+ case Intrinsic::x86_sse2_movmsk_pd:
+ case Intrinsic::x86_avx_movmsk_pd_256:
+ case Intrinsic::x86_mmx_pmovmskb:
+ case Intrinsic::x86_sse2_pmovmskb_128:
+ case Intrinsic::x86_avx2_pmovmskb: {
+ // High bits of movmskp{s|d}, pmovmskb are known zero.
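+      // For example, movmskps writes a 4-bit mask into a 32-bit result, so
+      // bits [31:4] are known zero.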
+ switch (IntId) {
+ default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
+ case Intrinsic::x86_sse_movmsk_ps: NumLoBits = 4; break;
+ case Intrinsic::x86_avx_movmsk_ps_256: NumLoBits = 8; break;
+ case Intrinsic::x86_sse2_movmsk_pd: NumLoBits = 2; break;
+ case Intrinsic::x86_avx_movmsk_pd_256: NumLoBits = 4; break;
+ case Intrinsic::x86_mmx_pmovmskb: NumLoBits = 8; break;
+ case Intrinsic::x86_sse2_pmovmskb_128: NumLoBits = 16; break;
+ case Intrinsic::x86_avx2_pmovmskb: NumLoBits = 32; break;
+ }
+ KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits);
+ break;
+ }
+ }
+ break;
+ }
+ }
+}
+
+unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
+ SDValue Op,
+ const SelectionDAG &,
+ unsigned Depth) const {
+ // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
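+  // Every bit of an all-zeros or all-ones value is a copy of the sign bit,
+  // so all of the result's bits are sign bits.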
+ if (Op.getOpcode() == X86ISD::SETCC_CARRY)
+ return Op.getValueType().getScalarSizeInBits();
+
+ // Fallback case.
+ return 1;
+}
+
+/// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the
+/// node is a GlobalAddress + offset.
+bool X86TargetLowering::isGAPlusOffset(SDNode *N,
+ const GlobalValue* &GA,
+ int64_t &Offset) const {
+ if (N->getOpcode() == X86ISD::Wrapper) {
+ if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
+ GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
+ Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
+ return true;
+ }
+ }
+ return TargetLowering::isGAPlusOffset(N, GA, Offset);
+}
+
+/// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors.
+/// FIXME: This could be expanded to support 512-bit vectors as well.
+static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget* Subtarget) {
+ SDLoc dl(N);
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+ SDValue V1 = SVOp->getOperand(0);
+ SDValue V2 = SVOp->getOperand(1);
+ MVT VT = SVOp->getSimpleValueType(0);
+ unsigned NumElems = VT.getVectorNumElements();
+
+ if (V1.getOpcode() == ISD::CONCAT_VECTORS &&
+ V2.getOpcode() == ISD::CONCAT_VECTORS) {
+ //
+ // 0,0,0,...
+ // |
+ // V UNDEF BUILD_VECTOR UNDEF
+ // \ / \ /
+ // CONCAT_VECTOR CONCAT_VECTOR
+ // \ /
+ // \ /
+ // RESULT: V + zero extended
+ //
+ if (V2.getOperand(0).getOpcode() != ISD::BUILD_VECTOR ||
+ V2.getOperand(1).getOpcode() != ISD::UNDEF ||
+ V1.getOperand(1).getOpcode() != ISD::UNDEF)
+ return SDValue();
+
+ if (!ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode()))
+ return SDValue();
+
+    // To match the shuffle mask, the first half of the mask should select
+    // exactly the first vector, and the rest should be a splat of the first
+    // element of the second one.
+ for (unsigned i = 0; i != NumElems/2; ++i)
+ if (!isUndefOrEqual(SVOp->getMaskElt(i), i) ||
+ !isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems))
+ return SDValue();
+
+ // If V1 is coming from a vector load then just fold to a VZEXT_LOAD.
+ if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(V1.getOperand(0))) {
+ if (Ld->hasNUsesOfValue(1, 0)) {
+ SDVTList Tys = DAG.getVTList(MVT::v4i64, MVT::Other);
+ SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() };
+ SDValue ResNode =
+ DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
+ Ld->getMemoryVT(),
+ Ld->getPointerInfo(),
+ Ld->getAlignment(),
+ false/*isVolatile*/, true/*ReadMem*/,
+ false/*WriteMem*/);
+
+ // Make sure the newly-created LOAD is in the same position as Ld in
+ // terms of dependency. We create a TokenFactor for Ld and ResNode,
+ // and update uses of Ld's output chain to use the TokenFactor.
+ if (Ld->hasAnyUseOfValue(1)) {
+ SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ SDValue(Ld, 1), SDValue(ResNode.getNode(), 1));
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
+ DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(Ld, 1),
+ SDValue(ResNode.getNode(), 1));
+ }
+
+ return DAG.getBitcast(VT, ResNode);
+ }
+ }
+
+    // Emit a zeroed vector and insert the desired subvector into its
+    // first half.
+ SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
+ SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0), 0, DAG, dl);
+ return DCI.CombineTo(N, InsV);
+ }
+
+ return SDValue();
+}
+
+/// \brief Combine an arbitrary chain of shuffles into a single instruction if
+/// possible.
+///
+/// This is the leaf of the recursive combine below. When we have found some
+/// chain of single-use x86 shuffle instructions and accumulated the combined
+/// shuffle mask represented by them, this will try to pattern match that mask
+/// into either a single instruction if there is a special purpose instruction
+/// for this operation, or into a PSHUFB instruction which is a fully general
+/// instruction but should only be used to replace chains over a certain depth.
+static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
+ int Depth, bool HasPSHUFB, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget *Subtarget) {
+ assert(!Mask.empty() && "Cannot combine an empty shuffle mask!");
+
+  // Find the operand that enters the chain. Note that multiple uses are OK
+  // here; we're not going to remove the operand we find.
+ SDValue Input = Op.getOperand(0);
+ while (Input.getOpcode() == ISD::BITCAST)
+ Input = Input.getOperand(0);
+
+ MVT VT = Input.getSimpleValueType();
+ MVT RootVT = Root.getSimpleValueType();
+ SDLoc DL(Root);
+
+ if (Mask.size() == 1) {
+ int Index = Mask[0];
+ assert((Index >= 0 || Index == SM_SentinelUndef ||
+ Index == SM_SentinelZero) &&
+ "Invalid shuffle index found!");
+
+ // We may end up with an accumulated mask of size 1 as a result of
+ // widening of shuffle operands (see function canWidenShuffleElements).
+ // If the only shuffle index is equal to SM_SentinelZero then propagate
+    // a zero vector. Otherwise, the combined shuffle mask is a no-op shuffle
+ // mask, and therefore the entire chain of shuffles can be folded away.
+ if (Index == SM_SentinelZero)
+ DCI.CombineTo(Root.getNode(), getZeroVector(RootVT, Subtarget, DAG, DL));
+ else
+ DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Input),
+ /*AddTo*/ true);
+ return true;
+ }
+
+ // Use the float domain if the operand type is a floating point type.
+ bool FloatDomain = VT.isFloatingPoint();
+
+ // For floating point shuffles, we don't have free copies in the shuffle
+ // instructions or the ability to load as part of the instruction, so
+ // canonicalize their shuffles to UNPCK or MOV variants.
+ //
+ // Note that even with AVX we prefer the PSHUFD form of shuffle for integer
+ // vectors because it can have a load folded into it that UNPCK cannot. This
+ // doesn't preclude something switching to the shorter encoding post-RA.
+ //
+ // FIXME: Should teach these routines about AVX vector widths.
+ if (FloatDomain && VT.is128BitVector()) {
+ if (Mask.equals({0, 0}) || Mask.equals({1, 1})) {
+ bool Lo = Mask.equals({0, 0});
+ unsigned Shuffle;
+ MVT ShuffleVT;
+ // Check if we have SSE3 which will let us use MOVDDUP. That instruction
+ // is no slower than UNPCKLPD but has the option to fold the input operand
+ // into even an unaligned memory load.
+ if (Lo && Subtarget->hasSSE3()) {
+ Shuffle = X86ISD::MOVDDUP;
+ ShuffleVT = MVT::v2f64;
+ } else {
+ // We have MOVLHPS and MOVHLPS throughout SSE and they encode smaller
+ // than the UNPCK variants.
+ Shuffle = Lo ? X86ISD::MOVLHPS : X86ISD::MOVHLPS;
+ ShuffleVT = MVT::v4f32;
+ }
+ if (Depth == 1 && Root->getOpcode() == Shuffle)
+ return false; // Nothing to do!
+ Op = DAG.getBitcast(ShuffleVT, Input);
+ DCI.AddToWorklist(Op.getNode());
+ if (Shuffle == X86ISD::MOVDDUP)
+ Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op);
+ else
+ Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
+ DCI.AddToWorklist(Op.getNode());
+ DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Op),
+ /*AddTo*/ true);
+ return true;
+ }
+ if (Subtarget->hasSSE3() &&
+ (Mask.equals({0, 0, 2, 2}) || Mask.equals({1, 1, 3, 3}))) {
+ bool Lo = Mask.equals({0, 0, 2, 2});
+ unsigned Shuffle = Lo ? X86ISD::MOVSLDUP : X86ISD::MOVSHDUP;
+ MVT ShuffleVT = MVT::v4f32;
+ if (Depth == 1 && Root->getOpcode() == Shuffle)
+ return false; // Nothing to do!
+ Op = DAG.getBitcast(ShuffleVT, Input);
+ DCI.AddToWorklist(Op.getNode());
+ Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op);
+ DCI.AddToWorklist(Op.getNode());
+ DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Op),
+ /*AddTo*/ true);
+ return true;
+ }
+ if (Mask.equals({0, 0, 1, 1}) || Mask.equals({2, 2, 3, 3})) {
+ bool Lo = Mask.equals({0, 0, 1, 1});
+ unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
+ MVT ShuffleVT = MVT::v4f32;
+ if (Depth == 1 && Root->getOpcode() == Shuffle)
+ return false; // Nothing to do!
+ Op = DAG.getBitcast(ShuffleVT, Input);
+ DCI.AddToWorklist(Op.getNode());
+ Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
+ DCI.AddToWorklist(Op.getNode());
+ DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Op),
+ /*AddTo*/ true);
+ return true;
+ }
+ }
+
+  // We always canonicalize the 8 x i16 and 16 x i8 shuffles into their UNPCK
+  // variants, as none of these patterns have a single-instruction form that
+  // is superior to the UNPCK formulation.
+ if (!FloatDomain && VT.is128BitVector() &&
+ (Mask.equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
+ Mask.equals({4, 4, 5, 5, 6, 6, 7, 7}) ||
+ Mask.equals({0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7}) ||
+ Mask.equals(
+ {8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15}))) {
+ bool Lo = Mask[0] == 0;
+ unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
+ if (Depth == 1 && Root->getOpcode() == Shuffle)
+ return false; // Nothing to do!
+ MVT ShuffleVT;
+ switch (Mask.size()) {
+ case 8:
+ ShuffleVT = MVT::v8i16;
+ break;
+ case 16:
+ ShuffleVT = MVT::v16i8;
+ break;
+ default:
+ llvm_unreachable("Impossible mask size!");
+    }
+ Op = DAG.getBitcast(ShuffleVT, Input);
+ DCI.AddToWorklist(Op.getNode());
+ Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
+ DCI.AddToWorklist(Op.getNode());
+ DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Op),
+ /*AddTo*/ true);
+ return true;
+ }
+
+ // Don't try to re-form single instruction chains under any circumstances now
+ // that we've done encoding canonicalization for them.
+ if (Depth < 2)
+ return false;
+
+ // If we have 3 or more shuffle instructions or a chain involving PSHUFB, we
+ // can replace them with a single PSHUFB instruction profitably. Intel's
+  // manuals suggest only using PSHUFB if doing so replaces 5 instructions, but
+ // in practice PSHUFB tends to be *very* fast so we're more aggressive.
+ if ((Depth >= 3 || HasPSHUFB) && Subtarget->hasSSSE3()) {
+ SmallVector<SDValue, 16> PSHUFBMask;
+ int NumBytes = VT.getSizeInBits() / 8;
+ int Ratio = NumBytes / Mask.size();
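+    // Illustrative: for a v4i32 mask {2, 0, 1, 3}, NumBytes is 16 and Ratio
+    // is 4, so mask element 2 expands to byte indices 8..11. Any control
+    // byte with its high bit set makes PSHUFB zero that output byte, hence
+    // the 255 used for SM_SentinelZero lanes below.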
+ for (int i = 0; i < NumBytes; ++i) {
+ if (Mask[i / Ratio] == SM_SentinelUndef) {
+ PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
+ continue;
+ }
+ int M = Mask[i / Ratio] != SM_SentinelZero
+ ? Ratio * Mask[i / Ratio] + i % Ratio
+ : 255;
+ PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
+ }
+ MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
+ Op = DAG.getBitcast(ByteVT, Input);
+ DCI.AddToWorklist(Op.getNode());
+ SDValue PSHUFBMaskOp =
+ DAG.getNode(ISD::BUILD_VECTOR, DL, ByteVT, PSHUFBMask);
+ DCI.AddToWorklist(PSHUFBMaskOp.getNode());
+ Op = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Op, PSHUFBMaskOp);
+ DCI.AddToWorklist(Op.getNode());
+ DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Op),
+ /*AddTo*/ true);
+ return true;
+ }
+
+ // Failed to find any combines.
+ return false;
+}
+
+/// \brief Fully generic combining of x86 shuffle instructions.
+///
+/// This should be the last combine run over the x86 shuffle instructions. Once
+/// they have been fully optimized, this will recursively consider all chains
+/// of single-use shuffle instructions, build a generic model of the cumulative
+/// shuffle operation, and check for simpler instructions which implement this
+/// operation. We use this primarily for two purposes:
+///
+/// 1) Collapse generic shuffles to specialized single instructions when
+/// equivalent. In most cases, this is just an encoding size win, but
+/// sometimes we will collapse multiple generic shuffles into a single
+/// special-purpose shuffle.
+/// 2) Look for sequences of shuffle instructions with 3 or more total
+/// instructions, and replace them with the slightly more expensive SSSE3
+/// PSHUFB instruction if available. We do this as the last combining step
+/// to ensure we avoid using PSHUFB if we can implement the shuffle with
+///    a suitable short sequence of other instructions. The PSHUFB will either
+/// use a register or have to read from memory and so is slightly (but only
+/// slightly) more expensive than the other shuffle instructions.
+///
+/// Because this is inherently a quadratic operation (for each shuffle in
+/// a chain, we recurse up the chain), the depth is limited to 8 instructions.
+/// This should never be an issue in practice as the shuffle lowering doesn't
+/// produce sequences of more than 8 instructions.
+///
+/// FIXME: We will currently miss some cases where the redundant shuffling
+/// would simplify under the threshold for PSHUFB formation because of
+/// combine-ordering. To fix this, we should do the redundant instruction
+/// combining in this recursive walk.
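+///
+/// As an illustrative example of the whole mechanism: two back-to-back
+/// PSHUFDs with the mask {1,0,3,2} compose to the identity mask {0,1,2,3};
+/// the accumulated mask then widens down to a single element, and
+/// combineX86ShuffleChain folds the entire chain away to the original input.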
+static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,
+ ArrayRef<int> RootMask,
+ int Depth, bool HasPSHUFB,
+ SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget *Subtarget) {
+ // Bound the depth of our recursive combine because this is ultimately
+ // quadratic in nature.
+ if (Depth > 8)
+ return false;
+
+ // Directly rip through bitcasts to find the underlying operand.
+ while (Op.getOpcode() == ISD::BITCAST && Op.getOperand(0).hasOneUse())
+ Op = Op.getOperand(0);
+
+ MVT VT = Op.getSimpleValueType();
+ if (!VT.isVector())
+ return false; // Bail if we hit a non-vector.
+
+ assert(Root.getSimpleValueType().isVector() &&
+ "Shuffles operate on vector types!");
+ assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&
+ "Can only combine shuffles of the same vector register size.");
+
+ if (!isTargetShuffle(Op.getOpcode()))
+ return false;
+ SmallVector<int, 16> OpMask;
+ bool IsUnary;
+ bool HaveMask = getTargetShuffleMask(Op.getNode(), VT, OpMask, IsUnary);
+  // We can only combine unary shuffles whose mask we can decode.
+ if (!HaveMask || !IsUnary)
+ return false;
+
+ assert(VT.getVectorNumElements() == OpMask.size() &&
+ "Different mask size from vector size!");
+ assert(((RootMask.size() > OpMask.size() &&
+ RootMask.size() % OpMask.size() == 0) ||
+ (OpMask.size() > RootMask.size() &&
+ OpMask.size() % RootMask.size() == 0) ||
+ OpMask.size() == RootMask.size()) &&
+ "The smaller number of elements must divide the larger.");
+ int RootRatio = std::max<int>(1, OpMask.size() / RootMask.size());
+ int OpRatio = std::max<int>(1, RootMask.size() / OpMask.size());
+ assert(((RootRatio == 1 && OpRatio == 1) ||
+ (RootRatio == 1) != (OpRatio == 1)) &&
+ "Must not have a ratio for both incoming and op masks!");
+
+ SmallVector<int, 16> Mask;
+ Mask.reserve(std::max(OpMask.size(), RootMask.size()));
+
+ // Merge this shuffle operation's mask into our accumulated mask. Note that
+ // this shuffle's mask will be the first applied to the input, followed by the
+ // root mask to get us all the way to the root value arrangement. The reason
+ // for this order is that we are recursing up the operation chain.
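+  //
+  // Illustrative worked example: with OpMask = {1, 0} (a v2i64 swap) and
+  // RootMask = {0, 0, 1, 1} (a v4i32 mask), OpRatio is 2 and the merged
+  // mask becomes {2, 2, 3, 3}: the root selects from Op's output, which in
+  // v4i32 terms is the arrangement {2, 3, 0, 1}.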
+ for (int i = 0, e = std::max(OpMask.size(), RootMask.size()); i < e; ++i) {
+ int RootIdx = i / RootRatio;
+ if (RootMask[RootIdx] < 0) {
+      // This is a zero or undef lane; we're done.
+ Mask.push_back(RootMask[RootIdx]);
+ continue;
+ }
+
+ int RootMaskedIdx = RootMask[RootIdx] * RootRatio + i % RootRatio;
+ int OpIdx = RootMaskedIdx / OpRatio;
+ if (OpMask[OpIdx] < 0) {
+      // The incoming lanes are zero or undef; it doesn't matter which ones we
+      // are using.
+ Mask.push_back(OpMask[OpIdx]);
+ continue;
+ }
+
+ // Ok, we have non-zero lanes, map them through.
+ Mask.push_back(OpMask[OpIdx] * OpRatio +
+ RootMaskedIdx % OpRatio);
+ }
+
+ // See if we can recurse into the operand to combine more things.
+ switch (Op.getOpcode()) {
+ case X86ISD::PSHUFB:
+ HasPSHUFB = true;
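+    // Fallthrough!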
+ case X86ISD::PSHUFD:
+ case X86ISD::PSHUFHW:
+ case X86ISD::PSHUFLW:
+ if (Op.getOperand(0).hasOneUse() &&
+ combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1,
+ HasPSHUFB, DAG, DCI, Subtarget))
+ return true;
+ break;
+
+ case X86ISD::UNPCKL:
+ case X86ISD::UNPCKH:
+ assert(Op.getOperand(0) == Op.getOperand(1) &&
+ "We only combine unary shuffles!");
+    // We can't check for single use; we have to check that this shuffle is the
+    // only user.
+ if (Op->isOnlyUserOf(Op.getOperand(0).getNode()) &&
+ combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1,
+ HasPSHUFB, DAG, DCI, Subtarget))
+ return true;
+ break;
+ }
+
+  // Minor canonicalization of the accumulated shuffle mask to make it easier
+  // to match below. All this does is detect masks with sequential pairs of
+  // elements and shrink them to the half-width mask. It does this in a loop
+  // so that the mask is reduced to the minimal-width mask which performs an
+  // equivalent shuffle.
+ SmallVector<int, 16> WidenedMask;
+ while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
+ Mask = std::move(WidenedMask);
+ WidenedMask.clear();
+ }
+
+ return combineX86ShuffleChain(Op, Root, Mask, Depth, HasPSHUFB, DAG, DCI,
+ Subtarget);
+}
+
+/// \brief Get the PSHUF-style mask from PSHUF node.
+///
+/// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
+/// PSHUF-style masks that can be reused with such instructions.
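+///
+/// Illustrative: for a v8i16 PSHUFHW whose full mask is
+/// <0, 1, 2, 3, 7, 6, 5, 4>, this returns the 4-element mask {3, 2, 1, 0}.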
+static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
+ MVT VT = N.getSimpleValueType();
+ SmallVector<int, 4> Mask;
+ bool IsUnary;
+ bool HaveMask = getTargetShuffleMask(N.getNode(), VT, Mask, IsUnary);
+ (void)HaveMask;
+ assert(HaveMask);
+
+ // If we have more than 128-bits, only the low 128-bits of shuffle mask
+ // matter. Check that the upper masks are repeats and remove them.
+ if (VT.getSizeInBits() > 128) {
+ int LaneElts = 128 / VT.getScalarSizeInBits();
+#ifndef NDEBUG
+ for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
+ for (int j = 0; j < LaneElts; ++j)
+ assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
+ "Mask doesn't repeat in high 128-bit lanes!");
+#endif
+ Mask.resize(LaneElts);
+ }
+
+ switch (N.getOpcode()) {
+ case X86ISD::PSHUFD:
+ return Mask;
+ case X86ISD::PSHUFLW:
+ Mask.resize(4);
+ return Mask;
+ case X86ISD::PSHUFHW:
+ Mask.erase(Mask.begin(), Mask.begin() + 4);
+ for (int &M : Mask)
+ M -= 4;
+ return Mask;
+ default:
+ llvm_unreachable("No valid shuffle instruction found!");
+ }
+}
+
+/// \brief Search for a combinable shuffle across a chain ending in pshufd.
+///
+/// We walk up the chain and look for a combinable shuffle, skipping over
+/// shuffles that we could hoist this shuffle's transformation past without
+/// altering anything.
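+///
+/// Illustrative: a PSHUFD with mask {2, 3, 0, 1} whose single-use operand is
+/// another PSHUFD with mask {1, 0, 3, 2} merges into a single PSHUFD with
+/// the composed mask VMask[M] = {3, 2, 1, 0}.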
+static SDValue
+combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
+ SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ assert(N.getOpcode() == X86ISD::PSHUFD &&
+ "Called with something other than an x86 128-bit half shuffle!");
+ SDLoc DL(N);
+
+ // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
+ // of the shuffles in the chain so that we can form a fresh chain to replace
+ // this one.
+ SmallVector<SDValue, 8> Chain;
+ SDValue V = N.getOperand(0);
+ for (; V.hasOneUse(); V = V.getOperand(0)) {
+ switch (V.getOpcode()) {
+ default:
+ return SDValue(); // Nothing combined!
+
+ case ISD::BITCAST:
+ // Skip bitcasts as we always know the type for the target specific
+ // instructions.
+ continue;
+
+ case X86ISD::PSHUFD:
+ // Found another dword shuffle.
+ break;
+
+ case X86ISD::PSHUFLW:
+ // Check that the low words (being shuffled) are the identity in the
+ // dword shuffle, and the high words are self-contained.
+ if (Mask[0] != 0 || Mask[1] != 1 ||
+ !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
+ return SDValue();
+
+ Chain.push_back(V);
+ continue;
+
+ case X86ISD::PSHUFHW:
+ // Check that the high words (being shuffled) are the identity in the
+ // dword shuffle, and the low words are self-contained.
+ if (Mask[2] != 2 || Mask[3] != 3 ||
+ !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
+ return SDValue();
+
+ Chain.push_back(V);
+ continue;
+
+ case X86ISD::UNPCKL:
+ case X86ISD::UNPCKH:
+ // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
+ // shuffle into a preceding word shuffle.
+ if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
+ V.getSimpleValueType().getVectorElementType() != MVT::i16)
+ return SDValue();
+
+ // Search for a half-shuffle which we can combine with.
+ unsigned CombineOp =
+ V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
+ if (V.getOperand(0) != V.getOperand(1) ||
+ !V->isOnlyUserOf(V.getOperand(0).getNode()))
+ return SDValue();
+ Chain.push_back(V);
+ V = V.getOperand(0);
+ do {
+ switch (V.getOpcode()) {
+ default:
+ return SDValue(); // Nothing to combine.
+
+ case X86ISD::PSHUFLW:
+ case X86ISD::PSHUFHW:
+ if (V.getOpcode() == CombineOp)
+ break;
+
+ Chain.push_back(V);
+
+ // Fallthrough!
+ case ISD::BITCAST:
+ V = V.getOperand(0);
+ continue;
+ }
+ break;
+ } while (V.hasOneUse());
+ break;
+ }
+ // Break out of the loop if we break out of the switch.
+ break;
+ }
+
+ if (!V.hasOneUse())
+ // We fell out of the loop without finding a viable combining instruction.
+ return SDValue();
+
+ // Merge this node's mask and our incoming mask.
+ SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
+ for (int &M : Mask)
+ M = VMask[M];
+ V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
+ getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
+
+ // Rebuild the chain around this new shuffle.
+ while (!Chain.empty()) {
+ SDValue W = Chain.pop_back_val();
+
+ if (V.getValueType() != W.getOperand(0).getValueType())
+ V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
+
+ switch (W.getOpcode()) {
+ default:
+ llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
+
+ case X86ISD::UNPCKL:
+ case X86ISD::UNPCKH:
+ V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
+ break;
+
+ case X86ISD::PSHUFD:
+ case X86ISD::PSHUFLW:
+ case X86ISD::PSHUFHW:
+ V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
+ break;
+ }
+ }
+ if (V.getValueType() != N.getValueType())
+ V = DAG.getBitcast(N.getValueType(), V);
+
+ // Return the new chain to replace N.
+ return V;
+}
+
+/// \brief Search for a combinable shuffle across a chain ending in pshuflw or
+/// pshufhw.
+///
+/// We walk up the chain, skipping shuffles of the other half and looking
+/// through shuffles which switch halves trying to find a shuffle of the same
+/// pair of dwords.
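+///
+/// Illustrative: a PSHUFLW can look through any number of PSHUFHW nodes
+/// (which only touch the other half). If the PSHUFLW we find has mask
+/// {2, 3, 0, 1} and ours is {1, 0, 3, 2}, the two merge into a single
+/// PSHUFLW with the composed mask {3, 2, 1, 0}.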
+static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask,
+ SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ assert(
+ (N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) &&
+ "Called with something other than an x86 128-bit half shuffle!");
+ SDLoc DL(N);
+ unsigned CombineOpcode = N.getOpcode();
+
+ // Walk up a single-use chain looking for a combinable shuffle.
+ SDValue V = N.getOperand(0);
+ for (; V.hasOneUse(); V = V.getOperand(0)) {
+ switch (V.getOpcode()) {
+ default:
+ return false; // Nothing combined!
+
+ case ISD::BITCAST:
+ // Skip bitcasts as we always know the type for the target specific
+ // instructions.
+ continue;
+
+ case X86ISD::PSHUFLW:
+ case X86ISD::PSHUFHW:
+ if (V.getOpcode() == CombineOpcode)
+ break;
+
+ // Other-half shuffles are no-ops.
+ continue;
+ }
+ // Break out of the loop if we break out of the switch.
+ break;
+ }
+
+ if (!V.hasOneUse())
+ // We fell out of the loop without finding a viable combining instruction.
+ return false;
+
+ // Combine away the bottom node as its shuffle will be accumulated into
+ // a preceding shuffle.
+ DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
+
+ // Record the old value.
+ SDValue Old = V;
+
+ // Merge this node's mask and our incoming mask (adjusted to account for all
+ // the pshufd instructions encountered).
+ SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
+ for (int &M : Mask)
+ M = VMask[M];
+ V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0),
+ getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
+
+ // Check that the shuffles didn't cancel each other out. If not, we need to
+ // combine to the new one.
+ if (Old != V)
+ // Replace the combinable shuffle with the combined one, updating all users
+ // so that we re-evaluate the chain here.
+ DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true);
+
+ return true;
+}
+
+/// \brief Try to combine x86 target specific shuffles.
+static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget *Subtarget) {
+ SDLoc DL(N);
+ MVT VT = N.getSimpleValueType();
+ SmallVector<int, 4> Mask;
+
+ switch (N.getOpcode()) {
+ case X86ISD::PSHUFD:
+ case X86ISD::PSHUFLW:
+ case X86ISD::PSHUFHW:
+ Mask = getPSHUFShuffleMask(N);
+ assert(Mask.size() == 4);
+ break;
+ case X86ISD::UNPCKL: {
+    // Combine X86ISD::UNPCKL and ISD::VECTOR_SHUFFLE into X86ISD::UNPCKH, in
+    // which X86ISD::UNPCKL has an ISD::UNDEF operand, and ISD::VECTOR_SHUFFLE
+    // moves the upper-half elements into the lower half. For example:
+ //
+ // t2: v16i8 = vector_shuffle<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u> t1,
+ // undef:v16i8
+ // t3: v16i8 = X86ISD::UNPCKL undef:v16i8, t2
+ //
+ // will be combined to:
+ //
+ // t3: v16i8 = X86ISD::UNPCKH undef:v16i8, t1
+
+    // This is only done for 128-bit vectors. From SSE4.1 onward this combine
+    // may not fire because more advanced instructions are available.
+ if (!VT.is128BitVector())
+ return SDValue();
+
+ auto Op0 = N.getOperand(0);
+ auto Op1 = N.getOperand(1);
+ if (Op0.getOpcode() == ISD::UNDEF &&
+ Op1.getNode()->getOpcode() == ISD::VECTOR_SHUFFLE) {
+ ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op1.getNode())->getMask();
+
+ unsigned NumElts = VT.getVectorNumElements();
+ SmallVector<int, 8> ExpectedMask(NumElts, -1);
+ std::iota(ExpectedMask.begin(), ExpectedMask.begin() + NumElts / 2,
+ NumElts / 2);
+
+ auto ShufOp = Op1.getOperand(0);
+ if (isShuffleEquivalent(Op1, ShufOp, Mask, ExpectedMask))
+ return DAG.getNode(X86ISD::UNPCKH, DL, VT, N.getOperand(0), ShufOp);
+ }
+ return SDValue();
+ }
+ case X86ISD::BLENDI: {
+ SDValue V0 = N->getOperand(0);
+ SDValue V1 = N->getOperand(1);
+ assert(VT == V0.getSimpleValueType() && VT == V1.getSimpleValueType() &&
+ "Unexpected input vector types");
+
+ // Canonicalize a v2f64 blend with a mask of 2 by swapping the vector
+ // operands and changing the mask to 1. This saves us a bunch of
+ // pattern-matching possibilities related to scalar math ops in SSE/AVX.
+ // x86InstrInfo knows how to commute this back after instruction selection
+ // if it would help register allocation.
+
+ // TODO: If optimizing for size or a processor that doesn't suffer from
+ // partial register update stalls, this should be transformed into a MOVSD
+ // instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD.
+
+ if (VT == MVT::v2f64)
+ if (auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(2)))
+ if (Mask->getZExtValue() == 2 && !isShuffleFoldableLoad(V0)) {
+ SDValue NewMask = DAG.getConstant(1, DL, MVT::i8);
+ return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask);
+ }
+
+ return SDValue();
+ }
+ default:
+ return SDValue();
+ }
+
+ // Nuke no-op shuffles that show up after combining.
+ if (isNoopShuffleMask(Mask))
+ return DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
+
+ // Look for simplifications involving one or two shuffle instructions.
+ SDValue V = N.getOperand(0);
+ switch (N.getOpcode()) {
+ default:
+ break;
+ case X86ISD::PSHUFLW:
+ case X86ISD::PSHUFHW:
+ assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
+
+ if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))
+ return SDValue(); // We combined away this shuffle, so we're done.
+
+ // See if this reduces to a PSHUFD which is no more expensive and can
+ // combine with more operations. Note that it has to at least flip the
+ // dwords as otherwise it would have been removed as a no-op.
+ if (makeArrayRef(Mask).equals({2, 3, 0, 1})) {
+ int DMask[] = {0, 1, 2, 3};
+ int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
+ DMask[DOffset + 0] = DOffset + 1;
+ DMask[DOffset + 1] = DOffset + 0;
+ MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
+ V = DAG.getBitcast(DVT, V);
+ DCI.AddToWorklist(V.getNode());
+ V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
+ getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
+ DCI.AddToWorklist(V.getNode());
+ return DAG.getBitcast(VT, V);
+ }
+
+ // Look for shuffle patterns which can be implemented as a single unpack.
+ // FIXME: This doesn't handle the location of the PSHUFD generically, and
+ // only works when we have a PSHUFD followed by two half-shuffles.
+ if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
+ (V.getOpcode() == X86ISD::PSHUFLW ||
+ V.getOpcode() == X86ISD::PSHUFHW) &&
+ V.getOpcode() != N.getOpcode() &&
+ V.hasOneUse()) {
+ SDValue D = V.getOperand(0);
+ while (D.getOpcode() == ISD::BITCAST && D.hasOneUse())
+ D = D.getOperand(0);
+ if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
+ SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
+ SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
+ int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
+ int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
+ int WordMask[8];
+ for (int i = 0; i < 4; ++i) {
+ WordMask[i + NOffset] = Mask[i] + NOffset;
+ WordMask[i + VOffset] = VMask[i] + VOffset;
+ }
+ // Map the word mask through the DWord mask.
+ int MappedMask[8];
+ for (int i = 0; i < 8; ++i)
+ MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
+ if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
+ makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
+ // We can replace all three shuffles with an unpack.
+ V = DAG.getBitcast(VT, D.getOperand(0));
+ DCI.AddToWorklist(V.getNode());
+ return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
+ : X86ISD::UNPCKH,
+ DL, VT, V, V);
+ }
+ }
+ }
+
+ break;
+
+ case X86ISD::PSHUFD:
+ if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG, DCI))
+ return NewN;
+
+ break;
+ }
+
+ return SDValue();
+}
+
+/// \brief Try to combine a shuffle into a target-specific add-sub node.
+///
+/// We combine this directly on the abstract vector shuffle nodes so it is
+/// easier to generically match. We also insert dummy vector shuffle nodes for
+/// the operands which explicitly discard the lanes which are unused by this
+/// operation to try to flow through the rest of the combiner the fact that
+/// they're unused.
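+///
+/// A sketch of the v4f32 case: with S = fsub(A, B) and D = fadd(A, B), the
+/// shuffle <0, 5, 2, 7> of (S, D) selects S[0], D[1], S[2], D[3], which is
+/// exactly addsub(A, B): subtract in the even lanes, add in the odd lanes.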
+static SDValue combineShuffleToAddSub(SDNode *N, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ SDLoc DL(N);
+ EVT VT = N->getValueType(0);
+ if ((!Subtarget->hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
+ (!Subtarget->hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)))
+ return SDValue();
+
+ // We only handle target-independent shuffles.
+ // FIXME: It would be easy and harmless to use the target shuffle mask
+ // extraction tool to support more.
+ if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
+ return SDValue();
+
+ auto *SVN = cast<ShuffleVectorSDNode>(N);
+ SmallVector<int, 8> Mask;
+ for (int M : SVN->getMask())
+ Mask.push_back(M);
+
+ SDValue V1 = N->getOperand(0);
+ SDValue V2 = N->getOperand(1);
+
+ // We require the first shuffle operand to be the FSUB node, and the second to
+ // be the FADD node.
+ if (V1.getOpcode() == ISD::FADD && V2.getOpcode() == ISD::FSUB) {
+ ShuffleVectorSDNode::commuteMask(Mask);
+ std::swap(V1, V2);
+ } else if (V1.getOpcode() != ISD::FSUB || V2.getOpcode() != ISD::FADD)
+ return SDValue();
+
+ // If there are other uses of these operations we can't fold them.
+ if (!V1->hasOneUse() || !V2->hasOneUse())
+ return SDValue();
+
+ // Ensure that both operations have the same operands. Note that we can
+ // commute the FADD operands.
+ SDValue LHS = V1->getOperand(0), RHS = V1->getOperand(1);
+ if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
+ (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
+ return SDValue();
+
+ // We're looking for blends between FADD and FSUB nodes. We insist on these
+ // nodes being lined up in a specific expected pattern.
+ if (!(isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
+ isShuffleEquivalent(V1, V2, Mask, {0, 5, 2, 7}) ||
+ isShuffleEquivalent(V1, V2, Mask, {0, 9, 2, 11, 4, 13, 6, 15})))
+ return SDValue();
+
+ return DAG.getNode(X86ISD::ADDSUB, DL, VT, LHS, RHS);
+}
+
+/// PerformShuffleCombine - Performs several different shuffle combines.
+static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget *Subtarget) {
+ SDLoc dl(N);
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N->getValueType(0);
+
+ // Don't create instructions with illegal types after legalize types has run.
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType()))
+ return SDValue();
+
+ // If we have legalized the vector types, look for blends of FADD and FSUB
+ // nodes that we can fuse into an ADDSUB node.
+ if (TLI.isTypeLegal(VT))
+ if (SDValue AddSub = combineShuffleToAddSub(N, Subtarget, DAG))
+ return AddSub;
+
+ // Combine 256-bit vector shuffles. This is only profitable when in AVX mode
+ if (TLI.isTypeLegal(VT) && Subtarget->hasFp256() && VT.is256BitVector() &&
+ N->getOpcode() == ISD::VECTOR_SHUFFLE)
+ return PerformShuffleCombine256(N, DAG, DCI, Subtarget);
+
+ // During Type Legalization, when promoting illegal vector types,
+ // the backend might introduce new shuffle dag nodes and bitcasts.
+ //
+ // This code performs the following transformation:
+ // fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) ->
+ // (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>)
+ //
+ // We do this only if both the bitcast and the BINOP dag nodes have
+ // one use. Also, perform this transformation only if the new binary
+ // operation is legal. This is to avoid introducing dag nodes that
+ // potentially need to be further expanded (or custom lowered) into a
+ // less optimal sequence of dag nodes.
+ if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() &&
+ N1.getOpcode() == ISD::UNDEF && N0.hasOneUse() &&
+ N0.getOpcode() == ISD::BITCAST) {
+ SDValue BC0 = N0.getOperand(0);
+ EVT SVT = BC0.getValueType();
+ unsigned Opcode = BC0.getOpcode();
+ unsigned NumElts = VT.getVectorNumElements();
+
+ if (BC0.hasOneUse() && SVT.isVector() &&
+ SVT.getVectorNumElements() * 2 == NumElts &&
+ TLI.isOperationLegal(Opcode, VT)) {
+ bool CanFold = false;
+ switch (Opcode) {
+ default : break;
+ case ISD::ADD :
+ case ISD::FADD :
+ case ISD::SUB :
+ case ISD::FSUB :
+ case ISD::MUL :
+ case ISD::FMUL :
+ CanFold = true;
+ }
+
+ unsigned SVTNumElts = SVT.getVectorNumElements();
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+ for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i)
+ CanFold = SVOp->getMaskElt(i) == (int)(i * 2);
+ for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i)
+ CanFold = SVOp->getMaskElt(i) < 0;
+
+ if (CanFold) {
+ SDValue BC00 = DAG.getBitcast(VT, BC0.getOperand(0));
+ SDValue BC01 = DAG.getBitcast(VT, BC0.getOperand(1));
+ SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01);
+ return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, &SVOp->getMask()[0]);
+ }
+ }
+ }
+
+ // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
+ // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
+ // consecutive, non-overlapping, and in the right order.
+ SmallVector<SDValue, 16> Elts;
+ for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
+ Elts.push_back(getShuffleScalarElt(N, i, DAG, 0));
+
+ if (SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true))
+ return LD;
+
+ if (isTargetShuffle(N->getOpcode())) {
+ SDValue Shuffle =
+ PerformTargetShuffleCombine(SDValue(N, 0), DAG, DCI, Subtarget);
+ if (Shuffle.getNode())
+ return Shuffle;
+
+ // Try recursively combining arbitrary sequences of x86 shuffle
+ // instructions into higher-order shuffles. We do this after combining
+ // specific PSHUF instruction sequences into their minimal form so that we
+ // can evaluate how many specialized shuffle instructions are involved in
+ // a particular chain.
+ SmallVector<int, 1> NonceMask; // Just a placeholder.
+ NonceMask.push_back(0);
+ if (combineX86ShufflesRecursively(SDValue(N, 0), SDValue(N, 0), NonceMask,
+ /*Depth*/ 1, /*HasPSHUFB*/ false, DAG,
+ DCI, Subtarget))
+ return SDValue(); // This routine will use CombineTo to replace N.
+ }
+
+ return SDValue();
+}
+
+/// XFormVExtractWithShuffleIntoLoad - Check if a vector extract from a target
+/// specific shuffle of a load can be folded into a single element load.
+/// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
+/// shuffles have been custom lowered so we need to handle those here.
+static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ if (DCI.isBeforeLegalizeOps())
+ return SDValue();
+
+ SDValue InVec = N->getOperand(0);
+ SDValue EltNo = N->getOperand(1);
+
+ if (!isa<ConstantSDNode>(EltNo))
+ return SDValue();
+
+ EVT OriginalVT = InVec.getValueType();
+
+ if (InVec.getOpcode() == ISD::BITCAST) {
+ // Don't duplicate a load with other uses.
+ if (!InVec.hasOneUse())
+ return SDValue();
+ EVT BCVT = InVec.getOperand(0).getValueType();
+ if (!BCVT.isVector() ||
+ BCVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
+ return SDValue();
+ InVec = InVec.getOperand(0);
+ }
+
+ EVT CurrentVT = InVec.getValueType();
+
+ if (!isTargetShuffle(InVec.getOpcode()))
+ return SDValue();
+
+ // Don't duplicate a load with other uses.
+ if (!InVec.hasOneUse())
+ return SDValue();
+
+ SmallVector<int, 16> ShuffleMask;
+ bool UnaryShuffle;
+ if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(),
+ ShuffleMask, UnaryShuffle))
+ return SDValue();
+
+ // Select the input vector, guarding against out of range extract vector.
+ unsigned NumElems = CurrentVT.getVectorNumElements();
+ int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
+ int Idx = (Elt > (int)NumElems) ? -1 : ShuffleMask[Elt];
+ SDValue LdNode = (Idx < (int)NumElems) ? InVec.getOperand(0)
+ : InVec.getOperand(1);
+
+ // If inputs to shuffle are the same for both ops, then allow 2 uses
+ unsigned AllowedUses = InVec.getNumOperands() > 1 &&
+ InVec.getOperand(0) == InVec.getOperand(1) ? 2 : 1;
+
+ if (LdNode.getOpcode() == ISD::BITCAST) {
+ // Don't duplicate a load with other uses.
+ if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
+ return SDValue();
+
+ AllowedUses = 1; // only allow 1 load use if we have a bitcast
+ LdNode = LdNode.getOperand(0);
+ }
+
+ if (!ISD::isNormalLoad(LdNode.getNode()))
+ return SDValue();
+
+ LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);
+
+  if (!LN0 || !LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
+ return SDValue();
+
+ EVT EltVT = N->getValueType(0);
+ // If there's a bitcast before the shuffle, check if the load type and
+ // alignment is valid.
+ unsigned Align = LN0->getAlignment();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment(
+ EltVT.getTypeForEVT(*DAG.getContext()));
+
+ if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT))
+ return SDValue();
+
+ // All checks match so transform back to vector_shuffle so that DAG combiner
+ // can finish the job
+ SDLoc dl(N);
+
+  // Create the shuffle node, taking into account the case that it's a unary
+  // shuffle.
+ SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT)
+ : InVec.getOperand(1);
+ Shuffle = DAG.getVectorShuffle(CurrentVT, dl,
+ InVec.getOperand(0), Shuffle,
+ &ShuffleMask[0]);
+ Shuffle = DAG.getBitcast(OriginalVT, Shuffle);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
+ EltNo);
+}
+
+static SDValue PerformBITCASTCombine(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+
+  // Detect bitcasts from i32 to the x86mmx low word. Since MMX types are
+ // special and don't usually play with other vector types, it's better to
+ // handle them early to be sure we emit efficient code by avoiding
+ // store-load conversions.
+ if (VT == MVT::x86mmx && N0.getOpcode() == ISD::BUILD_VECTOR &&
+ N0.getValueType() == MVT::v2i32 &&
+ isNullConstant(N0.getOperand(1))) {
+ SDValue N00 = N0->getOperand(0);
+ if (N00.getValueType() == MVT::i32)
+ return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(N00), VT, N00);
+ }
+
+ // Convert a bitcasted integer logic operation that has one bitcasted
+ // floating-point operand and one constant operand into a floating-point
+ // logic operation. This may create a load of the constant, but that is
+ // cheaper than materializing the constant in an integer register and
+ // transferring it to an SSE register or transferring the SSE operand to
+ // integer register and back.
+ unsigned FPOpcode;
+ switch (N0.getOpcode()) {
+ case ISD::AND: FPOpcode = X86ISD::FAND; break;
+ case ISD::OR: FPOpcode = X86ISD::FOR; break;
+ case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
+ default: return SDValue();
+ }
+ if (((Subtarget->hasSSE1() && VT == MVT::f32) ||
+ (Subtarget->hasSSE2() && VT == MVT::f64)) &&
+ isa<ConstantSDNode>(N0.getOperand(1)) &&
+ N0.getOperand(0).getOpcode() == ISD::BITCAST &&
+ N0.getOperand(0).getOperand(0).getValueType() == VT) {
+ SDValue N000 = N0.getOperand(0).getOperand(0);
+ SDValue FPConst = DAG.getBitcast(VT, N0.getOperand(1));
+ return DAG.getNode(FPOpcode, SDLoc(N0), VT, N000, FPConst);
+ }
+
+ return SDValue();
+}
+
+/// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index
+/// generation and convert it from being a bunch of shuffles and extracts
+/// into a somewhat faster sequence. For i686, the best sequence is apparently
+/// storing the value and loading scalars back, while for x64 we should
+/// use 64-bit extracts and shifts.
+static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
+ return NewOp;
+
+ SDValue InputVector = N->getOperand(0);
+ SDLoc dl(InputVector);
+ // Detect mmx to i32 conversion through a v2i32 elt extract.
+ if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
+ N->getValueType(0) == MVT::i32 &&
+ InputVector.getValueType() == MVT::v2i32) {
+
+ // The bitcast source is a direct mmx result.
+ SDValue MMXSrc = InputVector.getNode()->getOperand(0);
+ if (MMXSrc.getValueType() == MVT::x86mmx)
+ return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector),
+ N->getValueType(0),
+ InputVector.getNode()->getOperand(0));
+
+ // The mmx is indirect: (i64 extract_elt (v1i64 bitcast (x86mmx ...))).
+ if (MMXSrc.getOpcode() == ISD::EXTRACT_VECTOR_ELT && MMXSrc.hasOneUse() &&
+ MMXSrc.getValueType() == MVT::i64) {
+ SDValue MMXSrcOp = MMXSrc.getOperand(0);
+ if (MMXSrcOp.hasOneUse() && MMXSrcOp.getOpcode() == ISD::BITCAST &&
+ MMXSrcOp.getValueType() == MVT::v1i64 &&
+ MMXSrcOp.getOperand(0).getValueType() == MVT::x86mmx)
+ return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector),
+ N->getValueType(0), MMXSrcOp.getOperand(0));
+ }
+ }
+
+ EVT VT = N->getValueType(0);
+
+ if (VT == MVT::i1 && isa<ConstantSDNode>(N->getOperand(1)) &&
+ InputVector.getOpcode() == ISD::BITCAST &&
+ isa<ConstantSDNode>(InputVector.getOperand(0))) {
+ uint64_t ExtractedElt =
+ cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+ uint64_t InputValue =
+ cast<ConstantSDNode>(InputVector.getOperand(0))->getZExtValue();
+ uint64_t Res = (InputValue >> ExtractedElt) & 1;
+ return DAG.getConstant(Res, dl, MVT::i1);
+ }
+ // Only operate on vectors of 4 elements, where the alternative shuffling
+ // gets to be more expensive.
+ if (InputVector.getValueType() != MVT::v4i32)
+ return SDValue();
+
+ // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
+ // single use which is a sign-extend or zero-extend, and all elements are
+ // used.
+ SmallVector<SDNode *, 4> Uses;
+ unsigned ExtractedElements = 0;
+ for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
+ UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
+ if (UI.getUse().getResNo() != InputVector.getResNo())
+ return SDValue();
+
+ SDNode *Extract = *UI;
+ if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+ return SDValue();
+
+ if (Extract->getValueType(0) != MVT::i32)
+ return SDValue();
+ if (!Extract->hasOneUse())
+ return SDValue();
+ if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
+ Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
+ return SDValue();
+ if (!isa<ConstantSDNode>(Extract->getOperand(1)))
+ return SDValue();
+
+ // Record which element was extracted.
+ ExtractedElements |=
+ 1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue();
+
+ Uses.push_back(Extract);
+ }
+
+ // If not all the elements were used, this may not be worthwhile.
+ if (ExtractedElements != 15)
+ return SDValue();
+
+ // Ok, we've now decided to do the transformation.
+ // If 64-bit shifts are legal, use the extract-shift sequence,
+ // otherwise bounce the vector off the cache.
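+  // Illustrative: with legal 64-bit shifts, a v4i32 input is read as two i64
+  // halves Lo and Hi; lane 0 is trunc(Lo), lane 1 is trunc(Lo >> 32), lane 2
+  // is trunc(Hi), and lane 3 is trunc(Hi >> 32).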
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SDValue Vals[4];
+
+ if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) {
+ SDValue Cst = DAG.getBitcast(MVT::v2i64, InputVector);
+ auto &DL = DAG.getDataLayout();
+ EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy(DL);
+ SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
+ DAG.getConstant(0, dl, VecIdxTy));
+ SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
+ DAG.getConstant(1, dl, VecIdxTy));
+
+ SDValue ShAmt = DAG.getConstant(
+ 32, dl, DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64, DL));
+ Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf);
+ Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
+ DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt));
+ Vals[2] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, TopHalf);
+ Vals[3] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
+ DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt));
+ } else {
+ // Store the value to a temporary stack slot.
+ SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
+ SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
+ MachinePointerInfo(), false, false, 0);
+
+ EVT ElementType = InputVector.getValueType().getVectorElementType();
+ unsigned EltSize = ElementType.getSizeInBits() / 8;
+
+ // Replace each use (extract) with a load of the appropriate element.
+ for (unsigned i = 0; i < 4; ++i) {
+ uint64_t Offset = EltSize * i;
+ auto PtrVT = TLI.getPointerTy(DAG.getDataLayout());
+ SDValue OffsetVal = DAG.getConstant(Offset, dl, PtrVT);
+
+ SDValue ScalarAddr =
+ DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, OffsetVal);
+
+ // Load the scalar.
+ Vals[i] = DAG.getLoad(ElementType, dl, Ch,
+ ScalarAddr, MachinePointerInfo(),
+ false, false, false, 0);
+
+ }
+ }
+
+ // Replace the extracts
+ for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
+ UE = Uses.end(); UI != UE; ++UI) {
+ SDNode *Extract = *UI;
+
+ SDValue Idx = Extract->getOperand(1);
+ uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]);
+ }
+
+ // The replacement was made in place; don't return anything.
+ return SDValue();
+}
+
+static SDValue
+transformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ SDLoc dl(N);
+ SDValue Cond = N->getOperand(0);
+ SDValue LHS = N->getOperand(1);
+ SDValue RHS = N->getOperand(2);
+
+ if (Cond.getOpcode() == ISD::SIGN_EXTEND) {
+ SDValue CondSrc = Cond->getOperand(0);
+ if (CondSrc->getOpcode() == ISD::SIGN_EXTEND_INREG)
+ Cond = CondSrc->getOperand(0);
+ }
+
+ if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
+ return SDValue();
+
+ // A vselect where all conditions and data are constants can be optimized into
+ // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
+ if (ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
+ ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
+ return SDValue();
+
+ unsigned MaskValue = 0;
+ if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue))
+ return SDValue();
+
+ MVT VT = N->getSimpleValueType(0);
+ unsigned NumElems = VT.getVectorNumElements();
+ SmallVector<int, 8> ShuffleMask(NumElems, -1);
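+  // Illustrative: for v4i32 with MaskValue 0b0101, bits 0 and 2 select RHS
+  // elements, giving the shuffle mask <4, 1, 6, 3> (modulo undef lanes).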
+ for (unsigned i = 0; i < NumElems; ++i) {
+ // Be sure we emit undef where we can.
+ if (Cond.getOperand(i)->getOpcode() == ISD::UNDEF)
+ ShuffleMask[i] = -1;
+ else
+ ShuffleMask[i] = i + NumElems * ((MaskValue >> i) & 1);
+ }
+
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (!TLI.isShuffleMaskLegal(ShuffleMask, VT))
+ return SDValue();
+ return DAG.getVectorShuffle(VT, dl, LHS, RHS, &ShuffleMask[0]);
+}
+
+/// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT
+/// nodes.
+static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget *Subtarget) {
+ SDLoc DL(N);
+ SDValue Cond = N->getOperand(0);
+ // Get the LHS/RHS of the select.
+ SDValue LHS = N->getOperand(1);
+ SDValue RHS = N->getOperand(2);
+ EVT VT = LHS.getValueType();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+ // If we have SSE[12] support, try to form min/max nodes. SSE min/max
+ // instructions match the semantics of the common C idiom x<y?x:y but not
+ // x<=y?x:y, because of how they handle negative zero (which can be
+ // ignored in unsafe-math mode).
+ // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
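+  // Note that SSE MINPS computes "a < b ? a : b" (and MAXPS "a > b ? a : b")
+  // and returns the second operand when the inputs are unordered (NaN) or
+  // compare equal (including -0.0 and +0.0), which is why the checks below
+  // are so careful about NaNs and signed zeros.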
+ if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
+ VT != MVT::f80 && VT != MVT::f128 &&
+ (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
+ (Subtarget->hasSSE2() ||
+ (Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) {
+ ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+
+ unsigned Opcode = 0;
+ // Check for x CC y ? x : y.
+ if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
+ DAG.isEqualTo(RHS, Cond.getOperand(1))) {
+ switch (CC) {
+ default: break;
+ case ISD::SETULT:
+ // Converting this to a min would handle NaNs incorrectly, and swapping
+ // the operands would cause it to handle comparisons between positive
+ // and negative zero incorrectly.
+ if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
+ if (!DAG.getTarget().Options.UnsafeFPMath &&
+ !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
+ break;
+ std::swap(LHS, RHS);
+ }
+ Opcode = X86ISD::FMIN;
+ break;
+ case ISD::SETOLE:
+ // Converting this to a min would handle comparisons between positive
+ // and negative zero incorrectly.
+ if (!DAG.getTarget().Options.UnsafeFPMath &&
+ !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
+ break;
+ Opcode = X86ISD::FMIN;
+ break;
+ case ISD::SETULE:
+ // Converting this to a min would handle both negative zeros and NaNs
+ // incorrectly, but we can swap the operands to fix both.
+ std::swap(LHS, RHS);
+ case ISD::SETOLT:
+ case ISD::SETLT:
+ case ISD::SETLE:
+ Opcode = X86ISD::FMIN;
+ break;
+
+ case ISD::SETOGE:
+ // Converting this to a max would handle comparisons between positive
+ // and negative zero incorrectly.
+ if (!DAG.getTarget().Options.UnsafeFPMath &&
+ !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
+ break;
+ Opcode = X86ISD::FMAX;
+ break;
+ case ISD::SETUGT:
+ // Converting this to a max would handle NaNs incorrectly, and swapping
+ // the operands would cause it to handle comparisons between positive
+ // and negative zero incorrectly.
+ if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
+ if (!DAG.getTarget().Options.UnsafeFPMath &&
+ !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
+ break;
+ std::swap(LHS, RHS);
+ }
+ Opcode = X86ISD::FMAX;
+ break;
+ case ISD::SETUGE:
+ // Converting this to a max would handle both negative zeros and NaNs
+ // incorrectly, but we can swap the operands to fix both.
+ std::swap(LHS, RHS);
+ case ISD::SETOGT:
+ case ISD::SETGT:
+ case ISD::SETGE:
+ Opcode = X86ISD::FMAX;
+ break;
+ }
+ // Check for x CC y ? y : x -- a min/max with reversed arms.
+ } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
+ DAG.isEqualTo(RHS, Cond.getOperand(0))) {
+ switch (CC) {
+ default: break;
+ case ISD::SETOGE:
+ // Converting this to a min would handle comparisons between positive
+ // and negative zero incorrectly, and swapping the operands would
+ // cause it to handle NaNs incorrectly.
+ if (!DAG.getTarget().Options.UnsafeFPMath &&
+ !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
+ if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
+ break;
+ std::swap(LHS, RHS);
+ }
+ Opcode = X86ISD::FMIN;
+ break;
+ case ISD::SETUGT:
+ // Converting this to a min would handle NaNs incorrectly.
+ if (!DAG.getTarget().Options.UnsafeFPMath &&
+ (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
+ break;
+ Opcode = X86ISD::FMIN;
+ break;
+ case ISD::SETUGE:
+ // Converting this to a min would handle both negative zeros and NaNs
+ // incorrectly, but we can swap the operands to fix both.
+ std::swap(LHS, RHS);
+ case ISD::SETOGT:
+ case ISD::SETGT:
+ case ISD::SETGE:
+ Opcode = X86ISD::FMIN;
+ break;
+
+ case ISD::SETULT:
+ // Converting this to a max would handle NaNs incorrectly.
+ if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
+ break;
+ Opcode = X86ISD::FMAX;
+ break;
+ case ISD::SETOLE:
+ // Converting this to a max would handle comparisons between positive
+ // and negative zero incorrectly, and swapping the operands would
+ // cause it to handle NaNs incorrectly.
+ if (!DAG.getTarget().Options.UnsafeFPMath &&
+ !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
+ if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
+ break;
+ std::swap(LHS, RHS);
+ }
+ Opcode = X86ISD::FMAX;
+ break;
+ case ISD::SETULE:
+ // Converting this to a max would handle both negative zeros and NaNs
+ // incorrectly, but we can swap the operands to fix both.
+ std::swap(LHS, RHS);
+ case ISD::SETOLT:
+ case ISD::SETLT:
+ case ISD::SETLE:
+ Opcode = X86ISD::FMAX;
+ break;
+ }
+ }
+
+ if (Opcode)
+ return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
+ }
+
+ EVT CondVT = Cond.getValueType();
+ if (Subtarget->hasAVX512() && VT.isVector() && CondVT.isVector() &&
+ CondVT.getVectorElementType() == MVT::i1) {
+    // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
+    // lowering on KNL. In this case we convert it to
+    // v16i8 (select v16i8, v16i8, v16i8) and use an AVX instruction.
+    // The same applies to all 128- and 256-bit vectors of i8 and i16.
+    // From SKX onward these selects have a proper lowering.
+ EVT OpVT = LHS.getValueType();
+ if ((OpVT.is128BitVector() || OpVT.is256BitVector()) &&
+ (OpVT.getVectorElementType() == MVT::i8 ||
+ OpVT.getVectorElementType() == MVT::i16) &&
+ !(Subtarget->hasBWI() && Subtarget->hasVLX())) {
+ Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, OpVT, Cond);
+ DCI.AddToWorklist(Cond.getNode());
+ return DAG.getNode(N->getOpcode(), DL, OpVT, Cond, LHS, RHS);
+ }
+ }
+ // If this is a select between two integer constants, try to do some
+ // optimizations.
+ if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) {
+ if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS))
+ // Don't do this for crazy integer types.
+ if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) {
+ // If this is efficiently invertible, canonicalize the LHSC/RHSC values
+ // so that TrueC (the true value) is larger than FalseC.
+ bool NeedsCondInvert = false;
+
+ if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
+ // Efficiently invertible.
+ (Cond.getOpcode() == ISD::SETCC || // setcc -> invertible.
+ (Cond.getOpcode() == ISD::XOR && // xor(X, C) -> invertible.
+ isa<ConstantSDNode>(Cond.getOperand(1))))) {
+ NeedsCondInvert = true;
+ std::swap(TrueC, FalseC);
+ }
+
+ // Optimize C ? 8 : 0 -> zext(C) << 3. Likewise for any pow2/0.
+ if (FalseC->getAPIntValue() == 0 &&
+ TrueC->getAPIntValue().isPowerOf2()) {
+ if (NeedsCondInvert) // Invert the condition if needed.
+ Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
+ DAG.getConstant(1, DL, Cond.getValueType()));
+
+ // Zero extend the condition if needed.
+ Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);
+
+ unsigned ShAmt = TrueC->getAPIntValue().logBase2();
+ return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
+ DAG.getConstant(ShAmt, DL, MVT::i8));
+ }
+
+        // Optimize Cond ? cst+1 : cst -> zext(setcc(C)) + cst.
+ if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
+ if (NeedsCondInvert) // Invert the condition if needed.
+ Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
+ DAG.getConstant(1, DL, Cond.getValueType()));
+
+ // Zero extend the condition if needed.
+ Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
+ FalseC->getValueType(0), Cond);
+ return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
+ SDValue(FalseC, 0));
+ }
+
+ // Optimize cases that will turn into an LEA instruction. This requires
+ // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
+ if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
+ uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
+ if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
+
+ bool isFastMultiplier = false;
+ if (Diff < 10) {
+ switch ((unsigned char)Diff) {
+ default: break;
+ case 1: // result = add base, cond
+ case 2: // result = lea base( , cond*2)
+ case 3: // result = lea base(cond, cond*2)
+ case 4: // result = lea base( , cond*4)
+ case 5: // result = lea base(cond, cond*4)
+ case 8: // result = lea base( , cond*8)
+ case 9: // result = lea base(cond, cond*8)
+ isFastMultiplier = true;
+ break;
+ }
+ }
+
+ if (isFastMultiplier) {
+ APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
+ if (NeedsCondInvert) // Invert the condition if needed.
+ Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
+ DAG.getConstant(1, DL, Cond.getValueType()));
+
+ // Zero extend the condition if needed.
+ Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
+ Cond);
+ // Scale the condition by the difference.
+ if (Diff != 1)
+ Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
+ DAG.getConstant(Diff, DL,
+ Cond.getValueType()));
+
+ // Add the base if non-zero.
+ if (FalseC->getAPIntValue() != 0)
+ Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
+ SDValue(FalseC, 0));
+ return Cond;
+ }
+ }
+ }
+ }
+
+  // Canonicalize max and min:
+  // (x > y) ? x : y -> (x >= y) ? x : y
+  // (x < y) ? x : y -> (x <= y) ? x : y
+  // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
+  // the need for an extra compare against zero. e.g.
+  // (x - y) > 0 ? (x - y) : 0 -> (x - y) >= 0 ? (x - y) : 0
+ // subl %esi, %edi
+ // testl %edi, %edi
+ // movl $0, %eax
+ // cmovgl %edi, %eax
+ // =>
+ // xorl %eax, %eax
+  //   subl   %esi, %edi
+ // cmovsl %eax, %edi
+ if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
+ DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
+ DAG.isEqualTo(RHS, Cond.getOperand(1))) {
+ ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+ switch (CC) {
+ default: break;
+ case ISD::SETLT:
+ case ISD::SETGT: {
+ ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
+ Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
+ Cond.getOperand(0), Cond.getOperand(1), NewCC);
+ return DAG.getNode(ISD::SELECT, DL, VT, Cond, LHS, RHS);
+ }
+ }
+ }
+
+ // Early exit check
+ if (!TLI.isTypeLegal(VT))
+ return SDValue();
+
+ // Match VSELECTs into subs with unsigned saturation.
+ if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
+ // psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
+ ((Subtarget->hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
+ (Subtarget->hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
+ ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+
+    // Check if one of the arms of the VSELECT is a zero vector. If it's on the
+    // left side, invert the predicate to simplify the logic below.
+ SDValue Other;
+ if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
+ Other = RHS;
+ CC = ISD::getSetCCInverse(CC, true);
+ } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
+ Other = LHS;
+ }
+
+ if (Other.getNode() && Other->getNumOperands() == 2 &&
+ DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) {
+ SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
+ SDValue CondRHS = Cond->getOperand(1);
+
+ // Look for a general sub with unsigned saturation first.
+ // x >= y ? x-y : 0 --> subus x, y
+ // x > y ? x-y : 0 --> subus x, y
+ if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
+ Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
+ return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
+
+ if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS))
+ if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) {
+ if (auto *CondRHSBV = dyn_cast<BuildVectorSDNode>(CondRHS))
+ if (auto *CondRHSConst = CondRHSBV->getConstantSplatNode())
+              // If the RHS is a constant we have to reverse the constant
+              // canonicalization.
+              // x > C-1 ? x+(-C) : 0 --> subus x, C
+ if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
+ CondRHSConst->getAPIntValue() ==
+ (-OpRHSConst->getAPIntValue() - 1))
+ return DAG.getNode(
+ X86ISD::SUBUS, DL, VT, OpLHS,
+ DAG.getConstant(-OpRHSConst->getAPIntValue(), DL, VT));
+
+ // Another special case: If C was a sign bit, the sub has been
+ // canonicalized into a xor.
+ // FIXME: Would it be better to use computeKnownBits to determine
+ // whether it's safe to decanonicalize the xor?
+ // x s< 0 ? x^C : 0 --> subus x, C
+ if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
+ ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
+ OpRHSConst->getAPIntValue().isSignBit())
+ // Note that we have to rebuild the RHS constant here to ensure we
+ // don't rely on particular values of undef lanes.
+ return DAG.getNode(
+ X86ISD::SUBUS, DL, VT, OpLHS,
+ DAG.getConstant(OpRHSConst->getAPIntValue(), DL, VT));
+ }
+ }
+ }
+
+ // Simplify vector selection if condition value type matches vselect
+ // operand type
+ if (N->getOpcode() == ISD::VSELECT && CondVT == VT) {
+ assert(Cond.getValueType().isVector() &&
+ "vector select expects a vector selector!");
+
+ bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
+ bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
+
+    // Try to invert the condition if the true value is not all 1s and the
+    // false value is not all 0s.
+ if (!TValIsAllOnes && !FValIsAllZeros &&
+ // Check if the selector will be produced by CMPP*/PCMP*
+ Cond.getOpcode() == ISD::SETCC &&
+ // Check if SETCC has already been promoted
+ TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
+ CondVT) {
+ bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
+ bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
+
+ if (TValIsAllZeros || FValIsAllOnes) {
+ SDValue CC = Cond.getOperand(2);
+ ISD::CondCode NewCC =
+ ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
+ Cond.getOperand(0).getValueType().isInteger());
+ Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1), NewCC);
+ std::swap(LHS, RHS);
+ TValIsAllOnes = FValIsAllOnes;
+ FValIsAllZeros = TValIsAllZeros;
+ }
+ }
+
+ if (TValIsAllOnes || FValIsAllZeros) {
+ SDValue Ret;
+
+ if (TValIsAllOnes && FValIsAllZeros)
+ Ret = Cond;
+ else if (TValIsAllOnes)
+ Ret =
+ DAG.getNode(ISD::OR, DL, CondVT, Cond, DAG.getBitcast(CondVT, RHS));
+ else if (FValIsAllZeros)
+ Ret = DAG.getNode(ISD::AND, DL, CondVT, Cond,
+ DAG.getBitcast(CondVT, LHS));
+
+ return DAG.getBitcast(VT, Ret);
+ }
+ }
+
+ // We should generate an X86ISD::BLENDI from a vselect if its argument
+ // is a sign_extend_inreg of an any_extend of a BUILD_VECTOR of
+ // constants. This specific pattern gets generated when we split a
+ // selector for a 512 bit vector in a machine without AVX512 (but with
+ // 256-bit vectors), during legalization:
+ //
+ // (vselect (sign_extend (any_extend (BUILD_VECTOR)) i1) LHS RHS)
+ //
+ // Iff we find this pattern and the build_vectors are built from
+ // constants, we translate the vselect into a shuffle_vector that we
+ // know will be matched by LowerVECTOR_SHUFFLEtoBlend.
+ if ((N->getOpcode() == ISD::VSELECT ||
+ N->getOpcode() == X86ISD::SHRUNKBLEND) &&
+ !DCI.isBeforeLegalize() && !VT.is512BitVector()) {
+ SDValue Shuffle = transformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget);
+ if (Shuffle.getNode())
+ return Shuffle;
+ }
+
+ // If this is a *dynamic* select (non-constant condition) and we can match
+ // this node with one of the variable blend instructions, restructure the
+ // condition so that the blends can use the high bit of each element and use
+ // SimplifyDemandedBits to simplify the condition operand.
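+ // (The variable blend instructions select on the sign bit of each condition
+ // element; e.g. BLENDVPS consults only bit 31 of each dword, so the lower
+ // bits of the condition are "don't care" bits.)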
+ if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
+ !DCI.isBeforeLegalize() &&
+ !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
+ unsigned BitWidth = Cond.getValueType().getScalarSizeInBits();
+
+ // Don't optimize vector selects that map to mask-registers.
+ if (BitWidth == 1)
+ return SDValue();
+
+ // We can only handle the cases where VSELECT is directly legal on the
+ // subtarget. We custom lower VSELECT nodes with constant conditions and
+ // this makes it hard to see whether a dynamic VSELECT will correctly
+ // lower, so we both check the operation's status and explicitly handle the
+ // cases where a *dynamic* blend will fail even though a constant-condition
+ // blend could be custom lowered.
+ // FIXME: We should find a better way to handle this class of problems.
+ // Potentially, we should combine constant-condition vselect nodes
+ // pre-legalization into shuffles and not mark as many types as custom
+ // lowered.
+ if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
+ return SDValue();
+ // FIXME: We don't support i16-element blends currently. We could and
+ // should support them by making *all* the bits in the condition be set
+ // rather than just the high bit and using an i8-element blend.
+ if (VT.getVectorElementType() == MVT::i16)
+ return SDValue();
+ // Dynamic blending was only available from SSE4.1 onward.
+ if (VT.is128BitVector() && !Subtarget->hasSSE41())
+ return SDValue();
+ // Byte blends are only available in AVX2
+ if (VT == MVT::v32i8 && !Subtarget->hasAVX2())
+ return SDValue();
+
+ assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
+ APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);
+
+ APInt KnownZero, KnownOne;
+ TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
+ DCI.isBeforeLegalizeOps());
+ if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) ||
+ TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne,
+ TLO)) {
+ // If we changed the computation somewhere in the DAG, this change
+ // will affect all users of Cond.
+ // Make sure it is fine and update all the nodes so that we do not
+ // use the generic VSELECT anymore. Otherwise, we may perform
+ // wrong optimizations because we messed up the actual expectation
+ // for the vector boolean values.
+ if (Cond != TLO.Old) {
+ // Check all uses of the condition operand to see whether it will be
+ // consumed by non-BLEND instructions, which may depend on all bits
+ // being set properly.
+ for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
+ I != E; ++I)
+ if (I->getOpcode() != ISD::VSELECT)
+ // TODO: Add other opcodes eventually lowered into BLEND.
+ return SDValue();
+
+ // Update all the users of the condition, before committing the change,
+ // so that the VSELECT optimizations that expect the correct vector
+ // boolean value will not be triggered.
+ for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
+ I != E; ++I)
+ DAG.ReplaceAllUsesOfValueWith(
+ SDValue(*I, 0),
+ DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(*I), I->getValueType(0),
+ Cond, I->getOperand(1), I->getOperand(2)));
+ DCI.CommitTargetLoweringOpt(TLO);
+ return SDValue();
+ }
+ // At this point, only Cond is changed. Change the condition
+ // just for N so that all other users keep the opportunity to be
+ // optimized their own way.
+ DAG.ReplaceAllUsesOfValueWith(
+ SDValue(N, 0),
+ DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(N), N->getValueType(0),
+ TLO.New, N->getOperand(1), N->getOperand(2)));
+ return SDValue();
+ }
+ }
+
+ return SDValue();
+}
+
+// Check whether a boolean test is testing a boolean value generated by
+// X86ISD::SETCC. If so, return the operand of that SETCC and the proper
+// condition code.
+//
+// Simplify the following patterns:
+// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
+// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
+// to (Op EFLAGS Cond)
+//
+// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
+// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
+// to (Op EFLAGS !Cond)
+//
+// where Op could be BRCOND or CMOV.
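+//
+// For example (illustrative, instantiating Op = BRCOND and Cond = COND_E):
+//   (BRCOND (CMP (SETCC COND_E EFLAGS) 1) EQ) --> (BRCOND EFLAGS COND_E)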
+//
+static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
+ // Quit unless this is a CMP, or a SUB whose value result is unused.
+ if (Cmp.getOpcode() != X86ISD::CMP &&
+ (Cmp.getOpcode() != X86ISD::SUB || Cmp.getNode()->hasAnyUseOfValue(0)))
+ return SDValue();
+
+ // Quit if not used as a boolean value.
+ if (CC != X86::COND_E && CC != X86::COND_NE)
+ return SDValue();
+
+ // Check CMP operands. One of them should be 0 or 1 and the other should be
+ // a SetCC or a value extended from it.
+ SDValue Op1 = Cmp.getOperand(0);
+ SDValue Op2 = Cmp.getOperand(1);
+
+ SDValue SetCC;
+ const ConstantSDNode* C = nullptr;
+ bool needOppositeCond = (CC == X86::COND_E);
+ bool checkAgainstTrue = false; // Is it a comparison against 1?
+
+ if ((C = dyn_cast<ConstantSDNode>(Op1)))
+ SetCC = Op2;
+ else if ((C = dyn_cast<ConstantSDNode>(Op2)))
+ SetCC = Op1;
+ else // Quit if neither operand is a constant.
+ return SDValue();
+
+ if (C->getZExtValue() == 1) {
+ needOppositeCond = !needOppositeCond;
+ checkAgainstTrue = true;
+ } else if (C->getZExtValue() != 0)
+ // Quit if the constant is neither 0 nor 1.
+ return SDValue();
+
+ bool truncatedToBoolWithAnd = false;
+ // Skip (zext $x), (trunc $x), or (and $x, 1) nodes.
+ while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
+ SetCC.getOpcode() == ISD::TRUNCATE ||
+ SetCC.getOpcode() == ISD::AND) {
+ if (SetCC.getOpcode() == ISD::AND) {
+ int OpIdx = -1;
+ if (isOneConstant(SetCC.getOperand(0)))
+ OpIdx = 1;
+ if (isOneConstant(SetCC.getOperand(1)))
+ OpIdx = 0;
+ if (OpIdx == -1)
+ break;
+ SetCC = SetCC.getOperand(OpIdx);
+ truncatedToBoolWithAnd = true;
+ } else
+ SetCC = SetCC.getOperand(0);
+ }
+
+ switch (SetCC.getOpcode()) {
+ case X86ISD::SETCC_CARRY:
+ // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
+ // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
+ // i.e. it's a comparison against true but the result of SETCC_CARRY is not
+ // truncated to i1 using 'and'.
+ if (checkAgainstTrue && !truncatedToBoolWithAnd)
+ break;
+ assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
+ "Invalid use of SETCC_CARRY!");
+ // FALL THROUGH
+ case X86ISD::SETCC:
+ // Set the condition code or opposite one if necessary.
+ CC = X86::CondCode(SetCC.getConstantOperandVal(0));
+ if (needOppositeCond)
+ CC = X86::GetOppositeBranchCondition(CC);
+ return SetCC.getOperand(1);
+ case X86ISD::CMOV: {
+ // Check whether the false/true value is canonical, i.e. 0 or 1.
+ ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
+ ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
+ // Quit if true value is not a constant.
+ if (!TVal)
+ return SDValue();
+ // Quit if false value is not a constant.
+ if (!FVal) {
+ SDValue Op = SetCC.getOperand(0);
+ // Skip 'zext' or 'trunc' node.
+ if (Op.getOpcode() == ISD::ZERO_EXTEND ||
+ Op.getOpcode() == ISD::TRUNCATE)
+ Op = Op.getOperand(0);
+ // A special case for rdrand/rdseed, where 0 is set when the false
+ // condition is found.
+ if ((Op.getOpcode() != X86ISD::RDRAND &&
+ Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
+ return SDValue();
+ }
+ // Quit if false value is not the constant 0 or 1.
+ bool FValIsFalse = true;
+ if (FVal && FVal->getZExtValue() != 0) {
+ if (FVal->getZExtValue() != 1)
+ return SDValue();
+ // If FVal is 1, opposite cond is needed.
+ needOppositeCond = !needOppositeCond;
+ FValIsFalse = false;
+ }
+ // Quit if TVal is not the constant opposite of FVal.
+ if (FValIsFalse && TVal->getZExtValue() != 1)
+ return SDValue();
+ if (!FValIsFalse && TVal->getZExtValue() != 0)
+ return SDValue();
+ CC = X86::CondCode(SetCC.getConstantOperandVal(2));
+ if (needOppositeCond)
+ CC = X86::GetOppositeBranchCondition(CC);
+ return SetCC.getOperand(3);
+ }
+ }
+
+ return SDValue();
+}
+
+/// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
+/// Match:
+/// (X86or (X86setcc) (X86setcc))
+/// (X86cmp (and (X86setcc) (X86setcc)), 0)
+static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
+ X86::CondCode &CC1, SDValue &Flags,
+ bool &isAnd) {
+ if (Cond->getOpcode() == X86ISD::CMP) {
+ if (!isNullConstant(Cond->getOperand(1)))
+ return false;
+
+ Cond = Cond->getOperand(0);
+ }
+
+ isAnd = false;
+
+ SDValue SetCC0, SetCC1;
+ switch (Cond->getOpcode()) {
+ default: return false;
+ case ISD::AND:
+ case X86ISD::AND:
+ isAnd = true;
+ // fallthru
+ case ISD::OR:
+ case X86ISD::OR:
+ SetCC0 = Cond->getOperand(0);
+ SetCC1 = Cond->getOperand(1);
+ break;
+ }
+
+ // Make sure we have SETCC nodes, using the same flags value.
+ if (SetCC0.getOpcode() != X86ISD::SETCC ||
+ SetCC1.getOpcode() != X86ISD::SETCC ||
+ SetCC0->getOperand(1) != SetCC1->getOperand(1))
+ return false;
+
+ CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
+ CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
+ Flags = SetCC0->getOperand(1);
+ return true;
+}
+
+/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
+static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget *Subtarget) {
+ SDLoc DL(N);
+
+ // If the flag operand isn't dead, don't touch this CMOV.
+ if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
+ return SDValue();
+
+ SDValue FalseOp = N->getOperand(0);
+ SDValue TrueOp = N->getOperand(1);
+ X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
+ SDValue Cond = N->getOperand(3);
+
+ if (CC == X86::COND_E || CC == X86::COND_NE) {
+ switch (Cond.getOpcode()) {
+ default: break;
+ case X86ISD::BSR:
+ case X86ISD::BSF:
+ // If the operand of BSR / BSF is proven never zero, then ZF cannot be set.
+ if (DAG.isKnownNeverZero(Cond.getOperand(0)))
+ return (CC == X86::COND_E) ? FalseOp : TrueOp;
+ }
+ }
+
+ SDValue Flags;
+
+ Flags = checkBoolTestSetCCCombine(Cond, CC);
+ if (Flags.getNode() &&
+ // Extra check as FCMOV only supports a subset of X86 cond.
+ (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC))) {
+ SDValue Ops[] = { FalseOp, TrueOp,
+ DAG.getConstant(CC, DL, MVT::i8), Flags };
+ return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
+ }
+
+ // If this is a select between two integer constants, try to do some
+ // optimizations. Note that the operands are ordered the opposite of SELECT
+ // operands.
+ if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
+ if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
+ // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
+ // larger than FalseC (the false value).
+ if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
+ CC = X86::GetOppositeBranchCondition(CC);
+ std::swap(TrueC, FalseC);
+ std::swap(TrueOp, FalseOp);
+ }
+
+ // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
+ // This is efficient for any integer data type (including i8/i16) and
+ // shift amount.
+ if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
+ Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
+ DAG.getConstant(CC, DL, MVT::i8), Cond);
+
+ // Zero extend the condition if needed.
+ Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
+
+ unsigned ShAmt = TrueC->getAPIntValue().logBase2();
+ Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
+ DAG.getConstant(ShAmt, DL, MVT::i8));
+ if (N->getNumValues() == 2) // Dead flag value?
+ return DCI.CombineTo(N, Cond, SDValue());
+ return Cond;
+ }
+
+ // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst. This is efficient
+ // for any integer data type, including i8/i16.
+ if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
+ Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
+ DAG.getConstant(CC, DL, MVT::i8), Cond);
+
+ // Zero extend the condition if needed.
+ Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
+ FalseC->getValueType(0), Cond);
+ Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
+ SDValue(FalseC, 0));
+
+ if (N->getNumValues() == 2) // Dead flag value?
+ return DCI.CombineTo(N, Cond, SDValue());
+ return Cond;
+ }
+
+ // Optimize cases that will turn into an LEA instruction. This requires
+ // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
+ if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
+ uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
+ if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
+
+ bool isFastMultiplier = false;
+ if (Diff < 10) {
+ switch ((unsigned char)Diff) {
+ default: break;
+ case 1: // result = add base, cond
+ case 2: // result = lea base( , cond*2)
+ case 3: // result = lea base(cond, cond*2)
+ case 4: // result = lea base( , cond*4)
+ case 5: // result = lea base(cond, cond*4)
+ case 8: // result = lea base( , cond*8)
+ case 9: // result = lea base(cond, cond*8)
+ isFastMultiplier = true;
+ break;
+ }
+ }
+
+ if (isFastMultiplier) {
+ APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
+ Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
+ DAG.getConstant(CC, DL, MVT::i8), Cond);
+ // Zero extend the condition if needed.
+ Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
+ Cond);
+ // Scale the condition by the difference.
+ if (Diff != 1)
+ Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
+ DAG.getConstant(Diff, DL, Cond.getValueType()));
+
+ // Add the base if non-zero.
+ if (FalseC->getAPIntValue() != 0)
+ Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
+ SDValue(FalseC, 0));
+ if (N->getNumValues() == 2) // Dead flag value?
+ return DCI.CombineTo(N, Cond, SDValue());
+ return Cond;
+ }
+ }
+ }
+ }
+
+ // Handle these cases:
+ // (select (x != c), e, c) -> (select (x != c), e, x)
+ // (select (x == c), c, e) -> (select (x == c), x, e)
+ // where the c is an integer constant, and the "select" is the combination
+ // of CMOV and CMP.
+ //
+ // The rationale for this change is that a conditional-move from a constant
+ // needs two instructions, whereas a conditional-move from a register needs
+ // only one instruction.
+ //
+ // CAVEAT: Replacing a constant with a symbolic value may obscure some
+ // instruction-combining opportunities, so this optimization needs to be
+ // postponed as late as possible.
+ //
+ if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
+ // The DCI.xxxx conditions above postpone the optimization as late as
+ // possible.
+
+ ConstantSDNode *CmpAgainst = nullptr;
+ if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
+ (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
+ !isa<ConstantSDNode>(Cond.getOperand(0))) {
+
+ if (CC == X86::COND_NE &&
+ CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
+ CC = X86::GetOppositeBranchCondition(CC);
+ std::swap(TrueOp, FalseOp);
+ }
+
+ if (CC == X86::COND_E &&
+ CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
+ SDValue Ops[] = { FalseOp, Cond.getOperand(0),
+ DAG.getConstant(CC, DL, MVT::i8), Cond };
+ return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
+ }
+ }
+ }
+
+ // Fold and/or of setcc's to double CMOV:
+ // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
+ // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
+ //
+ // This combine lets us generate:
+ // cmovcc1 (jcc1 if we don't have CMOV)
+ // cmovcc2 (same)
+ // instead of:
+ // setcc1
+ // setcc2
+ // and/or
+ // cmovne (jne if we don't have CMOV)
+ // When we can't use the CMOV instruction, it might increase branch
+ // mispredicts.
+ // When we can use CMOV, or when there is no mispredict, this improves
+ // throughput and reduces register pressure.
+ //
+ if (CC == X86::COND_NE) {
+ SDValue Flags;
+ X86::CondCode CC0, CC1;
+ bool isAndSetCC;
+ if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
+ if (isAndSetCC) {
+ std::swap(FalseOp, TrueOp);
+ CC0 = X86::GetOppositeBranchCondition(CC0);
+ CC1 = X86::GetOppositeBranchCondition(CC1);
+ }
+
+ SDValue LOps[] = {FalseOp, TrueOp, DAG.getConstant(CC0, DL, MVT::i8),
+ Flags};
+ SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), LOps);
+ SDValue Ops[] = {LCMOV, TrueOp, DAG.getConstant(CC1, DL, MVT::i8), Flags};
+ SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(CMOV.getNode(), 1));
+ return CMOV;
+ }
+ }
+
+ return SDValue();
+}
+
+/// PerformMulCombine - Optimize a single multiply by a constant into two
+/// multiplies in order to implement it with two cheaper instructions, e.g.
+/// LEA + SHL, LEA + LEA.
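+///
+/// For example (illustrative):
+///   mul x, 45 --> (mul_imm (mul_imm x, 9), 5)  ; two LEAs
+///   mul x, 40 --> (mul_imm (shl x, 3), 5)      ; SHL + LEA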
+static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ // An imul is usually smaller than the alternative sequence.
+ if (DAG.getMachineFunction().getFunction()->optForMinSize())
+ return SDValue();
+
+ if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
+ return SDValue();
+
+ EVT VT = N->getValueType(0);
+ if (VT != MVT::i64 && VT != MVT::i32)
+ return SDValue();
+
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ if (!C)
+ return SDValue();
+ uint64_t MulAmt = C->getZExtValue();
+ if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
+ return SDValue();
+
+ uint64_t MulAmt1 = 0;
+ uint64_t MulAmt2 = 0;
+ if ((MulAmt % 9) == 0) {
+ MulAmt1 = 9;
+ MulAmt2 = MulAmt / 9;
+ } else if ((MulAmt % 5) == 0) {
+ MulAmt1 = 5;
+ MulAmt2 = MulAmt / 5;
+ } else if ((MulAmt % 3) == 0) {
+ MulAmt1 = 3;
+ MulAmt2 = MulAmt / 3;
+ }
+
+ SDLoc DL(N);
+ SDValue NewMul;
+ if (MulAmt2 &&
+ (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)) {
+
+ if (isPowerOf2_64(MulAmt2) &&
+ !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
+ // If the second multiplier is a power of 2, issue it first. We want the
+ // multiply by 3, 5, or 9 to be folded into the addressing mode unless
+ // the lone use is an add.
+ std::swap(MulAmt1, MulAmt2);
+
+ if (isPowerOf2_64(MulAmt1))
+ NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
+ DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
+ else
+ NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
+ DAG.getConstant(MulAmt1, DL, VT));
+
+ if (isPowerOf2_64(MulAmt2))
+ NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
+ DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
+ else
+ NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
+ DAG.getConstant(MulAmt2, DL, VT));
+ }
+
+ if (!NewMul) {
+ assert(MulAmt != 0 && MulAmt != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX)
+ && "Both cases that could cause potential overflows should have "
+ "already been handled.");
+ if (isPowerOf2_64(MulAmt - 1))
+ // (mul x, 2^N + 1) => (add (shl x, N), x)
+ NewMul = DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
+ DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
+ DAG.getConstant(Log2_64(MulAmt - 1), DL,
+ MVT::i8)));
+
+ else if (isPowerOf2_64(MulAmt + 1))
+ // (mul x, 2^N - 1) => (sub (shl x, N), x)
+ NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getNode(ISD::SHL, DL, VT,
+ N->getOperand(0),
+ DAG.getConstant(Log2_64(MulAmt + 1),
+ DL, MVT::i8)), N->getOperand(0));
+ }
+
+ if (NewMul)
+ // Do not add new nodes to the DAG combiner worklist.
+ DCI.CombineTo(N, NewMul, false);
+
+ return SDValue();
+}
+
+static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+ EVT VT = N0.getValueType();
+
+ // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
+ // since the result of setcc_c is all zero's or all ones.
+ if (VT.isInteger() && !VT.isVector() &&
+ N1C && N0.getOpcode() == ISD::AND &&
+ N0.getOperand(1).getOpcode() == ISD::Constant) {
+ SDValue N00 = N0.getOperand(0);
+ APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
+ APInt ShAmt = N1C->getAPIntValue();
+ Mask = Mask.shl(ShAmt);
+ bool MaskOK = false;
+ // We can handle cases concerning bit-widening nodes containing setcc_c if
+ // we carefully interrogate the mask to make sure the transform is
+ // semantics preserving.
+ // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
+ // of the underlying setcc_c operation when the setcc_c was zero extended.
+ // Consider the following example:
+ // zext(setcc_c) -> i32 0x0000FFFF
+ // c1 -> i32 0x0000FFFF
+ // c2 -> i32 0x00000001
+ // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
+ // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
+ if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
+ MaskOK = true;
+ } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
+ N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
+ MaskOK = true;
+ } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
+ N00.getOpcode() == ISD::ANY_EXTEND) &&
+ N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
+ MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
+ }
+ if (MaskOK && Mask != 0) {
+ SDLoc DL(N);
+ return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
+ }
+ }
+
+ // Hardware support for vector shifts is sparse, which makes us scalarize
+ // the vector operations in many cases. Also, on Sandy Bridge ADD is faster
+ // than SHL.
+ // (shl V, 1) -> add V,V
+ if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
+ if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
+ assert(N0.getValueType().isVector() && "Invalid vector shift type");
+ // We shift all of the values by one. In many cases we do not have
+ // hardware support for this operation. This is better expressed as an ADD
+ // of two values.
+ if (N1SplatC->getAPIntValue() == 1)
+ return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
+ }
+
+ return SDValue();
+}
+
+static SDValue PerformSRACombine(SDNode *N, SelectionDAG &DAG) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N0.getValueType();
+ unsigned Size = VT.getSizeInBits();
+
+ // fold (ashr (shl a, [56,48,32,24,16]), SarConst)
+ // into (shl (sext_in_reg a), [56,48,32,24,16] - SarConst) or
+ // into (sra (sext_in_reg a), SarConst - [56,48,32,24,16])
+ // depending on the sign of (SarConst - [56,48,32,24,16]).
+
+ // sexts in X86 are MOVs. The MOVs have the same code size
+ // as the SHIFTs above (only a SHIFT by 1 has lower code size).
+ // However the MOVs have 2 advantages over a SHIFT:
+ // 1. MOVs can write to a register that differs from the source.
+ // 2. MOVs accept memory operands.
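+ //
+ // For example (illustrative, i32):
+ //   (sra (shl x, 24), 25) --> (sra (sext_in_reg x, i8), 1)
+ //   (sra (shl x, 24), 22) --> (shl (sext_in_reg x, i8), 2)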
+
+ if (!VT.isInteger() || VT.isVector() || N1.getOpcode() != ISD::Constant ||
+ N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
+ N0.getOperand(1).getOpcode() != ISD::Constant)
+ return SDValue();
+
+ SDValue N00 = N0.getOperand(0);
+ SDValue N01 = N0.getOperand(1);
+ APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
+ APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
+ EVT CVT = N1.getValueType();
+
+ if (SarConst.isNegative())
+ return SDValue();
+
+ for (MVT SVT : MVT::integer_valuetypes()) {
+ unsigned ShiftSize = SVT.getSizeInBits();
+ // Skip types without a corresponding sext/zext and ShlConst values
+ // that are not one of [56,48,32,24,16].
+ if (ShiftSize < 8 || ShiftSize > 64 || ShlConst != Size - ShiftSize)
+ continue;
+ SDLoc DL(N);
+ SDValue NN =
+ DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
+ SarConst = SarConst - (Size - ShiftSize);
+ if (SarConst == 0)
+ return NN;
+ else if (SarConst.isNegative())
+ return DAG.getNode(ISD::SHL, DL, VT, NN,
+ DAG.getConstant(-SarConst, DL, CVT));
+ else
+ return DAG.getNode(ISD::SRA, DL, VT, NN,
+ DAG.getConstant(SarConst, DL, CVT));
+ }
+ return SDValue();
+}
+
+/// \brief Returns a vector of 0s if the input node is a vector logical
+/// shift by a constant amount which is known to be bigger than or equal
+/// to the vector element size in bits.
+static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ EVT VT = N->getValueType(0);
+
+ if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 &&
+ (!Subtarget->hasInt256() ||
+ (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16)))
+ return SDValue();
+
+ SDValue Amt = N->getOperand(1);
+ SDLoc DL(N);
+ if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Amt))
+ if (auto *AmtSplat = AmtBV->getConstantSplatNode()) {
+ APInt ShiftAmt = AmtSplat->getAPIntValue();
+ unsigned MaxAmount =
+ VT.getSimpleVT().getVectorElementType().getSizeInBits();
+
+ // SSE2/AVX2 logical shifts always return a vector of 0s
+ // if the shift amount is bigger than or equal to
+ // the element size. The constant shift amount will be
+ // encoded as an 8-bit immediate.
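+ // For example (illustrative): (srl v8i16:x, splat(16)) --> zero vector.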
+ if (ShiftAmt.trunc(8).uge(MaxAmount))
+ return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, DL);
+ }
+
+ return SDValue();
+}
+
+/// PerformShiftCombine - Combine shifts.
+static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget *Subtarget) {
+ if (N->getOpcode() == ISD::SHL)
+ if (SDValue V = PerformSHLCombine(N, DAG))
+ return V;
+
+ if (N->getOpcode() == ISD::SRA)
+ if (SDValue V = PerformSRACombine(N, DAG))
+ return V;
+
+ // Try to fold this logical shift into a zero vector.
+ if (N->getOpcode() != ISD::SRA)
+ if (SDValue V = performShiftToAllZeros(N, DAG, Subtarget))
+ return V;
+
+ return SDValue();
+}
+
+// CMPEQCombine - Recognize the distinctive (AND (setcc ...) (setcc ..))
+// where both setccs reference the same FP CMP, and rewrite for CMPEQSS
+// and friends. Likewise for OR -> CMPNEQSS.
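+//
+// For example (illustrative, an ordered-equal f32 compare):
+//   (and (setcc_e (cmp a, b)) (setcc_np (cmp a, b)))
+//     --> (and (bitcast (cmpeqss a, b)), 1)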
+static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget *Subtarget) {
+ unsigned opcode;
+
+ // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
+ // we're requiring SSE2 for both.
+ if (Subtarget->hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDValue CMP0 = N0->getOperand(1);
+ SDValue CMP1 = N1->getOperand(1);
+ SDLoc DL(N);
+
+ // The SETCCs should both refer to the same CMP.
+ if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
+ return SDValue();
+
+ SDValue CMP00 = CMP0->getOperand(0);
+ SDValue CMP01 = CMP0->getOperand(1);
+ EVT VT = CMP00.getValueType();
+
+ if (VT == MVT::f32 || VT == MVT::f64) {
+ bool ExpectingFlags = false;
+ // Check for any users that want flags:
+ for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
+ !ExpectingFlags && UI != UE; ++UI)
+ switch (UI->getOpcode()) {
+ default:
+ case ISD::BR_CC:
+ case ISD::BRCOND:
+ case ISD::SELECT:
+ ExpectingFlags = true;
+ break;
+ case ISD::CopyToReg:
+ case ISD::SIGN_EXTEND:
+ case ISD::ZERO_EXTEND:
+ case ISD::ANY_EXTEND:
+ break;
+ }
+
+ if (!ExpectingFlags) {
+ enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
+ enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
+
+ if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
+ X86::CondCode tmp = cc0;
+ cc0 = cc1;
+ cc1 = tmp;
+ }
+
+ if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
+ (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
+ // FIXME: need symbolic constants for these magic numbers.
+ // See X86ATTInstPrinter.cpp:printSSECC().
+ unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
+ if (Subtarget->hasAVX512()) {
+ SDValue FSetCC = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CMP00,
+ CMP01,
+ DAG.getConstant(x86cc, DL, MVT::i8));
+ if (N->getValueType(0) != MVT::i1)
+ return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0),
+ FSetCC);
+ return FSetCC;
+ }
+ SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
+ CMP00.getValueType(), CMP00, CMP01,
+ DAG.getConstant(x86cc, DL,
+ MVT::i8));
+
+ bool is64BitFP = (CMP00.getValueType() == MVT::f64);
+ MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
+
+ if (is64BitFP && !Subtarget->is64Bit()) {
+ // On a 32-bit target, we cannot bitcast the 64-bit float to a
+ // 64-bit integer, since that's not a legal type. Since
+ // OnesOrZeroesF is all ones or all zeroes, we don't need all the
+ // bits, but can do this little dance to extract the lowest 32 bits
+ // and work with those going forward.
+ SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
+ OnesOrZeroesF);
+ SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
+ OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
+ Vector32, DAG.getIntPtrConstant(0, DL));
+ IntVT = MVT::i32;
+ }
+
+ SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
+ SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
+ DAG.getConstant(1, DL, IntVT));
+ SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
+ ANDed);
+ return OneBitOfTruth;
+ }
+ }
+ }
+ }
+ return SDValue();
+}
+
+/// CanFoldXORWithAllOnes - Test whether the XOR operand is an AllOnes vector
+/// so it can be folded inside ANDNP.
+static bool CanFoldXORWithAllOnes(const SDNode *N) {
+ EVT VT = N->getValueType(0);
+
+ // Match direct AllOnes for 128- and 256-bit vectors.
+ if (ISD::isBuildVectorAllOnes(N))
+ return true;
+
+ // Look through a bit convert.
+ if (N->getOpcode() == ISD::BITCAST)
+ N = N->getOperand(0).getNode();
+
+ // Sometimes the operand may come from an insert_subvector building a
+ // 256-bit all-ones vector.
+ if (VT.is256BitVector() &&
+ N->getOpcode() == ISD::INSERT_SUBVECTOR) {
+ SDValue V1 = N->getOperand(0);
+ SDValue V2 = N->getOperand(1);
+
+ if (V1.getOpcode() == ISD::INSERT_SUBVECTOR &&
+ V1.getOperand(0).getOpcode() == ISD::UNDEF &&
+ ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) &&
+ ISD::isBuildVectorAllOnes(V2.getNode()))
+ return true;
+ }
+
+ return false;
+}
+
+// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM-sized
+// register. In most cases we actually compare or select YMM-sized registers,
+// and mixing the two types creates horrible code. This method optimizes
+// some of the transition sequences.
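+//
+// For example (illustrative):
+//   (sext (and (trunc x:v8i32 to v8i16), (trunc y:v8i32 to v8i16)) to v8i32)
+//     --> (sext_in_reg (and x, y), v8i16)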
+static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget *Subtarget) {
+ EVT VT = N->getValueType(0);
+ if (!VT.is256BitVector())
+ return SDValue();
+
+ assert((N->getOpcode() == ISD::ANY_EXTEND ||
+ N->getOpcode() == ISD::ZERO_EXTEND ||
+ N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
+
+ SDValue Narrow = N->getOperand(0);
+ EVT NarrowVT = Narrow->getValueType(0);
+ if (!NarrowVT.is128BitVector())
+ return SDValue();
+
+ if (Narrow->getOpcode() != ISD::XOR &&
+ Narrow->getOpcode() != ISD::AND &&
+ Narrow->getOpcode() != ISD::OR)
+ return SDValue();
+
+ SDValue N0 = Narrow->getOperand(0);
+ SDValue N1 = Narrow->getOperand(1);
+ SDLoc DL(Narrow);
+
+ // The Left side has to be a trunc.
+ if (N0.getOpcode() != ISD::TRUNCATE)
+ return SDValue();
+
+ // The type of the truncated inputs.
+ EVT WideVT = N0->getOperand(0)->getValueType(0);
+ if (WideVT != VT)
+ return SDValue();
+
+ // The right side has to be a 'trunc' or a constant vector.
+ bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE;
+ ConstantSDNode *RHSConstSplat = nullptr;
+ if (auto *RHSBV = dyn_cast<BuildVectorSDNode>(N1))
+ RHSConstSplat = RHSBV->getConstantSplatNode();
+ if (!RHSTrunc && !RHSConstSplat)
+ return SDValue();
+
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+ if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), WideVT))
+ return SDValue();
+
+ // Set N0 and N1 to hold the inputs to the new wide operation.
+ N0 = N0->getOperand(0);
+ if (RHSConstSplat) {
+ N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getVectorElementType(),
+ SDValue(RHSConstSplat, 0));
+ SmallVector<SDValue, 8> C(WideVT.getVectorNumElements(), N1);
+ N1 = DAG.getNode(ISD::BUILD_VECTOR, DL, WideVT, C);
+ } else if (RHSTrunc) {
+ N1 = N1->getOperand(0);
+ }
+
+ // Generate the wide operation.
+ SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, WideVT, N0, N1);
+ unsigned Opcode = N->getOpcode();
+ switch (Opcode) {
+ case ISD::ANY_EXTEND:
+ return Op;
+ case ISD::ZERO_EXTEND: {
+ unsigned InBits = NarrowVT.getScalarSizeInBits();
+ APInt Mask = APInt::getAllOnesValue(InBits);
+ Mask = Mask.zext(VT.getScalarSizeInBits());
+ return DAG.getNode(ISD::AND, DL, VT,
+ Op, DAG.getConstant(Mask, DL, VT));
+ }
+ case ISD::SIGN_EXTEND:
+ return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
+ Op, DAG.getValueType(NarrowVT));
+ default:
+ llvm_unreachable("Unexpected opcode");
+ }
+}
+
+static SDValue VectorZextCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget *Subtarget) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDLoc DL(N);
+
+ // A vector zext_in_reg may be represented as a shuffle,
+ // feeding into a bitcast (this represents an anyext) feeding into
+ // an and with a mask.
+ // We'd like to try to combine that into a shuffle with zero
+ // plus a bitcast, removing the and.
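+ //
+ // For example (illustrative, zext v4i8 -> v4i32):
+ //   (and (bitcast (shuffle x, undef, <0,u,u,u,1,u,u,u,...>)), splat(0xFF))
+ //     --> (bitcast (shuffle x, zero, <0,z,z,z,1,z,z,z,...>))
+ // where the z lanes are taken from the zero vector.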
+ if (N0.getOpcode() != ISD::BITCAST ||
+ N0.getOperand(0).getOpcode() != ISD::VECTOR_SHUFFLE)
+ return SDValue();
+
+ // The other side of the AND should be a splat of 2^C - 1, where C
+ // is the number of bits in the source scalar type.
+ if (N1.getOpcode() == ISD::BITCAST)
+ N1 = N1.getOperand(0);
+ if (N1.getOpcode() != ISD::BUILD_VECTOR)
+ return SDValue();
+ BuildVectorSDNode *Vector = cast<BuildVectorSDNode>(N1);
+
+ ShuffleVectorSDNode *Shuffle = cast<ShuffleVectorSDNode>(N0.getOperand(0));
+ EVT SrcType = Shuffle->getValueType(0);
+
+ // We expect a single-source shuffle
+ if (Shuffle->getOperand(1)->getOpcode() != ISD::UNDEF)
+ return SDValue();
+
+ unsigned SrcSize = SrcType.getScalarSizeInBits();
+
+ APInt SplatValue, SplatUndef;
+ unsigned SplatBitSize;
+ bool HasAnyUndefs;
+ if (!Vector->isConstantSplat(SplatValue, SplatUndef,
+ SplatBitSize, HasAnyUndefs))
+ return SDValue();
+
+ unsigned ResSize = N1.getValueType().getScalarSizeInBits();
+ // Make sure the splat matches the mask we expect
+ if (SplatBitSize > ResSize ||
+ (SplatValue + 1).exactLogBase2() != (int)SrcSize)
+ return SDValue();
+
+ // Make sure the input and output size make sense
+ if (SrcSize >= ResSize || ResSize % SrcSize)
+ return SDValue();
+
+ // We expect a shuffle of the form <0, u, u, u, 1, u, u, u...>
+ // The number of u's between each two values depends on the ratio between
+ // the source and dest type.
+ unsigned ZextRatio = ResSize / SrcSize;
+ bool IsZext = true;
+ for (unsigned i = 0; i < SrcType.getVectorNumElements(); ++i) {
+ if (i % ZextRatio) {
+ if (Shuffle->getMaskElt(i) > 0) {
+ // Expected undef
+ IsZext = false;
+ break;
+ }
+ } else {
+ if (Shuffle->getMaskElt(i) != (int)(i / ZextRatio)) {
+ // Expected element number
+ IsZext = false;
+ break;
+ }
+ }
+ }
+
+ if (!IsZext)
+ return SDValue();
+
+ // Ok, perform the transformation - replace the shuffle with
+ // a shuffle of the form <0, k, k, k, 1, k, k, k> with zero
+ // (instead of undef) where the k elements come from the zero vector.
+ SmallVector<int, 8> Mask;
+ unsigned NumElems = SrcType.getVectorNumElements();
+ for (unsigned i = 0; i < NumElems; ++i)
+ if (i % ZextRatio)
+ Mask.push_back(NumElems);
+ else
+ Mask.push_back(i / ZextRatio);
+
+ SDValue NewShuffle = DAG.getVectorShuffle(Shuffle->getValueType(0), DL,
+ Shuffle->getOperand(0), DAG.getConstant(0, DL, SrcType), Mask);
+ return DAG.getBitcast(N0.getValueType(), NewShuffle);
+}
+
+/// If both input operands of a logic op are being cast from floating point
+/// types, try to convert this into a floating point logic node to avoid
+/// unnecessary moves from SSE to integer registers.
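+///
+/// For example (illustrative):
+///   (xor (bitcast f32:a to i32), (bitcast f32:b to i32))
+///     --> (bitcast (fxor a, b) to i32)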
+static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ unsigned FPOpcode = ISD::DELETED_NODE;
+ if (N->getOpcode() == ISD::AND)
+ FPOpcode = X86ISD::FAND;
+ else if (N->getOpcode() == ISD::OR)
+ FPOpcode = X86ISD::FOR;
+ else if (N->getOpcode() == ISD::XOR)
+ FPOpcode = X86ISD::FXOR;
+
+ assert(FPOpcode != ISD::DELETED_NODE &&
+ "Unexpected input node for FP logic conversion");
+
+ EVT VT = N->getValueType(0);
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDLoc DL(N);
+ if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
+ ((Subtarget->hasSSE1() && VT == MVT::i32) ||
+ (Subtarget->hasSSE2() && VT == MVT::i64))) {
+ SDValue N00 = N0.getOperand(0);
+ SDValue N10 = N1.getOperand(0);
+ EVT N00Type = N00.getValueType();
+ EVT N10Type = N10.getValueType();
+ if (N00Type.isFloatingPoint() && N10Type.isFloatingPoint()) {
+ SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
+ return DAG.getBitcast(VT, FPLogic);
+ }
+ }
+ return SDValue();
+}
+
+static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget *Subtarget) {
+ if (DCI.isBeforeLegalizeOps())
+ return SDValue();
+
+ if (SDValue Zext = VectorZextCombine(N, DAG, DCI, Subtarget))
+ return Zext;
+
+ if (SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget))
+ return R;
+
+ if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
+ return FPLogic;
+
+ EVT VT = N->getValueType(0);
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDLoc DL(N);
+
+ // Create BEXTR instructions
+ // BEXTR is ((X >> imm) & (2**size-1))
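+ // For example (illustrative): (and (srl x, 8), 0xFF)
+ //   --> (bextr x, 0x808)  ; start = 8, length = 8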
+ if (VT == MVT::i32 || VT == MVT::i64) {
+ // Check for BEXTR.
+ if ((Subtarget->hasBMI() || Subtarget->hasTBM()) &&
+ (N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::SRL)) {
+ ConstantSDNode *MaskNode = dyn_cast<ConstantSDNode>(N1);
+ ConstantSDNode *ShiftNode = dyn_cast<ConstantSDNode>(N0.getOperand(1));
+ if (MaskNode && ShiftNode) {
+ uint64_t Mask = MaskNode->getZExtValue();
+ uint64_t Shift = ShiftNode->getZExtValue();
+ if (isMask_64(Mask)) {
+ uint64_t MaskSize = countPopulation(Mask);
+ if (Shift + MaskSize <= VT.getSizeInBits())
+ return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0),
+ DAG.getConstant(Shift | (MaskSize << 8), DL,
+ VT));
+ }
+ }
+ } // BEXTR
+
+ return SDValue();
+ }
+
+ // Want to form ANDNP nodes:
+ // 1) In the hopes of then easily combining them with OR and AND nodes
+ // to form PBLEND/PSIGN.
+ // 2) To match ANDN packed intrinsics
+ if (VT != MVT::v2i64 && VT != MVT::v4i64)
+ return SDValue();
+
+ // Check LHS for vnot
+ if (N0.getOpcode() == ISD::XOR &&
+ //ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
+ CanFoldXORWithAllOnes(N0.getOperand(1).getNode()))
+ return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1);
+
+ // Check RHS for vnot
+ if (N1.getOpcode() == ISD::XOR &&
+ //ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
+ CanFoldXORWithAllOnes(N1.getOperand(1).getNode()))
+ return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0);
+
+ return SDValue();
+}
+
+static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget *Subtarget) {
+ if (DCI.isBeforeLegalizeOps())
+ return SDValue();
+
+ if (SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget))
+ return R;
+
+ if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
+ return FPLogic;
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N->getValueType(0);
+
+ // look for psign/blend
+ if (VT == MVT::v2i64 || VT == MVT::v4i64) {
+ if (!Subtarget->hasSSSE3() ||
+ (VT == MVT::v4i64 && !Subtarget->hasInt256()))
+ return SDValue();
+
+ // Canonicalize pandn to RHS
+ if (N0.getOpcode() == X86ISD::ANDNP)
+ std::swap(N0, N1);
+ // or (and (m, y), (pandn m, x))
+ if (N0.getOpcode() == ISD::AND && N1.getOpcode() == X86ISD::ANDNP) {
+ SDValue Mask = N1.getOperand(0);
+ SDValue X = N1.getOperand(1);
+ SDValue Y;
+ if (N0.getOperand(0) == Mask)
+ Y = N0.getOperand(1);
+ if (N0.getOperand(1) == Mask)
+ Y = N0.getOperand(0);
+
+ // Check to see if the mask appeared in both the AND and the ANDNP.
+ if (!Y.getNode())
+ return SDValue();
+
+ // Validate that X, Y, and Mask are BIT_CONVERTS, and see through them.
+ // Look through mask bitcast.
+ if (Mask.getOpcode() == ISD::BITCAST)
+ Mask = Mask.getOperand(0);
+ if (X.getOpcode() == ISD::BITCAST)
+ X = X.getOperand(0);
+ if (Y.getOpcode() == ISD::BITCAST)
+ Y = Y.getOperand(0);
+
+ EVT MaskVT = Mask.getValueType();
+
+ // Validate that the Mask operand is a vector sra node.
+ // FIXME: what to do for bytes, since there is a psignb/pblendvb, but
+ // there is no psrai.b
+ unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits();
+ unsigned SraAmt = ~0;
+ if (Mask.getOpcode() == ISD::SRA) {
+ if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Mask.getOperand(1)))
+ if (auto *AmtConst = AmtBV->getConstantSplatNode())
+ SraAmt = AmtConst->getZExtValue();
+ } else if (Mask.getOpcode() == X86ISD::VSRAI) {
+ SDValue SraC = Mask.getOperand(1);
+ SraAmt = cast<ConstantSDNode>(SraC)->getZExtValue();
+ }
+ if ((SraAmt + 1) != EltBits)
+ return SDValue();
+
+ SDLoc DL(N);
+
+ // Now we know we at least have a pblendvb with the mask val. See if
+ // we can form a psignb/w/d.
+ // psign = x.type == y.type == mask.type && y = sub(0, x);
+ if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X &&
+ ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) &&
+ X.getValueType() == MaskVT && Y.getValueType() == MaskVT) {
+ assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
+ "Unsupported VT for PSIGN");
+ Mask = DAG.getNode(X86ISD::PSIGN, DL, MaskVT, X, Mask.getOperand(0));
+ return DAG.getBitcast(VT, Mask);
+ }
+ // PBLENDVB is only available on SSE 4.1.
+ if (!Subtarget->hasSSE41())
+ return SDValue();
+
+ MVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;
+
+ X = DAG.getBitcast(BlendVT, X);
+ Y = DAG.getBitcast(BlendVT, Y);
+ Mask = DAG.getBitcast(BlendVT, Mask);
+ Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X);
+ return DAG.getBitcast(VT, Mask);
+ }
+ }
+
+ if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
+ return SDValue();
+
+ // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
+ bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize();
+
+ // SHLD/SHRD instructions have lower register pressure, but on some
+ // platforms they have higher latency than the equivalent
+ // series of shifts/or that would otherwise be generated.
+ // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
+ // have higher latencies and we are not optimizing for size.
+ if (!OptForSize && Subtarget->isSHLDSlow())
+ return SDValue();
+
+ if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
+ std::swap(N0, N1);
+ if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
+ return SDValue();
+ if (!N0.hasOneUse() || !N1.hasOneUse())
+ return SDValue();
+
+ SDValue ShAmt0 = N0.getOperand(1);
+ if (ShAmt0.getValueType() != MVT::i8)
+ return SDValue();
+ SDValue ShAmt1 = N1.getOperand(1);
+ if (ShAmt1.getValueType() != MVT::i8)
+ return SDValue();
+ if (ShAmt0.getOpcode() == ISD::TRUNCATE)
+ ShAmt0 = ShAmt0.getOperand(0);
+ if (ShAmt1.getOpcode() == ISD::TRUNCATE)
+ ShAmt1 = ShAmt1.getOperand(0);
+
+ SDLoc DL(N);
+ unsigned Opc = X86ISD::SHLD;
+ SDValue Op0 = N0.getOperand(0);
+ SDValue Op1 = N1.getOperand(0);
+ if (ShAmt0.getOpcode() == ISD::SUB) {
+ Opc = X86ISD::SHRD;
+ std::swap(Op0, Op1);
+ std::swap(ShAmt0, ShAmt1);
+ }
+
+ unsigned Bits = VT.getSizeInBits();
+ if (ShAmt1.getOpcode() == ISD::SUB) {
+ SDValue Sum = ShAmt1.getOperand(0);
+ if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
+ SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
+ if (ShAmt1Op1.getNode()->getOpcode() == ISD::TRUNCATE)
+ ShAmt1Op1 = ShAmt1Op1.getOperand(0);
+ if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
+ return DAG.getNode(Opc, DL, VT,
+ Op0, Op1,
+ DAG.getNode(ISD::TRUNCATE, DL,
+ MVT::i8, ShAmt0));
+ }
+ } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
+ ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
+ if (ShAmt0C &&
+ ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits)
+ return DAG.getNode(Opc, DL, VT,
+ N0.getOperand(0), N1.getOperand(0),
+ DAG.getNode(ISD::TRUNCATE, DL,
+ MVT::i8, ShAmt0));
+ }
+
+ return SDValue();
+}
+
+// Generate NEG and CMOV for integer abs.
+static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
+ EVT VT = N->getValueType(0);
+
+ // Since X86 does not have CMOV for 8-bit integer, we don't convert
+ // 8-bit integer abs to NEG and CMOV.
+ if (VT.isInteger() && VT.getSizeInBits() == 8)
+ return SDValue();
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDLoc DL(N);
+
+ // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
+ // and change it to SUB and CMOV.
+ if (VT.isInteger() && N->getOpcode() == ISD::XOR &&
+ N0.getOpcode() == ISD::ADD &&
+ N0.getOperand(1) == N1 &&
+ N1.getOpcode() == ISD::SRA &&
+ N1.getOperand(0) == N0.getOperand(0))
+ if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1)))
+ if (Y1C->getAPIntValue() == VT.getSizeInBits()-1) {
+ // Generate SUB & CMOV.
+ SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
+ DAG.getConstant(0, DL, VT), N0.getOperand(0));
+
+ SDValue Ops[] = { N0.getOperand(0), Neg,
+ DAG.getConstant(X86::COND_GE, DL, MVT::i8),
+ SDValue(Neg.getNode(), 1) };
+ return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), Ops);
+ }
+ return SDValue();
+}
+
+// Try to turn tests against the signbit in the form of:
+// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
+// into:
+// SETGT(X, -1)
+static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
+ // This is only worth doing if the output type is i8.
+ if (N->getValueType(0) != MVT::i8)
+ return SDValue();
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+
+ // We should be performing an xor against a truncated shift.
+ if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
+ return SDValue();
+
+ // Make sure we are performing an xor against one.
+ if (!isOneConstant(N1))
+ return SDValue();
+
+ // SetCC on x86 zero extends, so only act on this if it's a logical shift.
+ SDValue Shift = N0.getOperand(0);
+ if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
+ return SDValue();
+
+ // Make sure we are truncating from one of i16, i32 or i64.
+ EVT ShiftTy = Shift.getValueType();
+ if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
+ return SDValue();
+
+ // Make sure the shift amount extracts the sign bit.
+ if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
+ Shift.getConstantOperandVal(1) != ShiftTy.getSizeInBits() - 1)
+ return SDValue();
+
+ // Create a greater-than comparison against -1.
+ // N.B. Using SETGE against 0 works but we want a canonical-looking
+ // comparison; using SETGT matches up with what TranslateX86CC produces.
+ SDLoc DL(N);
+ SDValue ShiftOp = Shift.getOperand(0);
+ EVT ShiftOpTy = ShiftOp.getValueType();
+ SDValue Cond = DAG.getSetCC(DL, MVT::i8, ShiftOp,
+ DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
+ return Cond;
+}
+
+static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget *Subtarget) {
+ if (DCI.isBeforeLegalizeOps())
+ return SDValue();
+
+ if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
+ return RV;
+
+ if (Subtarget->hasCMov())
+ if (SDValue RV = performIntegerAbsCombine(N, DAG))
+ return RV;
+
+ if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
+ return FPLogic;
+
+ return SDValue();
+}
+
+/// This function detects the AVG pattern between vectors of unsigned i8/i16,
+/// which is c = (a + b + 1) / 2, and replaces this operation with the more
+/// efficient X86ISD::AVG instruction.
+static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget, SDLoc DL) {
+ if (!VT.isVector() || !VT.isSimple())
+ return SDValue();
+ EVT InVT = In.getValueType();
+ unsigned NumElems = VT.getVectorNumElements();
+
+ EVT ScalarVT = VT.getVectorElementType();
+ if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) &&
+ isPowerOf2_32(NumElems)))
+ return SDValue();
+
+ // InScalarVT is the intermediate type in the AVG pattern and it should be
+ // wider than the original input type (i8/i16).
+ EVT InScalarVT = InVT.getVectorElementType();
+ if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits())
+ return SDValue();
+
+ if (Subtarget->hasAVX512()) {
+ if (VT.getSizeInBits() > 512)
+ return SDValue();
+ } else if (Subtarget->hasAVX2()) {
+ if (VT.getSizeInBits() > 256)
+ return SDValue();
+ } else {
+ if (VT.getSizeInBits() > 128)
+ return SDValue();
+ }
+
+ // Detect the following pattern:
+ //
+ // %1 = zext <N x i8> %a to <N x i32>
+ // %2 = zext <N x i8> %b to <N x i32>
+ // %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
+ // %4 = add nuw nsw <N x i32> %3, %2
+ // %5 = lshr <N x i32> %4, <i32 1 x N>
+ // %6 = trunc <N x i32> %5 to <N x i8>
+ //
+ // In AVX512, the last instruction can also be a trunc store.
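+ //
+ // The resulting X86ISD::AVG node selects to PAVGB/PAVGW, which compute the
+ // rounded unsigned average in a single instruction.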
+
+ if (In.getOpcode() != ISD::SRL)
+ return SDValue();
+
+ // A lambda checking whether the given SDValue is a constant vector whose
+ // elements are all in the range [Min, Max].
+ auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
+ BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V);
+ if (!BV || !BV->isConstant())
+ return false;
+ for (unsigned i = 0, e = V.getNumOperands(); i < e; i++) {
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(V.getOperand(i));
+ if (!C)
+ return false;
+ uint64_t Val = C->getZExtValue();
+ if (Val < Min || Val > Max)
+ return false;
+ }
+ return true;
+ };
+
+ // Check if each element of the vector is right-shifted by one.
+ auto LHS = In.getOperand(0);
+ auto RHS = In.getOperand(1);
+ if (!IsConstVectorInRange(RHS, 1, 1))
+ return SDValue();
+ if (LHS.getOpcode() != ISD::ADD)
+ return SDValue();
+
+ // Detect a pattern of a + b + 1 where the order doesn't matter.
+ SDValue Operands[3];
+ Operands[0] = LHS.getOperand(0);
+ Operands[1] = LHS.getOperand(1);
+
+ // Take care of the case when one of the operands is a constant vector whose
+ // elements are in the range [1, 256] for i8 or [1, 65536] for i16.
+ if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
+ Operands[0].getOpcode() == ISD::ZERO_EXTEND &&
+ Operands[0].getOperand(0).getValueType() == VT) {
+ // The pattern is detected. Subtract one from the constant vector, then
+ // demote it and emit the X86ISD::AVG instruction.
+ SDValue One = DAG.getConstant(1, DL, InScalarVT);
+ SDValue Ones = DAG.getNode(ISD::BUILD_VECTOR, DL, InVT,
+ SmallVector<SDValue, 8>(NumElems, One));
+ Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], Ones);
+ Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
+ return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
+ Operands[1]);
+ }
+
+ if (Operands[0].getOpcode() == ISD::ADD)
+ std::swap(Operands[0], Operands[1]);
+ else if (Operands[1].getOpcode() != ISD::ADD)
+ return SDValue();
+ Operands[2] = Operands[1].getOperand(0);
+ Operands[1] = Operands[1].getOperand(1);
+
+ // Now we have three operands of two additions. Check that one of them is a
+ // constant vector with ones, and the other two are promoted from i8/i16.
+ for (int i = 0; i < 3; ++i) {
+ if (!IsConstVectorInRange(Operands[i], 1, 1))
+ continue;
+ std::swap(Operands[i], Operands[2]);
+
+ // Check if Operands[0] and Operands[1] are results of type promotion.
+ for (int j = 0; j < 2; ++j)
+ if (Operands[j].getOpcode() != ISD::ZERO_EXTEND ||
+ Operands[j].getOperand(0).getValueType() != VT)
+ return SDValue();
+
+ // The pattern is detected, emit the X86ISD::AVG instruction.
+ return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
+ Operands[1].getOperand(0));
+ }
+
+ return SDValue();
+}
+
+/// PerformLOADCombine - Do target-specific dag combines on LOAD nodes.
+static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget *Subtarget) {
+ LoadSDNode *Ld = cast<LoadSDNode>(N);
+ EVT RegVT = Ld->getValueType(0);
+ EVT MemVT = Ld->getMemoryVT();
+ SDLoc dl(Ld);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+ // For chips with slow 32-byte unaligned loads, break the 32-byte operation
+ // into two 16-byte operations.
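+ // For example (illustrative): an unaligned 32-byte v8f32 load becomes two
+ // v4f32 loads whose results are recombined with insert_subvector.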
+ ISD::LoadExtType Ext = Ld->getExtensionType();
+ bool Fast;
+ unsigned AddressSpace = Ld->getAddressSpace();
+ unsigned Alignment = Ld->getAlignment();
+ if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
+ Ext == ISD::NON_EXTLOAD &&
+ TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
+ AddressSpace, Alignment, &Fast) && !Fast) {
+ unsigned NumElems = RegVT.getVectorNumElements();
+ if (NumElems < 2)
+ return SDValue();
+
+ SDValue Ptr = Ld->getBasePtr();
+ SDValue Increment =
+ DAG.getConstant(16, dl, TLI.getPointerTy(DAG.getDataLayout()));
+
+ EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
+ NumElems/2);
+ SDValue Load1 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
+ Ld->getPointerInfo(), Ld->isVolatile(),
+ Ld->isNonTemporal(), Ld->isInvariant(),
+ Alignment);
+ Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
+ SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
+ Ld->getPointerInfo(), Ld->isVolatile(),
+ Ld->isNonTemporal(), Ld->isInvariant(),
+ std::min(16U, Alignment));
+ SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ Load1.getValue(1),
+ Load2.getValue(1));
+
+ SDValue NewVec = DAG.getUNDEF(RegVT);
+ NewVec = Insert128BitVector(NewVec, Load1, 0, DAG, dl);
+ NewVec = Insert128BitVector(NewVec, Load2, NumElems/2, DAG, dl);
+ return DCI.CombineTo(N, NewVec, TF, true);
+ }
+
+ return SDValue();
+}
+
+/// PerformMLOADCombine - Resolve extending loads
+static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget *Subtarget) {
+ MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);
+ if (Mld->getExtensionType() != ISD::SEXTLOAD)
+ return SDValue();
+
+ EVT VT = Mld->getValueType(0);
+ unsigned NumElems = VT.getVectorNumElements();
+ EVT LdVT = Mld->getMemoryVT();
+ SDLoc dl(Mld);
+
+ assert(LdVT != VT && "Cannot extend to the same type");
+ unsigned ToSz = VT.getVectorElementType().getSizeInBits();
+ unsigned FromSz = LdVT.getVectorElementType().getSizeInBits();
+ // From/To sizes and ElemCount must be powers of two.
+ assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
+ "Unexpected size for extending masked load");
+
+ unsigned SizeRatio = ToSz / FromSz;
+ assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits());
+
+ // Create a type on which we perform the shuffle
+ EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
+ LdVT.getScalarType(), NumElems*SizeRatio);
+ assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
+
+ // Convert Src0 value
+ SDValue WideSrc0 = DAG.getBitcast(WideVecVT, Mld->getSrc0());
+ if (Mld->getSrc0().getOpcode() != ISD::UNDEF) {
+ SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
+ for (unsigned i = 0; i != NumElems; ++i)
+ ShuffleVec[i] = i * SizeRatio;
+
+ // Can't shuffle using an illegal type.
+ assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
+ "WideVecVT should be legal");
+ WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0,
+ DAG.getUNDEF(WideVecVT), &ShuffleVec[0]);
+ }
+ // Prepare the new mask
+ SDValue NewMask;
+ SDValue Mask = Mld->getMask();
+ if (Mask.getValueType() == VT) {
+ // Mask and original value have the same type
+ NewMask = DAG.getBitcast(WideVecVT, Mask);
+ SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
+ for (unsigned i = 0; i != NumElems; ++i)
+ ShuffleVec[i] = i * SizeRatio;
+ for (unsigned i = NumElems; i != NumElems * SizeRatio; ++i)
+ ShuffleVec[i] = NumElems * SizeRatio;
+ NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
+ DAG.getConstant(0, dl, WideVecVT),
+ &ShuffleVec[0]);
+ }
+ else {
+ assert(Mask.getValueType().getVectorElementType() == MVT::i1);
+ unsigned WidenNumElts = NumElems*SizeRatio;
+ unsigned MaskNumElts = VT.getVectorNumElements();
+ EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+ WidenNumElts);
+
+ unsigned NumConcat = WidenNumElts / MaskNumElts;
+ SmallVector<SDValue, 16> Ops(NumConcat);
+ SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
+ Ops[0] = Mask;
+ for (unsigned i = 1; i != NumConcat; ++i)
+ Ops[i] = ZeroVal;
+
+ NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
+ }
+
+ SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(),
+ Mld->getBasePtr(), NewMask, WideSrc0,
+ Mld->getMemoryVT(), Mld->getMemOperand(),
+ ISD::NON_EXTLOAD);
+ SDValue NewVec = DAG.getNode(X86ISD::VSEXT, dl, VT, WideLd);
+ return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
+}
+
+/// PerformMSTORECombine - Resolve truncating stores
+static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
+ if (!Mst->isTruncatingStore())
+ return SDValue();
+
+ EVT VT = Mst->getValue().getValueType();
+ unsigned NumElems = VT.getVectorNumElements();
+ EVT StVT = Mst->getMemoryVT();
+ SDLoc dl(Mst);
+
+ assert(StVT != VT && "Cannot truncate to the same type");
+ unsigned FromSz = VT.getVectorElementType().getSizeInBits();
+ unsigned ToSz = StVT.getVectorElementType().getSizeInBits();
+
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+  // The truncating store is legal in some cases; for example, AVX-512
+  // provides vpmovqb, vpmovqw, vpmovqd, vpmovdb and vpmovdw as truncating
+  // stores. In that case we don't need any further transformations.
+ if (TLI.isTruncStoreLegal(VT, StVT))
+ return SDValue();
+
+  // The From/To sizes and the element count must be powers of two.
+  assert(isPowerOf2_32(NumElems * FromSz * ToSz) &&
+         "Unexpected size for truncating masked store");
+  // We are going to use the original vector element for storing; the
+  // accumulated smaller vector elements must be a multiple of the store size.
+  assert(((NumElems * FromSz) % ToSz) == 0 &&
+         "Unexpected ratio for truncating masked store");
+
+ unsigned SizeRatio = FromSz / ToSz;
+ assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
+
+ // Create a type on which we perform the shuffle
+ EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
+ StVT.getScalarType(), NumElems*SizeRatio);
+
+ assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
+
+ SDValue WideVec = DAG.getBitcast(WideVecVT, Mst->getValue());
+ SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
+ for (unsigned i = 0; i != NumElems; ++i)
+ ShuffleVec[i] = i * SizeRatio;
+
+ // Can't shuffle using an illegal type.
+ assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
+ "WideVecVT should be legal");
+
+ SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
+ DAG.getUNDEF(WideVecVT),
+ &ShuffleVec[0]);
+
+ SDValue NewMask;
+ SDValue Mask = Mst->getMask();
+ if (Mask.getValueType() == VT) {
+ // Mask and original value have the same type
+ NewMask = DAG.getBitcast(WideVecVT, Mask);
+ for (unsigned i = 0; i != NumElems; ++i)
+ ShuffleVec[i] = i * SizeRatio;
+ for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
+ ShuffleVec[i] = NumElems*SizeRatio;
+ NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
+ DAG.getConstant(0, dl, WideVecVT),
+ &ShuffleVec[0]);
+  } else {
+ assert(Mask.getValueType().getVectorElementType() == MVT::i1);
+ unsigned WidenNumElts = NumElems*SizeRatio;
+ unsigned MaskNumElts = VT.getVectorNumElements();
+ EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+ WidenNumElts);
+
+ unsigned NumConcat = WidenNumElts / MaskNumElts;
+ SmallVector<SDValue, 16> Ops(NumConcat);
+ SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
+ Ops[0] = Mask;
+ for (unsigned i = 1; i != NumConcat; ++i)
+ Ops[i] = ZeroVal;
+
+ NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
+ }
+
+ return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal,
+ Mst->getBasePtr(), NewMask, StVT,
+ Mst->getMemOperand(), false);
+}
+
+/// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
+static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ StoreSDNode *St = cast<StoreSDNode>(N);
+ EVT VT = St->getValue().getValueType();
+ EVT StVT = St->getMemoryVT();
+ SDLoc dl(St);
+ SDValue StoredVal = St->getOperand(1);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+ // If we are saving a concatenation of two XMM registers and 32-byte stores
+ // are slow, such as on Sandy Bridge, perform two 16-byte stores.
+ bool Fast;
+ unsigned AddressSpace = St->getAddressSpace();
+ unsigned Alignment = St->getAlignment();
+ if (VT.is256BitVector() && StVT == VT &&
+ TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
+ AddressSpace, Alignment, &Fast) && !Fast) {
+ unsigned NumElems = VT.getVectorNumElements();
+ if (NumElems < 2)
+ return SDValue();
+
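+    // Split the 256-bit value into two 128-bit halves and emit two stores
+    // 16 bytes apart.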
+ SDValue Value0 = Extract128BitVector(StoredVal, 0, DAG, dl);
+ SDValue Value1 = Extract128BitVector(StoredVal, NumElems/2, DAG, dl);
+
+ SDValue Stride =
+ DAG.getConstant(16, dl, TLI.getPointerTy(DAG.getDataLayout()));
+ SDValue Ptr0 = St->getBasePtr();
+ SDValue Ptr1 = DAG.getNode(ISD::ADD, dl, Ptr0.getValueType(), Ptr0, Stride);
+
+ SDValue Ch0 = DAG.getStore(St->getChain(), dl, Value0, Ptr0,
+ St->getPointerInfo(), St->isVolatile(),
+ St->isNonTemporal(), Alignment);
+ SDValue Ch1 = DAG.getStore(St->getChain(), dl, Value1, Ptr1,
+ St->getPointerInfo(), St->isVolatile(),
+ St->isNonTemporal(),
+ std::min(16U, Alignment));
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
+ }
+
+  // Optimize a truncating store (of multiple scalars) into a shuffle and a
+  // store: first, pack all of the elements in one place; next, store to
+  // memory in fewer chunks.
+ if (St->isTruncatingStore() && VT.isVector()) {
+ // Check if we can detect an AVG pattern from the truncation. If yes,
+ // replace the trunc store by a normal store with the result of X86ISD::AVG
+ // instruction.
+ SDValue Avg =
+ detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG, Subtarget, dl);
+ if (Avg.getNode())
+ return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
+ St->getPointerInfo(), St->isVolatile(),
+ St->isNonTemporal(), St->getAlignment());
+
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ unsigned NumElems = VT.getVectorNumElements();
+ assert(StVT != VT && "Cannot truncate to the same type");
+ unsigned FromSz = VT.getVectorElementType().getSizeInBits();
+ unsigned ToSz = StVT.getVectorElementType().getSizeInBits();
+
+    // The truncating store is legal in some cases; for example, AVX-512
+    // provides vpmovqb, vpmovqw, vpmovqd, vpmovdb and vpmovdw as truncating
+    // stores. In that case we don't need any further transformations.
+ if (TLI.isTruncStoreLegal(VT, StVT))
+ return SDValue();
+
+    // The From/To sizes and the element count must be powers of two.
+    if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
+    // We are going to use the original vector element for storing; the
+    // accumulated smaller vector elements must be a multiple of the store size.
+    if (0 != (NumElems * FromSz) % ToSz) return SDValue();
+
+ unsigned SizeRatio = FromSz / ToSz;
+
+ assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
+
+ // Create a type on which we perform the shuffle
+ EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
+ StVT.getScalarType(), NumElems*SizeRatio);
+
+ assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
+
+ SDValue WideVec = DAG.getBitcast(WideVecVT, St->getValue());
+ SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
+ for (unsigned i = 0; i != NumElems; ++i)
+ ShuffleVec[i] = i * SizeRatio;
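+    // E.g. truncating v8i32 -> v8i16: WideVecVT is v16i16 and the mask
+    // <0,2,4,...,14,-1,...> packs the truncated elements into the low
+    // half of the wide vector.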
+
+ // Can't shuffle using an illegal type.
+ if (!TLI.isTypeLegal(WideVecVT))
+ return SDValue();
+
+ SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
+ DAG.getUNDEF(WideVecVT),
+ &ShuffleVec[0]);
+    // At this point all of the data is stored at the bottom of the
+    // register. We now need to save it to memory.
+
+    // Find the largest legal integer store unit no wider than the truncated data.
+ MVT StoreType = MVT::i8;
+ for (MVT Tp : MVT::integer_valuetypes()) {
+ if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
+ StoreType = Tp;
+ }
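+    // (On a 32-bit target i64 is not legal, so the loop above stops at i32;
+    // the f64 special case below recovers 64-bit chunks there.)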
+
+    // On 32-bit systems we can't store 64-bit integers, so try bitcasting to f64.
+ if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
+ (64 <= NumElems * ToSz))
+ StoreType = MVT::f64;
+
+ // Bitcast the original vector into a vector of store-size units
+ EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
+ StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
+ assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
+ SDValue ShuffWide = DAG.getBitcast(StoreVecVT, Shuff);
+ SmallVector<SDValue, 8> Chains;
+ SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, dl,
+ TLI.getPointerTy(DAG.getDataLayout()));
+ SDValue Ptr = St->getBasePtr();
+
+ // Perform one or more big stores into memory.
+ for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) {
+ SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
+ StoreType, ShuffWide,
+ DAG.getIntPtrConstant(i, dl));
+ SDValue Ch = DAG.getStore(St->getChain(), dl, SubVec, Ptr,
+ St->getPointerInfo(), St->isVolatile(),
+ St->isNonTemporal(), St->getAlignment());
+ Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
+ Chains.push_back(Ch);
+ }
+
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
+ }
+
+  // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
+  // the FP state in cases where an EMMS may be missing.
+  // A preferable solution to the general problem is to figure out the right
+  // places to insert EMMS. This qualifies as a quick hack.
+
+ // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
+ if (VT.getSizeInBits() != 64)
+ return SDValue();
+
+ const Function *F = DAG.getMachineFunction().getFunction();
+ bool NoImplicitFloatOps = F->hasFnAttribute(Attribute::NoImplicitFloat);
+ bool F64IsLegal =
+ !Subtarget->useSoftFloat() && !NoImplicitFloatOps && Subtarget->hasSSE2();
+ if ((VT.isVector() ||
+ (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) &&
+ isa<LoadSDNode>(St->getValue()) &&
+ !cast<LoadSDNode>(St->getValue())->isVolatile() &&
+ St->getChain().hasOneUse() && !St->isVolatile()) {
+ SDNode* LdVal = St->getValue().getNode();
+ LoadSDNode *Ld = nullptr;
+ int TokenFactorIndex = -1;
+ SmallVector<SDValue, 8> Ops;
+ SDNode* ChainVal = St->getChain().getNode();
+ // Must be a store of a load. We currently handle two cases: the load
+ // is a direct child, and it's under an intervening TokenFactor. It is
+ // possible to dig deeper under nested TokenFactors.
+ if (ChainVal == LdVal)
+ Ld = cast<LoadSDNode>(St->getChain());
+ else if (St->getValue().hasOneUse() &&
+ ChainVal->getOpcode() == ISD::TokenFactor) {
+ for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) {
+ if (ChainVal->getOperand(i).getNode() == LdVal) {
+ TokenFactorIndex = i;
+ Ld = cast<LoadSDNode>(St->getValue());
+ } else
+ Ops.push_back(ChainVal->getOperand(i));
+ }
+ }
+
+ if (!Ld || !ISD::isNormalLoad(Ld))
+ return SDValue();
+
+ // If this is not the MMX case, i.e. we are just turning i64 load/store
+ // into f64 load/store, avoid the transformation if there are multiple
+ // uses of the loaded value.
+ if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
+ return SDValue();
+
+ SDLoc LdDL(Ld);
+ SDLoc StDL(N);
+ // If we are a 64-bit capable x86, lower to a single movq load/store pair.
+ // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
+ // pair instead.
+ if (Subtarget->is64Bit() || F64IsLegal) {
+ MVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64;
+ SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
+ Ld->getPointerInfo(), Ld->isVolatile(),
+ Ld->isNonTemporal(), Ld->isInvariant(),
+ Ld->getAlignment());
+ SDValue NewChain = NewLd.getValue(1);
+ if (TokenFactorIndex != -1) {
+ Ops.push_back(NewChain);
+ NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
+ }
+ return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
+ St->getPointerInfo(),
+ St->isVolatile(), St->isNonTemporal(),
+ St->getAlignment());
+ }
+
+ // Otherwise, lower to two pairs of 32-bit loads / stores.
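+    // The 64-bit value is handled as two 32-bit halves, loaded and stored
+    // 4 bytes apart.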
+ SDValue LoAddr = Ld->getBasePtr();
+ SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr,
+ DAG.getConstant(4, LdDL, MVT::i32));
+
+ SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
+ Ld->getPointerInfo(),
+ Ld->isVolatile(), Ld->isNonTemporal(),
+ Ld->isInvariant(), Ld->getAlignment());
+ SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
+ Ld->getPointerInfo().getWithOffset(4),
+ Ld->isVolatile(), Ld->isNonTemporal(),
+ Ld->isInvariant(),
+ MinAlign(Ld->getAlignment(), 4));
+
+ SDValue NewChain = LoLd.getValue(1);
+ if (TokenFactorIndex != -1) {
+ Ops.push_back(LoLd);
+ Ops.push_back(HiLd);
+ NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
+ }
+
+ LoAddr = St->getBasePtr();
+ HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr,
+ DAG.getConstant(4, StDL, MVT::i32));
+
+ SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr,
+ St->getPointerInfo(),
+ St->isVolatile(), St->isNonTemporal(),
+ St->getAlignment());
+ SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr,
+ St->getPointerInfo().getWithOffset(4),
+ St->isVolatile(),
+ St->isNonTemporal(),
+ MinAlign(St->getAlignment(), 4));
+ return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
+ }
+
+ // This is similar to the above case, but here we handle a scalar 64-bit
+ // integer store that is extracted from a vector on a 32-bit target.
+ // If we have SSE2, then we can treat it like a floating-point double
+ // to get past legalization. The execution dependencies fixup pass will
+ // choose the optimal machine instruction for the store if this really is
+ // an integer or v2f32 rather than an f64.
+ if (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit() &&
+ St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+ SDValue OldExtract = St->getOperand(1);
+ SDValue ExtOp0 = OldExtract.getOperand(0);
+ unsigned VecSize = ExtOp0.getValueSizeInBits();
+ EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
+ SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
+ SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
+ BitCast, OldExtract.getOperand(1));
+ return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
+ St->getPointerInfo(), St->isVolatile(),
+ St->isNonTemporal(), St->getAlignment());
+ }
+
+ return SDValue();
+}
+
+/// Return 'true' if this vector operation is "horizontal"
+/// and return the operands for the horizontal operation in LHS and RHS. A
+/// horizontal operation performs the binary operation on successive elements
+/// of its first operand, then on successive elements of its second operand,
+/// returning the resulting values in a vector. For example, if
+/// A = < float a0, float a1, float a2, float a3 >
+/// and
+/// B = < float b0, float b1, float b2, float b3 >
+/// then the result of doing a horizontal operation on A and B is
+/// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
+/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
+/// A horizontal-op B, for some already available A and B, and if so then LHS is
+/// set to A, RHS to B, and the routine returns 'true'.
+/// Note that the binary operation should have the property that if one of the
+/// operands is UNDEF then the result is UNDEF.
+static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
+ // Look for the following pattern: if
+ // A = < float a0, float a1, float a2, float a3 >
+ // B = < float b0, float b1, float b2, float b3 >
+ // and
+ // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
+ // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
+ // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
+ // which is A horizontal-op B.
+
+ // At least one of the operands should be a vector shuffle.
+ if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
+ RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
+ return false;
+
+ MVT VT = LHS.getSimpleValueType();
+
+ assert((VT.is128BitVector() || VT.is256BitVector()) &&
+ "Unsupported vector type for horizontal add/sub");
+
+  // Handle 128- and 256-bit vector lengths. AVX defines horizontal add/sub to
+  // operate independently on 128-bit lanes.
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned NumLanes = VT.getSizeInBits()/128;
+ unsigned NumLaneElts = NumElts / NumLanes;
+ assert((NumLaneElts % 2 == 0) &&
+ "Vector type should have an even number of elements in each lane");
+ unsigned HalfLaneElts = NumLaneElts/2;
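+  // E.g. for v8f32 on AVX: NumLanes = 2, NumLaneElts = 4, HalfLaneElts = 2.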
+
+ // View LHS in the form
+ // LHS = VECTOR_SHUFFLE A, B, LMask
+ // If LHS is not a shuffle then pretend it is the shuffle
+ // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
+ // NOTE: in what follows a default initialized SDValue represents an UNDEF of
+ // type VT.
+ SDValue A, B;
+ SmallVector<int, 16> LMask(NumElts);
+ if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
+ if (LHS.getOperand(0).getOpcode() != ISD::UNDEF)
+ A = LHS.getOperand(0);
+ if (LHS.getOperand(1).getOpcode() != ISD::UNDEF)
+ B = LHS.getOperand(1);
+ ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
+ std::copy(Mask.begin(), Mask.end(), LMask.begin());
+ } else {
+ if (LHS.getOpcode() != ISD::UNDEF)
+ A = LHS;
+ for (unsigned i = 0; i != NumElts; ++i)
+ LMask[i] = i;
+ }
+
+ // Likewise, view RHS in the form
+ // RHS = VECTOR_SHUFFLE C, D, RMask
+ SDValue C, D;
+ SmallVector<int, 16> RMask(NumElts);
+ if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
+ if (RHS.getOperand(0).getOpcode() != ISD::UNDEF)
+ C = RHS.getOperand(0);
+ if (RHS.getOperand(1).getOpcode() != ISD::UNDEF)
+ D = RHS.getOperand(1);
+ ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
+ std::copy(Mask.begin(), Mask.end(), RMask.begin());
+ } else {
+ if (RHS.getOpcode() != ISD::UNDEF)
+ C = RHS;
+ for (unsigned i = 0; i != NumElts; ++i)
+ RMask[i] = i;
+ }
+
+ // Check that the shuffles are both shuffling the same vectors.
+ if (!(A == C && B == D) && !(A == D && B == C))
+ return false;
+
+ // If everything is UNDEF then bail out: it would be better to fold to UNDEF.
+ if (!A.getNode() && !B.getNode())
+ return false;
+
+ // If A and B occur in reverse order in RHS, then "swap" them (which means
+ // rewriting the mask).
+ if (A != C)
+ ShuffleVectorSDNode::commuteMask(RMask);
+
+ // At this point LHS and RHS are equivalent to
+ // LHS = VECTOR_SHUFFLE A, B, LMask
+ // RHS = VECTOR_SHUFFLE A, B, RMask
+ // Check that the masks correspond to performing a horizontal operation.
+ for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+ for (unsigned i = 0; i != NumLaneElts; ++i) {
+ int LIdx = LMask[i+l], RIdx = RMask[i+l];
+
+ // Ignore any UNDEF components.
+ if (LIdx < 0 || RIdx < 0 ||
+ (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
+ (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
+ continue;
+
+ // Check that successive elements are being operated on. If not, this is
+ // not a horizontal operation.
+ unsigned Src = (i/HalfLaneElts); // each lane is split between srcs
+ int Index = 2*(i%HalfLaneElts) + NumElts*Src + l;
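+      // E.g. in lane 0 of v4f32: i=0 expects <a0,a1> (Index 0), i=1 expects
+      // <a2,a3>, i=2 expects <b0,b1> (Index = NumElts), i=3 expects <b2,b3>.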
+ if (!(LIdx == Index && RIdx == Index + 1) &&
+ !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
+ return false;
+ }
+ }
+
+ LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
+ RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
+ return true;
+}
+
+/// Do target-specific dag combines on floating point adds.
+static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ EVT VT = N->getValueType(0);
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+
+ // Try to synthesize horizontal adds from adds of shuffles.
+ if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
+ (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
+ isHorizontalBinOp(LHS, RHS, true))
+ return DAG.getNode(X86ISD::FHADD, SDLoc(N), VT, LHS, RHS);
+ return SDValue();
+}
+
+/// Do target-specific dag combines on floating point subs.
+static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ EVT VT = N->getValueType(0);
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+
+ // Try to synthesize horizontal subs from subs of shuffles.
+ if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
+ (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
+ isHorizontalBinOp(LHS, RHS, false))
+ return DAG.getNode(X86ISD::FHSUB, SDLoc(N), VT, LHS, RHS);
+ return SDValue();
+}
+
+/// Truncate a group of v4i32/v2i64 into v16i8/v8i16 using X86ISD::PACKUS.
+static SDValue
+combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG,
+ SmallVector<SDValue, 8> &Regs) {
+ assert(Regs.size() > 0 && (Regs[0].getValueType() == MVT::v4i32 ||
+ Regs[0].getValueType() == MVT::v2i64));
+ EVT OutVT = N->getValueType(0);
+ EVT OutSVT = OutVT.getVectorElementType();
+ EVT InVT = Regs[0].getValueType();
+ EVT InSVT = InVT.getVectorElementType();
+ SDLoc DL(N);
+
+ // First, use mask to unset all bits that won't appear in the result.
+ assert((OutSVT == MVT::i8 || OutSVT == MVT::i16) &&
+ "OutSVT can only be either i8 or i16.");
+ SDValue MaskVal =
+ DAG.getConstant(OutSVT == MVT::i8 ? 0xFF : 0xFFFF, DL, InSVT);
+ SDValue MaskVec = DAG.getNode(
+ ISD::BUILD_VECTOR, DL, InVT,
+ SmallVector<SDValue, 8>(InVT.getVectorNumElements(), MaskVal));
+ for (auto &Reg : Regs)
+ Reg = DAG.getNode(ISD::AND, DL, InVT, MaskVec, Reg);
+
+ MVT UnpackedVT, PackedVT;
+ if (OutSVT == MVT::i8) {
+ UnpackedVT = MVT::v8i16;
+ PackedVT = MVT::v16i8;
+ } else {
+ UnpackedVT = MVT::v4i32;
+ PackedVT = MVT::v8i16;
+ }
+
+  // In each iteration, halve the element size: bitcast the registers to the
+  // unpacked type and PACKUS adjacent pairs, halving the register count.
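+  // E.g. truncating 4 x v4i32 down to v16i8 takes two rounds: the first
+  // PACKUS round leaves two v16i8 registers, the second leaves one.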
+ auto RegNum = Regs.size();
+ for (unsigned j = 1, e = InSVT.getSizeInBits() / OutSVT.getSizeInBits();
+ j < e; j *= 2, RegNum /= 2) {
+ for (unsigned i = 0; i < RegNum; i++)
+ Regs[i] = DAG.getNode(ISD::BITCAST, DL, UnpackedVT, Regs[i]);
+ for (unsigned i = 0; i < RegNum / 2; i++)
+ Regs[i] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[i * 2],
+ Regs[i * 2 + 1]);
+ }
+
+  // If the type of the result is v8i8, we need to do one more X86ISD::PACKUS,
+  // and then extract a subvector as the result, since v8i8 is not a legal type.
+ if (OutVT == MVT::v8i8) {
+ Regs[0] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[0], Regs[0]);
+ Regs[0] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT, Regs[0],
+ DAG.getIntPtrConstant(0, DL));
+ return Regs[0];
+ } else if (RegNum > 1) {
+ Regs.resize(RegNum);
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
+ } else
+ return Regs[0];
+}
+
+/// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
+static SDValue
+combineVectorTruncationWithPACKSS(SDNode *N, SelectionDAG &DAG,
+ SmallVector<SDValue, 8> &Regs) {
+ assert(Regs.size() > 0 && Regs[0].getValueType() == MVT::v4i32);
+ EVT OutVT = N->getValueType(0);
+ SDLoc DL(N);
+
+ // Shift left by 16 bits, then arithmetic-shift right by 16 bits.
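+  // This makes the upper 16 bits of each 32-bit element a copy of bit 15,
+  // so the PACKSS below reproduces the low 16 bits exactly, without
+  // saturating.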
+ SDValue ShAmt = DAG.getConstant(16, DL, MVT::i32);
+ for (auto &Reg : Regs) {
+ Reg = getTargetVShiftNode(X86ISD::VSHLI, DL, MVT::v4i32, Reg, ShAmt, DAG);
+ Reg = getTargetVShiftNode(X86ISD::VSRAI, DL, MVT::v4i32, Reg, ShAmt, DAG);
+ }
+
+ for (unsigned i = 0, e = Regs.size() / 2; i < e; i++)
+ Regs[i] = DAG.getNode(X86ISD::PACKSS, DL, MVT::v8i16, Regs[i * 2],
+ Regs[i * 2 + 1]);
+
+ if (Regs.size() > 2) {
+ Regs.resize(Regs.size() / 2);
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
+ } else
+ return Regs[0];
+}
+
+/// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
+/// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
+/// legalization the truncation will be translated into a BUILD_VECTOR whose
+/// elements are individually extracted and truncated, and it is difficult to
+/// perform this optimization on that form.
+static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ EVT OutVT = N->getValueType(0);
+ if (!OutVT.isVector())
+ return SDValue();
+
+ SDValue In = N->getOperand(0);
+ if (!In.getValueType().isSimple())
+ return SDValue();
+
+ EVT InVT = In.getValueType();
+ unsigned NumElems = OutVT.getVectorNumElements();
+
+ // TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on
+ // SSE2, and we need to take care of it specially.
+ // AVX512 provides vpmovdb.
+ if (!Subtarget->hasSSE2() || Subtarget->hasAVX2())
+ return SDValue();
+
+ EVT OutSVT = OutVT.getVectorElementType();
+ EVT InSVT = InVT.getVectorElementType();
+ if (!((InSVT == MVT::i32 || InSVT == MVT::i64) &&
+ (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
+ NumElems >= 8))
+ return SDValue();
+
+  // SSSE3's pshufb results in fewer instructions in the cases below.
+ if (Subtarget->hasSSSE3() && NumElems == 8 &&
+ ((OutSVT == MVT::i8 && InSVT != MVT::i64) ||
+ (InSVT == MVT::i32 && OutSVT == MVT::i16)))
+ return SDValue();
+
+ SDLoc DL(N);
+
+ // Split a long vector into vectors of legal type.
+ unsigned RegNum = InVT.getSizeInBits() / 128;
+ SmallVector<SDValue, 8> SubVec(RegNum);
+ if (InSVT == MVT::i32) {
+ for (unsigned i = 0; i < RegNum; i++)
+ SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
+ DAG.getIntPtrConstant(i * 4, DL));
+ } else {
+ for (unsigned i = 0; i < RegNum; i++)
+ SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
+ DAG.getIntPtrConstant(i * 2, DL));
+ }
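+  // E.g. a v16i32 input is split into four v4i32 subvectors here.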
+
+  // SSE2 provides PACKUS only for 2 x v8i16 -> v16i8, and SSE4.1 provides
+  // PACKUS for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS
+  // to truncate 2 x v4i32 to v8i16.
+ if (Subtarget->hasSSE41() || OutSVT == MVT::i8)
+ return combineVectorTruncationWithPACKUS(N, DAG, SubVec);
+ else if (InSVT == MVT::i32)
+ return combineVectorTruncationWithPACKSS(N, DAG, SubVec);
+ else
+ return SDValue();
+}
+
+static SDValue PerformTRUNCATECombine(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ // Try to detect AVG pattern first.
+ SDValue Avg = detectAVGPattern(N->getOperand(0), N->getValueType(0), DAG,
+ Subtarget, SDLoc(N));
+ if (Avg.getNode())
+ return Avg;
+
+ return combineVectorTruncation(N, DAG, Subtarget);
+}
+
+/// Do target-specific dag combines on floating point negations.
+static SDValue PerformFNEGCombine(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ EVT VT = N->getValueType(0);
+ EVT SVT = VT.getScalarType();
+ SDValue Arg = N->getOperand(0);
+ SDLoc DL(N);
+
+ // Let legalize expand this if it isn't a legal type yet.
+ if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
+ return SDValue();
+
+ // If we're negating a FMUL node on a target with FMA, then we can avoid the
+ // use of a constant by performing (-0 - A*B) instead.
+ // FIXME: Check rounding control flags as well once it becomes available.
+ if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
+ Arg->getFlags()->hasNoSignedZeros() && Subtarget->hasAnyFMA()) {
+ SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
+ return DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
+ Arg.getOperand(1), Zero);
+ }
+
+ // If we're negating a FMA node, then we can adjust the
+ // instruction to include the extra negation.
+ if (Arg.hasOneUse()) {
+ switch (Arg.getOpcode()) {
+ case X86ISD::FMADD:
+ return DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
+ Arg.getOperand(1), Arg.getOperand(2));
+ case X86ISD::FMSUB:
+ return DAG.getNode(X86ISD::FNMADD, DL, VT, Arg.getOperand(0),
+ Arg.getOperand(1), Arg.getOperand(2));
+ case X86ISD::FNMADD:
+ return DAG.getNode(X86ISD::FMSUB, DL, VT, Arg.getOperand(0),
+ Arg.getOperand(1), Arg.getOperand(2));
+ case X86ISD::FNMSUB:
+ return DAG.getNode(X86ISD::FMADD, DL, VT, Arg.getOperand(0),
+ Arg.getOperand(1), Arg.getOperand(2));
+ }
+ }
+ return SDValue();
+}
+
+static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ EVT VT = N->getValueType(0);
+ if (VT.is512BitVector() && !Subtarget->hasDQI()) {
+    // 512-bit VXORPS, VORPS, VANDPS and VANDNPS are supported only under the
+    // DQ extension. Without it, these logic operations may be executed in the
+    // integer domain instead.
+ SDLoc dl(N);
+ MVT IntScalar = MVT::getIntegerVT(VT.getScalarSizeInBits());
+ MVT IntVT = MVT::getVectorVT(IntScalar, VT.getVectorNumElements());
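+    // E.g. a v16f32 operation is performed as v16i32 here.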
+
+ SDValue Op0 = DAG.getNode(ISD::BITCAST, dl, IntVT, N->getOperand(0));
+ SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, IntVT, N->getOperand(1));
+ unsigned IntOpcode = 0;
+ switch (N->getOpcode()) {
+ default: llvm_unreachable("Unexpected FP logic op");
+ case X86ISD::FOR: IntOpcode = ISD::OR; break;
+ case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
+ case X86ISD::FAND: IntOpcode = ISD::AND; break;
+ case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
+ }
+ SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
+ return DAG.getNode(ISD::BITCAST, dl, VT, IntOp);
+ }
+ return SDValue();
+}
+
+/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
+static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
+
+ // F[X]OR(0.0, x) -> x
+ if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
+ if (C->getValueAPF().isPosZero())
+ return N->getOperand(1);
+
+ // F[X]OR(x, 0.0) -> x
+ if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
+ if (C->getValueAPF().isPosZero())
+ return N->getOperand(0);
+
+ return lowerX86FPLogicOp(N, DAG, Subtarget);
+}
+
+/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
+static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) {
+ assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
+
+ // Only perform optimizations if UnsafeMath is used.
+ if (!DAG.getTarget().Options.UnsafeFPMath)
+ return SDValue();
+
+  // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
+  // into FMAXC and FMINC, which are commutative operations.
+ unsigned NewOp = 0;
+ switch (N->getOpcode()) {
+ default: llvm_unreachable("unknown opcode");
+ case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
+ case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
+ }
+
+ return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
+ N->getOperand(0), N->getOperand(1));
+}
+
+static SDValue performFMinNumFMaxNumCombine(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ if (Subtarget->useSoftFloat())
+ return SDValue();
+
+ // TODO: Check for global or instruction-level "nnan". In that case, we
+ // should be able to lower to FMAX/FMIN alone.
+ // TODO: If an operand is already known to be a NaN or not a NaN, this
+ // should be an optional swap and FMAX/FMIN.
+
+ EVT VT = N->getValueType(0);
+ if (!((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
+ (Subtarget->hasSSE2() && (VT == MVT::f64 || VT == MVT::v2f64)) ||
+ (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))))
+ return SDValue();
+
+ // This takes at least 3 instructions, so favor a library call when operating
+ // on a scalar and minimizing code size.
+ if (!VT.isVector() && DAG.getMachineFunction().getFunction()->optForMinSize())
+ return SDValue();
+
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
+ SDLoc DL(N);
+ EVT SetCCType = DAG.getTargetLoweringInfo().getSetCCResultType(
+ DAG.getDataLayout(), *DAG.getContext(), VT);
+
+ // There are 4 possibilities involving NaN inputs, and these are the required
+ // outputs:
+ // Op1
+ // Num NaN
+ // ----------------
+ // Num | Max | Op0 |
+ // Op0 ----------------
+ // NaN | Op1 | NaN |
+ // ----------------
+ //
+ // The SSE FP max/min instructions were not designed for this case, but rather
+ // to implement:
+ // Min = Op1 < Op0 ? Op1 : Op0
+ // Max = Op1 > Op0 ? Op1 : Op0
+ //
+ // So they always return Op0 if either input is a NaN. However, we can still
+ // use those instructions for fmaxnum by selecting away a NaN input.
+
+ // If either operand is NaN, the 2nd source operand (Op0) is passed through.
+ auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
+ SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
+  SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);
+
+ // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
+ // are NaN, the NaN value of Op1 is the result.
+ auto SelectOpcode = VT.isVector() ? ISD::VSELECT : ISD::SELECT;
+ return DAG.getNode(SelectOpcode, DL, VT, IsOp0Nan, Op1, MinOrMax);
+}
+
+/// Do target-specific dag combines on X86ISD::FAND nodes.
+static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ // FAND(0.0, x) -> 0.0
+ if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
+ if (C->getValueAPF().isPosZero())
+ return N->getOperand(0);
+
+ // FAND(x, 0.0) -> 0.0
+ if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
+ if (C->getValueAPF().isPosZero())
+ return N->getOperand(1);
+
+ return lowerX86FPLogicOp(N, DAG, Subtarget);
+}
+
+/// Do target-specific dag combines on X86ISD::FANDN nodes
+static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ // FANDN(0.0, x) -> x
+ if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
+ if (C->getValueAPF().isPosZero())
+ return N->getOperand(1);
+
+ // FANDN(x, 0.0) -> 0.0
+ if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
+ if (C->getValueAPF().isPosZero())
+ return N->getOperand(1);
+
+ return lowerX86FPLogicOp(N, DAG, Subtarget);
+}
+
+static SDValue PerformBTCombine(SDNode *N,
+ SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ // BT ignores high bits in the bit index operand.
+ SDValue Op1 = N->getOperand(1);
+ if (Op1.hasOneUse()) {
+ unsigned BitWidth = Op1.getValueSizeInBits();
+ APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
+ APInt KnownZero, KnownOne;
+ TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
+ !DCI.isBeforeLegalizeOps());
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) ||
+ TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO))
+ DCI.CommitTargetLoweringOpt(TLO);
+ }
+ return SDValue();
+}
+
+static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) {
+ SDValue Op = N->getOperand(0);
+ if (Op.getOpcode() == ISD::BITCAST)
+ Op = Op.getOperand(0);
+ EVT VT = N->getValueType(0), OpVT = Op.getValueType();
+ if (Op.getOpcode() == X86ISD::VZEXT_LOAD &&
+ VT.getVectorElementType().getSizeInBits() ==
+ OpVT.getVectorElementType().getSizeInBits()) {
+ return DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
+ }
+ return SDValue();
+}
+
+static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ EVT VT = N->getValueType(0);
+ if (!VT.isVector())
+ return SDValue();
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
+ SDLoc dl(N);
+
+  // SIGN_EXTEND_INREG to v4i64 is an expensive operation on both SSE and
+  // AVX2, since there is no sign-extending shift right operation on a
+  // vector with 64-bit elements:
+  // (sext_in_reg (v4i64 anyext (v4i32 x)), ExtraVT) ->
+  // (v4i64 sext (v4i32 sext_in_reg (v4i32 x, ExtraVT)))
+ if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
+ N0.getOpcode() == ISD::SIGN_EXTEND)) {
+ SDValue N00 = N0.getOperand(0);
+
+    // An EXTLOAD has a better lowering on AVX2:
+    // it may be replaced with an X86ISD::VSEXT node.
+ if (N00.getOpcode() == ISD::LOAD && Subtarget->hasInt256())
+ if (!ISD::isNormalLoad(N00.getNode()))
+ return SDValue();
+
+ if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
+ SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
+ N00, N1);
+ return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
+ }
+ }
+ return SDValue();
+}
+
+/// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
+/// Promoting a sign extension ahead of an 'add nsw' exposes opportunities
+/// to combine math ops, use an LEA, or use a complex addressing mode. This can
+/// eliminate extend, add, and shift instructions.
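+/// For example: (i64 sext (i32 add nsw X, 5)) --> (i64 add nsw (i64 sext X), 5).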
+static SDValue promoteSextBeforeAddNSW(SDNode *Sext, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ // TODO: This should be valid for other integer types.
+ EVT VT = Sext->getValueType(0);
+ if (VT != MVT::i64)
+ return SDValue();
+
+ // We need an 'add nsw' feeding into the 'sext'.
+ SDValue Add = Sext->getOperand(0);
+ if (Add.getOpcode() != ISD::ADD || !Add->getFlags()->hasNoSignedWrap())
+ return SDValue();
+
+ // Having a constant operand to the 'add' ensures that we are not increasing
+ // the instruction count because the constant is extended for free below.
+ // A constant operand can also become the displacement field of an LEA.
+ auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
+ if (!AddOp1)
+ return SDValue();
+
+ // Don't make the 'add' bigger if there's no hope of combining it with some
+ // other 'add' or 'shl' instruction.
+ // TODO: It may be profitable to generate simpler LEA instructions in place
+ // of single 'add' instructions, but the cost model for selecting an LEA
+ // currently has a high threshold.
+ bool HasLEAPotential = false;
+ for (auto *User : Sext->uses()) {
+ if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
+ HasLEAPotential = true;
+ break;
+ }
+ }
+ if (!HasLEAPotential)
+ return SDValue();
+
+ // Everything looks good, so pull the 'sext' ahead of the 'add'.
+ int64_t AddConstant = AddOp1->getSExtValue();
+ SDValue AddOp0 = Add.getOperand(0);
+ SDValue NewSext = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(Sext), VT, AddOp0);
+ SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);
+
+ // The wider add is guaranteed to not wrap because both operands are
+ // sign-extended.
+ SDNodeFlags Flags;
+ Flags.setNoSignedWrap(true);
+ return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewSext, NewConstant, &Flags);
+}
+
+static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget *Subtarget) {
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+ EVT SVT = VT.getScalarType();
+ EVT InVT = N0.getValueType();
+ EVT InSVT = InVT.getScalarType();
+ SDLoc DL(N);
+
+  // (i8,i32 sext (sdivrem (i8 x, i8 y))) ->
+  // (i8,i32 sdivrem_sext_hreg (i8 x, i8 y))
+  // This exposes the sext to the sdivrem lowering, so that it directly extends
+  // from AH (which we otherwise need to do contortions to access).
+ if (N0.getOpcode() == ISD::SDIVREM && N0.getResNo() == 1 &&
+ InVT == MVT::i8 && VT == MVT::i32) {
+ SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
+ SDValue R = DAG.getNode(X86ISD::SDIVREM8_SEXT_HREG, DL, NodeTys,
+ N0.getOperand(0), N0.getOperand(1));
+ DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
+ return R.getValue(1);
+ }
+
+ if (!DCI.isBeforeLegalizeOps()) {
+ if (InVT == MVT::i1) {
+ SDValue Zero = DAG.getConstant(0, DL, VT);
+ SDValue AllOnes =
+ DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), DL, VT);
+ return DAG.getNode(ISD::SELECT, DL, VT, N0, AllOnes, Zero);
+ }
+ return SDValue();
+ }
+
+ if (VT.isVector() && Subtarget->hasSSE2()) {
+ auto ExtendVecSize = [&DAG](SDLoc DL, SDValue N, unsigned Size) {
+ EVT InVT = N.getValueType();
+ EVT OutVT = EVT::getVectorVT(*DAG.getContext(), InVT.getScalarType(),
+ Size / InVT.getScalarSizeInBits());
+ SmallVector<SDValue, 8> Opnds(Size / InVT.getSizeInBits(),
+ DAG.getUNDEF(InVT));
+ Opnds[0] = N;
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Opnds);
+ };
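+    // E.g. ExtendVecSize(DL, v2i32 N, 128) yields (v4i32 concat_vectors N,
+    // undef).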
+
+ // If target-size is less than 128-bits, extend to a type that would extend
+ // to 128 bits, extend that and extract the original target vector.
+ if (VT.getSizeInBits() < 128 && !(128 % VT.getSizeInBits()) &&
+ (SVT == MVT::i64 || SVT == MVT::i32 || SVT == MVT::i16) &&
+ (InSVT == MVT::i32 || InSVT == MVT::i16 || InSVT == MVT::i8)) {
+ unsigned Scale = 128 / VT.getSizeInBits();
+ EVT ExVT =
+ EVT::getVectorVT(*DAG.getContext(), SVT, 128 / SVT.getSizeInBits());
+ SDValue Ex = ExtendVecSize(DL, N0, Scale * InVT.getSizeInBits());
+ SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, ExVT, Ex);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SExt,
+ DAG.getIntPtrConstant(0, DL));
+ }
+
+ // If target-size is 128-bits, then convert to ISD::SIGN_EXTEND_VECTOR_INREG
+ // which ensures lowering to X86ISD::VSEXT (pmovsx*).
+ if (VT.getSizeInBits() == 128 &&
+ (SVT == MVT::i64 || SVT == MVT::i32 || SVT == MVT::i16) &&
+ (InSVT == MVT::i32 || InSVT == MVT::i16 || InSVT == MVT::i8)) {
+ SDValue ExOp = ExtendVecSize(DL, N0, 128);
+ return DAG.getSignExtendVectorInReg(ExOp, DL, VT);
+ }
+
+ // On pre-AVX2 targets, split into 128-bit nodes of
+ // ISD::SIGN_EXTEND_VECTOR_INREG.
+ if (!Subtarget->hasInt256() && !(VT.getSizeInBits() % 128) &&
+ (SVT == MVT::i64 || SVT == MVT::i32 || SVT == MVT::i16) &&
+ (InSVT == MVT::i32 || InSVT == MVT::i16 || InSVT == MVT::i8)) {
+ unsigned NumVecs = VT.getSizeInBits() / 128;
+ unsigned NumSubElts = 128 / SVT.getSizeInBits();
+ EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts);
+ EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts);
+
+ SmallVector<SDValue, 8> Opnds;
+ for (unsigned i = 0, Offset = 0; i != NumVecs;
+ ++i, Offset += NumSubElts) {
+ SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0,
+ DAG.getIntPtrConstant(Offset, DL));
+ SrcVec = ExtendVecSize(DL, SrcVec, 128);
+ SrcVec = DAG.getSignExtendVectorInReg(SrcVec, DL, SubVT);
+ Opnds.push_back(SrcVec);
+ }
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds);
+ }
+ }
+
+ if (Subtarget->hasAVX() && VT.is256BitVector())
+ if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
+ return R;
+
+ if (SDValue NewAdd = promoteSextBeforeAddNSW(N, DAG, Subtarget))
+ return NewAdd;
+
+ return SDValue();
+}
+
+static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget* Subtarget) {
+ SDLoc dl(N);
+ EVT VT = N->getValueType(0);
+
+ // Let legalize expand this if it isn't a legal type yet.
+ if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
+ return SDValue();
+
+ EVT ScalarVT = VT.getScalarType();
+ if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget->hasAnyFMA())
+ return SDValue();
+
+ SDValue A = N->getOperand(0);
+ SDValue B = N->getOperand(1);
+ SDValue C = N->getOperand(2);
+
+ bool NegA = (A.getOpcode() == ISD::FNEG);
+ bool NegB = (B.getOpcode() == ISD::FNEG);
+ bool NegC = (C.getOpcode() == ISD::FNEG);
+
+  // The multiplication is negated when exactly one of A and B is negated,
+  // i.e. NegA xor NegB.
+ bool NegMul = (NegA != NegB);
+ if (NegA)
+ A = A.getOperand(0);
+ if (NegB)
+ B = B.getOperand(0);
+ if (NegC)
+ C = C.getOperand(0);
+
+ unsigned Opcode;
+ if (!NegMul)
+ Opcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB;
+ else
+ Opcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;
+
+ return DAG.getNode(Opcode, dl, VT, A, B, C);
+}
+
+static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget *Subtarget) {
+ // (i32 zext (and (i8 x86isd::setcc_carry), 1)) ->
+ // (and (i32 x86isd::setcc_carry), 1)
+ // This eliminates the zext. This transformation is necessary because
+ // ISD::SETCC is always legalized to i8.
+ SDLoc dl(N);
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+
+ if (N0.getOpcode() == ISD::AND &&
+ N0.hasOneUse() &&
+ N0.getOperand(0).hasOneUse()) {
+ SDValue N00 = N0.getOperand(0);
+ if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
+ if (!isOneConstant(N0.getOperand(1)))
+ return SDValue();
+ return DAG.getNode(ISD::AND, dl, VT,
+ DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
+ N00.getOperand(0), N00.getOperand(1)),
+ DAG.getConstant(1, dl, VT));
+ }
+ }
+
+ if (N0.getOpcode() == ISD::TRUNCATE &&
+ N0.hasOneUse() &&
+ N0.getOperand(0).hasOneUse()) {
+ SDValue N00 = N0.getOperand(0);
+ if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
+ return DAG.getNode(ISD::AND, dl, VT,
+ DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
+ N00.getOperand(0), N00.getOperand(1)),
+ DAG.getConstant(1, dl, VT));
+ }
+ }
+
+ if (VT.is256BitVector())
+ if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
+ return R;
+
+  // (i8,i32 zext (udivrem (i8 x, i8 y))) ->
+  // (i8,i32 udivrem_zext_hreg (i8 x, i8 y))
+  // This exposes the zext to the udivrem lowering, so that it directly extends
+  // from AH (which we otherwise need to do contortions to access).
+ if (N0.getOpcode() == ISD::UDIVREM &&
+ N0.getResNo() == 1 && N0.getValueType() == MVT::i8 &&
+ VT == MVT::i32) {
+ SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
+ SDValue R = DAG.getNode(X86ISD::UDIVREM8_ZEXT_HREG, dl, NodeTys,
+ N0.getOperand(0), N0.getOperand(1));
+ DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
+ return R.getValue(1);
+ }
+
+ return SDValue();
+}
+
+// Optimize x == -y --> x+y == 0
+// x != -y --> x+y != 0
+static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget* Subtarget) {
+ ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ EVT VT = N->getValueType(0);
+ SDLoc DL(N);
+
+ if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB)
+ if (isNullConstant(LHS.getOperand(0)) && LHS.hasOneUse()) {
+ SDValue addV = DAG.getNode(ISD::ADD, DL, LHS.getValueType(), RHS,
+ LHS.getOperand(1));
+ return DAG.getSetCC(DL, N->getValueType(0), addV,
+ DAG.getConstant(0, DL, addV.getValueType()), CC);
+ }
+ if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB)
+ if (isNullConstant(RHS.getOperand(0)) && RHS.hasOneUse()) {
+ SDValue addV = DAG.getNode(ISD::ADD, DL, RHS.getValueType(), LHS,
+ RHS.getOperand(1));
+ return DAG.getSetCC(DL, N->getValueType(0), addV,
+ DAG.getConstant(0, DL, addV.getValueType()), CC);
+ }
+
+ if (VT.getScalarType() == MVT::i1 &&
+ (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
+ bool IsSEXT0 =
+ (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
+ (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
+ bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
+
+ if (!IsSEXT0 || !IsVZero1) {
+ // Swap the operands and update the condition code.
+ std::swap(LHS, RHS);
+ CC = ISD::getSetCCSwappedOperands(CC);
+
+ IsSEXT0 = (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
+ (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
+ IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
+ }
+
+ if (IsSEXT0 && IsVZero1) {
+      assert(VT == LHS.getOperand(0).getValueType() &&
+             "Unexpected operand type");
+ if (CC == ISD::SETGT)
+ return DAG.getConstant(0, DL, VT);
+ if (CC == ISD::SETLE)
+ return DAG.getConstant(1, DL, VT);
+ if (CC == ISD::SETEQ || CC == ISD::SETGE)
+ return DAG.getNOT(DL, LHS.getOperand(0), VT);
+
+ assert((CC == ISD::SETNE || CC == ISD::SETLT) &&
+ "Unexpected condition code!");
+ return LHS.getOperand(0);
+ }
+ }
+
+ return SDValue();
+}
+
+static SDValue PerformGatherScatterCombine(SDNode *N, SelectionDAG &DAG) {
+ SDLoc DL(N);
+ // Gather and Scatter instructions use k-registers for masks. The type of
+ // the masks is v*i1. So the mask will be truncated anyway.
+  // The SIGN_EXTEND_INREG may be dropped.
+ SDValue Mask = N->getOperand(2);
+ if (Mask.getOpcode() == ISD::SIGN_EXTEND_INREG) {
+ SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
+ NewOps[2] = Mask.getOperand(0);
+ DAG.UpdateNodeOperands(N, NewOps);
+ }
+ return SDValue();
+}
+
+// Helper function of PerformSETCCCombine. It materializes "setb reg"
+// as "sbb reg,reg", since it can be extended without a zext and produces
+// an all-ones bit which is more useful than 0/1 in some cases.
+static SDValue MaterializeSETB(SDLoc DL, SDValue EFLAGS, SelectionDAG &DAG,
+ MVT VT) {
+ if (VT == MVT::i8)
+ return DAG.getNode(ISD::AND, DL, VT,
+ DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
+ DAG.getConstant(X86::COND_B, DL, MVT::i8),
+ EFLAGS),
+ DAG.getConstant(1, DL, VT));
+  assert(VT == MVT::i1 && "Unexpected type for SETCC node");
+ return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1,
+ DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
+ DAG.getConstant(X86::COND_B, DL, MVT::i8),
+ EFLAGS));
+}
+
+// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
+static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget *Subtarget) {
+ SDLoc DL(N);
+ X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
+ SDValue EFLAGS = N->getOperand(1);
+
+ if (CC == X86::COND_A) {
+ // Try to convert COND_A into COND_B in an attempt to facilitate
+ // materializing "setb reg".
+ //
+ // Do not flip "e > c", where "c" is a constant, because Cmp instruction
+ // cannot take an immediate as its first operand.
+ //
+ if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
+ EFLAGS.getValueType().isInteger() &&
+ !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
+ SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
+ EFLAGS.getNode()->getVTList(),
+ EFLAGS.getOperand(1), EFLAGS.getOperand(0));
+ SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
+ return MaterializeSETB(DL, NewEFLAGS, DAG, N->getSimpleValueType(0));
+ }
+ }
+
+ // Materialize "setb reg" as "sbb reg,reg", since it can be extended without
+ // a zext and produces an all-ones bit which is more useful than 0/1 in some
+ // cases.
+ if (CC == X86::COND_B)
+ return MaterializeSETB(DL, EFLAGS, DAG, N->getSimpleValueType(0));
+
+ if (SDValue Flags = checkBoolTestSetCCCombine(EFLAGS, CC)) {
+ SDValue Cond = DAG.getConstant(CC, DL, MVT::i8);
+ return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags);
+ }
+
+ return SDValue();
+}
+
+// Optimize branch condition evaluation.
+//
+static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget *Subtarget) {
+ SDLoc DL(N);
+ SDValue Chain = N->getOperand(0);
+ SDValue Dest = N->getOperand(1);
+ SDValue EFLAGS = N->getOperand(3);
+ X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
+
+ if (SDValue Flags = checkBoolTestSetCCCombine(EFLAGS, CC)) {
+ SDValue Cond = DAG.getConstant(CC, DL, MVT::i8);
+ return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), Chain, Dest, Cond,
+ Flags);
+ }
+
+ return SDValue();
+}
+
+static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
+ SelectionDAG &DAG) {
+ // Take advantage of vector comparisons producing 0 or -1 in each lane to
+ // optimize away operation when it's from a constant.
+ //
+ // The general transformation is:
+ // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
+ // AND(VECTOR_CMP(x,y), constant2)
+ // constant2 = UNARYOP(constant)
+
+ // Early exit if this isn't a vector operation, the operand of the
+ // unary operation isn't a bitwise AND, or if the sizes of the operations
+ // aren't the same.
+ EVT VT = N->getValueType(0);
+ if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
+ N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
+ VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
+ return SDValue();
+
+ // Now check that the other operand of the AND is a constant. We could
+ // make the transformation for non-constant splats as well, but it's unclear
+ // that would be a benefit as it would not eliminate any operations, just
+ // perform one more step in scalar code before moving to the vector unit.
+ if (BuildVectorSDNode *BV =
+ dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
+ // Bail out if the vector isn't a constant.
+ if (!BV->isConstant())
+ return SDValue();
+
+ // Everything checks out. Build up the new and improved node.
+ SDLoc DL(N);
+ EVT IntVT = BV->getValueType(0);
+ // Create a new constant of the appropriate type for the transformed
+ // DAG.
+ SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
+ // The AND node needs bitcasts to/from an integer vector type around it.
+ SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
+ SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
+ N->getOperand(0)->getOperand(0), MaskConst);
+ SDValue Res = DAG.getBitcast(VT, NewAnd);
+ return Res;
+ }
+
+ return SDValue();
+}
+
+static SDValue PerformUINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ SDValue Op0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+ EVT InVT = Op0.getValueType();
+ EVT InSVT = InVT.getScalarType();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+ // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
+ // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
+ if (InVT.isVector() && (InSVT == MVT::i8 || InSVT == MVT::i16)) {
+ SDLoc dl(N);
+ EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
+ InVT.getVectorNumElements());
+ SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
+
+ if (TLI.isOperationLegal(ISD::UINT_TO_FP, DstVT))
+ return DAG.getNode(ISD::UINT_TO_FP, dl, VT, P);
+
+ return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
+ }
+
+ return SDValue();
+}
+
+static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ // First try to optimize away the conversion entirely when it's
+ // conditionally from a constant. Vectors only.
+ if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG))
+ return Res;
+
+ // Now move on to more general possibilities.
+ SDValue Op0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+ EVT InVT = Op0.getValueType();
+ EVT InSVT = InVT.getScalarType();
+
+ // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
+ // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
+ if (InVT.isVector() && (InSVT == MVT::i8 || InSVT == MVT::i16)) {
+ SDLoc dl(N);
+ EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
+ InVT.getVectorNumElements());
+ SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
+ return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
+ }
+
+ // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
+ // a 32-bit target where SSE doesn't support i64->FP operations.
+ if (!Subtarget->useSoftFloat() && Op0.getOpcode() == ISD::LOAD) {
+ LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
+ EVT LdVT = Ld->getValueType(0);
+
+ // This transformation is not supported if the result type is f16
+ if (VT == MVT::f16)
+ return SDValue();
+
+ if (!Ld->isVolatile() && !VT.isVector() &&
+ ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
+ !Subtarget->is64Bit() && LdVT == MVT::i64) {
+ SDValue FILDChain = Subtarget->getTargetLowering()->BuildFILD(
+ SDValue(N, 0), LdVT, Ld->getChain(), Op0, DAG);
+ DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
+ return FILDChain;
+ }
+ }
+ return SDValue();
+}
+
+// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
+static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG,
+ X86TargetLowering::DAGCombinerInfo &DCI) {
+ // If the LHS and RHS of the ADC node are zero, then it can't overflow and
+ // the result is either zero or one (depending on the input carry bit).
+ // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
+ if (X86::isZeroNode(N->getOperand(0)) &&
+ X86::isZeroNode(N->getOperand(1)) &&
+ // We don't have a good way to replace an EFLAGS use, so only do this when
+ // dead right now.
+ SDValue(N, 1).use_empty()) {
+ SDLoc DL(N);
+ EVT VT = N->getValueType(0);
+ SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
+ SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
+ DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
+ DAG.getConstant(X86::COND_B, DL,
+ MVT::i8),
+ N->getOperand(2)),
+ DAG.getConstant(1, DL, VT));
+ return DCI.CombineTo(N, Res1, CarryOut);
+ }
+
+ return SDValue();
+}
+
+// fold (add Y, (sete X, 0)) -> adc 0, Y
+// (add Y, (setne X, 0)) -> sbb -1, Y
+// (sub (sete X, 0), Y) -> sbb 0, Y
+// (sub (setne X, 0), Y) -> adc -1, Y
+static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) {
+ SDLoc DL(N);
+
+ // Look through ZExts.
+ SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 1 : 0);
+ if (Ext.getOpcode() != ISD::ZERO_EXTEND || !Ext.hasOneUse())
+ return SDValue();
+
+ SDValue SetCC = Ext.getOperand(0);
+ if (SetCC.getOpcode() != X86ISD::SETCC || !SetCC.hasOneUse())
+ return SDValue();
+
+ X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0);
+ if (CC != X86::COND_E && CC != X86::COND_NE)
+ return SDValue();
+
+ SDValue Cmp = SetCC.getOperand(1);
+ if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
+ !X86::isZeroNode(Cmp.getOperand(1)) ||
+ !Cmp.getOperand(0).getValueType().isInteger())
+ return SDValue();
+
+ SDValue CmpOp0 = Cmp.getOperand(0);
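+  // Compare against 1 instead of 0: X == 0 iff X u< 1, i.e. iff X - 1
+  // borrows, so the carry flag feeds the ADC/SBB nodes built below.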
+ SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0,
+ DAG.getConstant(1, DL, CmpOp0.getValueType()));
+
+ SDValue OtherVal = N->getOperand(N->getOpcode() == ISD::SUB ? 0 : 1);
+ if (CC == X86::COND_NE)
+ return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::ADC : X86ISD::SBB,
+ DL, OtherVal.getValueType(), OtherVal,
+ DAG.getConstant(-1ULL, DL, OtherVal.getValueType()),
+ NewCmp);
+ return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::SBB : X86ISD::ADC,
+ DL, OtherVal.getValueType(), OtherVal,
+ DAG.getConstant(0, DL, OtherVal.getValueType()), NewCmp);
+}
+
+/// PerformAddCombine - Do target-specific dag combines on integer adds.
+static SDValue PerformAddCombine(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ EVT VT = N->getValueType(0);
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
+
+ // Try to synthesize horizontal adds from adds of shuffles.
+ if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
+ (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
+ isHorizontalBinOp(Op0, Op1, true))
+ return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);
+
+ return OptimizeConditionalInDecrement(N, DAG);
+}
+
+static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
+
+ // X86 can't encode an immediate LHS of a sub. See if we can push the
+ // negation into a preceding instruction.
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
+ // If the RHS of the sub is a XOR with one use and a constant, invert the
+ // immediate. Then add one to the LHS of the sub so we can turn
+ // X-Y -> X+~Y+1, saving one register.
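+    // E.g. "5 - (A ^ 3)" becomes "(A ^ ~3) + 6".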
+ if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
+ isa<ConstantSDNode>(Op1.getOperand(1))) {
+ APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
+ EVT VT = Op0.getValueType();
+ SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
+ Op1.getOperand(0),
+ DAG.getConstant(~XorC, SDLoc(Op1), VT));
+ return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
+ DAG.getConstant(C->getAPIntValue() + 1, SDLoc(N), VT));
+ }
+ }
+
+  // Try to synthesize horizontal subs from subs of shuffles.
+ EVT VT = N->getValueType(0);
+ if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
+ (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
+ isHorizontalBinOp(Op0, Op1, true))
+ return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);
+
+ return OptimizeConditionalInDecrement(N, DAG);
+}
+
+/// performVZEXTCombine - Performs combines on X86ISD::VZEXT nodes.
+static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget *Subtarget) {
+ SDLoc DL(N);
+ MVT VT = N->getSimpleValueType(0);
+ SDValue Op = N->getOperand(0);
+ MVT OpVT = Op.getSimpleValueType();
+ MVT OpEltVT = OpVT.getVectorElementType();
+ unsigned InputBits = OpEltVT.getSizeInBits() * VT.getVectorNumElements();
+
+  // (vzext (bitcast (vzext x))) -> (vzext x)
+ SDValue V = Op;
+ while (V.getOpcode() == ISD::BITCAST)
+ V = V.getOperand(0);
+
+ if (V != Op && V.getOpcode() == X86ISD::VZEXT) {
+ MVT InnerVT = V.getSimpleValueType();
+ MVT InnerEltVT = InnerVT.getVectorElementType();
+
+ // If the element sizes match exactly, we can just do one larger vzext. This
+ // is always an exact type match as vzext operates on integer types.
+ if (OpEltVT == InnerEltVT) {
+ assert(OpVT == InnerVT && "Types must match for vzext!");
+ return DAG.getNode(X86ISD::VZEXT, DL, VT, V.getOperand(0));
+ }
+
+ // The only other way we can combine them is if only a single element of the
+ // inner vzext is used in the input to the outer vzext.
+ if (InnerEltVT.getSizeInBits() < InputBits)
+ return SDValue();
+
+ // In this case, the inner vzext is completely dead because we're going to
+ // only look at bits inside of the low element. Just do the outer vzext on
+ // a bitcast of the input to the inner.
+ return DAG.getNode(X86ISD::VZEXT, DL, VT, DAG.getBitcast(OpVT, V));
+ }
+
+ // Check if we can bypass extracting and re-inserting an element of an input
+ // vector. Essentially:
+ // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
+ if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
+ V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) {
+ SDValue ExtractedV = V.getOperand(0);
+ SDValue OrigV = ExtractedV.getOperand(0);
+ if (isNullConstant(ExtractedV.getOperand(1))) {
+ MVT OrigVT = OrigV.getSimpleValueType();
+ // Extract a subvector if necessary...
+ if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) {
+ int Ratio = OrigVT.getSizeInBits() / OpVT.getSizeInBits();
+ OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(),
+ OrigVT.getVectorNumElements() / Ratio);
+ OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV,
+ DAG.getIntPtrConstant(0, DL));
+ }
+ Op = DAG.getBitcast(OpVT, OrigV);
+ return DAG.getNode(X86ISD::VZEXT, DL, VT, Op);
+ }
+ }
+
+ return SDValue();
+}
+
+SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+ switch (N->getOpcode()) {
+ default: break;
+ case ISD::EXTRACT_VECTOR_ELT:
+ return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, DCI);
+ case ISD::VSELECT:
+ case ISD::SELECT:
+ case X86ISD::SHRUNKBLEND:
+ return PerformSELECTCombine(N, DAG, DCI, Subtarget);
+ case ISD::BITCAST: return PerformBITCASTCombine(N, DAG, Subtarget);
+ case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI, Subtarget);
+ case ISD::ADD: return PerformAddCombine(N, DAG, Subtarget);
+ case ISD::SUB: return PerformSubCombine(N, DAG, Subtarget);
+ case X86ISD::ADC: return PerformADCCombine(N, DAG, DCI);
+ case ISD::MUL: return PerformMulCombine(N, DAG, DCI);
+ case ISD::SHL:
+ case ISD::SRA:
+ case ISD::SRL: return PerformShiftCombine(N, DAG, DCI, Subtarget);
+ case ISD::AND: return PerformAndCombine(N, DAG, DCI, Subtarget);
+ case ISD::OR: return PerformOrCombine(N, DAG, DCI, Subtarget);
+ case ISD::XOR: return PerformXorCombine(N, DAG, DCI, Subtarget);
+ case ISD::LOAD: return PerformLOADCombine(N, DAG, DCI, Subtarget);
+ case ISD::MLOAD: return PerformMLOADCombine(N, DAG, DCI, Subtarget);
+ case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget);
+ case ISD::MSTORE: return PerformMSTORECombine(N, DAG, Subtarget);
+ case ISD::SINT_TO_FP: return PerformSINT_TO_FPCombine(N, DAG, Subtarget);
+ case ISD::UINT_TO_FP: return PerformUINT_TO_FPCombine(N, DAG, Subtarget);
+ case ISD::FADD: return PerformFADDCombine(N, DAG, Subtarget);
+ case ISD::FSUB: return PerformFSUBCombine(N, DAG, Subtarget);
+ case ISD::FNEG: return PerformFNEGCombine(N, DAG, Subtarget);
+ case ISD::TRUNCATE: return PerformTRUNCATECombine(N, DAG, Subtarget);
+ case X86ISD::FXOR:
+ case X86ISD::FOR: return PerformFORCombine(N, DAG, Subtarget);
+ case X86ISD::FMIN:
+ case X86ISD::FMAX: return PerformFMinFMaxCombine(N, DAG);
+ case ISD::FMINNUM:
+ case ISD::FMAXNUM: return performFMinNumFMaxNumCombine(N, DAG,
+ Subtarget);
+ case X86ISD::FAND: return PerformFANDCombine(N, DAG, Subtarget);
+ case X86ISD::FANDN: return PerformFANDNCombine(N, DAG, Subtarget);
+ case X86ISD::BT: return PerformBTCombine(N, DAG, DCI);
+ case X86ISD::VZEXT_MOVL: return PerformVZEXT_MOVLCombine(N, DAG);
+  // TODO: refactor the [SU]DIVREM8_[SZ]EXT_HREG code so that it's not
+  // duplicated.
+ case ISD::ANY_EXTEND:
+ case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG, DCI, Subtarget);
+ case ISD::SIGN_EXTEND: return PerformSExtCombine(N, DAG, DCI, Subtarget);
+ case ISD::SIGN_EXTEND_INREG:
+ return PerformSIGN_EXTEND_INREGCombine(N, DAG, Subtarget);
+ case ISD::SETCC: return PerformISDSETCCCombine(N, DAG, Subtarget);
+ case X86ISD::SETCC: return PerformSETCCCombine(N, DAG, DCI, Subtarget);
+ case X86ISD::BRCOND: return PerformBrCondCombine(N, DAG, DCI, Subtarget);
+ case X86ISD::VZEXT: return performVZEXTCombine(N, DAG, DCI, Subtarget);
+ case X86ISD::SHUFP: // Handle all target specific shuffles
+ case X86ISD::PALIGNR:
+ case X86ISD::BLENDI:
+ case X86ISD::UNPCKH:
+ case X86ISD::UNPCKL:
+ case X86ISD::MOVHLPS:
+ case X86ISD::MOVLHPS:
+ case X86ISD::PSHUFB:
+ case X86ISD::PSHUFD:
+ case X86ISD::PSHUFHW:
+ case X86ISD::PSHUFLW:
+ case X86ISD::MOVSS:
+ case X86ISD::MOVSD:
+ case X86ISD::VPERMILPI:
+ case X86ISD::VPERM2X128:
+  case ISD::VECTOR_SHUFFLE:
+    return PerformShuffleCombine(N, DAG, DCI, Subtarget);
+ case ISD::FMA: return PerformFMACombine(N, DAG, Subtarget);
+ case ISD::MGATHER:
+ case ISD::MSCATTER: return PerformGatherScatterCombine(N, DAG);
+ }
+
+ return SDValue();
+}
+
+/// isTypeDesirableForOp - Return true if the target has native support for
+/// the specified value type and it is 'desirable' to use the type for the
+/// given node type. e.g. On x86 i16 is legal, but undesirable since i16
+/// instruction encodings are longer and some i16 instructions are slow.
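+/// (For example, a 16-bit add needs a 0x66 operand-size prefix, and writes
+/// to %ax can incur partial-register stalls on some processors.)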
+bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
+ if (!isTypeLegal(VT))
+ return false;
+ if (VT != MVT::i16)
+ return true;
+
+ switch (Opc) {
+ default:
+ return true;
+ case ISD::LOAD:
+ case ISD::SIGN_EXTEND:
+ case ISD::ZERO_EXTEND:
+ case ISD::ANY_EXTEND:
+ case ISD::SHL:
+ case ISD::SRL:
+ case ISD::SUB:
+ case ISD::ADD:
+ case ISD::MUL:
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR:
+ return false;
+ }
+}
+
+/// This function checks if any of the users of EFLAGS copies the EFLAGS. We
+/// know that the code that lowers COPY of EFLAGS has to use the stack, and if
+/// we don't adjust the stack we clobber the first frame index.
+/// See X86InstrInfo::copyPhysReg.
+bool X86TargetLowering::hasCopyImplyingStackAdjustment(
+ MachineFunction *MF) const {
+ const MachineRegisterInfo &MRI = MF->getRegInfo();
+
+ return any_of(MRI.reg_instructions(X86::EFLAGS),
+ [](const MachineInstr &RI) { return RI.isCopy(); });
+}
+
+/// IsDesirableToPromoteOp - This method queries the target whether it is
+/// beneficial for the DAG combiner to promote the specified node. If true, it
+/// should return the desired promotion type by reference.
+bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
+ EVT VT = Op.getValueType();
+ if (VT != MVT::i16)
+ return false;
+
+ bool Promote = false;
+ bool Commute = false;
+ switch (Op.getOpcode()) {
+ default: break;
+ case ISD::LOAD: {
+ LoadSDNode *LD = cast<LoadSDNode>(Op);
+ // If the non-extending load has a single use and it's not live out, then it
+ // might be folded.
+ if (LD->getExtensionType() == ISD::NON_EXTLOAD /*&&
+ Op.hasOneUse()*/) {
+ for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
+ UE = Op.getNode()->use_end(); UI != UE; ++UI) {
+        // The only case where we'd want to promote LOAD (rather than it being
+        // promoted as an operand) is when its only use is live-out.
+ if (UI->getOpcode() != ISD::CopyToReg)
+ return false;
+ }
+ }
+ Promote = true;
+ break;
+ }
+ case ISD::SIGN_EXTEND:
+ case ISD::ZERO_EXTEND:
+ case ISD::ANY_EXTEND:
+ Promote = true;
+ break;
+ case ISD::SHL:
+ case ISD::SRL: {
+ SDValue N0 = Op.getOperand(0);
+ // Look out for (store (shl (load), x)).
+ if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
+ return false;
+ Promote = true;
+ break;
+ }
+ case ISD::ADD:
+ case ISD::MUL:
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR:
+ Commute = true;
+ // fallthrough
+ case ISD::SUB: {
+ SDValue N0 = Op.getOperand(0);
+ SDValue N1 = Op.getOperand(1);
+ if (!Commute && MayFoldLoad(N1))
+ return false;
+ // Avoid disabling potential load folding opportunities.
+ if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
+ return false;
+ if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
+ return false;
+ Promote = true;
+ }
+ }
+
+ PVT = MVT::i32;
+ return Promote;
+}
+
+//===----------------------------------------------------------------------===//
+// X86 Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+// Helper to match a string separated by whitespace.
+static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
+ S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
+
+ for (StringRef Piece : Pieces) {
+ if (!S.startswith(Piece)) // Check if the piece matches.
+ return false;
+
+ S = S.substr(Piece.size());
+ StringRef::size_type Pos = S.find_first_not_of(" \t");
+ if (Pos == 0) // We matched a prefix.
+ return false;
+
+ S = S.substr(Pos);
+ }
+
+ return S.empty();
+}
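+// E.g. matchAsm("  bswap  %eax", {"bswap", "%eax"}) returns true, while
+// matchAsm("bswapl %eax", {"bswap", "%eax"}) returns false because "bswap"
+// only matches a prefix of the first token.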
+
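+// Returns true when the constraint pieces are exactly the flag-register
+// clobbers emitted for GCC-style inline asm: "~{cc}", "~{flags}" and
+// "~{fpsr}", optionally plus "~{dirflag}".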
+static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
+ if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
+ if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
+ std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
+ std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {
+
+ if (AsmPieces.size() == 3)
+ return true;
+ else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
+ return true;
+ }
+ }
+ return false;
+}
+
+bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
+ InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
+
+ std::string AsmStr = IA->getAsmString();
+
+ IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
+ if (!Ty || Ty->getBitWidth() % 16 != 0)
+ return false;
+
+ // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
+ SmallVector<StringRef, 4> AsmPieces;
+ SplitString(AsmStr, AsmPieces, ";\n");
+
+ switch (AsmPieces.size()) {
+ default: return false;
+ case 1:
+ // FIXME: this should verify that we are targeting a 486 or better. If not,
+ // we will turn this bswap into something that will be lowered to logical
+ // ops instead of emitting the bswap asm. For now, we don't support 486 or
+ // lower so don't worry about this.
+ // bswap $0
+ if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
+ matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
+ matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
+ matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
+ matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
+ matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
+ // No need to check constraints, nothing other than the equivalent of
+ // "=r,0" would be valid here.
+ return IntrinsicLowering::LowerToByteSwap(CI);
+ }
+
+ // rorw $$8, ${0:w} --> llvm.bswap.i16
+ if (CI->getType()->isIntegerTy(16) &&
+ IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
+ (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
+ matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
+ AsmPieces.clear();
+ StringRef ConstraintsStr = IA->getConstraintString();
+ SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
+ array_pod_sort(AsmPieces.begin(), AsmPieces.end());
+ if (clobbersFlagRegisters(AsmPieces))
+ return IntrinsicLowering::LowerToByteSwap(CI);
+ }
+ break;
+ case 3:
+ if (CI->getType()->isIntegerTy(32) &&
+ IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
+ matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
+ matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
+ matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
+ AsmPieces.clear();
+ StringRef ConstraintsStr = IA->getConstraintString();
+ SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
+ array_pod_sort(AsmPieces.begin(), AsmPieces.end());
+ if (clobbersFlagRegisters(AsmPieces))
+ return IntrinsicLowering::LowerToByteSwap(CI);
+ }
+
+ if (CI->getType()->isIntegerTy(64)) {
+ InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
+ if (Constraints.size() >= 2 &&
+ Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
+ Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
+ // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
+ if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
+ matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
+ matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
+ return IntrinsicLowering::LowerToByteSwap(CI);
+ }
+ }
+ break;
+ }
+ return false;
+}
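+// For instance, an i16 byte swap written as the IR-level inline asm
+//   "rorw $$8, ${0:w}", "=r,0,~{cc},~{dirflag},~{fpsr},~{flags}"
+// hits the single-statement case above and is rewritten to llvm.bswap.i16.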
+
+/// getConstraintType - Given a constraint letter, return the type of
+/// constraint it is for this target.
+X86TargetLowering::ConstraintType
+X86TargetLowering::getConstraintType(StringRef Constraint) const {
+ if (Constraint.size() == 1) {
+ switch (Constraint[0]) {
+ case 'R':
+ case 'q':
+ case 'Q':
+ case 'f':
+ case 't':
+ case 'u':
+ case 'y':
+ case 'x':
+ case 'Y':
+ case 'l':
+ return C_RegisterClass;
+ case 'a':
+ case 'b':
+ case 'c':
+ case 'd':
+ case 'S':
+ case 'D':
+ case 'A':
+ return C_Register;
+ case 'I':
+ case 'J':
+ case 'K':
+ case 'L':
+ case 'M':
+ case 'N':
+ case 'G':
+ case 'C':
+ case 'e':
+ case 'Z':
+ return C_Other;
+ default:
+ break;
+ }
+ }
+ return TargetLowering::getConstraintType(Constraint);
+}
+
+/// Examine constraint type and operand type and determine a weight value.
+/// This object must already have been set up with the operand type
+/// and the current alternative constraint selected.
+TargetLowering::ConstraintWeight
+ X86TargetLowering::getSingleConstraintMatchWeight(
+ AsmOperandInfo &info, const char *constraint) const {
+ ConstraintWeight weight = CW_Invalid;
+ Value *CallOperandVal = info.CallOperandVal;
+ // If we don't have a value, we can't do a match,
+ // but allow it at the lowest weight.
+ if (!CallOperandVal)
+ return CW_Default;
+ Type *type = CallOperandVal->getType();
+ // Look at the constraint type.
+ switch (*constraint) {
+ default:
+    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
+    break;
+ case 'R':
+ case 'q':
+ case 'Q':
+ case 'a':
+ case 'b':
+ case 'c':
+ case 'd':
+ case 'S':
+ case 'D':
+ case 'A':
+    if (type->isIntegerTy())
+ weight = CW_SpecificReg;
+ break;
+ case 'f':
+ case 't':
+ case 'u':
+ if (type->isFloatingPointTy())
+ weight = CW_SpecificReg;
+ break;
+ case 'y':
+ if (type->isX86_MMXTy() && Subtarget->hasMMX())
+ weight = CW_SpecificReg;
+ break;
+ case 'x':
+ case 'Y':
+ if (((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasSSE1()) ||
+ ((type->getPrimitiveSizeInBits() == 256) && Subtarget->hasFp256()))
+ weight = CW_Register;
+ break;
+ case 'I':
+    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+ if (C->getZExtValue() <= 31)
+ weight = CW_Constant;
+ }
+ break;
+ case 'J':
+ if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+ if (C->getZExtValue() <= 63)
+ weight = CW_Constant;
+ }
+ break;
+ case 'K':
+ if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+ if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
+ weight = CW_Constant;
+ }
+ break;
+ case 'L':
+ if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+ if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
+ weight = CW_Constant;
+ }
+ break;
+ case 'M':
+ if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+ if (C->getZExtValue() <= 3)
+ weight = CW_Constant;
+ }
+ break;
+ case 'N':
+ if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+ if (C->getZExtValue() <= 0xff)
+ weight = CW_Constant;
+ }
+ break;
+ case 'G':
+ case 'C':
+ if (isa<ConstantFP>(CallOperandVal)) {
+ weight = CW_Constant;
+ }
+ break;
+ case 'e':
+ if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+ if ((C->getSExtValue() >= -0x80000000LL) &&
+ (C->getSExtValue() <= 0x7fffffffLL))
+ weight = CW_Constant;
+ }
+ break;
+ case 'Z':
+ if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+ if (C->getZExtValue() <= 0xffffffff)
+ weight = CW_Constant;
+ }
+ break;
+ }
+ return weight;
+}
+
+/// LowerXConstraint - try to replace an X constraint, which matches anything,
+/// with another that has more specific requirements based on the type of the
+/// corresponding operand.
+const char *X86TargetLowering::
+LowerXConstraint(EVT ConstraintVT) const {
+ // FP X constraints get lowered to SSE1/2 registers if available, otherwise
+ // 'f' like normal targets.
+ if (ConstraintVT.isFloatingPoint()) {
+ if (Subtarget->hasSSE2())
+ return "Y";
+ if (Subtarget->hasSSE1())
+ return "x";
+ }
+
+ return TargetLowering::LowerXConstraint(ConstraintVT);
+}
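+// (E.g. an "X" constraint on a float operand becomes "Y" when SSE2 is
+// available and "x" with only SSE1.)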
+
+/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
+/// vector. If it is invalid, don't add anything to Ops.
+void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
+ std::string &Constraint,
+ std::vector<SDValue>&Ops,
+ SelectionDAG &DAG) const {
+ SDValue Result;
+
+ // Only support length 1 constraints for now.
+ if (Constraint.length() > 1) return;
+
+ char ConstraintLetter = Constraint[0];
+ switch (ConstraintLetter) {
+ default: break;
+ case 'I':
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ if (C->getZExtValue() <= 31) {
+ Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
+ Op.getValueType());
+ break;
+ }
+ }
+ return;
+ case 'J':
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ if (C->getZExtValue() <= 63) {
+ Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
+ Op.getValueType());
+ break;
+ }
+ }
+ return;
+ case 'K':
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ if (isInt<8>(C->getSExtValue())) {
+ Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
+ Op.getValueType());
+ break;
+ }
+ }
+ return;
+ case 'L':
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
+ (Subtarget->is64Bit() && C->getZExtValue() == 0xffffffff)) {
+ Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
+ Op.getValueType());
+ break;
+ }
+ }
+ return;
+ case 'M':
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ if (C->getZExtValue() <= 3) {
+ Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
+ Op.getValueType());
+ break;
+ }
+ }
+ return;
+ case 'N':
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ if (C->getZExtValue() <= 255) {
+ Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
+ Op.getValueType());
+ break;
+ }
+ }
+ return;
+ case 'O':
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ if (C->getZExtValue() <= 127) {
+ Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
+ Op.getValueType());
+ break;
+ }
+ }
+ return;
+ case 'e': {
+ // 32-bit signed value
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
+ C->getSExtValue())) {
+ // Widen to 64 bits here to get it sign extended.
+ Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
+ break;
+ }
+ // FIXME gcc accepts some relocatable values here too, but only in certain
+ // memory models; it's complicated.
+ }
+ return;
+ }
+ case 'Z': {
+ // 32-bit unsigned value
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
+ C->getZExtValue())) {
+ Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
+ Op.getValueType());
+ break;
+ }
+ }
+ // FIXME gcc accepts some relocatable values here too, but only in certain
+ // memory models; it's complicated.
+ return;
+ }
+ case 'i': {
+ // Literal immediates are always ok.
+ if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
+ // Widen to 64 bits here to get it sign extended.
+ Result = DAG.getTargetConstant(CST->getSExtValue(), SDLoc(Op), MVT::i64);
+ break;
+ }
+
+ // In any sort of PIC mode addresses need to be computed at runtime by
+ // adding in a register or some sort of table lookup. These can't
+ // be used as immediates.
+ if (Subtarget->isPICStyleGOT() || Subtarget->isPICStyleStubPIC())
+ return;
+
+ // If we are in non-pic codegen mode, we allow the address of a global (with
+ // an optional displacement) to be used with 'i'.
+ GlobalAddressSDNode *GA = nullptr;
+ int64_t Offset = 0;
+
+ // Match either (GA), (GA+C), (GA+C1+C2), etc.
+ while (1) {
+ if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
+ Offset += GA->getOffset();
+ break;
+ } else if (Op.getOpcode() == ISD::ADD) {
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+ Offset += C->getZExtValue();
+ Op = Op.getOperand(0);
+ continue;
+ }
+ } else if (Op.getOpcode() == ISD::SUB) {
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+ Offset += -C->getZExtValue();
+ Op = Op.getOperand(0);
+ continue;
+ }
+ }
+
+ // Otherwise, this isn't something we can handle, reject it.
+ return;
+ }
+
+ const GlobalValue *GV = GA->getGlobal();
+ // If we require an extra load to get this address, as in PIC mode, we
+ // can't accept it.
+ if (isGlobalStubReference(
+ Subtarget->ClassifyGlobalReference(GV, DAG.getTarget())))
+ return;
+
+ Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
+ GA->getValueType(0), Offset);
+ break;
+ }
+ }
+
+ if (Result.getNode()) {
+ Ops.push_back(Result);
+ return;
+ }
+ return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
+}
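+// (E.g. "I" accepts only constants in [0, 31]: "I"(5) yields a target
+// constant above, while "I"(40) adds nothing to Ops and the operand is
+// rejected by the caller.)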
+
+std::pair<unsigned, const TargetRegisterClass *>
+X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+ StringRef Constraint,
+ MVT VT) const {
+ // First, see if this is a constraint that directly corresponds to an LLVM
+ // register class.
+ if (Constraint.size() == 1) {
+ // GCC Constraint Letters
+ switch (Constraint[0]) {
+ default: break;
+ // TODO: Slight differences here in allocation order and leaving
+ // RIP in the class. Do they matter any more here than they do
+ // in the normal allocation?
+ case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
+ if (Subtarget->is64Bit()) {
+ if (VT == MVT::i32 || VT == MVT::f32)
+ return std::make_pair(0U, &X86::GR32RegClass);
+ if (VT == MVT::i16)
+ return std::make_pair(0U, &X86::GR16RegClass);
+ if (VT == MVT::i8 || VT == MVT::i1)
+ return std::make_pair(0U, &X86::GR8RegClass);
+ if (VT == MVT::i64 || VT == MVT::f64)
+ return std::make_pair(0U, &X86::GR64RegClass);
+ break;
+ }
+ // 32-bit fallthrough
+ case 'Q': // Q_REGS
+ if (VT == MVT::i32 || VT == MVT::f32)
+ return std::make_pair(0U, &X86::GR32_ABCDRegClass);
+ if (VT == MVT::i16)
+ return std::make_pair(0U, &X86::GR16_ABCDRegClass);
+ if (VT == MVT::i8 || VT == MVT::i1)
+ return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
+ if (VT == MVT::i64)
+ return std::make_pair(0U, &X86::GR64_ABCDRegClass);
+ break;
+ case 'r': // GENERAL_REGS
+ case 'l': // INDEX_REGS
+ if (VT == MVT::i8 || VT == MVT::i1)
+ return std::make_pair(0U, &X86::GR8RegClass);
+ if (VT == MVT::i16)
+ return std::make_pair(0U, &X86::GR16RegClass);
+ if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget->is64Bit())
+ return std::make_pair(0U, &X86::GR32RegClass);
+ return std::make_pair(0U, &X86::GR64RegClass);
+ case 'R': // LEGACY_REGS
+ if (VT == MVT::i8 || VT == MVT::i1)
+ return std::make_pair(0U, &X86::GR8_NOREXRegClass);
+ if (VT == MVT::i16)
+ return std::make_pair(0U, &X86::GR16_NOREXRegClass);
+ if (VT == MVT::i32 || !Subtarget->is64Bit())
+ return std::make_pair(0U, &X86::GR32_NOREXRegClass);
+ return std::make_pair(0U, &X86::GR64_NOREXRegClass);
+ case 'f': // FP Stack registers.
+ // If SSE is enabled for this VT, use f80 to ensure the isel moves the
+ // value to the correct fpstack register class.
+ if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
+ return std::make_pair(0U, &X86::RFP32RegClass);
+ if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
+ return std::make_pair(0U, &X86::RFP64RegClass);
+ return std::make_pair(0U, &X86::RFP80RegClass);
+ case 'y': // MMX_REGS if MMX allowed.
+ if (!Subtarget->hasMMX()) break;
+ return std::make_pair(0U, &X86::VR64RegClass);
+ case 'Y': // SSE_REGS if SSE2 allowed
+ if (!Subtarget->hasSSE2()) break;
+ // FALL THROUGH.
+ case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
+ if (!Subtarget->hasSSE1()) break;
+
+ switch (VT.SimpleTy) {
+ default: break;
+ // Scalar SSE types.
+ case MVT::f32:
+ case MVT::i32:
+ return std::make_pair(0U, &X86::FR32RegClass);
+ case MVT::f64:
+ case MVT::i64:
+ return std::make_pair(0U, &X86::FR64RegClass);
+ // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
+ // Vector types.
+ case MVT::v16i8:
+ case MVT::v8i16:
+ case MVT::v4i32:
+ case MVT::v2i64:
+ case MVT::v4f32:
+ case MVT::v2f64:
+ return std::make_pair(0U, &X86::VR128RegClass);
+ // AVX types.
+ case MVT::v32i8:
+ case MVT::v16i16:
+ case MVT::v8i32:
+ case MVT::v4i64:
+ case MVT::v8f32:
+ case MVT::v4f64:
+ return std::make_pair(0U, &X86::VR256RegClass);
+ case MVT::v8f64:
+ case MVT::v16f32:
+ case MVT::v16i32:
+ case MVT::v8i64:
+ return std::make_pair(0U, &X86::VR512RegClass);
+ }
+ break;
+ }
+ }
+
+ // Use the default implementation in TargetLowering to convert the register
+ // constraint into a member of a register class.
+ std::pair<unsigned, const TargetRegisterClass*> Res;
+ Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
+
+ // Not found as a standard register?
+ if (!Res.second) {
+    // Map {st(0)} .. {st(7)} onto the corresponding FP0 .. FP7 registers.
+ if (Constraint.size() == 7 && Constraint[0] == '{' &&
+ tolower(Constraint[1]) == 's' &&
+ tolower(Constraint[2]) == 't' &&
+ Constraint[3] == '(' &&
+ (Constraint[4] >= '0' && Constraint[4] <= '7') &&
+ Constraint[5] == ')' &&
+ Constraint[6] == '}') {
+
+      Res.first = X86::FP0 + Constraint[4] - '0';
+ Res.second = &X86::RFP80RegClass;
+ return Res;
+ }
+
+ // GCC allows "st(0)" to be called just plain "st".
+ if (StringRef("{st}").equals_lower(Constraint)) {
+ Res.first = X86::FP0;
+ Res.second = &X86::RFP80RegClass;
+ return Res;
+ }
+
+ // flags -> EFLAGS
+ if (StringRef("{flags}").equals_lower(Constraint)) {
+ Res.first = X86::EFLAGS;
+ Res.second = &X86::CCRRegClass;
+ return Res;
+ }
+
+ // 'A' means EAX + EDX.
+ if (Constraint == "A") {
+ Res.first = X86::EAX;
+ Res.second = &X86::GR32_ADRegClass;
+ return Res;
+ }
+ return Res;
+ }
+
+ // Otherwise, check to see if this is a register class of the wrong value
+ // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
+ // turn into {ax},{dx}.
+ // MVT::Other is used to specify clobber names.
+ if (Res.second->hasType(VT) || VT == MVT::Other)
+ return Res; // Correct type already, nothing to do.
+
+  // Get a matching integer of the correct size. i.e. "ax" with MVT::i32 should
+  // return "eax". This should even work for things like getting 64-bit integer
+ // registers when given an f64 type.
+ const TargetRegisterClass *Class = Res.second;
+ if (Class == &X86::GR8RegClass || Class == &X86::GR16RegClass ||
+ Class == &X86::GR32RegClass || Class == &X86::GR64RegClass) {
+ unsigned Size = VT.getSizeInBits();
+ if (Size == 1) Size = 8;
+ unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
+ if (DestReg > 0) {
+ Res.first = DestReg;
+ Res.second = Size == 8 ? &X86::GR8RegClass
+ : Size == 16 ? &X86::GR16RegClass
+ : Size == 32 ? &X86::GR32RegClass
+ : &X86::GR64RegClass;
+ assert(Res.second->contains(Res.first) && "Register in register class");
+ } else {
+ // No register found/type mismatch.
+ Res.first = 0;
+ Res.second = nullptr;
+ }
+ } else if (Class == &X86::FR32RegClass || Class == &X86::FR64RegClass ||
+ Class == &X86::VR128RegClass || Class == &X86::VR256RegClass ||
+ Class == &X86::FR32XRegClass || Class == &X86::FR64XRegClass ||
+ Class == &X86::VR128XRegClass || Class == &X86::VR256XRegClass ||
+ Class == &X86::VR512RegClass) {
+ // Handle references to XMM physical registers that got mapped into the
+ // wrong class. This can happen with constraints like {xmm0} where the
+ // target independent register mapper will just pick the first match it can
+ // find, ignoring the required type.
+
+ // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
+ if (VT == MVT::f32 || VT == MVT::i32)
+ Res.second = &X86::FR32RegClass;
+ else if (VT == MVT::f64 || VT == MVT::i64)
+ Res.second = &X86::FR64RegClass;
+ else if (X86::VR128RegClass.hasType(VT))
+ Res.second = &X86::VR128RegClass;
+ else if (X86::VR256RegClass.hasType(VT))
+ Res.second = &X86::VR256RegClass;
+ else if (X86::VR512RegClass.hasType(VT))
+ Res.second = &X86::VR512RegClass;
+ else {
+      // Type mismatch and not a clobber: return an error.
+ Res.first = 0;
+ Res.second = nullptr;
+ }
+ }
+
+ return Res;
+}
+
+int X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
+ const AddrMode &AM, Type *Ty,
+ unsigned AS) const {
+ // Scaling factors are not free at all.
+ // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
+ // will take 2 allocations in the out of order engine instead of 1
+ // for plain addressing mode, i.e. inst (reg1).
+ // E.g.,
+  // vaddps (%rsi,%rdx), %ymm0, %ymm1
+ // Requires two allocations (one for the load, one for the computation)
+ // whereas:
+ // vaddps (%rsi), %ymm0, %ymm1
+ // Requires just 1 allocation, i.e., freeing allocations for other operations
+ // and having less micro operations to execute.
+ //
+ // For some X86 architectures, this is even worse because for instance for
+ // stores, the complex addressing mode forces the instruction to use the
+ // "load" ports instead of the dedicated "store" port.
+ // E.g., on Haswell:
+ // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
+ // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
+ if (isLegalAddressingMode(DL, AM, Ty, AS))
+ // Scale represents reg2 * scale, thus account for 1
+ // as soon as we use a second register.
+ return AM.Scale != 0;
+ return -1;
+}
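+// (So a plain "(%rdi)" access reports cost 0, "(%rdi,%rcx,4)" reports 1,
+// and an unsupported mode reports a negative cost.)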
+
+bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const {
+ // Integer division on x86 is expensive. However, when aggressively optimizing
+ // for code size, we prefer to use a div instruction, as it is usually smaller
+ // than the alternative sequence.
+ // The exception to this is vector division. Since x86 doesn't have vector
+ // integer division, leaving the division as-is is a loss even in terms of
+ // size, because it will have to be scalarized, while the alternative code
+ // sequence can be performed in vector form.
+ bool OptSize = Attr.hasAttribute(AttributeSet::FunctionIndex,
+ Attribute::MinSize);
+ return OptSize && !VT.isVector();
+}
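+// (E.g. under minsize a scalar "x / 7" keeps the small div instruction,
+// while a vector "x / 7" is still expanded into the multiply-based sequence
+// since scalarized vector division would be larger.)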
diff --git a/contrib/llvm/lib/Target/X86/X86ISelLowering.h b/contrib/llvm/lib/Target/X86/X86ISelLowering.h
new file mode 100644
index 0000000..8bb0e5f
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86ISelLowering.h
@@ -0,0 +1,1157 @@
+//===-- X86ISelLowering.h - X86 DAG Lowering Interface ----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that X86 uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86ISELLOWERING_H
+#define LLVM_LIB_TARGET_X86_X86ISELLOWERING_H
+
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetOptions.h"
+
+namespace llvm {
+ class X86Subtarget;
+ class X86TargetMachine;
+
+ namespace X86ISD {
+ // X86 Specific DAG Nodes
+ enum NodeType : unsigned {
+ // Start the numbering where the builtin ops leave off.
+ FIRST_NUMBER = ISD::BUILTIN_OP_END,
+
+ /// Bit scan forward.
+ BSF,
+ /// Bit scan reverse.
+ BSR,
+
+ /// Double shift instructions. These correspond to
+ /// X86::SHLDxx and X86::SHRDxx instructions.
+ SHLD,
+ SHRD,
+
+ /// Bitwise logical AND of floating point values. This corresponds
+ /// to X86::ANDPS or X86::ANDPD.
+ FAND,
+
+ /// Bitwise logical OR of floating point values. This corresponds
+ /// to X86::ORPS or X86::ORPD.
+ FOR,
+
+ /// Bitwise logical XOR of floating point values. This corresponds
+ /// to X86::XORPS or X86::XORPD.
+ FXOR,
+
+ /// Bitwise logical ANDNOT of floating point values. This
+ /// corresponds to X86::ANDNPS or X86::ANDNPD.
+ FANDN,
+
+ /// These operations represent an abstract X86 call
+ /// instruction, which includes a bunch of information. In particular the
+ /// operands of these node are:
+ ///
+ /// #0 - The incoming token chain
+ /// #1 - The callee
+ /// #2 - The number of arg bytes the caller pushes on the stack.
+ /// #3 - The number of arg bytes the callee pops off the stack.
+ /// #4 - The value to pass in AL/AX/EAX (optional)
+ /// #5 - The value to pass in DL/DX/EDX (optional)
+ ///
+ /// The result values of these nodes are:
+ ///
+ /// #0 - The outgoing token chain
+ /// #1 - The first register result value (optional)
+ /// #2 - The second register result value (optional)
+ ///
+ CALL,
+
+ /// This operation implements the lowering for readcyclecounter
+ RDTSC_DAG,
+
+ /// X86 Read Time-Stamp Counter and Processor ID.
+ RDTSCP_DAG,
+
+ /// X86 Read Performance Monitoring Counters.
+ RDPMC_DAG,
+
+ /// X86 compare and logical compare instructions.
+ CMP, COMI, UCOMI,
+
+ /// X86 bit-test instructions.
+ BT,
+
+ /// X86 SetCC. Operand 0 is condition code, and operand 1 is the EFLAGS
+ /// operand, usually produced by a CMP instruction.
+ SETCC,
+
+ /// X86 Select
+ SELECT,
+
+      // Same as SETCC except it's materialized with an SBB and the value is
+      // all ones or all zeros.
+ SETCC_CARRY, // R = carry_bit ? ~0 : 0
+
+ /// X86 FP SETCC, implemented with CMP{cc}SS/CMP{cc}SD.
+ /// Operands are two FP values to compare; result is a mask of
+ /// 0s or 1s. Generally DTRT for C/C++ with NaNs.
+ FSETCC,
+
+ /// X86 MOVMSK{pd|ps}, extracts sign bits of two or four FP values,
+ /// result in an integer GPR. Needs masking for scalar result.
+ FGETSIGNx86,
+
+ /// X86 conditional moves. Operand 0 and operand 1 are the two values
+ /// to select from. Operand 2 is the condition code, and operand 3 is the
+ /// flag operand produced by a CMP or TEST instruction. It also writes a
+ /// flag result.
+ CMOV,
+
+ /// X86 conditional branches. Operand 0 is the chain operand, operand 1
+ /// is the block to branch if condition is true, operand 2 is the
+ /// condition code, and operand 3 is the flag operand produced by a CMP
+ /// or TEST instruction.
+ BRCOND,
+
+ /// Return with a flag operand. Operand 0 is the chain operand, operand
+ /// 1 is the number of bytes of stack to pop.
+ RET_FLAG,
+
+ /// Return from interrupt. Operand 0 is the number of bytes to pop.
+ IRET,
+
+ /// Repeat fill, corresponds to X86::REP_STOSx.
+ REP_STOS,
+
+ /// Repeat move, corresponds to X86::REP_MOVSx.
+ REP_MOVS,
+
+ /// On Darwin, this node represents the result of the popl
+ /// at function entry, used for PIC code.
+ GlobalBaseReg,
+
+ /// A wrapper node for TargetConstantPool,
+ /// TargetExternalSymbol, and TargetGlobalAddress.
+ Wrapper,
+
+ /// Special wrapper used under X86-64 PIC mode for RIP
+ /// relative displacements.
+ WrapperRIP,
+
+ /// Copies a 64-bit value from the low word of an XMM vector
+ /// to an MMX vector. If you think this is too close to the previous
+ /// mnemonic, so do I; blame Intel.
+ MOVDQ2Q,
+
+ /// Copies a 32-bit value from the low word of a MMX
+ /// vector to a GPR.
+ MMX_MOVD2W,
+
+ /// Copies a GPR into the low 32-bit word of a MMX vector
+ /// and zero out the high word.
+ MMX_MOVW2D,
+
+ /// Extract an 8-bit value from a vector and zero extend it to
+ /// i32, corresponds to X86::PEXTRB.
+ PEXTRB,
+
+ /// Extract a 16-bit value from a vector and zero extend it to
+ /// i32, corresponds to X86::PEXTRW.
+ PEXTRW,
+
+ /// Insert any element of a 4 x float vector into any element
+      /// of a destination 4 x float vector.
+ INSERTPS,
+
+ /// Insert the lower 8-bits of a 32-bit value to a vector,
+ /// corresponds to X86::PINSRB.
+ PINSRB,
+
+ /// Insert the lower 16-bits of a 32-bit value to a vector,
+ /// corresponds to X86::PINSRW.
+ PINSRW, MMX_PINSRW,
+
+ /// Shuffle 16 8-bit values within a vector.
+ PSHUFB,
+
+ /// Compute Sum of Absolute Differences.
+ PSADBW,
+ /// Compute Double Block Packed Sum-Absolute-Differences
+ DBPSADBW,
+
+ /// Bitwise Logical AND NOT of Packed FP values.
+ ANDNP,
+
+ /// Copy integer sign.
+ PSIGN,
+
+ /// Blend where the selector is an immediate.
+ BLENDI,
+
+ /// Blend where the condition has been shrunk.
+ /// This is used to emphasize that the condition mask is
+ /// no more valid for generic VSELECT optimizations.
+ SHRUNKBLEND,
+
+ /// Combined add and sub on an FP vector.
+ ADDSUB,
+
+ // FP vector ops with rounding mode.
+ FADD_RND,
+ FSUB_RND,
+ FMUL_RND,
+ FDIV_RND,
+ FMAX_RND,
+ FMIN_RND,
+ FSQRT_RND,
+
+ // FP vector get exponent
+ FGETEXP_RND,
+ // Extract Normalized Mantissas
+ VGETMANT,
+ // FP Scale
+ SCALEF,
+ // Integer add/sub with unsigned saturation.
+ ADDUS,
+ SUBUS,
+ // Integer add/sub with signed saturation.
+ ADDS,
+ SUBS,
+ // Unsigned Integer average
+ AVG,
+ /// Integer horizontal add.
+ HADD,
+
+ /// Integer horizontal sub.
+ HSUB,
+
+ /// Floating point horizontal add.
+ FHADD,
+
+ /// Floating point horizontal sub.
+ FHSUB,
+
+ // Integer absolute value
+ ABS,
+
+ // Detect Conflicts Within a Vector
+ CONFLICT,
+
+ /// Floating point max and min.
+ FMAX, FMIN,
+
+ /// Commutative FMIN and FMAX.
+ FMAXC, FMINC,
+
+ /// Floating point reciprocal-sqrt and reciprocal approximation.
+ /// Note that these typically require refinement
+ /// in order to obtain suitable precision.
+ FRSQRT, FRCP,
+
+ // Thread Local Storage.
+ TLSADDR,
+
+ // Thread Local Storage. A call to get the start address
+ // of the TLS block for the current module.
+ TLSBASEADDR,
+
+ // Thread Local Storage. When calling to an OS provided
+ // thunk at the address from an earlier relocation.
+ TLSCALL,
+
+ // Exception Handling helpers.
+ EH_RETURN,
+
+ // SjLj exception handling setjmp.
+ EH_SJLJ_SETJMP,
+
+ // SjLj exception handling longjmp.
+ EH_SJLJ_LONGJMP,
+
+ /// Tail call return. See X86TargetLowering::LowerCall for
+ /// the list of operands.
+ TC_RETURN,
+
+ // Vector move to low scalar and zero higher vector elements.
+ VZEXT_MOVL,
+
+ // Vector integer zero-extend.
+ VZEXT,
+
+ // Vector integer signed-extend.
+ VSEXT,
+
+ // Vector integer truncate.
+ VTRUNC,
+ // Vector integer truncate with unsigned/signed saturation.
+ VTRUNCUS, VTRUNCS,
+
+ // Vector FP extend.
+ VFPEXT,
+
+ // Vector FP round.
+ VFPROUND,
+
+ // Vector signed/unsigned integer to double.
+ CVTDQ2PD, CVTUDQ2PD,
+
+      // Convert a vector to mask, set bits based on MSB.
+ CVT2MASK,
+
+ // 128-bit vector logical left / right shift
+ VSHLDQ, VSRLDQ,
+
+ // Vector shift elements
+ VSHL, VSRL, VSRA,
+
+ // Vector shift elements by immediate
+ VSHLI, VSRLI, VSRAI,
+
+ // Vector packed double/float comparison.
+ CMPP,
+
+ // Vector integer comparisons.
+ PCMPEQ, PCMPGT,
+ // Vector integer comparisons, the result is in a mask vector.
+ PCMPEQM, PCMPGTM,
+
+ /// Vector comparison generating mask bits for fp and
+ /// integer signed and unsigned data types.
+ CMPM,
+ CMPMU,
+ // Vector comparison with rounding mode for FP values
+ CMPM_RND,
+
+ // Arithmetic operations with FLAGS results.
+ ADD, SUB, ADC, SBB, SMUL,
+ INC, DEC, OR, XOR, AND,
+
+ BEXTR, // Bit field extract
+
+ UMUL, // LOW, HI, FLAGS = umul LHS, RHS
+
+ // 8-bit SMUL/UMUL - AX, FLAGS = smul8/umul8 AL, RHS
+ SMUL8, UMUL8,
+
+      // 8-bit divrem that zero- or sign-extend the high result (AH).
+ UDIVREM8_ZEXT_HREG,
+ SDIVREM8_SEXT_HREG,
+
+ // X86-specific multiply by immediate.
+ MUL_IMM,
+
+ // Vector bitwise comparisons.
+ PTEST,
+
+ // Vector packed fp sign bitwise comparisons.
+ TESTP,
+
+ // Vector "test" in AVX-512, the result is in a mask vector.
+ TESTM,
+ TESTNM,
+
+ // OR/AND test for masks
+ KORTEST,
+ KTEST,
+
+ // Several flavors of instructions with vector shuffle behaviors.
+ PACKSS,
+ PACKUS,
+ // Intra-lane alignr
+ PALIGNR,
+ // AVX512 inter-lane alignr
+ VALIGN,
+ PSHUFD,
+ PSHUFHW,
+ PSHUFLW,
+ SHUFP,
+      // Shuffle Packed Values at 128-bit granularity
+ SHUF128,
+ MOVDDUP,
+ MOVSHDUP,
+ MOVSLDUP,
+ MOVLHPS,
+ MOVLHPD,
+ MOVHLPS,
+ MOVLPS,
+ MOVLPD,
+ MOVSD,
+ MOVSS,
+ UNPCKL,
+ UNPCKH,
+ VPERMILPV,
+ VPERMILPI,
+ VPERMV,
+ VPERMV3,
+ VPERMIV3,
+ VPERMI,
+ VPERM2X128,
+ // Bitwise ternary logic
+ VPTERNLOG,
+ // Fix Up Special Packed Float32/64 values
+ VFIXUPIMM,
+ // Range Restriction Calculation For Packed Pairs of Float32/64 values
+ VRANGE,
+      // Reduce - Perform Reduction Transformation on scalar/packed FP
+ VREDUCE,
+ // RndScale - Round FP Values To Include A Given Number Of Fraction Bits
+ VRNDSCALE,
+ // VFPCLASS - Tests Types Of a FP Values for packed types.
+ VFPCLASS,
+ // VFPCLASSS - Tests Types Of a FP Values for scalar types.
+ VFPCLASSS,
+ // Broadcast scalar to vector
+ VBROADCAST,
+ // Broadcast mask to vector
+ VBROADCASTM,
+ // Broadcast subvector to vector
+ SUBV_BROADCAST,
+ // Insert/Extract vector element
+ VINSERT,
+ VEXTRACT,
+
+ /// SSE4A Extraction and Insertion.
+ EXTRQI, INSERTQI,
+
+ // XOP variable/immediate rotations
+ VPROT, VPROTI,
+ // XOP arithmetic/logical shifts
+ VPSHA, VPSHL,
+ // XOP signed/unsigned integer comparisons
+ VPCOM, VPCOMU,
+
+ // Vector multiply packed unsigned doubleword integers
+ PMULUDQ,
+ // Vector multiply packed signed doubleword integers
+ PMULDQ,
+      // Vector Multiply Packed Signed Integers with Round and Scale (PMULHRSW)
+ MULHRS,
+ // Multiply and Add Packed Integers
+ VPMADDUBSW, VPMADDWD,
+ // FMA nodes
+ FMADD,
+ FNMADD,
+ FMSUB,
+ FNMSUB,
+ FMADDSUB,
+ FMSUBADD,
+ // FMA with rounding mode
+ FMADD_RND,
+ FNMADD_RND,
+ FMSUB_RND,
+ FNMSUB_RND,
+ FMADDSUB_RND,
+ FMSUBADD_RND,
+
+ // Compress and expand
+ COMPRESS,
+ EXPAND,
+
+      // Convert Unsigned/Signed Integer to Scalar Floating-Point Value
+      // with rounding mode
+ SINT_TO_FP_RND,
+ UINT_TO_FP_RND,
+
+ // Vector float/double to signed/unsigned integer.
+ FP_TO_SINT_RND, FP_TO_UINT_RND,
+ // Save xmm argument registers to the stack, according to %al. An operator
+ // is needed so that this can be expanded with control flow.
+ VASTART_SAVE_XMM_REGS,
+
+ // Windows's _chkstk call to do stack probing.
+ WIN_ALLOCA,
+
+ // For allocating variable amounts of stack space when using
+      // segmented stacks. Checks if the current stacklet has enough space, and
+ // falls back to heap allocation if not.
+ SEG_ALLOCA,
+
+ // Memory barrier
+ MEMBARRIER,
+ MFENCE,
+ SFENCE,
+ LFENCE,
+
+ // Store FP status word into i16 register.
+ FNSTSW16r,
+
+ // Store contents of %ah into %eflags.
+ SAHF,
+
+ // Get a random integer and indicate whether it is valid in CF.
+ RDRAND,
+
+ // Get a NIST SP800-90B & C compliant random integer and
+ // indicate whether it is valid in CF.
+ RDSEED,
+
+ PCMPISTRI,
+ PCMPESTRI,
+
+ // Test if in transactional execution.
+ XTEST,
+
+ // ERI instructions
+ RSQRT28, RCP28, EXP2,
+
+ // Compare and swap.
+ LCMPXCHG_DAG = ISD::FIRST_TARGET_MEMORY_OPCODE,
+ LCMPXCHG8_DAG,
+ LCMPXCHG16_DAG,
+
+ // Load, scalar_to_vector, and zero extend.
+ VZEXT_LOAD,
+
+      // Store FP control word into i16 memory.
+ FNSTCW16m,
+
+ /// This instruction implements FP_TO_SINT with the
+ /// integer destination in memory and a FP reg source. This corresponds
+ /// to the X86::FIST*m instructions and the rounding mode change stuff. It
+ /// has two inputs (token chain and address) and two outputs (int value
+ /// and token chain).
+ FP_TO_INT16_IN_MEM,
+ FP_TO_INT32_IN_MEM,
+ FP_TO_INT64_IN_MEM,
+
+ /// This instruction implements SINT_TO_FP with the
+ /// integer source in memory and FP reg result. This corresponds to the
+ /// X86::FILD*m instructions. It has three inputs (token chain, address,
+      /// and source type) and two outputs (FP value and token chain). FILD_FLAG
+      /// also produces a flag.
+ FILD,
+ FILD_FLAG,
+
+ /// This instruction implements an extending load to FP stack slots.
+ /// This corresponds to the X86::FLD32m / X86::FLD64m. It takes a chain
+ /// operand, ptr to load from, and a ValueType node indicating the type
+ /// to load to.
+ FLD,
+
+ /// This instruction implements a truncating store to FP stack
+ /// slots. This corresponds to the X86::FST32m / X86::FST64m. It takes a
+ /// chain operand, value to store, address, and a ValueType to store it
+ /// as.
+ FST,
+
+ /// This instruction grabs the address of the next argument
+ /// from a va_list. (reads and modifies the va_list in memory)
+ VAARG_64
+
+      // WARNING: Do not add anything at the end unless you want the node to
+      // have a memop! In fact, starting from LCMPXCHG_DAG all opcodes will be
+      // treated as target memory ops!
+ };
+ }
+
+ /// Define some predicates that are used for node matching.
+ namespace X86 {
+ /// Return true if the specified
+ /// EXTRACT_SUBVECTOR operand specifies a vector extract that is
+ /// suitable for input to VEXTRACTF128, VEXTRACTI128 instructions.
+ bool isVEXTRACT128Index(SDNode *N);
+
+ /// Return true if the specified
+ /// INSERT_SUBVECTOR operand specifies a subvector insert that is
+ /// suitable for input to VINSERTF128, VINSERTI128 instructions.
+ bool isVINSERT128Index(SDNode *N);
+
+ /// Return true if the specified
+ /// EXTRACT_SUBVECTOR operand specifies a vector extract that is
+ /// suitable for input to VEXTRACTF64X4, VEXTRACTI64X4 instructions.
+ bool isVEXTRACT256Index(SDNode *N);
+
+ /// Return true if the specified
+ /// INSERT_SUBVECTOR operand specifies a subvector insert that is
+ /// suitable for input to VINSERTF64X4, VINSERTI64X4 instructions.
+ bool isVINSERT256Index(SDNode *N);
+
+ /// Return the appropriate
+ /// immediate to extract the specified EXTRACT_SUBVECTOR index
+ /// with VEXTRACTF128, VEXTRACTI128 instructions.
+ unsigned getExtractVEXTRACT128Immediate(SDNode *N);
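+    // (E.g. extracting the upper half of a 256-bit vector, i.e. an
+    // EXTRACT_SUBVECTOR at index NumElts/2, corresponds to immediate 1.)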
+
+ /// Return the appropriate
+ /// immediate to insert at the specified INSERT_SUBVECTOR index
+    /// with VINSERTF128, VINSERTI128 instructions.
+ unsigned getInsertVINSERT128Immediate(SDNode *N);
+
+ /// Return the appropriate
+ /// immediate to extract the specified EXTRACT_SUBVECTOR index
+    /// with VEXTRACTF64X4, VEXTRACTI64X4 instructions.
+ unsigned getExtractVEXTRACT256Immediate(SDNode *N);
+
+ /// Return the appropriate
+ /// immediate to insert at the specified INSERT_SUBVECTOR index
+    /// with VINSERTF64X4, VINSERTI64X4 instructions.
+ unsigned getInsertVINSERT256Immediate(SDNode *N);
+
+ /// Returns true if Elt is a constant zero or floating point constant +0.0.
+ bool isZeroNode(SDValue Elt);
+
+    /// Returns true if the given offset
+    /// fits into the displacement field of the instruction.
+ bool isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
+ bool hasSymbolicDisplacement = true);
+
+ /// Determines whether the callee is required to pop its
+ /// own arguments. Callee pop is necessary to support tail calls.
+ bool isCalleePop(CallingConv::ID CallingConv,
+ bool is64Bit, bool IsVarArg, bool TailCallOpt);
+
+ }
+
+ //===--------------------------------------------------------------------===//
+ // X86 Implementation of the TargetLowering interface
+ class X86TargetLowering final : public TargetLowering {
+ public:
+ explicit X86TargetLowering(const X86TargetMachine &TM,
+ const X86Subtarget &STI);
+
+ unsigned getJumpTableEncoding() const override;
+ bool useSoftFloat() const override;
+
+ MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override {
+ return MVT::i8;
+ }
+
+ const MCExpr *
+ LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
+ const MachineBasicBlock *MBB, unsigned uid,
+ MCContext &Ctx) const override;
+
+ /// Returns relocation base for the given PIC jumptable.
+ SDValue getPICJumpTableRelocBase(SDValue Table,
+ SelectionDAG &DAG) const override;
+ const MCExpr *
+ getPICJumpTableRelocBaseExpr(const MachineFunction *MF,
+ unsigned JTI, MCContext &Ctx) const override;
+
+ /// Return the desired alignment for ByVal aggregate
+ /// function arguments in the caller parameter area. For X86, aggregates
+    /// that contain SSE vectors are placed at 16-byte boundaries while the
+    /// rest are at 4-byte boundaries.
+ unsigned getByValTypeAlignment(Type *Ty,
+ const DataLayout &DL) const override;
+
+ /// Returns the target specific optimal type for load
+ /// and store operations as a result of memset, memcpy, and memmove
+ /// lowering. If DstAlign is zero that means it's safe to destination
+    /// lowering. If DstAlign is zero, that means the destination alignment can
+    /// satisfy any constraint. Similarly, if SrcAlign is zero it
+ /// probably because the source does not need to be loaded. If 'IsMemset' is
+ /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
+ /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
+ /// source is constant so it does not need to be loaded.
+ /// It returns EVT::Other if the type should be determined using generic
+ /// target-independent logic.
+ EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,
+ bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
+ MachineFunction &MF) const override;
+
+ /// Returns true if it's safe to use load / store of the
+ /// specified type to expand memcpy / memset inline. This is mostly true
+ /// for all types except for some special cases. For example, on X86
+ /// targets without SSE2 f64 load / store are done with fldl / fstpl which
+ /// also does type conversion. Note the specified type doesn't have to be
+ /// legal as the hook is used before type legalization.
+ bool isSafeMemOpType(MVT VT) const override;
+
+ /// Returns true if the target allows unaligned memory accesses of the
+ /// specified type. Returns whether it is "fast" in the last argument.
+ bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, unsigned Align,
+ bool *Fast) const override;
+
+ /// Provide custom lowering hooks for some operations.
+ ///
+ SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+
+ /// Replace the results of node with an illegal result
+ /// type with new values built out of custom code.
+ ///
+ void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results,
+ SelectionDAG &DAG) const override;
+
+ SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
+
+ /// Return true if the target has native support for
+ /// the specified value type and it is 'desirable' to use the type for the
+ /// given node type. e.g. On x86 i16 is legal, but undesirable since i16
+ /// instruction encodings are longer and some i16 instructions are slow.
+ bool isTypeDesirableForOp(unsigned Opc, EVT VT) const override;
+
+ /// Return true if the target has native support for the
+ /// specified value type and it is 'desirable' to use the type. e.g. On x86
+ /// i16 is legal, but undesirable since i16 instruction encodings are longer
+ /// and some i16 instructions are slow.
+ bool IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const override;
+
+ /// Return true if the MachineFunction contains a COPY which would imply
+ /// HasOpaqueSPAdjustment.
+ bool hasCopyImplyingStackAdjustment(MachineFunction *MF) const override;
+
+ MachineBasicBlock *
+ EmitInstrWithCustomInserter(MachineInstr *MI,
+ MachineBasicBlock *MBB) const override;
+
+ /// This method returns the name of a target specific DAG node.
+ const char *getTargetNodeName(unsigned Opcode) const override;
+
+ bool isCheapToSpeculateCttz() const override;
+
+ bool isCheapToSpeculateCtlz() const override;
+
+ /// Return the value type to use for ISD::SETCC.
+ EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
+ EVT VT) const override;
+
+ /// Determine which of the bits specified in Mask are known to be either
+ /// zero or one and return them in the KnownZero/KnownOne bitsets.
+ void computeKnownBitsForTargetNode(const SDValue Op,
+ APInt &KnownZero,
+ APInt &KnownOne,
+ const SelectionDAG &DAG,
+ unsigned Depth = 0) const override;
+
+ /// Determine the number of bits in the operation that are sign bits.
+ unsigned ComputeNumSignBitsForTargetNode(SDValue Op,
+ const SelectionDAG &DAG,
+ unsigned Depth) const override;
+
+ bool isGAPlusOffset(SDNode *N, const GlobalValue* &GA,
+ int64_t &Offset) const override;
+
+ SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const;
+
+ bool ExpandInlineAsm(CallInst *CI) const override;
+
+ ConstraintType getConstraintType(StringRef Constraint) const override;
+
+ /// Examine constraint string and operand type and determine a weight value.
+ /// The operand object must already have been set up with the operand type.
+ ConstraintWeight
+ getSingleConstraintMatchWeight(AsmOperandInfo &info,
+ const char *constraint) const override;
+
+ const char *LowerXConstraint(EVT ConstraintVT) const override;
+
+ /// Lower the specified operand into the Ops vector. If it is invalid, don't
+    /// add anything to Ops. If hasMemory is true it means one of the asm
+    /// constraints of the inline asm instruction being processed is 'm'.
+ void LowerAsmOperandForConstraint(SDValue Op,
+ std::string &Constraint,
+ std::vector<SDValue> &Ops,
+ SelectionDAG &DAG) const override;
+
+ unsigned
+ getInlineAsmMemConstraint(StringRef ConstraintCode) const override {
+ if (ConstraintCode == "i")
+ return InlineAsm::Constraint_i;
+ else if (ConstraintCode == "o")
+ return InlineAsm::Constraint_o;
+ else if (ConstraintCode == "v")
+ return InlineAsm::Constraint_v;
+ else if (ConstraintCode == "X")
+ return InlineAsm::Constraint_X;
+ return TargetLowering::getInlineAsmMemConstraint(ConstraintCode);
+ }
+
+ /// Given a physical register constraint
+ /// (e.g. {edx}), return the register number and the register class for the
+ /// register. This should only be used for C_Register constraints. On
+ /// error, this returns a register number of 0.
+ std::pair<unsigned, const TargetRegisterClass *>
+ getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+ StringRef Constraint, MVT VT) const override;
+
+ /// Return true if the addressing mode represented
+ /// by AM is legal for this target, for a load/store of the specified type.
+ bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM,
+ Type *Ty, unsigned AS) const override;
+
+    /// Return true if the specified immediate is a legal
+    /// icmp immediate, that is, the target has icmp instructions which can
+    /// compare a register against the immediate without having to materialize
+    /// the immediate into a register.
+ bool isLegalICmpImmediate(int64_t Imm) const override;
+
+    /// Return true if the specified immediate is a legal
+    /// add immediate, that is, the target has add instructions which can
+    /// add a register and the immediate without having to materialize
+    /// the immediate into a register.
+ bool isLegalAddImmediate(int64_t Imm) const override;
+
+ /// \brief Return the cost of the scaling factor used in the addressing
+ /// mode represented by AM for this target, for a load/store
+ /// of the specified type.
+ /// If the AM is supported, the return value must be >= 0.
+ /// If the AM is not supported, it returns a negative value.
+ int getScalingFactorCost(const DataLayout &DL, const AddrMode &AM, Type *Ty,
+ unsigned AS) const override;
+
+ bool isVectorShiftByScalarCheap(Type *Ty) const override;
+
+    /// Return true if it's free to truncate a value of
+    /// type Ty1 to type Ty2. e.g. On x86 it's free to truncate an i32 value
+    /// in register EAX to i16 by referencing its sub-register AX.
+ bool isTruncateFree(Type *Ty1, Type *Ty2) const override;
+ bool isTruncateFree(EVT VT1, EVT VT2) const override;
+
+ bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override;
+
+    /// Return true if any actual instruction that defines a
+    /// value of type Ty1 implicitly zero-extends the value to Ty2 in the
+    /// result register. This does not necessarily include registers defined
+    /// in unknown ways, such as incoming arguments, or copies from unknown
+    /// virtual registers. Also, if isTruncateFree(Ty2, Ty1) is true, this
+    /// does not necessarily apply to truncate instructions. e.g. on x86-64,
+    /// all instructions that define 32-bit values implicitly zero-extend the
+    /// result out to 64 bits.
+ bool isZExtFree(Type *Ty1, Type *Ty2) const override;
+ bool isZExtFree(EVT VT1, EVT VT2) const override;
+ bool isZExtFree(SDValue Val, EVT VT2) const override;
+
+ /// Return true if folding a vector load into ExtVal (a sign, zero, or any
+ /// extend node) is profitable.
+ bool isVectorLoadExtDesirable(SDValue) const override;
+
+ /// Return true if an FMA operation is faster than a pair of fmul and fadd
+ /// instructions. fmuladd intrinsics will be expanded to FMAs when this
+ /// method returns true, otherwise fmuladd is expanded to fmul + fadd.
+ bool isFMAFasterThanFMulAndFAdd(EVT VT) const override;
+
+ /// Return true if it's profitable to narrow
+ /// operations of type VT1 to VT2. e.g. on x86, it's profitable to narrow
+ /// from i32 to i8 but not from i32 to i16.
+ bool isNarrowingProfitable(EVT VT1, EVT VT2) const override;
+
+ /// Returns true if the target can instruction select the
+ /// specified FP immediate natively. If false, the legalizer will
+ /// materialize the FP immediate as a load from a constant pool.
+ bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
+
+ /// Targets can use this to indicate that they only support *some*
+ /// VECTOR_SHUFFLE operations, those with specific masks. By default, if a
+ /// target supports the VECTOR_SHUFFLE node, all mask values are assumed to
+ /// be legal.
+ bool isShuffleMaskLegal(const SmallVectorImpl<int> &Mask,
+ EVT VT) const override;
+
+    /// Similar to isShuffleMaskLegal. Targets can use this to indicate
+    /// whether there is a suitable VECTOR_SHUFFLE that can be used to
+    /// replace a VAND with a constant pool entry.
+ bool isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
+ EVT VT) const override;
+
+ /// If true, then instruction selection should
+ /// seek to shrink the FP constant of the specified type to a smaller type
+ /// in order to save space and / or reduce runtime.
+ bool ShouldShrinkFPConstant(EVT VT) const override {
+ // Don't shrink FP constpool if SSE2 is available since cvtss2sd is more
+ // expensive than a straight movsd. On the other hand, it's important to
+ // shrink long double fp constant since fldt is very slow.
+ return !X86ScalarSSEf64 || VT == MVT::f80;
+ }
+
+ /// Return true if we believe it is correct and profitable to reduce the
+ /// load node to a smaller type.
+ bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy,
+ EVT NewVT) const override;
+
+ /// Return true if the specified scalar FP type is computed in an SSE
+ /// register, not on the X87 floating point stack.
+ bool isScalarFPTypeInSSEReg(EVT VT) const {
+      return (VT == MVT::f64 && X86ScalarSSEf64) || // f64 is legal with SSE2
+             (VT == MVT::f32 && X86ScalarSSEf32);   // f32 is legal with SSE1
+ }
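+    // For example, with SSE2 enabled both f32 and f64 are computed in XMM
+    // registers, leaving only f80 on the x87 stack.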
+
+ /// \brief Returns true if it is beneficial to convert a load of a constant
+ /// to just the constant itself.
+ bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
+ Type *Ty) const override;
+
+ /// Return true if EXTRACT_SUBVECTOR is cheap for this result type
+ /// with this index.
+ bool isExtractSubvectorCheap(EVT ResVT, unsigned Index) const override;
+
+    /// Intel processors have a unified instruction and data cache, so no
+    /// cache-clearing builtin is needed.
+ const char * getClearCacheBuiltinName() const override {
+ return nullptr; // nothing to do, move along.
+ }
+
+ unsigned getRegisterByName(const char* RegName, EVT VT,
+ SelectionDAG &DAG) const override;
+
+ /// If a physical register, this returns the register that receives the
+ /// exception address on entry to an EH pad.
+ unsigned
+ getExceptionPointerRegister(const Constant *PersonalityFn) const override;
+
+ /// If a physical register, this returns the register that receives the
+ /// exception typeid on entry to a landing pad.
+ unsigned
+ getExceptionSelectorRegister(const Constant *PersonalityFn) const override;
+
+ /// This method returns a target specific FastISel object,
+ /// or null if the target does not support "fast" ISel.
+ FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
+ const TargetLibraryInfo *libInfo) const override;
+
+ /// Return true if the target stores stack protector cookies at a fixed
+ /// offset in some non-standard address space, and populates the address
+ /// space and offset as appropriate.
+ bool getStackCookieLocation(unsigned &AddressSpace,
+ unsigned &Offset) const override;
+
+ /// Return true if the target stores SafeStack pointer at a fixed offset in
+ /// some non-standard address space, and populates the address space and
+ /// offset as appropriate.
+ Value *getSafeStackPointerLocation(IRBuilder<> &IRB) const override;
+
+ SDValue BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, SDValue StackSlot,
+ SelectionDAG &DAG) const;
+
+ bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override;
+
+ bool useLoadStackGuardNode() const override;
+ /// \brief Customize the preferred legalization strategy for certain types.
+ LegalizeTypeAction getPreferredVectorAction(EVT VT) const override;
+
+ bool isIntDivCheap(EVT VT, AttributeSet Attr) const override;
+
+ protected:
+ std::pair<const TargetRegisterClass *, uint8_t>
+ findRepresentativeClass(const TargetRegisterInfo *TRI,
+ MVT VT) const override;
+
+ private:
+ /// Keep a pointer to the X86Subtarget around so that we can
+ /// make the right decision when generating code for different targets.
+ const X86Subtarget *Subtarget;
+
+    /// Select between SSE and x87 floating point ops.
+ /// When SSE is available, use it for f32 operations.
+ /// When SSE2 is available, use it for f64 operations.
+ bool X86ScalarSSEf32;
+ bool X86ScalarSSEf64;
+
+ /// A list of legal FP immediates.
+ std::vector<APFloat> LegalFPImmediates;
+
+ /// Indicate that this x86 target can instruction
+ /// select the specified FP immediate natively.
+ void addLegalFPImmediate(const APFloat& Imm) {
+ LegalFPImmediates.push_back(Imm);
+ }
+
+ SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ SDLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const;
+ SDValue LowerMemArgument(SDValue Chain,
+ CallingConv::ID CallConv,
+ const SmallVectorImpl<ISD::InputArg> &ArgInfo,
+ SDLoc dl, SelectionDAG &DAG,
+ const CCValAssign &VA, MachineFrameInfo *MFI,
+ unsigned i) const;
+ SDValue LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg,
+ SDLoc dl, SelectionDAG &DAG,
+ const CCValAssign &VA,
+ ISD::ArgFlagsTy Flags) const;
+
+ // Call lowering helpers.
+
+ /// Check whether the call is eligible for tail call optimization. Targets
+ /// that want to do tail call optimization should implement this function.
+ bool IsEligibleForTailCallOptimization(SDValue Callee,
+ CallingConv::ID CalleeCC,
+ bool isVarArg,
+ bool isCalleeStructRet,
+ bool isCallerStructRet,
+ Type *RetTy,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ SelectionDAG& DAG) const;
+ SDValue EmitTailCallLoadRetAddr(SelectionDAG &DAG, SDValue &OutRetAddr,
+ SDValue Chain, bool IsTailCall, bool Is64Bit,
+ int FPDiff, SDLoc dl) const;
+
+ unsigned GetAlignedArgumentStackSize(unsigned StackSize,
+ SelectionDAG &DAG) const;
+
+ std::pair<SDValue,SDValue> FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
+ bool isSigned,
+ bool isReplace) const;
+
+ SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVSELECT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const;
+ SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const;
+
+ SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerGlobalAddress(const GlobalValue *GV, SDLoc dl,
+ int64_t Offset, SelectionDAG &DAG) const;
+ SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerToBT(SDValue And, ISD::CondCode CC,
+ SDLoc dl, SelectionDAG &DAG) const;
+ SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSETCCE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFRAME_TO_ARGS_OFFSET(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerGC_TRANSITION_START(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerGC_TRANSITION_END(SDValue Op, SelectionDAG &DAG) const;
+
+ SDValue
+ LowerFormalArguments(SDValue Chain,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ SDLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const override;
+ SDValue LowerCall(CallLoweringInfo &CLI,
+ SmallVectorImpl<SDValue> &InVals) const override;
+
+ SDValue LowerReturn(SDValue Chain,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ SDLoc dl, SelectionDAG &DAG) const override;
+
+ bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override;
+
+ bool mayBeEmittedAsTailCall(CallInst *CI) const override;
+
+ EVT getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT,
+ ISD::NodeType ExtendKind) const override;
+
+ bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ LLVMContext &Context) const override;
+
+ const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override;
+
+ TargetLoweringBase::AtomicExpansionKind
+ shouldExpandAtomicLoadInIR(LoadInst *SI) const override;
+ bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override;
+ TargetLoweringBase::AtomicExpansionKind
+ shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override;
+
+ LoadInst *
+ lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override;
+
+ bool needsCmpXchgNb(Type *MemType) const;
+
+ // Utility function to emit the low-level va_arg code for X86-64.
+ MachineBasicBlock *EmitVAARG64WithCustomInserter(
+ MachineInstr *MI,
+ MachineBasicBlock *MBB) const;
+
+ /// Utility function to emit the xmm reg save portion of va_start.
+ MachineBasicBlock *EmitVAStartSaveXMMRegsWithCustomInserter(
+ MachineInstr *BInstr,
+ MachineBasicBlock *BB) const;
+
+ MachineBasicBlock *EmitLoweredSelect(MachineInstr *I,
+ MachineBasicBlock *BB) const;
+
+ MachineBasicBlock *EmitLoweredAtomicFP(MachineInstr *I,
+ MachineBasicBlock *BB) const;
+
+ MachineBasicBlock *EmitLoweredWinAlloca(MachineInstr *MI,
+ MachineBasicBlock *BB) const;
+
+ MachineBasicBlock *EmitLoweredCatchRet(MachineInstr *MI,
+ MachineBasicBlock *BB) const;
+
+ MachineBasicBlock *EmitLoweredCatchPad(MachineInstr *MI,
+ MachineBasicBlock *BB) const;
+
+ MachineBasicBlock *EmitLoweredSegAlloca(MachineInstr *MI,
+ MachineBasicBlock *BB) const;
+
+ MachineBasicBlock *EmitLoweredTLSCall(MachineInstr *MI,
+ MachineBasicBlock *BB) const;
+
+ MachineBasicBlock *emitEHSjLjSetJmp(MachineInstr *MI,
+ MachineBasicBlock *MBB) const;
+
+ MachineBasicBlock *emitEHSjLjLongJmp(MachineInstr *MI,
+ MachineBasicBlock *MBB) const;
+
+ MachineBasicBlock *emitFMA3Instr(MachineInstr *MI,
+ MachineBasicBlock *MBB) const;
+
+ /// Emit nodes that will be selected as "test Op0,Op0", or something
+ /// equivalent, for use with the given x86 condition code.
+ SDValue EmitTest(SDValue Op0, unsigned X86CC, SDLoc dl,
+ SelectionDAG &DAG) const;
+
+ /// Emit nodes that will be selected as "cmp Op0,Op1", or something
+ /// equivalent, for use with the given x86 condition code.
+ SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, SDLoc dl,
+ SelectionDAG &DAG) const;
+
+ /// Convert a comparison if required by the subtarget.
+ SDValue ConvertCmpIfNecessary(SDValue Cmp, SelectionDAG &DAG) const;
+
+ /// Use rsqrt* to speed up sqrt calculations.
+ SDValue getRsqrtEstimate(SDValue Operand, DAGCombinerInfo &DCI,
+ unsigned &RefinementSteps,
+ bool &UseOneConstNR) const override;
+
+ /// Use rcp* to speed up fdiv calculations.
+ SDValue getRecipEstimate(SDValue Operand, DAGCombinerInfo &DCI,
+ unsigned &RefinementSteps) const override;
+
+ /// Reassociate floating point divisions into multiply by reciprocal.
+ unsigned combineRepeatedFPDivisors() const override;
+ };
+
+ namespace X86 {
+ FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
+ const TargetLibraryInfo *libInfo);
+ }
+}
+
+#endif // X86ISELLOWERING_H
diff --git a/contrib/llvm/lib/Target/X86/X86Instr3DNow.td b/contrib/llvm/lib/Target/X86/X86Instr3DNow.td
new file mode 100644
index 0000000..ba1aede
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86Instr3DNow.td
@@ -0,0 +1,103 @@
+//===-- X86Instr3DNow.td - The 3DNow! Instruction Set ------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the 3DNow! instruction set, which extends MMX to support
+// floating point and also adds a few more random instructions for good measure.
+//
+//===----------------------------------------------------------------------===//
+
+class I3DNow<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pat>
+ : I<o, F, outs, ins, asm, pat>, TB, Requires<[Has3DNow]> {
+}
+
+class I3DNow_binop<bits<8> o, Format F, dag ins, string Mnemonic, list<dag> pat>
+ : I3DNow<o, F, (outs VR64:$dst), ins,
+ !strconcat(Mnemonic, "\t{$src2, $dst|$dst, $src2}"), pat>,
+ Has3DNow0F0FOpcode {
+ // FIXME: The disassembler doesn't support Has3DNow0F0FOpcode yet.
+ let isAsmParserOnly = 1;
+ let Constraints = "$src1 = $dst";
+}
+
+class I3DNow_conv<bits<8> o, Format F, dag ins, string Mnemonic, list<dag> pat>
+ : I3DNow<o, F, (outs VR64:$dst), ins,
+ !strconcat(Mnemonic, "\t{$src, $dst|$dst, $src}"), pat>,
+ Has3DNow0F0FOpcode {
+ // FIXME: The disassembler doesn't support Has3DNow0F0FOpcode yet.
+ let isAsmParserOnly = 1;
+}
+
+multiclass I3DNow_binop_rm<bits<8> opc, string Mn> {
+ def rr : I3DNow_binop<opc, MRMSrcReg, (ins VR64:$src1, VR64:$src2), Mn, []>;
+ def rm : I3DNow_binop<opc, MRMSrcMem, (ins VR64:$src1, i64mem:$src2), Mn, []>;
+}
+
+multiclass I3DNow_binop_rm_int<bits<8> opc, string Mn, string Ver = ""> {
+ def rr : I3DNow_binop<opc, MRMSrcReg, (ins VR64:$src1, VR64:$src2), Mn,
+ [(set VR64:$dst, (!cast<Intrinsic>(
+ !strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src1, VR64:$src2))]>;
+ def rm : I3DNow_binop<opc, MRMSrcMem, (ins VR64:$src1, i64mem:$src2), Mn,
+ [(set VR64:$dst, (!cast<Intrinsic>(
+ !strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src1,
+ (bitconvert (load_mmx addr:$src2))))]>;
+}
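+
+// For example, "defm PFADD : I3DNow_binop_rm_int<0x9E, "pfadd">" below expands
+// to records named PFADDrr and PFADDrm, both selected from the
+// int_x86_3dnow_pfadd intrinsic.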
+
+multiclass I3DNow_conv_rm<bits<8> opc, string Mn> {
+ def rr : I3DNow_conv<opc, MRMSrcReg, (ins VR64:$src1), Mn, []>;
+ def rm : I3DNow_conv<opc, MRMSrcMem, (ins i64mem:$src1), Mn, []>;
+}
+
+multiclass I3DNow_conv_rm_int<bits<8> opc, string Mn, string Ver = ""> {
+ def rr : I3DNow_conv<opc, MRMSrcReg, (ins VR64:$src), Mn,
+ [(set VR64:$dst, (!cast<Intrinsic>(
+ !strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src))]>;
+ def rm : I3DNow_conv<opc, MRMSrcMem, (ins i64mem:$src), Mn,
+ [(set VR64:$dst, (!cast<Intrinsic>(
+ !strconcat("int_x86_3dnow", Ver, "_", Mn))
+ (bitconvert (load_mmx addr:$src))))]>;
+}
+
+defm PAVGUSB : I3DNow_binop_rm_int<0xBF, "pavgusb">;
+defm PF2ID : I3DNow_conv_rm_int<0x1D, "pf2id">;
+defm PFACC : I3DNow_binop_rm_int<0xAE, "pfacc">;
+defm PFADD : I3DNow_binop_rm_int<0x9E, "pfadd">;
+defm PFCMPEQ : I3DNow_binop_rm_int<0xB0, "pfcmpeq">;
+defm PFCMPGE : I3DNow_binop_rm_int<0x90, "pfcmpge">;
+defm PFCMPGT : I3DNow_binop_rm_int<0xA0, "pfcmpgt">;
+defm PFMAX : I3DNow_binop_rm_int<0xA4, "pfmax">;
+defm PFMIN : I3DNow_binop_rm_int<0x94, "pfmin">;
+defm PFMUL : I3DNow_binop_rm_int<0xB4, "pfmul">;
+defm PFRCP : I3DNow_conv_rm_int<0x96, "pfrcp">;
+defm PFRCPIT1 : I3DNow_binop_rm_int<0xA6, "pfrcpit1">;
+defm PFRCPIT2 : I3DNow_binop_rm_int<0xB6, "pfrcpit2">;
+defm PFRSQIT1 : I3DNow_binop_rm_int<0xA7, "pfrsqit1">;
+defm PFRSQRT : I3DNow_conv_rm_int<0x97, "pfrsqrt">;
+defm PFSUB : I3DNow_binop_rm_int<0x9A, "pfsub">;
+defm PFSUBR : I3DNow_binop_rm_int<0xAA, "pfsubr">;
+defm PI2FD : I3DNow_conv_rm_int<0x0D, "pi2fd">;
+defm PMULHRW : I3DNow_binop_rm_int<0xB7, "pmulhrw">;
+
+
+def FEMMS : I3DNow<0x0E, RawFrm, (outs), (ins), "femms",
+ [(int_x86_mmx_femms)]>;
+
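+// In the prefetch patterns below, the operands after the address are
+// (rw, locality, cache type): PREFETCH matches reads (rw = 0) of any locality
+// into the data cache, while PREFETCHW matches writes (rw = 1) with maximal
+// locality (3).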
+def PREFETCH : I3DNow<0x0D, MRM0m, (outs), (ins i8mem:$addr),
+ "prefetch\t$addr",
+ [(prefetch addr:$addr, (i32 0), imm, (i32 1))]>;
+
+def PREFETCHW : I<0x0D, MRM1m, (outs), (ins i8mem:$addr), "prefetchw\t$addr",
+ [(prefetch addr:$addr, (i32 1), (i32 3), (i32 1))]>, TB,
+ Requires<[HasPrefetchW]>;
+
+// "3DNowA" instructions
+defm PF2IW : I3DNow_conv_rm_int<0x1C, "pf2iw", "a">;
+defm PI2FW : I3DNow_conv_rm_int<0x0C, "pi2fw", "a">;
+defm PFNACC : I3DNow_binop_rm_int<0x8A, "pfnacc", "a">;
+defm PFPNACC : I3DNow_binop_rm_int<0x8E, "pfpnacc", "a">;
+defm PSWAPD : I3DNow_conv_rm_int<0xBB, "pswapd", "a">;
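+
+// With Ver = "a", the multiclasses above concatenate to the int_x86_3dnowa_*
+// intrinsics (e.g. int_x86_3dnowa_pswapd for PSWAPD).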
diff --git a/contrib/llvm/lib/Target/X86/X86InstrAVX512.td b/contrib/llvm/lib/Target/X86/X86InstrAVX512.td
new file mode 100644
index 0000000..0a27c33
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -0,0 +1,7519 @@
+//===-- X86InstrAVX512.td - AVX512 Instruction Set ---------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 AVX512 instruction set, defining the
+// instructions, and properties of the instructions which are needed for code
+// generation, machine code emission, and analysis.
+//
+//===----------------------------------------------------------------------===//
+
+// Group template arguments that can be derived from the vector type (EltNum x
+// EltVT). These are things like the register class for the writemask, etc.
+// The idea is to pass one of these as the template argument rather than the
+// individual arguments.
+// The template is also used for scalar types; in that case NumElts is 1.
+class X86VectorVTInfo<int numelts, ValueType eltvt, RegisterClass rc,
+ string suffix = ""> {
+ RegisterClass RC = rc;
+ ValueType EltVT = eltvt;
+ int NumElts = numelts;
+
+ // Corresponding mask register class.
+ RegisterClass KRC = !cast<RegisterClass>("VK" # NumElts);
+
+ // Corresponding write-mask register class.
+ RegisterClass KRCWM = !cast<RegisterClass>("VK" # NumElts # "WM");
+
+ // The GPR register class that can hold the write mask. Use GR8 for fewer
+ // than 8 elements. Use shift-right and equal to work around the lack of
+ // !lt in tablegen.
+ RegisterClass MRC =
+ !cast<RegisterClass>("GR" #
+ !if (!eq (!srl(NumElts, 3), 0), 8, NumElts));
+
+ // Suffix used in the instruction mnemonic.
+ string Suffix = suffix;
+
+  // VTName is a string name for the vector VT. For vector types it will be
+  // v # NumElts # EltVT, so for a vector of 8 elements of i32 it will be
+  // "v8i32". It is a little more complex for scalar types, where NumElts = 1:
+  // in that case we build v4f32 or v2f64.
+  string VTName = "v" # !if (!eq (NumElts, 1),
+                        !if (!eq (EltVT.Size, 32), 4,
+                        !if (!eq (EltVT.Size, 64), 2, NumElts)), NumElts) # EltVT;
+
+ // The vector VT.
+ ValueType VT = !cast<ValueType>(VTName);
+
+ string EltTypeName = !cast<string>(EltVT);
+ // Size of the element type in bits, e.g. 32 for v16i32.
+ string EltSizeName = !subst("i", "", !subst("f", "", EltTypeName));
+ int EltSize = EltVT.Size;
+
+ // "i" for integer types and "f" for floating-point types
+ string TypeVariantName = !subst(EltSizeName, "", EltTypeName);
+
+ // Size of RC in bits, e.g. 512 for VR512.
+ int Size = VT.Size;
+
+ // The corresponding memory operand, e.g. i512mem for VR512.
+ X86MemOperand MemOp = !cast<X86MemOperand>(TypeVariantName # Size # "mem");
+ X86MemOperand ScalarMemOp = !cast<X86MemOperand>(EltVT # "mem");
+
+ // Load patterns
+ // Note: For 128/256-bit integer VT we choose loadv2i64/loadv4i64
+ // due to load promotion during legalization
+ PatFrag LdFrag = !cast<PatFrag>("load" #
+ !if (!eq (TypeVariantName, "i"),
+ !if (!eq (Size, 128), "v2i64",
+ !if (!eq (Size, 256), "v4i64",
+ VTName)), VTName));
+
+ PatFrag AlignedLdFrag = !cast<PatFrag>("alignedload" #
+ !if (!eq (TypeVariantName, "i"),
+ !if (!eq (Size, 128), "v2i64",
+ !if (!eq (Size, 256), "v4i64",
+ !if (!eq (Size, 512),
+ !if (!eq (EltSize, 64), "v8i64", "v16i32"),
+ VTName))), VTName));
+
+ PatFrag ScalarLdFrag = !cast<PatFrag>("load" # EltVT);
+
+ // The corresponding float type, e.g. v16f32 for v16i32
+ // Note: For EltSize < 32, FloatVT is illegal and TableGen
+ // fails to compile, so we choose FloatVT = VT
+ ValueType FloatVT = !cast<ValueType>(
+ !if (!eq (!srl(EltSize,5),0),
+ VTName,
+ !if (!eq(TypeVariantName, "i"),
+ "v" # NumElts # "f" # EltSize,
+ VTName)));
+
+ // The string to specify embedded broadcast in assembly.
+ string BroadcastStr = "{1to" # NumElts # "}";
+
+ // 8-bit compressed displacement tuple/subvector format. This is only
+ // defined for NumElts <= 8.
+ CD8VForm CD8TupleForm = !if (!eq (!srl(NumElts, 4), 0),
+ !cast<CD8VForm>("CD8VT" # NumElts), ?);
+
+ SubRegIndex SubRegIdx = !if (!eq (Size, 128), sub_xmm,
+ !if (!eq (Size, 256), sub_ymm, ?));
+
+ Domain ExeDomain = !if (!eq (EltTypeName, "f32"), SSEPackedSingle,
+ !if (!eq (EltTypeName, "f64"), SSEPackedDouble,
+ SSEPackedInt));
+
+ RegisterClass FRC = !if (!eq (EltTypeName, "f32"), FR32X, FR64X);
+
+ // A vector type of the same width with element type i32. This is used to
+ // create the canonical constant zero node ImmAllZerosV.
+ ValueType i32VT = !cast<ValueType>("v" # !srl(Size, 5) # "i32");
+ dag ImmAllZerosV = (VT (bitconvert (i32VT immAllZerosV)));
+
+ string ZSuffix = !if (!eq (Size, 128), "Z128",
+ !if (!eq (Size, 256), "Z256", "Z"));
+}
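+
+// Illustrative expansion of the class above: v16i32_info (defined below)
+// resolves to KRC = VK16, KRCWM = VK16WM, VTName = "v16i32", MemOp = i512mem,
+// BroadcastStr = "{1to16}" and ZSuffix = "Z".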
+
+def v64i8_info : X86VectorVTInfo<64, i8, VR512, "b">;
+def v32i16_info : X86VectorVTInfo<32, i16, VR512, "w">;
+def v16i32_info : X86VectorVTInfo<16, i32, VR512, "d">;
+def v8i64_info : X86VectorVTInfo<8, i64, VR512, "q">;
+def v16f32_info : X86VectorVTInfo<16, f32, VR512, "ps">;
+def v8f64_info : X86VectorVTInfo<8, f64, VR512, "pd">;
+
+// "x" in v32i8x_info means RC = VR256X
+def v32i8x_info : X86VectorVTInfo<32, i8, VR256X, "b">;
+def v16i16x_info : X86VectorVTInfo<16, i16, VR256X, "w">;
+def v8i32x_info : X86VectorVTInfo<8, i32, VR256X, "d">;
+def v4i64x_info : X86VectorVTInfo<4, i64, VR256X, "q">;
+def v8f32x_info : X86VectorVTInfo<8, f32, VR256X, "ps">;
+def v4f64x_info : X86VectorVTInfo<4, f64, VR256X, "pd">;
+
+def v16i8x_info : X86VectorVTInfo<16, i8, VR128X, "b">;
+def v8i16x_info : X86VectorVTInfo<8, i16, VR128X, "w">;
+def v4i32x_info : X86VectorVTInfo<4, i32, VR128X, "d">;
+def v2i64x_info : X86VectorVTInfo<2, i64, VR128X, "q">;
+def v4f32x_info : X86VectorVTInfo<4, f32, VR128X, "ps">;
+def v2f64x_info : X86VectorVTInfo<2, f64, VR128X, "pd">;
+
+// We map scalar types to the smallest (128-bit) vector type
+// with the appropriate element type. This allows us to use the same masking
+// logic.
+def i32x_info : X86VectorVTInfo<1, i32, GR32, "si">;
+def i64x_info : X86VectorVTInfo<1, i64, GR64, "sq">;
+def f32x_info : X86VectorVTInfo<1, f32, VR128X, "ss">;
+def f64x_info : X86VectorVTInfo<1, f64, VR128X, "sd">;
+
+class AVX512VLVectorVTInfo<X86VectorVTInfo i512, X86VectorVTInfo i256,
+ X86VectorVTInfo i128> {
+ X86VectorVTInfo info512 = i512;
+ X86VectorVTInfo info256 = i256;
+ X86VectorVTInfo info128 = i128;
+}
+
+def avx512vl_i8_info : AVX512VLVectorVTInfo<v64i8_info, v32i8x_info,
+ v16i8x_info>;
+def avx512vl_i16_info : AVX512VLVectorVTInfo<v32i16_info, v16i16x_info,
+ v8i16x_info>;
+def avx512vl_i32_info : AVX512VLVectorVTInfo<v16i32_info, v8i32x_info,
+ v4i32x_info>;
+def avx512vl_i64_info : AVX512VLVectorVTInfo<v8i64_info, v4i64x_info,
+ v2i64x_info>;
+def avx512vl_f32_info : AVX512VLVectorVTInfo<v16f32_info, v8f32x_info,
+ v4f32x_info>;
+def avx512vl_f64_info : AVX512VLVectorVTInfo<v8f64_info, v4f64x_info,
+ v2f64x_info>;
+
+// This multiclass generates the masking variants from the non-masking
+// variant. It only provides the assembly pieces for the masking variants.
+// It assumes custom ISel patterns for masking which can be provided as
+// template arguments.
+multiclass AVX512_maskable_custom<bits<8> O, Format F,
+ dag Outs,
+ dag Ins, dag MaskingIns, dag ZeroMaskingIns,
+ string OpcodeStr,
+ string AttSrcAsm, string IntelSrcAsm,
+ list<dag> Pattern,
+ list<dag> MaskingPattern,
+ list<dag> ZeroMaskingPattern,
+ string MaskingConstraint = "",
+ InstrItinClass itin = NoItinerary,
+ bit IsCommutable = 0> {
+ let isCommutable = IsCommutable in
+ def NAME: AVX512<O, F, Outs, Ins,
+ OpcodeStr#"\t{"#AttSrcAsm#", $dst|"#
+ "$dst , "#IntelSrcAsm#"}",
+ Pattern, itin>;
+
+ // Prefer over VMOV*rrk Pat<>
+ let AddedComplexity = 20 in
+ def NAME#k: AVX512<O, F, Outs, MaskingIns,
+ OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}}|"#
+ "$dst {${mask}}, "#IntelSrcAsm#"}",
+ MaskingPattern, itin>,
+ EVEX_K {
+ // In case of the 3src subclass this is overridden with a let.
+ string Constraints = MaskingConstraint;
+ }
+ let AddedComplexity = 30 in // Prefer over VMOV*rrkz Pat<>
+ def NAME#kz: AVX512<O, F, Outs, ZeroMaskingIns,
+ OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}} {z}|"#
+ "$dst {${mask}} {z}, "#IntelSrcAsm#"}",
+ ZeroMaskingPattern,
+ itin>,
+ EVEX_KZ;
+}
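+
+// As an illustration, each instantiation of this multiclass yields three
+// records: NAME (unmasked), NAME#k (merge-masking, "$dst {${mask}}") and
+// NAME#kz (zero-masking, "$dst {${mask}} {z}"), corresponding to e.g. the
+// AT&T forms "vaddps %zmm2, %zmm1, %zmm0", "... %zmm0 {%k1}" and
+// "... %zmm0 {%k1} {z}".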
+
+
+// Common base class of AVX512_maskable and AVX512_maskable_3src.
+multiclass AVX512_maskable_common<bits<8> O, Format F, X86VectorVTInfo _,
+ dag Outs,
+ dag Ins, dag MaskingIns, dag ZeroMaskingIns,
+ string OpcodeStr,
+ string AttSrcAsm, string IntelSrcAsm,
+ dag RHS, dag MaskingRHS,
+ SDNode Select = vselect,
+ string MaskingConstraint = "",
+ InstrItinClass itin = NoItinerary,
+ bit IsCommutable = 0> :
+ AVX512_maskable_custom<O, F, Outs, Ins, MaskingIns, ZeroMaskingIns, OpcodeStr,
+ AttSrcAsm, IntelSrcAsm,
+ [(set _.RC:$dst, RHS)],
+ [(set _.RC:$dst, MaskingRHS)],
+ [(set _.RC:$dst,
+ (Select _.KRCWM:$mask, RHS, _.ImmAllZerosV))],
+ MaskingConstraint, NoItinerary, IsCommutable>;
+
+// This multiclass generates the unconditional/non-masking, the masking and
+// the zero-masking variant of the vector instruction. In the masking case, the
+// preserved vector elements come from a new dummy input operand tied to $dst.
+multiclass AVX512_maskable<bits<8> O, Format F, X86VectorVTInfo _,
+ dag Outs, dag Ins, string OpcodeStr,
+ string AttSrcAsm, string IntelSrcAsm,
+ dag RHS,
+ InstrItinClass itin = NoItinerary,
+ bit IsCommutable = 0> :
+ AVX512_maskable_common<O, F, _, Outs, Ins,
+ !con((ins _.RC:$src0, _.KRCWM:$mask), Ins),
+ !con((ins _.KRCWM:$mask), Ins),
+ OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS,
+ (vselect _.KRCWM:$mask, RHS, _.RC:$src0), vselect,
+ "$src0 = $dst", itin, IsCommutable>;
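+// Note: the !con calls above prepend the pass-through and mask operands, so
+// e.g. (ins _.RC:$src1, _.RC:$src2) becomes
+// (ins _.RC:$src0, _.KRCWM:$mask, _.RC:$src1, _.RC:$src2) for the
+// merge-masking variant.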
+
+// This multiclass generates the unconditional/non-masking, the masking and
+// the zero-masking variant of the scalar instruction.
+multiclass AVX512_maskable_scalar<bits<8> O, Format F, X86VectorVTInfo _,
+ dag Outs, dag Ins, string OpcodeStr,
+ string AttSrcAsm, string IntelSrcAsm,
+ dag RHS,
+ InstrItinClass itin = NoItinerary,
+ bit IsCommutable = 0> :
+ AVX512_maskable_common<O, F, _, Outs, Ins,
+ !con((ins _.RC:$src0, _.KRCWM:$mask), Ins),
+ !con((ins _.KRCWM:$mask), Ins),
+ OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS,
+ (X86select _.KRCWM:$mask, RHS, _.RC:$src0), X86select,
+ "$src0 = $dst", itin, IsCommutable>;
+
+// Similar to AVX512_maskable but in this case one of the source operands
+// ($src1) is already tied to $dst so we just use that for the preserved
+// vector elements. NOTE that the NonTiedIns (the ins dag) should exclude
+// $src1.
+multiclass AVX512_maskable_3src<bits<8> O, Format F, X86VectorVTInfo _,
+ dag Outs, dag NonTiedIns, string OpcodeStr,
+ string AttSrcAsm, string IntelSrcAsm,
+ dag RHS> :
+ AVX512_maskable_common<O, F, _, Outs,
+ !con((ins _.RC:$src1), NonTiedIns),
+ !con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns),
+ !con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns),
+ OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS,
+ (vselect _.KRCWM:$mask, RHS, _.RC:$src1)>;
+
+// Similar to AVX512_maskable_3src but in this case the input VT for the tied
+// operand differs from the output VT. This requires a bitconvert on
+// the preserved vector going into the vselect.
+multiclass AVX512_maskable_3src_cast<bits<8> O, Format F, X86VectorVTInfo OutVT,
+ X86VectorVTInfo InVT,
+ dag Outs, dag NonTiedIns, string OpcodeStr,
+ string AttSrcAsm, string IntelSrcAsm,
+ dag RHS> :
+ AVX512_maskable_common<O, F, OutVT, Outs,
+ !con((ins InVT.RC:$src1), NonTiedIns),
+ !con((ins InVT.RC:$src1, InVT.KRCWM:$mask), NonTiedIns),
+ !con((ins InVT.RC:$src1, InVT.KRCWM:$mask), NonTiedIns),
+ OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS,
+ (vselect InVT.KRCWM:$mask, RHS,
+ (bitconvert InVT.RC:$src1))>;
+
+multiclass AVX512_maskable_3src_scalar<bits<8> O, Format F, X86VectorVTInfo _,
+ dag Outs, dag NonTiedIns, string OpcodeStr,
+ string AttSrcAsm, string IntelSrcAsm,
+ dag RHS> :
+ AVX512_maskable_common<O, F, _, Outs,
+ !con((ins _.RC:$src1), NonTiedIns),
+ !con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns),
+ !con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns),
+ OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS,
+ (X86select _.KRCWM:$mask, RHS, _.RC:$src1)>;
+
+multiclass AVX512_maskable_in_asm<bits<8> O, Format F, X86VectorVTInfo _,
+ dag Outs, dag Ins,
+ string OpcodeStr,
+ string AttSrcAsm, string IntelSrcAsm,
+ list<dag> Pattern> :
+ AVX512_maskable_custom<O, F, Outs, Ins,
+ !con((ins _.RC:$src0, _.KRCWM:$mask), Ins),
+ !con((ins _.KRCWM:$mask), Ins),
+ OpcodeStr, AttSrcAsm, IntelSrcAsm, Pattern, [], [],
+ "$src0 = $dst">;
+
+
+// Instructions with a mask that put the result in a mask register, like
+// "compare" and "vptest".
+multiclass AVX512_maskable_custom_cmp<bits<8> O, Format F,
+ dag Outs,
+ dag Ins, dag MaskingIns,
+ string OpcodeStr,
+ string AttSrcAsm, string IntelSrcAsm,
+ list<dag> Pattern,
+ list<dag> MaskingPattern,
+ string Round = "",
+ InstrItinClass itin = NoItinerary> {
+ def NAME: AVX512<O, F, Outs, Ins,
+ OpcodeStr#"\t{"#AttSrcAsm#", $dst "#Round#"|"#
+ "$dst "#Round#", "#IntelSrcAsm#"}",
+ Pattern, itin>;
+
+ def NAME#k: AVX512<O, F, Outs, MaskingIns,
+ OpcodeStr#"\t{"#Round#AttSrcAsm#", $dst {${mask}}|"#
+ "$dst {${mask}}, "#IntelSrcAsm#Round#"}",
+ MaskingPattern, itin>, EVEX_K;
+}
+
+multiclass AVX512_maskable_common_cmp<bits<8> O, Format F, X86VectorVTInfo _,
+ dag Outs,
+ dag Ins, dag MaskingIns,
+ string OpcodeStr,
+ string AttSrcAsm, string IntelSrcAsm,
+ dag RHS, dag MaskingRHS,
+ string Round = "",
+ InstrItinClass itin = NoItinerary> :
+ AVX512_maskable_custom_cmp<O, F, Outs, Ins, MaskingIns, OpcodeStr,
+ AttSrcAsm, IntelSrcAsm,
+ [(set _.KRC:$dst, RHS)],
+ [(set _.KRC:$dst, MaskingRHS)],
+ Round, NoItinerary>;
+
+multiclass AVX512_maskable_cmp<bits<8> O, Format F, X86VectorVTInfo _,
+ dag Outs, dag Ins, string OpcodeStr,
+ string AttSrcAsm, string IntelSrcAsm,
+ dag RHS, string Round = "",
+ InstrItinClass itin = NoItinerary> :
+ AVX512_maskable_common_cmp<O, F, _, Outs, Ins,
+ !con((ins _.KRCWM:$mask), Ins),
+ OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS,
+ (and _.KRCWM:$mask, RHS),
+ Round, itin>;
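+
+// Note: for the compare family the masked result is (and $mask, RHS); the
+// writemask zeroes out lanes of the k-register result rather than merging.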
+
+multiclass AVX512_maskable_cmp_alt<bits<8> O, Format F, X86VectorVTInfo _,
+ dag Outs, dag Ins, string OpcodeStr,
+ string AttSrcAsm, string IntelSrcAsm> :
+ AVX512_maskable_custom_cmp<O, F, Outs,
+ Ins, !con((ins _.KRCWM:$mask),Ins), OpcodeStr,
+ AttSrcAsm, IntelSrcAsm,
+ [],[],"", NoItinerary>;
+
+// Bitcasts between 512-bit vector types. Return the original type since
+// no instruction is needed for the conversion.
+let Predicates = [HasAVX512] in {
+ def : Pat<(v8f64 (bitconvert (v8i64 VR512:$src))), (v8f64 VR512:$src)>;
+ def : Pat<(v8f64 (bitconvert (v16i32 VR512:$src))), (v8f64 VR512:$src)>;
+ def : Pat<(v8f64 (bitconvert (v32i16 VR512:$src))), (v8f64 VR512:$src)>;
+ def : Pat<(v8f64 (bitconvert (v64i8 VR512:$src))), (v8f64 VR512:$src)>;
+ def : Pat<(v8f64 (bitconvert (v16f32 VR512:$src))), (v8f64 VR512:$src)>;
+ def : Pat<(v16f32 (bitconvert (v8i64 VR512:$src))), (v16f32 VR512:$src)>;
+ def : Pat<(v16f32 (bitconvert (v16i32 VR512:$src))), (v16f32 VR512:$src)>;
+ def : Pat<(v16f32 (bitconvert (v32i16 VR512:$src))), (v16f32 VR512:$src)>;
+ def : Pat<(v16f32 (bitconvert (v64i8 VR512:$src))), (v16f32 VR512:$src)>;
+ def : Pat<(v16f32 (bitconvert (v8f64 VR512:$src))), (v16f32 VR512:$src)>;
+ def : Pat<(v8i64 (bitconvert (v16i32 VR512:$src))), (v8i64 VR512:$src)>;
+ def : Pat<(v8i64 (bitconvert (v32i16 VR512:$src))), (v8i64 VR512:$src)>;
+ def : Pat<(v8i64 (bitconvert (v64i8 VR512:$src))), (v8i64 VR512:$src)>;
+ def : Pat<(v8i64 (bitconvert (v8f64 VR512:$src))), (v8i64 VR512:$src)>;
+ def : Pat<(v8i64 (bitconvert (v16f32 VR512:$src))), (v8i64 VR512:$src)>;
+ def : Pat<(v16i32 (bitconvert (v8i64 VR512:$src))), (v16i32 VR512:$src)>;
+ def : Pat<(v16i32 (bitconvert (v16f32 VR512:$src))), (v16i32 VR512:$src)>;
+ def : Pat<(v16i32 (bitconvert (v32i16 VR512:$src))), (v16i32 VR512:$src)>;
+ def : Pat<(v16i32 (bitconvert (v64i8 VR512:$src))), (v16i32 VR512:$src)>;
+ def : Pat<(v16i32 (bitconvert (v8f64 VR512:$src))), (v16i32 VR512:$src)>;
+ def : Pat<(v32i16 (bitconvert (v8i64 VR512:$src))), (v32i16 VR512:$src)>;
+ def : Pat<(v32i16 (bitconvert (v16i32 VR512:$src))), (v32i16 VR512:$src)>;
+ def : Pat<(v32i16 (bitconvert (v64i8 VR512:$src))), (v32i16 VR512:$src)>;
+ def : Pat<(v32i16 (bitconvert (v8f64 VR512:$src))), (v32i16 VR512:$src)>;
+ def : Pat<(v32i16 (bitconvert (v16f32 VR512:$src))), (v32i16 VR512:$src)>;
+ def : Pat<(v64i8 (bitconvert (v8i64 VR512:$src))), (v64i8 VR512:$src)>;
+ def : Pat<(v64i8 (bitconvert (v16i32 VR512:$src))), (v64i8 VR512:$src)>;
+ def : Pat<(v64i8 (bitconvert (v32i16 VR512:$src))), (v64i8 VR512:$src)>;
+ def : Pat<(v64i8 (bitconvert (v8f64 VR512:$src))), (v64i8 VR512:$src)>;
+ def : Pat<(v64i8 (bitconvert (v16f32 VR512:$src))), (v64i8 VR512:$src)>;
+
+ def : Pat<(v2i64 (bitconvert (v4i32 VR128X:$src))), (v2i64 VR128X:$src)>;
+ def : Pat<(v2i64 (bitconvert (v8i16 VR128X:$src))), (v2i64 VR128X:$src)>;
+ def : Pat<(v2i64 (bitconvert (v16i8 VR128X:$src))), (v2i64 VR128X:$src)>;
+ def : Pat<(v2i64 (bitconvert (v2f64 VR128X:$src))), (v2i64 VR128X:$src)>;
+ def : Pat<(v2i64 (bitconvert (v4f32 VR128X:$src))), (v2i64 VR128X:$src)>;
+ def : Pat<(v4i32 (bitconvert (v2i64 VR128X:$src))), (v4i32 VR128X:$src)>;
+ def : Pat<(v4i32 (bitconvert (v8i16 VR128X:$src))), (v4i32 VR128X:$src)>;
+ def : Pat<(v4i32 (bitconvert (v16i8 VR128X:$src))), (v4i32 VR128X:$src)>;
+ def : Pat<(v4i32 (bitconvert (v2f64 VR128X:$src))), (v4i32 VR128X:$src)>;
+ def : Pat<(v4i32 (bitconvert (v4f32 VR128X:$src))), (v4i32 VR128X:$src)>;
+ def : Pat<(v8i16 (bitconvert (v2i64 VR128X:$src))), (v8i16 VR128X:$src)>;
+ def : Pat<(v8i16 (bitconvert (v4i32 VR128X:$src))), (v8i16 VR128X:$src)>;
+ def : Pat<(v8i16 (bitconvert (v16i8 VR128X:$src))), (v8i16 VR128X:$src)>;
+ def : Pat<(v8i16 (bitconvert (v2f64 VR128X:$src))), (v8i16 VR128X:$src)>;
+ def : Pat<(v8i16 (bitconvert (v4f32 VR128X:$src))), (v8i16 VR128X:$src)>;
+ def : Pat<(v16i8 (bitconvert (v2i64 VR128X:$src))), (v16i8 VR128X:$src)>;
+ def : Pat<(v16i8 (bitconvert (v4i32 VR128X:$src))), (v16i8 VR128X:$src)>;
+ def : Pat<(v16i8 (bitconvert (v8i16 VR128X:$src))), (v16i8 VR128X:$src)>;
+ def : Pat<(v16i8 (bitconvert (v2f64 VR128X:$src))), (v16i8 VR128X:$src)>;
+ def : Pat<(v16i8 (bitconvert (v4f32 VR128X:$src))), (v16i8 VR128X:$src)>;
+ def : Pat<(v4f32 (bitconvert (v2i64 VR128X:$src))), (v4f32 VR128X:$src)>;
+ def : Pat<(v4f32 (bitconvert (v4i32 VR128X:$src))), (v4f32 VR128X:$src)>;
+ def : Pat<(v4f32 (bitconvert (v8i16 VR128X:$src))), (v4f32 VR128X:$src)>;
+ def : Pat<(v4f32 (bitconvert (v16i8 VR128X:$src))), (v4f32 VR128X:$src)>;
+ def : Pat<(v4f32 (bitconvert (v2f64 VR128X:$src))), (v4f32 VR128X:$src)>;
+ def : Pat<(v2f64 (bitconvert (v2i64 VR128X:$src))), (v2f64 VR128X:$src)>;
+ def : Pat<(v2f64 (bitconvert (v4i32 VR128X:$src))), (v2f64 VR128X:$src)>;
+ def : Pat<(v2f64 (bitconvert (v8i16 VR128X:$src))), (v2f64 VR128X:$src)>;
+ def : Pat<(v2f64 (bitconvert (v16i8 VR128X:$src))), (v2f64 VR128X:$src)>;
+ def : Pat<(v2f64 (bitconvert (v4f32 VR128X:$src))), (v2f64 VR128X:$src)>;
+
+// Bitcasts between 256-bit vector types. Return the original type since
+// no instruction is needed for the conversion.
+ def : Pat<(v4f64 (bitconvert (v8f32 VR256X:$src))), (v4f64 VR256X:$src)>;
+ def : Pat<(v4f64 (bitconvert (v8i32 VR256X:$src))), (v4f64 VR256X:$src)>;
+ def : Pat<(v4f64 (bitconvert (v4i64 VR256X:$src))), (v4f64 VR256X:$src)>;
+ def : Pat<(v4f64 (bitconvert (v16i16 VR256X:$src))), (v4f64 VR256X:$src)>;
+ def : Pat<(v4f64 (bitconvert (v32i8 VR256X:$src))), (v4f64 VR256X:$src)>;
+ def : Pat<(v8f32 (bitconvert (v8i32 VR256X:$src))), (v8f32 VR256X:$src)>;
+ def : Pat<(v8f32 (bitconvert (v4i64 VR256X:$src))), (v8f32 VR256X:$src)>;
+ def : Pat<(v8f32 (bitconvert (v4f64 VR256X:$src))), (v8f32 VR256X:$src)>;
+ def : Pat<(v8f32 (bitconvert (v32i8 VR256X:$src))), (v8f32 VR256X:$src)>;
+ def : Pat<(v8f32 (bitconvert (v16i16 VR256X:$src))), (v8f32 VR256X:$src)>;
+ def : Pat<(v4i64 (bitconvert (v8f32 VR256X:$src))), (v4i64 VR256X:$src)>;
+ def : Pat<(v4i64 (bitconvert (v8i32 VR256X:$src))), (v4i64 VR256X:$src)>;
+ def : Pat<(v4i64 (bitconvert (v4f64 VR256X:$src))), (v4i64 VR256X:$src)>;
+ def : Pat<(v4i64 (bitconvert (v32i8 VR256X:$src))), (v4i64 VR256X:$src)>;
+ def : Pat<(v4i64 (bitconvert (v16i16 VR256X:$src))), (v4i64 VR256X:$src)>;
+ def : Pat<(v32i8 (bitconvert (v4f64 VR256X:$src))), (v32i8 VR256X:$src)>;
+ def : Pat<(v32i8 (bitconvert (v4i64 VR256X:$src))), (v32i8 VR256X:$src)>;
+ def : Pat<(v32i8 (bitconvert (v8f32 VR256X:$src))), (v32i8 VR256X:$src)>;
+ def : Pat<(v32i8 (bitconvert (v8i32 VR256X:$src))), (v32i8 VR256X:$src)>;
+ def : Pat<(v32i8 (bitconvert (v16i16 VR256X:$src))), (v32i8 VR256X:$src)>;
+ def : Pat<(v8i32 (bitconvert (v32i8 VR256X:$src))), (v8i32 VR256X:$src)>;
+ def : Pat<(v8i32 (bitconvert (v16i16 VR256X:$src))), (v8i32 VR256X:$src)>;
+ def : Pat<(v8i32 (bitconvert (v8f32 VR256X:$src))), (v8i32 VR256X:$src)>;
+ def : Pat<(v8i32 (bitconvert (v4i64 VR256X:$src))), (v8i32 VR256X:$src)>;
+ def : Pat<(v8i32 (bitconvert (v4f64 VR256X:$src))), (v8i32 VR256X:$src)>;
+ def : Pat<(v16i16 (bitconvert (v8f32 VR256X:$src))), (v16i16 VR256X:$src)>;
+ def : Pat<(v16i16 (bitconvert (v8i32 VR256X:$src))), (v16i16 VR256X:$src)>;
+ def : Pat<(v16i16 (bitconvert (v4i64 VR256X:$src))), (v16i16 VR256X:$src)>;
+ def : Pat<(v16i16 (bitconvert (v4f64 VR256X:$src))), (v16i16 VR256X:$src)>;
+ def : Pat<(v16i16 (bitconvert (v32i8 VR256X:$src))), (v16i16 VR256X:$src)>;
+}
+
+//
+// AVX-512: The VPXOR instruction writes zero to its upper part, so it's safe
+// to use it to build zeros.
+//
+
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
+ isPseudo = 1, Predicates = [HasAVX512] in {
+def AVX512_512_SET0 : I<0, Pseudo, (outs VR512:$dst), (ins), "",
+ [(set VR512:$dst, (v16f32 immAllZerosV))]>;
+}
+
+let Predicates = [HasAVX512] in {
+def : Pat<(v8i64 immAllZerosV), (AVX512_512_SET0)>;
+def : Pat<(v16i32 immAllZerosV), (AVX512_512_SET0)>;
+def : Pat<(v8f64 immAllZerosV), (AVX512_512_SET0)>;
+}
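+
+// The pseudo is rematerializable and as cheap as a move; after register
+// allocation it is presumably expanded to a zeroing idiom such as VPXORD
+// (per the VPXOR comment above).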
+
+//===----------------------------------------------------------------------===//
+// AVX-512 - VECTOR INSERT
+//
+multiclass vinsert_for_size<int Opcode, X86VectorVTInfo From, X86VectorVTInfo To,
+ PatFrag vinsert_insert> {
+ let hasSideEffects = 0, ExeDomain = To.ExeDomain in {
+ defm rr : AVX512_maskable<Opcode, MRMSrcReg, To, (outs To.RC:$dst),
+ (ins To.RC:$src1, From.RC:$src2, i32u8imm:$src3),
+ "vinsert" # From.EltTypeName # "x" # From.NumElts,
+ "$src3, $src2, $src1", "$src1, $src2, $src3",
+ (vinsert_insert:$src3 (To.VT To.RC:$src1),
+ (From.VT From.RC:$src2),
+ (iPTR imm))>, AVX512AIi8Base, EVEX_4V;
+
+ let mayLoad = 1 in
+ defm rm : AVX512_maskable<Opcode, MRMSrcMem, To, (outs To.RC:$dst),
+ (ins To.RC:$src1, From.MemOp:$src2, i32u8imm:$src3),
+ "vinsert" # From.EltTypeName # "x" # From.NumElts,
+ "$src3, $src2, $src1", "$src1, $src2, $src3",
+ (vinsert_insert:$src3 (To.VT To.RC:$src1),
+ (From.VT (bitconvert (From.LdFrag addr:$src2))),
+ (iPTR imm))>, AVX512AIi8Base, EVEX_4V,
+ EVEX_CD8<From.EltSize, From.CD8TupleForm>;
+ }
+}
+
+multiclass vinsert_for_size_lowering<string InstrStr, X86VectorVTInfo From,
+ X86VectorVTInfo To, PatFrag vinsert_insert,
+ SDNodeXForm INSERT_get_vinsert_imm , list<Predicate> p> {
+ let Predicates = p in {
+ def : Pat<(vinsert_insert:$ins
+ (To.VT To.RC:$src1), (From.VT From.RC:$src2), (iPTR imm)),
+ (To.VT (!cast<Instruction>(InstrStr#"rr")
+ To.RC:$src1, From.RC:$src2,
+ (INSERT_get_vinsert_imm To.RC:$ins)))>;
+
+ def : Pat<(vinsert_insert:$ins
+ (To.VT To.RC:$src1),
+ (From.VT (bitconvert (From.LdFrag addr:$src2))),
+ (iPTR imm)),
+ (To.VT (!cast<Instruction>(InstrStr#"rm")
+ To.RC:$src1, addr:$src2,
+ (INSERT_get_vinsert_imm To.RC:$ins)))>;
+ }
+}
+
+multiclass vinsert_for_type<ValueType EltVT32, int Opcode128,
+ ValueType EltVT64, int Opcode256> {
+
+ let Predicates = [HasVLX] in
+ defm NAME # "32x4Z256" : vinsert_for_size<Opcode128,
+ X86VectorVTInfo< 4, EltVT32, VR128X>,
+ X86VectorVTInfo< 8, EltVT32, VR256X>,
+ vinsert128_insert>, EVEX_V256;
+
+ defm NAME # "32x4Z" : vinsert_for_size<Opcode128,
+ X86VectorVTInfo< 4, EltVT32, VR128X>,
+ X86VectorVTInfo<16, EltVT32, VR512>,
+ vinsert128_insert>, EVEX_V512;
+
+ defm NAME # "64x4Z" : vinsert_for_size<Opcode256,
+ X86VectorVTInfo< 4, EltVT64, VR256X>,
+ X86VectorVTInfo< 8, EltVT64, VR512>,
+ vinsert256_insert>, VEX_W, EVEX_V512;
+
+ let Predicates = [HasVLX, HasDQI] in
+ defm NAME # "64x2Z256" : vinsert_for_size<Opcode128,
+ X86VectorVTInfo< 2, EltVT64, VR128X>,
+ X86VectorVTInfo< 4, EltVT64, VR256X>,
+ vinsert128_insert>, VEX_W, EVEX_V256;
+
+ let Predicates = [HasDQI] in {
+ defm NAME # "64x2Z" : vinsert_for_size<Opcode128,
+ X86VectorVTInfo< 2, EltVT64, VR128X>,
+ X86VectorVTInfo< 8, EltVT64, VR512>,
+ vinsert128_insert>, VEX_W, EVEX_V512;
+
+ defm NAME # "32x8Z" : vinsert_for_size<Opcode256,
+ X86VectorVTInfo< 8, EltVT32, VR256X>,
+ X86VectorVTInfo<16, EltVT32, VR512>,
+ vinsert256_insert>, EVEX_V512;
+ }
+}
+
+defm VINSERTF : vinsert_for_type<f32, 0x18, f64, 0x1a>;
+defm VINSERTI : vinsert_for_type<i32, 0x38, i64, 0x3a>;
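+
+// For example, the two defms above expand to records such as VINSERTF32x4Zrr,
+// VINSERTF32x4Zrrk and VINSERTF32x4Zrrkz, plus the corresponding rm memory
+// forms.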
+
+// Codegen patterns with the alternative types.
+// Only add these if 64x2 and its friends are not supported natively via AVX512DQ.
+defm : vinsert_for_size_lowering<"VINSERTF32x4Z256", v2f64x_info, v4f64x_info,
+ vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX, NoDQI]>;
+defm : vinsert_for_size_lowering<"VINSERTI32x4Z256", v2i64x_info, v4i64x_info,
+ vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX, NoDQI]>;
+
+defm : vinsert_for_size_lowering<"VINSERTF32x4Z", v2f64x_info, v8f64_info,
+ vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512, NoDQI]>;
+defm : vinsert_for_size_lowering<"VINSERTI32x4Z", v2i64x_info, v8i64_info,
+ vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512, NoDQI]>;
+
+defm : vinsert_for_size_lowering<"VINSERTF64x4Z", v8f32x_info, v16f32_info,
+ vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512, NoDQI]>;
+defm : vinsert_for_size_lowering<"VINSERTI64x4Z", v8i32x_info, v16i32_info,
+ vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512, NoDQI]>;
+
+// Codegen patterns with the alternative types: insert VEC128 into VEC256
+defm : vinsert_for_size_lowering<"VINSERTI32x4Z256", v8i16x_info, v16i16x_info,
+ vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX]>;
+defm : vinsert_for_size_lowering<"VINSERTI32x4Z256", v16i8x_info, v32i8x_info,
+ vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX]>;
+// Codegen patterns with the alternative types: insert VEC128 into VEC512
+defm : vinsert_for_size_lowering<"VINSERTI32x4Z", v8i16x_info, v32i16_info,
+ vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512]>;
+defm : vinsert_for_size_lowering<"VINSERTI32x4Z", v16i8x_info, v64i8_info,
+ vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512]>;
+// Codegen patterns with the alternative types: insert VEC256 into VEC512
+defm : vinsert_for_size_lowering<"VINSERTI64x4Z", v16i16x_info, v32i16_info,
+ vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512]>;
+defm : vinsert_for_size_lowering<"VINSERTI64x4Z", v32i8x_info, v64i8_info,
+ vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512]>;
+
+// vinsertps - insert f32 to XMM
+def VINSERTPSzrr : AVX512AIi8<0x21, MRMSrcReg, (outs VR128X:$dst),
+ (ins VR128X:$src1, VR128X:$src2, u8imm:$src3),
+ "vinsertps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ [(set VR128X:$dst, (X86insertps VR128X:$src1, VR128X:$src2, imm:$src3))]>,
+ EVEX_4V;
+def VINSERTPSzrm: AVX512AIi8<0x21, MRMSrcMem, (outs VR128X:$dst),
+ (ins VR128X:$src1, f32mem:$src2, u8imm:$src3),
+ "vinsertps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ [(set VR128X:$dst, (X86insertps VR128X:$src1,
+ (v4f32 (scalar_to_vector (loadf32 addr:$src2))),
+ imm:$src3))]>, EVEX_4V, EVEX_CD8<32, CD8VT1>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 VECTOR EXTRACT
+//---
+
+multiclass vextract_for_size_first_position_lowering<X86VectorVTInfo From,
+ X86VectorVTInfo To> {
+ // A subvector extract from the first vector position is
+ // a subregister copy that needs no instruction.
+ def NAME # To.NumElts:
+ Pat<(To.VT (extract_subvector (From.VT From.RC:$src),(iPTR 0))),
+ (To.VT (EXTRACT_SUBREG (From.VT From.RC:$src), To.SubRegIdx))>;
+}
+
+multiclass vextract_for_size<int Opcode,
+ X86VectorVTInfo From, X86VectorVTInfo To,
+ PatFrag vextract_extract> :
+ vextract_for_size_first_position_lowering<From, To> {
+
+ let hasSideEffects = 0, ExeDomain = To.ExeDomain in {
+    // Use AVX512_maskable_in_asm (AVX512_maskable can't be used due to
+    // vextract_extract); we are interested only in patterns without a mask,
+    // and the intrinsic pattern matches are generated below.
+ defm rr : AVX512_maskable_in_asm<Opcode, MRMDestReg, To, (outs To.RC:$dst),
+ (ins From.RC:$src1, i32u8imm:$idx),
+ "vextract" # To.EltTypeName # "x" # To.NumElts,
+ "$idx, $src1", "$src1, $idx",
+ [(set To.RC:$dst, (vextract_extract:$idx (From.VT From.RC:$src1),
+ (iPTR imm)))]>,
+ AVX512AIi8Base, EVEX;
+ let mayStore = 1 in {
+ def rm : AVX512AIi8<Opcode, MRMDestMem, (outs),
+ (ins To.MemOp:$dst, From.RC:$src1, i32u8imm:$src2),
+ "vextract" # To.EltTypeName # "x" # To.NumElts #
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ []>, EVEX;
+
+ def rmk : AVX512AIi8<Opcode, MRMDestMem, (outs),
+ (ins To.MemOp:$dst, To.KRCWM:$mask,
+ From.RC:$src1, i32u8imm:$src2),
+ "vextract" # To.EltTypeName # "x" # To.NumElts #
+ "\t{$src2, $src1, $dst {${mask}}|"
+ "$dst {${mask}}, $src1, $src2}",
+ []>, EVEX_K, EVEX;
+ }//mayStore = 1
+ }
+
+ // Intrinsic call with masking.
+ def : Pat<(!cast<Intrinsic>("int_x86_avx512_mask_vextract" # To.EltTypeName #
+ "x" # To.NumElts # "_" # From.Size)
+ From.RC:$src1, (iPTR imm:$idx), To.RC:$src0, To.MRC:$mask),
+ (!cast<Instruction>(NAME # To.EltSize # "x" # To.NumElts #
+ From.ZSuffix # "rrk")
+ To.RC:$src0,
+ (COPY_TO_REGCLASS To.MRC:$mask, To.KRCWM),
+ From.RC:$src1, imm:$idx)>;
+
+ // Intrinsic call with zero-masking.
+ def : Pat<(!cast<Intrinsic>("int_x86_avx512_mask_vextract" # To.EltTypeName #
+ "x" # To.NumElts # "_" # From.Size)
+ From.RC:$src1, (iPTR imm:$idx), To.ImmAllZerosV, To.MRC:$mask),
+ (!cast<Instruction>(NAME # To.EltSize # "x" # To.NumElts #
+ From.ZSuffix # "rrkz")
+ (COPY_TO_REGCLASS To.MRC:$mask, To.KRCWM),
+ From.RC:$src1, imm:$idx)>;
+
+ // Intrinsic call without masking.
+ def : Pat<(!cast<Intrinsic>("int_x86_avx512_mask_vextract" # To.EltTypeName #
+ "x" # To.NumElts # "_" # From.Size)
+ From.RC:$src1, (iPTR imm:$idx), To.ImmAllZerosV, (i8 -1)),
+ (!cast<Instruction>(NAME # To.EltSize # "x" # To.NumElts #
+ From.ZSuffix # "rr")
+ From.RC:$src1, imm:$idx)>;
+}
+
+// Codegen pattern for the alternative types
+multiclass vextract_for_size_lowering<string InstrStr, X86VectorVTInfo From,
+ X86VectorVTInfo To, PatFrag vextract_extract,
+ SDNodeXForm EXTRACT_get_vextract_imm, list<Predicate> p> :
+ vextract_for_size_first_position_lowering<From, To> {
+
+ let Predicates = p in
+ def : Pat<(vextract_extract:$ext (From.VT From.RC:$src1), (iPTR imm)),
+ (To.VT (!cast<Instruction>(InstrStr#"rr")
+ From.RC:$src1,
+ (EXTRACT_get_vextract_imm To.RC:$ext)))>;
+}
+
+multiclass vextract_for_type<ValueType EltVT32, int Opcode128,
+ ValueType EltVT64, int Opcode256> {
+ defm NAME # "32x4Z" : vextract_for_size<Opcode128,
+ X86VectorVTInfo<16, EltVT32, VR512>,
+ X86VectorVTInfo< 4, EltVT32, VR128X>,
+ vextract128_extract>,
+ EVEX_V512, EVEX_CD8<32, CD8VT4>;
+ defm NAME # "64x4Z" : vextract_for_size<Opcode256,
+ X86VectorVTInfo< 8, EltVT64, VR512>,
+ X86VectorVTInfo< 4, EltVT64, VR256X>,
+ vextract256_extract>,
+ VEX_W, EVEX_V512, EVEX_CD8<64, CD8VT4>;
+ let Predicates = [HasVLX] in
+ defm NAME # "32x4Z256" : vextract_for_size<Opcode128,
+ X86VectorVTInfo< 8, EltVT32, VR256X>,
+ X86VectorVTInfo< 4, EltVT32, VR128X>,
+ vextract128_extract>,
+ EVEX_V256, EVEX_CD8<32, CD8VT4>;
+ let Predicates = [HasVLX, HasDQI] in
+ defm NAME # "64x2Z256" : vextract_for_size<Opcode128,
+ X86VectorVTInfo< 4, EltVT64, VR256X>,
+ X86VectorVTInfo< 2, EltVT64, VR128X>,
+ vextract128_extract>,
+ VEX_W, EVEX_V256, EVEX_CD8<64, CD8VT2>;
+ let Predicates = [HasDQI] in {
+ defm NAME # "64x2Z" : vextract_for_size<Opcode128,
+ X86VectorVTInfo< 8, EltVT64, VR512>,
+ X86VectorVTInfo< 2, EltVT64, VR128X>,
+ vextract128_extract>,
+ VEX_W, EVEX_V512, EVEX_CD8<64, CD8VT2>;
+ defm NAME # "32x8Z" : vextract_for_size<Opcode256,
+ X86VectorVTInfo<16, EltVT32, VR512>,
+ X86VectorVTInfo< 8, EltVT32, VR256X>,
+ vextract256_extract>,
+ EVEX_V512, EVEX_CD8<32, CD8VT8>;
+ }
+}
+
+defm VEXTRACTF : vextract_for_type<f32, 0x19, f64, 0x1b>;
+defm VEXTRACTI : vextract_for_type<i32, 0x39, i64, 0x3b>;
+
+// extract_subvector codegen patterns with the alternative types.
+// Only add these if 64x2 and its friends are not supported natively via AVX512DQ.
+defm : vextract_for_size_lowering<"VEXTRACTF32x4Z", v8f64_info, v2f64x_info,
+ vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512, NoDQI]>;
+defm : vextract_for_size_lowering<"VEXTRACTI32x4Z", v8i64_info, v2i64x_info,
+ vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512, NoDQI]>;
+
+defm : vextract_for_size_lowering<"VEXTRACTF64x4Z", v16f32_info, v8f32x_info,
+ vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512, NoDQI]>;
+defm : vextract_for_size_lowering<"VEXTRACTI64x4Z", v16i32_info, v8i32x_info,
+ vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512, NoDQI]>;
+
+defm : vextract_for_size_lowering<"VEXTRACTF32x4Z256", v4f64x_info, v2f64x_info,
+ vextract128_extract, EXTRACT_get_vextract128_imm, [HasVLX, NoDQI]>;
+defm : vextract_for_size_lowering<"VEXTRACTI32x4Z256", v4i64x_info, v2i64x_info,
+ vextract128_extract, EXTRACT_get_vextract128_imm, [HasVLX, NoDQI]>;
+
+// Codegen patterns with the alternative types: extract VEC128 from VEC512
+defm : vextract_for_size_lowering<"VEXTRACTI32x4Z", v32i16_info, v8i16x_info,
+ vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512]>;
+defm : vextract_for_size_lowering<"VEXTRACTI32x4Z", v64i8_info, v16i8x_info,
+ vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512]>;
+// Codegen patterns with the alternative types: extract VEC256 from VEC512
+defm : vextract_for_size_lowering<"VEXTRACTI64x4Z", v32i16_info, v16i16x_info,
+ vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512]>;
+defm : vextract_for_size_lowering<"VEXTRACTI64x4Z", v64i8_info, v32i8x_info,
+ vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512]>;
+
+// A 128-bit subvector insert to the first 512-bit vector position
+// is a subregister copy that needs no instruction.
+def : Pat<(insert_subvector undef, (v2i64 VR128X:$src), (iPTR 0)),
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)),
+ (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ sub_ymm)>;
+def : Pat<(insert_subvector undef, (v2f64 VR128X:$src), (iPTR 0)),
+ (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)),
+ (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ sub_ymm)>;
+def : Pat<(insert_subvector undef, (v4i32 VR128X:$src), (iPTR 0)),
+ (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)),
+ (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ sub_ymm)>;
+def : Pat<(insert_subvector undef, (v4f32 VR128X:$src), (iPTR 0)),
+ (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)),
+ (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ sub_ymm)>;
+
+def : Pat<(insert_subvector undef, (v4i64 VR256X:$src), (iPTR 0)),
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>;
+def : Pat<(insert_subvector undef, (v4f64 VR256X:$src), (iPTR 0)),
+ (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>;
+def : Pat<(insert_subvector undef, (v8i32 VR256X:$src), (iPTR 0)),
+ (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>;
+def : Pat<(insert_subvector undef, (v8f32 VR256X:$src), (iPTR 0)),
+ (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>;
+def : Pat<(insert_subvector undef, (v16i16 VR256X:$src), (iPTR 0)),
+ (INSERT_SUBREG (v32i16 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>;
+def : Pat<(insert_subvector undef, (v32i8 VR256X:$src), (iPTR 0)),
+ (INSERT_SUBREG (v64i8 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>;
+
+// vextractps - extract 32 bits from XMM
+def VEXTRACTPSzrr : AVX512AIi8<0x17, MRMDestReg, (outs GR32:$dst),
+ (ins VR128X:$src1, u8imm:$src2),
+ "vextractps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR32:$dst, (extractelt (bc_v4i32 (v4f32 VR128X:$src1)), imm:$src2))]>,
+ EVEX;
+
+def VEXTRACTPSzmr : AVX512AIi8<0x17, MRMDestMem, (outs),
+ (ins f32mem:$dst, VR128X:$src1, u8imm:$src2),
+ "vextractps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(store (extractelt (bc_v4i32 (v4f32 VR128X:$src1)), imm:$src2),
+ addr:$dst)]>, EVEX, EVEX_CD8<32, CD8VT1>;
+
+//===---------------------------------------------------------------------===//
+// AVX-512 BROADCAST
+//---
+
+multiclass avx512_broadcast_rm<bits<8> opc, string OpcodeStr,
+ X86VectorVTInfo DestInfo, X86VectorVTInfo SrcInfo> {
+
+ defm r : AVX512_maskable<opc, MRMSrcReg, DestInfo, (outs DestInfo.RC:$dst),
+ (ins SrcInfo.RC:$src), OpcodeStr, "$src", "$src",
+ (DestInfo.VT (X86VBroadcast (SrcInfo.VT SrcInfo.RC:$src)))>,
+ T8PD, EVEX;
+ let mayLoad = 1 in
+ defm m : AVX512_maskable<opc, MRMSrcMem, DestInfo, (outs DestInfo.RC:$dst),
+ (ins SrcInfo.ScalarMemOp:$src), OpcodeStr, "$src", "$src",
+ (DestInfo.VT (X86VBroadcast
+ (SrcInfo.ScalarLdFrag addr:$src)))>,
+ T8PD, EVEX, EVEX_CD8<SrcInfo.EltSize, CD8VT1>;
+}
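+// Note (illustrative): instantiating avx512_broadcast_rm for a 512-bit f32
+// destination yields register and memory forms along the lines of
+//   vbroadcastss zmm0, xmm1                      ; splat element 0 of xmm1
+//   vbroadcastss zmm0 {k1} {z}, dword ptr [rax]  ; masked load-and-splat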
+
+multiclass avx512_fp_broadcast_vl<bits<8> opc, string OpcodeStr,
+ AVX512VLVectorVTInfo _> {
+ defm Z : avx512_broadcast_rm<opc, OpcodeStr, _.info512, _.info128>,
+ EVEX_V512;
+
+ let Predicates = [HasVLX] in {
+ defm Z256 : avx512_broadcast_rm<opc, OpcodeStr, _.info256, _.info128>,
+ EVEX_V256;
+ }
+}
+
+let ExeDomain = SSEPackedSingle in {
+ defm VBROADCASTSS : avx512_fp_broadcast_vl<0x18, "vbroadcastss",
+ avx512vl_f32_info>;
+ let Predicates = [HasVLX] in {
+ defm VBROADCASTSSZ128 : avx512_broadcast_rm<0x18, "vbroadcastss",
+ v4f32x_info, v4f32x_info>, EVEX_V128;
+ }
+}
+
+let ExeDomain = SSEPackedDouble in {
+ defm VBROADCASTSD : avx512_fp_broadcast_vl<0x19, "vbroadcastsd",
+ avx512vl_f64_info>, VEX_W;
+}
+
+// avx512_broadcast_pat introduces patterns for broadcast with a scalar
+// argument. Later, broadcasts can be canonicalized before the ISel phase,
+// which would make these additional ISel patterns unnecessary.
+// SrcRC_v and SrcRC_s are the register classes for the vector and scalar
+// representations of the source.
+multiclass avx512_broadcast_pat<string InstName, SDNode OpNode,
+ X86VectorVTInfo _, RegisterClass SrcRC_v,
+ RegisterClass SrcRC_s> {
+ def : Pat<(_.VT (OpNode (_.EltVT SrcRC_s:$src))),
+ (!cast<Instruction>(InstName##"r")
+ (COPY_TO_REGCLASS SrcRC_s:$src, SrcRC_v))>;
+
+ let AddedComplexity = 30 in {
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (OpNode (_.EltVT SrcRC_s:$src)), _.RC:$src0)),
+ (!cast<Instruction>(InstName##"rk") _.RC:$src0, _.KRCWM:$mask,
+ (COPY_TO_REGCLASS SrcRC_s:$src, SrcRC_v))>;
+
+    def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (OpNode (_.EltVT SrcRC_s:$src)), _.ImmAllZerosV)),
+ (!cast<Instruction>(InstName##"rkz") _.KRCWM:$mask,
+ (COPY_TO_REGCLASS SrcRC_s:$src, SrcRC_v))>;
+ }
+}
+
+defm : avx512_broadcast_pat<"VBROADCASTSSZ", X86VBroadcast, v16f32_info,
+ VR128X, FR32X>;
+defm : avx512_broadcast_pat<"VBROADCASTSDZ", X86VBroadcast, v8f64_info,
+ VR128X, FR64X>;
+
+let Predicates = [HasVLX] in {
+ defm : avx512_broadcast_pat<"VBROADCASTSSZ256", X86VBroadcast,
+ v8f32x_info, VR128X, FR32X>;
+ defm : avx512_broadcast_pat<"VBROADCASTSSZ128", X86VBroadcast,
+ v4f32x_info, VR128X, FR32X>;
+ defm : avx512_broadcast_pat<"VBROADCASTSDZ256", X86VBroadcast,
+ v4f64x_info, VR128X, FR64X>;
+}
+
+def : Pat<(v16f32 (X86VBroadcast (loadf32 addr:$src))),
+ (VBROADCASTSSZm addr:$src)>;
+def : Pat<(v8f64 (X86VBroadcast (loadf64 addr:$src))),
+ (VBROADCASTSDZm addr:$src)>;
+
+def : Pat<(int_x86_avx512_vbroadcast_ss_512 addr:$src),
+ (VBROADCASTSSZm addr:$src)>;
+def : Pat<(int_x86_avx512_vbroadcast_sd_512 addr:$src),
+ (VBROADCASTSDZm addr:$src)>;
+
+multiclass avx512_int_broadcast_reg<bits<8> opc, X86VectorVTInfo _,
+ RegisterClass SrcRC> {
+ defm r : AVX512_maskable_in_asm<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins SrcRC:$src), "vpbroadcast"##_.Suffix,
+ "$src", "$src", []>, T8PD, EVEX;
+}
+
+multiclass avx512_int_broadcast_reg_vl<bits<8> opc, AVX512VLVectorVTInfo _,
+ RegisterClass SrcRC, Predicate prd> {
+ let Predicates = [prd] in
+ defm Z : avx512_int_broadcast_reg<opc, _.info512, SrcRC>, EVEX_V512;
+ let Predicates = [prd, HasVLX] in {
+ defm Z256 : avx512_int_broadcast_reg<opc, _.info256, SrcRC>, EVEX_V256;
+ defm Z128 : avx512_int_broadcast_reg<opc, _.info128, SrcRC>, EVEX_V128;
+ }
+}
+
+defm VPBROADCASTBr : avx512_int_broadcast_reg_vl<0x7A, avx512vl_i8_info, GR32,
+ HasBWI>;
+defm VPBROADCASTWr : avx512_int_broadcast_reg_vl<0x7B, avx512vl_i16_info, GR32,
+ HasBWI>;
+defm VPBROADCASTDr : avx512_int_broadcast_reg_vl<0x7C, avx512vl_i32_info, GR32,
+ HasAVX512>;
+defm VPBROADCASTQr : avx512_int_broadcast_reg_vl<0x7C, avx512vl_i64_info, GR64,
+ HasAVX512>, VEX_W;
+
+def : Pat <(v16i32 (X86vzext VK16WM:$mask)),
+ (VPBROADCASTDrZrkz VK16WM:$mask, (i32 (MOV32ri 0x1)))>;
+
+def : Pat <(v8i64 (X86vzext VK8WM:$mask)),
+ (VPBROADCASTQrZrkz VK8WM:$mask, (i64 (MOV64ri 0x1)))>;
+
+def : Pat<(v16i32 (X86VBroadcast (i32 GR32:$src))),
+ (VPBROADCASTDrZr GR32:$src)>;
+def : Pat<(v8i64 (X86VBroadcast (i64 GR64:$src))),
+ (VPBROADCASTQrZr GR64:$src)>;
+
+def : Pat<(v16i32 (int_x86_avx512_pbroadcastd_i32_512 (i32 GR32:$src))),
+ (VPBROADCASTDrZr GR32:$src)>;
+def : Pat<(v8i64 (int_x86_avx512_pbroadcastq_i64_512 (i64 GR64:$src))),
+ (VPBROADCASTQrZr GR64:$src)>;
+
+def : Pat<(v16i32 (int_x86_avx512_mask_pbroadcast_d_gpr_512 (i32 GR32:$src),
+ (v16i32 immAllZerosV), (i16 GR16:$mask))),
+ (VPBROADCASTDrZrkz (COPY_TO_REGCLASS GR16:$mask, VK16WM), GR32:$src)>;
+def : Pat<(v8i64 (int_x86_avx512_mask_pbroadcast_q_gpr_512 (i64 GR64:$src),
+ (bc_v8i64 (v16i32 immAllZerosV)), (i8 GR8:$mask))),
+ (VPBROADCASTQrZrkz (COPY_TO_REGCLASS GR8:$mask, VK8WM), GR64:$src)>;
+
+// Provide aliases for broadcast from the same register class that
+// automatically do the extract of the low 128 bits.
+multiclass avx512_int_broadcast_rm_lowering<X86VectorVTInfo DestInfo,
+ X86VectorVTInfo SrcInfo> {
+ def : Pat<(DestInfo.VT (X86VBroadcast (SrcInfo.VT SrcInfo.RC:$src))),
+ (!cast<Instruction>(NAME#DestInfo.ZSuffix#"r")
+ (EXTRACT_SUBREG (SrcInfo.VT SrcInfo.RC:$src), sub_xmm))>;
+}
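+// Note (illustrative): with this lowering, for example,
+// (v16i32 (X86VBroadcast (v8i32 VR256X:$src))) selects VPBROADCASTDZr applied
+// to the low xmm of $src.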
+
+multiclass avx512_int_broadcast_rm_vl<bits<8> opc, string OpcodeStr,
+ AVX512VLVectorVTInfo _, Predicate prd> {
+ let Predicates = [prd] in {
+ defm Z : avx512_broadcast_rm<opc, OpcodeStr, _.info512, _.info128>,
+ avx512_int_broadcast_rm_lowering<_.info512, _.info256>,
+ EVEX_V512;
+ // Defined separately to avoid redefinition.
+ defm Z_Alt : avx512_int_broadcast_rm_lowering<_.info512, _.info512>;
+ }
+ let Predicates = [prd, HasVLX] in {
+ defm Z256 : avx512_broadcast_rm<opc, OpcodeStr, _.info256, _.info128>,
+ avx512_int_broadcast_rm_lowering<_.info256, _.info256>,
+ EVEX_V256;
+ defm Z128 : avx512_broadcast_rm<opc, OpcodeStr, _.info128, _.info128>,
+ EVEX_V128;
+ }
+}
+
+defm VPBROADCASTB : avx512_int_broadcast_rm_vl<0x78, "vpbroadcastb",
+ avx512vl_i8_info, HasBWI>;
+defm VPBROADCASTW : avx512_int_broadcast_rm_vl<0x79, "vpbroadcastw",
+ avx512vl_i16_info, HasBWI>;
+defm VPBROADCASTD : avx512_int_broadcast_rm_vl<0x58, "vpbroadcastd",
+ avx512vl_i32_info, HasAVX512>;
+defm VPBROADCASTQ : avx512_int_broadcast_rm_vl<0x59, "vpbroadcastq",
+ avx512vl_i64_info, HasAVX512>, VEX_W;
+
+multiclass avx512_subvec_broadcast_rm<bits<8> opc, string OpcodeStr,
+ X86VectorVTInfo _Dst, X86VectorVTInfo _Src> {
+ let mayLoad = 1 in
+ defm rm : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
+ (ins _Src.MemOp:$src), OpcodeStr, "$src", "$src",
+ (_Dst.VT (X86SubVBroadcast
+ (_Src.VT (bitconvert (_Src.LdFrag addr:$src)))))>,
+ AVX5128IBase, EVEX;
+}
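+// These subvector broadcasts load a 128- or 256-bit chunk from memory and
+// repeat it across the destination, e.g. (illustrative)
+//   vbroadcasti32x4 zmm0, xmmword ptr [rax]  ; four copies of the 128-bit load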
+
+defm VBROADCASTI32X4 : avx512_subvec_broadcast_rm<0x5a, "vbroadcasti32x4",
+ v16i32_info, v4i32x_info>,
+ EVEX_V512, EVEX_CD8<32, CD8VT4>;
+defm VBROADCASTF32X4 : avx512_subvec_broadcast_rm<0x1a, "vbroadcastf32x4",
+ v16f32_info, v4f32x_info>,
+ EVEX_V512, EVEX_CD8<32, CD8VT4>;
+defm VBROADCASTI64X4 : avx512_subvec_broadcast_rm<0x5b, "vbroadcasti64x4",
+ v8i64_info, v4i64x_info>, VEX_W,
+ EVEX_V512, EVEX_CD8<64, CD8VT4>;
+defm VBROADCASTF64X4 : avx512_subvec_broadcast_rm<0x1b, "vbroadcastf64x4",
+ v8f64_info, v4f64x_info>, VEX_W,
+ EVEX_V512, EVEX_CD8<64, CD8VT4>;
+
+let Predicates = [HasVLX] in {
+defm VBROADCASTI32X4Z256 : avx512_subvec_broadcast_rm<0x5a, "vbroadcasti32x4",
+ v8i32x_info, v4i32x_info>,
+ EVEX_V256, EVEX_CD8<32, CD8VT4>;
+defm VBROADCASTF32X4Z256 : avx512_subvec_broadcast_rm<0x1a, "vbroadcastf32x4",
+ v8f32x_info, v4f32x_info>,
+ EVEX_V256, EVEX_CD8<32, CD8VT4>;
+}
+let Predicates = [HasVLX, HasDQI] in {
+defm VBROADCASTI64X2Z128 : avx512_subvec_broadcast_rm<0x5a, "vbroadcasti64x2",
+ v4i64x_info, v2i64x_info>, VEX_W,
+ EVEX_V256, EVEX_CD8<64, CD8VT2>;
+defm VBROADCASTF64X2Z128 : avx512_subvec_broadcast_rm<0x1a, "vbroadcastf64x2",
+ v4f64x_info, v2f64x_info>, VEX_W,
+ EVEX_V256, EVEX_CD8<64, CD8VT2>;
+}
+let Predicates = [HasDQI] in {
+defm VBROADCASTI64X2 : avx512_subvec_broadcast_rm<0x5a, "vbroadcasti64x2",
+ v8i64_info, v2i64x_info>, VEX_W,
+ EVEX_V512, EVEX_CD8<64, CD8VT2>;
+defm VBROADCASTI32X8 : avx512_subvec_broadcast_rm<0x5b, "vbroadcasti32x8",
+ v16i32_info, v8i32x_info>,
+ EVEX_V512, EVEX_CD8<32, CD8VT8>;
+defm VBROADCASTF64X2 : avx512_subvec_broadcast_rm<0x1a, "vbroadcastf64x2",
+ v8f64_info, v2f64x_info>, VEX_W,
+ EVEX_V512, EVEX_CD8<64, CD8VT2>;
+defm VBROADCASTF32X8 : avx512_subvec_broadcast_rm<0x1b, "vbroadcastf32x8",
+ v16f32_info, v8f32x_info>,
+ EVEX_V512, EVEX_CD8<32, CD8VT8>;
+}
+
+multiclass avx512_broadcast_32x2<bits<8> opc, string OpcodeStr,
+ X86VectorVTInfo _Dst, X86VectorVTInfo _Src,
+ SDNode OpNode = X86SubVBroadcast> {
+
+ defm r : AVX512_maskable<opc, MRMSrcReg, _Dst, (outs _Dst.RC:$dst),
+ (ins _Src.RC:$src), OpcodeStr, "$src", "$src",
+ (_Dst.VT (OpNode (_Src.VT _Src.RC:$src)))>,
+ T8PD, EVEX;
+ let mayLoad = 1 in
+ defm m : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
+ (ins _Src.ScalarMemOp:$src), OpcodeStr, "$src", "$src",
+ (_Dst.VT (OpNode
+ (_Src.VT (scalar_to_vector(loadi64 addr:$src)))))>,
+ T8PD, EVEX, EVEX_CD8<_Src.EltSize, CD8VT2>;
+}
+
+multiclass avx512_common_broadcast_32x2<bits<8> opc, string OpcodeStr,
+ AVX512VLVectorVTInfo _> {
+ let Predicates = [HasDQI] in
+ defm Z : avx512_broadcast_32x2<opc, OpcodeStr, _.info512, _.info128>,
+ EVEX_V512;
+ let Predicates = [HasDQI, HasVLX] in
+ defm Z256 : avx512_broadcast_32x2<opc, OpcodeStr, _.info256, _.info128>,
+ EVEX_V256;
+}
+
+multiclass avx512_common_broadcast_i32x2<bits<8> opc, string OpcodeStr,
+ AVX512VLVectorVTInfo _> :
+ avx512_common_broadcast_32x2<opc, OpcodeStr, _> {
+
+ let Predicates = [HasDQI, HasVLX] in
+ defm Z128 : avx512_broadcast_32x2<opc, OpcodeStr, _.info128, _.info128,
+ X86SubV32x2Broadcast>, EVEX_V128;
+}
+
+defm VPBROADCASTI32X2 : avx512_common_broadcast_i32x2<0x59, "vbroadcasti32x2",
+ avx512vl_i32_info>;
+defm VPBROADCASTF32X2 : avx512_common_broadcast_32x2<0x19, "vbroadcastf32x2",
+ avx512vl_f32_info>;
+
+def : Pat<(v16f32 (X86VBroadcast (v16f32 VR512:$src))),
+ (VBROADCASTSSZr (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm))>;
+def : Pat<(v16f32 (X86VBroadcast (v8f32 VR256X:$src))),
+ (VBROADCASTSSZr (EXTRACT_SUBREG (v8f32 VR256X:$src), sub_xmm))>;
+
+def : Pat<(v8f64 (X86VBroadcast (v8f64 VR512:$src))),
+ (VBROADCASTSDZr (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm))>;
+def : Pat<(v8f64 (X86VBroadcast (v4f64 VR256X:$src))),
+ (VBROADCASTSDZr (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm))>;
+
+// Provide a fallback in case the load node used in the patterns above has
+// additional users, which prevents those patterns from being selected.
+def : Pat<(v16f32 (X86VBroadcast FR32X:$src)),
+ (VBROADCASTSSZr (COPY_TO_REGCLASS FR32X:$src, VR128X))>;
+def : Pat<(v8f64 (X86VBroadcast FR64X:$src)),
+ (VBROADCASTSDZr (COPY_TO_REGCLASS FR64X:$src, VR128X))>;
+
+
+//===----------------------------------------------------------------------===//
+// AVX-512 BROADCAST MASK TO VECTOR REGISTER
+//---
+multiclass avx512_mask_broadcastm<bits<8> opc, string OpcodeStr,
+ X86VectorVTInfo _, RegisterClass KRC> {
+ def rr : AVX512XS8I<opc, MRMSrcReg, (outs _.RC:$dst), (ins KRC:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set _.RC:$dst, (_.VT (X86VBroadcastm KRC:$src)))]>, EVEX;
+}
+
+multiclass avx512_mask_broadcast<bits<8> opc, string OpcodeStr,
+ AVX512VLVectorVTInfo VTInfo, RegisterClass KRC> {
+ let Predicates = [HasCDI] in
+ defm Z : avx512_mask_broadcastm<opc, OpcodeStr, VTInfo.info512, KRC>, EVEX_V512;
+ let Predicates = [HasCDI, HasVLX] in {
+ defm Z256 : avx512_mask_broadcastm<opc, OpcodeStr, VTInfo.info256, KRC>, EVEX_V256;
+ defm Z128 : avx512_mask_broadcastm<opc, OpcodeStr, VTInfo.info128, KRC>, EVEX_V128;
+ }
+}
+
+defm VPBROADCASTMW2D : avx512_mask_broadcast<0x3A, "vpbroadcastmw2d",
+ avx512vl_i32_info, VK16>;
+defm VPBROADCASTMB2Q : avx512_mask_broadcast<0x2A, "vpbroadcastmb2q",
+ avx512vl_i64_info, VK8>, VEX_W;
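+// These CDI forms broadcast the zero-extended mask register value into every
+// element, e.g. vpbroadcastmw2d zmm0, k1 writes the 16-bit value of k1,
+// zero-extended to 32 bits, into each dword of zmm0.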
+
+//===----------------------------------------------------------------------===//
+// -- VPERMI2 - three-source-operand form --
+multiclass avx512_perm_i<bits<8> opc, string OpcodeStr,
+ X86VectorVTInfo _, X86VectorVTInfo IdxVT> {
+let Constraints = "$src1 = $dst" in {
+ defm rr: AVX512_maskable_3src_cast<opc, MRMSrcReg, _, IdxVT, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.RC:$src3),
+ OpcodeStr, "$src3, $src2", "$src2, $src3",
+ (_.VT (X86VPermi2X IdxVT.RC:$src1, _.RC:$src2, _.RC:$src3))>, EVEX_4V,
+ AVX5128IBase;
+
+ let mayLoad = 1 in
+ defm rm: AVX512_maskable_3src_cast<opc, MRMSrcMem, _, IdxVT, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.MemOp:$src3),
+ OpcodeStr, "$src3, $src2", "$src2, $src3",
+ (_.VT (X86VPermi2X IdxVT.RC:$src1, _.RC:$src2,
+ (_.VT (bitconvert (_.LdFrag addr:$src3)))))>,
+ EVEX_4V, AVX5128IBase;
+ }
+}
+multiclass avx512_perm_i_mb<bits<8> opc, string OpcodeStr,
+ X86VectorVTInfo _, X86VectorVTInfo IdxVT> {
+ let mayLoad = 1, Constraints = "$src1 = $dst" in
+ defm rmb: AVX512_maskable_3src_cast<opc, MRMSrcMem, _, IdxVT, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.ScalarMemOp:$src3),
+ OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"),
+ !strconcat("$src2, ${src3}", _.BroadcastStr ),
+ (_.VT (X86VPermi2X IdxVT.RC:$src1,
+ _.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3)))))>,
+ AVX5128IBase, EVEX_4V, EVEX_B;
+}
+
+multiclass avx512_perm_i_sizes<bits<8> opc, string OpcodeStr,
+ AVX512VLVectorVTInfo VTInfo,
+ AVX512VLVectorVTInfo ShuffleMask> {
+ defm NAME: avx512_perm_i<opc, OpcodeStr, VTInfo.info512,
+ ShuffleMask.info512>,
+ avx512_perm_i_mb<opc, OpcodeStr, VTInfo.info512,
+ ShuffleMask.info512>, EVEX_V512;
+ let Predicates = [HasVLX] in {
+ defm NAME#128: avx512_perm_i<opc, OpcodeStr, VTInfo.info128,
+ ShuffleMask.info128>,
+ avx512_perm_i_mb<opc, OpcodeStr, VTInfo.info128,
+ ShuffleMask.info128>, EVEX_V128;
+ defm NAME#256: avx512_perm_i<opc, OpcodeStr, VTInfo.info256,
+ ShuffleMask.info256>,
+ avx512_perm_i_mb<opc, OpcodeStr, VTInfo.info256,
+ ShuffleMask.info256>, EVEX_V256;
+ }
+}
+
+multiclass avx512_perm_i_sizes_w<bits<8> opc, string OpcodeStr,
+ AVX512VLVectorVTInfo VTInfo,
+ AVX512VLVectorVTInfo Idx> {
+ let Predicates = [HasBWI] in
+ defm NAME: avx512_perm_i<opc, OpcodeStr, VTInfo.info512,
+ Idx.info512>, EVEX_V512;
+ let Predicates = [HasBWI, HasVLX] in {
+ defm NAME#128: avx512_perm_i<opc, OpcodeStr, VTInfo.info128,
+ Idx.info128>, EVEX_V128;
+ defm NAME#256: avx512_perm_i<opc, OpcodeStr, VTInfo.info256,
+ Idx.info256>, EVEX_V256;
+ }
+}
+
+defm VPERMI2D : avx512_perm_i_sizes<0x76, "vpermi2d",
+ avx512vl_i32_info, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>;
+defm VPERMI2Q : avx512_perm_i_sizes<0x76, "vpermi2q",
+ avx512vl_i64_info, avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>;
+defm VPERMI2W : avx512_perm_i_sizes_w<0x75, "vpermi2w",
+ avx512vl_i16_info, avx512vl_i16_info>, VEX_W, EVEX_CD8<16, CD8VF>;
+defm VPERMI2PS : avx512_perm_i_sizes<0x77, "vpermi2ps",
+ avx512vl_f32_info, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>;
+defm VPERMI2PD : avx512_perm_i_sizes<0x77, "vpermi2pd",
+ avx512vl_f64_info, avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>;
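+// In the VPERMI2 form the index vector occupies $src1 and is overwritten with
+// the result; each index selects an element from the concatenation of $src2
+// and $src3, e.g. (illustrative)
+//   vpermi2d zmm0, zmm1, zmm2  ; zmm0 holds the indices, table is zmm1:zmm2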
+
+// VPERMT2
+multiclass avx512_perm_t<bits<8> opc, string OpcodeStr,
+ X86VectorVTInfo _, X86VectorVTInfo IdxVT> {
+let Constraints = "$src1 = $dst" in {
+ defm rr: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins IdxVT.RC:$src2, _.RC:$src3),
+ OpcodeStr, "$src3, $src2", "$src2, $src3",
+ (_.VT (X86VPermt2 _.RC:$src1, IdxVT.RC:$src2, _.RC:$src3))>, EVEX_4V,
+ AVX5128IBase;
+
+ let mayLoad = 1 in
+ defm rm: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins IdxVT.RC:$src2, _.MemOp:$src3),
+ OpcodeStr, "$src3, $src2", "$src2, $src3",
+ (_.VT (X86VPermt2 _.RC:$src1, IdxVT.RC:$src2,
+ (bitconvert (_.LdFrag addr:$src3))))>,
+ EVEX_4V, AVX5128IBase;
+ }
+}
+multiclass avx512_perm_t_mb<bits<8> opc, string OpcodeStr,
+ X86VectorVTInfo _, X86VectorVTInfo IdxVT> {
+ let mayLoad = 1, Constraints = "$src1 = $dst" in
+ defm rmb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins IdxVT.RC:$src2, _.ScalarMemOp:$src3),
+ OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"),
+ !strconcat("$src2, ${src3}", _.BroadcastStr ),
+ (_.VT (X86VPermt2 _.RC:$src1,
+ IdxVT.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3)))))>,
+ AVX5128IBase, EVEX_4V, EVEX_B;
+}
+
+multiclass avx512_perm_t_sizes<bits<8> opc, string OpcodeStr,
+ AVX512VLVectorVTInfo VTInfo,
+ AVX512VLVectorVTInfo ShuffleMask> {
+ defm NAME: avx512_perm_t<opc, OpcodeStr, VTInfo.info512,
+ ShuffleMask.info512>,
+ avx512_perm_t_mb<opc, OpcodeStr, VTInfo.info512,
+ ShuffleMask.info512>, EVEX_V512;
+ let Predicates = [HasVLX] in {
+ defm NAME#128: avx512_perm_t<opc, OpcodeStr, VTInfo.info128,
+ ShuffleMask.info128>,
+ avx512_perm_t_mb<opc, OpcodeStr, VTInfo.info128,
+ ShuffleMask.info128>, EVEX_V128;
+ defm NAME#256: avx512_perm_t<opc, OpcodeStr, VTInfo.info256,
+ ShuffleMask.info256>,
+ avx512_perm_t_mb<opc, OpcodeStr, VTInfo.info256,
+ ShuffleMask.info256>, EVEX_V256;
+ }
+}
+
+multiclass avx512_perm_t_sizes_w<bits<8> opc, string OpcodeStr,
+ AVX512VLVectorVTInfo VTInfo,
+ AVX512VLVectorVTInfo Idx> {
+ let Predicates = [HasBWI] in
+ defm NAME: avx512_perm_t<opc, OpcodeStr, VTInfo.info512,
+ Idx.info512>, EVEX_V512;
+ let Predicates = [HasBWI, HasVLX] in {
+ defm NAME#128: avx512_perm_t<opc, OpcodeStr, VTInfo.info128,
+ Idx.info128>, EVEX_V128;
+ defm NAME#256: avx512_perm_t<opc, OpcodeStr, VTInfo.info256,
+ Idx.info256>, EVEX_V256;
+ }
+}
+
+defm VPERMT2D : avx512_perm_t_sizes<0x7E, "vpermt2d",
+ avx512vl_i32_info, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>;
+defm VPERMT2Q : avx512_perm_t_sizes<0x7E, "vpermt2q",
+ avx512vl_i64_info, avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>;
+defm VPERMT2W : avx512_perm_t_sizes_w<0x7D, "vpermt2w",
+ avx512vl_i16_info, avx512vl_i16_info>, VEX_W, EVEX_CD8<16, CD8VF>;
+defm VPERMT2PS : avx512_perm_t_sizes<0x7F, "vpermt2ps",
+ avx512vl_f32_info, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>;
+defm VPERMT2PD : avx512_perm_t_sizes<0x7F, "vpermt2pd",
+ avx512vl_f64_info, avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>;
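+// VPERMT2 is the companion form in which the indices arrive in $src2 while
+// $src1 (the destination) supplies one of the two data tables, e.g.
+// (illustrative)
+//   vpermt2d zmm0, zmm1, zmm2  ; zmm1 holds the indices, table is zmm0:zmm2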
+
+//===----------------------------------------------------------------------===//
+// AVX-512 - BLEND using mask
+//
+multiclass avx512_blendmask<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> {
+ let ExeDomain = _.ExeDomain in {
+ def rr : AVX5128I<opc, MRMSrcReg, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, ${dst} |${dst}, $src1, $src2}"),
+ []>, EVEX_4V;
+ def rrk : AVX5128I<opc, MRMSrcReg, (outs _.RC:$dst),
+ (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, ${dst} {${mask}}|${dst} {${mask}}, $src1, $src2}"),
+ [(set _.RC:$dst, (X86select _.KRCWM:$mask, (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2)))]>, EVEX_4V, EVEX_K;
+ def rrkz : AVX5128I<opc, MRMSrcReg, (outs _.RC:$dst),
+ (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src1, $src2}"),
+ []>, EVEX_4V, EVEX_KZ;
+ let mayLoad = 1 in {
+ def rm : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.MemOp:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, ${dst} |${dst}, $src1, $src2}"),
+ []>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>;
+ def rmk : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst),
+ (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, ${dst} {${mask}}|${dst} {${mask}}, $src1, $src2}"),
+ [(set _.RC:$dst, (X86select _.KRCWM:$mask, (_.VT _.RC:$src1),
+ (_.VT (bitconvert (_.LdFrag addr:$src2)))))]>,
+ EVEX_4V, EVEX_K, EVEX_CD8<_.EltSize, CD8VF>;
+ def rmkz : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst),
+ (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src1, $src2}"),
+ []>, EVEX_4V, EVEX_KZ, EVEX_CD8<_.EltSize, CD8VF>;
+ }
+ }
+}
+multiclass avx512_blendmask_rmb<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> {
+
+ def rmbk : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst),
+ (ins _.KRCWM:$mask, _.RC:$src1, _.ScalarMemOp:$src2),
+ !strconcat(OpcodeStr,
+ "\t{${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|",
+ "$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, "}"),
+ [(set _.RC:$dst,(X86select _.KRCWM:$mask, (_.VT _.RC:$src1),
+ (X86VBroadcast (_.ScalarLdFrag addr:$src2))))]>,
+ EVEX_4V, EVEX_K, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>;
+
+ def rmb : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.ScalarMemOp:$src2),
+ !strconcat(OpcodeStr,
+ "\t{${src2}", _.BroadcastStr, ", $src1, $dst|",
+ "$dst, $src1, ${src2}", _.BroadcastStr, "}"),
+ []>, EVEX_4V, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>;
+
+}
+
+multiclass blendmask_dq <bits<8> opc, string OpcodeStr,
+ AVX512VLVectorVTInfo VTInfo> {
+ defm Z : avx512_blendmask <opc, OpcodeStr, VTInfo.info512>,
+ avx512_blendmask_rmb <opc, OpcodeStr, VTInfo.info512>, EVEX_V512;
+
+ let Predicates = [HasVLX] in {
+ defm Z256 : avx512_blendmask<opc, OpcodeStr, VTInfo.info256>,
+ avx512_blendmask_rmb <opc, OpcodeStr, VTInfo.info256>, EVEX_V256;
+ defm Z128 : avx512_blendmask<opc, OpcodeStr, VTInfo.info128>,
+ avx512_blendmask_rmb <opc, OpcodeStr, VTInfo.info128>, EVEX_V128;
+ }
+}
+
+multiclass blendmask_bw <bits<8> opc, string OpcodeStr,
+ AVX512VLVectorVTInfo VTInfo> {
+ let Predicates = [HasBWI] in
+ defm Z : avx512_blendmask <opc, OpcodeStr, VTInfo.info512>, EVEX_V512;
+
+ let Predicates = [HasBWI, HasVLX] in {
+ defm Z256 : avx512_blendmask <opc, OpcodeStr, VTInfo.info256>, EVEX_V256;
+ defm Z128 : avx512_blendmask <opc, OpcodeStr, VTInfo.info128>, EVEX_V128;
+ }
+}
+
+
+defm VBLENDMPS : blendmask_dq <0x65, "vblendmps", avx512vl_f32_info>;
+defm VBLENDMPD : blendmask_dq <0x65, "vblendmpd", avx512vl_f64_info>, VEX_W;
+defm VPBLENDMD : blendmask_dq <0x64, "vpblendmd", avx512vl_i32_info>;
+defm VPBLENDMQ : blendmask_dq <0x64, "vpblendmq", avx512vl_i64_info>, VEX_W;
+defm VPBLENDMB : blendmask_bw <0x66, "vpblendmb", avx512vl_i8_info>;
+defm VPBLENDMW : blendmask_bw <0x66, "vpblendmw", avx512vl_i16_info>, VEX_W;
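+// The blend takes the second source where the mask bit is set and the first
+// source where it is clear, e.g. (illustrative)
+//   vblendmps zmm0 {k1}, zmm1, zmm2  ; zmm0[i] = k1[i] ? zmm2[i] : zmm1[i]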
+
+
+let Predicates = [HasAVX512] in {
+def : Pat<(v8f32 (vselect (v8i1 VK8WM:$mask), (v8f32 VR256X:$src1),
+ (v8f32 VR256X:$src2))),
+ (EXTRACT_SUBREG
+ (v16f32 (VBLENDMPSZrrk (COPY_TO_REGCLASS VK8WM:$mask, VK16WM),
+ (v16f32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm)),
+ (v16f32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)))), sub_ymm)>;
+
+def : Pat<(v8i32 (vselect (v8i1 VK8WM:$mask), (v8i32 VR256X:$src1),
+ (v8i32 VR256X:$src2))),
+ (EXTRACT_SUBREG
+ (v16i32 (VPBLENDMDZrrk (COPY_TO_REGCLASS VK8WM:$mask, VK16WM),
+ (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm)),
+ (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)))), sub_ymm)>;
+}
+//===----------------------------------------------------------------------===//
+// Compare Instructions
+//===----------------------------------------------------------------------===//
+
+// avx512_cmp_scalar - AVX512 CMPSS and CMPSD
+
+multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeRnd>{
+
+ defm rr_Int : AVX512_maskable_cmp<0xC2, MRMSrcReg, _,
+ (outs _.KRC:$dst),
+ (ins _.RC:$src1, _.RC:$src2, AVXCC:$cc),
+ "vcmp${cc}"#_.Suffix,
+ "$src2, $src1", "$src1, $src2",
+ (OpNode (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ imm:$cc)>, EVEX_4V;
+ let mayLoad = 1 in
+ defm rm_Int : AVX512_maskable_cmp<0xC2, MRMSrcMem, _,
+ (outs _.KRC:$dst),
+ (ins _.RC:$src1, _.MemOp:$src2, AVXCC:$cc),
+ "vcmp${cc}"#_.Suffix,
+ "$src2, $src1", "$src1, $src2",
+ (OpNode (_.VT _.RC:$src1),
+ (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))),
+ imm:$cc)>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>;
+
+ defm rrb_Int : AVX512_maskable_cmp<0xC2, MRMSrcReg, _,
+ (outs _.KRC:$dst),
+ (ins _.RC:$src1, _.RC:$src2, AVXCC:$cc),
+ "vcmp${cc}"#_.Suffix,
+ "{sae}, $src2, $src1", "$src1, $src2,{sae}",
+ (OpNodeRnd (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ imm:$cc,
+ (i32 FROUND_NO_EXC))>, EVEX_4V, EVEX_B;
+ // Accept explicit immediate argument form instead of comparison code.
+ let isAsmParserOnly = 1, hasSideEffects = 0 in {
+ defm rri_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcReg, _,
+ (outs VK1:$dst),
+ (ins _.RC:$src1, _.RC:$src2, u8imm:$cc),
+ "vcmp"#_.Suffix,
+ "$cc, $src2, $src1", "$src1, $src2, $cc">, EVEX_4V;
+ defm rmi_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcMem, _,
+ (outs _.KRC:$dst),
+ (ins _.RC:$src1, _.MemOp:$src2, u8imm:$cc),
+ "vcmp"#_.Suffix,
+ "$cc, $src2, $src1", "$src1, $src2, $cc">,
+ EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>;
+
+ defm rrb_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcReg, _,
+ (outs _.KRC:$dst),
+ (ins _.RC:$src1, _.RC:$src2, u8imm:$cc),
+ "vcmp"#_.Suffix,
+ "$cc,{sae}, $src2, $src1","$src1, $src2,{sae}, $cc">,
+ EVEX_4V, EVEX_B;
+  } // let isAsmParserOnly = 1, hasSideEffects = 0
+
+ let isCodeGenOnly = 1 in {
+ def rr : AVX512Ii8<0xC2, MRMSrcReg,
+ (outs _.KRC:$dst), (ins _.FRC:$src1, _.FRC:$src2, AVXCC:$cc),
+ !strconcat("vcmp${cc}", _.Suffix,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set _.KRC:$dst, (OpNode _.FRC:$src1,
+ _.FRC:$src2,
+ imm:$cc))],
+ IIC_SSE_ALU_F32S_RR>, EVEX_4V;
+ let mayLoad = 1 in
+ def rm : AVX512Ii8<0xC2, MRMSrcMem,
+ (outs _.KRC:$dst),
+ (ins _.FRC:$src1, _.ScalarMemOp:$src2, AVXCC:$cc),
+ !strconcat("vcmp${cc}", _.Suffix,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set _.KRC:$dst, (OpNode _.FRC:$src1,
+ (_.ScalarLdFrag addr:$src2),
+ imm:$cc))],
+ IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>;
+ }
+}
+
+let Predicates = [HasAVX512] in {
+ defm VCMPSSZ : avx512_cmp_scalar<f32x_info, X86cmpms, X86cmpmsRnd>,
+ AVX512XSIi8Base;
+ defm VCMPSDZ : avx512_cmp_scalar<f64x_info, X86cmpms, X86cmpmsRnd>,
+ AVX512XDIi8Base, VEX_W;
+}
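+// For example (illustrative), vcmpless k1, xmm1, xmm2 compares the low
+// single-precision elements and writes the result to bit 0 of k1.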
+
+multiclass avx512_icmp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _> {
+ def rr : AVX512BI<opc, MRMSrcReg,
+ (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2)))],
+ IIC_SSE_ALU_F32P_RR>, EVEX_4V;
+ let mayLoad = 1 in
+ def rm : AVX512BI<opc, MRMSrcMem,
+ (outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1),
+ (_.VT (bitconvert (_.LdFrag addr:$src2)))))],
+ IIC_SSE_ALU_F32P_RM>, EVEX_4V;
+ def rrk : AVX512BI<opc, MRMSrcReg,
+ (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst {${mask}}|",
+ "$dst {${mask}}, $src1, $src2}"),
+ [(set _.KRC:$dst, (and _.KRCWM:$mask,
+ (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))))],
+ IIC_SSE_ALU_F32P_RR>, EVEX_4V, EVEX_K;
+ let mayLoad = 1 in
+ def rmk : AVX512BI<opc, MRMSrcMem,
+ (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst {${mask}}|",
+ "$dst {${mask}}, $src1, $src2}"),
+ [(set _.KRC:$dst, (and _.KRCWM:$mask,
+ (OpNode (_.VT _.RC:$src1),
+ (_.VT (bitconvert
+ (_.LdFrag addr:$src2))))))],
+ IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K;
+}
+
+multiclass avx512_icmp_packed_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _> :
+ avx512_icmp_packed<opc, OpcodeStr, OpNode, _> {
+ let mayLoad = 1 in {
+ def rmb : AVX512BI<opc, MRMSrcMem,
+ (outs _.KRC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2),
+ !strconcat(OpcodeStr, "\t{${src2}", _.BroadcastStr, ", $src1, $dst",
+ "|$dst, $src1, ${src2}", _.BroadcastStr, "}"),
+ [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1),
+ (X86VBroadcast (_.ScalarLdFrag addr:$src2))))],
+ IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_B;
+ def rmbk : AVX512BI<opc, MRMSrcMem,
+ (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1,
+ _.ScalarMemOp:$src2),
+ !strconcat(OpcodeStr,
+ "\t{${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|",
+ "$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, "}"),
+ [(set _.KRC:$dst, (and _.KRCWM:$mask,
+ (OpNode (_.VT _.RC:$src1),
+ (X86VBroadcast
+ (_.ScalarLdFrag addr:$src2)))))],
+ IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K, EVEX_B;
+ }
+}
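+// The rmb/rmbk forms compare against a single element broadcast from memory,
+// e.g. (illustrative) vpcmpeqd k1, zmm0, dword ptr [rax]{1to16}.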
+
+multiclass avx512_icmp_packed_vl<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ AVX512VLVectorVTInfo VTInfo, Predicate prd> {
+ let Predicates = [prd] in
+ defm Z : avx512_icmp_packed<opc, OpcodeStr, OpNode, VTInfo.info512>,
+ EVEX_V512;
+
+ let Predicates = [prd, HasVLX] in {
+ defm Z256 : avx512_icmp_packed<opc, OpcodeStr, OpNode, VTInfo.info256>,
+ EVEX_V256;
+ defm Z128 : avx512_icmp_packed<opc, OpcodeStr, OpNode, VTInfo.info128>,
+ EVEX_V128;
+ }
+}
+
+multiclass avx512_icmp_packed_rmb_vl<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, AVX512VLVectorVTInfo VTInfo,
+ Predicate prd> {
+ let Predicates = [prd] in
+ defm Z : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, VTInfo.info512>,
+ EVEX_V512;
+
+ let Predicates = [prd, HasVLX] in {
+ defm Z256 : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, VTInfo.info256>,
+ EVEX_V256;
+ defm Z128 : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, VTInfo.info128>,
+ EVEX_V128;
+ }
+}
+
+defm VPCMPEQB : avx512_icmp_packed_vl<0x74, "vpcmpeqb", X86pcmpeqm,
+ avx512vl_i8_info, HasBWI>,
+ EVEX_CD8<8, CD8VF>;
+
+defm VPCMPEQW : avx512_icmp_packed_vl<0x75, "vpcmpeqw", X86pcmpeqm,
+ avx512vl_i16_info, HasBWI>,
+ EVEX_CD8<16, CD8VF>;
+
+defm VPCMPEQD : avx512_icmp_packed_rmb_vl<0x76, "vpcmpeqd", X86pcmpeqm,
+ avx512vl_i32_info, HasAVX512>,
+ EVEX_CD8<32, CD8VF>;
+
+defm VPCMPEQQ : avx512_icmp_packed_rmb_vl<0x29, "vpcmpeqq", X86pcmpeqm,
+ avx512vl_i64_info, HasAVX512>,
+ T8PD, VEX_W, EVEX_CD8<64, CD8VF>;
+
+defm VPCMPGTB : avx512_icmp_packed_vl<0x64, "vpcmpgtb", X86pcmpgtm,
+ avx512vl_i8_info, HasBWI>,
+ EVEX_CD8<8, CD8VF>;
+
+defm VPCMPGTW : avx512_icmp_packed_vl<0x65, "vpcmpgtw", X86pcmpgtm,
+ avx512vl_i16_info, HasBWI>,
+ EVEX_CD8<16, CD8VF>;
+
+defm VPCMPGTD : avx512_icmp_packed_rmb_vl<0x66, "vpcmpgtd", X86pcmpgtm,
+ avx512vl_i32_info, HasAVX512>,
+ EVEX_CD8<32, CD8VF>;
+
+defm VPCMPGTQ : avx512_icmp_packed_rmb_vl<0x37, "vpcmpgtq", X86pcmpgtm,
+ avx512vl_i64_info, HasAVX512>,
+ T8PD, VEX_W, EVEX_CD8<64, CD8VF>;
+
+def : Pat<(v8i1 (X86pcmpgtm (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))),
+ (COPY_TO_REGCLASS (VPCMPGTDZrr
+ (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)),
+ (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm))), VK8)>;
+
+def : Pat<(v8i1 (X86pcmpeqm (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))),
+ (COPY_TO_REGCLASS (VPCMPEQDZrr
+ (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)),
+ (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm))), VK8)>;
+
+multiclass avx512_icmp_cc<bits<8> opc, string Suffix, SDNode OpNode,
+ X86VectorVTInfo _> {
+ def rri : AVX512AIi8<opc, MRMSrcReg,
+ (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2, AVX512ICC:$cc),
+ !strconcat("vpcmp${cc}", Suffix,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
+ imm:$cc))],
+ IIC_SSE_ALU_F32P_RR>, EVEX_4V;
+ let mayLoad = 1 in
+ def rmi : AVX512AIi8<opc, MRMSrcMem,
+ (outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2, AVX512ICC:$cc),
+ !strconcat("vpcmp${cc}", Suffix,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1),
+ (_.VT (bitconvert (_.LdFrag addr:$src2))),
+ imm:$cc))],
+ IIC_SSE_ALU_F32P_RM>, EVEX_4V;
+ def rrik : AVX512AIi8<opc, MRMSrcReg,
+ (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2,
+ AVX512ICC:$cc),
+ !strconcat("vpcmp${cc}", Suffix,
+ "\t{$src2, $src1, $dst {${mask}}|",
+ "$dst {${mask}}, $src1, $src2}"),
+ [(set _.KRC:$dst, (and _.KRCWM:$mask,
+ (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
+ imm:$cc)))],
+ IIC_SSE_ALU_F32P_RR>, EVEX_4V, EVEX_K;
+ let mayLoad = 1 in
+ def rmik : AVX512AIi8<opc, MRMSrcMem,
+ (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2,
+ AVX512ICC:$cc),
+ !strconcat("vpcmp${cc}", Suffix,
+ "\t{$src2, $src1, $dst {${mask}}|",
+ "$dst {${mask}}, $src1, $src2}"),
+ [(set _.KRC:$dst, (and _.KRCWM:$mask,
+ (OpNode (_.VT _.RC:$src1),
+ (_.VT (bitconvert (_.LdFrag addr:$src2))),
+ imm:$cc)))],
+ IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K;
+
+ // Accept explicit immediate argument form instead of comparison code.
+ let isAsmParserOnly = 1, hasSideEffects = 0 in {
+ def rri_alt : AVX512AIi8<opc, MRMSrcReg,
+ (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2, u8imm:$cc),
+ !strconcat("vpcmp", Suffix, "\t{$cc, $src2, $src1, $dst|",
+ "$dst, $src1, $src2, $cc}"),
+ [], IIC_SSE_ALU_F32P_RR>, EVEX_4V;
+ let mayLoad = 1 in
+ def rmi_alt : AVX512AIi8<opc, MRMSrcMem,
+ (outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2, u8imm:$cc),
+ !strconcat("vpcmp", Suffix, "\t{$cc, $src2, $src1, $dst|",
+ "$dst, $src1, $src2, $cc}"),
+ [], IIC_SSE_ALU_F32P_RM>, EVEX_4V;
+ def rrik_alt : AVX512AIi8<opc, MRMSrcReg,
+ (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2,
+ u8imm:$cc),
+ !strconcat("vpcmp", Suffix,
+ "\t{$cc, $src2, $src1, $dst {${mask}}|",
+ "$dst {${mask}}, $src1, $src2, $cc}"),
+ [], IIC_SSE_ALU_F32P_RR>, EVEX_4V, EVEX_K;
+ let mayLoad = 1 in
+ def rmik_alt : AVX512AIi8<opc, MRMSrcMem,
+ (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2,
+ u8imm:$cc),
+ !strconcat("vpcmp", Suffix,
+ "\t{$cc, $src2, $src1, $dst {${mask}}|",
+ "$dst {${mask}}, $src1, $src2, $cc}"),
+ [], IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K;
+ }
+}
+
+multiclass avx512_icmp_cc_rmb<bits<8> opc, string Suffix, SDNode OpNode,
+ X86VectorVTInfo _> :
+ avx512_icmp_cc<opc, Suffix, OpNode, _> {
+ def rmib : AVX512AIi8<opc, MRMSrcMem,
+ (outs _.KRC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2,
+ AVX512ICC:$cc),
+ !strconcat("vpcmp${cc}", Suffix,
+ "\t{${src2}", _.BroadcastStr, ", $src1, $dst|",
+ "$dst, $src1, ${src2}", _.BroadcastStr, "}"),
+ [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1),
+ (X86VBroadcast (_.ScalarLdFrag addr:$src2)),
+ imm:$cc))],
+ IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_B;
+ def rmibk : AVX512AIi8<opc, MRMSrcMem,
+ (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1,
+ _.ScalarMemOp:$src2, AVX512ICC:$cc),
+ !strconcat("vpcmp${cc}", Suffix,
+ "\t{${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|",
+ "$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, "}"),
+ [(set _.KRC:$dst, (and _.KRCWM:$mask,
+ (OpNode (_.VT _.RC:$src1),
+ (X86VBroadcast (_.ScalarLdFrag addr:$src2)),
+ imm:$cc)))],
+ IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K, EVEX_B;
+
+ // Accept explicit immediate argument form instead of comparison code.
+ let isAsmParserOnly = 1, hasSideEffects = 0, mayLoad = 1 in {
+ def rmib_alt : AVX512AIi8<opc, MRMSrcMem,
+ (outs _.KRC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2,
+ u8imm:$cc),
+ !strconcat("vpcmp", Suffix,
+ "\t{$cc, ${src2}", _.BroadcastStr, ", $src1, $dst|",
+ "$dst, $src1, ${src2}", _.BroadcastStr, ", $cc}"),
+ [], IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_B;
+ def rmibk_alt : AVX512AIi8<opc, MRMSrcMem,
+ (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1,
+ _.ScalarMemOp:$src2, u8imm:$cc),
+ !strconcat("vpcmp", Suffix,
+ "\t{$cc, ${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|",
+ "$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, ", $cc}"),
+ [], IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K, EVEX_B;
+ }
+}
+
+multiclass avx512_icmp_cc_vl<bits<8> opc, string Suffix, SDNode OpNode,
+ AVX512VLVectorVTInfo VTInfo, Predicate prd> {
+ let Predicates = [prd] in
+ defm Z : avx512_icmp_cc<opc, Suffix, OpNode, VTInfo.info512>, EVEX_V512;
+
+ let Predicates = [prd, HasVLX] in {
+ defm Z256 : avx512_icmp_cc<opc, Suffix, OpNode, VTInfo.info256>, EVEX_V256;
+ defm Z128 : avx512_icmp_cc<opc, Suffix, OpNode, VTInfo.info128>, EVEX_V128;
+ }
+}
+
+multiclass avx512_icmp_cc_rmb_vl<bits<8> opc, string Suffix, SDNode OpNode,
+ AVX512VLVectorVTInfo VTInfo, Predicate prd> {
+ let Predicates = [prd] in
+ defm Z : avx512_icmp_cc_rmb<opc, Suffix, OpNode, VTInfo.info512>,
+ EVEX_V512;
+
+ let Predicates = [prd, HasVLX] in {
+ defm Z256 : avx512_icmp_cc_rmb<opc, Suffix, OpNode, VTInfo.info256>,
+ EVEX_V256;
+ defm Z128 : avx512_icmp_cc_rmb<opc, Suffix, OpNode, VTInfo.info128>,
+ EVEX_V128;
+ }
+}
+
+defm VPCMPB : avx512_icmp_cc_vl<0x3F, "b", X86cmpm, avx512vl_i8_info,
+ HasBWI>, EVEX_CD8<8, CD8VF>;
+defm VPCMPUB : avx512_icmp_cc_vl<0x3E, "ub", X86cmpmu, avx512vl_i8_info,
+ HasBWI>, EVEX_CD8<8, CD8VF>;
+
+defm VPCMPW : avx512_icmp_cc_vl<0x3F, "w", X86cmpm, avx512vl_i16_info,
+ HasBWI>, VEX_W, EVEX_CD8<16, CD8VF>;
+defm VPCMPUW : avx512_icmp_cc_vl<0x3E, "uw", X86cmpmu, avx512vl_i16_info,
+ HasBWI>, VEX_W, EVEX_CD8<16, CD8VF>;
+
+defm VPCMPD : avx512_icmp_cc_rmb_vl<0x1F, "d", X86cmpm, avx512vl_i32_info,
+ HasAVX512>, EVEX_CD8<32, CD8VF>;
+defm VPCMPUD : avx512_icmp_cc_rmb_vl<0x1E, "ud", X86cmpmu, avx512vl_i32_info,
+ HasAVX512>, EVEX_CD8<32, CD8VF>;
+
+defm VPCMPQ : avx512_icmp_cc_rmb_vl<0x1F, "q", X86cmpm, avx512vl_i64_info,
+ HasAVX512>, VEX_W, EVEX_CD8<64, CD8VF>;
+defm VPCMPUQ : avx512_icmp_cc_rmb_vl<0x1E, "uq", X86cmpmu, avx512vl_i64_info,
+ HasAVX512>, VEX_W, EVEX_CD8<64, CD8VF>;
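+// The immediate comparison predicate for VPCMP/VPCMPU encodes, in order:
+// eq (0), lt (1), le (2), false (3), neq (4), nlt (5), nle (6), true (7).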
+
+multiclass avx512_vcmp_common<X86VectorVTInfo _> {
+
+ defm rri : AVX512_maskable_cmp<0xC2, MRMSrcReg, _,
+ (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2,AVXCC:$cc),
+ "vcmp${cc}"#_.Suffix,
+ "$src2, $src1", "$src1, $src2",
+ (X86cmpm (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ imm:$cc)>;
+
+ let mayLoad = 1 in {
+ defm rmi : AVX512_maskable_cmp<0xC2, MRMSrcMem, _,
+ (outs _.KRC:$dst),(ins _.RC:$src1, _.MemOp:$src2, AVXCC:$cc),
+ "vcmp${cc}"#_.Suffix,
+ "$src2, $src1", "$src1, $src2",
+ (X86cmpm (_.VT _.RC:$src1),
+ (_.VT (bitconvert (_.LdFrag addr:$src2))),
+ imm:$cc)>;
+
+ defm rmbi : AVX512_maskable_cmp<0xC2, MRMSrcMem, _,
+ (outs _.KRC:$dst),
+ (ins _.RC:$src1, _.ScalarMemOp:$src2, AVXCC:$cc),
+ "vcmp${cc}"#_.Suffix,
+ "${src2}"##_.BroadcastStr##", $src1",
+ "$src1, ${src2}"##_.BroadcastStr,
+ (X86cmpm (_.VT _.RC:$src1),
+ (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))),
+ imm:$cc)>,EVEX_B;
+ }
+ // Accept explicit immediate argument form instead of comparison code.
+ let isAsmParserOnly = 1, hasSideEffects = 0 in {
+ defm rri_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcReg, _,
+ (outs _.KRC:$dst),
+ (ins _.RC:$src1, _.RC:$src2, u8imm:$cc),
+ "vcmp"#_.Suffix,
+ "$cc, $src2, $src1", "$src1, $src2, $cc">;
+
+ let mayLoad = 1 in {
+ defm rmi_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcMem, _,
+ (outs _.KRC:$dst),
+ (ins _.RC:$src1, _.MemOp:$src2, u8imm:$cc),
+ "vcmp"#_.Suffix,
+ "$cc, $src2, $src1", "$src1, $src2, $cc">;
+
+ defm rmbi_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcMem, _,
+ (outs _.KRC:$dst),
+ (ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$cc),
+ "vcmp"#_.Suffix,
+ "$cc, ${src2}"##_.BroadcastStr##", $src1",
+ "$src1, ${src2}"##_.BroadcastStr##", $cc">,EVEX_B;
+ }
+ }
+}
+
+multiclass avx512_vcmp_sae<X86VectorVTInfo _> {
+  // Comparison code form (VCMP[EQ/LT/LE/...]).
+ defm rrib : AVX512_maskable_cmp<0xC2, MRMSrcReg, _,
+ (outs _.KRC:$dst),(ins _.RC:$src1, _.RC:$src2, AVXCC:$cc),
+ "vcmp${cc}"#_.Suffix,
+ "{sae}, $src2, $src1", "$src1, $src2,{sae}",
+ (X86cmpmRnd (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ imm:$cc,
+ (i32 FROUND_NO_EXC))>, EVEX_B;
+
+ let isAsmParserOnly = 1, hasSideEffects = 0 in {
+ defm rrib_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcReg, _,
+ (outs _.KRC:$dst),
+ (ins _.RC:$src1, _.RC:$src2, u8imm:$cc),
+ "vcmp"#_.Suffix,
+ "$cc,{sae}, $src2, $src1",
+ "$src1, $src2,{sae}, $cc">, EVEX_B;
+ }
+}
+
+multiclass avx512_vcmp<AVX512VLVectorVTInfo _> {
+ let Predicates = [HasAVX512] in {
+ defm Z : avx512_vcmp_common<_.info512>,
+ avx512_vcmp_sae<_.info512>, EVEX_V512;
+
+ }
+ let Predicates = [HasAVX512,HasVLX] in {
+ defm Z128 : avx512_vcmp_common<_.info128>, EVEX_V128;
+ defm Z256 : avx512_vcmp_common<_.info256>, EVEX_V256;
+ }
+}
+
+defm VCMPPD : avx512_vcmp<avx512vl_f64_info>,
+ AVX512PDIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W;
+defm VCMPPS : avx512_vcmp<avx512vl_f32_info>,
+ AVX512PSIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>;
+
+def : Pat<(v8i1 (X86cmpm (v8f32 VR256X:$src1), (v8f32 VR256X:$src2), imm:$cc)),
+ (COPY_TO_REGCLASS (VCMPPSZrri
+ (v16f32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)),
+ (v16f32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm)),
+ imm:$cc), VK8)>;
+def : Pat<(v8i1 (X86cmpm (v8i32 VR256X:$src1), (v8i32 VR256X:$src2), imm:$cc)),
+ (COPY_TO_REGCLASS (VPCMPDZrri
+ (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)),
+ (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm)),
+ imm:$cc), VK8)>;
+def : Pat<(v8i1 (X86cmpmu (v8i32 VR256X:$src1), (v8i32 VR256X:$src2), imm:$cc)),
+ (COPY_TO_REGCLASS (VPCMPUDZrri
+ (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)),
+ (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm)),
+ imm:$cc), VK8)>;
+
+//===----------------------------------------------------------------------===//
+// FPClass
+//===----------------------------------------------------------------------===//
+// Handle the scalar fpclass instruction: mask = op(reg_scalar, imm)
+//                                               op(mem_scalar, imm)
+multiclass avx512_scalar_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _, Predicate prd> {
+ let Predicates = [prd] in {
+    def rr : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst),
+ (ins _.RC:$src1, i32u8imm:$src2),
+ OpcodeStr##_.Suffix#"\t{$src2, $src1, $dst | $dst, $src1, $src2}",
+ [(set _.KRC:$dst,(OpNode (_.VT _.RC:$src1),
+ (i32 imm:$src2)))], NoItinerary>;
+ def rrk : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst),
+ (ins _.KRCWM:$mask, _.RC:$src1, i32u8imm:$src2),
+ OpcodeStr##_.Suffix#
+ "\t{$src2, $src1, $dst {${mask}} | $dst {${mask}}, $src1, $src2}",
+ [(set _.KRC:$dst,(or _.KRCWM:$mask,
+ (OpNode (_.VT _.RC:$src1),
+ (i32 imm:$src2))))], NoItinerary>, EVEX_K;
+ let mayLoad = 1, AddedComplexity = 20 in {
+ def rm : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
+ (ins _.MemOp:$src1, i32u8imm:$src2),
+ OpcodeStr##_.Suffix##
+ "\t{$src2, $src1, $dst | $dst, $src1, $src2}",
+ [(set _.KRC:$dst,
+ (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))),
+ (i32 imm:$src2)))], NoItinerary>;
+ def rmk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
+ (ins _.KRCWM:$mask, _.MemOp:$src1, i32u8imm:$src2),
+ OpcodeStr##_.Suffix##
+ "\t{$src2, $src1, $dst {${mask}} | $dst {${mask}}, $src1, $src2}",
+ [(set _.KRC:$dst,(or _.KRCWM:$mask,
+ (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))),
+ (i32 imm:$src2))))], NoItinerary>, EVEX_K;
+ }
+ }
+}
+
+// Handle the vector fpclass instruction: mask = fpclass(reg_vec, imm)
+//                                               fpclass(mem_vec, imm)
+//                                               fpclass(broadcast(eltVt), imm)
+multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _, string mem, string broadcast>{
+ def rr : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst),
+ (ins _.RC:$src1, i32u8imm:$src2),
+ OpcodeStr##_.Suffix#"\t{$src2, $src1, $dst | $dst, $src1, $src2}",
+ [(set _.KRC:$dst,(OpNode (_.VT _.RC:$src1),
+ (i32 imm:$src2)))], NoItinerary>;
+ def rrk : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst),
+ (ins _.KRCWM:$mask, _.RC:$src1, i32u8imm:$src2),
+ OpcodeStr##_.Suffix#
+ "\t{$src2, $src1, $dst {${mask}}| $dst {${mask}}, $src1, $src2}",
+ [(set _.KRC:$dst,(or _.KRCWM:$mask,
+ (OpNode (_.VT _.RC:$src1),
+ (i32 imm:$src2))))], NoItinerary>, EVEX_K;
+ let mayLoad = 1 in {
+ def rm : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
+ (ins _.MemOp:$src1, i32u8imm:$src2),
+ OpcodeStr##_.Suffix##mem#
+ "\t{$src2, $src1, $dst | $dst, $src1, $src2}",
+ [(set _.KRC:$dst,(OpNode
+ (_.VT (bitconvert (_.LdFrag addr:$src1))),
+ (i32 imm:$src2)))], NoItinerary>;
+ def rmk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
+ (ins _.KRCWM:$mask, _.MemOp:$src1, i32u8imm:$src2),
+ OpcodeStr##_.Suffix##mem#
+ "\t{$src2, $src1, $dst {${mask}} | $dst {${mask}}, $src1, $src2}",
+ [(set _.KRC:$dst, (or _.KRCWM:$mask, (OpNode
+ (_.VT (bitconvert (_.LdFrag addr:$src1))),
+ (i32 imm:$src2))))], NoItinerary>, EVEX_K;
+ def rmb : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
+ (ins _.ScalarMemOp:$src1, i32u8imm:$src2),
+ OpcodeStr##_.Suffix##broadcast##"\t{$src2, ${src1}"##
+ _.BroadcastStr##", $dst | $dst, ${src1}"
+ ##_.BroadcastStr##", $src2}",
+ [(set _.KRC:$dst,(OpNode
+ (_.VT (X86VBroadcast
+ (_.ScalarLdFrag addr:$src1))),
+ (i32 imm:$src2)))], NoItinerary>,EVEX_B;
+ def rmbk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
+ (ins _.KRCWM:$mask, _.ScalarMemOp:$src1, i32u8imm:$src2),
+ OpcodeStr##_.Suffix##broadcast##"\t{$src2, ${src1}"##
+ _.BroadcastStr##", $dst {${mask}} | $dst {${mask}}, ${src1}"##
+ _.BroadcastStr##", $src2}",
+ [(set _.KRC:$dst,(or _.KRCWM:$mask, (OpNode
+ (_.VT (X86VBroadcast
+ (_.ScalarLdFrag addr:$src1))),
+ (i32 imm:$src2))))], NoItinerary>,
+ EVEX_B, EVEX_K;
+ }
+}
+
+multiclass avx512_vector_fpclass_all<string OpcodeStr,
+ AVX512VLVectorVTInfo _, bits<8> opc, SDNode OpNode, Predicate prd,
+ string broadcast>{
+ let Predicates = [prd] in {
+ defm Z : avx512_vector_fpclass<opc, OpcodeStr, OpNode, _.info512, "{z}",
+ broadcast>, EVEX_V512;
+ }
+ let Predicates = [prd, HasVLX] in {
+ defm Z128 : avx512_vector_fpclass<opc, OpcodeStr, OpNode, _.info128, "{x}",
+ broadcast>, EVEX_V128;
+ defm Z256 : avx512_vector_fpclass<opc, OpcodeStr, OpNode, _.info256, "{y}",
+ broadcast>, EVEX_V256;
+ }
+}
+
+multiclass avx512_fp_fpclass_all<string OpcodeStr, bits<8> opcVec,
+ bits<8> opcScalar, SDNode VecOpNode, SDNode ScalarOpNode, Predicate prd>{
+ defm PS : avx512_vector_fpclass_all<OpcodeStr, avx512vl_f32_info, opcVec,
+ VecOpNode, prd, "{l}">, EVEX_CD8<32, CD8VF>;
+ defm PD : avx512_vector_fpclass_all<OpcodeStr, avx512vl_f64_info, opcVec,
+ VecOpNode, prd, "{q}">,EVEX_CD8<64, CD8VF> , VEX_W;
+ defm SS : avx512_scalar_fpclass<opcScalar, OpcodeStr, ScalarOpNode,
+ f32x_info, prd>, EVEX_CD8<32, CD8VT1>;
+ defm SD : avx512_scalar_fpclass<opcScalar, OpcodeStr, ScalarOpNode,
+ f64x_info, prd>, EVEX_CD8<64, CD8VT1>, VEX_W;
+}
+
+defm VFPCLASS : avx512_fp_fpclass_all<"vfpclass", 0x66, 0x67, X86Vfpclass,
+ X86Vfpclasss, HasDQI>, AVX512AIi8Base,EVEX;
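+// The fpclass immediate selects the categories to test for: bits 0..7 check
+// QNaN, +0, -0, +Inf, -Inf, denormal, negative finite and SNaN respectively;
+// the mask bit is set if the value matches any selected category.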
+
+//===----------------------------------------------------------------------===//
+// Mask register copy, including
+// - copy between mask registers
+// - load/store mask registers
+// - copy from GPR to mask register and vice versa
+//
+multiclass avx512_mask_mov<bits<8> opc_kk, bits<8> opc_km, bits<8> opc_mk,
+ string OpcodeStr, RegisterClass KRC,
+ ValueType vvt, X86MemOperand x86memop> {
+ let hasSideEffects = 0 in {
+ def kk : I<opc_kk, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>;
+ let mayLoad = 1 in
+ def km : I<opc_km, MRMSrcMem, (outs KRC:$dst), (ins x86memop:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set KRC:$dst, (vvt (load addr:$src)))]>;
+ let mayStore = 1 in
+ def mk : I<opc_mk, MRMDestMem, (outs), (ins x86memop:$dst, KRC:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(store KRC:$src, addr:$dst)]>;
+ }
+}
+
+multiclass avx512_mask_mov_gpr<bits<8> opc_kr, bits<8> opc_rk,
+ string OpcodeStr,
+ RegisterClass KRC, RegisterClass GRC> {
+ let hasSideEffects = 0 in {
+ def kr : I<opc_kr, MRMSrcReg, (outs KRC:$dst), (ins GRC:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>;
+ def rk : I<opc_rk, MRMSrcReg, (outs GRC:$dst), (ins KRC:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>;
+ }
+}
+
+let Predicates = [HasDQI] in
+ defm KMOVB : avx512_mask_mov<0x90, 0x90, 0x91, "kmovb", VK8, v8i1, i8mem>,
+ avx512_mask_mov_gpr<0x92, 0x93, "kmovb", VK8, GR32>,
+ VEX, PD;
+
+let Predicates = [HasAVX512] in
+ defm KMOVW : avx512_mask_mov<0x90, 0x90, 0x91, "kmovw", VK16, v16i1, i16mem>,
+ avx512_mask_mov_gpr<0x92, 0x93, "kmovw", VK16, GR32>,
+ VEX, PS;
+
+let Predicates = [HasBWI] in {
+ defm KMOVD : avx512_mask_mov<0x90, 0x90, 0x91, "kmovd", VK32, v32i1,i32mem>,
+ VEX, PD, VEX_W;
+ defm KMOVD : avx512_mask_mov_gpr<0x92, 0x93, "kmovd", VK32, GR32>,
+ VEX, XD;
+}
+
+let Predicates = [HasBWI] in {
+ defm KMOVQ : avx512_mask_mov<0x90, 0x90, 0x91, "kmovq", VK64, v64i1, i64mem>,
+ VEX, PS, VEX_W;
+ defm KMOVQ : avx512_mask_mov_gpr<0x92, 0x93, "kmovq", VK64, GR64>,
+ VEX, XD, VEX_W;
+}
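+// KMOV moves mask values between k-registers, memory and GPRs, e.g.
+// (illustrative) kmovw k1, eax / kmovw eax, k1 / kmovw k1, word ptr [rax].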
+
+// GR from/to mask register
+let Predicates = [HasDQI] in {
+ def : Pat<(v8i1 (bitconvert (i8 GR8:$src))),
+ (KMOVBkr (SUBREG_TO_REG (i32 0), GR8:$src, sub_8bit))>;
+ def : Pat<(i8 (bitconvert (v8i1 VK8:$src))),
+ (EXTRACT_SUBREG (KMOVBrk VK8:$src), sub_8bit)>;
+}
+let Predicates = [HasAVX512] in {
+ def : Pat<(v16i1 (bitconvert (i16 GR16:$src))),
+ (KMOVWkr (SUBREG_TO_REG (i32 0), GR16:$src, sub_16bit))>;
+ def : Pat<(i16 (bitconvert (v16i1 VK16:$src))),
+ (EXTRACT_SUBREG (KMOVWrk VK16:$src), sub_16bit)>;
+}
+let Predicates = [HasBWI] in {
+ def : Pat<(v32i1 (bitconvert (i32 GR32:$src))), (KMOVDkr GR32:$src)>;
+ def : Pat<(i32 (bitconvert (v32i1 VK32:$src))), (KMOVDrk VK32:$src)>;
+}
+let Predicates = [HasBWI] in {
+ def : Pat<(v64i1 (bitconvert (i64 GR64:$src))), (KMOVQkr GR64:$src)>;
+ def : Pat<(i64 (bitconvert (v64i1 VK64:$src))), (KMOVQrk VK64:$src)>;
+}
+
+// Load/store kreg
+let Predicates = [HasDQI] in {
+ def : Pat<(store (i8 (bitconvert (v8i1 VK8:$src))), addr:$dst),
+ (KMOVBmk addr:$dst, VK8:$src)>;
+ def : Pat<(v8i1 (bitconvert (i8 (load addr:$src)))),
+ (KMOVBkm addr:$src)>;
+
+ def : Pat<(store VK4:$src, addr:$dst),
+ (KMOVBmk addr:$dst, (COPY_TO_REGCLASS VK4:$src, VK8))>;
+ def : Pat<(store VK2:$src, addr:$dst),
+ (KMOVBmk addr:$dst, (COPY_TO_REGCLASS VK2:$src, VK8))>;
+}
+let Predicates = [HasAVX512, NoDQI] in {
+ def : Pat<(store (i8 (bitconvert (v8i1 VK8:$src))), addr:$dst),
+ (KMOVWmk addr:$dst, (COPY_TO_REGCLASS VK8:$src, VK16))>;
+ def : Pat<(v8i1 (bitconvert (i8 (load addr:$src)))),
+ (COPY_TO_REGCLASS (KMOVWkm addr:$src), VK8)>;
+}
+let Predicates = [HasAVX512] in {
+ def : Pat<(store (i16 (bitconvert (v16i1 VK16:$src))), addr:$dst),
+ (KMOVWmk addr:$dst, VK16:$src)>;
+ def : Pat<(i1 (load addr:$src)),
+ (COPY_TO_REGCLASS (AND16ri (i16 (SUBREG_TO_REG (i32 0),
+ (MOV8rm addr:$src), sub_8bit)),
+ (i16 1)), VK1)>;
+ def : Pat<(v16i1 (bitconvert (i16 (load addr:$src)))),
+ (KMOVWkm addr:$src)>;
+}
+let Predicates = [HasBWI] in {
+ def : Pat<(store (i32 (bitconvert (v32i1 VK32:$src))), addr:$dst),
+ (KMOVDmk addr:$dst, VK32:$src)>;
+ def : Pat<(v32i1 (bitconvert (i32 (load addr:$src)))),
+ (KMOVDkm addr:$src)>;
+}
+let Predicates = [HasBWI] in {
+ def : Pat<(store (i64 (bitconvert (v64i1 VK64:$src))), addr:$dst),
+ (KMOVQmk addr:$dst, VK64:$src)>;
+ def : Pat<(v64i1 (bitconvert (i64 (load addr:$src)))),
+ (KMOVQkm addr:$src)>;
+}
+
+let Predicates = [HasAVX512] in {
+ def : Pat<(i1 (trunc (i64 GR64:$src))),
+ (COPY_TO_REGCLASS (KMOVWkr (AND32ri (EXTRACT_SUBREG $src, sub_32bit),
+ (i32 1))), VK1)>;
+
+ def : Pat<(i1 (trunc (i32 GR32:$src))),
+ (COPY_TO_REGCLASS (KMOVWkr (AND32ri $src, (i32 1))), VK1)>;
+
+ def : Pat<(i1 (trunc (i8 GR8:$src))),
+ (COPY_TO_REGCLASS
+ (KMOVWkr (AND32ri (SUBREG_TO_REG (i32 0), GR8:$src, sub_8bit), (i32 1))),
+ VK1)>;
+ def : Pat<(i1 (trunc (i16 GR16:$src))),
+ (COPY_TO_REGCLASS
+ (KMOVWkr (AND32ri (SUBREG_TO_REG (i32 0), $src, sub_16bit), (i32 1))),
+ VK1)>;
+
+ def : Pat<(i32 (zext VK1:$src)),
+ (AND32ri (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1))>;
+ def : Pat<(i32 (anyext VK1:$src)),
+ (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16))>;
+
+ def : Pat<(i8 (zext VK1:$src)),
+ (EXTRACT_SUBREG
+ (AND32ri (KMOVWrk
+ (COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1)), sub_8bit)>;
+ def : Pat<(i8 (anyext VK1:$src)),
+ (EXTRACT_SUBREG
+ (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), sub_8bit)>;
+
+ def : Pat<(i64 (zext VK1:$src)),
+ (AND64ri8 (SUBREG_TO_REG (i64 0),
+ (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), sub_32bit), (i64 1))>;
+ def : Pat<(i16 (zext VK1:$src)),
+ (EXTRACT_SUBREG
+ (AND32ri (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1)),
+ sub_16bit)>;
+}
+def : Pat<(v16i1 (scalar_to_vector VK1:$src)),
+ (COPY_TO_REGCLASS VK1:$src, VK16)>;
+def : Pat<(v8i1 (scalar_to_vector VK1:$src)),
+ (COPY_TO_REGCLASS VK1:$src, VK8)>;
+def : Pat<(v4i1 (scalar_to_vector VK1:$src)),
+ (COPY_TO_REGCLASS VK1:$src, VK4)>;
+def : Pat<(v2i1 (scalar_to_vector VK1:$src)),
+ (COPY_TO_REGCLASS VK1:$src, VK2)>;
+def : Pat<(v32i1 (scalar_to_vector VK1:$src)),
+ (COPY_TO_REGCLASS VK1:$src, VK32)>;
+def : Pat<(v64i1 (scalar_to_vector VK1:$src)),
+ (COPY_TO_REGCLASS VK1:$src, VK64)>;
+
+
+// With plain AVX-512 (no DQI), an 8-bit mask is promoted to a 16-bit mask.
+let Predicates = [HasAVX512, NoDQI] in {
+ // GR from/to 8-bit mask without native support
+ def : Pat<(v8i1 (bitconvert (i8 GR8:$src))),
+ (COPY_TO_REGCLASS
+ (KMOVWkr (MOVZX32rr8 GR8 :$src)), VK8)>;
+ def : Pat<(i8 (bitconvert (v8i1 VK8:$src))),
+ (EXTRACT_SUBREG
+ (KMOVWrk (COPY_TO_REGCLASS VK8:$src, VK16)),
+ sub_8bit)>;
+}
+
+let Predicates = [HasAVX512] in {
+ def : Pat<(i1 (X86Vextract VK16:$src, (iPTR 0))),
+ (COPY_TO_REGCLASS VK16:$src, VK1)>;
+ def : Pat<(i1 (X86Vextract VK8:$src, (iPTR 0))),
+ (COPY_TO_REGCLASS VK8:$src, VK1)>;
+}
+let Predicates = [HasBWI] in {
+ def : Pat<(i1 (X86Vextract VK32:$src, (iPTR 0))),
+ (COPY_TO_REGCLASS VK32:$src, VK1)>;
+ def : Pat<(i1 (X86Vextract VK64:$src, (iPTR 0))),
+ (COPY_TO_REGCLASS VK64:$src, VK1)>;
+}
+
+// Mask unary operation
+// - KNOT
+multiclass avx512_mask_unop<bits<8> opc, string OpcodeStr,
+ RegisterClass KRC, SDPatternOperator OpNode,
+ Predicate prd> {
+ let Predicates = [prd] in
+ def rr : I<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set KRC:$dst, (OpNode KRC:$src))]>;
+}
+
+multiclass avx512_mask_unop_all<bits<8> opc, string OpcodeStr,
+ SDPatternOperator OpNode> {
+ defm B : avx512_mask_unop<opc, !strconcat(OpcodeStr, "b"), VK8, OpNode,
+ HasDQI>, VEX, PD;
+ defm W : avx512_mask_unop<opc, !strconcat(OpcodeStr, "w"), VK16, OpNode,
+ HasAVX512>, VEX, PS;
+ defm D : avx512_mask_unop<opc, !strconcat(OpcodeStr, "d"), VK32, OpNode,
+ HasBWI>, VEX, PD, VEX_W;
+ defm Q : avx512_mask_unop<opc, !strconcat(OpcodeStr, "q"), VK64, OpNode,
+ HasBWI>, VEX, PS, VEX_W;
+}
+
+defm KNOT : avx512_mask_unop_all<0x44, "knot", not>;
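+// knot inverts every bit of the source mask, e.g. knotw k1, k2 (illustrative).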
+
+multiclass avx512_mask_unop_int<string IntName, string InstName> {
+ let Predicates = [HasAVX512] in
+ def : Pat<(!cast<Intrinsic>("int_x86_avx512_"##IntName##"_w")
+ (i16 GR16:$src)),
+ (COPY_TO_REGCLASS (!cast<Instruction>(InstName##"Wrr")
+ (v16i1 (COPY_TO_REGCLASS GR16:$src, VK16))), GR16)>;
+}
+defm : avx512_mask_unop_int<"knot", "KNOT">;
+
+let Predicates = [HasDQI] in
+def : Pat<(xor VK8:$src1, (v8i1 immAllOnesV)), (KNOTBrr VK8:$src1)>;
+let Predicates = [HasAVX512] in
+def : Pat<(xor VK16:$src1, (v16i1 immAllOnesV)), (KNOTWrr VK16:$src1)>;
+let Predicates = [HasBWI] in
+def : Pat<(xor VK32:$src1, (v32i1 immAllOnesV)), (KNOTDrr VK32:$src1)>;
+let Predicates = [HasBWI] in
+def : Pat<(xor VK64:$src1, (v64i1 immAllOnesV)), (KNOTQrr VK64:$src1)>;
+
+// KNL does not support KMOVB; the 8-bit mask is promoted to a 16-bit mask.
+let Predicates = [HasAVX512, NoDQI] in {
+def : Pat<(xor VK8:$src1, (v8i1 immAllOnesV)),
+ (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK8:$src1, VK16)), VK8)>;
+def : Pat<(not VK8:$src),
+ (COPY_TO_REGCLASS
+ (KNOTWrr (COPY_TO_REGCLASS VK8:$src, VK16)), VK8)>;
+}
+def : Pat<(xor VK4:$src1, (v4i1 immAllOnesV)),
+ (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK4:$src1, VK16)), VK4)>;
+def : Pat<(xor VK2:$src1, (v2i1 immAllOnesV)),
+ (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK2:$src1, VK16)), VK2)>;
+
+// Mask binary operation
+// - KAND, KANDN, KOR, KXNOR, KXOR
+multiclass avx512_mask_binop<bits<8> opc, string OpcodeStr,
+ RegisterClass KRC, SDPatternOperator OpNode,
+ Predicate prd, bit IsCommutable> {
+ let Predicates = [prd], isCommutable = IsCommutable in
+ def rr : I<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src1, KRC:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set KRC:$dst, (OpNode KRC:$src1, KRC:$src2))]>;
+}
+
+multiclass avx512_mask_binop_all<bits<8> opc, string OpcodeStr,
+ SDPatternOperator OpNode, bit IsCommutable,
+ Predicate prdW = HasAVX512> {
+ defm B : avx512_mask_binop<opc, !strconcat(OpcodeStr, "b"), VK8, OpNode,
+ HasDQI, IsCommutable>, VEX_4V, VEX_L, PD;
+ defm W : avx512_mask_binop<opc, !strconcat(OpcodeStr, "w"), VK16, OpNode,
+ prdW, IsCommutable>, VEX_4V, VEX_L, PS;
+ defm D : avx512_mask_binop<opc, !strconcat(OpcodeStr, "d"), VK32, OpNode,
+ HasBWI, IsCommutable>, VEX_4V, VEX_L, VEX_W, PD;
+ defm Q : avx512_mask_binop<opc, !strconcat(OpcodeStr, "q"), VK64, OpNode,
+ HasBWI, IsCommutable>, VEX_4V, VEX_L, VEX_W, PS;
+}
+
+def andn : PatFrag<(ops node:$i0, node:$i1), (and (not node:$i0), node:$i1)>;
+def xnor : PatFrag<(ops node:$i0, node:$i1), (not (xor node:$i0, node:$i1))>;
+
+defm KAND : avx512_mask_binop_all<0x41, "kand", and, 1>;
+defm KOR : avx512_mask_binop_all<0x45, "kor", or, 1>;
+defm KXNOR : avx512_mask_binop_all<0x46, "kxnor", xnor, 1>;
+defm KXOR : avx512_mask_binop_all<0x47, "kxor", xor, 1>;
+defm KANDN : avx512_mask_binop_all<0x42, "kandn", andn, 0>;
+defm KADD : avx512_mask_binop_all<0x4A, "kadd", add, 1, HasDQI>;
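+// As with KNOT, each defm expands into B/W/D/Q register forms (KANDWrr,
+// KXORQrr, ...): the byte form is gated on DQI, the word form on prdW
+// (AVX512F by default, DQI for KADD), and the dword/qword forms on BWI.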
+
+multiclass avx512_mask_binop_int<string IntName, string InstName> {
+ let Predicates = [HasAVX512] in
+ def : Pat<(!cast<Intrinsic>("int_x86_avx512_"##IntName##"_w")
+ (i16 GR16:$src1), (i16 GR16:$src2)),
+ (COPY_TO_REGCLASS (!cast<Instruction>(InstName##"Wrr")
+ (v16i1 (COPY_TO_REGCLASS GR16:$src1, VK16)),
+ (v16i1 (COPY_TO_REGCLASS GR16:$src2, VK16))), GR16)>;
+}
+
+defm : avx512_mask_binop_int<"kand", "KAND">;
+defm : avx512_mask_binop_int<"kandn", "KANDN">;
+defm : avx512_mask_binop_int<"kor", "KOR">;
+defm : avx512_mask_binop_int<"kxnor", "KXNOR">;
+defm : avx512_mask_binop_int<"kxor", "KXOR">;
+
+multiclass avx512_binop_pat<SDPatternOperator OpNode, Instruction Inst> {
+  // With AVX512F, an 8-bit mask is promoted to a 16-bit mask;
+  // with the DQI extension this type is legal and the KxxxB instructions
+  // are used instead.
+ let Predicates = [NoDQI] in
+ def : Pat<(OpNode VK8:$src1, VK8:$src2),
+ (COPY_TO_REGCLASS
+ (Inst (COPY_TO_REGCLASS VK8:$src1, VK16),
+ (COPY_TO_REGCLASS VK8:$src2, VK16)), VK8)>;
+
+ // All types smaller than 8 bits require conversion anyway
+ def : Pat<(OpNode VK1:$src1, VK1:$src2),
+ (COPY_TO_REGCLASS (Inst
+ (COPY_TO_REGCLASS VK1:$src1, VK16),
+ (COPY_TO_REGCLASS VK1:$src2, VK16)), VK1)>;
+  def : Pat<(OpNode VK2:$src1, VK2:$src2),
+            (COPY_TO_REGCLASS (Inst
+                               (COPY_TO_REGCLASS VK2:$src1, VK16),
+                               (COPY_TO_REGCLASS VK2:$src2, VK16)), VK2)>;
+  def : Pat<(OpNode VK4:$src1, VK4:$src2),
+            (COPY_TO_REGCLASS (Inst
+                               (COPY_TO_REGCLASS VK4:$src1, VK16),
+                               (COPY_TO_REGCLASS VK4:$src2, VK16)), VK4)>;
+}
+
+defm : avx512_binop_pat<and, KANDWrr>;
+defm : avx512_binop_pat<andn, KANDNWrr>;
+defm : avx512_binop_pat<or, KORWrr>;
+defm : avx512_binop_pat<xnor, KXNORWrr>;
+defm : avx512_binop_pat<xor, KXORWrr>;
+
+def : Pat<(xor (xor VK16:$src1, VK16:$src2), (v16i1 immAllOnesV)),
+ (KXNORWrr VK16:$src1, VK16:$src2)>;
+def : Pat<(xor (xor VK8:$src1, VK8:$src2), (v8i1 immAllOnesV)),
+ (KXNORBrr VK8:$src1, VK8:$src2)>, Requires<[HasDQI]>;
+def : Pat<(xor (xor VK32:$src1, VK32:$src2), (v32i1 immAllOnesV)),
+ (KXNORDrr VK32:$src1, VK32:$src2)>, Requires<[HasBWI]>;
+def : Pat<(xor (xor VK64:$src1, VK64:$src2), (v64i1 immAllOnesV)),
+ (KXNORQrr VK64:$src1, VK64:$src2)>, Requires<[HasBWI]>;
+
+let Predicates = [NoDQI] in
+def : Pat<(xor (xor VK8:$src1, VK8:$src2), (v8i1 immAllOnesV)),
+ (COPY_TO_REGCLASS (KXNORWrr (COPY_TO_REGCLASS VK8:$src1, VK16),
+ (COPY_TO_REGCLASS VK8:$src2, VK16)), VK8)>;
+
+def : Pat<(xor (xor VK4:$src1, VK4:$src2), (v4i1 immAllOnesV)),
+ (COPY_TO_REGCLASS (KXNORWrr (COPY_TO_REGCLASS VK4:$src1, VK16),
+ (COPY_TO_REGCLASS VK4:$src2, VK16)), VK4)>;
+
+def : Pat<(xor (xor VK2:$src1, VK2:$src2), (v2i1 immAllOnesV)),
+ (COPY_TO_REGCLASS (KXNORWrr (COPY_TO_REGCLASS VK2:$src1, VK16),
+ (COPY_TO_REGCLASS VK2:$src2, VK16)), VK2)>;
+
+def : Pat<(xor (xor VK1:$src1, VK1:$src2), (i1 1)),
+ (COPY_TO_REGCLASS (KXNORWrr (COPY_TO_REGCLASS VK1:$src1, VK16),
+ (COPY_TO_REGCLASS VK1:$src2, VK16)), VK1)>;
+
+// Mask unpacking
+multiclass avx512_mask_unpck<string Suffix,RegisterClass KRC, ValueType VT,
+ RegisterClass KRCSrc, Predicate prd> {
+ let Predicates = [prd] in {
+ let hasSideEffects = 0 in
+ def rr : I<0x4b, MRMSrcReg, (outs KRC:$dst),
+ (ins KRC:$src1, KRC:$src2),
+ "kunpck"#Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
+ VEX_4V, VEX_L;
+
+ def : Pat<(VT (concat_vectors KRCSrc:$src1, KRCSrc:$src2)),
+ (!cast<Instruction>(NAME##rr)
+ (COPY_TO_REGCLASS KRCSrc:$src2, KRC),
+ (COPY_TO_REGCLASS KRCSrc:$src1, KRC))>;
+ }
+}
+
+defm KUNPCKBW : avx512_mask_unpck<"bw", VK16, v16i1, VK8, HasAVX512>, PD;
+defm KUNPCKWD : avx512_mask_unpck<"wd", VK32, v32i1, VK16, HasBWI>, PS;
+defm KUNPCKDQ : avx512_mask_unpck<"dq", VK64, v64i1, VK32, HasBWI>, PS, VEX_W;
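+// The source operands are swapped in the pattern above because KUNPCK puts
+// its second source operand in the low half of the result and its first in
+// the high half, while concat_vectors places $src1 in the low lanes.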
+
+// Mask bit testing
+multiclass avx512_mask_testop<bits<8> opc, string OpcodeStr, RegisterClass KRC,
+ SDNode OpNode, Predicate prd> {
+ let Predicates = [prd], Defs = [EFLAGS] in
+ def rr : I<opc, MRMSrcReg, (outs), (ins KRC:$src1, KRC:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
+ [(set EFLAGS, (OpNode KRC:$src1, KRC:$src2))]>;
+}
+
+multiclass avx512_mask_testop_w<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ Predicate prdW = HasAVX512> {
+ defm B : avx512_mask_testop<opc, OpcodeStr#"b", VK8, OpNode, HasDQI>,
+ VEX, PD;
+ defm W : avx512_mask_testop<opc, OpcodeStr#"w", VK16, OpNode, prdW>,
+ VEX, PS;
+ defm Q : avx512_mask_testop<opc, OpcodeStr#"q", VK64, OpNode, HasBWI>,
+ VEX, PS, VEX_W;
+ defm D : avx512_mask_testop<opc, OpcodeStr#"d", VK32, OpNode, HasBWI>,
+ VEX, PD, VEX_W;
+}
+
+defm KORTEST : avx512_mask_testop_w<0x98, "kortest", X86kortest>;
+defm KTEST : avx512_mask_testop_w<0x99, "ktest", X86ktest, HasDQI>;
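+// Both instructions only set EFLAGS: KORTEST sets ZF when the OR of the two
+// masks is all zeros and CF when it is all ones; KTEST does the same for
+// AND and ANDN respectively.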
+
+// Mask shift
+multiclass avx512_mask_shiftop<bits<8> opc, string OpcodeStr, RegisterClass KRC,
+                               SDNode OpNode, Predicate prd> {
+  let Predicates = [prd] in
+  def ri : Ii8<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src, u8imm:$imm),
+                 !strconcat(OpcodeStr,
+                            "\t{$imm, $src, $dst|$dst, $src, $imm}"),
+                            [(set KRC:$dst, (OpNode KRC:$src, (i8 imm:$imm)))]>;
+}
+
+multiclass avx512_mask_shiftop_w<bits<8> opc1, bits<8> opc2, string OpcodeStr,
+                                 SDNode OpNode> {
+  defm W : avx512_mask_shiftop<opc1, !strconcat(OpcodeStr, "w"), VK16, OpNode,
+                               HasAVX512>, VEX, TAPD, VEX_W;
+  defm B : avx512_mask_shiftop<opc1, !strconcat(OpcodeStr, "b"), VK8, OpNode,
+                               HasDQI>, VEX, TAPD;
+  defm Q : avx512_mask_shiftop<opc2, !strconcat(OpcodeStr, "q"), VK64, OpNode,
+                               HasBWI>, VEX, TAPD, VEX_W;
+  defm D : avx512_mask_shiftop<opc2, !strconcat(OpcodeStr, "d"), VK32, OpNode,
+                               HasBWI>, VEX, TAPD;
+}
+
+defm KSHIFTL : avx512_mask_shiftop_w<0x32, 0x33, "kshiftl", X86vshli>;
+defm KSHIFTR : avx512_mask_shiftop_w<0x30, 0x31, "kshiftr", X86vsrli>;
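+// Note the opcode split above: opc1 (0x32/0x30) encodes the byte and word
+// forms, opc2 (0x33/0x31) the dword and qword forms. The shift count is an
+// 8-bit immediate; a count >= the mask width produces an all-zeros mask.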
+
+// Mask setting all 0s or 1s
+multiclass avx512_mask_setop<RegisterClass KRC, ValueType VT, PatFrag Val> {
+ let Predicates = [HasAVX512] in
+ let isReMaterializable = 1, isAsCheapAsAMove = 1, isPseudo = 1 in
+ def #NAME# : I<0, Pseudo, (outs KRC:$dst), (ins), "",
+ [(set KRC:$dst, (VT Val))]>;
+}
+
+multiclass avx512_mask_setop_w<PatFrag Val> {
+ defm B : avx512_mask_setop<VK8, v8i1, Val>;
+ defm W : avx512_mask_setop<VK16, v16i1, Val>;
+ defm D : avx512_mask_setop<VK32, v32i1, Val>;
+ defm Q : avx512_mask_setop<VK64, v64i1, Val>;
+}
+
+defm KSET0 : avx512_mask_setop_w<immAllZerosV>;
+defm KSET1 : avx512_mask_setop_w<immAllOnesV>;
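+// KSET0/KSET1 are pseudos with no encoding; they are typically expanded
+// after register allocation to an idiom such as kxor/kxnor of a mask
+// register with itself (see expandPostRAPseudo in X86InstrInfo.cpp).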
+
+// With AVX-512 alone, an 8-bit mask is promoted to a 16-bit mask.
+let Predicates = [HasAVX512] in {
+ def : Pat<(v8i1 immAllZerosV), (COPY_TO_REGCLASS (KSET0W), VK8)>;
+ def : Pat<(v8i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK8)>;
+ def : Pat<(v4i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK4)>;
+ def : Pat<(v2i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK2)>;
+ def : Pat<(i1 0), (COPY_TO_REGCLASS (KSET0W), VK1)>;
+ def : Pat<(i1 1), (COPY_TO_REGCLASS (KSHIFTRWri (KSET1W), (i8 15)), VK1)>;
+ def : Pat<(i1 -1), (COPY_TO_REGCLASS (KSHIFTRWri (KSET1W), (i8 15)), VK1)>;
+}
+def : Pat<(v8i1 (extract_subvector (v16i1 VK16:$src), (iPTR 0))),
+ (v8i1 (COPY_TO_REGCLASS VK16:$src, VK8))>;
+
+def : Pat<(v16i1 (insert_subvector undef, (v8i1 VK8:$src), (iPTR 0))),
+ (v16i1 (COPY_TO_REGCLASS VK8:$src, VK16))>;
+
+def : Pat<(v8i1 (extract_subvector (v16i1 VK16:$src), (iPTR 8))),
+ (v8i1 (COPY_TO_REGCLASS (KSHIFTRWri VK16:$src, (i8 8)), VK8))>;
+
+def : Pat<(v16i1 (extract_subvector (v32i1 VK32:$src), (iPTR 0))),
+ (v16i1 (COPY_TO_REGCLASS VK32:$src, VK16))>;
+
+def : Pat<(v16i1 (extract_subvector (v32i1 VK32:$src), (iPTR 16))),
+ (v16i1 (COPY_TO_REGCLASS (KSHIFTRDri VK32:$src, (i8 16)), VK16))>;
+
+def : Pat<(v32i1 (extract_subvector (v64i1 VK64:$src), (iPTR 0))),
+ (v32i1 (COPY_TO_REGCLASS VK64:$src, VK32))>;
+
+def : Pat<(v32i1 (extract_subvector (v64i1 VK64:$src), (iPTR 32))),
+ (v32i1 (COPY_TO_REGCLASS (KSHIFTRQri VK64:$src, (i8 32)), VK32))>;
+
+def : Pat<(v4i1 (extract_subvector (v8i1 VK8:$src), (iPTR 0))),
+ (v4i1 (COPY_TO_REGCLASS VK8:$src, VK4))>;
+
+def : Pat<(v2i1 (extract_subvector (v8i1 VK8:$src), (iPTR 0))),
+ (v2i1 (COPY_TO_REGCLASS VK8:$src, VK2))>;
+
+def : Pat<(v4i1 (insert_subvector undef, (v2i1 VK2:$src), (iPTR 0))),
+ (v4i1 (COPY_TO_REGCLASS VK2:$src, VK4))>;
+
+def : Pat<(v8i1 (insert_subvector undef, (v4i1 VK4:$src), (iPTR 0))),
+ (v8i1 (COPY_TO_REGCLASS VK4:$src, VK8))>;
+def : Pat<(v8i1 (insert_subvector undef, (v2i1 VK2:$src), (iPTR 0))),
+ (v8i1 (COPY_TO_REGCLASS VK2:$src, VK8))>;
+
+def : Pat<(v32i1 (insert_subvector undef, VK2:$src, (iPTR 0))),
+ (v32i1 (COPY_TO_REGCLASS VK2:$src, VK32))>;
+def : Pat<(v32i1 (insert_subvector undef, VK4:$src, (iPTR 0))),
+ (v32i1 (COPY_TO_REGCLASS VK4:$src, VK32))>;
+def : Pat<(v32i1 (insert_subvector undef, VK8:$src, (iPTR 0))),
+ (v32i1 (COPY_TO_REGCLASS VK8:$src, VK32))>;
+def : Pat<(v32i1 (insert_subvector undef, VK16:$src, (iPTR 0))),
+ (v32i1 (COPY_TO_REGCLASS VK16:$src, VK32))>;
+
+def : Pat<(v64i1 (insert_subvector undef, VK2:$src, (iPTR 0))),
+ (v64i1 (COPY_TO_REGCLASS VK2:$src, VK64))>;
+def : Pat<(v64i1 (insert_subvector undef, VK4:$src, (iPTR 0))),
+ (v64i1 (COPY_TO_REGCLASS VK4:$src, VK64))>;
+def : Pat<(v64i1 (insert_subvector undef, VK8:$src, (iPTR 0))),
+ (v64i1 (COPY_TO_REGCLASS VK8:$src, VK64))>;
+def : Pat<(v64i1 (insert_subvector undef, VK16:$src, (iPTR 0))),
+ (v64i1 (COPY_TO_REGCLASS VK16:$src, VK64))>;
+def : Pat<(v64i1 (insert_subvector undef, VK32:$src, (iPTR 0))),
+ (v64i1 (COPY_TO_REGCLASS VK32:$src, VK64))>;
+
+def : Pat<(v8i1 (X86vshli VK8:$src, (i8 imm:$imm))),
+ (v8i1 (COPY_TO_REGCLASS
+ (KSHIFTLWri (COPY_TO_REGCLASS VK8:$src, VK16),
+ (I8Imm $imm)), VK8))>, Requires<[HasAVX512, NoDQI]>;
+
+def : Pat<(v8i1 (X86vsrli VK8:$src, (i8 imm:$imm))),
+ (v8i1 (COPY_TO_REGCLASS
+ (KSHIFTRWri (COPY_TO_REGCLASS VK8:$src, VK16),
+ (I8Imm $imm)), VK8))>, Requires<[HasAVX512, NoDQI]>;
+
+def : Pat<(v4i1 (X86vshli VK4:$src, (i8 imm:$imm))),
+ (v4i1 (COPY_TO_REGCLASS
+ (KSHIFTLWri (COPY_TO_REGCLASS VK4:$src, VK16),
+ (I8Imm $imm)), VK4))>, Requires<[HasAVX512]>;
+
+def : Pat<(v4i1 (X86vsrli VK4:$src, (i8 imm:$imm))),
+ (v4i1 (COPY_TO_REGCLASS
+ (KSHIFTRWri (COPY_TO_REGCLASS VK4:$src, VK16),
+ (I8Imm $imm)), VK4))>, Requires<[HasAVX512]>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 - Aligned and unaligned load and store
+//
+
+multiclass avx512_load<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
+ PatFrag ld_frag, PatFrag mload,
+ bit IsReMaterializable = 1> {
+ let hasSideEffects = 0 in {
+ def rr : AVX512PI<opc, MRMSrcReg, (outs _.RC:$dst), (ins _.RC:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [],
+ _.ExeDomain>, EVEX;
+ def rrkz : AVX512PI<opc, MRMSrcReg, (outs _.RC:$dst),
+ (ins _.KRCWM:$mask, _.RC:$src),
+ !strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}} {z}|",
+ "${dst} {${mask}} {z}, $src}"), [], _.ExeDomain>,
+ EVEX, EVEX_KZ;
+
+ let canFoldAsLoad = 1, isReMaterializable = IsReMaterializable,
+ SchedRW = [WriteLoad] in
+ def rm : AVX512PI<opc, MRMSrcMem, (outs _.RC:$dst), (ins _.MemOp:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set _.RC:$dst, (_.VT (bitconvert (ld_frag addr:$src))))],
+ _.ExeDomain>, EVEX;
+
+ let Constraints = "$src0 = $dst" in {
+ def rrk : AVX512PI<opc, MRMSrcReg, (outs _.RC:$dst),
+ (ins _.RC:$src0, _.KRCWM:$mask, _.RC:$src1),
+ !strconcat(OpcodeStr, "\t{$src1, ${dst} {${mask}}|",
+ "${dst} {${mask}}, $src1}"),
+ [(set _.RC:$dst, (_.VT (vselect _.KRCWM:$mask,
+ (_.VT _.RC:$src1),
+ (_.VT _.RC:$src0))))], _.ExeDomain>,
+ EVEX, EVEX_K;
+ let mayLoad = 1, SchedRW = [WriteLoad] in
+ def rmk : AVX512PI<opc, MRMSrcMem, (outs _.RC:$dst),
+ (ins _.RC:$src0, _.KRCWM:$mask, _.MemOp:$src1),
+ !strconcat(OpcodeStr, "\t{$src1, ${dst} {${mask}}|",
+ "${dst} {${mask}}, $src1}"),
+ [(set _.RC:$dst, (_.VT
+ (vselect _.KRCWM:$mask,
+ (_.VT (bitconvert (ld_frag addr:$src1))),
+ (_.VT _.RC:$src0))))], _.ExeDomain>, EVEX, EVEX_K;
+ }
+ let mayLoad = 1, SchedRW = [WriteLoad] in
+ def rmkz : AVX512PI<opc, MRMSrcMem, (outs _.RC:$dst),
+ (ins _.KRCWM:$mask, _.MemOp:$src),
+ OpcodeStr #"\t{$src, ${dst} {${mask}} {z}|"#
+ "${dst} {${mask}} {z}, $src}",
+ [(set _.RC:$dst, (_.VT (vselect _.KRCWM:$mask,
+ (_.VT (bitconvert (ld_frag addr:$src))), _.ImmAllZerosV)))],
+ _.ExeDomain>, EVEX, EVEX_KZ;
+ }
+ def : Pat<(_.VT (mload addr:$ptr, _.KRCWM:$mask, undef)),
+ (!cast<Instruction>(NAME#_.ZSuffix##rmkz) _.KRCWM:$mask, addr:$ptr)>;
+
+ def : Pat<(_.VT (mload addr:$ptr, _.KRCWM:$mask, _.ImmAllZerosV)),
+ (!cast<Instruction>(NAME#_.ZSuffix##rmkz) _.KRCWM:$mask, addr:$ptr)>;
+
+ def : Pat<(_.VT (mload addr:$ptr, _.KRCWM:$mask, (_.VT _.RC:$src0))),
+ (!cast<Instruction>(NAME#_.ZSuffix##rmk) _.RC:$src0,
+ _.KRCWM:$mask, addr:$ptr)>;
+}
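+// Naming convention for the variants defined above: rr/rm are the plain
+// register and memory forms, a "k" suffix adds merge-masking ({k1}, with
+// $src0 tied to $dst) and "kz" adds zero-masking ({k1}{z}). Masked loads
+// are matched both as vselect of a load and via the masked-load PatFrags.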
+
+multiclass avx512_alignedload_vl<bits<8> opc, string OpcodeStr,
+ AVX512VLVectorVTInfo _,
+ Predicate prd,
+ bit IsReMaterializable = 1> {
+ let Predicates = [prd] in
+ defm Z : avx512_load<opc, OpcodeStr, _.info512, _.info512.AlignedLdFrag,
+ masked_load_aligned512, IsReMaterializable>, EVEX_V512;
+
+ let Predicates = [prd, HasVLX] in {
+ defm Z256 : avx512_load<opc, OpcodeStr, _.info256, _.info256.AlignedLdFrag,
+ masked_load_aligned256, IsReMaterializable>, EVEX_V256;
+ defm Z128 : avx512_load<opc, OpcodeStr, _.info128, _.info128.AlignedLdFrag,
+ masked_load_aligned128, IsReMaterializable>, EVEX_V128;
+ }
+}
+
+multiclass avx512_load_vl<bits<8> opc, string OpcodeStr,
+ AVX512VLVectorVTInfo _,
+ Predicate prd,
+ bit IsReMaterializable = 1> {
+ let Predicates = [prd] in
+ defm Z : avx512_load<opc, OpcodeStr, _.info512, _.info512.LdFrag,
+ masked_load_unaligned, IsReMaterializable>, EVEX_V512;
+
+ let Predicates = [prd, HasVLX] in {
+ defm Z256 : avx512_load<opc, OpcodeStr, _.info256, _.info256.LdFrag,
+ masked_load_unaligned, IsReMaterializable>, EVEX_V256;
+ defm Z128 : avx512_load<opc, OpcodeStr, _.info128, _.info128.LdFrag,
+ masked_load_unaligned, IsReMaterializable>, EVEX_V128;
+ }
+}
+
+multiclass avx512_store<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
+ PatFrag st_frag, PatFrag mstore> {
+
+ def rr_REV : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst), (ins _.RC:$src),
+ OpcodeStr # ".s\t{$src, $dst|$dst, $src}",
+ [], _.ExeDomain>, EVEX;
+ def rrk_REV : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst),
+ (ins _.KRCWM:$mask, _.RC:$src),
+ OpcodeStr # ".s\t{$src, ${dst} {${mask}}|"#
+ "${dst} {${mask}}, $src}",
+ [], _.ExeDomain>, EVEX, EVEX_K;
+ def rrkz_REV : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst),
+ (ins _.KRCWM:$mask, _.RC:$src),
+ OpcodeStr # ".s\t{$src, ${dst} {${mask}} {z}|" #
+ "${dst} {${mask}} {z}, $src}",
+ [], _.ExeDomain>, EVEX, EVEX_KZ;
+
+ let mayStore = 1 in {
+ def mr : AVX512PI<opc, MRMDestMem, (outs), (ins _.MemOp:$dst, _.RC:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(st_frag (_.VT _.RC:$src), addr:$dst)], _.ExeDomain>, EVEX;
+ def mrk : AVX512PI<opc, MRMDestMem, (outs),
+ (ins _.MemOp:$dst, _.KRCWM:$mask, _.RC:$src),
+ OpcodeStr # "\t{$src, ${dst} {${mask}}|${dst} {${mask}}, $src}",
+ [], _.ExeDomain>, EVEX, EVEX_K;
+ }
+
+ def: Pat<(mstore addr:$ptr, _.KRCWM:$mask, (_.VT _.RC:$src)),
+ (!cast<Instruction>(NAME#_.ZSuffix##mrk) addr:$ptr,
+ _.KRCWM:$mask, _.RC:$src)>;
+}
+
+multiclass avx512_store_vl<bits<8> opc, string OpcodeStr,
+ AVX512VLVectorVTInfo _, Predicate prd> {
+ let Predicates = [prd] in
+ defm Z : avx512_store<opc, OpcodeStr, _.info512, store,
+ masked_store_unaligned>, EVEX_V512;
+
+ let Predicates = [prd, HasVLX] in {
+ defm Z256 : avx512_store<opc, OpcodeStr, _.info256, store,
+ masked_store_unaligned>, EVEX_V256;
+ defm Z128 : avx512_store<opc, OpcodeStr, _.info128, store,
+ masked_store_unaligned>, EVEX_V128;
+ }
+}
+
+multiclass avx512_alignedstore_vl<bits<8> opc, string OpcodeStr,
+ AVX512VLVectorVTInfo _, Predicate prd> {
+ let Predicates = [prd] in
+ defm Z : avx512_store<opc, OpcodeStr, _.info512, alignedstore512,
+ masked_store_aligned512>, EVEX_V512;
+
+ let Predicates = [prd, HasVLX] in {
+ defm Z256 : avx512_store<opc, OpcodeStr, _.info256, alignedstore256,
+ masked_store_aligned256>, EVEX_V256;
+ defm Z128 : avx512_store<opc, OpcodeStr, _.info128, alignedstore,
+ masked_store_aligned128>, EVEX_V128;
+ }
+}
+
+defm VMOVAPS : avx512_alignedload_vl<0x28, "vmovaps", avx512vl_f32_info,
+ HasAVX512>,
+ avx512_alignedstore_vl<0x29, "vmovaps", avx512vl_f32_info,
+ HasAVX512>, PS, EVEX_CD8<32, CD8VF>;
+
+defm VMOVAPD : avx512_alignedload_vl<0x28, "vmovapd", avx512vl_f64_info,
+ HasAVX512>,
+ avx512_alignedstore_vl<0x29, "vmovapd", avx512vl_f64_info,
+ HasAVX512>, PD, VEX_W, EVEX_CD8<64, CD8VF>;
+
+defm VMOVUPS : avx512_load_vl<0x10, "vmovups", avx512vl_f32_info, HasAVX512>,
+ avx512_store_vl<0x11, "vmovups", avx512vl_f32_info, HasAVX512>,
+ PS, EVEX_CD8<32, CD8VF>;
+
+defm VMOVUPD : avx512_load_vl<0x10, "vmovupd", avx512vl_f64_info, HasAVX512, 0>,
+ avx512_store_vl<0x11, "vmovupd", avx512vl_f64_info, HasAVX512>,
+ PD, VEX_W, EVEX_CD8<64, CD8VF>;
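+// For example, the defs above cover Intel-syntax assembly such as
+//   vmovaps zmm0 {k1} {z}, zmmword ptr [rax]   (aligned, zero-masked load)
+//   vmovups zmmword ptr [rax] {k1}, zmm0       (merge-masked store)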
+
+def: Pat<(v8f64 (int_x86_avx512_mask_loadu_pd_512 addr:$ptr,
+ (bc_v8f64 (v16i32 immAllZerosV)), GR8:$mask)),
+ (VMOVUPDZrmkz (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), addr:$ptr)>;
+
+def: Pat<(v16f32 (int_x86_avx512_mask_loadu_ps_512 addr:$ptr,
+ (bc_v16f32 (v16i32 immAllZerosV)), GR16:$mask)),
+ (VMOVUPSZrmkz (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), addr:$ptr)>;
+
+def: Pat<(v8f64 (int_x86_avx512_mask_load_pd_512 addr:$ptr,
+ (bc_v8f64 (v16i32 immAllZerosV)), GR8:$mask)),
+ (VMOVAPDZrmkz (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), addr:$ptr)>;
+
+def: Pat<(v16f32 (int_x86_avx512_mask_load_ps_512 addr:$ptr,
+ (bc_v16f32 (v16i32 immAllZerosV)), GR16:$mask)),
+ (VMOVAPSZrmkz (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), addr:$ptr)>;
+
+def: Pat<(v8f64 (int_x86_avx512_mask_load_pd_512 addr:$ptr,
+ (bc_v8f64 (v16i32 immAllZerosV)), (i8 -1))),
+ (VMOVAPDZrm addr:$ptr)>;
+
+def: Pat<(v16f32 (int_x86_avx512_mask_load_ps_512 addr:$ptr,
+ (bc_v16f32 (v16i32 immAllZerosV)), (i16 -1))),
+ (VMOVAPSZrm addr:$ptr)>;
+
+def: Pat<(int_x86_avx512_mask_storeu_ps_512 addr:$ptr, (v16f32 VR512:$src),
+ GR16:$mask),
+ (VMOVUPSZmrk addr:$ptr, (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)),
+ VR512:$src)>;
+def: Pat<(int_x86_avx512_mask_storeu_pd_512 addr:$ptr, (v8f64 VR512:$src),
+ GR8:$mask),
+ (VMOVUPDZmrk addr:$ptr, (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)),
+ VR512:$src)>;
+
+def: Pat<(int_x86_avx512_mask_store_ps_512 addr:$ptr, (v16f32 VR512:$src),
+ GR16:$mask),
+ (VMOVAPSZmrk addr:$ptr, (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)),
+ VR512:$src)>;
+def: Pat<(int_x86_avx512_mask_store_pd_512 addr:$ptr, (v8f64 VR512:$src),
+ GR8:$mask),
+ (VMOVAPDZmrk addr:$ptr, (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)),
+ VR512:$src)>;
+
+defm VMOVDQA32 : avx512_alignedload_vl<0x6F, "vmovdqa32", avx512vl_i32_info,
+ HasAVX512>,
+ avx512_alignedstore_vl<0x7F, "vmovdqa32", avx512vl_i32_info,
+ HasAVX512>, PD, EVEX_CD8<32, CD8VF>;
+
+defm VMOVDQA64 : avx512_alignedload_vl<0x6F, "vmovdqa64", avx512vl_i64_info,
+ HasAVX512>,
+ avx512_alignedstore_vl<0x7F, "vmovdqa64", avx512vl_i64_info,
+ HasAVX512>, PD, VEX_W, EVEX_CD8<64, CD8VF>;
+
+defm VMOVDQU8 : avx512_load_vl<0x6F, "vmovdqu8", avx512vl_i8_info, HasBWI>,
+ avx512_store_vl<0x7F, "vmovdqu8", avx512vl_i8_info,
+ HasBWI>, XD, EVEX_CD8<8, CD8VF>;
+
+defm VMOVDQU16 : avx512_load_vl<0x6F, "vmovdqu16", avx512vl_i16_info, HasBWI>,
+ avx512_store_vl<0x7F, "vmovdqu16", avx512vl_i16_info,
+ HasBWI>, XD, VEX_W, EVEX_CD8<16, CD8VF>;
+
+defm VMOVDQU32 : avx512_load_vl<0x6F, "vmovdqu32", avx512vl_i32_info, HasAVX512>,
+ avx512_store_vl<0x7F, "vmovdqu32", avx512vl_i32_info,
+ HasAVX512>, XS, EVEX_CD8<32, CD8VF>;
+
+defm VMOVDQU64 : avx512_load_vl<0x6F, "vmovdqu64", avx512vl_i64_info, HasAVX512>,
+ avx512_store_vl<0x7F, "vmovdqu64", avx512vl_i64_info,
+ HasAVX512>, XS, VEX_W, EVEX_CD8<64, CD8VF>;
+
+def: Pat<(v16i32 (int_x86_avx512_mask_loadu_d_512 addr:$ptr,
+ (v16i32 immAllZerosV), GR16:$mask)),
+ (VMOVDQU32Zrmkz (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), addr:$ptr)>;
+
+def: Pat<(v8i64 (int_x86_avx512_mask_loadu_q_512 addr:$ptr,
+ (bc_v8i64 (v16i32 immAllZerosV)), GR8:$mask)),
+ (VMOVDQU64Zrmkz (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), addr:$ptr)>;
+
+def: Pat<(int_x86_avx512_mask_storeu_d_512 addr:$ptr, (v16i32 VR512:$src),
+ GR16:$mask),
+ (VMOVDQU32Zmrk addr:$ptr, (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)),
+ VR512:$src)>;
+def: Pat<(int_x86_avx512_mask_storeu_q_512 addr:$ptr, (v8i64 VR512:$src),
+ GR8:$mask),
+ (VMOVDQU64Zmrk addr:$ptr, (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)),
+ VR512:$src)>;
+
+let AddedComplexity = 20 in {
+def : Pat<(v8i64 (vselect VK8WM:$mask, (v8i64 VR512:$src),
+ (bc_v8i64 (v16i32 immAllZerosV)))),
+ (VMOVDQU64Zrrkz VK8WM:$mask, VR512:$src)>;
+
+def : Pat<(v8i64 (vselect VK8WM:$mask, (bc_v8i64 (v16i32 immAllZerosV)),
+ (v8i64 VR512:$src))),
+ (VMOVDQU64Zrrkz (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK8:$mask, VK16)),
+ VK8), VR512:$src)>;
+
+def : Pat<(v16i32 (vselect VK16WM:$mask, (v16i32 VR512:$src),
+ (v16i32 immAllZerosV))),
+ (VMOVDQU32Zrrkz VK16WM:$mask, VR512:$src)>;
+
+def : Pat<(v16i32 (vselect VK16WM:$mask, (v16i32 immAllZerosV),
+ (v16i32 VR512:$src))),
+ (VMOVDQU32Zrrkz (KNOTWrr VK16WM:$mask), VR512:$src)>;
+}
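+// The patterns above fold a vselect against an all-zeros vector into a
+// zero-masking register move; when the zeros sit in the "true" operand the
+// mask is first inverted with KNOT.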
+
+// Move Doubleword Int to Packed Doubleword Int
+//
+def VMOVDI2PDIZrr : AVX512BI<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR32:$src),
+ "vmovd\t{$src, $dst|$dst, $src}",
+ [(set VR128X:$dst,
+ (v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>,
+ EVEX;
+def VMOVDI2PDIZrm : AVX512BI<0x6E, MRMSrcMem, (outs VR128X:$dst), (ins i32mem:$src),
+ "vmovd\t{$src, $dst|$dst, $src}",
+ [(set VR128X:$dst,
+ (v4i32 (scalar_to_vector (loadi32 addr:$src))))],
+ IIC_SSE_MOVDQ>, EVEX, EVEX_CD8<32, CD8VT1>;
+def VMOV64toPQIZrr : AVX512BI<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR64:$src),
+ "vmovq\t{$src, $dst|$dst, $src}",
+ [(set VR128X:$dst,
+ (v2i64 (scalar_to_vector GR64:$src)))],
+ IIC_SSE_MOVDQ>, EVEX, VEX_W;
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
+def VMOV64toPQIZrm : AVX512BI<0x6E, MRMSrcMem, (outs VR128X:$dst),
+ (ins i64mem:$src),
+ "vmovq\t{$src, $dst|$dst, $src}", []>,
+ EVEX, VEX_W, EVEX_CD8<64, CD8VT1>;
+let isCodeGenOnly = 1 in {
+def VMOV64toSDZrr : AVX512BI<0x6E, MRMSrcReg, (outs FR64X:$dst), (ins GR64:$src),
+ "vmovq\t{$src, $dst|$dst, $src}",
+ [(set FR64X:$dst, (bitconvert GR64:$src))],
+ IIC_SSE_MOVDQ>, EVEX, VEX_W, Sched<[WriteMove]>;
+def VMOVSDto64Zrr : AVX512BI<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64X:$src),
+ "vmovq\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (bitconvert FR64X:$src))],
+ IIC_SSE_MOVDQ>, EVEX, VEX_W, Sched<[WriteMove]>;
+def VMOVSDto64Zmr : AVX512BI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64X:$src),
+ "vmovq\t{$src, $dst|$dst, $src}",
+ [(store (i64 (bitconvert FR64X:$src)), addr:$dst)],
+ IIC_SSE_MOVDQ>, EVEX, VEX_W, Sched<[WriteStore]>,
+ EVEX_CD8<64, CD8VT1>;
+}
+
+// Move Doubleword Int to Scalar Single-Precision FP
+//
+let isCodeGenOnly = 1 in {
+def VMOVDI2SSZrr : AVX512BI<0x6E, MRMSrcReg, (outs FR32X:$dst), (ins GR32:$src),
+ "vmovd\t{$src, $dst|$dst, $src}",
+ [(set FR32X:$dst, (bitconvert GR32:$src))],
+ IIC_SSE_MOVDQ>, EVEX;
+
+def VMOVDI2SSZrm : AVX512BI<0x6E, MRMSrcMem, (outs FR32X:$dst), (ins i32mem:$src),
+ "vmovd\t{$src, $dst|$dst, $src}",
+ [(set FR32X:$dst, (bitconvert (loadi32 addr:$src)))],
+ IIC_SSE_MOVDQ>, EVEX, EVEX_CD8<32, CD8VT1>;
+}
+
+// Move doubleword from xmm register to r/m32
+//
+def VMOVPDI2DIZrr : AVX512BI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128X:$src),
+ "vmovd\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (extractelt (v4i32 VR128X:$src),
+ (iPTR 0)))], IIC_SSE_MOVD_ToGP>,
+ EVEX;
+def VMOVPDI2DIZmr : AVX512BI<0x7E, MRMDestMem, (outs),
+ (ins i32mem:$dst, VR128X:$src),
+ "vmovd\t{$src, $dst|$dst, $src}",
+ [(store (i32 (extractelt (v4i32 VR128X:$src),
+ (iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>,
+ EVEX, EVEX_CD8<32, CD8VT1>;
+
+// Move quadword from xmm1 register to r/m64
+//
+def VMOVPQIto64Zrr : I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128X:$src),
+ "vmovq\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (extractelt (v2i64 VR128X:$src),
+ (iPTR 0)))],
+ IIC_SSE_MOVD_ToGP>, PD, EVEX, VEX_W,
+ Requires<[HasAVX512, In64BitMode]>;
+
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
+def VMOVPQIto64Zmr : I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128X:$src),
+ "vmovq\t{$src, $dst|$dst, $src}",
+ [], IIC_SSE_MOVD_ToGP>, PD, EVEX, VEX_W,
+ Requires<[HasAVX512, In64BitMode]>;
+
+def VMOVPQI2QIZmr : I<0xD6, MRMDestMem, (outs),
+ (ins i64mem:$dst, VR128X:$src),
+ "vmovq\t{$src, $dst|$dst, $src}",
+ [(store (extractelt (v2i64 VR128X:$src), (iPTR 0)),
+ addr:$dst)], IIC_SSE_MOVDQ>,
+ EVEX, PD, VEX_W, EVEX_CD8<64, CD8VT1>,
+ Sched<[WriteStore]>, Requires<[HasAVX512, In64BitMode]>;
+
+let hasSideEffects = 0 in
+def VMOVPQI2QIZrr : AVX512BI<0xD6, MRMDestReg, (outs VR128X:$dst),
+ (ins VR128X:$src),
+ "vmovq.s\t{$src, $dst|$dst, $src}",[]>,
+ EVEX, VEX_W;
+
+// Move Scalar Single-Precision FP to Doubleword Int
+//
+let isCodeGenOnly = 1 in {
+def VMOVSS2DIZrr : AVX512BI<0x7E, MRMDestReg, (outs GR32:$dst),
+ (ins FR32X:$src),
+ "vmovd\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (bitconvert FR32X:$src))],
+ IIC_SSE_MOVD_ToGP>, EVEX;
+def VMOVSS2DIZmr : AVX512BI<0x7E, MRMDestMem, (outs),
+ (ins i32mem:$dst, FR32X:$src),
+ "vmovd\t{$src, $dst|$dst, $src}",
+ [(store (i32 (bitconvert FR32X:$src)), addr:$dst)],
+ IIC_SSE_MOVDQ>, EVEX, EVEX_CD8<32, CD8VT1>;
+}
+
+// Move Quadword Int to Packed Quadword Int
+//
+def VMOVQI2PQIZrm : AVX512XSI<0x7E, MRMSrcMem, (outs VR128X:$dst),
+ (ins i64mem:$src),
+ "vmovq\t{$src, $dst|$dst, $src}",
+ [(set VR128X:$dst,
+ (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>,
+ EVEX, VEX_W, EVEX_CD8<8, CD8VT8>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 MOVSS, MOVSD
+//===----------------------------------------------------------------------===//
+
+multiclass avx512_move_scalar <string asm, SDNode OpNode,
+ X86VectorVTInfo _> {
+ defm rr_Int : AVX512_maskable_scalar<0x10, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2),
+ asm, "$src2, $src1","$src1, $src2",
+ (_.VT (OpNode (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2))),
+ IIC_SSE_MOV_S_RR>, EVEX_4V;
+ let Constraints = "$src1 = $dst" , mayLoad = 1 in
+ defm rm_Int : AVX512_maskable_3src_scalar<0x10, MRMSrcMem, _,
+ (outs _.RC:$dst),
+ (ins _.ScalarMemOp:$src),
+ asm,"$src","$src",
+ (_.VT (OpNode (_.VT _.RC:$src1),
+ (_.VT (scalar_to_vector
+ (_.ScalarLdFrag addr:$src)))))>, EVEX;
+ let isCodeGenOnly = 1 in {
+ def rr : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.FRC:$src2),
+ !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set _.RC:$dst, (_.VT (OpNode _.RC:$src1,
+ (scalar_to_vector _.FRC:$src2))))],
+ _.ExeDomain,IIC_SSE_MOV_S_RR>, EVEX_4V;
+ let mayLoad = 1 in
+ def rm : AVX512PI<0x10, MRMSrcMem, (outs _.FRC:$dst), (ins _.ScalarMemOp:$src),
+ !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
+ [(set _.FRC:$dst, (_.ScalarLdFrag addr:$src))],
+ _.ExeDomain, IIC_SSE_MOV_S_RM>, EVEX;
+ }
+ let mayStore = 1 in {
+ def mr: AVX512PI<0x11, MRMDestMem, (outs), (ins _.ScalarMemOp:$dst, _.FRC:$src),
+ !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
+ [(store _.FRC:$src, addr:$dst)], _.ExeDomain, IIC_SSE_MOV_S_MR>,
+ EVEX;
+ def mrk: AVX512PI<0x11, MRMDestMem, (outs),
+ (ins _.ScalarMemOp:$dst, VK1WM:$mask, _.FRC:$src),
+ !strconcat(asm, "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}"),
+ [], _.ExeDomain, IIC_SSE_MOV_S_MR>, EVEX, EVEX_K;
+ } // mayStore
+}
+
+defm VMOVSSZ : avx512_move_scalar<"vmovss", X86Movss, f32x_info>,
+ VEX_LIG, XS, EVEX_CD8<32, CD8VT1>;
+
+defm VMOVSDZ : avx512_move_scalar<"vmovsd", X86Movsd, f64x_info>,
+ VEX_LIG, XD, VEX_W, EVEX_CD8<64, CD8VT1>;
+
+def : Pat<(f32 (X86select VK1WM:$mask, (f32 FR32X:$src1), (f32 FR32X:$src2))),
+ (COPY_TO_REGCLASS (VMOVSSZrr_Intk (COPY_TO_REGCLASS FR32X:$src2, VR128X),
+ VK1WM:$mask, (v4f32 (IMPLICIT_DEF)),(COPY_TO_REGCLASS FR32X:$src1, VR128X)), FR32X)>;
+
+def : Pat<(f64 (X86select VK1WM:$mask, (f64 FR64X:$src1), (f64 FR64X:$src2))),
+ (COPY_TO_REGCLASS (VMOVSDZrr_Intk (COPY_TO_REGCLASS FR64X:$src2, VR128X),
+ VK1WM:$mask, (v2f64 (IMPLICIT_DEF)), (COPY_TO_REGCLASS FR64X:$src1, VR128X)), FR64X)>;
+
+def : Pat<(int_x86_avx512_mask_store_ss addr:$dst, VR128X:$src, GR8:$mask),
+ (VMOVSSZmrk addr:$dst, (i1 (COPY_TO_REGCLASS GR8:$mask, VK1WM)),
+ (COPY_TO_REGCLASS VR128X:$src, FR32X))>;
+
+defm VMOVSSZrr_REV : AVX512_maskable_in_asm<0x11, MRMDestReg, f32x_info,
+ (outs VR128X:$dst), (ins VR128X:$src1, VR128X:$src2),
+ "vmovss.s", "$src2, $src1", "$src1, $src2", []>,
+ XS, EVEX_4V, VEX_LIG;
+
+defm VMOVSDZrr_REV : AVX512_maskable_in_asm<0x11, MRMDestReg, f64x_info,
+ (outs VR128X:$dst), (ins VR128X:$src1, VR128X:$src2),
+ "vmovsd.s", "$src2, $src1", "$src1, $src2", []>,
+ XD, EVEX_4V, VEX_LIG, VEX_W;
+
+let Predicates = [HasAVX512] in {
+ let AddedComplexity = 15 in {
+  // Move scalar to XMM zero-extended: zero a VR128X, then MOVS{S,D}
+  // the value into the lower bits.
+ def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32X:$src)))),
+ (VMOVSSZrr (v4f32 (V_SET0)), FR32X:$src)>;
+ def : Pat<(v4f32 (X86vzmovl (v4f32 VR128X:$src))),
+ (VMOVSSZrr (v4f32 (V_SET0)), (COPY_TO_REGCLASS VR128X:$src, FR32X))>;
+ def : Pat<(v4i32 (X86vzmovl (v4i32 VR128X:$src))),
+ (VMOVSSZrr (v4i32 (V_SET0)), (COPY_TO_REGCLASS VR128X:$src, FR32X))>;
+ def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64X:$src)))),
+ (VMOVSDZrr (v2f64 (V_SET0)), FR64X:$src)>;
+
+ // Move low f32 and clear high bits.
+ def : Pat<(v8f32 (X86vzmovl (v8f32 VR256X:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (VMOVSSZrr (v4f32 (V_SET0)),
+ (EXTRACT_SUBREG (v8f32 VR256X:$src), sub_xmm)), sub_xmm)>;
+ def : Pat<(v8i32 (X86vzmovl (v8i32 VR256X:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (VMOVSSZrr (v4i32 (V_SET0)),
+ (EXTRACT_SUBREG (v8i32 VR256X:$src), sub_xmm)), sub_xmm)>;
+ }
+
+ let AddedComplexity = 20 in {
+  // MOVSSrm zeros the high parts of the register; represent this
+  // with COPY_TO_REGCLASS. The AVX versions also write: DST[255:128] <- 0
+ def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
+ (COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)>;
+ def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
+ (COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)>;
+ def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
+ (COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)>;
+
+  // MOVSDrm zeros the high parts of the register; represent this
+  // with COPY_TO_REGCLASS. The AVX versions also write: DST[255:128] <- 0
+ def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
+ (COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>;
+ def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
+ (COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>;
+ def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
+ (COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>;
+ def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
+ (COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>;
+ def : Pat<(v2f64 (X86vzload addr:$src)),
+ (COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>;
+
+ // Represent the same patterns above but in the form they appear for
+ // 256-bit types
+ def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
+ (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))),
+ (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrm addr:$src), sub_xmm)>;
+ def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
+ (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))),
+ (SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>;
+ def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
+ (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
+ (SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;
+ }
+ def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
+ (v4f32 (scalar_to_vector FR32X:$src)), (iPTR 0)))),
+ (SUBREG_TO_REG (i32 0), (v4f32 (VMOVSSZrr (v4f32 (V_SET0)),
+ FR32X:$src)), sub_xmm)>;
+ def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
+ (v2f64 (scalar_to_vector FR64X:$src)), (iPTR 0)))),
+ (SUBREG_TO_REG (i64 0), (v2f64 (VMOVSDZrr (v2f64 (V_SET0)),
+ FR64X:$src)), sub_xmm)>;
+ def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
+ (v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))),
+ (SUBREG_TO_REG (i64 0), (VMOVQI2PQIZrm addr:$src), sub_xmm)>;
+
+ // Move low f64 and clear high bits.
+ def : Pat<(v4f64 (X86vzmovl (v4f64 VR256X:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (VMOVSDZrr (v2f64 (V_SET0)),
+ (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm)), sub_xmm)>;
+
+ def : Pat<(v4i64 (X86vzmovl (v4i64 VR256X:$src))),
+ (SUBREG_TO_REG (i32 0), (VMOVSDZrr (v2i64 (V_SET0)),
+ (EXTRACT_SUBREG (v4i64 VR256X:$src), sub_xmm)), sub_xmm)>;
+
+ // Extract and store.
+ def : Pat<(store (f32 (extractelt (v4f32 VR128X:$src), (iPTR 0))),
+ addr:$dst),
+ (VMOVSSZmr addr:$dst, (COPY_TO_REGCLASS (v4f32 VR128X:$src), FR32X))>;
+ def : Pat<(store (f64 (extractelt (v2f64 VR128X:$src), (iPTR 0))),
+ addr:$dst),
+ (VMOVSDZmr addr:$dst, (COPY_TO_REGCLASS (v2f64 VR128X:$src), FR64X))>;
+
+ // Shuffle with VMOVSS
+ def : Pat<(v4i32 (X86Movss VR128X:$src1, VR128X:$src2)),
+ (VMOVSSZrr (v4i32 VR128X:$src1),
+ (COPY_TO_REGCLASS (v4i32 VR128X:$src2), FR32X))>;
+ def : Pat<(v4f32 (X86Movss VR128X:$src1, VR128X:$src2)),
+ (VMOVSSZrr (v4f32 VR128X:$src1),
+ (COPY_TO_REGCLASS (v4f32 VR128X:$src2), FR32X))>;
+
+ // 256-bit variants
+ def : Pat<(v8i32 (X86Movss VR256X:$src1, VR256X:$src2)),
+ (SUBREG_TO_REG (i32 0),
+ (VMOVSSZrr (EXTRACT_SUBREG (v8i32 VR256X:$src1), sub_xmm),
+ (EXTRACT_SUBREG (v8i32 VR256X:$src2), sub_xmm)),
+ sub_xmm)>;
+ def : Pat<(v8f32 (X86Movss VR256X:$src1, VR256X:$src2)),
+ (SUBREG_TO_REG (i32 0),
+ (VMOVSSZrr (EXTRACT_SUBREG (v8f32 VR256X:$src1), sub_xmm),
+ (EXTRACT_SUBREG (v8f32 VR256X:$src2), sub_xmm)),
+ sub_xmm)>;
+
+ // Shuffle with VMOVSD
+ def : Pat<(v2i64 (X86Movsd VR128X:$src1, VR128X:$src2)),
+ (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
+ def : Pat<(v2f64 (X86Movsd VR128X:$src1, VR128X:$src2)),
+ (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
+ def : Pat<(v4f32 (X86Movsd VR128X:$src1, VR128X:$src2)),
+ (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
+ def : Pat<(v4i32 (X86Movsd VR128X:$src1, VR128X:$src2)),
+ (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
+
+ // 256-bit variants
+ def : Pat<(v4i64 (X86Movsd VR256X:$src1, VR256X:$src2)),
+ (SUBREG_TO_REG (i32 0),
+ (VMOVSDZrr (EXTRACT_SUBREG (v4i64 VR256X:$src1), sub_xmm),
+ (EXTRACT_SUBREG (v4i64 VR256X:$src2), sub_xmm)),
+ sub_xmm)>;
+ def : Pat<(v4f64 (X86Movsd VR256X:$src1, VR256X:$src2)),
+ (SUBREG_TO_REG (i32 0),
+ (VMOVSDZrr (EXTRACT_SUBREG (v4f64 VR256X:$src1), sub_xmm),
+ (EXTRACT_SUBREG (v4f64 VR256X:$src2), sub_xmm)),
+ sub_xmm)>;
+
+ def : Pat<(v2f64 (X86Movlpd VR128X:$src1, VR128X:$src2)),
+ (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
+ def : Pat<(v2i64 (X86Movlpd VR128X:$src1, VR128X:$src2)),
+ (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
+ def : Pat<(v4f32 (X86Movlps VR128X:$src1, VR128X:$src2)),
+ (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
+ def : Pat<(v4i32 (X86Movlps VR128X:$src1, VR128X:$src2)),
+ (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
+}
+
+let AddedComplexity = 15 in
+def VMOVZPQILo2PQIZrr : AVX512XSI<0x7E, MRMSrcReg, (outs VR128X:$dst),
+ (ins VR128X:$src),
+ "vmovq\t{$src, $dst|$dst, $src}",
+ [(set VR128X:$dst, (v2i64 (X86vzmovl
+ (v2i64 VR128X:$src))))],
+ IIC_SSE_MOVQ_RR>, EVEX, VEX_W;
+
+let AddedComplexity = 20, isCodeGenOnly = 1 in
+def VMOVZPQILo2PQIZrm : AVX512XSI<0x7E, MRMSrcMem, (outs VR128X:$dst),
+ (ins i128mem:$src),
+ "vmovq\t{$src, $dst|$dst, $src}",
+ [(set VR128X:$dst, (v2i64 (X86vzmovl
+ (loadv2i64 addr:$src))))],
+ IIC_SSE_MOVDQ>, EVEX, VEX_W,
+ EVEX_CD8<8, CD8VT8>;
+
+let Predicates = [HasAVX512] in {
+  // AVX 128-bit movd/movq instructions write zeros in the high 128-bit part.
+ let AddedComplexity = 20 in {
+ def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
+ (VMOVDI2PDIZrm addr:$src)>;
+ def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
+ (VMOV64toPQIZrr GR64:$src)>;
+ def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
+ (VMOVDI2PDIZrr GR32:$src)>;
+
+ def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
+ (VMOVDI2PDIZrm addr:$src)>;
+ def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
+ (VMOVDI2PDIZrm addr:$src)>;
+ def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
+ (VMOVZPQILo2PQIZrm addr:$src)>;
+ def : Pat<(v2f64 (X86vzmovl (v2f64 VR128X:$src))),
+ (VMOVZPQILo2PQIZrr VR128X:$src)>;
+ def : Pat<(v2i64 (X86vzload addr:$src)),
+ (VMOVZPQILo2PQIZrm addr:$src)>;
+ }
+
+ // Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext.
+ def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
+ (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))),
+ (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrr GR32:$src), sub_xmm)>;
+ def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
+ (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
+ (SUBREG_TO_REG (i64 0), (VMOV64toPQIZrr GR64:$src), sub_xmm)>;
+}
+
+def : Pat<(v16i32 (X86Vinsert (v16i32 immAllZerosV), GR32:$src2, (iPTR 0))),
+ (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrr GR32:$src2), sub_xmm)>;
+
+def : Pat<(v8i64 (X86Vinsert (bc_v8i64 (v16i32 immAllZerosV)), GR64:$src2, (iPTR 0))),
+ (SUBREG_TO_REG (i32 0), (VMOV64toPQIZrr GR64:$src2), sub_xmm)>;
+
+def : Pat<(v16i32 (X86Vinsert undef, GR32:$src2, (iPTR 0))),
+ (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrr GR32:$src2), sub_xmm)>;
+
+def : Pat<(v8i64 (X86Vinsert undef, GR64:$src2, (iPTR 0))),
+ (SUBREG_TO_REG (i32 0), (VMOV64toPQIZrr GR64:$src2), sub_xmm)>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 - Non-temporals
+//===----------------------------------------------------------------------===//
+let SchedRW = [WriteLoad] in {
+ def VMOVNTDQAZrm : AVX512PI<0x2A, MRMSrcMem, (outs VR512:$dst),
+ (ins i512mem:$src), "vmovntdqa\t{$src, $dst|$dst, $src}",
+ [(set VR512:$dst, (int_x86_avx512_movntdqa addr:$src))],
+ SSEPackedInt>, EVEX, T8PD, EVEX_V512,
+ EVEX_CD8<64, CD8VF>;
+
+ let Predicates = [HasAVX512, HasVLX] in {
+ def VMOVNTDQAZ256rm : AVX512PI<0x2A, MRMSrcMem, (outs VR256X:$dst),
+ (ins i256mem:$src),
+ "vmovntdqa\t{$src, $dst|$dst, $src}", [],
+ SSEPackedInt>, EVEX, T8PD, EVEX_V256,
+ EVEX_CD8<64, CD8VF>;
+
+ def VMOVNTDQAZ128rm : AVX512PI<0x2A, MRMSrcMem, (outs VR128X:$dst),
+ (ins i128mem:$src),
+ "vmovntdqa\t{$src, $dst|$dst, $src}", [],
+ SSEPackedInt>, EVEX, T8PD, EVEX_V128,
+ EVEX_CD8<64, CD8VF>;
+ }
+}
+
+multiclass avx512_movnt<bits<8> opc, string OpcodeStr, PatFrag st_frag,
+ ValueType OpVT, RegisterClass RC, X86MemOperand memop,
+ Domain d, InstrItinClass itin = IIC_SSE_MOVNT> {
+ let SchedRW = [WriteStore], mayStore = 1,
+ AddedComplexity = 400 in
+ def mr : AVX512PI<opc, MRMDestMem, (outs), (ins memop:$dst, RC:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(st_frag (OpVT RC:$src), addr:$dst)], d, itin>, EVEX;
+}
+
+multiclass avx512_movnt_vl<bits<8> opc, string OpcodeStr, PatFrag st_frag,
+ string elty, string elsz, string vsz512,
+ string vsz256, string vsz128, Domain d,
+ Predicate prd, InstrItinClass itin = IIC_SSE_MOVNT> {
+ let Predicates = [prd] in
+ defm Z : avx512_movnt<opc, OpcodeStr, st_frag,
+ !cast<ValueType>("v"##vsz512##elty##elsz), VR512,
+ !cast<X86MemOperand>(elty##"512mem"), d, itin>,
+ EVEX_V512;
+
+ let Predicates = [prd, HasVLX] in {
+ defm Z256 : avx512_movnt<opc, OpcodeStr, st_frag,
+ !cast<ValueType>("v"##vsz256##elty##elsz), VR256X,
+ !cast<X86MemOperand>(elty##"256mem"), d, itin>,
+ EVEX_V256;
+
+ defm Z128 : avx512_movnt<opc, OpcodeStr, st_frag,
+ !cast<ValueType>("v"##vsz128##elty##elsz), VR128X,
+ !cast<X86MemOperand>(elty##"128mem"), d, itin>,
+ EVEX_V128;
+ }
+}
+
+defm VMOVNTDQ : avx512_movnt_vl<0xE7, "vmovntdq", alignednontemporalstore,
+ "i", "64", "8", "4", "2", SSEPackedInt,
+ HasAVX512>, PD, EVEX_CD8<64, CD8VF>;
+
+defm VMOVNTPD : avx512_movnt_vl<0x2B, "vmovntpd", alignednontemporalstore,
+ "f", "64", "8", "4", "2", SSEPackedDouble,
+ HasAVX512>, PD, VEX_W, EVEX_CD8<64, CD8VF>;
+
+defm VMOVNTPS : avx512_movnt_vl<0x2B, "vmovntps", alignednontemporalstore,
+ "f", "32", "16", "8", "4", SSEPackedSingle,
+ HasAVX512>, PS, EVEX_CD8<32, CD8VF>;
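+// Non-temporal stores require a naturally aligned (full vector width)
+// address, hence alignednontemporalstore; the only non-temporal load is
+// vmovntdqa, which has the same alignment requirement.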
+
+//===----------------------------------------------------------------------===//
+// AVX-512 - Integer arithmetic
+//
+multiclass avx512_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _, OpndItins itins,
+ bit IsCommutable = 0> {
+ defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (_.VT (OpNode _.RC:$src1, _.RC:$src2)),
+ itins.rr, IsCommutable>,
+ AVX512BIBase, EVEX_4V;
+
+ let mayLoad = 1 in
+ defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (_.VT (OpNode _.RC:$src1,
+ (bitconvert (_.LdFrag addr:$src2)))),
+ itins.rm>,
+ AVX512BIBase, EVEX_4V;
+}
+
+multiclass avx512_binop_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _, OpndItins itins,
+ bit IsCommutable = 0> :
+ avx512_binop_rm<opc, OpcodeStr, OpNode, _, itins, IsCommutable> {
+ let mayLoad = 1 in
+ defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
+ "${src2}"##_.BroadcastStr##", $src1",
+ "$src1, ${src2}"##_.BroadcastStr,
+ (_.VT (OpNode _.RC:$src1,
+ (X86VBroadcast
+ (_.ScalarLdFrag addr:$src2)))),
+ itins.rm>,
+ AVX512BIBase, EVEX_4V, EVEX_B;
+}
+
+multiclass avx512_binop_rm_vl<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ AVX512VLVectorVTInfo VTInfo, OpndItins itins,
+ Predicate prd, bit IsCommutable = 0> {
+ let Predicates = [prd] in
+ defm Z : avx512_binop_rm<opc, OpcodeStr, OpNode, VTInfo.info512, itins,
+ IsCommutable>, EVEX_V512;
+
+ let Predicates = [prd, HasVLX] in {
+ defm Z256 : avx512_binop_rm<opc, OpcodeStr, OpNode, VTInfo.info256, itins,
+ IsCommutable>, EVEX_V256;
+ defm Z128 : avx512_binop_rm<opc, OpcodeStr, OpNode, VTInfo.info128, itins,
+ IsCommutable>, EVEX_V128;
+ }
+}
+
+multiclass avx512_binop_rmb_vl<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ AVX512VLVectorVTInfo VTInfo, OpndItins itins,
+ Predicate prd, bit IsCommutable = 0> {
+ let Predicates = [prd] in
+ defm Z : avx512_binop_rmb<opc, OpcodeStr, OpNode, VTInfo.info512, itins,
+ IsCommutable>, EVEX_V512;
+
+ let Predicates = [prd, HasVLX] in {
+ defm Z256 : avx512_binop_rmb<opc, OpcodeStr, OpNode, VTInfo.info256, itins,
+ IsCommutable>, EVEX_V256;
+ defm Z128 : avx512_binop_rmb<opc, OpcodeStr, OpNode, VTInfo.info128, itins,
+ IsCommutable>, EVEX_V128;
+ }
+}
+
+multiclass avx512_binop_rm_vl_q<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ OpndItins itins, Predicate prd,
+ bit IsCommutable = 0> {
+ defm NAME : avx512_binop_rmb_vl<opc, OpcodeStr, OpNode, avx512vl_i64_info,
+ itins, prd, IsCommutable>,
+ VEX_W, EVEX_CD8<64, CD8VF>;
+}
+
+multiclass avx512_binop_rm_vl_d<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ OpndItins itins, Predicate prd,
+ bit IsCommutable = 0> {
+ defm NAME : avx512_binop_rmb_vl<opc, OpcodeStr, OpNode, avx512vl_i32_info,
+ itins, prd, IsCommutable>, EVEX_CD8<32, CD8VF>;
+}
+
+multiclass avx512_binop_rm_vl_w<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ OpndItins itins, Predicate prd,
+ bit IsCommutable = 0> {
+ defm NAME : avx512_binop_rm_vl<opc, OpcodeStr, OpNode, avx512vl_i16_info,
+ itins, prd, IsCommutable>, EVEX_CD8<16, CD8VF>;
+}
+
+multiclass avx512_binop_rm_vl_b<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ OpndItins itins, Predicate prd,
+ bit IsCommutable = 0> {
+ defm NAME : avx512_binop_rm_vl<opc, OpcodeStr, OpNode, avx512vl_i8_info,
+ itins, prd, IsCommutable>, EVEX_CD8<8, CD8VF>;
+}
+
+multiclass avx512_binop_rm_vl_dq<bits<8> opc_d, bits<8> opc_q, string OpcodeStr,
+ SDNode OpNode, OpndItins itins, Predicate prd,
+ bit IsCommutable = 0> {
+ defm Q : avx512_binop_rm_vl_q<opc_q, OpcodeStr#"q", OpNode, itins, prd,
+ IsCommutable>;
+
+ defm D : avx512_binop_rm_vl_d<opc_d, OpcodeStr#"d", OpNode, itins, prd,
+ IsCommutable>;
+}
+
+multiclass avx512_binop_rm_vl_bw<bits<8> opc_b, bits<8> opc_w, string OpcodeStr,
+ SDNode OpNode, OpndItins itins, Predicate prd,
+ bit IsCommutable = 0> {
+ defm W : avx512_binop_rm_vl_w<opc_w, OpcodeStr#"w", OpNode, itins, prd,
+ IsCommutable>;
+
+ defm B : avx512_binop_rm_vl_b<opc_b, OpcodeStr#"b", OpNode, itins, prd,
+ IsCommutable>;
+}
+
+multiclass avx512_binop_rm_vl_all<bits<8> opc_b, bits<8> opc_w,
+ bits<8> opc_d, bits<8> opc_q,
+ string OpcodeStr, SDNode OpNode,
+ OpndItins itins, bit IsCommutable = 0> {
+ defm NAME : avx512_binop_rm_vl_dq<opc_d, opc_q, OpcodeStr, OpNode,
+ itins, HasAVX512, IsCommutable>,
+ avx512_binop_rm_vl_bw<opc_b, opc_w, OpcodeStr, OpNode,
+ itins, HasBWI, IsCommutable>;
+}
+
+multiclass avx512_binop_rm2<bits<8> opc, string OpcodeStr, OpndItins itins,
+ SDNode OpNode,X86VectorVTInfo _Src,
+ X86VectorVTInfo _Dst, bit IsCommutable = 0> {
+ defm rr : AVX512_maskable<opc, MRMSrcReg, _Dst, (outs _Dst.RC:$dst),
+ (ins _Src.RC:$src1, _Src.RC:$src2), OpcodeStr,
+ "$src2, $src1","$src1, $src2",
+ (_Dst.VT (OpNode
+ (_Src.VT _Src.RC:$src1),
+ (_Src.VT _Src.RC:$src2))),
+ itins.rr, IsCommutable>,
+ AVX512BIBase, EVEX_4V;
+ let mayLoad = 1 in {
+ defm rm : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
+ (ins _Src.RC:$src1, _Src.MemOp:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1),
+ (bitconvert (_Src.LdFrag addr:$src2)))),
+ itins.rm>,
+ AVX512BIBase, EVEX_4V;
+
+ defm rmb : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
+ (ins _Src.RC:$src1, _Dst.ScalarMemOp:$src2),
+ OpcodeStr,
+ "${src2}"##_Dst.BroadcastStr##", $src1",
+ "$src1, ${src2}"##_Dst.BroadcastStr,
+ (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert
+ (_Dst.VT (X86VBroadcast
+ (_Dst.ScalarLdFrag addr:$src2)))))),
+ itins.rm>,
+ AVX512BIBase, EVEX_4V, EVEX_B;
+ }
+}
+
+defm VPADD : avx512_binop_rm_vl_all<0xFC, 0xFD, 0xFE, 0xD4, "vpadd", add,
+ SSE_INTALU_ITINS_P, 1>;
+defm VPSUB : avx512_binop_rm_vl_all<0xF8, 0xF9, 0xFA, 0xFB, "vpsub", sub,
+ SSE_INTALU_ITINS_P, 0>;
+defm VPADDS : avx512_binop_rm_vl_bw<0xEC, 0xED, "vpadds", X86adds,
+ SSE_INTALU_ITINS_P, HasBWI, 1>;
+defm VPSUBS : avx512_binop_rm_vl_bw<0xE8, 0xE9, "vpsubs", X86subs,
+ SSE_INTALU_ITINS_P, HasBWI, 0>;
+defm VPADDUS : avx512_binop_rm_vl_bw<0xDC, 0xDD, "vpaddus", X86addus,
+ SSE_INTALU_ITINS_P, HasBWI, 1>;
+defm VPSUBUS : avx512_binop_rm_vl_bw<0xD8, 0xD9, "vpsubus", X86subus,
+ SSE_INTALU_ITINS_P, HasBWI, 0>;
+defm VPMULLD : avx512_binop_rm_vl_d<0x40, "vpmulld", mul,
+ SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD;
+defm VPMULLW : avx512_binop_rm_vl_w<0xD5, "vpmullw", mul,
+ SSE_INTALU_ITINS_P, HasBWI, 1>;
+defm VPMULLQ : avx512_binop_rm_vl_q<0x40, "vpmullq", mul,
+ SSE_INTALU_ITINS_P, HasDQI, 1>, T8PD;
+defm VPMULHW : avx512_binop_rm_vl_w<0xE5, "vpmulhw", mulhs, SSE_INTALU_ITINS_P,
+ HasBWI, 1>;
+defm VPMULHUW : avx512_binop_rm_vl_w<0xE4, "vpmulhuw", mulhu, SSE_INTMUL_ITINS_P,
+ HasBWI, 1>;
+defm VPMULHRSW : avx512_binop_rm_vl_w<0x0B, "vpmulhrsw", X86mulhrs, SSE_INTMUL_ITINS_P,
+ HasBWI, 1>, T8PD;
+defm VPAVG : avx512_binop_rm_vl_bw<0xE0, 0xE3, "vpavg", X86avg,
+ SSE_INTALU_ITINS_P, HasBWI, 1>;
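+// The defms above follow the usual width split: byte/word forms require
+// AVX512BW while dword/qword forms need only AVX512F, and each instantiates
+// Z/Z256/Z128 variants, the latter two additionally gated on AVX512VL.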
+
+multiclass avx512_binop_all<bits<8> opc, string OpcodeStr, OpndItins itins,
+ SDNode OpNode, bit IsCommutable = 0> {
+
+ defm NAME#Z : avx512_binop_rm2<opc, OpcodeStr, itins, OpNode,
+ v16i32_info, v8i64_info, IsCommutable>,
+ EVEX_V512, EVEX_CD8<64, CD8VF>, VEX_W;
+ let Predicates = [HasVLX] in {
+ defm NAME#Z256 : avx512_binop_rm2<opc, OpcodeStr, itins, OpNode,
+ v8i32x_info, v4i64x_info, IsCommutable>,
+ EVEX_V256, EVEX_CD8<64, CD8VF>, VEX_W;
+ defm NAME#Z128 : avx512_binop_rm2<opc, OpcodeStr, itins, OpNode,
+ v4i32x_info, v2i64x_info, IsCommutable>,
+ EVEX_V128, EVEX_CD8<64, CD8VF>, VEX_W;
+ }
+}
+
+defm VPMULDQ : avx512_binop_all<0x28, "vpmuldq", SSE_INTALU_ITINS_P,
+ X86pmuldq, 1>,T8PD;
+defm VPMULUDQ : avx512_binop_all<0xF4, "vpmuludq", SSE_INTMUL_ITINS_P,
+ X86pmuludq, 1>;
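+// vpmuldq/vpmuludq multiply the even-numbered 32-bit elements into full
+// 64-bit products, which is why the source info here is v16i32 while the
+// destination (and broadcast) info is v8i64.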
+
+multiclass avx512_packs_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _Src, X86VectorVTInfo _Dst> {
+ let mayLoad = 1 in {
+ defm rmb : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
+ (ins _Src.RC:$src1, _Src.ScalarMemOp:$src2),
+ OpcodeStr,
+ "${src2}"##_Src.BroadcastStr##", $src1",
+ "$src1, ${src2}"##_Src.BroadcastStr,
+ (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert
+ (_Src.VT (X86VBroadcast
+ (_Src.ScalarLdFrag addr:$src2))))))>,
+ EVEX_4V, EVEX_B, EVEX_CD8<_Src.EltSize, CD8VF>;
+ }
+}
+
+multiclass avx512_packs_rm<bits<8> opc, string OpcodeStr,
+ SDNode OpNode,X86VectorVTInfo _Src,
+ X86VectorVTInfo _Dst> {
+ defm rr : AVX512_maskable<opc, MRMSrcReg, _Dst, (outs _Dst.RC:$dst),
+ (ins _Src.RC:$src1, _Src.RC:$src2), OpcodeStr,
+ "$src2, $src1","$src1, $src2",
+ (_Dst.VT (OpNode
+ (_Src.VT _Src.RC:$src1),
+ (_Src.VT _Src.RC:$src2)))>,
+ EVEX_CD8<_Src.EltSize, CD8VF>, EVEX_4V;
+ let mayLoad = 1 in {
+ defm rm : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
+ (ins _Src.RC:$src1, _Src.MemOp:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1),
+ (bitconvert (_Src.LdFrag addr:$src2))))>,
+ EVEX_4V, EVEX_CD8<_Src.EltSize, CD8VF>;
+ }
+}
+
+multiclass avx512_packs_all_i32_i16<bits<8> opc, string OpcodeStr,
+ SDNode OpNode> {
+ defm NAME#Z : avx512_packs_rm<opc, OpcodeStr, OpNode, v16i32_info,
+ v32i16_info>,
+ avx512_packs_rmb<opc, OpcodeStr, OpNode, v16i32_info,
+ v32i16_info>, EVEX_V512;
+ let Predicates = [HasVLX] in {
+ defm NAME#Z256 : avx512_packs_rm<opc, OpcodeStr, OpNode, v8i32x_info,
+ v16i16x_info>,
+ avx512_packs_rmb<opc, OpcodeStr, OpNode, v8i32x_info,
+ v16i16x_info>, EVEX_V256;
+ defm NAME#Z128 : avx512_packs_rm<opc, OpcodeStr, OpNode, v4i32x_info,
+ v8i16x_info>,
+ avx512_packs_rmb<opc, OpcodeStr, OpNode, v4i32x_info,
+ v8i16x_info>, EVEX_V128;
+ }
+}
+multiclass avx512_packs_all_i16_i8<bits<8> opc, string OpcodeStr,
+ SDNode OpNode> {
+ defm NAME#Z : avx512_packs_rm<opc, OpcodeStr, OpNode, v32i16_info,
+ v64i8_info>, EVEX_V512;
+ let Predicates = [HasVLX] in {
+ defm NAME#Z256 : avx512_packs_rm<opc, OpcodeStr, OpNode, v16i16x_info,
+ v32i8x_info>, EVEX_V256;
+ defm NAME#Z128 : avx512_packs_rm<opc, OpcodeStr, OpNode, v8i16x_info,
+ v16i8x_info>, EVEX_V128;
+ }
+}
+
+multiclass avx512_vpmadd<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, AVX512VLVectorVTInfo _Src,
+ AVX512VLVectorVTInfo _Dst> {
+ defm NAME#Z : avx512_packs_rm<opc, OpcodeStr, OpNode, _Src.info512,
+ _Dst.info512>, EVEX_V512;
+ let Predicates = [HasVLX] in {
+ defm NAME#Z256 : avx512_packs_rm<opc, OpcodeStr, OpNode, _Src.info256,
+ _Dst.info256>, EVEX_V256;
+ defm NAME#Z128 : avx512_packs_rm<opc, OpcodeStr, OpNode, _Src.info128,
+ _Dst.info128>, EVEX_V128;
+ }
+}
+
+let Predicates = [HasBWI] in {
+ defm VPACKSSDW : avx512_packs_all_i32_i16<0x6B, "vpackssdw", X86Packss>, PD;
+ defm VPACKUSDW : avx512_packs_all_i32_i16<0x2b, "vpackusdw", X86Packus>, T8PD;
+ defm VPACKSSWB : avx512_packs_all_i16_i8 <0x63, "vpacksswb", X86Packss>, AVX512BIBase, VEX_W;
+ defm VPACKUSWB : avx512_packs_all_i16_i8 <0x67, "vpackuswb", X86Packus>, AVX512BIBase, VEX_W;
+
+ defm VPMADDUBSW : avx512_vpmadd<0x04, "vpmaddubsw", X86vpmaddubsw,
+ avx512vl_i8_info, avx512vl_i16_info>, AVX512BIBase, T8PD;
+ defm VPMADDWD : avx512_vpmadd<0xF5, "vpmaddwd", X86vpmaddwd,
+ avx512vl_i16_info, avx512vl_i32_info>, AVX512BIBase;
+}
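+// The pack instructions narrow each source element with saturation:
+// vpackssdw/vpacksswb use signed saturation, vpackusdw/vpackuswb unsigned.
+// vpmaddubsw and vpmaddwd multiply adjacent element pairs and sum each pair
+// of products into the wider destination element (saturating for vpmaddubsw).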
+
+defm VPMAXSB : avx512_binop_rm_vl_b<0x3C, "vpmaxsb", smax,
+ SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD;
+defm VPMAXSW : avx512_binop_rm_vl_w<0xEE, "vpmaxsw", smax,
+ SSE_INTALU_ITINS_P, HasBWI, 1>;
+defm VPMAXS : avx512_binop_rm_vl_dq<0x3D, 0x3D, "vpmaxs", smax,
+ SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD;
+
+defm VPMAXUB : avx512_binop_rm_vl_b<0xDE, "vpmaxub", umax,
+ SSE_INTALU_ITINS_P, HasBWI, 1>;
+defm VPMAXUW : avx512_binop_rm_vl_w<0x3E, "vpmaxuw", umax,
+ SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD;
+defm VPMAXU : avx512_binop_rm_vl_dq<0x3F, 0x3F, "vpmaxu", umax,
+ SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD;
+
+defm VPMINSB : avx512_binop_rm_vl_b<0x38, "vpminsb", smin,
+ SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD;
+defm VPMINSW : avx512_binop_rm_vl_w<0xEA, "vpminsw", smin,
+ SSE_INTALU_ITINS_P, HasBWI, 1>;
+defm VPMINS : avx512_binop_rm_vl_dq<0x39, 0x39, "vpmins", smin,
+ SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD;
+
+defm VPMINUB : avx512_binop_rm_vl_b<0xDA, "vpminub", umin,
+ SSE_INTALU_ITINS_P, HasBWI, 1>;
+defm VPMINUW : avx512_binop_rm_vl_w<0x3A, "vpminuw", umin,
+ SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD;
+defm VPMINU : avx512_binop_rm_vl_dq<0x3B, 0x3B, "vpminu", umin,
+ SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD;
+//===----------------------------------------------------------------------===//
+// AVX-512 Logical Instructions
+//===----------------------------------------------------------------------===//
+
+defm VPAND : avx512_binop_rm_vl_dq<0xDB, 0xDB, "vpand", and,
+ SSE_INTALU_ITINS_P, HasAVX512, 1>;
+defm VPOR : avx512_binop_rm_vl_dq<0xEB, 0xEB, "vpor", or,
+ SSE_INTALU_ITINS_P, HasAVX512, 1>;
+defm VPXOR : avx512_binop_rm_vl_dq<0xEF, 0xEF, "vpxor", xor,
+ SSE_INTALU_ITINS_P, HasAVX512, 1>;
+defm VPANDN : avx512_binop_rm_vl_dq<0xDF, 0xDF, "vpandn", X86andnp,
+ SSE_INTALU_ITINS_P, HasAVX512, 0>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 FP arithmetic
+//===----------------------------------------------------------------------===//
+multiclass avx512_fp_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
+ SDNode OpNode, SDNode VecNode, OpndItins itins,
+ bit IsCommutable> {
+
+ defm rr_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (VecNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
+ (i32 FROUND_CURRENT)),
+ itins.rr, IsCommutable>;
+
+ defm rm_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (VecNode (_.VT _.RC:$src1),
+ (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))),
+ (i32 FROUND_CURRENT)),
+ itins.rm, IsCommutable>;
+ let isCodeGenOnly = 1, isCommutable = IsCommutable,
+ Predicates = [HasAVX512] in {
+ def rr : I< opc, MRMSrcReg, (outs _.FRC:$dst),
+ (ins _.FRC:$src1, _.FRC:$src2),
+ OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set _.FRC:$dst, (OpNode _.FRC:$src1, _.FRC:$src2))],
+ itins.rr>;
+ def rm : I< opc, MRMSrcMem, (outs _.FRC:$dst),
+ (ins _.FRC:$src1, _.ScalarMemOp:$src2),
+ OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set _.FRC:$dst, (OpNode _.FRC:$src1,
+ (_.ScalarLdFrag addr:$src2)))], itins.rr>;
+ }
+}
+
+multiclass avx512_fp_scalar_round<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
+ SDNode VecNode, OpndItins itins, bit IsCommutable = 0> {
+
+ defm rrb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr,
+ "$rc, $src2, $src1", "$src1, $src2, $rc",
+ (VecNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
+ (i32 imm:$rc)), itins.rr, IsCommutable>,
+ EVEX_B, EVEX_RC;
+}
+multiclass avx512_fp_scalar_sae<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
+ SDNode VecNode, OpndItins itins, bit IsCommutable> {
+
+ defm rrb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2), OpcodeStr,
+ "{sae}, $src2, $src1", "$src1, $src2, {sae}",
+ (VecNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
+ (i32 FROUND_NO_EXC))>, EVEX_B;
+}
+
+multiclass avx512_binop_s_round<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDNode VecNode,
+ SizeItins itins, bit IsCommutable> {
+ defm SSZ : avx512_fp_scalar<opc, OpcodeStr#"ss", f32x_info, OpNode, VecNode,
+ itins.s, IsCommutable>,
+ avx512_fp_scalar_round<opc, OpcodeStr#"ss", f32x_info, VecNode,
+ itins.s, IsCommutable>,
+ XS, EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>;
+ defm SDZ : avx512_fp_scalar<opc, OpcodeStr#"sd", f64x_info, OpNode, VecNode,
+ itins.d, IsCommutable>,
+ avx512_fp_scalar_round<opc, OpcodeStr#"sd", f64x_info, VecNode,
+ itins.d, IsCommutable>,
+ XD, VEX_W, EVEX_4V, VEX_LIG, EVEX_CD8<64, CD8VT1>;
+}
+
+multiclass avx512_binop_s_sae<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDNode VecNode,
+ SizeItins itins, bit IsCommutable> {
+ defm SSZ : avx512_fp_scalar<opc, OpcodeStr#"ss", f32x_info, OpNode, VecNode,
+ itins.s, IsCommutable>,
+ avx512_fp_scalar_sae<opc, OpcodeStr#"ss", f32x_info, VecNode,
+ itins.s, IsCommutable>,
+ XS, EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>;
+ defm SDZ : avx512_fp_scalar<opc, OpcodeStr#"sd", f64x_info, OpNode, VecNode,
+ itins.d, IsCommutable>,
+ avx512_fp_scalar_sae<opc, OpcodeStr#"sd", f64x_info, VecNode,
+ itins.d, IsCommutable>,
+ XD, VEX_W, EVEX_4V, VEX_LIG, EVEX_CD8<64, CD8VT1>;
+}
+defm VADD : avx512_binop_s_round<0x58, "vadd", fadd, X86faddRnd, SSE_ALU_ITINS_S, 1>;
+defm VMUL : avx512_binop_s_round<0x59, "vmul", fmul, X86fmulRnd, SSE_ALU_ITINS_S, 1>;
+defm VSUB : avx512_binop_s_round<0x5C, "vsub", fsub, X86fsubRnd, SSE_ALU_ITINS_S, 0>;
+defm VDIV : avx512_binop_s_round<0x5E, "vdiv", fdiv, X86fdivRnd, SSE_ALU_ITINS_S, 0>;
+defm VMIN : avx512_binop_s_sae <0x5D, "vmin", X86fmin, X86fminRnd, SSE_ALU_ITINS_S, 1>;
+defm VMAX : avx512_binop_s_sae <0x5F, "vmax", X86fmax, X86fmaxRnd, SSE_ALU_ITINS_S, 1>;
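+
+// The plain forms use the current MXCSR rounding mode; the rrb forms bind a
+// static rounding mode (or, for min/max, {sae}) into the encoding. A minimal
+// C sketch, assuming an AVX512F target and the standard <immintrin.h>
+// intrinsics:
+//
+//   #include <immintrin.h>
+//   __m128 add_rz(__m128 a, __m128 b) {   // vaddss, round toward zero
+//     return _mm_add_round_ss(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+//   }
+//   __m128 min_sae(__m128 a, __m128 b) {  // vminss {sae}
+//     return _mm_min_round_ss(a, b, _MM_FROUND_NO_EXC);
+//   }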
+
+multiclass avx512_fp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _, bit IsCommutable> {
+ defm rr: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix,
+ "$src2, $src1", "$src1, $src2",
+ (_.VT (OpNode _.RC:$src1, _.RC:$src2))>, EVEX_4V;
+ let mayLoad = 1 in {
+ defm rm: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr##_.Suffix,
+ "$src2, $src1", "$src1, $src2",
+ (OpNode _.RC:$src1, (_.LdFrag addr:$src2))>, EVEX_4V;
+ defm rmb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr##_.Suffix,
+ "${src2}"##_.BroadcastStr##", $src1",
+ "$src1, ${src2}"##_.BroadcastStr,
+ (OpNode _.RC:$src1, (_.VT (X86VBroadcast
+ (_.ScalarLdFrag addr:$src2))))>,
+ EVEX_4V, EVEX_B;
+ }//let mayLoad = 1
+}
+
+multiclass avx512_fp_round_packed<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd,
+ X86VectorVTInfo _> {
+ defm rb: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr##_.Suffix,
+ "$rc, $src2, $src1", "$src1, $src2, $rc",
+ (_.VT (OpNodeRnd _.RC:$src1, _.RC:$src2, (i32 imm:$rc)))>,
+ EVEX_4V, EVEX_B, EVEX_RC;
+}
+
+
+multiclass avx512_fp_sae_packed<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd,
+ X86VectorVTInfo _> {
+ defm rb: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix,
+ "{sae}, $src2, $src1", "$src1, $src2, {sae}",
+ (_.VT (OpNodeRnd _.RC:$src1, _.RC:$src2, (i32 FROUND_NO_EXC)))>,
+ EVEX_4V, EVEX_B;
+}
+
+multiclass avx512_fp_binop_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ bit IsCommutable = 0> {
+ defm PSZ : avx512_fp_packed<opc, OpcodeStr, OpNode, v16f32_info,
+ IsCommutable>, EVEX_V512, PS,
+ EVEX_CD8<32, CD8VF>;
+ defm PDZ : avx512_fp_packed<opc, OpcodeStr, OpNode, v8f64_info,
+ IsCommutable>, EVEX_V512, PD, VEX_W,
+ EVEX_CD8<64, CD8VF>;
+
+ // Define only if AVX512VL feature is present.
+ let Predicates = [HasVLX] in {
+ defm PSZ128 : avx512_fp_packed<opc, OpcodeStr, OpNode, v4f32x_info,
+ IsCommutable>, EVEX_V128, PS,
+ EVEX_CD8<32, CD8VF>;
+ defm PSZ256 : avx512_fp_packed<opc, OpcodeStr, OpNode, v8f32x_info,
+ IsCommutable>, EVEX_V256, PS,
+ EVEX_CD8<32, CD8VF>;
+ defm PDZ128 : avx512_fp_packed<opc, OpcodeStr, OpNode, v2f64x_info,
+ IsCommutable>, EVEX_V128, PD, VEX_W,
+ EVEX_CD8<64, CD8VF>;
+ defm PDZ256 : avx512_fp_packed<opc, OpcodeStr, OpNode, v4f64x_info,
+ IsCommutable>, EVEX_V256, PD, VEX_W,
+ EVEX_CD8<64, CD8VF>;
+ }
+}
+
+multiclass avx512_fp_binop_p_round<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd> {
+ defm PSZ : avx512_fp_round_packed<opc, OpcodeStr, OpNodeRnd, v16f32_info>,
+ EVEX_V512, PS, EVEX_CD8<32, CD8VF>;
+ defm PDZ : avx512_fp_round_packed<opc, OpcodeStr, OpNodeRnd, v8f64_info>,
+ EVEX_V512, PD, VEX_W,EVEX_CD8<64, CD8VF>;
+}
+
+multiclass avx512_fp_binop_p_sae<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd> {
+ defm PSZ : avx512_fp_sae_packed<opc, OpcodeStr, OpNodeRnd, v16f32_info>,
+ EVEX_V512, PS, EVEX_CD8<32, CD8VF>;
+ defm PDZ : avx512_fp_sae_packed<opc, OpcodeStr, OpNodeRnd, v8f64_info>,
+ EVEX_V512, PD, VEX_W,EVEX_CD8<64, CD8VF>;
+}
+
+defm VADD : avx512_fp_binop_p<0x58, "vadd", fadd, 1>,
+ avx512_fp_binop_p_round<0x58, "vadd", X86faddRnd>;
+defm VMUL : avx512_fp_binop_p<0x59, "vmul", fmul, 1>,
+ avx512_fp_binop_p_round<0x59, "vmul", X86fmulRnd>;
+defm VSUB : avx512_fp_binop_p<0x5C, "vsub", fsub>,
+ avx512_fp_binop_p_round<0x5C, "vsub", X86fsubRnd>;
+defm VDIV : avx512_fp_binop_p<0x5E, "vdiv", fdiv>,
+ avx512_fp_binop_p_round<0x5E, "vdiv", X86fdivRnd>;
+defm VMIN : avx512_fp_binop_p<0x5D, "vmin", X86fmin, 1>,
+ avx512_fp_binop_p_sae<0x5D, "vmin", X86fminRnd>;
+defm VMAX : avx512_fp_binop_p<0x5F, "vmax", X86fmax, 1>,
+ avx512_fp_binop_p_sae<0x5F, "vmax", X86fmaxRnd>;
+let Predicates = [HasDQI] in {
+ defm VAND : avx512_fp_binop_p<0x54, "vand", X86fand, 1>;
+ defm VANDN : avx512_fp_binop_p<0x55, "vandn", X86fandn, 0>;
+ defm VOR : avx512_fp_binop_p<0x56, "vor", X86for, 1>;
+ defm VXOR : avx512_fp_binop_p<0x57, "vxor", X86fxor, 1>;
+}
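+
+// Embedded rounding for packed ops exists only at 512 bits (the rc field
+// reuses the EVEX.L'L bits), which is why avx512_fp_binop_p_round is
+// instantiated for EVEX_V512 only. A minimal C sketch, assuming an AVX512F
+// target and <immintrin.h>:
+//
+//   #include <immintrin.h>
+//   __m512 mul_rn(__m512 a, __m512 b) {  // vmulps zmm, static round-to-nearest
+//     return _mm512_mul_round_ps(a, b,
+//                                _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+//   }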
+
+multiclass avx512_fp_scalef_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _> {
+ defm rr: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix,
+ "$src2, $src1", "$src1, $src2",
+ (_.VT (OpNode _.RC:$src1, _.RC:$src2, (i32 FROUND_CURRENT)))>, EVEX_4V;
+ let mayLoad = 1 in {
+ defm rm: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr##_.Suffix,
+ "$src2, $src1", "$src1, $src2",
+ (OpNode _.RC:$src1, (_.LdFrag addr:$src2), (i32 FROUND_CURRENT))>, EVEX_4V;
+ defm rmb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr##_.Suffix,
+ "${src2}"##_.BroadcastStr##", $src1",
+ "$src1, ${src2}"##_.BroadcastStr,
+ (OpNode _.RC:$src1, (_.VT (X86VBroadcast
+ (_.ScalarLdFrag addr:$src2))), (i32 FROUND_CURRENT))>,
+ EVEX_4V, EVEX_B;
+ }//let mayLoad = 1
+}
+
+multiclass avx512_fp_scalef_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _> {
+ defm rr: AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix,
+ "$src2, $src1", "$src1, $src2",
+ (_.VT (OpNode _.RC:$src1, _.RC:$src2, (i32 FROUND_CURRENT)))>;
+ let mayLoad = 1 in {
+ defm rm: AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr##_.Suffix,
+ "$src2, $src1", "$src1, $src2",
+ (OpNode _.RC:$src1, (_.LdFrag addr:$src2), (i32 FROUND_CURRENT))>;
+ }//let mayLoad = 1
+}
+
+multiclass avx512_fp_scalef_all<bits<8> opc, bits<8> opcScaler, string OpcodeStr, SDNode OpNode> {
+ defm PSZ : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, v16f32_info>,
+ avx512_fp_round_packed<opc, OpcodeStr, OpNode, v16f32_info>,
+ EVEX_V512, EVEX_CD8<32, CD8VF>;
+ defm PDZ : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, v8f64_info>,
+ avx512_fp_round_packed<opc, OpcodeStr, OpNode, v8f64_info>,
+ EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+ defm SSZ128 : avx512_fp_scalef_scalar<opcScaler, OpcodeStr, OpNode, f32x_info>,
+ avx512_fp_scalar_round<opcScaler, OpcodeStr##"ss", f32x_info, OpNode, SSE_ALU_ITINS_S.s>,
+ EVEX_4V,EVEX_CD8<32, CD8VT1>;
+ defm SDZ128 : avx512_fp_scalef_scalar<opcScaler, OpcodeStr, OpNode, f64x_info>,
+ avx512_fp_scalar_round<opcScaler, OpcodeStr##"sd", f64x_info, OpNode, SSE_ALU_ITINS_S.d>,
+ EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W;
+
+ // Define only if AVX512VL feature is present.
+ let Predicates = [HasVLX] in {
+ defm PSZ128 : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, v4f32x_info>,
+ EVEX_V128, EVEX_CD8<32, CD8VF>;
+ defm PSZ256 : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, v8f32x_info>,
+ EVEX_V256, EVEX_CD8<32, CD8VF>;
+ defm PDZ128 : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, v2f64x_info>,
+ EVEX_V128, VEX_W, EVEX_CD8<64, CD8VF>;
+ defm PDZ256 : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, v4f64x_info>,
+ EVEX_V256, VEX_W, EVEX_CD8<64, CD8VF>;
+ }
+}
+defm VSCALEF : avx512_fp_scalef_all<0x2C, 0x2D, "vscalef", X86scalef>, T8PD;
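+
+// vscalefps/pd computes src1 * 2^floor(src2) per element, which is useful
+// for reconstructing results in exp()-style kernels; a minimal C sketch,
+// assuming an AVX512F target and <immintrin.h>:
+//
+//   #include <immintrin.h>
+//   __m512 scale_pow2(__m512 x, __m512 e) {
+//     return _mm512_scalef_ps(x, e);  // x[i] * 2^floor(e[i])
+//   }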
+
+//===----------------------------------------------------------------------===//
+// AVX-512 VPTESTM instructions
+//===----------------------------------------------------------------------===//
+
+multiclass avx512_vptest<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _> {
+ defm rr : AVX512_maskable_cmp<opc, MRMSrcReg, _, (outs _.KRC:$dst),
+ (ins _.RC:$src1, _.RC:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))>,
+ EVEX_4V;
+ let mayLoad = 1 in
+ defm rm : AVX512_maskable_cmp<opc, MRMSrcMem, _, (outs _.KRC:$dst),
+ (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (OpNode (_.VT _.RC:$src1),
+ (_.VT (bitconvert (_.LdFrag addr:$src2))))>,
+ EVEX_4V,
+ EVEX_CD8<_.EltSize, CD8VF>;
+}
+
+multiclass avx512_vptest_mb<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _> {
+ let mayLoad = 1 in
+ defm rmb : AVX512_maskable_cmp<opc, MRMSrcMem, _, (outs _.KRC:$dst),
+ (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
+ "${src2}"##_.BroadcastStr##", $src1",
+ "$src1, ${src2}"##_.BroadcastStr,
+ (OpNode (_.VT _.RC:$src1), (_.VT (X86VBroadcast
+ (_.ScalarLdFrag addr:$src2))))>,
+ EVEX_B, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>;
+}
+multiclass avx512_vptest_dq_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ AVX512VLVectorVTInfo _> {
+ let Predicates = [HasAVX512] in
+ defm Z : avx512_vptest<opc, OpcodeStr, OpNode, _.info512>,
+ avx512_vptest_mb<opc, OpcodeStr, OpNode, _.info512>, EVEX_V512;
+
+ let Predicates = [HasAVX512, HasVLX] in {
+ defm Z256 : avx512_vptest<opc, OpcodeStr, OpNode, _.info256>,
+ avx512_vptest_mb<opc, OpcodeStr, OpNode, _.info256>, EVEX_V256;
+ defm Z128 : avx512_vptest<opc, OpcodeStr, OpNode, _.info128>,
+ avx512_vptest_mb<opc, OpcodeStr, OpNode, _.info128>, EVEX_V128;
+ }
+}
+
+multiclass avx512_vptest_dq<bits<8> opc, string OpcodeStr, SDNode OpNode> {
+ defm D : avx512_vptest_dq_sizes<opc, OpcodeStr#"d", OpNode,
+ avx512vl_i32_info>;
+ defm Q : avx512_vptest_dq_sizes<opc, OpcodeStr#"q", OpNode,
+ avx512vl_i64_info>, VEX_W;
+}
+
+multiclass avx512_vptest_wb<bits<8> opc, string OpcodeStr,
+ SDNode OpNode> {
+ let Predicates = [HasBWI] in {
+ defm WZ: avx512_vptest<opc, OpcodeStr#"w", OpNode, v32i16_info>,
+ EVEX_V512, VEX_W;
+ defm BZ: avx512_vptest<opc, OpcodeStr#"b", OpNode, v64i8_info>,
+ EVEX_V512;
+ }
+ let Predicates = [HasVLX, HasBWI] in {
+
+ defm WZ256: avx512_vptest<opc, OpcodeStr#"w", OpNode, v16i16x_info>,
+ EVEX_V256, VEX_W;
+ defm WZ128: avx512_vptest<opc, OpcodeStr#"w", OpNode, v8i16x_info>,
+ EVEX_V128, VEX_W;
+ defm BZ256: avx512_vptest<opc, OpcodeStr#"b", OpNode, v32i8x_info>,
+ EVEX_V256;
+ defm BZ128: avx512_vptest<opc, OpcodeStr#"b", OpNode, v16i8x_info>,
+ EVEX_V128;
+ }
+}
+
+multiclass avx512_vptest_all_forms<bits<8> opc_wb, bits<8> opc_dq, string OpcodeStr,
+ SDNode OpNode> :
+ avx512_vptest_wb <opc_wb, OpcodeStr, OpNode>,
+ avx512_vptest_dq<opc_dq, OpcodeStr, OpNode>;
+
+defm VPTESTM : avx512_vptest_all_forms<0x26, 0x27, "vptestm", X86testm>, T8PD;
+defm VPTESTNM : avx512_vptest_all_forms<0x26, 0x27, "vptestnm", X86testnm>, T8XS;
+
+def : Pat <(i16 (int_x86_avx512_mask_ptestm_d_512 (v16i32 VR512:$src1),
+ (v16i32 VR512:$src2), (i16 -1))),
+ (COPY_TO_REGCLASS (VPTESTMDZrr VR512:$src1, VR512:$src2), GR16)>;
+
+def : Pat <(i8 (int_x86_avx512_mask_ptestm_q_512 (v8i64 VR512:$src1),
+ (v8i64 VR512:$src2), (i8 -1))),
+ (COPY_TO_REGCLASS (VPTESTMQZrr VR512:$src1, VR512:$src2), GR8)>;
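+
+// vptestm sets a mask bit where (src1 & src2) != 0; vptestnm sets it where
+// the AND is zero. A minimal C sketch, assuming an AVX512F target and
+// <immintrin.h>:
+//
+//   #include <immintrin.h>
+//   __mmask16 and_nonzero(__m512i a, __m512i b) {
+//     return _mm512_test_epi32_mask(a, b);    // vptestmd
+//   }
+//   __mmask16 and_zero(__m512i a, __m512i b) {
+//     return _mm512_testn_epi32_mask(a, b);   // vptestnmd
+//   }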
+
+//===----------------------------------------------------------------------===//
+// AVX-512 Shift instructions
+//===----------------------------------------------------------------------===//
+multiclass avx512_shift_rmi<bits<8> opc, Format ImmFormR, Format ImmFormM,
+ string OpcodeStr, SDNode OpNode, X86VectorVTInfo _> {
+ defm ri : AVX512_maskable<opc, ImmFormR, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, u8imm:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (_.VT (OpNode _.RC:$src1, (i8 imm:$src2))),
+ SSE_INTSHIFT_ITINS_P.rr>;
+ let mayLoad = 1 in
+ defm mi : AVX512_maskable<opc, ImmFormM, _, (outs _.RC:$dst),
+ (ins _.MemOp:$src1, u8imm:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (_.VT (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))),
+ (i8 imm:$src2))),
+ SSE_INTSHIFT_ITINS_P.rm>;
+}
+
+multiclass avx512_shift_rmbi<bits<8> opc, Format ImmFormM,
+ string OpcodeStr, SDNode OpNode, X86VectorVTInfo _> {
+ let mayLoad = 1 in
+ defm mbi : AVX512_maskable<opc, ImmFormM, _, (outs _.RC:$dst),
+ (ins _.ScalarMemOp:$src1, u8imm:$src2), OpcodeStr,
+ "$src2, ${src1}"##_.BroadcastStr, "${src1}"##_.BroadcastStr##", $src2",
+ (_.VT (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src1)), (i8 imm:$src2))),
+ SSE_INTSHIFT_ITINS_P.rm>, EVEX_B;
+}
+
+multiclass avx512_shift_rrm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ ValueType SrcVT, PatFrag bc_frag, X86VectorVTInfo _> {
+ // src2 is always 128-bit
+ defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, VR128X:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (_.VT (OpNode _.RC:$src1, (SrcVT VR128X:$src2))),
+ SSE_INTSHIFT_ITINS_P.rr>, AVX512BIBase, EVEX_4V;
+ defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, i128mem:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (_.VT (OpNode _.RC:$src1, (bc_frag (loadv2i64 addr:$src2)))),
+ SSE_INTSHIFT_ITINS_P.rm>, AVX512BIBase,
+ EVEX_4V;
+}
+
+multiclass avx512_shift_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ ValueType SrcVT, PatFrag bc_frag,
+ AVX512VLVectorVTInfo VTInfo, Predicate prd> {
+ let Predicates = [prd] in
+ defm Z : avx512_shift_rrm<opc, OpcodeStr, OpNode, SrcVT, bc_frag,
+ VTInfo.info512>, EVEX_V512,
+ EVEX_CD8<VTInfo.info512.EltSize, CD8VQ> ;
+ let Predicates = [prd, HasVLX] in {
+ defm Z256 : avx512_shift_rrm<opc, OpcodeStr, OpNode, SrcVT, bc_frag,
+ VTInfo.info256>, EVEX_V256,
+ EVEX_CD8<VTInfo.info256.EltSize, CD8VH>;
+ defm Z128 : avx512_shift_rrm<opc, OpcodeStr, OpNode, SrcVT, bc_frag,
+ VTInfo.info128>, EVEX_V128,
+ EVEX_CD8<VTInfo.info128.EltSize, CD8VF>;
+ }
+}
+
+multiclass avx512_shift_types<bits<8> opcd, bits<8> opcq, bits<8> opcw,
+ string OpcodeStr, SDNode OpNode> {
+ defm D : avx512_shift_sizes<opcd, OpcodeStr#"d", OpNode, v4i32, bc_v4i32,
+ avx512vl_i32_info, HasAVX512>;
+ defm Q : avx512_shift_sizes<opcq, OpcodeStr#"q", OpNode, v2i64, bc_v2i64,
+ avx512vl_i64_info, HasAVX512>, VEX_W;
+ defm W : avx512_shift_sizes<opcw, OpcodeStr#"w", OpNode, v8i16, bc_v8i16,
+ avx512vl_i16_info, HasBWI>;
+}
+
+multiclass avx512_shift_rmi_sizes<bits<8> opc, Format ImmFormR, Format ImmFormM,
+ string OpcodeStr, SDNode OpNode,
+ AVX512VLVectorVTInfo VTInfo> {
+ let Predicates = [HasAVX512] in
+ defm Z: avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode,
+ VTInfo.info512>,
+ avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode,
+ VTInfo.info512>, EVEX_V512;
+ let Predicates = [HasAVX512, HasVLX] in {
+ defm Z256: avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode,
+ VTInfo.info256>,
+ avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode,
+ VTInfo.info256>, EVEX_V256;
+ defm Z128: avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode,
+ VTInfo.info128>,
+ avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode,
+ VTInfo.info128>, EVEX_V128;
+ }
+}
+
+multiclass avx512_shift_rmi_w<bits<8> opcw,
+ Format ImmFormR, Format ImmFormM,
+ string OpcodeStr, SDNode OpNode> {
+ let Predicates = [HasBWI] in
+ defm WZ: avx512_shift_rmi<opcw, ImmFormR, ImmFormM, OpcodeStr, OpNode,
+ v32i16_info>, EVEX_V512;
+ let Predicates = [HasVLX, HasBWI] in {
+ defm WZ256: avx512_shift_rmi<opcw, ImmFormR, ImmFormM, OpcodeStr, OpNode,
+ v16i16x_info>, EVEX_V256;
+ defm WZ128: avx512_shift_rmi<opcw, ImmFormR, ImmFormM, OpcodeStr, OpNode,
+ v8i16x_info>, EVEX_V128;
+ }
+}
+
+multiclass avx512_shift_rmi_dq<bits<8> opcd, bits<8> opcq,
+ Format ImmFormR, Format ImmFormM,
+ string OpcodeStr, SDNode OpNode> {
+ defm D: avx512_shift_rmi_sizes<opcd, ImmFormR, ImmFormM, OpcodeStr#"d", OpNode,
+ avx512vl_i32_info>, EVEX_CD8<32, CD8VF>;
+ defm Q: avx512_shift_rmi_sizes<opcq, ImmFormR, ImmFormM, OpcodeStr#"q", OpNode,
+ avx512vl_i64_info>, EVEX_CD8<64, CD8VF>, VEX_W;
+}
+
+defm VPSRL : avx512_shift_rmi_dq<0x72, 0x73, MRM2r, MRM2m, "vpsrl", X86vsrli>,
+ avx512_shift_rmi_w<0x71, MRM2r, MRM2m, "vpsrlw", X86vsrli>, AVX512BIi8Base, EVEX_4V;
+
+defm VPSLL : avx512_shift_rmi_dq<0x72, 0x73, MRM6r, MRM6m, "vpsll", X86vshli>,
+ avx512_shift_rmi_w<0x71, MRM6r, MRM6m, "vpsllw", X86vshli>, AVX512BIi8Base, EVEX_4V;
+
+defm VPSRA : avx512_shift_rmi_dq<0x72, 0x72, MRM4r, MRM4m, "vpsra", X86vsrai>,
+ avx512_shift_rmi_w<0x71, MRM4r, MRM4m, "vpsraw", X86vsrai>, AVX512BIi8Base, EVEX_4V;
+
+defm VPROR : avx512_shift_rmi_dq<0x72, 0x72, MRM0r, MRM0m, "vpror", rotr>, AVX512BIi8Base, EVEX_4V;
+defm VPROL : avx512_shift_rmi_dq<0x72, 0x72, MRM1r, MRM1m, "vprol", rotl>, AVX512BIi8Base, EVEX_4V;
+
+defm VPSLL : avx512_shift_types<0xF2, 0xF3, 0xF1, "vpsll", X86vshl>;
+defm VPSRA : avx512_shift_types<0xE2, 0xE2, 0xE1, "vpsra", X86vsra>;
+defm VPSRL : avx512_shift_types<0xD2, 0xD3, 0xD1, "vpsrl", X86vsrl>;
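+
+// Two of the three shift flavors are defined above: shift-by-immediate and
+// shift by a count held in the low qword of an xmm; the per-element variable
+// shifts follow below. A minimal C sketch, assuming an AVX512F target and
+// <immintrin.h>:
+//
+//   #include <immintrin.h>
+//   __m512i shr_imm(__m512i a)            { return _mm512_srli_epi32(a, 3); }
+//   __m512i shr_xmm(__m512i a, __m128i n) { return _mm512_srl_epi32(a, n); }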
+
+//===-------------------------------------------------------------------===//
+// Variable Bit Shifts
+//===-------------------------------------------------------------------===//
+multiclass avx512_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _> {
+ defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (_.VT (OpNode _.RC:$src1, (_.VT _.RC:$src2))),
+ SSE_INTSHIFT_ITINS_P.rr>, AVX5128IBase, EVEX_4V;
+ let mayLoad = 1 in
+ defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (_.VT (OpNode _.RC:$src1,
+ (_.VT (bitconvert (_.LdFrag addr:$src2))))),
+ SSE_INTSHIFT_ITINS_P.rm>, AVX5128IBase, EVEX_4V,
+ EVEX_CD8<_.EltSize, CD8VF>;
+}
+
+multiclass avx512_var_shift_mb<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _> {
+ let mayLoad = 1 in
+ defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
+ "${src2}"##_.BroadcastStr##", $src1",
+ "$src1, ${src2}"##_.BroadcastStr,
+ (_.VT (OpNode _.RC:$src1, (_.VT (X86VBroadcast
+ (_.ScalarLdFrag addr:$src2))))),
+ SSE_INTSHIFT_ITINS_P.rm>, AVX5128IBase, EVEX_B,
+ EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>;
+}
+multiclass avx512_var_shift_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ AVX512VLVectorVTInfo _> {
+ let Predicates = [HasAVX512] in
+ defm Z : avx512_var_shift<opc, OpcodeStr, OpNode, _.info512>,
+ avx512_var_shift_mb<opc, OpcodeStr, OpNode, _.info512>, EVEX_V512;
+
+ let Predicates = [HasAVX512, HasVLX] in {
+ defm Z256 : avx512_var_shift<opc, OpcodeStr, OpNode, _.info256>,
+ avx512_var_shift_mb<opc, OpcodeStr, OpNode, _.info256>, EVEX_V256;
+ defm Z128 : avx512_var_shift<opc, OpcodeStr, OpNode, _.info128>,
+ avx512_var_shift_mb<opc, OpcodeStr, OpNode, _.info128>, EVEX_V128;
+ }
+}
+
+multiclass avx512_var_shift_types<bits<8> opc, string OpcodeStr,
+ SDNode OpNode> {
+ defm D : avx512_var_shift_sizes<opc, OpcodeStr#"d", OpNode,
+ avx512vl_i32_info>;
+ defm Q : avx512_var_shift_sizes<opc, OpcodeStr#"q", OpNode,
+ avx512vl_i64_info>, VEX_W;
+}
+
+// Use the 512-bit version to implement the 128/256-bit forms when VLX is
+// not available (NoVLX).
+multiclass avx512_var_shift_w_lowering<AVX512VLVectorVTInfo _, SDNode OpNode> {
+ let Predicates = [HasBWI, NoVLX] in {
+ def : Pat<(_.info256.VT (OpNode (_.info256.VT _.info256.RC:$src1),
+ (_.info256.VT _.info256.RC:$src2))),
+ (EXTRACT_SUBREG
+ (!cast<Instruction>(NAME#"WZrr")
+ (INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR256X:$src1, sub_ymm),
+ (INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR256X:$src2, sub_ymm)),
+ sub_ymm)>;
+
+ def : Pat<(_.info128.VT (OpNode (_.info128.VT _.info128.RC:$src1),
+ (_.info128.VT _.info128.RC:$src2))),
+ (EXTRACT_SUBREG
+ (!cast<Instruction>(NAME#"WZrr")
+ (INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR128X:$src1, sub_xmm),
+ (INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR128X:$src2, sub_xmm)),
+ sub_xmm)>;
+ }
+}
+
+multiclass avx512_var_shift_w<bits<8> opc, string OpcodeStr,
+ SDNode OpNode> {
+ let Predicates = [HasBWI] in
+ defm WZ: avx512_var_shift<opc, OpcodeStr, OpNode, v32i16_info>,
+ EVEX_V512, VEX_W;
+ let Predicates = [HasVLX, HasBWI] in {
+
+ defm WZ256: avx512_var_shift<opc, OpcodeStr, OpNode, v16i16x_info>,
+ EVEX_V256, VEX_W;
+ defm WZ128: avx512_var_shift<opc, OpcodeStr, OpNode, v8i16x_info>,
+ EVEX_V128, VEX_W;
+ }
+}
+
+defm VPSLLV : avx512_var_shift_types<0x47, "vpsllv", shl>,
+ avx512_var_shift_w<0x12, "vpsllvw", shl>,
+ avx512_var_shift_w_lowering<avx512vl_i16_info, shl>;
+defm VPSRAV : avx512_var_shift_types<0x46, "vpsrav", sra>,
+ avx512_var_shift_w<0x11, "vpsravw", sra>,
+ avx512_var_shift_w_lowering<avx512vl_i16_info, sra>;
+defm VPSRLV : avx512_var_shift_types<0x45, "vpsrlv", srl>,
+ avx512_var_shift_w<0x10, "vpsrlvw", srl>,
+ avx512_var_shift_w_lowering<avx512vl_i16_info, srl>;
+defm VPRORV : avx512_var_shift_types<0x14, "vprorv", rotr>;
+defm VPROLV : avx512_var_shift_types<0x15, "vprolv", rotl>;
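+
+// The variable forms shift or rotate each element by the count in the
+// corresponding element of src2; a minimal C sketch, assuming an AVX512F
+// target and <immintrin.h>:
+//
+//   #include <immintrin.h>
+//   __m512i sar_v(__m512i a, __m512i n) { return _mm512_srav_epi32(a, n); } // vpsravd
+//   __m512i rol_v(__m512i a, __m512i n) { return _mm512_rolv_epi32(a, n); } // vprolvd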
+
+//===-------------------------------------------------------------------===//
+// 1-src variable permutation VPERMW/D/Q/PS/PD
+//===-------------------------------------------------------------------===//
+multiclass avx512_vperm_dq_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ AVX512VLVectorVTInfo _> {
+ let Predicates = [HasAVX512] in
+ defm Z : avx512_var_shift<opc, OpcodeStr, OpNode, _.info512>,
+ avx512_var_shift_mb<opc, OpcodeStr, OpNode, _.info512>, EVEX_V512;
+
+ let Predicates = [HasAVX512, HasVLX] in
+ defm Z256 : avx512_var_shift<opc, OpcodeStr, OpNode, _.info256>,
+ avx512_var_shift_mb<opc, OpcodeStr, OpNode, _.info256>, EVEX_V256;
+}
+
+multiclass avx512_vpermi_dq_sizes<bits<8> opc, Format ImmFormR, Format ImmFormM,
+ string OpcodeStr, SDNode OpNode,
+ AVX512VLVectorVTInfo VTInfo> {
+ let Predicates = [HasAVX512] in
+ defm Z: avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode,
+ VTInfo.info512>,
+ avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode,
+ VTInfo.info512>, EVEX_V512;
+ let Predicates = [HasAVX512, HasVLX] in
+ defm Z256: avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode,
+ VTInfo.info256>,
+ avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode,
+ VTInfo.info256>, EVEX_V256;
+}
+
+
+defm VPERM : avx512_var_shift_w<0x8D, "vpermw", X86VPermv>;
+
+defm VPERMD : avx512_vperm_dq_sizes<0x36, "vpermd", X86VPermv,
+ avx512vl_i32_info>;
+defm VPERMQ : avx512_vperm_dq_sizes<0x36, "vpermq", X86VPermv,
+ avx512vl_i64_info>, VEX_W;
+defm VPERMPS : avx512_vperm_dq_sizes<0x16, "vpermps", X86VPermv,
+ avx512vl_f32_info>;
+defm VPERMPD : avx512_vperm_dq_sizes<0x16, "vpermpd", X86VPermv,
+ avx512vl_f64_info>, VEX_W;
+
+defm VPERMQ : avx512_vpermi_dq_sizes<0x00, MRMSrcReg, MRMSrcMem, "vpermq",
+ X86VPermi, avx512vl_i64_info>,
+ EVEX, AVX512AIi8Base, EVEX_CD8<64, CD8VF>, VEX_W;
+defm VPERMPD : avx512_vpermi_dq_sizes<0x01, MRMSrcReg, MRMSrcMem, "vpermpd",
+ X86VPermi, avx512vl_f64_info>,
+ EVEX, AVX512AIi8Base, EVEX_CD8<64, CD8VF>, VEX_W;
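+
+// vpermd/vpermps take their lane indices from a register, while the
+// AVX512AIi8Base vpermq/vpermpd forms take an immediate; a minimal C sketch,
+// assuming an AVX512F target and <immintrin.h>:
+//
+//   #include <immintrin.h>
+//   __m512i pick_lanes(__m512i idx, __m512i a) {
+//     return _mm512_permutexvar_epi32(idx, a);  // vpermd
+//   }
+//   __m512i swap_qword_pairs(__m512i a) {
+//     return _mm512_permutex_epi64(a, 0xB1);    // vpermq imm, per 256-bit lane
+//   }
+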
+//===----------------------------------------------------------------------===//
+// AVX-512 - VPERMIL
+//===----------------------------------------------------------------------===//
+
+multiclass avx512_permil_vec<bits<8> OpcVar, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _, X86VectorVTInfo Ctrl> {
+ defm rr: AVX512_maskable<OpcVar, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, Ctrl.RC:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (_.VT (OpNode _.RC:$src1,
+ (Ctrl.VT Ctrl.RC:$src2)))>,
+ T8PD, EVEX_4V;
+ let mayLoad = 1 in {
+ defm rm: AVX512_maskable<OpcVar, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, Ctrl.MemOp:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (_.VT (OpNode
+ _.RC:$src1,
+ (Ctrl.VT (bitconvert(Ctrl.LdFrag addr:$src2)))))>,
+ T8PD, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>;
+ defm rmb: AVX512_maskable<OpcVar, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
+ "${src2}"##_.BroadcastStr##", $src1",
+ "$src1, ${src2}"##_.BroadcastStr,
+ (_.VT (OpNode
+ _.RC:$src1,
+ (Ctrl.VT (X86VBroadcast
+ (Ctrl.ScalarLdFrag addr:$src2)))))>,
+ T8PD, EVEX_4V, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>;
+ }//let mayLoad = 1
+}
+
+multiclass avx512_permil_vec_common<string OpcodeStr, bits<8> OpcVar,
+ AVX512VLVectorVTInfo _, AVX512VLVectorVTInfo Ctrl>{
+ let Predicates = [HasAVX512] in {
+ defm Z : avx512_permil_vec<OpcVar, OpcodeStr, X86VPermilpv, _.info512,
+ Ctrl.info512>, EVEX_V512;
+ }
+ let Predicates = [HasAVX512, HasVLX] in {
+ defm Z128 : avx512_permil_vec<OpcVar, OpcodeStr, X86VPermilpv, _.info128,
+ Ctrl.info128>, EVEX_V128;
+ defm Z256 : avx512_permil_vec<OpcVar, OpcodeStr, X86VPermilpv, _.info256,
+ Ctrl.info256>, EVEX_V256;
+ }
+}
+
+multiclass avx512_permil<string OpcodeStr, bits<8> OpcImm, bits<8> OpcVar,
+ AVX512VLVectorVTInfo _, AVX512VLVectorVTInfo Ctrl>{
+
+ defm NAME: avx512_permil_vec_common<OpcodeStr, OpcVar, _, Ctrl>;
+ defm NAME: avx512_shift_rmi_sizes<OpcImm, MRMSrcReg, MRMSrcMem, OpcodeStr,
+ X86VPermilpi, _>,
+ EVEX, AVX512AIi8Base, EVEX_CD8<_.info128.EltSize, CD8VF>;
+}
+
+defm VPERMILPS : avx512_permil<"vpermilps", 0x04, 0x0C, avx512vl_f32_info,
+ avx512vl_i32_info>;
+defm VPERMILPD : avx512_permil<"vpermilpd", 0x05, 0x0D, avx512vl_f64_info,
+ avx512vl_i64_info>, VEX_W;
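+
+// VPERMILPS/PD shuffle elements within each 128-bit lane, driven by either
+// an immediate or a per-element control vector; a minimal C sketch, assuming
+// an AVX512F target and <immintrin.h>:
+//
+//   #include <immintrin.h>
+//   __m512 rev_lanes(__m512 a)             { return _mm512_permute_ps(a, 0x1B); }
+//   __m512 var_perm(__m512 a, __m512i sel) { return _mm512_permutevar_ps(a, sel); }
+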
+//===----------------------------------------------------------------------===//
+// AVX-512 - VPSHUFD, VPSHUFLW, VPSHUFHW
+//===----------------------------------------------------------------------===//
+
+defm VPSHUFD : avx512_shift_rmi_sizes<0x70, MRMSrcReg, MRMSrcMem, "vpshufd",
+ X86PShufd, avx512vl_i32_info>,
+ EVEX, AVX512BIi8Base, EVEX_CD8<32, CD8VF>;
+defm VPSHUFH : avx512_shift_rmi_w<0x70, MRMSrcReg, MRMSrcMem, "vpshufhw",
+ X86PShufhw>, EVEX, AVX512XSIi8Base;
+defm VPSHUFL : avx512_shift_rmi_w<0x70, MRMSrcReg, MRMSrcMem, "vpshuflw",
+ X86PShuflw>, EVEX, AVX512XDIi8Base;
+
+multiclass avx512_pshufb_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode> {
+ let Predicates = [HasBWI] in
+ defm Z: avx512_var_shift<opc, OpcodeStr, OpNode, v64i8_info>, EVEX_V512;
+
+ let Predicates = [HasVLX, HasBWI] in {
+ defm Z256: avx512_var_shift<opc, OpcodeStr, OpNode, v32i8x_info>, EVEX_V256;
+ defm Z128: avx512_var_shift<opc, OpcodeStr, OpNode, v16i8x_info>, EVEX_V128;
+ }
+}
+
+defm VPSHUFB: avx512_pshufb_sizes<0x00, "vpshufb", X86pshufb>;
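+
+// vpshufb selects bytes within each 128-bit lane; a set high bit in the
+// selector zeroes the destination byte. A minimal C sketch, assuming an
+// AVX512BW target and <immintrin.h>:
+//
+//   #include <immintrin.h>
+//   __m512i byte_shuffle(__m512i a, __m512i sel) {
+//     return _mm512_shuffle_epi8(a, sel);
+//   }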
+
+//===----------------------------------------------------------------------===//
+// Move Low to High and High to Low packed FP Instructions
+//===----------------------------------------------------------------------===//
+def VMOVLHPSZrr : AVX512PSI<0x16, MRMSrcReg, (outs VR128X:$dst),
+ (ins VR128X:$src1, VR128X:$src2),
+ "vmovlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128X:$dst, (v4f32 (X86Movlhps VR128X:$src1, VR128X:$src2)))],
+ IIC_SSE_MOV_LH>, EVEX_4V;
+def VMOVHLPSZrr : AVX512PSI<0x12, MRMSrcReg, (outs VR128X:$dst),
+ (ins VR128X:$src1, VR128X:$src2),
+ "vmovhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128X:$dst, (v4f32 (X86Movhlps VR128X:$src1, VR128X:$src2)))],
+ IIC_SSE_MOV_LH>, EVEX_4V;
+
+let Predicates = [HasAVX512] in {
+ // MOVLHPS patterns
+ def : Pat<(v4i32 (X86Movlhps VR128X:$src1, VR128X:$src2)),
+ (VMOVLHPSZrr VR128X:$src1, VR128X:$src2)>;
+ def : Pat<(v2i64 (X86Movlhps VR128X:$src1, VR128X:$src2)),
+ (VMOVLHPSZrr (v2i64 VR128X:$src1), VR128X:$src2)>;
+
+ // MOVHLPS patterns
+ def : Pat<(v4i32 (X86Movhlps VR128X:$src1, VR128X:$src2)),
+ (VMOVHLPSZrr VR128X:$src1, VR128X:$src2)>;
+}
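+
+// A minimal C sketch of the two operations, assuming SSE and the standard
+// <immintrin.h> intrinsics (the Z instructions are the EVEX encodings of the
+// same operation):
+//
+//   #include <immintrin.h>
+//   __m128 lo2(__m128 a, __m128 b) { return _mm_movelh_ps(a, b); } // movlhps
+//   __m128 hi2(__m128 a, __m128 b) { return _mm_movehl_ps(a, b); } // movhlps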
+
+//===----------------------------------------------------------------------===//
+// VMOVHPS/PD VMOVLPS/PD Instructions
+// All patterns were taken from the SSE implementation.
+//===----------------------------------------------------------------------===//
+multiclass avx512_mov_hilo_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _> {
+ let mayLoad = 1 in
+ def rm : AVX512<opc, MRMSrcMem, (outs _.RC:$dst),
+ (ins _.RC:$src1, f64mem:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set _.RC:$dst,
+ (OpNode _.RC:$src1,
+ (_.VT (bitconvert
+ (v2f64 (scalar_to_vector (loadf64 addr:$src2)))))))],
+ IIC_SSE_MOV_LH>, EVEX_4V;
+}
+
+defm VMOVHPSZ128 : avx512_mov_hilo_packed<0x16, "vmovhps", X86Movlhps,
+ v4f32x_info>, EVEX_CD8<32, CD8VT2>, PS;
+defm VMOVHPDZ128 : avx512_mov_hilo_packed<0x16, "vmovhpd", X86Movlhpd,
+ v2f64x_info>, EVEX_CD8<64, CD8VT1>, PD, VEX_W;
+defm VMOVLPSZ128 : avx512_mov_hilo_packed<0x12, "vmovlps", X86Movlps,
+ v4f32x_info>, EVEX_CD8<32, CD8VT2>, PS;
+defm VMOVLPDZ128 : avx512_mov_hilo_packed<0x12, "vmovlpd", X86Movlpd,
+ v2f64x_info>, EVEX_CD8<64, CD8VT1>, PD, VEX_W;
+
+let Predicates = [HasAVX512] in {
+ // VMOVHPS patterns
+ def : Pat<(X86Movlhps VR128X:$src1,
+ (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))),
+ (VMOVHPSZ128rm VR128X:$src1, addr:$src2)>;
+ def : Pat<(X86Movlhps VR128X:$src1,
+ (bc_v4i32 (v2i64 (X86vzload addr:$src2)))),
+ (VMOVHPSZ128rm VR128X:$src1, addr:$src2)>;
+ // VMOVHPD patterns
+ def : Pat<(v2f64 (X86Unpckl VR128X:$src1,
+ (scalar_to_vector (loadf64 addr:$src2)))),
+ (VMOVHPDZ128rm VR128X:$src1, addr:$src2)>;
+ def : Pat<(v2f64 (X86Unpckl VR128X:$src1,
+ (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))),
+ (VMOVHPDZ128rm VR128X:$src1, addr:$src2)>;
+ // VMOVLPS patterns
+ def : Pat<(v4f32 (X86Movlps VR128X:$src1, (load addr:$src2))),
+ (VMOVLPSZ128rm VR128X:$src1, addr:$src2)>;
+ def : Pat<(v4i32 (X86Movlps VR128X:$src1, (load addr:$src2))),
+ (VMOVLPSZ128rm VR128X:$src1, addr:$src2)>;
+ // VMOVLPD patterns
+ def : Pat<(v2f64 (X86Movlpd VR128X:$src1, (load addr:$src2))),
+ (VMOVLPDZ128rm VR128X:$src1, addr:$src2)>;
+ def : Pat<(v2i64 (X86Movlpd VR128X:$src1, (load addr:$src2))),
+ (VMOVLPDZ128rm VR128X:$src1, addr:$src2)>;
+ def : Pat<(v2f64 (X86Movsd VR128X:$src1,
+ (v2f64 (scalar_to_vector (loadf64 addr:$src2))))),
+ (VMOVLPDZ128rm VR128X:$src1, addr:$src2)>;
+}
+
+let mayStore = 1 in {
+def VMOVHPSZ128mr : AVX512PSI<0x17, MRMDestMem, (outs),
+ (ins f64mem:$dst, VR128X:$src),
+ "vmovhps\t{$src, $dst|$dst, $src}",
+ [(store (f64 (vector_extract
+ (X86Unpckh (bc_v2f64 (v4f32 VR128X:$src)),
+ (bc_v2f64 (v4f32 VR128X:$src))),
+ (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>,
+ EVEX, EVEX_CD8<32, CD8VT2>;
+def VMOVHPDZ128mr : AVX512PDI<0x17, MRMDestMem, (outs),
+ (ins f64mem:$dst, VR128X:$src),
+ "vmovhpd\t{$src, $dst|$dst, $src}",
+ [(store (f64 (vector_extract
+ (v2f64 (X86Unpckh VR128X:$src, VR128X:$src)),
+ (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>,
+ EVEX, EVEX_CD8<64, CD8VT1>, VEX_W;
+def VMOVLPSZ128mr : AVX512PSI<0x13, MRMDestMem, (outs),
+ (ins f64mem:$dst, VR128X:$src),
+ "vmovlps\t{$src, $dst|$dst, $src}",
+ [(store (f64 (vector_extract (bc_v2f64 (v4f32 VR128X:$src)),
+ (iPTR 0))), addr:$dst)],
+ IIC_SSE_MOV_LH>,
+ EVEX, EVEX_CD8<32, CD8VT2>;
+def VMOVLPDZ128mr : AVX512PDI<0x13, MRMDestMem, (outs),
+ (ins f64mem:$dst, VR128X:$src),
+ "vmovlpd\t{$src, $dst|$dst, $src}",
+ [(store (f64 (vector_extract (v2f64 VR128X:$src),
+ (iPTR 0))), addr:$dst)],
+ IIC_SSE_MOV_LH>,
+ EVEX, EVEX_CD8<64, CD8VT1>, VEX_W;
+}
+let Predicates = [HasAVX512] in {
+ // VMOVHPD patterns
+ def : Pat<(store (f64 (vector_extract
+ (v2f64 (X86VPermilpi VR128X:$src, (i8 1))),
+ (iPTR 0))), addr:$dst),
+ (VMOVHPDZ128mr addr:$dst, VR128X:$src)>;
+ // VMOVLPS patterns
+ def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128X:$src2)),
+ addr:$src1),
+ (VMOVLPSZ128mr addr:$src1, VR128X:$src2)>;
+ def : Pat<(store (v4i32 (X86Movlps
+ (bc_v4i32 (loadv2i64 addr:$src1)), VR128X:$src2)), addr:$src1),
+ (VMOVLPSZ128mr addr:$src1, VR128X:$src2)>;
+ // VMOVLPD patterns
+ def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128X:$src2)),
+ addr:$src1),
+ (VMOVLPDZ128mr addr:$src1, VR128X:$src2)>;
+ def : Pat<(store (v2i64 (X86Movlpd (load addr:$src1), VR128X:$src2)),
+ addr:$src1),
+ (VMOVLPDZ128mr addr:$src1, VR128X:$src2)>;
+}
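+
+// A minimal C sketch of the half-width load/store forms, assuming SSE2 and
+// the standard <immintrin.h> intrinsics:
+//
+//   #include <immintrin.h>
+//   __m128d load_hi(__m128d a, const double *p) { return _mm_loadh_pd(a, p); }
+//   void store_lo(double *p, __m128d a)         { _mm_storel_pd(p, a); }
+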
+//===----------------------------------------------------------------------===//
+// FMA - Fused Multiply Operations
+//
+
+let Constraints = "$src1 = $dst" in {
+multiclass avx512_fma3p_213_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _> {
+ defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.RC:$src3),
+ OpcodeStr, "$src3, $src2", "$src2, $src3",
+ (_.VT (OpNode _.RC:$src1, _.RC:$src2, _.RC:$src3))>,
+ AVX512FMA3Base;
+
+ let mayLoad = 1 in {
+ defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.MemOp:$src3),
+ OpcodeStr, "$src3, $src2", "$src2, $src3",
+ (_.VT (OpNode _.RC:$src1, _.RC:$src2, (_.LdFrag addr:$src3)))>,
+ AVX512FMA3Base;
+
+ defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.ScalarMemOp:$src3),
+ OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"),
+ !strconcat("$src2, ${src3}", _.BroadcastStr ),
+ (OpNode _.RC:$src1,
+ _.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))))>,
+ AVX512FMA3Base, EVEX_B;
+ }
+}
+
+multiclass avx512_fma3_213_round<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _> {
+ defm rb: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc),
+ OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc",
+ (_.VT ( OpNode _.RC:$src1, _.RC:$src2, _.RC:$src3, (i32 imm:$rc)))>,
+ AVX512FMA3Base, EVEX_B, EVEX_RC;
+}
+} // Constraints = "$src1 = $dst"
+
+multiclass avx512_fma3p_213_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDNode OpNodeRnd, AVX512VLVectorVTInfo _> {
+ let Predicates = [HasAVX512] in {
+ defm Z : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, _.info512>,
+ avx512_fma3_213_round<opc, OpcodeStr, OpNodeRnd, _.info512>,
+ EVEX_V512, EVEX_CD8<_.info512.EltSize, CD8VF>;
+ }
+ let Predicates = [HasVLX, HasAVX512] in {
+ defm Z256 : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, _.info256>,
+ EVEX_V256, EVEX_CD8<_.info256.EltSize, CD8VF>;
+ defm Z128 : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, _.info128>,
+ EVEX_V128, EVEX_CD8<_.info128.EltSize, CD8VF>;
+ }
+}
+
+multiclass avx512_fma3p_213_f<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDNode OpNodeRnd > {
+ defm PS : avx512_fma3p_213_common<opc, OpcodeStr#"ps", OpNode, OpNodeRnd,
+ avx512vl_f32_info>;
+ defm PD : avx512_fma3p_213_common<opc, OpcodeStr#"pd", OpNode, OpNodeRnd,
+ avx512vl_f64_info>, VEX_W;
+}
+
+defm VFMADD213 : avx512_fma3p_213_f<0xA8, "vfmadd213", X86Fmadd, X86FmaddRnd>;
+defm VFMSUB213 : avx512_fma3p_213_f<0xAA, "vfmsub213", X86Fmsub, X86FmsubRnd>;
+defm VFMADDSUB213 : avx512_fma3p_213_f<0xA6, "vfmaddsub213", X86Fmaddsub, X86FmaddsubRnd>;
+defm VFMSUBADD213 : avx512_fma3p_213_f<0xA7, "vfmsubadd213", X86Fmsubadd, X86FmsubaddRnd>;
+defm VFNMADD213 : avx512_fma3p_213_f<0xAC, "vfnmadd213", X86Fnmadd, X86FnmaddRnd>;
+defm VFNMSUB213 : avx512_fma3p_213_f<0xAE, "vfnmsub213", X86Fnmsub, X86FnmsubRnd>;
+
+
+let Constraints = "$src1 = $dst" in {
+multiclass avx512_fma3p_231_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _> {
+ defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.RC:$src3),
+ OpcodeStr, "$src3, $src2", "$src2, $src3",
+ (_.VT (OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1))>,
+ AVX512FMA3Base;
+
+ let mayLoad = 1 in {
+ defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.MemOp:$src3),
+ OpcodeStr, "$src3, $src2", "$src2, $src3",
+ (_.VT (OpNode _.RC:$src2, (_.LdFrag addr:$src3), _.RC:$src1))>,
+ AVX512FMA3Base;
+
+ defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.ScalarMemOp:$src3),
+ OpcodeStr, "${src3}"##_.BroadcastStr##", $src2",
+ "$src2, ${src3}"##_.BroadcastStr,
+ (_.VT (OpNode _.RC:$src2,
+ (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src3))),
+ _.RC:$src1))>, AVX512FMA3Base, EVEX_B;
+ }
+}
+
+multiclass avx512_fma3_231_round<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _> {
+ defm rb: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc),
+ OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc",
+ (_.VT ( OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1, (i32 imm:$rc)))>,
+ AVX512FMA3Base, EVEX_B, EVEX_RC;
+}
+} // Constraints = "$src1 = $dst"
+
+multiclass avx512_fma3p_231_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDNode OpNodeRnd, AVX512VLVectorVTInfo _> {
+ let Predicates = [HasAVX512] in {
+ defm Z : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, _.info512>,
+ avx512_fma3_231_round<opc, OpcodeStr, OpNodeRnd, _.info512>,
+ EVEX_V512, EVEX_CD8<_.info512.EltSize, CD8VF>;
+ }
+ let Predicates = [HasVLX, HasAVX512] in {
+ defm Z256 : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, _.info256>,
+ EVEX_V256, EVEX_CD8<_.info256.EltSize, CD8VF>;
+ defm Z128 : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, _.info128>,
+ EVEX_V128, EVEX_CD8<_.info128.EltSize, CD8VF>;
+ }
+}
+
+multiclass avx512_fma3p_231_f<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDNode OpNodeRnd > {
+ defm PS : avx512_fma3p_231_common<opc, OpcodeStr#"ps", OpNode, OpNodeRnd,
+ avx512vl_f32_info>;
+ defm PD : avx512_fma3p_231_common<opc, OpcodeStr#"pd", OpNode, OpNodeRnd,
+ avx512vl_f64_info>, VEX_W;
+}
+
+defm VFMADD231 : avx512_fma3p_231_f<0xB8, "vfmadd231", X86Fmadd, X86FmaddRnd>;
+defm VFMSUB231 : avx512_fma3p_231_f<0xBA, "vfmsub231", X86Fmsub, X86FmsubRnd>;
+defm VFMADDSUB231 : avx512_fma3p_231_f<0xB6, "vfmaddsub231", X86Fmaddsub, X86FmaddsubRnd>;
+defm VFMSUBADD231 : avx512_fma3p_231_f<0xB7, "vfmsubadd231", X86Fmsubadd, X86FmsubaddRnd>;
+defm VFNMADD231 : avx512_fma3p_231_f<0xBC, "vfnmadd231", X86Fnmadd, X86FnmaddRnd>;
+defm VFNMSUB231 : avx512_fma3p_231_f<0xBE, "vfnmsub231", X86Fnmsub, X86FnmsubRnd>;
+
+let Constraints = "$src1 = $dst" in {
+multiclass avx512_fma3p_132_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _> {
+ defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src3, _.RC:$src2),
+ OpcodeStr, "$src2, $src3", "$src3, $src2",
+ (_.VT (OpNode _.RC:$src1, _.RC:$src2, _.RC:$src3))>,
+ AVX512FMA3Base;
+
+ let mayLoad = 1 in {
+ defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src3, _.MemOp:$src2),
+ OpcodeStr, "$src2, $src3", "$src3, $src2",
+ (_.VT (OpNode _.RC:$src1, (_.LdFrag addr:$src2), _.RC:$src3))>,
+ AVX512FMA3Base;
+
+ defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src3, _.ScalarMemOp:$src2),
+ OpcodeStr, "${src2}"##_.BroadcastStr##", $src3",
+ "$src3, ${src2}"##_.BroadcastStr,
+ (_.VT (OpNode _.RC:$src1,
+ (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))),
+ _.RC:$src3))>, AVX512FMA3Base, EVEX_B;
+ }
+}
+
+multiclass avx512_fma3_132_round<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _> {
+ defm rb: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src3, _.RC:$src2, AVX512RC:$rc),
+ OpcodeStr, "$rc, $src2, $src3", "$src3, $src2, $rc",
+ (_.VT ( OpNode _.RC:$src1, _.RC:$src2, _.RC:$src3, (i32 imm:$rc)))>,
+ AVX512FMA3Base, EVEX_B, EVEX_RC;
+}
+} // Constraints = "$src1 = $dst"
+
+multiclass avx512_fma3p_132_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDNode OpNodeRnd, AVX512VLVectorVTInfo _> {
+ let Predicates = [HasAVX512] in {
+ defm Z : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, _.info512>,
+ avx512_fma3_132_round<opc, OpcodeStr, OpNodeRnd, _.info512>,
+ EVEX_V512, EVEX_CD8<_.info512.EltSize, CD8VF>;
+ }
+ let Predicates = [HasVLX, HasAVX512] in {
+ defm Z256 : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, _.info256>,
+ EVEX_V256, EVEX_CD8<_.info256.EltSize, CD8VF>;
+ defm Z128 : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, _.info128>,
+ EVEX_V128, EVEX_CD8<_.info128.EltSize, CD8VF>;
+ }
+}
+
+multiclass avx512_fma3p_132_f<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDNode OpNodeRnd > {
+ defm PS : avx512_fma3p_132_common<opc, OpcodeStr#"ps", OpNode, OpNodeRnd,
+ avx512vl_f32_info>;
+ defm PD : avx512_fma3p_132_common<opc, OpcodeStr#"pd", OpNode, OpNodeRnd,
+ avx512vl_f64_info>, VEX_W;
+}
+
+defm VFMADD132 : avx512_fma3p_132_f<0x98, "vfmadd132", X86Fmadd, X86FmaddRnd>;
+defm VFMSUB132 : avx512_fma3p_132_f<0x9A, "vfmsub132", X86Fmsub, X86FmsubRnd>;
+defm VFMADDSUB132 : avx512_fma3p_132_f<0x96, "vfmaddsub132", X86Fmaddsub, X86FmaddsubRnd>;
+defm VFMSUBADD132 : avx512_fma3p_132_f<0x97, "vfmsubadd132", X86Fmsubadd, X86FmsubaddRnd>;
+defm VFNMADD132 : avx512_fma3p_132_f<0x9C, "vfnmadd132", X86Fnmadd, X86FnmaddRnd>;
+defm VFNMSUB132 : avx512_fma3p_132_f<0x9E, "vfnmsub132", X86Fnmsub, X86FnmsubRnd>;
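+
+// The 132/213/231 forms compute the same products but overwrite a different
+// source, so register allocation can preserve whichever operand is still
+// live; with masking, unselected lanes always keep $src1. A minimal C
+// sketch, assuming an AVX512F target and <immintrin.h>:
+//
+//   #include <immintrin.h>
+//   __m512 fma(__m512 a, __m512 b, __m512 c) {
+//     return _mm512_fmadd_ps(a, b, c);             // a*b + c
+//   }
+//   __m512 fma_merge(__m512 a, __mmask16 k, __m512 b, __m512 c) {
+//     return _mm512_mask_fmadd_ps(a, k, b, c);     // masked-off lanes keep a
+//   }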
+
+// Scalar FMA
+let Constraints = "$src1 = $dst" in {
+multiclass avx512_fma3s_common<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
+ dag RHS_VEC_r, dag RHS_VEC_m, dag RHS_VEC_rb,
+ dag RHS_r, dag RHS_m > {
+ defm r_Int: AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.RC:$src3), OpcodeStr,
+ "$src3, $src2", "$src2, $src3", RHS_VEC_r>, AVX512FMA3Base;
+
+ let mayLoad = 1 in
+ defm m_Int: AVX512_maskable_3src_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.MemOp:$src3), OpcodeStr,
+ "$src3, $src2", "$src2, $src3", RHS_VEC_m>, AVX512FMA3Base;
+
+ defm rb_Int: AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc),
+ OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc", RHS_VEC_rb>,
+ AVX512FMA3Base, EVEX_B, EVEX_RC;
+
+ let isCodeGenOnly = 1 in {
+ def r : AVX512FMA3<opc, MRMSrcReg, (outs _.FRC:$dst),
+ (ins _.FRC:$src1, _.FRC:$src2, _.FRC:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [RHS_r]>;
+ let mayLoad = 1 in
+ def m : AVX512FMA3<opc, MRMSrcMem, (outs _.FRC:$dst),
+ (ins _.FRC:$src1, _.FRC:$src2, _.ScalarMemOp:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [RHS_m]>;
+ }// isCodeGenOnly = 1
+}
+}// Constraints = "$src1 = $dst"
+
+multiclass avx512_fma3s_all<bits<8> opc213, bits<8> opc231, bits<8> opc132,
+ string OpcodeStr, SDNode OpNode, SDNode OpNodeRnd, X86VectorVTInfo _ ,
+ string SUFF> {
+
+ defm NAME#213#SUFF: avx512_fma3s_common<opc213, OpcodeStr#"213"#_.Suffix, _,
+ (_.VT (OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3)),
+ (_.VT (OpNode _.RC:$src2, _.RC:$src1,
+ (_.VT (scalar_to_vector(_.ScalarLdFrag addr:$src3))))),
+ (_.VT ( OpNodeRnd _.RC:$src2, _.RC:$src1, _.RC:$src3,
+ (i32 imm:$rc))),
+ (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, _.FRC:$src1,
+ _.FRC:$src3))),
+ (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, _.FRC:$src1,
+ (_.ScalarLdFrag addr:$src3))))>;
+
+ defm NAME#231#SUFF: avx512_fma3s_common<opc231, OpcodeStr#"231"#_.Suffix, _,
+ (_.VT (OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1)),
+ (_.VT (OpNode _.RC:$src2,
+ (_.VT (scalar_to_vector(_.ScalarLdFrag addr:$src3))),
+ _.RC:$src1)),
+ (_.VT ( OpNodeRnd _.RC:$src2, _.RC:$src3, _.RC:$src1,
+ (i32 imm:$rc))),
+ (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, _.FRC:$src3,
+ _.FRC:$src1))),
+ (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2,
+ (_.ScalarLdFrag addr:$src3), _.FRC:$src1)))>;
+
+ defm NAME#132#SUFF: avx512_fma3s_common<opc132, OpcodeStr#"132"#_.Suffix, _,
+ (_.VT (OpNode _.RC:$src1, _.RC:$src3, _.RC:$src2)),
+ (_.VT (OpNode _.RC:$src1,
+ (_.VT (scalar_to_vector(_.ScalarLdFrag addr:$src3))),
+ _.RC:$src2)),
+ (_.VT ( OpNodeRnd _.RC:$src1, _.RC:$src3, _.RC:$src2,
+ (i32 imm:$rc))),
+ (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src1, _.FRC:$src3,
+ _.FRC:$src2))),
+ (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src1,
+ (_.ScalarLdFrag addr:$src3), _.FRC:$src2)))>;
+}
+
+multiclass avx512_fma3s<bits<8> opc213, bits<8> opc231, bits<8> opc132,
+ string OpcodeStr, SDNode OpNode, SDNode OpNodeRnd>{
+ let Predicates = [HasAVX512] in {
+ defm NAME : avx512_fma3s_all<opc213, opc231, opc132, OpcodeStr, OpNode,
+ OpNodeRnd, f32x_info, "SS">,
+ EVEX_CD8<32, CD8VT1>, VEX_LIG;
+ defm NAME : avx512_fma3s_all<opc213, opc231, opc132, OpcodeStr, OpNode,
+ OpNodeRnd, f64x_info, "SD">,
+ EVEX_CD8<64, CD8VT1>, VEX_LIG, VEX_W;
+ }
+}
+
+defm VFMADD : avx512_fma3s<0xA9, 0xB9, 0x99, "vfmadd", X86Fmadd, X86FmaddRnd>;
+defm VFMSUB : avx512_fma3s<0xAB, 0xBB, 0x9B, "vfmsub", X86Fmsub, X86FmsubRnd>;
+defm VFNMADD : avx512_fma3s<0xAD, 0xBD, 0x9D, "vfnmadd", X86Fnmadd, X86FnmaddRnd>;
+defm VFNMSUB : avx512_fma3s<0xAF, 0xBF, 0x9F, "vfnmsub", X86Fnmsub, X86FnmsubRnd>;
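+
+// A minimal C sketch of a scalar form with embedded rounding (the upper
+// elements pass through from the first source), assuming an AVX512F target
+// and <immintrin.h>:
+//
+//   #include <immintrin.h>
+//   __m128 fmadd_lane0(__m128 a, __m128 b, __m128 c) {
+//     return _mm_fmadd_round_ss(a, b, c,
+//                               _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+//   }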
+
+//===----------------------------------------------------------------------===//
+// AVX-512 Scalar convert from sign integer to float/double
+//===----------------------------------------------------------------------===//
+
+multiclass avx512_vcvtsi<bits<8> opc, SDNode OpNode, RegisterClass SrcRC,
+ X86VectorVTInfo DstVT, X86MemOperand x86memop,
+ PatFrag ld_frag, string asm> {
+ let hasSideEffects = 0 in {
+ def rr : SI<opc, MRMSrcReg, (outs DstVT.FRC:$dst),
+ (ins DstVT.FRC:$src1, SrcRC:$src),
+ !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
+ EVEX_4V;
+ let mayLoad = 1 in
+ def rm : SI<opc, MRMSrcMem, (outs DstVT.FRC:$dst),
+ (ins DstVT.FRC:$src1, x86memop:$src),
+ !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
+ EVEX_4V;
+ } // hasSideEffects = 0
+ let isCodeGenOnly = 1 in {
+ def rr_Int : SI<opc, MRMSrcReg, (outs DstVT.RC:$dst),
+ (ins DstVT.RC:$src1, SrcRC:$src2),
+ !strconcat(asm,"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set DstVT.RC:$dst,
+ (OpNode (DstVT.VT DstVT.RC:$src1),
+ SrcRC:$src2,
+ (i32 FROUND_CURRENT)))]>, EVEX_4V;
+
+ def rm_Int : SI<opc, MRMSrcMem, (outs DstVT.RC:$dst),
+ (ins DstVT.RC:$src1, x86memop:$src2),
+ !strconcat(asm,"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set DstVT.RC:$dst,
+ (OpNode (DstVT.VT DstVT.RC:$src1),
+ (ld_frag addr:$src2),
+ (i32 FROUND_CURRENT)))]>, EVEX_4V;
+ }//isCodeGenOnly = 1
+}
+
+multiclass avx512_vcvtsi_round<bits<8> opc, SDNode OpNode, RegisterClass SrcRC,
+ X86VectorVTInfo DstVT, string asm> {
+ def rrb_Int : SI<opc, MRMSrcReg, (outs DstVT.RC:$dst),
+ (ins DstVT.RC:$src1, SrcRC:$src2, AVX512RC:$rc),
+ !strconcat(asm,
+ "\t{$src2, $rc, $src1, $dst|$dst, $src1, $rc, $src2}"),
+ [(set DstVT.RC:$dst,
+ (OpNode (DstVT.VT DstVT.RC:$src1),
+ SrcRC:$src2,
+ (i32 imm:$rc)))]>, EVEX_4V, EVEX_B, EVEX_RC;
+}
+
+multiclass avx512_vcvtsi_common<bits<8> opc, SDNode OpNode, RegisterClass SrcRC,
+ X86VectorVTInfo DstVT, X86MemOperand x86memop,
+ PatFrag ld_frag, string asm> {
+ defm NAME : avx512_vcvtsi_round<opc, OpNode, SrcRC, DstVT, asm>,
+ avx512_vcvtsi<opc, OpNode, SrcRC, DstVT, x86memop, ld_frag, asm>,
+ VEX_LIG;
+}
+
+let Predicates = [HasAVX512] in {
+defm VCVTSI2SSZ : avx512_vcvtsi_common<0x2A, X86SintToFpRnd, GR32,
+ v4f32x_info, i32mem, loadi32, "cvtsi2ss{l}">,
+ XS, EVEX_CD8<32, CD8VT1>;
+defm VCVTSI642SSZ: avx512_vcvtsi_common<0x2A, X86SintToFpRnd, GR64,
+ v4f32x_info, i64mem, loadi64, "cvtsi2ss{q}">,
+ XS, VEX_W, EVEX_CD8<64, CD8VT1>;
+defm VCVTSI2SDZ : avx512_vcvtsi_common<0x2A, X86SintToFpRnd, GR32,
+ v2f64x_info, i32mem, loadi32, "cvtsi2sd{l}">,
+ XD, EVEX_CD8<32, CD8VT1>;
+defm VCVTSI642SDZ: avx512_vcvtsi_common<0x2A, X86SintToFpRnd, GR64,
+ v2f64x_info, i64mem, loadi64, "cvtsi2sd{q}">,
+ XD, VEX_W, EVEX_CD8<64, CD8VT1>;
+
+def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))),
+ (VCVTSI2SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>;
+def : Pat<(f32 (sint_to_fp (loadi64 addr:$src))),
+ (VCVTSI642SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>;
+def : Pat<(f64 (sint_to_fp (loadi32 addr:$src))),
+ (VCVTSI2SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>;
+def : Pat<(f64 (sint_to_fp (loadi64 addr:$src))),
+ (VCVTSI642SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>;
+
+def : Pat<(f32 (sint_to_fp GR32:$src)),
+ (VCVTSI2SSZrr (f32 (IMPLICIT_DEF)), GR32:$src)>;
+def : Pat<(f32 (sint_to_fp GR64:$src)),
+ (VCVTSI642SSZrr (f32 (IMPLICIT_DEF)), GR64:$src)>;
+def : Pat<(f64 (sint_to_fp GR32:$src)),
+ (VCVTSI2SDZrr (f64 (IMPLICIT_DEF)), GR32:$src)>;
+def : Pat<(f64 (sint_to_fp GR64:$src)),
+ (VCVTSI642SDZrr (f64 (IMPLICIT_DEF)), GR64:$src)>;
+
+defm VCVTUSI2SSZ : avx512_vcvtsi_common<0x7B, X86UintToFpRnd, GR32,
+ v4f32x_info, i32mem, loadi32,
+ "cvtusi2ss{l}">, XS, EVEX_CD8<32, CD8VT1>;
+defm VCVTUSI642SSZ : avx512_vcvtsi_common<0x7B, X86UintToFpRnd, GR64,
+ v4f32x_info, i64mem, loadi64, "cvtusi2ss{q}">,
+ XS, VEX_W, EVEX_CD8<64, CD8VT1>;
+defm VCVTUSI2SDZ : avx512_vcvtsi<0x7B, X86UintToFpRnd, GR32, v2f64x_info,
+ i32mem, loadi32, "cvtusi2sd{l}">,
+ XD, VEX_LIG, EVEX_CD8<32, CD8VT1>;
+defm VCVTUSI642SDZ : avx512_vcvtsi_common<0x7B, X86UintToFpRnd, GR64,
+ v2f64x_info, i64mem, loadi64, "cvtusi2sd{q}">,
+ XD, VEX_W, EVEX_CD8<64, CD8VT1>;
+
+def : Pat<(f32 (uint_to_fp (loadi32 addr:$src))),
+ (VCVTUSI2SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>;
+def : Pat<(f32 (uint_to_fp (loadi64 addr:$src))),
+ (VCVTUSI642SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>;
+def : Pat<(f64 (uint_to_fp (loadi32 addr:$src))),
+ (VCVTUSI2SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>;
+def : Pat<(f64 (uint_to_fp (loadi64 addr:$src))),
+ (VCVTUSI642SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>;
+
+def : Pat<(f32 (uint_to_fp GR32:$src)),
+ (VCVTUSI2SSZrr (f32 (IMPLICIT_DEF)), GR32:$src)>;
+def : Pat<(f32 (uint_to_fp GR64:$src)),
+ (VCVTUSI642SSZrr (f32 (IMPLICIT_DEF)), GR64:$src)>;
+def : Pat<(f64 (uint_to_fp GR32:$src)),
+ (VCVTUSI2SDZrr (f64 (IMPLICIT_DEF)), GR32:$src)>;
+def : Pat<(f64 (uint_to_fp GR64:$src)),
+ (VCVTUSI642SDZrr (f64 (IMPLICIT_DEF)), GR64:$src)>;
+}
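+
+// A minimal C sketch of the signed converts and the AVX-512-only unsigned
+// converts, assuming an AVX512F target and <immintrin.h>:
+//
+//   #include <immintrin.h>
+//   __m128 from_i32(__m128 a, int x)      { return _mm_cvtsi32_ss(a, x); } // vcvtsi2ss
+//   __m128 from_u32(__m128 a, unsigned x) { return _mm_cvtu32_ss(a, x); }  // vcvtusi2ss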
+
+//===----------------------------------------------------------------------===//
+// AVX-512 Scalar convert from float/double to integer
+//===----------------------------------------------------------------------===//
+multiclass avx512_cvt_s_int_round<bits<8> opc, RegisterClass SrcRC,
+ RegisterClass DstRC, Intrinsic Int,
+ Operand memop, ComplexPattern mem_cpat, string asm> {
+ let hasSideEffects = 0, Predicates = [HasAVX512] in {
+ def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
+ !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
+ [(set DstRC:$dst, (Int SrcRC:$src))]>, EVEX, VEX_LIG;
+ def rb : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src, AVX512RC:$rc),
+ !strconcat(asm,"\t{$rc, $src, $dst|$dst, $src, $rc}"), []>,
+ EVEX, VEX_LIG, EVEX_B, EVEX_RC;
+ let mayLoad = 1 in
+ def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src),
+ !strconcat(asm,"\t{$src, $dst|$dst, $src}"), []>, EVEX, VEX_LIG;
+ } // hasSideEffects = 0, Predicates = [HasAVX512]
+}
+
+// Convert float/double to signed/unsigned int 32/64
+defm VCVTSS2SIZ: avx512_cvt_s_int_round<0x2D, VR128X, GR32, int_x86_sse_cvtss2si,
+ ssmem, sse_load_f32, "cvtss2si">,
+ XS, EVEX_CD8<32, CD8VT1>;
+defm VCVTSS2SI64Z: avx512_cvt_s_int_round<0x2D, VR128X, GR64,
+ int_x86_sse_cvtss2si64,
+ ssmem, sse_load_f32, "cvtss2si">,
+ XS, VEX_W, EVEX_CD8<32, CD8VT1>;
+defm VCVTSS2USIZ: avx512_cvt_s_int_round<0x79, VR128X, GR32,
+ int_x86_avx512_cvtss2usi,
+ ssmem, sse_load_f32, "cvtss2usi">,
+ XS, EVEX_CD8<32, CD8VT1>;
+defm VCVTSS2USI64Z: avx512_cvt_s_int_round<0x79, VR128X, GR64,
+ int_x86_avx512_cvtss2usi64, ssmem,
+ sse_load_f32, "cvtss2usi">, XS, VEX_W,
+ EVEX_CD8<32, CD8VT1>;
+defm VCVTSD2SIZ: avx512_cvt_s_int_round<0x2D, VR128X, GR32, int_x86_sse2_cvtsd2si,
+ sdmem, sse_load_f64, "cvtsd2si">,
+ XD, EVEX_CD8<64, CD8VT1>;
+defm VCVTSD2SI64Z: avx512_cvt_s_int_round<0x2D, VR128X, GR64,
+ int_x86_sse2_cvtsd2si64,
+ sdmem, sse_load_f64, "cvtsd2si">,
+ XD, VEX_W, EVEX_CD8<64, CD8VT1>;
+defm VCVTSD2USIZ: avx512_cvt_s_int_round<0x79, VR128X, GR32,
+ int_x86_avx512_cvtsd2usi,
+ sdmem, sse_load_f64, "cvtsd2usi">,
+ XD, EVEX_CD8<64, CD8VT1>;
+defm VCVTSD2USI64Z: avx512_cvt_s_int_round<0x79, VR128X, GR64,
+ int_x86_avx512_cvtsd2usi64, sdmem,
+ sse_load_f64, "cvtsd2usi">, XD, VEX_W,
+ EVEX_CD8<64, CD8VT1>;
+
+let isCodeGenOnly = 1, Predicates = [HasAVX512] in {
+ defm Int_VCVTSI2SSZ : sse12_cvt_sint_3addr<0x2A, GR32, VR128X,
+ int_x86_sse_cvtsi2ss, i32mem, loadi32, "cvtsi2ss{l}",
+ SSE_CVT_Scalar, 0>, XS, EVEX_4V;
+ defm Int_VCVTSI2SS64Z : sse12_cvt_sint_3addr<0x2A, GR64, VR128X,
+ int_x86_sse_cvtsi642ss, i64mem, loadi64, "cvtsi2ss{q}",
+ SSE_CVT_Scalar, 0>, XS, EVEX_4V, VEX_W;
+ defm Int_VCVTSI2SDZ : sse12_cvt_sint_3addr<0x2A, GR32, VR128X,
+ int_x86_sse2_cvtsi2sd, i32mem, loadi32, "cvtsi2sd{l}",
+ SSE_CVT_Scalar, 0>, XD, EVEX_4V;
+ defm Int_VCVTSI2SD64Z : sse12_cvt_sint_3addr<0x2A, GR64, VR128X,
+ int_x86_sse2_cvtsi642sd, i64mem, loadi64, "cvtsi2sd{q}",
+ SSE_CVT_Scalar, 0>, XD, EVEX_4V, VEX_W;
+
+ defm Int_VCVTUSI2SDZ : sse12_cvt_sint_3addr<0x2A, GR32, VR128X,
+ int_x86_avx512_cvtusi2sd, i32mem, loadi32, "cvtusi2sd{l}",
+ SSE_CVT_Scalar, 0>, XD, EVEX_4V;
+} // isCodeGenOnly = 1, Predicates = [HasAVX512]
+
+// Convert float/double to signed/unsigned int 32/64 with truncation
+multiclass avx512_cvt_s_all<bits<8> opc, string asm, X86VectorVTInfo _SrcRC,
+ X86VectorVTInfo _DstRC, SDNode OpNode,
+ SDNode OpNodeRnd>{
+let Predicates = [HasAVX512] in {
+ def rr : SI<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.FRC:$src),
+ !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
+ [(set _DstRC.RC:$dst, (OpNode _SrcRC.FRC:$src))]>, EVEX;
+ def rb : SI<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.FRC:$src),
+ !strconcat(asm,"\t{{sae}, $src, $dst|$dst, $src, {sae}}"),
+ []>, EVEX, EVEX_B;
+ def rm : SI<opc, MRMSrcMem, (outs _DstRC.RC:$dst), (ins _SrcRC.MemOp:$src),
+ !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
+ [(set _DstRC.RC:$dst, (OpNode (_SrcRC.ScalarLdFrag addr:$src)))]>,
+ EVEX;
+
+ let isCodeGenOnly = 1, hasSideEffects = 0 in {
+ def rr_Int : SI<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.RC:$src),
+ !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
+ [(set _DstRC.RC:$dst, (OpNodeRnd _SrcRC.RC:$src,
+ (i32 FROUND_CURRENT)))]>, EVEX, VEX_LIG;
+ def rb_Int : SI<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.RC:$src),
+ !strconcat(asm,"\t{{sae}, $src, $dst|$dst, $src, {sae}}"),
+ [(set _DstRC.RC:$dst, (OpNodeRnd _SrcRC.RC:$src,
+ (i32 FROUND_NO_EXC)))]>,
+ EVEX, VEX_LIG, EVEX_B;
+ let mayLoad = 1 in
+ def rm_Int : SI<opc, MRMSrcMem, (outs _DstRC.RC:$dst),
+ (ins _SrcRC.MemOp:$src),
+ !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
+ []>, EVEX, VEX_LIG;
+
+ } // isCodeGenOnly = 1, hasSideEffects = 0
+} //HasAVX512
+}
+
+
+defm VCVTTSS2SIZ: avx512_cvt_s_all<0x2C, "cvttss2si", f32x_info, i32x_info,
+ fp_to_sint, X86cvttss2IntRnd>,
+ XS, EVEX_CD8<32, CD8VT1>;
+defm VCVTTSS2SI64Z: avx512_cvt_s_all<0x2C, "cvttss2si", f32x_info, i64x_info,
+ fp_to_sint, X86cvttss2IntRnd>,
+ VEX_W, XS, EVEX_CD8<32, CD8VT1>;
+defm VCVTTSD2SIZ: avx512_cvt_s_all<0x2C, "cvttsd2si", f64x_info, i32x_info,
+ fp_to_sint, X86cvttsd2IntRnd>,
+ XD, EVEX_CD8<64, CD8VT1>;
+defm VCVTTSD2SI64Z: avx512_cvt_s_all<0x2C, "cvttsd2si", f64x_info, i64x_info,
+ fp_to_sint, X86cvttsd2IntRnd>,
+ VEX_W, XD, EVEX_CD8<64, CD8VT1>;
+
+defm VCVTTSS2USIZ: avx512_cvt_s_all<0x78, "cvttss2usi", f32x_info, i32x_info,
+ fp_to_uint, X86cvttss2UIntRnd>,
+ XS, EVEX_CD8<32, CD8VT1>;
+defm VCVTTSS2USI64Z: avx512_cvt_s_all<0x78, "cvttss2usi", f32x_info, i64x_info,
+ fp_to_uint, X86cvttss2UIntRnd>,
+ XS, VEX_W, EVEX_CD8<32, CD8VT1>;
+defm VCVTTSD2USIZ: avx512_cvt_s_all<0x78, "cvttsd2usi", f64x_info, i32x_info,
+ fp_to_uint, X86cvttsd2UIntRnd>,
+ XD, EVEX_CD8<64, CD8VT1>;
+defm VCVTTSD2USI64Z: avx512_cvt_s_all<0x78, "cvttsd2usi", f64x_info, i64x_info,
+ fp_to_uint, X86cvttsd2UIntRnd>,
+ XD, VEX_W, EVEX_CD8<64, CD8VT1>;
+let Predicates = [HasAVX512] in {
+ def : Pat<(i32 (int_x86_sse_cvttss2si (v4f32 VR128X:$src))),
+ (VCVTTSS2SIZrr_Int (COPY_TO_REGCLASS VR128X:$src, FR32X))>;
+ def : Pat<(i64 (int_x86_sse_cvttss2si64 (v4f32 VR128X:$src))),
+ (VCVTTSS2SI64Zrr_Int (COPY_TO_REGCLASS VR128X:$src, FR32X))>;
+ def : Pat<(i32 (int_x86_sse2_cvttsd2si (v2f64 VR128X:$src))),
+ (VCVTTSD2SIZrr_Int (COPY_TO_REGCLASS VR128X:$src, FR64X))>;
+ def : Pat<(i64 (int_x86_sse2_cvttsd2si64 (v2f64 VR128X:$src))),
+ (VCVTTSD2SI64Zrr_Int (COPY_TO_REGCLASS VR128X:$src, FR64X))>;
+
+} // HasAVX512
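+
+// A minimal C sketch of the truncating converts, assuming an AVX512F target
+// and <immintrin.h>:
+//
+//   #include <immintrin.h>
+//   int trunc_i32(__m128 a)      { return _mm_cvttss_si32(a); } // vcvttss2si
+//   unsigned trunc_u32(__m128 a) { return _mm_cvttss_u32(a); }  // vcvttss2usi
+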
+//===----------------------------------------------------------------------===//
+// AVX-512 Convert from float to double and back
+//===----------------------------------------------------------------------===//
+multiclass avx512_cvt_fp_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
+ X86VectorVTInfo _Src, SDNode OpNode> {
+ defm rr : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _Src.RC:$src1, _Src.RC:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (_.VT (OpNode (_Src.VT _Src.RC:$src1),
+ (_Src.VT _Src.RC:$src2)))>,
+ EVEX_4V, VEX_LIG, Sched<[WriteCvtF2F]>;
+ defm rm : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _Src.RC:$src1, _Src.MemOp:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (_.VT (OpNode (_Src.VT _Src.RC:$src1),
+ (_Src.VT (scalar_to_vector
+ (_Src.ScalarLdFrag addr:$src2)))))>,
+ EVEX_4V, VEX_LIG, Sched<[WriteCvtF2FLd, ReadAfterLd]>;
+}
+
+// Scalar Conversion with SAE - suppress all exceptions
+multiclass avx512_cvt_fp_sae_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
+ X86VectorVTInfo _Src, SDNode OpNodeRnd> {
+ defm rrb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _Src.RC:$src1, _Src.RC:$src2), OpcodeStr,
+ "{sae}, $src2, $src1", "$src1, $src2, {sae}",
+ (_.VT (OpNodeRnd (_Src.VT _Src.RC:$src1),
+ (_Src.VT _Src.RC:$src2),
+ (i32 FROUND_NO_EXC)))>,
+ EVEX_4V, VEX_LIG, EVEX_B;
+}
+
+// Scalar Conversion with rounding control (RC)
+multiclass avx512_cvt_fp_rc_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
+ X86VectorVTInfo _Src, SDNode OpNodeRnd> {
+ defm rrb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _Src.RC:$src1, _Src.RC:$src2, AVX512RC:$rc), OpcodeStr,
+ "$rc, $src2, $src1", "$src1, $src2, $rc",
+ (_.VT (OpNodeRnd (_Src.VT _Src.RC:$src1),
+ (_Src.VT _Src.RC:$src2), (i32 imm:$rc)))>,
+ EVEX_4V, VEX_LIG, Sched<[WriteCvtF2FLd, ReadAfterLd]>,
+ EVEX_B, EVEX_RC;
+}
+multiclass avx512_cvt_fp_scalar_sd2ss<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDNode OpNodeRnd, X86VectorVTInfo _src,
+ X86VectorVTInfo _dst> {
+ let Predicates = [HasAVX512] in {
+ defm Z : avx512_cvt_fp_scalar<opc, OpcodeStr, _dst, _src, OpNode>,
+ avx512_cvt_fp_rc_scalar<opc, OpcodeStr, _dst, _src,
+ OpNodeRnd>, VEX_W, EVEX_CD8<64, CD8VT1>,
+ EVEX_V512, XD;
+ }
+}
+
+multiclass avx512_cvt_fp_scalar_ss2sd<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDNode OpNodeRnd, X86VectorVTInfo _src,
+ X86VectorVTInfo _dst> {
+ let Predicates = [HasAVX512] in {
+ defm Z : avx512_cvt_fp_scalar<opc, OpcodeStr, _dst, _src, OpNode>,
+ avx512_cvt_fp_sae_scalar<opc, OpcodeStr, _dst, _src, OpNodeRnd>,
+ EVEX_CD8<32, CD8VT1>, XS, EVEX_V512;
+ }
+}
+defm VCVTSD2SS : avx512_cvt_fp_scalar_sd2ss<0x5A, "vcvtsd2ss", X86fround,
+ X86froundRnd, f64x_info, f32x_info>;
+defm VCVTSS2SD : avx512_cvt_fp_scalar_ss2sd<0x5A, "vcvtss2sd", X86fpext,
+                                         X86fpextRnd, f32x_info, f64x_info>;
+
+def : Pat<(f64 (fextend FR32X:$src)),
+ (COPY_TO_REGCLASS (VCVTSS2SDZrr (COPY_TO_REGCLASS FR32X:$src, VR128X),
+ (COPY_TO_REGCLASS FR32X:$src, VR128X)), VR128X)>,
+ Requires<[HasAVX512]>;
+def : Pat<(f64 (fextend (loadf32 addr:$src))),
+ (COPY_TO_REGCLASS (VCVTSS2SDZrm (v4f32 (IMPLICIT_DEF)), addr:$src), VR128X)>,
+ Requires<[HasAVX512]>;
+
+def : Pat<(f64 (extloadf32 addr:$src)),
+ (COPY_TO_REGCLASS (VCVTSS2SDZrm (v4f32 (IMPLICIT_DEF)), addr:$src), VR128X)>,
+ Requires<[HasAVX512, OptForSize]>;
+
+def : Pat<(f64 (extloadf32 addr:$src)),
+ (COPY_TO_REGCLASS (VCVTSS2SDZrr (v4f32 (IMPLICIT_DEF)),
+ (COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)), VR128X)>,
+ Requires<[HasAVX512, OptForSpeed]>;
+
+def : Pat<(f32 (fround FR64X:$src)),
+ (COPY_TO_REGCLASS (VCVTSD2SSZrr (COPY_TO_REGCLASS FR64X:$src, VR128X),
+ (COPY_TO_REGCLASS FR64X:$src, VR128X)), VR128X)>,
+ Requires<[HasAVX512]>;
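+
+// Example (illustrative only) of the register and rounding-control forms
+// defined above, in AT&T syntax:
+//   vcvtsd2ss %xmm2, %xmm1, %xmm0             // round per MXCSR
+//   vcvtsd2ss {rz-sae}, %xmm2, %xmm1, %xmm0   // static round-toward-zero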
+//===----------------------------------------------------------------------===//
+// AVX-512 Vector convert from signed/unsigned integer to float/double
+// and from float/double to signed/unsigned integer
+//===----------------------------------------------------------------------===//
+
+multiclass avx512_vcvt_fp<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
+ X86VectorVTInfo _Src, SDNode OpNode,
+ string Broadcast = _.BroadcastStr,
+ string Alias = ""> {
+
+ defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _Src.RC:$src), OpcodeStr, "$src", "$src",
+ (_.VT (OpNode (_Src.VT _Src.RC:$src)))>, EVEX;
+
+ defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _Src.MemOp:$src), OpcodeStr#Alias, "$src", "$src",
+ (_.VT (OpNode (_Src.VT
+ (bitconvert (_Src.LdFrag addr:$src)))))>, EVEX;
+
+ defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _Src.MemOp:$src), OpcodeStr,
+ "${src}"##Broadcast, "${src}"##Broadcast,
+ (_.VT (OpNode (_Src.VT
+ (X86VBroadcast (_Src.ScalarLdFrag addr:$src)))
+ ))>, EVEX, EVEX_B;
+}
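+
+// For exposition (examples assumed, not from the patch): the rr/rm/rmb forms
+// correspond to register, full-width memory, and embedded-broadcast sources:
+//   vcvtdq2pd %ymm1, %zmm0
+//   vcvtdq2pd (%rax), %zmm0
+//   vcvtdq2pd (%rax){1to8}, %zmm0   // rmb: one dword broadcast to all lanes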
+// Conversion with SAE - suppress all exceptions
+multiclass avx512_vcvt_fp_sae<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
+ X86VectorVTInfo _Src, SDNode OpNodeRnd> {
+ defm rrb : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _Src.RC:$src), OpcodeStr,
+ "{sae}, $src", "$src, {sae}",
+ (_.VT (OpNodeRnd (_Src.VT _Src.RC:$src),
+ (i32 FROUND_NO_EXC)))>,
+ EVEX, EVEX_B;
+}
+
+// Conversion with rounding control (RC)
+multiclass avx512_vcvt_fp_rc<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
+ X86VectorVTInfo _Src, SDNode OpNodeRnd> {
+ defm rrb : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _Src.RC:$src, AVX512RC:$rc), OpcodeStr,
+ "$rc, $src", "$src, $rc",
+ (_.VT (OpNodeRnd (_Src.VT _Src.RC:$src), (i32 imm:$rc)))>,
+ EVEX, EVEX_B, EVEX_RC;
+}
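+
+// The AVX512RC operand accepts the four static rounding tokens; the EVEX.RC
+// values shown are assumed from the EVEX encoding scheme:
+//   {rn-sae} = 0, {rd-sae} = 1, {ru-sae} = 2, {rz-sae} = 3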
+
+// Extend Float to Double
+multiclass avx512_cvtps2pd<bits<8> opc, string OpcodeStr> {
+ let Predicates = [HasAVX512] in {
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f64_info, v8f32x_info, fextend>,
+ avx512_vcvt_fp_sae<opc, OpcodeStr, v8f64_info, v8f32x_info,
+ X86vfpextRnd>, EVEX_V512;
+ }
+ let Predicates = [HasVLX] in {
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2f64x_info, v4f32x_info,
+ X86vfpext, "{1to2}">, EVEX_V128;
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f64x_info, v4f32x_info, fextend>,
+ EVEX_V256;
+ }
+}
+
+// Truncate Double to Float
+multiclass avx512_cvtpd2ps<bits<8> opc, string OpcodeStr> {
+ let Predicates = [HasAVX512] in {
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8f64_info, fround>,
+ avx512_vcvt_fp_rc<opc, OpcodeStr, v8f32x_info, v8f64_info,
+ X86vfproundRnd>, EVEX_V512;
+ }
+ let Predicates = [HasVLX] in {
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v2f64x_info,
+ X86vfpround, "{1to2}", "{x}">, EVEX_V128;
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v4f64x_info, fround,
+ "{1to4}", "{y}">, EVEX_V256;
+ }
+}
+
+defm VCVTPD2PS : avx512_cvtpd2ps<0x5A, "vcvtpd2ps">,
+ VEX_W, PD, EVEX_CD8<64, CD8VF>;
+defm VCVTPS2PD : avx512_cvtps2pd<0x5A, "vcvtps2pd">,
+ PS, EVEX_CD8<32, CD8VH>;
+
+def : Pat<(v8f64 (extloadv8f32 addr:$src)),
+ (VCVTPS2PDZrm addr:$src)>;
+
+let Predicates = [HasVLX] in {
+ def : Pat<(v4f64 (extloadv4f32 addr:$src)),
+ (VCVTPS2PDZ256rm addr:$src)>;
+}
+
+// Convert Signed/Unsigned Doubleword to Double
+multiclass avx512_cvtdq2pd<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDNode OpNode128> {
+ // No rounding in this op
+ let Predicates = [HasAVX512] in
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f64_info, v8i32x_info, OpNode>,
+ EVEX_V512;
+
+ let Predicates = [HasVLX] in {
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2f64x_info, v4i32x_info,
+ OpNode128, "{1to2}">, EVEX_V128;
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f64x_info, v4i32x_info, OpNode>,
+ EVEX_V256;
+ }
+}
+
+// Convert Signed/Unsigned Doubleword to Float
+multiclass avx512_cvtdq2ps<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDNode OpNodeRnd> {
+ let Predicates = [HasAVX512] in
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v16f32_info, v16i32_info, OpNode>,
+ avx512_vcvt_fp_rc<opc, OpcodeStr, v16f32_info, v16i32_info,
+ OpNodeRnd>, EVEX_V512;
+
+ let Predicates = [HasVLX] in {
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v4i32x_info, OpNode>,
+ EVEX_V128;
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8i32x_info, OpNode>,
+ EVEX_V256;
+ }
+}
+
+// Convert Float to Signed/Unsigned Doubleword with truncation
+multiclass avx512_cvttps2dq<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, SDNode OpNodeRnd> {
+ let Predicates = [HasAVX512] in {
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v16i32_info, v16f32_info, OpNode>,
+ avx512_vcvt_fp_sae<opc, OpcodeStr, v16i32_info, v16f32_info,
+ OpNodeRnd>, EVEX_V512;
+ }
+ let Predicates = [HasVLX] in {
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f32x_info, OpNode>,
+ EVEX_V128;
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f32x_info, OpNode>,
+ EVEX_V256;
+ }
+}
+
+// Convert Float to Signed/Unsigned Doubleword
+multiclass avx512_cvtps2dq<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, SDNode OpNodeRnd> {
+ let Predicates = [HasAVX512] in {
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v16i32_info, v16f32_info, OpNode>,
+ avx512_vcvt_fp_rc<opc, OpcodeStr, v16i32_info, v16f32_info,
+ OpNodeRnd>, EVEX_V512;
+ }
+ let Predicates = [HasVLX] in {
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f32x_info, OpNode>,
+ EVEX_V128;
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f32x_info, OpNode>,
+ EVEX_V256;
+ }
+}
+
+// Convert Double to Signed/Unsigned Doubleword with truncation
+multiclass avx512_cvttpd2dq<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, SDNode OpNodeRnd> {
+ let Predicates = [HasAVX512] in {
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f64_info, OpNode>,
+ avx512_vcvt_fp_sae<opc, OpcodeStr, v8i32x_info, v8f64_info,
+ OpNodeRnd>, EVEX_V512;
+ }
+ let Predicates = [HasVLX] in {
+    // We need "x"/"y" suffixes in order to distinguish between the 128- and
+    // 256-bit memory forms of these instructions in the asm parser: they have
+    // the same destination type, 'v4i32x_info'. We also specify the broadcast
+    // string explicitly for the same reason.
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v2f64x_info, OpNode,
+ "{1to2}", "{x}">, EVEX_V128;
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f64x_info, OpNode,
+ "{1to4}", "{y}">, EVEX_V256;
+ }
+}
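+
+// Illustration of the "{x}"/"{y}" aliases above (AT&T syntax): both memory
+// forms write an XMM register, so the mnemonic suffix carries the source
+// width that the operands alone cannot convey:
+//   vcvttpd2dqx (%rax), %xmm0   // 128-bit memory source
+//   vcvttpd2dqy (%rax), %xmm0   // 256-bit memory source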
+
+// Convert Double to Signed/Unsigned Doubleword
+multiclass avx512_cvtpd2dq<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, SDNode OpNodeRnd> {
+ let Predicates = [HasAVX512] in {
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f64_info, OpNode>,
+ avx512_vcvt_fp_rc<opc, OpcodeStr, v8i32x_info, v8f64_info,
+ OpNodeRnd>, EVEX_V512;
+ }
+ let Predicates = [HasVLX] in {
+    // We need "x"/"y" suffixes in order to distinguish between the 128- and
+    // 256-bit memory forms of these instructions in the asm parser: they have
+    // the same destination type, 'v4i32x_info'. We also specify the broadcast
+    // string explicitly for the same reason.
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v2f64x_info, OpNode,
+ "{1to2}", "{x}">, EVEX_V128;
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f64x_info, OpNode,
+ "{1to4}", "{y}">, EVEX_V256;
+ }
+}
+
+// Convert Double to Signed/Unsigned Quadword
+multiclass avx512_cvtpd2qq<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, SDNode OpNodeRnd> {
+ let Predicates = [HasDQI] in {
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f64_info, OpNode>,
+ avx512_vcvt_fp_rc<opc, OpcodeStr, v8i64_info, v8f64_info,
+ OpNodeRnd>, EVEX_V512;
+ }
+ let Predicates = [HasDQI, HasVLX] in {
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v2f64x_info, OpNode>,
+ EVEX_V128;
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f64x_info, OpNode>,
+ EVEX_V256;
+ }
+}
+
+// Convert Double to Signed/Unsigned Quadword with truncation
+multiclass avx512_cvttpd2qq<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, SDNode OpNodeRnd> {
+ let Predicates = [HasDQI] in {
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f64_info, OpNode>,
+ avx512_vcvt_fp_sae<opc, OpcodeStr, v8i64_info, v8f64_info,
+ OpNodeRnd>, EVEX_V512;
+ }
+ let Predicates = [HasDQI, HasVLX] in {
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v2f64x_info, OpNode>,
+ EVEX_V128;
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f64x_info, OpNode>,
+ EVEX_V256;
+ }
+}
+
+// Convert Signed/Unsigned Quadword to Double
+multiclass avx512_cvtqq2pd<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, SDNode OpNodeRnd> {
+ let Predicates = [HasDQI] in {
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f64_info, v8i64_info, OpNode>,
+ avx512_vcvt_fp_rc<opc, OpcodeStr, v8f64_info, v8i64_info,
+ OpNodeRnd>, EVEX_V512;
+ }
+ let Predicates = [HasDQI, HasVLX] in {
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2f64x_info, v2i64x_info, OpNode>,
+ EVEX_V128;
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f64x_info, v4i64x_info, OpNode>,
+ EVEX_V256;
+ }
+}
+
+// Convert Float to Signed/Unsigned Quadword
+multiclass avx512_cvtps2qq<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, SDNode OpNodeRnd> {
+ let Predicates = [HasDQI] in {
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f32x_info, OpNode>,
+ avx512_vcvt_fp_rc<opc, OpcodeStr, v8i64_info, v8f32x_info,
+ OpNodeRnd>, EVEX_V512;
+ }
+ let Predicates = [HasDQI, HasVLX] in {
+    // The broadcast string is specified explicitly, since we take only two
+    // elements from the v4f32x_info source.
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v4f32x_info, OpNode,
+ "{1to2}">, EVEX_V128;
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f32x_info, OpNode>,
+ EVEX_V256;
+ }
+}
+
+// Convert Float to Signed/Unsigned Quadword with truncation
+multiclass avx512_cvttps2qq<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, SDNode OpNodeRnd> {
+ let Predicates = [HasDQI] in {
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f32x_info, OpNode>,
+ avx512_vcvt_fp_sae<opc, OpcodeStr, v8i64_info, v8f32x_info,
+ OpNodeRnd>, EVEX_V512;
+ }
+ let Predicates = [HasDQI, HasVLX] in {
+    // The broadcast string is specified explicitly, since we take only two
+    // elements from the v4f32x_info source.
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v4f32x_info, OpNode,
+ "{1to2}">, EVEX_V128;
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f32x_info, OpNode>,
+ EVEX_V256;
+ }
+}
+
+// Convert Signed/Unsigned Quadword to Float
+multiclass avx512_cvtqq2ps<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, SDNode OpNodeRnd> {
+ let Predicates = [HasDQI] in {
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8i64_info, OpNode>,
+ avx512_vcvt_fp_rc<opc, OpcodeStr, v8f32x_info, v8i64_info,
+ OpNodeRnd>, EVEX_V512;
+ }
+ let Predicates = [HasDQI, HasVLX] in {
+    // We need "x"/"y" suffixes in order to distinguish between the 128- and
+    // 256-bit memory forms of these instructions in the asm parser: they have
+    // the same destination type, 'v4f32x_info'. We also specify the broadcast
+    // string explicitly for the same reason.
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v2i64x_info, OpNode,
+ "{1to2}", "{x}">, EVEX_V128;
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v4i64x_info, OpNode,
+ "{1to4}", "{y}">, EVEX_V256;
+ }
+}
+
+defm VCVTDQ2PD : avx512_cvtdq2pd<0xE6, "vcvtdq2pd", sint_to_fp, X86cvtdq2pd>, XS,
+ EVEX_CD8<32, CD8VH>;
+
+defm VCVTDQ2PS : avx512_cvtdq2ps<0x5B, "vcvtdq2ps", sint_to_fp,
+ X86VSintToFpRnd>,
+ PS, EVEX_CD8<32, CD8VF>;
+
+defm VCVTTPS2DQ : avx512_cvttps2dq<0x5B, "vcvttps2dq", fp_to_sint,
+ X86VFpToSintRnd>,
+ XS, EVEX_CD8<32, CD8VF>;
+
+defm VCVTTPD2DQ : avx512_cvttpd2dq<0xE6, "vcvttpd2dq", fp_to_sint,
+ X86VFpToSintRnd>,
+ PD, VEX_W, EVEX_CD8<64, CD8VF>;
+
+defm VCVTTPS2UDQ : avx512_cvttps2dq<0x78, "vcvttps2udq", fp_to_uint,
+ X86VFpToUintRnd>, PS,
+ EVEX_CD8<32, CD8VF>;
+
+defm VCVTTPD2UDQ : avx512_cvttpd2dq<0x78, "vcvttpd2udq", fp_to_uint,
+ X86VFpToUintRnd>, PS, VEX_W,
+ EVEX_CD8<64, CD8VF>;
+
+defm VCVTUDQ2PD : avx512_cvtdq2pd<0x7A, "vcvtudq2pd", uint_to_fp, X86cvtudq2pd>,
+ XS, EVEX_CD8<32, CD8VH>;
+
+defm VCVTUDQ2PS : avx512_cvtdq2ps<0x7A, "vcvtudq2ps", uint_to_fp,
+ X86VUintToFpRnd>, XD,
+ EVEX_CD8<32, CD8VF>;
+
+defm VCVTPS2DQ : avx512_cvtps2dq<0x5B, "vcvtps2dq", X86cvtps2Int,
+ X86cvtps2IntRnd>, PD, EVEX_CD8<32, CD8VF>;
+
+defm VCVTPD2DQ : avx512_cvtpd2dq<0xE6, "vcvtpd2dq", X86cvtpd2Int,
+ X86cvtpd2IntRnd>, XD, VEX_W,
+ EVEX_CD8<64, CD8VF>;
+
+defm VCVTPS2UDQ : avx512_cvtps2dq<0x79, "vcvtps2udq", X86cvtps2UInt,
+ X86cvtps2UIntRnd>,
+ PS, EVEX_CD8<32, CD8VF>;
+defm VCVTPD2UDQ : avx512_cvtpd2dq<0x79, "vcvtpd2udq", X86cvtpd2UInt,
+ X86cvtpd2UIntRnd>, VEX_W,
+ PS, EVEX_CD8<64, CD8VF>;
+
+defm VCVTPD2QQ : avx512_cvtpd2qq<0x7B, "vcvtpd2qq", X86cvtpd2Int,
+ X86cvtpd2IntRnd>, VEX_W,
+ PD, EVEX_CD8<64, CD8VF>;
+
+defm VCVTPS2QQ : avx512_cvtps2qq<0x7B, "vcvtps2qq", X86cvtps2Int,
+ X86cvtps2IntRnd>, PD, EVEX_CD8<32, CD8VH>;
+
+defm VCVTPD2UQQ : avx512_cvtpd2qq<0x79, "vcvtpd2uqq", X86cvtpd2UInt,
+ X86cvtpd2UIntRnd>, VEX_W,
+ PD, EVEX_CD8<64, CD8VF>;
+
+defm VCVTPS2UQQ : avx512_cvtps2qq<0x79, "vcvtps2uqq", X86cvtps2UInt,
+ X86cvtps2UIntRnd>, PD, EVEX_CD8<32, CD8VH>;
+
+defm VCVTTPD2QQ : avx512_cvttpd2qq<0x7A, "vcvttpd2qq", fp_to_sint,
+ X86VFpToSlongRnd>, VEX_W,
+ PD, EVEX_CD8<64, CD8VF>;
+
+defm VCVTTPS2QQ : avx512_cvttps2qq<0x7A, "vcvttps2qq", fp_to_sint,
+ X86VFpToSlongRnd>, PD, EVEX_CD8<32, CD8VH>;
+
+defm VCVTTPD2UQQ : avx512_cvttpd2qq<0x78, "vcvttpd2uqq", fp_to_uint,
+ X86VFpToUlongRnd>, VEX_W,
+ PD, EVEX_CD8<64, CD8VF>;
+
+defm VCVTTPS2UQQ : avx512_cvttps2qq<0x78, "vcvttps2uqq", fp_to_uint,
+ X86VFpToUlongRnd>, PD, EVEX_CD8<32, CD8VH>;
+
+defm VCVTQQ2PD : avx512_cvtqq2pd<0xE6, "vcvtqq2pd", sint_to_fp,
+ X86VSlongToFpRnd>, VEX_W, XS, EVEX_CD8<64, CD8VF>;
+
+defm VCVTUQQ2PD : avx512_cvtqq2pd<0x7A, "vcvtuqq2pd", uint_to_fp,
+ X86VUlongToFpRnd>, VEX_W, XS, EVEX_CD8<64, CD8VF>;
+
+defm VCVTQQ2PS : avx512_cvtqq2ps<0x5B, "vcvtqq2ps", sint_to_fp,
+ X86VSlongToFpRnd>, VEX_W, PS, EVEX_CD8<64, CD8VF>;
+
+defm VCVTUQQ2PS : avx512_cvtqq2ps<0x7A, "vcvtuqq2ps", uint_to_fp,
+ X86VUlongToFpRnd>, VEX_W, XD, EVEX_CD8<64, CD8VF>;
+
+let Predicates = [HasAVX512, NoVLX] in {
+def : Pat<(v8i32 (fp_to_uint (v8f32 VR256X:$src1))),
+ (EXTRACT_SUBREG (v16i32 (VCVTTPS2UDQZrr
+ (v16f32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)))), sub_ymm)>;
+
+def : Pat<(v4i32 (fp_to_uint (v4f32 VR128X:$src1))),
+ (EXTRACT_SUBREG (v16i32 (VCVTTPS2UDQZrr
+ (v16f32 (SUBREG_TO_REG (i32 0), VR128X:$src1, sub_xmm)))), sub_xmm)>;
+
+def : Pat<(v8f32 (uint_to_fp (v8i32 VR256X:$src1))),
+ (EXTRACT_SUBREG (v16f32 (VCVTUDQ2PSZrr
+ (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)))), sub_ymm)>;
+
+def : Pat<(v4f32 (uint_to_fp (v4i32 VR128X:$src1))),
+ (EXTRACT_SUBREG (v16f32 (VCVTUDQ2PSZrr
+ (v16i32 (SUBREG_TO_REG (i32 0), VR128X:$src1, sub_xmm)))), sub_xmm)>;
+
+def : Pat<(v4f64 (uint_to_fp (v4i32 VR128X:$src1))),
+ (EXTRACT_SUBREG (v8f64 (VCVTUDQ2PDZrr
+ (v8i32 (SUBREG_TO_REG (i32 0), VR128X:$src1, sub_xmm)))), sub_ymm)>;
+}
+
+let Predicates = [HasAVX512] in {
+ def : Pat<(v8f32 (fround (loadv8f64 addr:$src))),
+ (VCVTPD2PSZrm addr:$src)>;
+ def : Pat<(v8f64 (extloadv8f32 addr:$src)),
+ (VCVTPS2PDZrm addr:$src)>;
+}
+
+//===----------------------------------------------------------------------===//
+// Half precision conversion instructions
+//===----------------------------------------------------------------------===//
+multiclass avx512_cvtph2ps<X86VectorVTInfo _dest, X86VectorVTInfo _src,
+ X86MemOperand x86memop, PatFrag ld_frag> {
+ defm rr : AVX512_maskable<0x13, MRMSrcReg, _dest ,(outs _dest.RC:$dst), (ins _src.RC:$src),
+ "vcvtph2ps", "$src", "$src",
+ (X86cvtph2ps (_src.VT _src.RC:$src),
+ (i32 FROUND_CURRENT))>, T8PD;
+ let hasSideEffects = 0, mayLoad = 1 in {
+ defm rm : AVX512_maskable<0x13, MRMSrcMem, _dest, (outs _dest.RC:$dst), (ins x86memop:$src),
+ "vcvtph2ps", "$src", "$src",
+ (X86cvtph2ps (_src.VT (bitconvert (ld_frag addr:$src))),
+ (i32 FROUND_CURRENT))>, T8PD;
+ }
+}
+
+multiclass avx512_cvtph2ps_sae<X86VectorVTInfo _dest, X86VectorVTInfo _src> {
+ defm rb : AVX512_maskable<0x13, MRMSrcReg, _dest ,(outs _dest.RC:$dst), (ins _src.RC:$src),
+ "vcvtph2ps", "{sae}, $src", "$src, {sae}",
+ (X86cvtph2ps (_src.VT _src.RC:$src),
+ (i32 FROUND_NO_EXC))>, T8PD, EVEX_B;
+
+}
+
+let Predicates = [HasAVX512] in {
+ defm VCVTPH2PSZ : avx512_cvtph2ps<v16f32_info, v16i16x_info, f256mem, loadv4i64>,
+ avx512_cvtph2ps_sae<v16f32_info, v16i16x_info>,
+ EVEX, EVEX_V512, EVEX_CD8<32, CD8VH>;
+ let Predicates = [HasVLX] in {
+ defm VCVTPH2PSZ256 : avx512_cvtph2ps<v8f32x_info, v8i16x_info, f128mem,
+ loadv2i64>,EVEX, EVEX_V256, EVEX_CD8<32, CD8VH>;
+ defm VCVTPH2PSZ128 : avx512_cvtph2ps<v4f32x_info, v8i16x_info, f64mem,
+ loadv2i64>, EVEX, EVEX_V128, EVEX_CD8<32, CD8VH>;
+ }
+}
+
+multiclass avx512_cvtps2ph<X86VectorVTInfo _dest, X86VectorVTInfo _src,
+ X86MemOperand x86memop> {
+ defm rr : AVX512_maskable<0x1D, MRMDestReg, _dest ,(outs _dest.RC:$dst),
+ (ins _src.RC:$src1, i32u8imm:$src2),
+ "vcvtps2ph", "$src2, $src1", "$src1, $src2",
+ (X86cvtps2ph (_src.VT _src.RC:$src1),
+ (i32 imm:$src2),
+ (i32 FROUND_CURRENT))>, AVX512AIi8Base;
+ let hasSideEffects = 0, mayStore = 1 in {
+ def mr : AVX512AIi8<0x1D, MRMDestMem, (outs),
+ (ins x86memop:$dst, _src.RC:$src1, i32u8imm:$src2),
+ "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(store (_dest.VT (X86cvtps2ph (_src.VT _src.RC:$src1),
+ (i32 imm:$src2), (i32 FROUND_CURRENT) )),
+ addr:$dst)]>;
+ def mrk : AVX512AIi8<0x1D, MRMDestMem, (outs),
+ (ins x86memop:$dst, _dest.KRCWM:$mask, _src.RC:$src1, i32u8imm:$src2),
+ "vcvtps2ph\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
+ []>, EVEX_K;
+ }
+}
+multiclass avx512_cvtps2ph_sae<X86VectorVTInfo _dest, X86VectorVTInfo _src> {
+ defm rb : AVX512_maskable<0x1D, MRMDestReg, _dest ,(outs _dest.RC:$dst),
+ (ins _src.RC:$src1, i32u8imm:$src2),
+ "vcvtps2ph", "$src2, {sae}, $src1", "$src1, $src2, {sae}",
+ (X86cvtps2ph (_src.VT _src.RC:$src1),
+ (i32 imm:$src2),
+ (i32 FROUND_NO_EXC))>, EVEX_B, AVX512AIi8Base;
+}
+let Predicates = [HasAVX512] in {
+ defm VCVTPS2PHZ : avx512_cvtps2ph<v16i16x_info, v16f32_info, f256mem>,
+ avx512_cvtps2ph_sae<v16i16x_info, v16f32_info>,
+ EVEX, EVEX_V512, EVEX_CD8<32, CD8VH>;
+ let Predicates = [HasVLX] in {
+ defm VCVTPS2PHZ256 : avx512_cvtps2ph<v8i16x_info, v8f32x_info, f128mem>,
+ EVEX, EVEX_V256, EVEX_CD8<32, CD8VH>;
+ defm VCVTPS2PHZ128 : avx512_cvtps2ph<v8i16x_info, v4f32x_info, f128mem>,
+ EVEX, EVEX_V128, EVEX_CD8<32, CD8VH>;
+ }
+}
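+
+// Illustrative use (AT&T syntax): $src2 is the imm8 rounding control; bit 2
+// (value 4) is assumed to select the current MXCSR rounding mode:
+//   vcvtps2ph $4, %zmm0, %ymm1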
+
+// Unordered/Ordered scalar fp compare with SAE, setting EFLAGS
+multiclass avx512_ord_cmp_sae<bits<8> opc, X86VectorVTInfo _, SDNode OpNode,
+ string OpcodeStr> {
+ def rb: AVX512<opc, MRMSrcReg, (outs), (ins _.RC:$src1, _.RC:$src2),
+ !strconcat(OpcodeStr, "\t{{sae}, $src2, $src1|$src1, $src2, {sae}}"),
+ [(set EFLAGS, (OpNode (_.VT _.RC:$src1), _.RC:$src2,
+ (i32 FROUND_NO_EXC)))],
+ IIC_SSE_COMIS_RR>, EVEX, EVEX_B, VEX_LIG, EVEX_V128,
+ Sched<[WriteFAdd]>;
+}
+
+let Defs = [EFLAGS], Predicates = [HasAVX512] in {
+ defm VUCOMISSZ : avx512_ord_cmp_sae<0x2E, v4f32x_info, X86ucomiSae, "vucomiss">,
+ AVX512PSIi8Base, EVEX_CD8<32, CD8VT1>;
+ defm VUCOMISDZ : avx512_ord_cmp_sae<0x2E, v2f64x_info, X86ucomiSae, "vucomisd">,
+ AVX512PDIi8Base, VEX_W, EVEX_CD8<64, CD8VT1>;
+ defm VCOMISSZ : avx512_ord_cmp_sae<0x2F, v4f32x_info, X86comiSae, "vcomiss">,
+ AVX512PSIi8Base, EVEX_CD8<32, CD8VT1>;
+ defm VCOMISDZ : avx512_ord_cmp_sae<0x2F, v2f64x_info, X86comiSae, "vcomisd">,
+ AVX512PDIi8Base, VEX_W, EVEX_CD8<64, CD8VT1>;
+}
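+
+// Example (illustrative) of the SAE compare forms defined above:
+//   vucomiss {sae}, %xmm1, %xmm0   // compare without raising exceptions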
+
+let Defs = [EFLAGS], Predicates = [HasAVX512] in {
+ defm VUCOMISSZ : sse12_ord_cmp<0x2E, FR32X, X86cmp, f32, f32mem, loadf32,
+ "ucomiss">, PS, EVEX, VEX_LIG,
+ EVEX_CD8<32, CD8VT1>;
+ defm VUCOMISDZ : sse12_ord_cmp<0x2E, FR64X, X86cmp, f64, f64mem, loadf64,
+ "ucomisd">, PD, EVEX,
+ VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
+ let Pattern = []<dag> in {
+ defm VCOMISSZ : sse12_ord_cmp<0x2F, FR32X, undef, f32, f32mem, loadf32,
+ "comiss">, PS, EVEX, VEX_LIG,
+ EVEX_CD8<32, CD8VT1>;
+ defm VCOMISDZ : sse12_ord_cmp<0x2F, FR64X, undef, f64, f64mem, loadf64,
+ "comisd">, PD, EVEX,
+ VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
+ }
+ let isCodeGenOnly = 1 in {
+ defm Int_VUCOMISSZ : sse12_ord_cmp<0x2E, VR128X, X86ucomi, v4f32, f128mem,
+ load, "ucomiss">, PS, EVEX, VEX_LIG,
+ EVEX_CD8<32, CD8VT1>;
+ defm Int_VUCOMISDZ : sse12_ord_cmp<0x2E, VR128X, X86ucomi, v2f64, f128mem,
+ load, "ucomisd">, PD, EVEX,
+ VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
+
+ defm Int_VCOMISSZ : sse12_ord_cmp<0x2F, VR128X, X86comi, v4f32, f128mem,
+ load, "comiss">, PS, EVEX, VEX_LIG,
+ EVEX_CD8<32, CD8VT1>;
+ defm Int_VCOMISDZ : sse12_ord_cmp<0x2F, VR128X, X86comi, v2f64, f128mem,
+ load, "comisd">, PD, EVEX,
+ VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
+ }
+}
+
+/// avx512_fp14_s rcp14ss, rcp14sd, rsqrt14ss, rsqrt14sd
+multiclass avx512_fp14_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _> {
+ let hasSideEffects = 0, AddedComplexity = 20 , Predicates = [HasAVX512] in {
+ defm rr : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))>, EVEX_4V;
+ let mayLoad = 1 in {
+ defm rm : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (OpNode (_.VT _.RC:$src1),
+ (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))))>, EVEX_4V;
+ }
+}
+}
+
+defm VRCP14SS : avx512_fp14_s<0x4D, "vrcp14ss", X86frcp14s, f32x_info>,
+ EVEX_CD8<32, CD8VT1>, T8PD;
+defm VRCP14SD : avx512_fp14_s<0x4D, "vrcp14sd", X86frcp14s, f64x_info>,
+ VEX_W, EVEX_CD8<64, CD8VT1>, T8PD;
+defm VRSQRT14SS : avx512_fp14_s<0x4F, "vrsqrt14ss", X86frsqrt14s, f32x_info>,
+ EVEX_CD8<32, CD8VT1>, T8PD;
+defm VRSQRT14SD : avx512_fp14_s<0x4F, "vrsqrt14sd", X86frsqrt14s, f64x_info>,
+ VEX_W, EVEX_CD8<64, CD8VT1>, T8PD;
+
+/// avx512_fp14_p rcp14ps, rcp14pd, rsqrt14ps, rsqrt14pd
+multiclass avx512_fp14_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _> {
+ defm r: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src), OpcodeStr, "$src", "$src",
+ (_.FloatVT (OpNode _.RC:$src))>, EVEX, T8PD;
+ let mayLoad = 1 in {
+ defm m: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.MemOp:$src), OpcodeStr, "$src", "$src",
+ (OpNode (_.FloatVT
+ (bitconvert (_.LdFrag addr:$src))))>, EVEX, T8PD;
+ defm mb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.ScalarMemOp:$src), OpcodeStr,
+ "${src}"##_.BroadcastStr, "${src}"##_.BroadcastStr,
+ (OpNode (_.FloatVT
+ (X86VBroadcast (_.ScalarLdFrag addr:$src))))>,
+ EVEX, T8PD, EVEX_B;
+ }
+}
+
+multiclass avx512_fp14_p_vl_all<bits<8> opc, string OpcodeStr, SDNode OpNode> {
+ defm PSZ : avx512_fp14_p<opc, !strconcat(OpcodeStr, "ps"), OpNode, v16f32_info>,
+ EVEX_V512, EVEX_CD8<32, CD8VF>;
+ defm PDZ : avx512_fp14_p<opc, !strconcat(OpcodeStr, "pd"), OpNode, v8f64_info>,
+ EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+
+ // Define only if AVX512VL feature is present.
+ let Predicates = [HasVLX] in {
+ defm PSZ128 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "ps"),
+ OpNode, v4f32x_info>,
+ EVEX_V128, EVEX_CD8<32, CD8VF>;
+ defm PSZ256 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "ps"),
+ OpNode, v8f32x_info>,
+ EVEX_V256, EVEX_CD8<32, CD8VF>;
+ defm PDZ128 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "pd"),
+ OpNode, v2f64x_info>,
+ EVEX_V128, VEX_W, EVEX_CD8<64, CD8VF>;
+ defm PDZ256 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "pd"),
+ OpNode, v4f64x_info>,
+ EVEX_V256, VEX_W, EVEX_CD8<64, CD8VF>;
+ }
+}
+
+defm VRSQRT14 : avx512_fp14_p_vl_all<0x4E, "vrsqrt14", X86frsqrt>;
+defm VRCP14 : avx512_fp14_p_vl_all<0x4C, "vrcp14", X86frcp>;
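+
+// Context note: the *14 approximation instructions are documented to have a
+// maximum relative error of 2^-14. An illustrative masked broadcast form:
+//   vrcp14ps (%rax){1to16}, %zmm0 {%k1} {z}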
+
+/// avx512_fp28_s rcp28ss, rcp28sd, rsqrt28ss, rsqrt28sd
+multiclass avx512_fp28_s<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
+ SDNode OpNode> {
+
+ defm r : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
+ (i32 FROUND_CURRENT))>;
+
+ defm rb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2), OpcodeStr,
+ "{sae}, $src2, $src1", "$src1, $src2, {sae}",
+ (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
+ (i32 FROUND_NO_EXC))>, EVEX_B;
+
+ defm m : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (OpNode (_.VT _.RC:$src1),
+ (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))),
+ (i32 FROUND_CURRENT))>;
+}
+
+multiclass avx512_eri_s<bits<8> opc, string OpcodeStr, SDNode OpNode> {
+ defm SS : avx512_fp28_s<opc, OpcodeStr#"ss", f32x_info, OpNode>,
+ EVEX_CD8<32, CD8VT1>;
+ defm SD : avx512_fp28_s<opc, OpcodeStr#"sd", f64x_info, OpNode>,
+ EVEX_CD8<64, CD8VT1>, VEX_W;
+}
+
+let hasSideEffects = 0, Predicates = [HasERI] in {
+ defm VRCP28 : avx512_eri_s<0xCB, "vrcp28", X86rcp28s>, T8PD, EVEX_4V;
+ defm VRSQRT28 : avx512_eri_s<0xCD, "vrsqrt28", X86rsqrt28s>, T8PD, EVEX_4V;
+}
+
+defm VGETEXP : avx512_eri_s<0x43, "vgetexp", X86fgetexpRnds>, T8PD, EVEX_4V;
+/// avx512_fp28_p rcp28ps, rcp28pd, rsqrt28ps, rsqrt28pd
+
+multiclass avx512_fp28_p<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
+ SDNode OpNode> {
+
+ defm r : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src), OpcodeStr, "$src", "$src",
+ (OpNode (_.VT _.RC:$src), (i32 FROUND_CURRENT))>;
+
+ defm m : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.MemOp:$src), OpcodeStr, "$src", "$src",
+ (OpNode (_.FloatVT
+ (bitconvert (_.LdFrag addr:$src))),
+ (i32 FROUND_CURRENT))>;
+
+ defm mb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.MemOp:$src), OpcodeStr,
+ "${src}"##_.BroadcastStr, "${src}"##_.BroadcastStr,
+ (OpNode (_.FloatVT
+ (X86VBroadcast (_.ScalarLdFrag addr:$src))),
+ (i32 FROUND_CURRENT))>, EVEX_B;
+}
+multiclass avx512_fp28_p_round<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
+ SDNode OpNode> {
+ defm rb : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src), OpcodeStr,
+ "{sae}, $src", "$src, {sae}",
+ (OpNode (_.VT _.RC:$src), (i32 FROUND_NO_EXC))>, EVEX_B;
+}
+
+multiclass avx512_eri<bits<8> opc, string OpcodeStr, SDNode OpNode> {
+ defm PS : avx512_fp28_p<opc, OpcodeStr#"ps", v16f32_info, OpNode>,
+ avx512_fp28_p_round<opc, OpcodeStr#"ps", v16f32_info, OpNode>,
+ T8PD, EVEX_V512, EVEX_CD8<32, CD8VF>;
+ defm PD : avx512_fp28_p<opc, OpcodeStr#"pd", v8f64_info, OpNode>,
+ avx512_fp28_p_round<opc, OpcodeStr#"pd", v8f64_info, OpNode>,
+ T8PD, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+}
+
+multiclass avx512_fp_unaryop_packed<bits<8> opc, string OpcodeStr,
+ SDNode OpNode> {
+ // Define only if AVX512VL feature is present.
+ let Predicates = [HasVLX] in {
+ defm PSZ128 : avx512_fp28_p<opc, OpcodeStr#"ps", v4f32x_info, OpNode>,
+ EVEX_V128, T8PD, EVEX_CD8<32, CD8VF>;
+ defm PSZ256 : avx512_fp28_p<opc, OpcodeStr#"ps", v8f32x_info, OpNode>,
+ EVEX_V256, T8PD, EVEX_CD8<32, CD8VF>;
+ defm PDZ128 : avx512_fp28_p<opc, OpcodeStr#"pd", v2f64x_info, OpNode>,
+ EVEX_V128, VEX_W, T8PD, EVEX_CD8<64, CD8VF>;
+ defm PDZ256 : avx512_fp28_p<opc, OpcodeStr#"pd", v4f64x_info, OpNode>,
+ EVEX_V256, VEX_W, T8PD, EVEX_CD8<64, CD8VF>;
+ }
+}
+let Predicates = [HasERI], hasSideEffects = 0 in {
+
+ defm VRSQRT28 : avx512_eri<0xCC, "vrsqrt28", X86rsqrt28>, EVEX;
+ defm VRCP28 : avx512_eri<0xCA, "vrcp28", X86rcp28>, EVEX;
+ defm VEXP2 : avx512_eri<0xC8, "vexp2", X86exp2>, EVEX;
+}
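+
+// Context note: the *28 variants (AVX-512ER, hence the HasERI predicate) are
+// documented to tighten the approximation to a 2^-28 maximum relative error.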
+defm VGETEXP : avx512_eri<0x42, "vgetexp", X86fgetexpRnd>,
+ avx512_fp_unaryop_packed<0x42, "vgetexp", X86fgetexpRnd> , EVEX;
+
+multiclass avx512_sqrt_packed_round<bits<8> opc, string OpcodeStr,
+ SDNode OpNodeRnd, X86VectorVTInfo _>{
+ defm rb: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src, AVX512RC:$rc), OpcodeStr, "$rc, $src", "$src, $rc",
+ (_.VT (OpNodeRnd _.RC:$src, (i32 imm:$rc)))>,
+ EVEX, EVEX_B, EVEX_RC;
+}
+
+multiclass avx512_sqrt_packed<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, X86VectorVTInfo _>{
+ defm r: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src), OpcodeStr, "$src", "$src",
+ (_.FloatVT (OpNode _.RC:$src))>, EVEX;
+ let mayLoad = 1 in {
+ defm m: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.MemOp:$src), OpcodeStr, "$src", "$src",
+ (OpNode (_.FloatVT
+ (bitconvert (_.LdFrag addr:$src))))>, EVEX;
+
+ defm mb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.ScalarMemOp:$src), OpcodeStr,
+ "${src}"##_.BroadcastStr, "${src}"##_.BroadcastStr,
+ (OpNode (_.FloatVT
+ (X86VBroadcast (_.ScalarLdFrag addr:$src))))>,
+ EVEX, EVEX_B;
+ }
+}
+
+multiclass avx512_sqrt_packed_all<bits<8> opc, string OpcodeStr,
+ SDNode OpNode> {
+ defm PSZ : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
+ v16f32_info>,
+ EVEX_V512, PS, EVEX_CD8<32, CD8VF>;
+ defm PDZ : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
+ v8f64_info>,
+ EVEX_V512, VEX_W, PD, EVEX_CD8<64, CD8VF>;
+ // Define only if AVX512VL feature is present.
+ let Predicates = [HasVLX] in {
+ defm PSZ128 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "ps"),
+ OpNode, v4f32x_info>,
+ EVEX_V128, PS, EVEX_CD8<32, CD8VF>;
+ defm PSZ256 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "ps"),
+ OpNode, v8f32x_info>,
+ EVEX_V256, PS, EVEX_CD8<32, CD8VF>;
+ defm PDZ128 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "pd"),
+ OpNode, v2f64x_info>,
+ EVEX_V128, VEX_W, PD, EVEX_CD8<64, CD8VF>;
+ defm PDZ256 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "pd"),
+ OpNode, v4f64x_info>,
+ EVEX_V256, VEX_W, PD, EVEX_CD8<64, CD8VF>;
+ }
+}
+
+multiclass avx512_sqrt_packed_all_round<bits<8> opc, string OpcodeStr,
+ SDNode OpNodeRnd> {
+ defm PSZ : avx512_sqrt_packed_round<opc, !strconcat(OpcodeStr, "ps"), OpNodeRnd,
+ v16f32_info>, EVEX_V512, PS, EVEX_CD8<32, CD8VF>;
+ defm PDZ : avx512_sqrt_packed_round<opc, !strconcat(OpcodeStr, "pd"), OpNodeRnd,
+ v8f64_info>, EVEX_V512, VEX_W, PD, EVEX_CD8<64, CD8VF>;
+}
+
+multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
+ string SUFF, SDNode OpNode, SDNode OpNodeRnd> {
+
+ defm r_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (OpNodeRnd (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ (i32 FROUND_CURRENT))>;
+ let mayLoad = 1 in
+ defm m_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (OpNodeRnd (_.VT _.RC:$src1),
+ (_.VT (scalar_to_vector
+ (_.ScalarLdFrag addr:$src2))),
+ (i32 FROUND_CURRENT))>;
+
+ defm rb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr,
+ "$rc, $src2, $src1", "$src1, $src2, $rc",
+ (OpNodeRnd (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ (i32 imm:$rc))>,
+ EVEX_B, EVEX_RC;
+
+ let isCodeGenOnly = 1 in {
+ def r : I<opc, MRMSrcReg, (outs _.FRC:$dst),
+ (ins _.FRC:$src1, _.FRC:$src2),
+ OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>;
+
+ let mayLoad = 1 in
+ def m : I<opc, MRMSrcMem, (outs _.FRC:$dst),
+ (ins _.FRC:$src1, _.ScalarMemOp:$src2),
+ OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>;
+ }
+
+ def : Pat<(_.EltVT (OpNode _.FRC:$src)),
+ (!cast<Instruction>(NAME#SUFF#Zr)
+ (_.EltVT (IMPLICIT_DEF)), _.FRC:$src)>;
+
+ def : Pat<(_.EltVT (OpNode (load addr:$src))),
+ (!cast<Instruction>(NAME#SUFF#Zm)
+ (_.EltVT (IMPLICIT_DEF)), addr:$src)>, Requires<[OptForSize]>;
+}
+
+multiclass avx512_sqrt_scalar_all<bits<8> opc, string OpcodeStr> {
+ defm SSZ : avx512_sqrt_scalar<opc, OpcodeStr#"ss", f32x_info, "SS", fsqrt,
+ X86fsqrtRnds>, EVEX_CD8<32, CD8VT1>, EVEX_4V, XS;
+ defm SDZ : avx512_sqrt_scalar<opc, OpcodeStr#"sd", f64x_info, "SD", fsqrt,
+ X86fsqrtRnds>, EVEX_CD8<64, CD8VT1>, EVEX_4V, XD, VEX_W;
+}
+
+defm VSQRT : avx512_sqrt_packed_all<0x51, "vsqrt", fsqrt>,
+ avx512_sqrt_packed_all_round<0x51, "vsqrt", X86fsqrtRnd>;
+
+defm VSQRT : avx512_sqrt_scalar_all<0x51, "vsqrt">, VEX_LIG;
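+
+// Illustrative forms (AT&T syntax) from the packed and scalar definitions:
+//   vsqrtps {rd-sae}, %zmm1, %zmm0   // static round-down via EVEX.RC
+//   vsqrtss %xmm2, %xmm1, %xmm0      // scalar; upper elements come from $src1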
+
+let Predicates = [HasAVX512] in {
+ def : Pat<(f32 (X86frsqrt FR32X:$src)),
+ (COPY_TO_REGCLASS (VRSQRT14SSrr (v4f32 (IMPLICIT_DEF)), (COPY_TO_REGCLASS FR32X:$src, VR128X)), VR128X)>;
+ def : Pat<(f32 (X86frsqrt (load addr:$src))),
+ (COPY_TO_REGCLASS (VRSQRT14SSrm (v4f32 (IMPLICIT_DEF)), addr:$src), VR128X)>,
+ Requires<[OptForSize]>;
+ def : Pat<(f32 (X86frcp FR32X:$src)),
+    (COPY_TO_REGCLASS (VRCP14SSrr (v4f32 (IMPLICIT_DEF)), (COPY_TO_REGCLASS FR32X:$src, VR128X)), VR128X)>;
+ def : Pat<(f32 (X86frcp (load addr:$src))),
+ (COPY_TO_REGCLASS (VRCP14SSrm (v4f32 (IMPLICIT_DEF)), addr:$src), VR128X)>,
+ Requires<[OptForSize]>;
+}
+
+multiclass
+avx512_rndscale_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> {
+
+ let ExeDomain = _.ExeDomain in {
+ defm r : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), OpcodeStr,
+ "$src3, $src2, $src1", "$src1, $src2, $src3",
+ (_.VT (X86RndScales (_.VT _.RC:$src1), (_.VT _.RC:$src2),
+ (i32 imm:$src3), (i32 FROUND_CURRENT)))>;
+
+ defm rb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), OpcodeStr,
+ "$src3, {sae}, $src2, $src1", "$src1, $src2, {sae}, $src3",
+ (_.VT (X86RndScales (_.VT _.RC:$src1), (_.VT _.RC:$src2),
+ (i32 imm:$src3), (i32 FROUND_NO_EXC)))>, EVEX_B;
+
+ let mayLoad = 1 in
+ defm m : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.MemOp:$src2, i32u8imm:$src3), OpcodeStr,
+ "$src3, $src2, $src1", "$src1, $src2, $src3",
+ (_.VT (X86RndScales (_.VT _.RC:$src1),
+ (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))),
+ (i32 imm:$src3), (i32 FROUND_CURRENT)))>;
+ }
+ let Predicates = [HasAVX512] in {
+ def : Pat<(ffloor _.FRC:$src), (COPY_TO_REGCLASS
+ (_.VT (!cast<Instruction>(NAME##r) (_.VT (IMPLICIT_DEF)),
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0x1))), _.FRC)>;
+ def : Pat<(fceil _.FRC:$src), (COPY_TO_REGCLASS
+ (_.VT (!cast<Instruction>(NAME##r) (_.VT (IMPLICIT_DEF)),
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0x2))), _.FRC)>;
+ def : Pat<(ftrunc _.FRC:$src), (COPY_TO_REGCLASS
+ (_.VT (!cast<Instruction>(NAME##r) (_.VT (IMPLICIT_DEF)),
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0x3))), _.FRC)>;
+ def : Pat<(frint _.FRC:$src), (COPY_TO_REGCLASS
+ (_.VT (!cast<Instruction>(NAME##r) (_.VT (IMPLICIT_DEF)),
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0x4))), _.FRC)>;
+ def : Pat<(fnearbyint _.FRC:$src), (COPY_TO_REGCLASS
+ (_.VT (!cast<Instruction>(NAME##r) (_.VT (IMPLICIT_DEF)),
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0xc))), _.FRC)>;
+
+ def : Pat<(ffloor (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS
+ (_.VT (!cast<Instruction>(NAME##m) (_.VT (IMPLICIT_DEF)),
+ addr:$src, (i32 0x1))), _.FRC)>;
+ def : Pat<(fceil (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS
+ (_.VT (!cast<Instruction>(NAME##m) (_.VT (IMPLICIT_DEF)),
+ addr:$src, (i32 0x2))), _.FRC)>;
+ def : Pat<(ftrunc (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS
+ (_.VT (!cast<Instruction>(NAME##m) (_.VT (IMPLICIT_DEF)),
+ addr:$src, (i32 0x3))), _.FRC)>;
+ def : Pat<(frint (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS
+ (_.VT (!cast<Instruction>(NAME##m) (_.VT (IMPLICIT_DEF)),
+ addr:$src, (i32 0x4))), _.FRC)>;
+ def : Pat<(fnearbyint (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS
+ (_.VT (!cast<Instruction>(NAME##m) (_.VT (IMPLICIT_DEF)),
+ addr:$src, (i32 0xc))), _.FRC)>;
+ }
+}
+
+defm VRNDSCALESS : avx512_rndscale_scalar<0x0A, "vrndscaless", f32x_info>,
+ AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VT1>;
+
+defm VRNDSCALESD : avx512_rndscale_scalar<0x0B, "vrndscalesd", f64x_info>, VEX_W,
+ AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VT1>;
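+
+// Context for the immediates used in the patterns above (semantics assumed
+// from the RNDSCALE imm8 encoding): bits 1:0 select the rounding mode, bit 2
+// defers to MXCSR, and bit 3 suppresses the precision exception. Hence
+// 0x1 = floor, 0x2 = ceil, 0x3 = trunc, 0x4 = rint (MXCSR, may raise
+// inexact), and 0xc = nearbyint (MXCSR, inexact suppressed).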
+
+//===----------------------------------------------------------------------===//
+// Integer truncate and extend operations
+//===----------------------------------------------------------------------===//
+
+multiclass avx512_trunc_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo SrcInfo, X86VectorVTInfo DestInfo,
+ X86MemOperand x86memop> {
+
+ defm rr : AVX512_maskable<opc, MRMDestReg, DestInfo, (outs DestInfo.RC:$dst),
+ (ins SrcInfo.RC:$src1), OpcodeStr ,"$src1", "$src1",
+ (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1)))>,
+ EVEX, T8XS;
+
+  // For intrinsic pattern matching.
+ def : Pat<(DestInfo.VT (X86select DestInfo.KRCWM:$mask,
+ (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1))),
+ undef)),
+ (!cast<Instruction>(NAME#SrcInfo.ZSuffix##rrkz) DestInfo.KRCWM:$mask ,
+ SrcInfo.RC:$src1)>;
+
+ def : Pat<(DestInfo.VT (X86select DestInfo.KRCWM:$mask,
+ (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1))),
+ DestInfo.ImmAllZerosV)),
+ (!cast<Instruction>(NAME#SrcInfo.ZSuffix##rrkz) DestInfo.KRCWM:$mask ,
+ SrcInfo.RC:$src1)>;
+
+ def : Pat<(DestInfo.VT (X86select DestInfo.KRCWM:$mask,
+ (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1))),
+ DestInfo.RC:$src0)),
+ (!cast<Instruction>(NAME#SrcInfo.ZSuffix##rrk) DestInfo.RC:$src0,
+ DestInfo.KRCWM:$mask ,
+ SrcInfo.RC:$src1)>;
+
+ let mayStore = 1 in {
+ def mr : AVX512XS8I<opc, MRMDestMem, (outs),
+ (ins x86memop:$dst, SrcInfo.RC:$src),
+                    OpcodeStr # "\t{$src, $dst|$dst, $src}",
+ []>, EVEX;
+
+ def mrk : AVX512XS8I<opc, MRMDestMem, (outs),
+ (ins x86memop:$dst, SrcInfo.KRCWM:$mask, SrcInfo.RC:$src),
+                    OpcodeStr # "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}",
+ []>, EVEX, EVEX_K;
+  } // mayStore = 1
+}
+
+multiclass avx512_trunc_mr_lowering<X86VectorVTInfo SrcInfo,
+ X86VectorVTInfo DestInfo,
+ PatFrag truncFrag, PatFrag mtruncFrag > {
+
+ def : Pat<(truncFrag (SrcInfo.VT SrcInfo.RC:$src), addr:$dst),
+ (!cast<Instruction>(NAME#SrcInfo.ZSuffix##mr)
+ addr:$dst, SrcInfo.RC:$src)>;
+
+ def : Pat<(mtruncFrag addr:$dst, SrcInfo.KRCWM:$mask,
+ (SrcInfo.VT SrcInfo.RC:$src)),
+ (!cast<Instruction>(NAME#SrcInfo.ZSuffix##mrk)
+ addr:$dst, SrcInfo.KRCWM:$mask, SrcInfo.RC:$src)>;
+}
+
+multiclass avx512_trunc_sat_mr_lowering<X86VectorVTInfo SrcInfo,
+ X86VectorVTInfo DestInfo, string sat > {
+
+ def: Pat<(!cast<Intrinsic>("int_x86_avx512_mask_pmov"#sat#"_"#SrcInfo.Suffix#
+ DestInfo.Suffix#"_mem_"#SrcInfo.Size)
+ addr:$ptr, (SrcInfo.VT SrcInfo.RC:$src), SrcInfo.MRC:$mask),
+ (!cast<Instruction>(NAME#SrcInfo.ZSuffix##mrk) addr:$ptr,
+ (COPY_TO_REGCLASS SrcInfo.MRC:$mask, SrcInfo.KRCWM),
+ (SrcInfo.VT SrcInfo.RC:$src))>;
+
+ def: Pat<(!cast<Intrinsic>("int_x86_avx512_mask_pmov"#sat#"_"#SrcInfo.Suffix#
+ DestInfo.Suffix#"_mem_"#SrcInfo.Size)
+ addr:$ptr, (SrcInfo.VT SrcInfo.RC:$src), -1),
+ (!cast<Instruction>(NAME#SrcInfo.ZSuffix##mr) addr:$ptr,
+ (SrcInfo.VT SrcInfo.RC:$src))>;
+}
+
+multiclass avx512_trunc<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ AVX512VLVectorVTInfo VTSrcInfo, X86VectorVTInfo DestInfoZ128,
+ X86VectorVTInfo DestInfoZ256, X86VectorVTInfo DestInfoZ,
+ X86MemOperand x86memopZ128, X86MemOperand x86memopZ256,
+ X86MemOperand x86memopZ, PatFrag truncFrag, PatFrag mtruncFrag,
+ Predicate prd = HasAVX512>{
+
+ let Predicates = [HasVLX, prd] in {
+ defm Z128: avx512_trunc_common<opc, OpcodeStr, OpNode, VTSrcInfo.info128,
+ DestInfoZ128, x86memopZ128>,
+ avx512_trunc_mr_lowering<VTSrcInfo.info128, DestInfoZ128,
+ truncFrag, mtruncFrag>, EVEX_V128;
+
+ defm Z256: avx512_trunc_common<opc, OpcodeStr, OpNode, VTSrcInfo.info256,
+ DestInfoZ256, x86memopZ256>,
+ avx512_trunc_mr_lowering<VTSrcInfo.info256, DestInfoZ256,
+ truncFrag, mtruncFrag>, EVEX_V256;
+ }
+ let Predicates = [prd] in
+ defm Z: avx512_trunc_common<opc, OpcodeStr, OpNode, VTSrcInfo.info512,
+ DestInfoZ, x86memopZ>,
+ avx512_trunc_mr_lowering<VTSrcInfo.info512, DestInfoZ,
+ truncFrag, mtruncFrag>, EVEX_V512;
+}
+
+multiclass avx512_trunc_sat<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ AVX512VLVectorVTInfo VTSrcInfo, X86VectorVTInfo DestInfoZ128,
+ X86VectorVTInfo DestInfoZ256, X86VectorVTInfo DestInfoZ,
+ X86MemOperand x86memopZ128, X86MemOperand x86memopZ256,
+ X86MemOperand x86memopZ, string sat, Predicate prd = HasAVX512>{
+
+ let Predicates = [HasVLX, prd] in {
+ defm Z128: avx512_trunc_common<opc, OpcodeStr, OpNode, VTSrcInfo.info128,
+ DestInfoZ128, x86memopZ128>,
+ avx512_trunc_sat_mr_lowering<VTSrcInfo.info128, DestInfoZ128,
+ sat>, EVEX_V128;
+
+ defm Z256: avx512_trunc_common<opc, OpcodeStr, OpNode, VTSrcInfo.info256,
+ DestInfoZ256, x86memopZ256>,
+ avx512_trunc_sat_mr_lowering<VTSrcInfo.info256, DestInfoZ256,
+ sat>, EVEX_V256;
+ }
+ let Predicates = [prd] in
+ defm Z: avx512_trunc_common<opc, OpcodeStr, OpNode, VTSrcInfo.info512,
+ DestInfoZ, x86memopZ>,
+ avx512_trunc_sat_mr_lowering<VTSrcInfo.info512, DestInfoZ,
+ sat>, EVEX_V512;
+}
+
+multiclass avx512_trunc_qb<bits<8> opc, string OpcodeStr, SDNode OpNode> {
+ defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i64_info,
+ v16i8x_info, v16i8x_info, v16i8x_info, i16mem, i32mem, i64mem,
+ truncstorevi8, masked_truncstorevi8>, EVEX_CD8<8, CD8VO>;
+}
+multiclass avx512_trunc_sat_qb<bits<8> opc, string sat, SDNode OpNode> {
+ defm NAME: avx512_trunc_sat<opc, "vpmov"##sat##"qb", OpNode, avx512vl_i64_info,
+ v16i8x_info, v16i8x_info, v16i8x_info, i16mem, i32mem, i64mem,
+ sat>, EVEX_CD8<8, CD8VO>;
+}
+
+multiclass avx512_trunc_qw<bits<8> opc, string OpcodeStr, SDNode OpNode> {
+ defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i64_info,
+ v8i16x_info, v8i16x_info, v8i16x_info, i32mem, i64mem, i128mem,
+ truncstorevi16, masked_truncstorevi16>, EVEX_CD8<16, CD8VQ>;
+}
+multiclass avx512_trunc_sat_qw<bits<8> opc, string sat, SDNode OpNode> {
+ defm NAME: avx512_trunc_sat<opc, "vpmov"##sat##"qw", OpNode, avx512vl_i64_info,
+ v8i16x_info, v8i16x_info, v8i16x_info, i32mem, i64mem, i128mem,
+ sat>, EVEX_CD8<16, CD8VQ>;
+}
+
+multiclass avx512_trunc_qd<bits<8> opc, string OpcodeStr, SDNode OpNode> {
+ defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i64_info,
+ v4i32x_info, v4i32x_info, v8i32x_info, i64mem, i128mem, i256mem,
+ truncstorevi32, masked_truncstorevi32>, EVEX_CD8<32, CD8VH>;
+}
+multiclass avx512_trunc_sat_qd<bits<8> opc, string sat, SDNode OpNode> {
+ defm NAME: avx512_trunc_sat<opc, "vpmov"##sat##"qd", OpNode, avx512vl_i64_info,
+ v4i32x_info, v4i32x_info, v8i32x_info, i64mem, i128mem, i256mem,
+ sat>, EVEX_CD8<32, CD8VH>;
+}
+
+multiclass avx512_trunc_db<bits<8> opc, string OpcodeStr, SDNode OpNode> {
+ defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i32_info,
+ v16i8x_info, v16i8x_info, v16i8x_info, i32mem, i64mem, i128mem,
+ truncstorevi8, masked_truncstorevi8>, EVEX_CD8<8, CD8VQ>;
+}
+multiclass avx512_trunc_sat_db<bits<8> opc, string sat, SDNode OpNode> {
+ defm NAME: avx512_trunc_sat<opc, "vpmov"##sat##"db", OpNode, avx512vl_i32_info,
+ v16i8x_info, v16i8x_info, v16i8x_info, i32mem, i64mem, i128mem,
+ sat>, EVEX_CD8<8, CD8VQ>;
+}
+
+multiclass avx512_trunc_dw<bits<8> opc, string OpcodeStr, SDNode OpNode> {
+ defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i32_info,
+ v8i16x_info, v8i16x_info, v16i16x_info, i64mem, i128mem, i256mem,
+ truncstorevi16, masked_truncstorevi16>, EVEX_CD8<16, CD8VH>;
+}
+multiclass avx512_trunc_sat_dw<bits<8> opc, string sat, SDNode OpNode> {
+ defm NAME: avx512_trunc_sat<opc, "vpmov"##sat##"dw", OpNode, avx512vl_i32_info,
+ v8i16x_info, v8i16x_info, v16i16x_info, i64mem, i128mem, i256mem,
+ sat>, EVEX_CD8<16, CD8VH>;
+}
+
+multiclass avx512_trunc_wb<bits<8> opc, string OpcodeStr, SDNode OpNode> {
+ defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i16_info,
+ v16i8x_info, v16i8x_info, v32i8x_info, i64mem, i128mem, i256mem,
+ truncstorevi8, masked_truncstorevi8,HasBWI>, EVEX_CD8<16, CD8VH>;
+}
+multiclass avx512_trunc_sat_wb<bits<8> opc, string sat, SDNode OpNode> {
+ defm NAME: avx512_trunc_sat<opc, "vpmov"##sat##"wb", OpNode, avx512vl_i16_info,
+ v16i8x_info, v16i8x_info, v32i8x_info, i64mem, i128mem, i256mem,
+ sat, HasBWI>, EVEX_CD8<16, CD8VH>;
+}
+
+defm VPMOVQB : avx512_trunc_qb<0x32, "vpmovqb", X86vtrunc>;
+defm VPMOVSQB : avx512_trunc_sat_qb<0x22, "s", X86vtruncs>;
+defm VPMOVUSQB : avx512_trunc_sat_qb<0x12, "us", X86vtruncus>;
+
+defm VPMOVQW : avx512_trunc_qw<0x34, "vpmovqw", X86vtrunc>;
+defm VPMOVSQW : avx512_trunc_sat_qw<0x24, "s", X86vtruncs>;
+defm VPMOVUSQW : avx512_trunc_sat_qw<0x14, "us", X86vtruncus>;
+
+defm VPMOVQD : avx512_trunc_qd<0x35, "vpmovqd", X86vtrunc>;
+defm VPMOVSQD : avx512_trunc_sat_qd<0x25, "s", X86vtruncs>;
+defm VPMOVUSQD : avx512_trunc_sat_qd<0x15, "us", X86vtruncus>;
+
+defm VPMOVDB : avx512_trunc_db<0x31, "vpmovdb", X86vtrunc>;
+defm VPMOVSDB : avx512_trunc_sat_db<0x21, "s", X86vtruncs>;
+defm VPMOVUSDB : avx512_trunc_sat_db<0x11, "us", X86vtruncus>;
+
+defm VPMOVDW : avx512_trunc_dw<0x33, "vpmovdw", X86vtrunc>;
+defm VPMOVSDW : avx512_trunc_sat_dw<0x23, "s", X86vtruncs>;
+defm VPMOVUSDW : avx512_trunc_sat_dw<0x13, "us", X86vtruncus>;
+
+defm VPMOVWB : avx512_trunc_wb<0x30, "vpmovwb", X86vtrunc>;
+defm VPMOVSWB : avx512_trunc_sat_wb<0x20, "s", X86vtruncs>;
+defm VPMOVUSWB : avx512_trunc_sat_wb<0x10, "us", X86vtruncus>;
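+
+// Illustrative truncating-move forms (AT&T syntax); the memory forms store
+// only the truncated elements, which is why the narrow memory operands are
+// used above:
+//   vpmovqb %zmm0, %xmm1     // 8 x i64 -> 8 x i8
+//   vpmovsdw %zmm0, (%rax)   // signed-saturating store, 16 x i32 -> 16 x i16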
+
+let Predicates = [HasAVX512, NoVLX] in {
+def: Pat<(v8i16 (X86vtrunc (v8i32 VR256X:$src))),
+ (v8i16 (EXTRACT_SUBREG
+ (v16i16 (VPMOVDWZrr (v16i32 (SUBREG_TO_REG (i32 0),
+ VR256X:$src, sub_ymm)))), sub_xmm))>;
+def: Pat<(v4i32 (X86vtrunc (v4i64 VR256X:$src))),
+ (v4i32 (EXTRACT_SUBREG
+ (v8i32 (VPMOVQDZrr (v8i64 (SUBREG_TO_REG (i32 0),
+ VR256X:$src, sub_ymm)))), sub_xmm))>;
+}
+
+let Predicates = [HasBWI, NoVLX] in {
+def: Pat<(v16i8 (X86vtrunc (v16i16 VR256X:$src))),
+ (v16i8 (EXTRACT_SUBREG (VPMOVWBZrr (v32i16 (SUBREG_TO_REG (i32 0),
+ VR256X:$src, sub_ymm))), sub_xmm))>;
+}
+
+multiclass avx512_extend_common<bits<8> opc, string OpcodeStr,
+ X86VectorVTInfo DestInfo, X86VectorVTInfo SrcInfo,
+ X86MemOperand x86memop, PatFrag LdFrag, SDNode OpNode>{
+
+ defm rr : AVX512_maskable<opc, MRMSrcReg, DestInfo, (outs DestInfo.RC:$dst),
+ (ins SrcInfo.RC:$src), OpcodeStr ,"$src", "$src",
+ (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src)))>,
+ EVEX;
+
+ let mayLoad = 1 in {
+ defm rm : AVX512_maskable<opc, MRMSrcMem, DestInfo, (outs DestInfo.RC:$dst),
+ (ins x86memop:$src), OpcodeStr ,"$src", "$src",
+ (DestInfo.VT (LdFrag addr:$src))>,
+ EVEX;
+ }
+}
+
+multiclass avx512_extend_BW<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi8")> {
+ let Predicates = [HasVLX, HasBWI] in {
+ defm Z128: avx512_extend_common<opc, OpcodeStr, v8i16x_info,
+ v16i8x_info, i64mem, LdFrag, OpNode>,
+ EVEX_CD8<8, CD8VH>, T8PD, EVEX_V128;
+
+ defm Z256: avx512_extend_common<opc, OpcodeStr, v16i16x_info,
+ v16i8x_info, i128mem, LdFrag, OpNode>,
+ EVEX_CD8<8, CD8VH>, T8PD, EVEX_V256;
+ }
+ let Predicates = [HasBWI] in {
+ defm Z : avx512_extend_common<opc, OpcodeStr, v32i16_info,
+ v32i8x_info, i256mem, LdFrag, OpNode>,
+ EVEX_CD8<8, CD8VH>, T8PD, EVEX_V512;
+ }
+}
+
+multiclass avx512_extend_BD<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi8")> {
+ let Predicates = [HasVLX, HasAVX512] in {
+ defm Z128: avx512_extend_common<opc, OpcodeStr, v4i32x_info,
+ v16i8x_info, i32mem, LdFrag, OpNode>,
+ EVEX_CD8<8, CD8VQ>, T8PD, EVEX_V128;
+
+ defm Z256: avx512_extend_common<opc, OpcodeStr, v8i32x_info,
+ v16i8x_info, i64mem, LdFrag, OpNode>,
+ EVEX_CD8<8, CD8VQ>, T8PD, EVEX_V256;
+ }
+ let Predicates = [HasAVX512] in {
+ defm Z : avx512_extend_common<opc, OpcodeStr, v16i32_info,
+ v16i8x_info, i128mem, LdFrag, OpNode>,
+ EVEX_CD8<8, CD8VQ>, T8PD, EVEX_V512;
+ }
+}
+
+multiclass avx512_extend_BQ<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi8")> {
+ let Predicates = [HasVLX, HasAVX512] in {
+ defm Z128: avx512_extend_common<opc, OpcodeStr, v2i64x_info,
+ v16i8x_info, i16mem, LdFrag, OpNode>,
+ EVEX_CD8<8, CD8VO>, T8PD, EVEX_V128;
+
+ defm Z256: avx512_extend_common<opc, OpcodeStr, v4i64x_info,
+ v16i8x_info, i32mem, LdFrag, OpNode>,
+ EVEX_CD8<8, CD8VO>, T8PD, EVEX_V256;
+ }
+ let Predicates = [HasAVX512] in {
+ defm Z : avx512_extend_common<opc, OpcodeStr, v8i64_info,
+ v16i8x_info, i64mem, LdFrag, OpNode>,
+ EVEX_CD8<8, CD8VO>, T8PD, EVEX_V512;
+ }
+}
+
+multiclass avx512_extend_WD<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi16")> {
+ let Predicates = [HasVLX, HasAVX512] in {
+ defm Z128: avx512_extend_common<opc, OpcodeStr, v4i32x_info,
+ v8i16x_info, i64mem, LdFrag, OpNode>,
+ EVEX_CD8<16, CD8VH>, T8PD, EVEX_V128;
+
+ defm Z256: avx512_extend_common<opc, OpcodeStr, v8i32x_info,
+ v8i16x_info, i128mem, LdFrag, OpNode>,
+ EVEX_CD8<16, CD8VH>, T8PD, EVEX_V256;
+ }
+ let Predicates = [HasAVX512] in {
+ defm Z : avx512_extend_common<opc, OpcodeStr, v16i32_info,
+ v16i16x_info, i256mem, LdFrag, OpNode>,
+ EVEX_CD8<16, CD8VH>, T8PD, EVEX_V512;
+ }
+}
+
+multiclass avx512_extend_WQ<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi16")> {
+ let Predicates = [HasVLX, HasAVX512] in {
+ defm Z128: avx512_extend_common<opc, OpcodeStr, v2i64x_info,
+ v8i16x_info, i32mem, LdFrag, OpNode>,
+ EVEX_CD8<16, CD8VQ>, T8PD, EVEX_V128;
+
+ defm Z256: avx512_extend_common<opc, OpcodeStr, v4i64x_info,
+ v8i16x_info, i64mem, LdFrag, OpNode>,
+ EVEX_CD8<16, CD8VQ>, T8PD, EVEX_V256;
+ }
+ let Predicates = [HasAVX512] in {
+ defm Z : avx512_extend_common<opc, OpcodeStr, v8i64_info,
+ v8i16x_info, i128mem, LdFrag, OpNode>,
+ EVEX_CD8<16, CD8VQ>, T8PD, EVEX_V512;
+ }
+}
+
+multiclass avx512_extend_DQ<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi32")> {
+
+ let Predicates = [HasVLX, HasAVX512] in {
+ defm Z128: avx512_extend_common<opc, OpcodeStr, v2i64x_info,
+ v4i32x_info, i64mem, LdFrag, OpNode>,
+ EVEX_CD8<32, CD8VH>, T8PD, EVEX_V128;
+
+ defm Z256: avx512_extend_common<opc, OpcodeStr, v4i64x_info,
+ v4i32x_info, i128mem, LdFrag, OpNode>,
+ EVEX_CD8<32, CD8VH>, T8PD, EVEX_V256;
+ }
+ let Predicates = [HasAVX512] in {
+ defm Z : avx512_extend_common<opc, OpcodeStr, v8i64_info,
+ v8i32x_info, i256mem, LdFrag, OpNode>,
+ EVEX_CD8<32, CD8VH>, T8PD, EVEX_V512;
+ }
+}
+
+defm VPMOVZXBW : avx512_extend_BW<0x30, "vpmovzxbw", X86vzext, "z">;
+defm VPMOVZXBD : avx512_extend_BD<0x31, "vpmovzxbd", X86vzext, "z">;
+defm VPMOVZXBQ : avx512_extend_BQ<0x32, "vpmovzxbq", X86vzext, "z">;
+defm VPMOVZXWD : avx512_extend_WD<0x33, "vpmovzxwd", X86vzext, "z">;
+defm VPMOVZXWQ : avx512_extend_WQ<0x34, "vpmovzxwq", X86vzext, "z">;
+defm VPMOVZXDQ : avx512_extend_DQ<0x35, "vpmovzxdq", X86vzext, "z">;
+
+
+defm VPMOVSXBW: avx512_extend_BW<0x20, "vpmovsxbw", X86vsext, "s">;
+defm VPMOVSXBD: avx512_extend_BD<0x21, "vpmovsxbd", X86vsext, "s">;
+defm VPMOVSXBQ: avx512_extend_BQ<0x22, "vpmovsxbq", X86vsext, "s">;
+defm VPMOVSXWD: avx512_extend_WD<0x23, "vpmovsxwd", X86vsext, "s">;
+defm VPMOVSXWQ: avx512_extend_WQ<0x24, "vpmovsxwq", X86vsext, "s">;
+defm VPMOVSXDQ: avx512_extend_DQ<0x25, "vpmovsxdq", X86vsext, "s">;
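+
+// Example (illustrative) of the extend forms defined above:
+//   vpmovzxbw %ymm1, %zmm0          // 32 x i8 -> 32 x i16, zero-extended
+//   vpmovsxwq (%rax), %xmm0 {%k1}   // sign-extend two i16 loaded from memory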
+
+//===----------------------------------------------------------------------===//
+// GATHER - SCATTER Operations
+
+multiclass avx512_gather<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
+ X86MemOperand memop, PatFrag GatherNode> {
+ let Constraints = "@earlyclobber $dst, $src1 = $dst, $mask = $mask_wb",
+ ExeDomain = _.ExeDomain in
+ def rm : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst, _.KRCWM:$mask_wb),
+ (ins _.RC:$src1, _.KRCWM:$mask, memop:$src2),
+ !strconcat(OpcodeStr#_.Suffix,
+ "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"),
+ [(set _.RC:$dst, _.KRCWM:$mask_wb,
+ (GatherNode (_.VT _.RC:$src1), _.KRCWM:$mask,
+ vectoraddr:$src2))]>, EVEX, EVEX_K,
+ EVEX_CD8<_.EltSize, CD8VT1>;
+}
+
+multiclass avx512_gather_q_pd<bits<8> dopc, bits<8> qopc,
+ AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> {
+ defm NAME##D##SUFF##Z: avx512_gather<dopc, OpcodeStr##"d", _.info512,
+ vy32xmem, mgatherv8i32>, EVEX_V512, VEX_W;
+ defm NAME##Q##SUFF##Z: avx512_gather<qopc, OpcodeStr##"q", _.info512,
+ vz64mem, mgatherv8i64>, EVEX_V512, VEX_W;
+let Predicates = [HasVLX] in {
+ defm NAME##D##SUFF##Z256: avx512_gather<dopc, OpcodeStr##"d", _.info256,
+ vx32xmem, mgatherv4i32>, EVEX_V256, VEX_W;
+ defm NAME##Q##SUFF##Z256: avx512_gather<qopc, OpcodeStr##"q", _.info256,
+ vy64xmem, mgatherv4i64>, EVEX_V256, VEX_W;
+ defm NAME##D##SUFF##Z128: avx512_gather<dopc, OpcodeStr##"d", _.info128,
+ vx32xmem, mgatherv4i32>, EVEX_V128, VEX_W;
+ defm NAME##Q##SUFF##Z128: avx512_gather<qopc, OpcodeStr##"q", _.info128,
+ vx64xmem, mgatherv2i64>, EVEX_V128, VEX_W;
+}
+}
+
+multiclass avx512_gather_d_ps<bits<8> dopc, bits<8> qopc,
+ AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> {
+ defm NAME##D##SUFF##Z: avx512_gather<dopc, OpcodeStr##"d", _.info512, vz32mem,
+ mgatherv16i32>, EVEX_V512;
+ defm NAME##Q##SUFF##Z: avx512_gather<qopc, OpcodeStr##"q", _.info256, vz64mem,
+ mgatherv8i64>, EVEX_V512;
+let Predicates = [HasVLX] in {
+ defm NAME##D##SUFF##Z256: avx512_gather<dopc, OpcodeStr##"d", _.info256,
+ vy32xmem, mgatherv8i32>, EVEX_V256;
+ defm NAME##Q##SUFF##Z256: avx512_gather<qopc, OpcodeStr##"q", _.info128,
+ vy64xmem, mgatherv4i64>, EVEX_V256;
+ defm NAME##D##SUFF##Z128: avx512_gather<dopc, OpcodeStr##"d", _.info128,
+ vx32xmem, mgatherv4i32>, EVEX_V128;
+ defm NAME##Q##SUFF##Z128: avx512_gather<qopc, OpcodeStr##"q", _.info128,
+ vx64xmem, mgatherv2i64>, EVEX_V128;
+}
+}
+
+
+defm VGATHER : avx512_gather_q_pd<0x92, 0x93, avx512vl_f64_info, "vgather", "PD">,
+ avx512_gather_d_ps<0x92, 0x93, avx512vl_f32_info, "vgather", "PS">;
+
+defm VPGATHER : avx512_gather_q_pd<0x90, 0x91, avx512vl_i64_info, "vpgather", "Q">,
+ avx512_gather_d_ps<0x90, 0x91, avx512vl_i32_info, "vpgather", "D">;
+
+multiclass avx512_scatter<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
+ X86MemOperand memop, PatFrag ScatterNode> {
+
+let mayStore = 1, Constraints = "$mask = $mask_wb", ExeDomain = _.ExeDomain in
+
+ def mr : AVX5128I<opc, MRMDestMem, (outs _.KRCWM:$mask_wb),
+ (ins memop:$dst, _.KRCWM:$mask, _.RC:$src),
+ !strconcat(OpcodeStr#_.Suffix,
+ "\t{$src, ${dst} {${mask}}|${dst} {${mask}}, $src}"),
+ [(set _.KRCWM:$mask_wb, (ScatterNode (_.VT _.RC:$src),
+ _.KRCWM:$mask, vectoraddr:$dst))]>,
+ EVEX, EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>;
+}
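+
+// Illustrative usage (editor's note): a scatter is the store-side mirror of
+// the gather above, e.g.
+//   vscatterdps %zmm0, (%rdi,%zmm1,4) {%k1}
+// Only lanes whose %k1 bit is set are stored, and the mask is written back
+// with completed bits cleared, which is why $mask_wb is an output here too.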
+
+multiclass avx512_scatter_q_pd<bits<8> dopc, bits<8> qopc,
+ AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> {
+ defm NAME##D##SUFF##Z: avx512_scatter<dopc, OpcodeStr##"d", _.info512,
+ vy32xmem, mscatterv8i32>, EVEX_V512, VEX_W;
+ defm NAME##Q##SUFF##Z: avx512_scatter<qopc, OpcodeStr##"q", _.info512,
+ vz64mem, mscatterv8i64>, EVEX_V512, VEX_W;
+let Predicates = [HasVLX] in {
+ defm NAME##D##SUFF##Z256: avx512_scatter<dopc, OpcodeStr##"d", _.info256,
+ vx32xmem, mscatterv4i32>, EVEX_V256, VEX_W;
+ defm NAME##Q##SUFF##Z256: avx512_scatter<qopc, OpcodeStr##"q", _.info256,
+ vy64xmem, mscatterv4i64>, EVEX_V256, VEX_W;
+ defm NAME##D##SUFF##Z128: avx512_scatter<dopc, OpcodeStr##"d", _.info128,
+ vx32xmem, mscatterv4i32>, EVEX_V128, VEX_W;
+ defm NAME##Q##SUFF##Z128: avx512_scatter<qopc, OpcodeStr##"q", _.info128,
+ vx64xmem, mscatterv2i64>, EVEX_V128, VEX_W;
+}
+}
+
+multiclass avx512_scatter_d_ps<bits<8> dopc, bits<8> qopc,
+ AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> {
+ defm NAME##D##SUFF##Z: avx512_scatter<dopc, OpcodeStr##"d", _.info512, vz32mem,
+ mscatterv16i32>, EVEX_V512;
+ defm NAME##Q##SUFF##Z: avx512_scatter<qopc, OpcodeStr##"q", _.info256, vz64mem,
+ mscatterv8i64>, EVEX_V512;
+let Predicates = [HasVLX] in {
+ defm NAME##D##SUFF##Z256: avx512_scatter<dopc, OpcodeStr##"d", _.info256,
+ vy32xmem, mscatterv8i32>, EVEX_V256;
+ defm NAME##Q##SUFF##Z256: avx512_scatter<qopc, OpcodeStr##"q", _.info128,
+ vy64xmem, mscatterv4i64>, EVEX_V256;
+ defm NAME##D##SUFF##Z128: avx512_scatter<dopc, OpcodeStr##"d", _.info128,
+ vx32xmem, mscatterv4i32>, EVEX_V128;
+ defm NAME##Q##SUFF##Z128: avx512_scatter<qopc, OpcodeStr##"q", _.info128,
+ vx64xmem, mscatterv2i64>, EVEX_V128;
+}
+}
+
+defm VSCATTER : avx512_scatter_q_pd<0xA2, 0xA3, avx512vl_f64_info, "vscatter", "PD">,
+ avx512_scatter_d_ps<0xA2, 0xA3, avx512vl_f32_info, "vscatter", "PS">;
+
+defm VPSCATTER : avx512_scatter_q_pd<0xA0, 0xA1, avx512vl_i64_info, "vpscatter", "Q">,
+ avx512_scatter_d_ps<0xA0, 0xA1, avx512vl_i32_info, "vpscatter", "D">;
+
+// prefetch
+multiclass avx512_gather_scatter_prefetch<bits<8> opc, Format F, string OpcodeStr,
+ RegisterClass KRC, X86MemOperand memop> {
+ let Predicates = [HasPFI], hasSideEffects = 1 in
+ def m : AVX5128I<opc, F, (outs), (ins KRC:$mask, memop:$src),
+ !strconcat(OpcodeStr, "\t{$src {${mask}}|{${mask}}, $src}"),
+ []>, EVEX, EVEX_K;
+}
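+
+// Illustrative usage (editor's note): the prefetch forms take only a mask and
+// a vector memory operand and architecturally load nothing, e.g.
+//   vgatherpf0dps (%rsi,%zmm1,4) {%k1}
+// hints the cache lines of the masked-on elements into a nearer (pf0/T0) or
+// farther (pf1/T1) cache level; hasSideEffects keeps them from being deleted
+// as dead code.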
+
+defm VGATHERPF0DPS: avx512_gather_scatter_prefetch<0xC6, MRM1m, "vgatherpf0dps",
+ VK16WM, vz32mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>;
+
+defm VGATHERPF0QPS: avx512_gather_scatter_prefetch<0xC7, MRM1m, "vgatherpf0qps",
+ VK8WM, vz64mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>;
+
+defm VGATHERPF0DPD: avx512_gather_scatter_prefetch<0xC6, MRM1m, "vgatherpf0dpd",
+ VK8WM, vy32mem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>;
+
+defm VGATHERPF0QPD: avx512_gather_scatter_prefetch<0xC7, MRM1m, "vgatherpf0qpd",
+ VK8WM, vz64mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+
+defm VGATHERPF1DPS: avx512_gather_scatter_prefetch<0xC6, MRM2m, "vgatherpf1dps",
+ VK16WM, vz32mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>;
+
+defm VGATHERPF1QPS: avx512_gather_scatter_prefetch<0xC7, MRM2m, "vgatherpf1qps",
+ VK8WM, vz64mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>;
+
+defm VGATHERPF1DPD: avx512_gather_scatter_prefetch<0xC6, MRM2m, "vgatherpf1dpd",
+ VK8WM, vy32mem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>;
+
+defm VGATHERPF1QPD: avx512_gather_scatter_prefetch<0xC7, MRM2m, "vgatherpf1qpd",
+ VK8WM, vz64mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+
+defm VSCATTERPF0DPS: avx512_gather_scatter_prefetch<0xC6, MRM5m, "vscatterpf0dps",
+ VK16WM, vz32mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>;
+
+defm VSCATTERPF0QPS: avx512_gather_scatter_prefetch<0xC7, MRM5m, "vscatterpf0qps",
+ VK8WM, vz64mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>;
+
+defm VSCATTERPF0DPD: avx512_gather_scatter_prefetch<0xC6, MRM5m, "vscatterpf0dpd",
+ VK8WM, vy32mem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>;
+
+defm VSCATTERPF0QPD: avx512_gather_scatter_prefetch<0xC7, MRM5m, "vscatterpf0qpd",
+ VK8WM, vz64mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+
+defm VSCATTERPF1DPS: avx512_gather_scatter_prefetch<0xC6, MRM6m, "vscatterpf1dps",
+ VK16WM, vz32mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>;
+
+defm VSCATTERPF1QPS: avx512_gather_scatter_prefetch<0xC7, MRM6m, "vscatterpf1qps",
+ VK8WM, vz64mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>;
+
+defm VSCATTERPF1DPD: avx512_gather_scatter_prefetch<0xC6, MRM6m, "vscatterpf1dpd",
+ VK8WM, vy32mem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>;
+
+defm VSCATTERPF1QPD: avx512_gather_scatter_prefetch<0xC7, MRM6m, "vscatterpf1qpd",
+ VK8WM, vz64mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+
+// Helper fragments to match sext vXi1 to vXiY.
+def v16i1sextv16i32 : PatLeaf<(v16i32 (X86vsrai VR512:$src, (i8 31)))>;
+def v8i1sextv8i64 : PatLeaf<(v8i64 (X86vsrai VR512:$src, (i8 63)))>;
+
+def : Pat<(store (i1 -1), addr:$dst), (MOV8mi addr:$dst, (i8 1))>;
+def : Pat<(store (i1 1), addr:$dst), (MOV8mi addr:$dst, (i8 1))>;
+def : Pat<(store (i1 0), addr:$dst), (MOV8mi addr:$dst, (i8 0))>;
+
+def : Pat<(store VK1:$src, addr:$dst),
+ (MOV8mr addr:$dst,
+ (EXTRACT_SUBREG (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)),
+ sub_8bit))>, Requires<[HasAVX512, NoDQI]>;
+
+def : Pat<(store VK8:$src, addr:$dst),
+ (MOV8mr addr:$dst,
+ (EXTRACT_SUBREG (KMOVWrk (COPY_TO_REGCLASS VK8:$src, VK16)),
+ sub_8bit))>, Requires<[HasAVX512, NoDQI]>;
+
+def truncstorei1 : PatFrag<(ops node:$val, node:$ptr),
+ (truncstore node:$val, node:$ptr), [{
+ return cast<StoreSDNode>(N)->getMemoryVT() == MVT::i1;
+}]>;
+
+def : Pat<(truncstorei1 GR8:$src, addr:$dst),
+ (MOV8mr addr:$dst, GR8:$src)>;
+
+multiclass cvt_by_vec_width<bits<8> opc, X86VectorVTInfo Vec, string OpcodeStr > {
+def rr : AVX512XS8I<opc, MRMSrcReg, (outs Vec.RC:$dst), (ins Vec.KRC:$src),
+ !strconcat(OpcodeStr##Vec.Suffix, "\t{$src, $dst|$dst, $src}"),
+ [(set Vec.RC:$dst, (Vec.VT (X86vsext Vec.KRC:$src)))]>, EVEX;
+}
+
+multiclass cvt_mask_by_elt_width<bits<8> opc, AVX512VLVectorVTInfo VTInfo,
+ string OpcodeStr, Predicate prd> {
+let Predicates = [prd] in
+ defm Z : cvt_by_vec_width<opc, VTInfo.info512, OpcodeStr>, EVEX_V512;
+
+ let Predicates = [prd, HasVLX] in {
+ defm Z256 : cvt_by_vec_width<opc, VTInfo.info256, OpcodeStr>, EVEX_V256;
+ defm Z128 : cvt_by_vec_width<opc, VTInfo.info128, OpcodeStr>, EVEX_V128;
+ }
+}
+
+multiclass avx512_convert_mask_to_vector<string OpcodeStr> {
+ defm NAME##B : cvt_mask_by_elt_width<0x28, avx512vl_i8_info, OpcodeStr,
+ HasBWI>;
+ defm NAME##W : cvt_mask_by_elt_width<0x28, avx512vl_i16_info, OpcodeStr,
+ HasBWI>, VEX_W;
+ defm NAME##D : cvt_mask_by_elt_width<0x38, avx512vl_i32_info, OpcodeStr,
+ HasDQI>;
+ defm NAME##Q : cvt_mask_by_elt_width<0x38, avx512vl_i64_info, OpcodeStr,
+ HasDQI>, VEX_W;
+}
+
+defm VPMOVM2 : avx512_convert_mask_to_vector<"vpmovm2">;
+
+multiclass convert_vector_to_mask_common<bits<8> opc, X86VectorVTInfo _, string OpcodeStr > {
+def rr : AVX512XS8I<opc, MRMSrcReg, (outs _.KRC:$dst), (ins _.RC:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set _.KRC:$dst, (X86cvt2mask (_.VT _.RC:$src)))]>, EVEX;
+}
+
+multiclass avx512_convert_vector_to_mask<bits<8> opc, string OpcodeStr,
+ AVX512VLVectorVTInfo VTInfo, Predicate prd> {
+let Predicates = [prd] in
+ defm Z : convert_vector_to_mask_common <opc, VTInfo.info512, OpcodeStr>,
+ EVEX_V512;
+
+ let Predicates = [prd, HasVLX] in {
+ defm Z256 : convert_vector_to_mask_common<opc, VTInfo.info256, OpcodeStr>,
+ EVEX_V256;
+ defm Z128 : convert_vector_to_mask_common<opc, VTInfo.info128, OpcodeStr>,
+ EVEX_V128;
+ }
+}
+
+defm VPMOVB2M : avx512_convert_vector_to_mask<0x29, "vpmovb2m",
+ avx512vl_i8_info, HasBWI>;
+defm VPMOVW2M : avx512_convert_vector_to_mask<0x29, "vpmovw2m",
+ avx512vl_i16_info, HasBWI>, VEX_W;
+defm VPMOVD2M : avx512_convert_vector_to_mask<0x39, "vpmovd2m",
+ avx512vl_i32_info, HasDQI>;
+defm VPMOVQ2M : avx512_convert_vector_to_mask<0x39, "vpmovq2m",
+ avx512vl_i64_info, HasDQI>, VEX_W;
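+
+// Illustrative semantics (editor's note): these move between the mask and
+// vector domains, e.g.
+//   vpmovm2b %k1, %zmm0    (each byte becomes all-ones or all-zeros)
+//   vpmovb2m %zmm0, %k1    (each byte's sign bit becomes a mask bit)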
+
+//===----------------------------------------------------------------------===//
+// AVX-512 - COMPRESS and EXPAND
+//
+
+multiclass compress_by_vec_width<bits<8> opc, X86VectorVTInfo _,
+ string OpcodeStr> {
+ defm rr : AVX512_maskable<opc, MRMDestReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1), OpcodeStr, "$src1", "$src1",
+ (_.VT (X86compress _.RC:$src1))>, AVX5128IBase;
+
+ let mayStore = 1 in {
+ def mr : AVX5128I<opc, MRMDestMem, (outs),
+ (ins _.MemOp:$dst, _.RC:$src),
+              OpcodeStr # "\t{$src, $dst|$dst, $src}",
+ []>, EVEX_CD8<_.EltSize, CD8VT1>;
+
+ def mrk : AVX5128I<opc, MRMDestMem, (outs),
+ (ins _.MemOp:$dst, _.KRCWM:$mask, _.RC:$src),
+              OpcodeStr # "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}",
+ [(store (_.VT (vselect _.KRCWM:$mask,
+ (_.VT (X86compress _.RC:$src)), _.ImmAllZerosV)),
+ addr:$dst)]>,
+ EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>;
+ }
+}
+
+multiclass compress_by_elt_width<bits<8> opc, string OpcodeStr,
+ AVX512VLVectorVTInfo VTInfo> {
+ defm Z : compress_by_vec_width<opc, VTInfo.info512, OpcodeStr>, EVEX_V512;
+
+ let Predicates = [HasVLX] in {
+ defm Z256 : compress_by_vec_width<opc, VTInfo.info256, OpcodeStr>, EVEX_V256;
+ defm Z128 : compress_by_vec_width<opc, VTInfo.info128, OpcodeStr>, EVEX_V128;
+ }
+}
+
+defm VPCOMPRESSD : compress_by_elt_width <0x8B, "vpcompressd", avx512vl_i32_info>,
+ EVEX;
+defm VPCOMPRESSQ : compress_by_elt_width <0x8B, "vpcompressq", avx512vl_i64_info>,
+ EVEX, VEX_W;
+defm VCOMPRESSPS : compress_by_elt_width <0x8A, "vcompressps", avx512vl_f32_info>,
+ EVEX;
+defm VCOMPRESSPD : compress_by_elt_width <0x8A, "vcompresspd", avx512vl_f64_info>,
+ EVEX, VEX_W;
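+
+// Illustrative semantics (editor's note): compress packs the mask-selected
+// source elements contiguously into the low lanes of the destination or into
+// memory. With %k1 = 0b...0101, for example,
+//   vpcompressd %zmm1, %zmm0 {%k1}{z}
+// writes src[0] and src[2] to lanes 0 and 1 and zeroes the rest.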
+
+// expand
+multiclass expand_by_vec_width<bits<8> opc, X86VectorVTInfo _,
+ string OpcodeStr> {
+ defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1), OpcodeStr, "$src1", "$src1",
+ (_.VT (X86expand _.RC:$src1))>, AVX5128IBase;
+
+ let mayLoad = 1 in
+ defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.MemOp:$src1), OpcodeStr, "$src1", "$src1",
+ (_.VT (X86expand (_.VT (bitconvert
+ (_.LdFrag addr:$src1)))))>,
+ AVX5128IBase, EVEX_CD8<_.EltSize, CD8VT1>;
+}
+
+multiclass expand_by_elt_width<bits<8> opc, string OpcodeStr,
+ AVX512VLVectorVTInfo VTInfo> {
+ defm Z : expand_by_vec_width<opc, VTInfo.info512, OpcodeStr>, EVEX_V512;
+
+ let Predicates = [HasVLX] in {
+ defm Z256 : expand_by_vec_width<opc, VTInfo.info256, OpcodeStr>, EVEX_V256;
+ defm Z128 : expand_by_vec_width<opc, VTInfo.info128, OpcodeStr>, EVEX_V128;
+ }
+}
+
+defm VPEXPANDD : expand_by_elt_width <0x89, "vpexpandd", avx512vl_i32_info>,
+ EVEX;
+defm VPEXPANDQ : expand_by_elt_width <0x89, "vpexpandq", avx512vl_i64_info>,
+ EVEX, VEX_W;
+defm VEXPANDPS : expand_by_elt_width <0x88, "vexpandps", avx512vl_f32_info>,
+ EVEX;
+defm VEXPANDPD : expand_by_elt_width <0x88, "vexpandpd", avx512vl_f64_info>,
+ EVEX, VEX_W;
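+
+// Illustrative semantics (editor's note): expand is the inverse of compress;
+// it reads consecutive source elements and places them into the mask-selected
+// destination lanes. With %k1 = 0b...0101, for example,
+//   vpexpandd %zmm1, %zmm0 {%k1}{z}
+// puts src[0] into lane 0 and src[1] into lane 2, zeroing the other lanes.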
+
+// Handle instructions of the form: reg_vec1 = op(reg_vec, imm)
+//                                             op(mem_vec, imm)
+//                                             op(broadcast(eltVt), imm)
+// All instructions are created with FROUND_CURRENT.
+multiclass avx512_unary_fp_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _>{
+ defm rri : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, i32u8imm:$src2),
+                      OpcodeStr##_.Suffix, "$src2, $src1", "$src1, $src2",
+ (OpNode (_.VT _.RC:$src1),
+ (i32 imm:$src2),
+ (i32 FROUND_CURRENT))>;
+ let mayLoad = 1 in {
+ defm rmi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.MemOp:$src1, i32u8imm:$src2),
+ OpcodeStr##_.Suffix, "$src2, $src1", "$src1, $src2",
+ (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))),
+ (i32 imm:$src2),
+ (i32 FROUND_CURRENT))>;
+ defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.ScalarMemOp:$src1, i32u8imm:$src2),
+ OpcodeStr##_.Suffix, "$src2, ${src1}"##_.BroadcastStr,
+ "${src1}"##_.BroadcastStr##", $src2",
+ (OpNode (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src1))),
+ (i32 imm:$src2),
+ (i32 FROUND_CURRENT))>, EVEX_B;
+ }
+}
+
+// Handle instructions of the form: reg_vec1 = op(reg_vec2, imm), {sae}
+multiclass avx512_unary_fp_sae_packed_imm<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, X86VectorVTInfo _>{
+ defm rrib : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, i32u8imm:$src2),
+ OpcodeStr##_.Suffix, "$src2,{sae}, $src1",
+ "$src1, {sae}, $src2",
+ (OpNode (_.VT _.RC:$src1),
+ (i32 imm:$src2),
+ (i32 FROUND_NO_EXC))>, EVEX_B;
+}
+
+multiclass avx512_common_unary_fp_sae_packed_imm<string OpcodeStr,
+ AVX512VLVectorVTInfo _, bits<8> opc, SDNode OpNode, Predicate prd>{
+ let Predicates = [prd] in {
+ defm Z : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, _.info512>,
+ avx512_unary_fp_sae_packed_imm<opc, OpcodeStr, OpNode, _.info512>,
+ EVEX_V512;
+ }
+ let Predicates = [prd, HasVLX] in {
+ defm Z128 : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, _.info128>,
+ EVEX_V128;
+ defm Z256 : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, _.info256>,
+ EVEX_V256;
+ }
+}
+
+// Handle instructions of the form: reg_vec1 = op(reg_vec2, reg_vec3, imm)
+//                                             op(reg_vec2, mem_vec, imm)
+//                                             op(reg_vec2, broadcast(eltVt), imm)
+// All instructions are created with FROUND_CURRENT.
+multiclass avx512_fp_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _>{
+ defm rri : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3),
+ OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
+ (OpNode (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ (i32 imm:$src3),
+ (i32 FROUND_CURRENT))>;
+ let mayLoad = 1 in {
+ defm rmi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.MemOp:$src2, i32u8imm:$src3),
+ OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
+ (OpNode (_.VT _.RC:$src1),
+ (_.VT (bitconvert (_.LdFrag addr:$src2))),
+ (i32 imm:$src3),
+ (i32 FROUND_CURRENT))>;
+ defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.ScalarMemOp:$src2, i32u8imm:$src3),
+ OpcodeStr, "$src3, ${src2}"##_.BroadcastStr##", $src1",
+ "$src1, ${src2}"##_.BroadcastStr##", $src3",
+ (OpNode (_.VT _.RC:$src1),
+ (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))),
+ (i32 imm:$src3),
+ (i32 FROUND_CURRENT))>, EVEX_B;
+ }
+}
+
+// Handle instructions of the form: reg_vec1 = op(reg_vec2, reg_vec3, imm)
+//                                             op(reg_vec2, mem_vec, imm)
+multiclass avx512_3Op_rm_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo DestInfo, X86VectorVTInfo SrcInfo>{
+
+ defm rri : AVX512_maskable<opc, MRMSrcReg, DestInfo, (outs DestInfo.RC:$dst),
+ (ins SrcInfo.RC:$src1, SrcInfo.RC:$src2, u8imm:$src3),
+ OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
+ (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1),
+ (SrcInfo.VT SrcInfo.RC:$src2),
+ (i8 imm:$src3)))>;
+ let mayLoad = 1 in
+ defm rmi : AVX512_maskable<opc, MRMSrcMem, DestInfo, (outs DestInfo.RC:$dst),
+ (ins SrcInfo.RC:$src1, SrcInfo.MemOp:$src2, u8imm:$src3),
+ OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
+ (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1),
+ (SrcInfo.VT (bitconvert
+ (SrcInfo.LdFrag addr:$src2))),
+ (i8 imm:$src3)))>;
+}
+
+// Handle instructions of the form: reg_vec1 = op(reg_vec2, reg_vec3, imm)
+//                                             op(reg_vec2, mem_vec, imm)
+//                                             op(reg_vec2, broadcast(eltVt), imm)
+multiclass avx512_3Op_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _>:
+ avx512_3Op_rm_imm8<opc, OpcodeStr, OpNode, _, _>{
+
+ let mayLoad = 1 in
+ defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$src3),
+ OpcodeStr, "$src3, ${src2}"##_.BroadcastStr##", $src1",
+ "$src1, ${src2}"##_.BroadcastStr##", $src3",
+ (OpNode (_.VT _.RC:$src1),
+ (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))),
+ (i8 imm:$src3))>, EVEX_B;
+}
+
+// Handle scalar instructions of the form: reg_vec1 = op(reg_vec2, reg_vec3, imm)
+//                                                    op(reg_vec2, mem_scalar, imm)
+// All instructions are created with FROUND_CURRENT.
+multiclass avx512_fp_scalar_imm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _> {
+
+ defm rri : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3),
+ OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
+ (OpNode (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ (i32 imm:$src3),
+ (i32 FROUND_CURRENT))>;
+ let mayLoad = 1 in {
+ defm rmi : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.MemOp:$src2, i32u8imm:$src3),
+ OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
+ (OpNode (_.VT _.RC:$src1),
+ (_.VT (scalar_to_vector
+ (_.ScalarLdFrag addr:$src2))),
+ (i32 imm:$src3),
+ (i32 FROUND_CURRENT))>;
+
+ let isAsmParserOnly = 1 in {
+ defm rmi_alt :AVX512_maskable_in_asm<opc, MRMSrcMem, _, (outs _.FRC:$dst),
+ (ins _.FRC:$src1, _.ScalarMemOp:$src2, u8imm:$src3),
+ OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
+ []>;
+ }
+ }
+}
+
+// Handle instructions of the form: reg_vec1 = op(reg_vec2, reg_vec3, imm), {sae}
+multiclass avx512_fp_sae_packed_imm<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, X86VectorVTInfo _>{
+ defm rrib : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3),
+ OpcodeStr, "$src3,{sae}, $src2, $src1",
+ "$src1, $src2,{sae}, $src3",
+ (OpNode (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ (i32 imm:$src3),
+ (i32 FROUND_NO_EXC))>, EVEX_B;
+}
+// Handle scalar instructions of the form: reg_vec1 = op(reg_vec2, reg_vec3, imm), {sae}
+multiclass avx512_fp_sae_scalar_imm<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, X86VectorVTInfo _> {
+ defm NAME#rrib : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3),
+ OpcodeStr, "$src3,{sae}, $src2, $src1",
+ "$src1, $src2,{sae}, $src3",
+ (OpNode (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ (i32 imm:$src3),
+ (i32 FROUND_NO_EXC))>, EVEX_B;
+}
+
+multiclass avx512_common_fp_sae_packed_imm<string OpcodeStr,
+ AVX512VLVectorVTInfo _, bits<8> opc, SDNode OpNode, Predicate prd>{
+ let Predicates = [prd] in {
+ defm Z : avx512_fp_packed_imm<opc, OpcodeStr, OpNode, _.info512>,
+ avx512_fp_sae_packed_imm<opc, OpcodeStr, OpNode, _.info512>,
+ EVEX_V512;
+
+ }
+ let Predicates = [prd, HasVLX] in {
+ defm Z128 : avx512_fp_packed_imm<opc, OpcodeStr, OpNode, _.info128>,
+ EVEX_V128;
+ defm Z256 : avx512_fp_packed_imm<opc, OpcodeStr, OpNode, _.info256>,
+ EVEX_V256;
+ }
+}
+
+multiclass avx512_common_3Op_rm_imm8<bits<8> opc, SDNode OpNode, string OpStr,
+ AVX512VLVectorVTInfo DestInfo, AVX512VLVectorVTInfo SrcInfo>{
+ let Predicates = [HasBWI] in {
+ defm Z : avx512_3Op_rm_imm8<opc, OpStr, OpNode, DestInfo.info512,
+ SrcInfo.info512>, EVEX_V512, AVX512AIi8Base, EVEX_4V;
+ }
+ let Predicates = [HasBWI, HasVLX] in {
+ defm Z128 : avx512_3Op_rm_imm8<opc, OpStr, OpNode, DestInfo.info128,
+ SrcInfo.info128>, EVEX_V128, AVX512AIi8Base, EVEX_4V;
+ defm Z256 : avx512_3Op_rm_imm8<opc, OpStr, OpNode, DestInfo.info256,
+ SrcInfo.info256>, EVEX_V256, AVX512AIi8Base, EVEX_4V;
+ }
+}
+
+multiclass avx512_common_3Op_imm8<string OpcodeStr, AVX512VLVectorVTInfo _,
+ bits<8> opc, SDNode OpNode>{
+ let Predicates = [HasAVX512] in {
+ defm Z : avx512_3Op_imm8<opc, OpcodeStr, OpNode, _.info512>, EVEX_V512;
+ }
+ let Predicates = [HasAVX512, HasVLX] in {
+ defm Z128 : avx512_3Op_imm8<opc, OpcodeStr, OpNode, _.info128>, EVEX_V128;
+ defm Z256 : avx512_3Op_imm8<opc, OpcodeStr, OpNode, _.info256>, EVEX_V256;
+ }
+}
+
+multiclass avx512_common_fp_sae_scalar_imm<string OpcodeStr,
+ X86VectorVTInfo _, bits<8> opc, SDNode OpNode, Predicate prd>{
+ let Predicates = [prd] in {
+ defm Z128 : avx512_fp_scalar_imm<opc, OpcodeStr, OpNode, _>,
+ avx512_fp_sae_scalar_imm<opc, OpcodeStr, OpNode, _>;
+ }
+}
+
+multiclass avx512_common_unary_fp_sae_packed_imm_all<string OpcodeStr,
+ bits<8> opcPs, bits<8> opcPd, SDNode OpNode, Predicate prd>{
+ defm PS : avx512_common_unary_fp_sae_packed_imm<OpcodeStr, avx512vl_f32_info,
+ opcPs, OpNode, prd>, EVEX_CD8<32, CD8VF>;
+ defm PD : avx512_common_unary_fp_sae_packed_imm<OpcodeStr, avx512vl_f64_info,
+ opcPd, OpNode, prd>, EVEX_CD8<64, CD8VF>, VEX_W;
+}
+
+defm VFIXUPIMMPD : avx512_common_fp_sae_packed_imm<"vfixupimmpd",
+ avx512vl_f64_info, 0x54, X86VFixupimm, HasAVX512>,
+ AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W;
+defm VFIXUPIMMPS : avx512_common_fp_sae_packed_imm<"vfixupimmps",
+ avx512vl_f32_info, 0x54, X86VFixupimm, HasAVX512>,
+ AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>;
+
+defm VFIXUPIMMSD: avx512_common_fp_sae_scalar_imm<"vfixupimmsd", f64x_info,
+ 0x55, X86VFixupimm, HasAVX512>,
+ AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W;
+defm VFIXUPIMMSS: avx512_common_fp_sae_scalar_imm<"vfixupimmss", f32x_info,
+ 0x55, X86VFixupimm, HasAVX512>,
+ AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>;
+
+defm VREDUCE : avx512_common_unary_fp_sae_packed_imm_all<"vreduce", 0x56, 0x56,
+ X86VReduce, HasDQI>, AVX512AIi8Base, EVEX;
+defm VRNDSCALE : avx512_common_unary_fp_sae_packed_imm_all<"vrndscale", 0x08, 0x09,
+ X86VRndScale, HasAVX512>, AVX512AIi8Base, EVEX;
+defm VGETMANT : avx512_common_unary_fp_sae_packed_imm_all<"vgetmant", 0x26, 0x26,
+ X86VGetMant, HasAVX512>, AVX512AIi8Base, EVEX;
+
+
+defm VRANGEPD : avx512_common_fp_sae_packed_imm<"vrangepd", avx512vl_f64_info,
+ 0x50, X86VRange, HasDQI>,
+ AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W;
+defm VRANGEPS : avx512_common_fp_sae_packed_imm<"vrangeps", avx512vl_f32_info,
+ 0x50, X86VRange, HasDQI>,
+ AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>;
+
+defm VRANGESD: avx512_common_fp_sae_scalar_imm<"vrangesd", f64x_info,
+ 0x51, X86VRange, HasDQI>,
+ AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W;
+defm VRANGESS: avx512_common_fp_sae_scalar_imm<"vrangess", f32x_info,
+ 0x51, X86VRange, HasDQI>,
+ AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>;
+
+defm VREDUCESD: avx512_common_fp_sae_scalar_imm<"vreducesd", f64x_info,
+ 0x57, X86Reduces, HasDQI>,
+ AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W;
+defm VREDUCESS: avx512_common_fp_sae_scalar_imm<"vreducess", f32x_info,
+ 0x57, X86Reduces, HasDQI>,
+ AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>;
+
+defm VGETMANTSD: avx512_common_fp_sae_scalar_imm<"vgetmantsd", f64x_info,
+ 0x27, X86GetMants, HasAVX512>,
+ AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W;
+defm VGETMANTSS: avx512_common_fp_sae_scalar_imm<"vgetmantss", f32x_info,
+ 0x27, X86GetMants, HasAVX512>,
+ AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>;
+
+multiclass avx512_shuff_packed_128<string OpcodeStr, AVX512VLVectorVTInfo _,
+ bits<8> opc, SDNode OpNode = X86Shuf128>{
+ let Predicates = [HasAVX512] in {
+ defm Z : avx512_3Op_imm8<opc, OpcodeStr, OpNode, _.info512>, EVEX_V512;
+
+ }
+ let Predicates = [HasAVX512, HasVLX] in {
+ defm Z256 : avx512_3Op_imm8<opc, OpcodeStr, OpNode, _.info256>, EVEX_V256;
+ }
+}
+let Predicates = [HasAVX512] in {
+def : Pat<(v16f32 (ffloor VR512:$src)),
+ (VRNDSCALEPSZrri VR512:$src, (i32 0x1))>;
+def : Pat<(v16f32 (fnearbyint VR512:$src)),
+ (VRNDSCALEPSZrri VR512:$src, (i32 0xC))>;
+def : Pat<(v16f32 (fceil VR512:$src)),
+ (VRNDSCALEPSZrri VR512:$src, (i32 0x2))>;
+def : Pat<(v16f32 (frint VR512:$src)),
+ (VRNDSCALEPSZrri VR512:$src, (i32 0x4))>;
+def : Pat<(v16f32 (ftrunc VR512:$src)),
+ (VRNDSCALEPSZrri VR512:$src, (i32 0x3))>;
+
+def : Pat<(v8f64 (ffloor VR512:$src)),
+ (VRNDSCALEPDZrri VR512:$src, (i32 0x1))>;
+def : Pat<(v8f64 (fnearbyint VR512:$src)),
+ (VRNDSCALEPDZrri VR512:$src, (i32 0xC))>;
+def : Pat<(v8f64 (fceil VR512:$src)),
+ (VRNDSCALEPDZrri VR512:$src, (i32 0x2))>;
+def : Pat<(v8f64 (frint VR512:$src)),
+ (VRNDSCALEPDZrri VR512:$src, (i32 0x4))>;
+def : Pat<(v8f64 (ftrunc VR512:$src)),
+ (VRNDSCALEPDZrri VR512:$src, (i32 0x3))>;
+}
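+
+// Editor's note on the immediates above: in VRNDSCALE's imm8, bits 1:0 pick
+// the rounding mode (01 = down for ffloor, 10 = up for fceil, 11 = truncate
+// for ftrunc), bit 2 (0x4) selects the current MXCSR mode for frint, and bit
+// 3 (0x8) additionally suppresses the inexact exception, giving 0xC for
+// fnearbyint. The upper four scale bits stay zero for plain rounding.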
+
+defm VSHUFF32X4 : avx512_shuff_packed_128<"vshuff32x4",avx512vl_f32_info, 0x23>,
+ AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>;
+defm VSHUFF64X2 : avx512_shuff_packed_128<"vshuff64x2",avx512vl_f64_info, 0x23>,
+ AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W;
+defm VSHUFI32X4 : avx512_shuff_packed_128<"vshufi32x4",avx512vl_i32_info, 0x43>,
+ AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>;
+defm VSHUFI64X2 : avx512_shuff_packed_128<"vshufi64x2",avx512vl_i64_info, 0x43>,
+ AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W;
+
+multiclass avx512_valign<string OpcodeStr, AVX512VLVectorVTInfo VTInfo_I> {
+ defm NAME: avx512_common_3Op_imm8<OpcodeStr, VTInfo_I, 0x03, X86VAlign>,
+ AVX512AIi8Base, EVEX_4V;
+}
+
+defm VALIGND: avx512_valign<"valignd", avx512vl_i32_info>,
+ EVEX_CD8<32, CD8VF>;
+defm VALIGNQ: avx512_valign<"valignq", avx512vl_i64_info>,
+ EVEX_CD8<64, CD8VF>, VEX_W;
+
+multiclass avx512_vpalign_lowering<X86VectorVTInfo _, list<Predicate> p>{
+ let Predicates = p in
+ def NAME#_.VTName#rri:
+ Pat<(_.VT (X86PAlignr _.RC:$src1, _.RC:$src2, (i8 imm:$imm))),
+ (!cast<Instruction>(NAME#_.ZSuffix#rri)
+ _.RC:$src1, _.RC:$src2, imm:$imm)>;
+}
+
+multiclass avx512_vpalign_lowering_common<AVX512VLVectorVTInfo _>:
+ avx512_vpalign_lowering<_.info512, [HasBWI]>,
+ avx512_vpalign_lowering<_.info128, [HasBWI, HasVLX]>,
+ avx512_vpalign_lowering<_.info256, [HasBWI, HasVLX]>;
+
+defm VPALIGN: avx512_common_3Op_rm_imm8<0x0F, X86PAlignr, "vpalignr",
+ avx512vl_i8_info, avx512vl_i8_info>,
+ avx512_vpalign_lowering_common<avx512vl_i16_info>,
+ avx512_vpalign_lowering_common<avx512vl_i32_info>,
+ avx512_vpalign_lowering_common<avx512vl_f32_info>,
+ avx512_vpalign_lowering_common<avx512vl_i64_info>,
+ avx512_vpalign_lowering_common<avx512vl_f64_info>,
+ EVEX_CD8<8, CD8VF>;
+
+defm VDBPSADBW: avx512_common_3Op_rm_imm8<0x42, X86dbpsadbw, "vdbpsadbw",
+ avx512vl_i16_info, avx512vl_i8_info>, EVEX_CD8<8, CD8VF>;
+
+multiclass avx512_unary_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _> {
+ defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1), OpcodeStr,
+ "$src1", "$src1",
+ (_.VT (OpNode _.RC:$src1))>, EVEX, AVX5128IBase;
+
+ let mayLoad = 1 in
+ defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.MemOp:$src1), OpcodeStr,
+ "$src1", "$src1",
+ (_.VT (OpNode (bitconvert (_.LdFrag addr:$src1))))>,
+ EVEX, AVX5128IBase, EVEX_CD8<_.EltSize, CD8VF>;
+}
+
+multiclass avx512_unary_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _> :
+ avx512_unary_rm<opc, OpcodeStr, OpNode, _> {
+ let mayLoad = 1 in
+ defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.ScalarMemOp:$src1), OpcodeStr,
+ "${src1}"##_.BroadcastStr,
+ "${src1}"##_.BroadcastStr,
+ (_.VT (OpNode (X86VBroadcast
+ (_.ScalarLdFrag addr:$src1))))>,
+ EVEX, AVX5128IBase, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>;
+}
+
+multiclass avx512_unary_rm_vl<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ AVX512VLVectorVTInfo VTInfo, Predicate prd> {
+ let Predicates = [prd] in
+ defm Z : avx512_unary_rm<opc, OpcodeStr, OpNode, VTInfo.info512>, EVEX_V512;
+
+ let Predicates = [prd, HasVLX] in {
+ defm Z256 : avx512_unary_rm<opc, OpcodeStr, OpNode, VTInfo.info256>,
+ EVEX_V256;
+ defm Z128 : avx512_unary_rm<opc, OpcodeStr, OpNode, VTInfo.info128>,
+ EVEX_V128;
+ }
+}
+
+multiclass avx512_unary_rmb_vl<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ AVX512VLVectorVTInfo VTInfo, Predicate prd> {
+ let Predicates = [prd] in
+ defm Z : avx512_unary_rmb<opc, OpcodeStr, OpNode, VTInfo.info512>,
+ EVEX_V512;
+
+ let Predicates = [prd, HasVLX] in {
+ defm Z256 : avx512_unary_rmb<opc, OpcodeStr, OpNode, VTInfo.info256>,
+ EVEX_V256;
+ defm Z128 : avx512_unary_rmb<opc, OpcodeStr, OpNode, VTInfo.info128>,
+ EVEX_V128;
+ }
+}
+
+multiclass avx512_unary_rm_vl_dq<bits<8> opc_d, bits<8> opc_q, string OpcodeStr,
+ SDNode OpNode, Predicate prd> {
+ defm Q : avx512_unary_rmb_vl<opc_q, OpcodeStr#"q", OpNode, avx512vl_i64_info,
+ prd>, VEX_W;
+ defm D : avx512_unary_rmb_vl<opc_d, OpcodeStr#"d", OpNode, avx512vl_i32_info,
+ prd>;
+}
+
+multiclass avx512_unary_rm_vl_bw<bits<8> opc_b, bits<8> opc_w, string OpcodeStr,
+ SDNode OpNode, Predicate prd> {
+ defm W : avx512_unary_rm_vl<opc_w, OpcodeStr#"w", OpNode, avx512vl_i16_info, prd>;
+ defm B : avx512_unary_rm_vl<opc_b, OpcodeStr#"b", OpNode, avx512vl_i8_info, prd>;
+}
+
+multiclass avx512_unary_rm_vl_all<bits<8> opc_b, bits<8> opc_w,
+ bits<8> opc_d, bits<8> opc_q,
+ string OpcodeStr, SDNode OpNode> {
+ defm NAME : avx512_unary_rm_vl_dq<opc_d, opc_q, OpcodeStr, OpNode,
+ HasAVX512>,
+ avx512_unary_rm_vl_bw<opc_b, opc_w, OpcodeStr, OpNode,
+ HasBWI>;
+}
+
+defm VPABS : avx512_unary_rm_vl_all<0x1C, 0x1D, 0x1E, 0x1F, "vpabs", X86Abs>;
+
+def : Pat<(xor
+ (bc_v16i32 (v16i1sextv16i32)),
+ (bc_v16i32 (add (v16i32 VR512:$src), (v16i1sextv16i32)))),
+ (VPABSDZrr VR512:$src)>;
+def : Pat<(xor
+ (bc_v8i64 (v8i1sextv8i64)),
+ (bc_v8i64 (add (v8i64 VR512:$src), (v8i1sextv8i64)))),
+ (VPABSQZrr VR512:$src)>;
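+
+// Editor's note: the two patterns above recognize the branch-free abs idiom
+//   abs(x) = (x + m) ^ m,  m = x >>s (elt_bits - 1)
+// where the v16i1sextv16i32/v8i1sextv8i64 leaves match the sign-mask shift,
+// folding the whole expression into a single VPABSD/VPABSQ.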
+
+multiclass avx512_ctlz<bits<8> opc, string OpcodeStr, Predicate prd>{
+
+ defm NAME : avx512_unary_rm_vl_dq<opc, opc, OpcodeStr, ctlz, prd>;
+}
+
+defm VPLZCNT : avx512_ctlz<0x44, "vplzcnt", HasCDI>;
+defm VPCONFLICT : avx512_unary_rm_vl_dq<0xC4, 0xC4, "vpconflict", X86Conflict, HasCDI>;
+
+//===---------------------------------------------------------------------===//
+// Replicate Single FP - MOVSHDUP and MOVSLDUP
+//===---------------------------------------------------------------------===//
+multiclass avx512_replicate<bits<8> opc, string OpcodeStr, SDNode OpNode>{
+ defm NAME: avx512_unary_rm_vl<opc, OpcodeStr, OpNode, avx512vl_f32_info,
+ HasAVX512>, XS;
+}
+
+defm VMOVSHDUP : avx512_replicate<0x16, "vmovshdup", X86Movshdup>;
+defm VMOVSLDUP : avx512_replicate<0x12, "vmovsldup", X86Movsldup>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 - MOVDDUP
+//===----------------------------------------------------------------------===//
+
+multiclass avx512_movddup_128<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _> {
+ defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src), OpcodeStr, "$src", "$src",
+ (_.VT (OpNode (_.VT _.RC:$src)))>, EVEX;
+ let mayLoad = 1 in
+ defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.ScalarMemOp:$src), OpcodeStr, "$src", "$src",
+ (_.VT (OpNode (_.VT (scalar_to_vector
+ (_.ScalarLdFrag addr:$src)))))>,
+ EVEX, EVEX_CD8<_.EltSize, CD8VH>;
+}
+
+multiclass avx512_movddup_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ AVX512VLVectorVTInfo VTInfo> {
+
+ defm Z : avx512_unary_rm<opc, OpcodeStr, OpNode, VTInfo.info512>, EVEX_V512;
+
+ let Predicates = [HasAVX512, HasVLX] in {
+ defm Z256 : avx512_unary_rm<opc, OpcodeStr, OpNode, VTInfo.info256>,
+ EVEX_V256;
+ defm Z128 : avx512_movddup_128<opc, OpcodeStr, OpNode, VTInfo.info128>,
+ EVEX_V128;
+ }
+}
+
+multiclass avx512_movddup<bits<8> opc, string OpcodeStr, SDNode OpNode>{
+ defm NAME: avx512_movddup_common<opc, OpcodeStr, OpNode,
+ avx512vl_f64_info>, XD, VEX_W;
+}
+
+defm VMOVDDUP : avx512_movddup<0x12, "vmovddup", X86Movddup>;
+
+def : Pat<(X86Movddup (loadv2f64 addr:$src)),
+ (VMOVDDUPZ128rm addr:$src)>, Requires<[HasAVX512, HasVLX]>;
+def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))),
+ (VMOVDDUPZ128rm addr:$src)>, Requires<[HasAVX512, HasVLX]>;
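+
+// Illustrative semantics (editor's note): the 128-bit form duplicates the low
+// double, so a 64-bit scalar load suffices, e.g.
+//   vmovddup (%rdi), %xmm0    (xmm0 = [mem64, mem64])
+// which is what the scalar_to_vector memory form and the two load patterns
+// above exploit.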
+
+//===----------------------------------------------------------------------===//
+// AVX-512 - Unpack Instructions
+//===----------------------------------------------------------------------===//
+defm VUNPCKH : avx512_fp_binop_p<0x15, "vunpckh", X86Unpckh>;
+defm VUNPCKL : avx512_fp_binop_p<0x14, "vunpckl", X86Unpckl>;
+
+defm VPUNPCKLBW : avx512_binop_rm_vl_b<0x60, "vpunpcklbw", X86Unpckl,
+ SSE_INTALU_ITINS_P, HasBWI>;
+defm VPUNPCKHBW : avx512_binop_rm_vl_b<0x68, "vpunpckhbw", X86Unpckh,
+ SSE_INTALU_ITINS_P, HasBWI>;
+defm VPUNPCKLWD : avx512_binop_rm_vl_w<0x61, "vpunpcklwd", X86Unpckl,
+ SSE_INTALU_ITINS_P, HasBWI>;
+defm VPUNPCKHWD : avx512_binop_rm_vl_w<0x69, "vpunpckhwd", X86Unpckh,
+ SSE_INTALU_ITINS_P, HasBWI>;
+
+defm VPUNPCKLDQ : avx512_binop_rm_vl_d<0x62, "vpunpckldq", X86Unpckl,
+ SSE_INTALU_ITINS_P, HasAVX512>;
+defm VPUNPCKHDQ : avx512_binop_rm_vl_d<0x6A, "vpunpckhdq", X86Unpckh,
+ SSE_INTALU_ITINS_P, HasAVX512>;
+defm VPUNPCKLQDQ : avx512_binop_rm_vl_q<0x6C, "vpunpcklqdq", X86Unpckl,
+ SSE_INTALU_ITINS_P, HasAVX512>;
+defm VPUNPCKHQDQ : avx512_binop_rm_vl_q<0x6D, "vpunpckhqdq", X86Unpckh,
+ SSE_INTALU_ITINS_P, HasAVX512>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 - Extract & Insert Integer Instructions
+//===----------------------------------------------------------------------===//
+
+multiclass avx512_extract_elt_bw_m<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _> {
+ let mayStore = 1 in
+ def mr : AVX512Ii8<opc, MRMDestMem, (outs),
+ (ins _.ScalarMemOp:$dst, _.RC:$src1, u8imm:$src2),
+ OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(store (_.EltVT (trunc (assertzext (OpNode (_.VT _.RC:$src1),
+ imm:$src2)))),
+ addr:$dst)]>,
+ EVEX, EVEX_CD8<_.EltSize, CD8VT1>;
+}
+
+multiclass avx512_extract_elt_b<string OpcodeStr, X86VectorVTInfo _> {
+ let Predicates = [HasBWI] in {
+ def rr : AVX512Ii8<0x14, MRMDestReg, (outs GR32orGR64:$dst),
+ (ins _.RC:$src1, u8imm:$src2),
+ OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR32orGR64:$dst,
+ (X86pextrb (_.VT _.RC:$src1), imm:$src2))]>,
+ EVEX, TAPD;
+
+ defm NAME : avx512_extract_elt_bw_m<0x14, OpcodeStr, X86pextrb, _>, TAPD;
+ }
+}
+
+multiclass avx512_extract_elt_w<string OpcodeStr, X86VectorVTInfo _> {
+ let Predicates = [HasBWI] in {
+ def rr : AVX512Ii8<0xC5, MRMSrcReg, (outs GR32orGR64:$dst),
+ (ins _.RC:$src1, u8imm:$src2),
+ OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR32orGR64:$dst,
+ (X86pextrw (_.VT _.RC:$src1), imm:$src2))]>,
+ EVEX, PD;
+
+ def rr_REV : AVX512Ii8<0x15, MRMDestReg, (outs GR32orGR64:$dst),
+ (ins _.RC:$src1, u8imm:$src2),
+ OpcodeStr#".s\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
+ EVEX, TAPD;
+
+ defm NAME : avx512_extract_elt_bw_m<0x15, OpcodeStr, X86pextrw, _>, TAPD;
+ }
+}
+
+multiclass avx512_extract_elt_dq<string OpcodeStr, X86VectorVTInfo _,
+ RegisterClass GRC> {
+ let Predicates = [HasDQI] in {
+ def rr : AVX512Ii8<0x16, MRMDestReg, (outs GRC:$dst),
+ (ins _.RC:$src1, u8imm:$src2),
+ OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GRC:$dst,
+ (extractelt (_.VT _.RC:$src1), imm:$src2))]>,
+ EVEX, TAPD;
+
+ let mayStore = 1 in
+ def mr : AVX512Ii8<0x16, MRMDestMem, (outs),
+ (ins _.ScalarMemOp:$dst, _.RC:$src1, u8imm:$src2),
+ OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(store (extractelt (_.VT _.RC:$src1),
+ imm:$src2),addr:$dst)]>,
+ EVEX, EVEX_CD8<_.EltSize, CD8VT1>, TAPD;
+ }
+}
+
+defm VPEXTRBZ : avx512_extract_elt_b<"vpextrb", v16i8x_info>;
+defm VPEXTRWZ : avx512_extract_elt_w<"vpextrw", v8i16x_info>;
+defm VPEXTRDZ : avx512_extract_elt_dq<"vpextrd", v4i32x_info, GR32>;
+defm VPEXTRQZ : avx512_extract_elt_dq<"vpextrq", v2i64x_info, GR64>, VEX_W;
+
+multiclass avx512_insert_elt_m<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _, PatFrag LdFrag> {
+ def rm : AVX512Ii8<opc, MRMSrcMem, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$src3),
+ OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ [(set _.RC:$dst,
+ (_.VT (OpNode _.RC:$src1, (LdFrag addr:$src2), imm:$src3)))]>,
+ EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>;
+}
+
+multiclass avx512_insert_elt_bw<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _, PatFrag LdFrag> {
+ let Predicates = [HasBWI] in {
+ def rr : AVX512Ii8<opc, MRMSrcReg, (outs _.RC:$dst),
+ (ins _.RC:$src1, GR32orGR64:$src2, u8imm:$src3),
+ OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ [(set _.RC:$dst,
+ (OpNode _.RC:$src1, GR32orGR64:$src2, imm:$src3))]>, EVEX_4V;
+
+ defm NAME : avx512_insert_elt_m<opc, OpcodeStr, OpNode, _, LdFrag>;
+ }
+}
+
+multiclass avx512_insert_elt_dq<bits<8> opc, string OpcodeStr,
+ X86VectorVTInfo _, RegisterClass GRC> {
+ let Predicates = [HasDQI] in {
+ def rr : AVX512Ii8<opc, MRMSrcReg, (outs _.RC:$dst),
+ (ins _.RC:$src1, GRC:$src2, u8imm:$src3),
+ OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ [(set _.RC:$dst,
+ (_.VT (insertelt _.RC:$src1, GRC:$src2, imm:$src3)))]>,
+ EVEX_4V, TAPD;
+
+ defm NAME : avx512_insert_elt_m<opc, OpcodeStr, insertelt, _,
+ _.ScalarLdFrag>, TAPD;
+ }
+}
+
+defm VPINSRBZ : avx512_insert_elt_bw<0x20, "vpinsrb", X86pinsrb, v16i8x_info,
+ extloadi8>, TAPD;
+defm VPINSRWZ : avx512_insert_elt_bw<0xC4, "vpinsrw", X86pinsrw, v8i16x_info,
+ extloadi16>, PD;
+defm VPINSRDZ : avx512_insert_elt_dq<0x22, "vpinsrd", v4i32x_info, GR32>;
+defm VPINSRQZ : avx512_insert_elt_dq<0x22, "vpinsrq", v2i64x_info, GR64>, VEX_W;
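+
+// Illustrative usage (editor's note): these EVEX forms mirror the SSE4.1
+// extract/insert pair, e.g.
+//   vpextrd $2, %xmm1, %eax          (read element 2 into a GPR)
+//   vpinsrd $2, %eax, %xmm1, %xmm2   (copy the vector with element 2 replaced)
+// with the byte/word forms gated on BWI and the dword/qword forms on DQI.
+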
+//===----------------------------------------------------------------------===//
+// VSHUFPS - VSHUFPD Operations
+//===----------------------------------------------------------------------===//
+multiclass avx512_shufp<string OpcodeStr, AVX512VLVectorVTInfo VTInfo_I,
+ AVX512VLVectorVTInfo VTInfo_FP>{
+ defm NAME: avx512_common_3Op_imm8<OpcodeStr, VTInfo_FP, 0xC6, X86Shufp>,
+ EVEX_CD8<VTInfo_FP.info512.EltSize, CD8VF>,
+ AVX512AIi8Base, EVEX_4V;
+}
+
+defm VSHUFPS: avx512_shufp<"vshufps", avx512vl_i32_info, avx512vl_f32_info>, PS;
+defm VSHUFPD: avx512_shufp<"vshufpd", avx512vl_i64_info, avx512vl_f64_info>, PD, VEX_W;
+//===----------------------------------------------------------------------===//
+// AVX-512 - Byte shift Left/Right
+//===----------------------------------------------------------------------===//
+
+multiclass avx512_shift_packed<bits<8> opc, SDNode OpNode, Format MRMr,
+ Format MRMm, string OpcodeStr, X86VectorVTInfo _>{
+ def rr : AVX512<opc, MRMr,
+ (outs _.RC:$dst), (ins _.RC:$src1, u8imm:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set _.RC:$dst,(_.VT (OpNode _.RC:$src1, (i8 imm:$src2))))]>;
+ let mayLoad = 1 in
+ def rm : AVX512<opc, MRMm,
+ (outs _.RC:$dst), (ins _.MemOp:$src1, u8imm:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set _.RC:$dst,(_.VT (OpNode
+ (_.LdFrag addr:$src1), (i8 imm:$src2))))]>;
+}
+
+multiclass avx512_shift_packed_all<bits<8> opc, SDNode OpNode, Format MRMr,
+ Format MRMm, string OpcodeStr, Predicate prd>{
+ let Predicates = [prd] in
+ defm Z512 : avx512_shift_packed<opc, OpNode, MRMr, MRMm,
+ OpcodeStr, v8i64_info>, EVEX_V512;
+ let Predicates = [prd, HasVLX] in {
+ defm Z256 : avx512_shift_packed<opc, OpNode, MRMr, MRMm,
+ OpcodeStr, v4i64x_info>, EVEX_V256;
+ defm Z128 : avx512_shift_packed<opc, OpNode, MRMr, MRMm,
+ OpcodeStr, v2i64x_info>, EVEX_V128;
+ }
+}
+defm VPSLLDQ : avx512_shift_packed_all<0x73, X86vshldq, MRM7r, MRM7m, "vpslldq",
+ HasBWI>, AVX512PDIi8Base, EVEX_4V;
+defm VPSRLDQ : avx512_shift_packed_all<0x73, X86vshrdq, MRM3r, MRM3m, "vpsrldq",
+ HasBWI>, AVX512PDIi8Base, EVEX_4V;
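+
+// Illustrative semantics (editor's note): unlike the element shifts, these
+// shift whole bytes, independently within each 128-bit lane, e.g.
+//   vpslldq $4, %zmm1, %zmm0
+// moves each lane's bytes up by four positions and shifts in zeros.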
+
+multiclass avx512_psadbw_packed<bits<8> opc, SDNode OpNode,
+ string OpcodeStr, X86VectorVTInfo _dst,
+ X86VectorVTInfo _src>{
+ def rr : AVX512BI<opc, MRMSrcReg,
+ (outs _dst.RC:$dst), (ins _src.RC:$src1, _src.RC:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set _dst.RC:$dst,(_dst.VT
+ (OpNode (_src.VT _src.RC:$src1),
+ (_src.VT _src.RC:$src2))))]>;
+ let mayLoad = 1 in
+ def rm : AVX512BI<opc, MRMSrcMem,
+ (outs _dst.RC:$dst), (ins _src.RC:$src1, _src.MemOp:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set _dst.RC:$dst,(_dst.VT
+ (OpNode (_src.VT _src.RC:$src1),
+ (_src.VT (bitconvert
+ (_src.LdFrag addr:$src2))))))]>;
+}
+
+multiclass avx512_psadbw_packed_all<bits<8> opc, SDNode OpNode,
+ string OpcodeStr, Predicate prd> {
+ let Predicates = [prd] in
+ defm Z512 : avx512_psadbw_packed<opc, OpNode, OpcodeStr, v8i64_info,
+ v64i8_info>, EVEX_V512;
+ let Predicates = [prd, HasVLX] in {
+ defm Z256 : avx512_psadbw_packed<opc, OpNode, OpcodeStr, v4i64x_info,
+ v32i8x_info>, EVEX_V256;
+ defm Z128 : avx512_psadbw_packed<opc, OpNode, OpcodeStr, v2i64x_info,
+ v16i8x_info>, EVEX_V128;
+ }
+}
+
+defm VPSADBW : avx512_psadbw_packed_all<0xf6, X86psadbw, "vpsadbw",
+ HasBWI>, EVEX_4V;
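+
+// Illustrative semantics (editor's note): vpsadbw sums the absolute
+// differences of each aligned group of eight byte pairs and leaves one 16-bit
+// sum, zero-extended, in each i64 destination element, e.g.
+//   vpsadbw %zmm1, %zmm2, %zmm0
+// produces eight such sums across a 512-bit register.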
+
+multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _>{
+ let Constraints = "$src1 = $dst" in {
+ defm rri : AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.RC:$src3, u8imm:$src4),
+                      OpcodeStr, "$src4, $src3, $src2", "$src2, $src3, $src4",
+ (OpNode (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ (_.VT _.RC:$src3),
+ (i8 imm:$src4))>, AVX512AIi8Base, EVEX_4V;
+ let mayLoad = 1 in {
+ defm rmi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.MemOp:$src3, u8imm:$src4),
+                      OpcodeStr, "$src4, $src3, $src2", "$src2, $src3, $src4",
+ (OpNode (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ (_.VT (bitconvert (_.LdFrag addr:$src3))),
+ (i8 imm:$src4))>,
+ AVX512AIi8Base, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>;
+ defm rmbi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.ScalarMemOp:$src3, u8imm:$src4),
+ OpcodeStr, "$src4, ${src3}"##_.BroadcastStr##", $src2",
+ "$src2, ${src3}"##_.BroadcastStr##", $src4",
+ (OpNode (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src3))),
+ (i8 imm:$src4))>, EVEX_B,
+ AVX512AIi8Base, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>;
+ }
+ }// Constraints = "$src1 = $dst"
+}
+
+multiclass avx512_common_ternlog<string OpcodeStr, AVX512VLVectorVTInfo _>{
+ let Predicates = [HasAVX512] in
+ defm Z : avx512_ternlog<0x25, OpcodeStr, X86vpternlog, _.info512>, EVEX_V512;
+ let Predicates = [HasAVX512, HasVLX] in {
+ defm Z128 : avx512_ternlog<0x25, OpcodeStr, X86vpternlog, _.info128>, EVEX_V128;
+ defm Z256 : avx512_ternlog<0x25, OpcodeStr, X86vpternlog, _.info256>, EVEX_V256;
+ }
+}
+
+defm VPTERNLOGD : avx512_common_ternlog<"vpternlogd", avx512vl_i32_info>;
+defm VPTERNLOGQ : avx512_common_ternlog<"vpternlogq", avx512vl_i64_info>, VEX_W;
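+
+// Illustrative semantics (editor's note): the imm8 of vpternlog is a
+// three-input truth table indexed bitwise by (src1, src2, src3), e.g.
+//   vpternlogd $0xE8, %zmm3, %zmm2, %zmm1
+// computes the bitwise majority of the three sources into $src1/$dst
+// (0xE8 has a 1 exactly where at least two index bits are set).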
+
diff --git a/contrib/llvm/lib/Target/X86/X86InstrArithmetic.td b/contrib/llvm/lib/Target/X86/X86InstrArithmetic.td
new file mode 100644
index 0000000..1a2e786
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrArithmetic.td
@@ -0,0 +1,1375 @@
+//===-- X86InstrArithmetic.td - Integer Arithmetic Instrs --*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the integer arithmetic instructions in the X86
+// architecture.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// LEA - Load Effective Address
+let SchedRW = [WriteLEA] in {
+let hasSideEffects = 0 in
+def LEA16r : I<0x8D, MRMSrcMem,
+ (outs GR16:$dst), (ins anymem:$src),
+ "lea{w}\t{$src|$dst}, {$dst|$src}", [], IIC_LEA_16>, OpSize16;
+let isReMaterializable = 1 in
+def LEA32r : I<0x8D, MRMSrcMem,
+ (outs GR32:$dst), (ins anymem:$src),
+ "lea{l}\t{$src|$dst}, {$dst|$src}",
+ [(set GR32:$dst, lea32addr:$src)], IIC_LEA>,
+ OpSize32, Requires<[Not64BitMode]>;
+
+def LEA64_32r : I<0x8D, MRMSrcMem,
+ (outs GR32:$dst), (ins lea64_32mem:$src),
+ "lea{l}\t{$src|$dst}, {$dst|$src}",
+ [(set GR32:$dst, lea64_32addr:$src)], IIC_LEA>,
+ OpSize32, Requires<[In64BitMode]>;
+
+let isReMaterializable = 1 in
+def LEA64r : RI<0x8D, MRMSrcMem, (outs GR64:$dst), (ins lea64mem:$src),
+ "lea{q}\t{$src|$dst}, {$dst|$src}",
+ [(set GR64:$dst, lea64addr:$src)], IIC_LEA>;
+} // SchedRW
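+
+// Illustrative usage (editor's note): since LEA only evaluates its address
+// expression, it doubles as a three-operand add/shift that leaves EFLAGS
+// untouched, e.g.
+//   leaq 8(%rdi,%rsi,4), %rax    (rax = rdi + rsi*4 + 8)
+// and, being a pure function of its register operands, LEA32r/LEA64r can be
+// rematerialized instead of spilled.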
+
+//===----------------------------------------------------------------------===//
+// Fixed-Register Multiplication and Division Instructions.
+//
+
+// SchedModel info for instructions that load one value and get the second
+// (and possibly third) value from a register.
+// This is used for instructions that put the memory operand before the other
+// register uses.
+class SchedLoadReg<SchedWrite SW> : Sched<[SW,
+ // Memory operand.
+ ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault,
+ // Register reads (implicit or explicit).
+ ReadAfterLd, ReadAfterLd]>;
+
+// Extra precision multiplication
+
+// AL is really implied by AX, but the registers in Defs must match the
+// SDNode results (i8, i32).
+// AL,AH = AL*GR8
+let Defs = [AL,EFLAGS,AX], Uses = [AL] in
+def MUL8r : I<0xF6, MRM4r, (outs), (ins GR8:$src), "mul{b}\t$src",
+ // FIXME: Used for 8-bit mul, ignore result upper 8 bits.
+ // This probably ought to be moved to a def : Pat<> if the
+ // syntax can be accepted.
+ [(set AL, (mul AL, GR8:$src)),
+ (implicit EFLAGS)], IIC_MUL8>, Sched<[WriteIMul]>;
+// AX,DX = AX*GR16
+let Defs = [AX,DX,EFLAGS], Uses = [AX], hasSideEffects = 0 in
+def MUL16r : I<0xF7, MRM4r, (outs), (ins GR16:$src),
+ "mul{w}\t$src",
+ [], IIC_MUL16_REG>, OpSize16, Sched<[WriteIMul]>;
+// EAX,EDX = EAX*GR32
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX], hasSideEffects = 0 in
+def MUL32r : I<0xF7, MRM4r, (outs), (ins GR32:$src),
+ "mul{l}\t$src",
+ [/*(set EAX, EDX, EFLAGS, (X86umul_flag EAX, GR32:$src))*/],
+ IIC_MUL32_REG>, OpSize32, Sched<[WriteIMul]>;
+// RAX,RDX = RAX*GR64
+let Defs = [RAX,RDX,EFLAGS], Uses = [RAX], hasSideEffects = 0 in
+def MUL64r : RI<0xF7, MRM4r, (outs), (ins GR64:$src),
+ "mul{q}\t$src",
+ [/*(set RAX, RDX, EFLAGS, (X86umul_flag RAX, GR64:$src))*/],
+ IIC_MUL64>, Sched<[WriteIMul]>;
+// AL,AH = AL*[mem8]
+let Defs = [AL,EFLAGS,AX], Uses = [AL] in
+def MUL8m : I<0xF6, MRM4m, (outs), (ins i8mem :$src),
+ "mul{b}\t$src",
+ // FIXME: Used for 8-bit mul, ignore result upper 8 bits.
+ // This probably ought to be moved to a def : Pat<> if the
+ // syntax can be accepted.
+ [(set AL, (mul AL, (loadi8 addr:$src))),
+ (implicit EFLAGS)], IIC_MUL8>, SchedLoadReg<WriteIMulLd>;
+// AX,DX = AX*[mem16]
+let mayLoad = 1, hasSideEffects = 0 in {
+let Defs = [AX,DX,EFLAGS], Uses = [AX] in
+def MUL16m : I<0xF7, MRM4m, (outs), (ins i16mem:$src),
+ "mul{w}\t$src",
+ [], IIC_MUL16_MEM>, OpSize16, SchedLoadReg<WriteIMulLd>;
+// EAX,EDX = EAX*[mem32]
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in
+def MUL32m : I<0xF7, MRM4m, (outs), (ins i32mem:$src),
+ "mul{l}\t$src",
+ [], IIC_MUL32_MEM>, OpSize32, SchedLoadReg<WriteIMulLd>;
+// RAX,RDX = RAX*[mem64]
+let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in
+def MUL64m : RI<0xF7, MRM4m, (outs), (ins i64mem:$src),
+ "mul{q}\t$src", [], IIC_MUL64>, SchedLoadReg<WriteIMulLd>;
+}
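+
+// Illustrative semantics (editor's note): these one-operand multiplies widen
+// into a fixed register pair, e.g.
+//   mulq %rbx    (RDX:RAX = RAX * RBX, unsigned, full 128-bit product)
+// which is why the pair appears in Defs rather than in the outs list.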
+
+let hasSideEffects = 0 in {
+// AL,AH = AL*GR8
+let Defs = [AL,EFLAGS,AX], Uses = [AL] in
+def IMUL8r : I<0xF6, MRM5r, (outs), (ins GR8:$src), "imul{b}\t$src", [],
+ IIC_IMUL8>, Sched<[WriteIMul]>;
+// AX,DX = AX*GR16
+let Defs = [AX,DX,EFLAGS], Uses = [AX] in
+def IMUL16r : I<0xF7, MRM5r, (outs), (ins GR16:$src), "imul{w}\t$src", [],
+ IIC_IMUL16_RR>, OpSize16, Sched<[WriteIMul]>;
+// EAX,EDX = EAX*GR32
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in
+def IMUL32r : I<0xF7, MRM5r, (outs), (ins GR32:$src), "imul{l}\t$src", [],
+ IIC_IMUL32_RR>, OpSize32, Sched<[WriteIMul]>;
+// RAX,RDX = RAX*GR64
+let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in
+def IMUL64r : RI<0xF7, MRM5r, (outs), (ins GR64:$src), "imul{q}\t$src", [],
+ IIC_IMUL64_RR>, Sched<[WriteIMul]>;
+
+let mayLoad = 1 in {
+// AL,AH = AL*[mem8]
+let Defs = [AL,EFLAGS,AX], Uses = [AL] in
+def IMUL8m : I<0xF6, MRM5m, (outs), (ins i8mem :$src),
+ "imul{b}\t$src", [], IIC_IMUL8>, SchedLoadReg<WriteIMulLd>;
+// AX,DX = AX*[mem16]
+let Defs = [AX,DX,EFLAGS], Uses = [AX] in
+def IMUL16m : I<0xF7, MRM5m, (outs), (ins i16mem:$src),
+ "imul{w}\t$src", [], IIC_IMUL16_MEM>, OpSize16,
+ SchedLoadReg<WriteIMulLd>;
+// EAX,EDX = EAX*[mem32]
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in
+def IMUL32m : I<0xF7, MRM5m, (outs), (ins i32mem:$src),
+ "imul{l}\t$src", [], IIC_IMUL32_MEM>, OpSize32,
+ SchedLoadReg<WriteIMulLd>;
+// RAX,RDX = RAX*[mem64]
+let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in
+def IMUL64m : RI<0xF7, MRM5m, (outs), (ins i64mem:$src),
+ "imul{q}\t$src", [], IIC_IMUL64>, SchedLoadReg<WriteIMulLd>;
+}
+} // hasSideEffects
+
+
+let Defs = [EFLAGS] in {
+let Constraints = "$src1 = $dst" in {
+
+let isCommutable = 1, SchedRW = [WriteIMul] in {
+// X = IMUL Y, Z --> X = IMUL Z, Y
+// Register-Register Signed Integer Multiply
+def IMUL16rr : I<0xAF, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src1,GR16:$src2),
+ "imul{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, EFLAGS,
+ (X86smul_flag GR16:$src1, GR16:$src2))], IIC_IMUL16_RR>,
+ TB, OpSize16;
+def IMUL32rr : I<0xAF, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src1,GR32:$src2),
+ "imul{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, EFLAGS,
+ (X86smul_flag GR32:$src1, GR32:$src2))], IIC_IMUL32_RR>,
+ TB, OpSize32;
+def IMUL64rr : RI<0xAF, MRMSrcReg, (outs GR64:$dst),
+ (ins GR64:$src1, GR64:$src2),
+ "imul{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, EFLAGS,
+ (X86smul_flag GR64:$src1, GR64:$src2))], IIC_IMUL64_RR>,
+ TB;
+} // isCommutable, SchedRW
+
+// Register-Memory Signed Integer Multiply
+let SchedRW = [WriteIMulLd, ReadAfterLd] in {
+def IMUL16rm : I<0xAF, MRMSrcMem, (outs GR16:$dst),
+ (ins GR16:$src1, i16mem:$src2),
+ "imul{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, EFLAGS,
+ (X86smul_flag GR16:$src1, (load addr:$src2)))],
+ IIC_IMUL16_RM>,
+ TB, OpSize16;
+def IMUL32rm : I<0xAF, MRMSrcMem, (outs GR32:$dst),
+ (ins GR32:$src1, i32mem:$src2),
+ "imul{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, EFLAGS,
+ (X86smul_flag GR32:$src1, (load addr:$src2)))],
+ IIC_IMUL32_RM>,
+ TB, OpSize32;
+def IMUL64rm : RI<0xAF, MRMSrcMem, (outs GR64:$dst),
+ (ins GR64:$src1, i64mem:$src2),
+ "imul{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, EFLAGS,
+ (X86smul_flag GR64:$src1, (load addr:$src2)))],
+ IIC_IMUL64_RM>,
+ TB;
+} // SchedRW
+} // Constraints = "$src1 = $dst"
+
+} // Defs = [EFLAGS]
+
+// Surprisingly enough, these are not two address instructions!
+let Defs = [EFLAGS] in {
+let SchedRW = [WriteIMul] in {
+// Register-Integer Signed Integer Multiply
+def IMUL16rri : Ii16<0x69, MRMSrcReg, // GR16 = GR16*I16
+ (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2),
+ "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR16:$dst, EFLAGS,
+ (X86smul_flag GR16:$src1, imm:$src2))],
+ IIC_IMUL16_RRI>, OpSize16;
+def IMUL16rri8 : Ii8<0x6B, MRMSrcReg, // GR16 = GR16*I8
+ (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2),
+ "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR16:$dst, EFLAGS,
+ (X86smul_flag GR16:$src1, i16immSExt8:$src2))],
+ IIC_IMUL16_RRI>, OpSize16;
+def IMUL32rri : Ii32<0x69, MRMSrcReg, // GR32 = GR32*I32
+ (outs GR32:$dst), (ins GR32:$src1, i32imm:$src2),
+ "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR32:$dst, EFLAGS,
+ (X86smul_flag GR32:$src1, imm:$src2))],
+ IIC_IMUL32_RRI>, OpSize32;
+def IMUL32rri8 : Ii8<0x6B, MRMSrcReg, // GR32 = GR32*I8
+ (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2),
+ "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR32:$dst, EFLAGS,
+ (X86smul_flag GR32:$src1, i32immSExt8:$src2))],
+ IIC_IMUL32_RRI>, OpSize32;
+def IMUL64rri32 : RIi32S<0x69, MRMSrcReg, // GR64 = GR64*I32
+ (outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2),
+ "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR64:$dst, EFLAGS,
+ (X86smul_flag GR64:$src1, i64immSExt32:$src2))],
+ IIC_IMUL64_RRI>;
+def IMUL64rri8 : RIi8<0x6B, MRMSrcReg, // GR64 = GR64*I8
+ (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2),
+ "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR64:$dst, EFLAGS,
+ (X86smul_flag GR64:$src1, i64immSExt8:$src2))],
+ IIC_IMUL64_RRI>;
+} // SchedRW
+
+// Memory-Integer Signed Integer Multiply
+let SchedRW = [WriteIMulLd] in {
+def IMUL16rmi : Ii16<0x69, MRMSrcMem, // GR16 = [mem16]*I16
+ (outs GR16:$dst), (ins i16mem:$src1, i16imm:$src2),
+ "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR16:$dst, EFLAGS,
+ (X86smul_flag (load addr:$src1), imm:$src2))],
+ IIC_IMUL16_RMI>,
+ OpSize16;
+def IMUL16rmi8 : Ii8<0x6B, MRMSrcMem, // GR16 = [mem16]*I8
+ (outs GR16:$dst), (ins i16mem:$src1, i16i8imm :$src2),
+ "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR16:$dst, EFLAGS,
+ (X86smul_flag (load addr:$src1),
+ i16immSExt8:$src2))], IIC_IMUL16_RMI>,
+ OpSize16;
+def IMUL32rmi : Ii32<0x69, MRMSrcMem, // GR32 = [mem32]*I32
+ (outs GR32:$dst), (ins i32mem:$src1, i32imm:$src2),
+ "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR32:$dst, EFLAGS,
+ (X86smul_flag (load addr:$src1), imm:$src2))],
+ IIC_IMUL32_RMI>, OpSize32;
+def IMUL32rmi8 : Ii8<0x6B, MRMSrcMem, // GR32 = [mem32]*I8
+ (outs GR32:$dst), (ins i32mem:$src1, i32i8imm: $src2),
+ "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR32:$dst, EFLAGS,
+ (X86smul_flag (load addr:$src1),
+ i32immSExt8:$src2))],
+ IIC_IMUL32_RMI>, OpSize32;
+def IMUL64rmi32 : RIi32S<0x69, MRMSrcMem, // GR64 = [mem64]*I32
+ (outs GR64:$dst), (ins i64mem:$src1, i64i32imm:$src2),
+ "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR64:$dst, EFLAGS,
+ (X86smul_flag (load addr:$src1),
+ i64immSExt32:$src2))],
+ IIC_IMUL64_RMI>;
+def IMUL64rmi8 : RIi8<0x6B, MRMSrcMem, // GR64 = [mem64]*I8
+ (outs GR64:$dst), (ins i64mem:$src1, i64i8imm: $src2),
+ "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR64:$dst, EFLAGS,
+ (X86smul_flag (load addr:$src1),
+ i64immSExt8:$src2))],
+ IIC_IMUL64_RMI>;
+} // SchedRW
+} // Defs = [EFLAGS]
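+
+// Illustrative usage (editor's note): the immediate forms above really are
+// three-operand instructions, e.g.
+//   imulq $40, %rdi, %rax    (rax = rdi * 40, EFLAGS updated)
+// so the destination is not tied to a source, unlike IMULrr/IMULrm above.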
+
+// Unsigned division/remainder.
+let hasSideEffects = 1 in { // so that we don't speculatively execute
+let SchedRW = [WriteIDiv] in {
+let Defs = [AL,AH,EFLAGS], Uses = [AX] in
+def DIV8r : I<0xF6, MRM6r, (outs), (ins GR8:$src), // AX/r8 = AL,AH
+ "div{b}\t$src", [], IIC_DIV8_REG>;
+let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in
+def DIV16r : I<0xF7, MRM6r, (outs), (ins GR16:$src), // DX:AX/r16 = AX,DX
+ "div{w}\t$src", [], IIC_DIV16>, OpSize16;
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in
+def DIV32r : I<0xF7, MRM6r, (outs), (ins GR32:$src), // EDX:EAX/r32 = EAX,EDX
+ "div{l}\t$src", [], IIC_DIV32>, OpSize32;
+// RDX:RAX/r64 = RAX,RDX
+let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in
+def DIV64r : RI<0xF7, MRM6r, (outs), (ins GR64:$src),
+ "div{q}\t$src", [], IIC_DIV64>;
+} // SchedRW
+
+let mayLoad = 1 in {
+let Defs = [AL,AH,EFLAGS], Uses = [AX] in
+def DIV8m : I<0xF6, MRM6m, (outs), (ins i8mem:$src), // AX/[mem8] = AL,AH
+ "div{b}\t$src", [], IIC_DIV8_MEM>,
+ SchedLoadReg<WriteIDivLd>;
+let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in
+def DIV16m : I<0xF7, MRM6m, (outs), (ins i16mem:$src), // DX:AX/[mem16] = AX,DX
+ "div{w}\t$src", [], IIC_DIV16>, OpSize16,
+ SchedLoadReg<WriteIDivLd>;
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in // EDX:EAX/[mem32] = EAX,EDX
+def DIV32m : I<0xF7, MRM6m, (outs), (ins i32mem:$src),
+ "div{l}\t$src", [], IIC_DIV32>,
+ SchedLoadReg<WriteIDivLd>, OpSize32;
+// RDX:RAX/[mem64] = RAX,RDX
+let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in
+def DIV64m : RI<0xF7, MRM6m, (outs), (ins i64mem:$src),
+ "div{q}\t$src", [], IIC_DIV64>,
+ SchedLoadReg<WriteIDivLd>;
+}
+
+// Signed division/remainder.
+let SchedRW = [WriteIDiv] in {
+let Defs = [AL,AH,EFLAGS], Uses = [AX] in
+def IDIV8r : I<0xF6, MRM7r, (outs), (ins GR8:$src), // AX/r8 = AL,AH
+ "idiv{b}\t$src", [], IIC_IDIV8>;
+let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in
+def IDIV16r: I<0xF7, MRM7r, (outs), (ins GR16:$src), // DX:AX/r16 = AX,DX
+ "idiv{w}\t$src", [], IIC_IDIV16>, OpSize16;
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in
+def IDIV32r: I<0xF7, MRM7r, (outs), (ins GR32:$src), // EDX:EAX/r32 = EAX,EDX
+ "idiv{l}\t$src", [], IIC_IDIV32>, OpSize32;
+// RDX:RAX/r64 = RAX,RDX
+let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in
+def IDIV64r: RI<0xF7, MRM7r, (outs), (ins GR64:$src),
+ "idiv{q}\t$src", [], IIC_IDIV64>;
+} // SchedRW
+
+let mayLoad = 1 in {
+let Defs = [AL,AH,EFLAGS], Uses = [AX] in
+def IDIV8m : I<0xF6, MRM7m, (outs), (ins i8mem:$src), // AX/[mem8] = AL,AH
+ "idiv{b}\t$src", [], IIC_IDIV8>,
+ SchedLoadReg<WriteIDivLd>;
+let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in
+def IDIV16m: I<0xF7, MRM7m, (outs), (ins i16mem:$src), // DX:AX/[mem16] = AX,DX
+ "idiv{w}\t$src", [], IIC_IDIV16>, OpSize16,
+ SchedLoadReg<WriteIDivLd>;
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in // EDX:EAX/[mem32] = EAX,EDX
+def IDIV32m: I<0xF7, MRM7m, (outs), (ins i32mem:$src),
+ "idiv{l}\t$src", [], IIC_IDIV32>, OpSize32,
+ SchedLoadReg<WriteIDivLd>;
+let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in // RDX:RAX/[mem64] = RAX,RDX
+def IDIV64m: RI<0xF7, MRM7m, (outs), (ins i64mem:$src),
+ "idiv{q}\t$src", [], IIC_IDIV64>,
+ SchedLoadReg<WriteIDivLd>;
+}
+} // hasSideEffects = 1
+
+//===----------------------------------------------------------------------===//
+// Two address Instructions.
+//
+
+// unary instructions
+let CodeSize = 2 in {
+let Defs = [EFLAGS] in {
+let Constraints = "$src1 = $dst", SchedRW = [WriteALU] in {
+def NEG8r : I<0xF6, MRM3r, (outs GR8 :$dst), (ins GR8 :$src1),
+ "neg{b}\t$dst",
+ [(set GR8:$dst, (ineg GR8:$src1)),
+ (implicit EFLAGS)], IIC_UNARY_REG>;
+def NEG16r : I<0xF7, MRM3r, (outs GR16:$dst), (ins GR16:$src1),
+ "neg{w}\t$dst",
+ [(set GR16:$dst, (ineg GR16:$src1)),
+ (implicit EFLAGS)], IIC_UNARY_REG>, OpSize16;
+def NEG32r : I<0xF7, MRM3r, (outs GR32:$dst), (ins GR32:$src1),
+ "neg{l}\t$dst",
+ [(set GR32:$dst, (ineg GR32:$src1)),
+ (implicit EFLAGS)], IIC_UNARY_REG>, OpSize32;
+def NEG64r : RI<0xF7, MRM3r, (outs GR64:$dst), (ins GR64:$src1), "neg{q}\t$dst",
+ [(set GR64:$dst, (ineg GR64:$src1)),
+ (implicit EFLAGS)], IIC_UNARY_REG>;
+} // Constraints = "$src1 = $dst", SchedRW
+
+// Read-modify-write negate.
+let SchedRW = [WriteALULd, WriteRMW] in {
+def NEG8m : I<0xF6, MRM3m, (outs), (ins i8mem :$dst),
+ "neg{b}\t$dst",
+ [(store (ineg (loadi8 addr:$dst)), addr:$dst),
+ (implicit EFLAGS)], IIC_UNARY_MEM>;
+def NEG16m : I<0xF7, MRM3m, (outs), (ins i16mem:$dst),
+ "neg{w}\t$dst",
+ [(store (ineg (loadi16 addr:$dst)), addr:$dst),
+ (implicit EFLAGS)], IIC_UNARY_MEM>, OpSize16;
+def NEG32m : I<0xF7, MRM3m, (outs), (ins i32mem:$dst),
+ "neg{l}\t$dst",
+ [(store (ineg (loadi32 addr:$dst)), addr:$dst),
+ (implicit EFLAGS)], IIC_UNARY_MEM>, OpSize32;
+def NEG64m : RI<0xF7, MRM3m, (outs), (ins i64mem:$dst), "neg{q}\t$dst",
+ [(store (ineg (loadi64 addr:$dst)), addr:$dst),
+ (implicit EFLAGS)], IIC_UNARY_MEM>;
+} // SchedRW
+} // Defs = [EFLAGS]
+
+
+// Note: NOT does not set EFLAGS!
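+// For example (illustrative only), "notl %eax" computes EAX = ~EAX and leaves
+// EFLAGS untouched, whereas "negl %eax" computes EAX = 0 - EAX and does
+// update EFLAGS.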
+
+let Constraints = "$src1 = $dst", SchedRW = [WriteALU] in {
+// Match xor -1 to not. Favors these over a move imm + xor to save code size.
+let AddedComplexity = 15 in {
+def NOT8r : I<0xF6, MRM2r, (outs GR8 :$dst), (ins GR8 :$src1),
+ "not{b}\t$dst",
+ [(set GR8:$dst, (not GR8:$src1))], IIC_UNARY_REG>;
+def NOT16r : I<0xF7, MRM2r, (outs GR16:$dst), (ins GR16:$src1),
+ "not{w}\t$dst",
+ [(set GR16:$dst, (not GR16:$src1))], IIC_UNARY_REG>, OpSize16;
+def NOT32r : I<0xF7, MRM2r, (outs GR32:$dst), (ins GR32:$src1),
+ "not{l}\t$dst",
+ [(set GR32:$dst, (not GR32:$src1))], IIC_UNARY_REG>, OpSize32;
+def NOT64r : RI<0xF7, MRM2r, (outs GR64:$dst), (ins GR64:$src1), "not{q}\t$dst",
+ [(set GR64:$dst, (not GR64:$src1))], IIC_UNARY_REG>;
+}
+} // Constraints = "$src1 = $dst", SchedRW
+
+let SchedRW = [WriteALULd, WriteRMW] in {
+def NOT8m : I<0xF6, MRM2m, (outs), (ins i8mem :$dst),
+ "not{b}\t$dst",
+ [(store (not (loadi8 addr:$dst)), addr:$dst)], IIC_UNARY_MEM>;
+def NOT16m : I<0xF7, MRM2m, (outs), (ins i16mem:$dst),
+ "not{w}\t$dst",
+ [(store (not (loadi16 addr:$dst)), addr:$dst)], IIC_UNARY_MEM>,
+ OpSize16;
+def NOT32m : I<0xF7, MRM2m, (outs), (ins i32mem:$dst),
+ "not{l}\t$dst",
+ [(store (not (loadi32 addr:$dst)), addr:$dst)], IIC_UNARY_MEM>,
+ OpSize32;
+def NOT64m : RI<0xF7, MRM2m, (outs), (ins i64mem:$dst), "not{q}\t$dst",
+ [(store (not (loadi64 addr:$dst)), addr:$dst)], IIC_UNARY_MEM>;
+} // SchedRW
+} // CodeSize
+
+// TODO: inc/dec is slow for P4, but fast for Pentium-M.
+let Defs = [EFLAGS] in {
+let Constraints = "$src1 = $dst", SchedRW = [WriteALU] in {
+let CodeSize = 2 in
+def INC8r : I<0xFE, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1),
+ "inc{b}\t$dst",
+ [(set GR8:$dst, EFLAGS, (X86inc_flag GR8:$src1))],
+ IIC_UNARY_REG>;
+let isConvertibleToThreeAddress = 1, CodeSize = 2 in { // Can xform into LEA.
+def INC16r : I<0xFF, MRM0r, (outs GR16:$dst), (ins GR16:$src1),
+ "inc{w}\t$dst",
+ [(set GR16:$dst, EFLAGS, (X86inc_flag GR16:$src1))],
+ IIC_UNARY_REG>, OpSize16;
+def INC32r : I<0xFF, MRM0r, (outs GR32:$dst), (ins GR32:$src1),
+ "inc{l}\t$dst",
+ [(set GR32:$dst, EFLAGS, (X86inc_flag GR32:$src1))],
+ IIC_UNARY_REG>, OpSize32;
+def INC64r : RI<0xFF, MRM0r, (outs GR64:$dst), (ins GR64:$src1), "inc{q}\t$dst",
+ [(set GR64:$dst, EFLAGS, (X86inc_flag GR64:$src1))],
+ IIC_UNARY_REG>;
+} // isConvertibleToThreeAddress = 1, CodeSize = 2
+
+// Short forms only valid outside 64-bit mode, where the 0x40-0x4F encodings
+// are REX prefixes. Selected during MCInst lowering.
+let CodeSize = 1, hasSideEffects = 0 in {
+def INC16r_alt : I<0x40, AddRegFrm, (outs GR16:$dst), (ins GR16:$src1),
+ "inc{w}\t$dst", [], IIC_UNARY_REG>,
+ OpSize16, Requires<[Not64BitMode]>;
+def INC32r_alt : I<0x40, AddRegFrm, (outs GR32:$dst), (ins GR32:$src1),
+ "inc{l}\t$dst", [], IIC_UNARY_REG>,
+ OpSize32, Requires<[Not64BitMode]>;
+} // CodeSize = 1, hasSideEffects = 0
+} // Constraints = "$src1 = $dst", SchedRW
+
+let CodeSize = 2, SchedRW = [WriteALULd, WriteRMW] in {
+ def INC8m : I<0xFE, MRM0m, (outs), (ins i8mem :$dst), "inc{b}\t$dst",
+ [(store (add (loadi8 addr:$dst), 1), addr:$dst),
+ (implicit EFLAGS)], IIC_UNARY_MEM>;
+ def INC16m : I<0xFF, MRM0m, (outs), (ins i16mem:$dst), "inc{w}\t$dst",
+ [(store (add (loadi16 addr:$dst), 1), addr:$dst),
+ (implicit EFLAGS)], IIC_UNARY_MEM>, OpSize16;
+ def INC32m : I<0xFF, MRM0m, (outs), (ins i32mem:$dst), "inc{l}\t$dst",
+ [(store (add (loadi32 addr:$dst), 1), addr:$dst),
+ (implicit EFLAGS)], IIC_UNARY_MEM>, OpSize32;
+ def INC64m : RI<0xFF, MRM0m, (outs), (ins i64mem:$dst), "inc{q}\t$dst",
+ [(store (add (loadi64 addr:$dst), 1), addr:$dst),
+ (implicit EFLAGS)], IIC_UNARY_MEM>;
+} // CodeSize = 2, SchedRW
+
+let Constraints = "$src1 = $dst", SchedRW = [WriteALU] in {
+let CodeSize = 2 in
+def DEC8r : I<0xFE, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1),
+ "dec{b}\t$dst",
+ [(set GR8:$dst, EFLAGS, (X86dec_flag GR8:$src1))],
+ IIC_UNARY_REG>;
+let isConvertibleToThreeAddress = 1, CodeSize = 2 in { // Can xform into LEA.
+def DEC16r : I<0xFF, MRM1r, (outs GR16:$dst), (ins GR16:$src1),
+ "dec{w}\t$dst",
+ [(set GR16:$dst, EFLAGS, (X86dec_flag GR16:$src1))],
+ IIC_UNARY_REG>, OpSize16;
+def DEC32r : I<0xFF, MRM1r, (outs GR32:$dst), (ins GR32:$src1),
+ "dec{l}\t$dst",
+ [(set GR32:$dst, EFLAGS, (X86dec_flag GR32:$src1))],
+ IIC_UNARY_REG>, OpSize32;
+def DEC64r : RI<0xFF, MRM1r, (outs GR64:$dst), (ins GR64:$src1), "dec{q}\t$dst",
+ [(set GR64:$dst, EFLAGS, (X86dec_flag GR64:$src1))],
+ IIC_UNARY_REG>;
+} // isConvertibleToThreeAddress = 1, CodeSize = 2
+
+// Short forms only valid outside 64-bit mode, where the 0x40-0x4F encodings
+// are REX prefixes. Selected during MCInst lowering.
+let CodeSize = 1, hasSideEffects = 0 in {
+def DEC16r_alt : I<0x48, AddRegFrm, (outs GR16:$dst), (ins GR16:$src1),
+ "dec{w}\t$dst", [], IIC_UNARY_REG>,
+ OpSize16, Requires<[Not64BitMode]>;
+def DEC32r_alt : I<0x48, AddRegFrm, (outs GR32:$dst), (ins GR32:$src1),
+ "dec{l}\t$dst", [], IIC_UNARY_REG>,
+ OpSize32, Requires<[Not64BitMode]>;
+} // CodeSize = 1, hasSideEffects = 0
+} // Constraints = "$src1 = $dst", SchedRW
+
+
+let CodeSize = 2, SchedRW = [WriteALULd, WriteRMW] in {
+ def DEC8m : I<0xFE, MRM1m, (outs), (ins i8mem :$dst), "dec{b}\t$dst",
+ [(store (add (loadi8 addr:$dst), -1), addr:$dst),
+ (implicit EFLAGS)], IIC_UNARY_MEM>;
+ def DEC16m : I<0xFF, MRM1m, (outs), (ins i16mem:$dst), "dec{w}\t$dst",
+ [(store (add (loadi16 addr:$dst), -1), addr:$dst),
+ (implicit EFLAGS)], IIC_UNARY_MEM>, OpSize16;
+ def DEC32m : I<0xFF, MRM1m, (outs), (ins i32mem:$dst), "dec{l}\t$dst",
+ [(store (add (loadi32 addr:$dst), -1), addr:$dst),
+ (implicit EFLAGS)], IIC_UNARY_MEM>, OpSize32;
+ def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
+ [(store (add (loadi64 addr:$dst), -1), addr:$dst),
+ (implicit EFLAGS)], IIC_UNARY_MEM>;
+} // CodeSize = 2, SchedRW
+} // Defs = [EFLAGS]
+
+/// X86TypeInfo - This is a bunch of information that describes relevant X86
+/// information about value types. For example, it can tell you which
+/// register class and preferred load node to use.
+class X86TypeInfo<ValueType vt, string instrsuffix, RegisterClass regclass,
+ PatFrag loadnode, X86MemOperand memoperand, ImmType immkind,
+ Operand immoperand, SDPatternOperator immoperator,
+ Operand imm8operand, SDPatternOperator imm8operator,
+ bit hasOddOpcode, OperandSize opSize,
+ bit hasREX_WPrefix> {
+ /// VT - This is the value type itself.
+ ValueType VT = vt;
+
+ /// InstrSuffix - This is the suffix used on instructions with this type. For
+ /// example, i8 -> "b", i16 -> "w", i32 -> "l", i64 -> "q".
+ string InstrSuffix = instrsuffix;
+
+ /// RegClass - This is the register class associated with this type. For
+ /// example, i8 -> GR8, i16 -> GR16, i32 -> GR32, i64 -> GR64.
+ RegisterClass RegClass = regclass;
+
+ /// LoadNode - This is the load node associated with this type. For
+ /// example, i8 -> loadi8, i16 -> loadi16, i32 -> loadi32, i64 -> loadi64.
+ PatFrag LoadNode = loadnode;
+
+ /// MemOperand - This is the memory operand associated with this type. For
+ /// example, i8 -> i8mem, i16 -> i16mem, i32 -> i32mem, i64 -> i64mem.
+ X86MemOperand MemOperand = memoperand;
+
+  /// ImmEncoding - This is the encoding of an immediate of this type. For
+  /// example, i8 -> Imm8, i16 -> Imm16, i32 -> Imm32. Note that i64 -> Imm32
+  /// since the immediate field of i64 instructions is a 32-bit sign-extended
+  /// value.
+ ImmType ImmEncoding = immkind;
+
+  /// ImmOperand - This is the operand kind of an immediate of this type. For
+  /// example, i8 -> i8imm, i16 -> i16imm, i32 -> i32imm. Note that i64 ->
+  /// i64i32imm since the immediate field of i64 instructions is a 32-bit
+  /// sign-extended value.
+ Operand ImmOperand = immoperand;
+
+ /// ImmOperator - This is the operator that should be used to match an
+ /// immediate of this kind in a pattern (e.g. imm, or i64immSExt32).
+ SDPatternOperator ImmOperator = immoperator;
+
+ /// Imm8Operand - This is the operand kind to use for an imm8 of this type.
+ /// For example, i8 -> <invalid>, i16 -> i16i8imm, i32 -> i32i8imm. This is
+ /// only used for instructions that have a sign-extended imm8 field form.
+ Operand Imm8Operand = imm8operand;
+
+  /// Imm8Operator - This is the operator that should be used to match an 8-bit
+  /// sign-extended immediate of this kind in a pattern (e.g. i16immSExt8).
+ SDPatternOperator Imm8Operator = imm8operator;
+
+ /// HasOddOpcode - This bit is true if the instruction should have an odd (as
+ /// opposed to even) opcode. Operations on i8 are usually even, operations on
+ /// other datatypes are odd.
+ bit HasOddOpcode = hasOddOpcode;
+
+  /// OpSize - Selects whether the instruction needs a 0x66 prefix based on
+  /// 16-bit vs 32-bit mode. i8/i64 set this to OpSizeFixed. i16 sets this
+  /// to OpSize16. i32 sets this to OpSize32.
+ OperandSize OpSize = opSize;
+
+  /// HasREX_WPrefix - This bit is set to true if the instruction should have
+  /// the REX.W prefix (0x48). This is set for i64 types.
+ bit HasREX_WPrefix = hasREX_WPrefix;
+}
+
+def invalid_node : SDNode<"<<invalid_node>>", SDTIntLeaf,[],"<<invalid_node>>">;
+
+
+def Xi8 : X86TypeInfo<i8, "b", GR8, loadi8, i8mem,
+ Imm8, i8imm, imm8_su, i8imm, invalid_node,
+ 0, OpSizeFixed, 0>;
+def Xi16 : X86TypeInfo<i16, "w", GR16, loadi16, i16mem,
+ Imm16, i16imm, imm16_su, i16i8imm, i16immSExt8_su,
+ 1, OpSize16, 0>;
+def Xi32 : X86TypeInfo<i32, "l", GR32, loadi32, i32mem,
+ Imm32, i32imm, imm32_su, i32i8imm, i32immSExt8_su,
+ 1, OpSize32, 0>;
+def Xi64 : X86TypeInfo<i64, "q", GR64, loadi64, i64mem,
+ Imm32S, i64i32imm, i64immSExt32, i64i8imm, i64immSExt8,
+ 1, OpSizeFixed, 1>;
+
+/// ITy - This instruction base class takes the type info for the instruction.
+/// Using this, it:
+/// 1. Concatenates together the instruction mnemonic with the appropriate
+/// suffix letter, a tab, and the arguments.
+/// 2. Infers whether the instruction should have a 0x66 prefix byte.
+/// 3. Infers whether the instruction should have a REX.W (0x48) prefix.
+/// 4. Infers whether the low bit of the opcode should be 0 (for i8 operations)
+/// or 1 (for i16,i32,i64 operations).
+class ITy<bits<8> opcode, Format f, X86TypeInfo typeinfo, dag outs, dag ins,
+ string mnemonic, string args, list<dag> pattern,
+ InstrItinClass itin = IIC_BIN_NONMEM>
+ : I<{opcode{7}, opcode{6}, opcode{5}, opcode{4},
+ opcode{3}, opcode{2}, opcode{1}, typeinfo.HasOddOpcode },
+ f, outs, ins,
+ !strconcat(mnemonic, "{", typeinfo.InstrSuffix, "}\t", args), pattern,
+ itin> {
+
+ // Infer instruction prefixes from type info.
+ let OpSize = typeinfo.OpSize;
+ let hasREX_WPrefix = typeinfo.HasREX_WPrefix;
+}
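+
+// As a concrete sketch of the opcode inference: an "add" built from base
+// opcode 0x00 keeps opcode 0x00 for Xi8 (HasOddOpcode = 0) but becomes 0x01
+// for Xi16/Xi32/Xi64 (HasOddOpcode = 1), matching the ISA's even/odd pairing
+// of 8-bit and wider ALU opcodes.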
+
+// BinOpRR - Instructions like "add reg, reg, reg".
+class BinOpRR<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ dag outlist, list<dag> pattern, InstrItinClass itin,
+ Format f = MRMDestReg>
+ : ITy<opcode, f, typeinfo, outlist,
+ (ins typeinfo.RegClass:$src1, typeinfo.RegClass:$src2),
+ mnemonic, "{$src2, $src1|$src1, $src2}", pattern, itin>,
+ Sched<[WriteALU]>;
+
+// BinOpRR_F - Instructions like "cmp reg, reg", where the pattern has
+// just EFLAGS as a result.
+class BinOpRR_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ SDPatternOperator opnode, Format f = MRMDestReg>
+ : BinOpRR<opcode, mnemonic, typeinfo, (outs),
+ [(set EFLAGS,
+ (opnode typeinfo.RegClass:$src1, typeinfo.RegClass:$src2))],
+ IIC_BIN_NONMEM, f>;
+
+// BinOpRR_RF - Instructions like "add reg, reg, reg", where the pattern has
+// both a regclass and EFLAGS as a result.
+class BinOpRR_RF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ SDNode opnode>
+ : BinOpRR<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst),
+ [(set typeinfo.RegClass:$dst, EFLAGS,
+ (opnode typeinfo.RegClass:$src1, typeinfo.RegClass:$src2))],
+ IIC_BIN_NONMEM>;
+
+// BinOpRR_RFF - Instructions like "adc reg, reg, reg", where the pattern has
+// both a regclass and EFLAGS as a result, and has EFLAGS as input.
+class BinOpRR_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ SDNode opnode>
+ : BinOpRR<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst),
+ [(set typeinfo.RegClass:$dst, EFLAGS,
+ (opnode typeinfo.RegClass:$src1, typeinfo.RegClass:$src2,
+ EFLAGS))], IIC_BIN_CARRY_NONMEM>;
+
+// BinOpRR_Rev - Instructions like "add reg, reg, reg" (reversed encoding).
+class BinOpRR_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ InstrItinClass itin = IIC_BIN_NONMEM>
+ : ITy<opcode, MRMSrcReg, typeinfo,
+ (outs typeinfo.RegClass:$dst),
+ (ins typeinfo.RegClass:$src1, typeinfo.RegClass:$src2),
+ mnemonic, "{$src2, $dst|$dst, $src2}", [], itin>,
+ Sched<[WriteALU]> {
+ // The disassembler should know about this, but not the asmparser.
+ let isCodeGenOnly = 1;
+ let ForceDisassemble = 1;
+ let hasSideEffects = 0;
+}
+
+// BinOpRR_RFF_Rev - Instructions like "adc reg, reg, reg" (reversed encoding).
+class BinOpRR_RFF_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo>
+ : BinOpRR_Rev<opcode, mnemonic, typeinfo, IIC_BIN_CARRY_NONMEM>;
+
+// BinOpRR_F_Rev - Instructions like "cmp reg, reg" (reversed encoding).
+class BinOpRR_F_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo>
+ : ITy<opcode, MRMSrcReg, typeinfo, (outs),
+ (ins typeinfo.RegClass:$src1, typeinfo.RegClass:$src2),
+ mnemonic, "{$src2, $src1|$src1, $src2}", [], IIC_BIN_NONMEM>,
+ Sched<[WriteALU]> {
+ // The disassembler should know about this, but not the asmparser.
+ let isCodeGenOnly = 1;
+ let ForceDisassemble = 1;
+ let hasSideEffects = 0;
+}
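+
+// Note on the _Rev forms: a register-register ALU operation has two valid
+// encodings; e.g. (illustrative only) "add %ebx, %eax" can be encoded with
+// opcode 0x01 (MRMDestReg) or 0x03 (MRMSrcReg). Codegen always emits the
+// former, but the disassembler must still recognize the latter.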
+
+// BinOpRM - Instructions like "add reg, reg, [mem]".
+class BinOpRM<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ dag outlist, list<dag> pattern,
+ InstrItinClass itin = IIC_BIN_MEM>
+ : ITy<opcode, MRMSrcMem, typeinfo, outlist,
+ (ins typeinfo.RegClass:$src1, typeinfo.MemOperand:$src2),
+ mnemonic, "{$src2, $src1|$src1, $src2}", pattern, itin>,
+ Sched<[WriteALULd, ReadAfterLd]>;
+
+// BinOpRM_R - Instructions like "add reg, reg, [mem]".
+class BinOpRM_R<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ SDNode opnode>
+ : BinOpRM<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst),
+ [(set typeinfo.RegClass:$dst,
+ (opnode typeinfo.RegClass:$src1, (typeinfo.LoadNode addr:$src2)))]>;
+
+// BinOpRM_F - Instructions like "cmp reg, [mem]".
+class BinOpRM_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ SDPatternOperator opnode>
+ : BinOpRM<opcode, mnemonic, typeinfo, (outs),
+ [(set EFLAGS,
+ (opnode typeinfo.RegClass:$src1, (typeinfo.LoadNode addr:$src2)))]>;
+
+// BinOpRM_RF - Instructions like "add reg, reg, [mem]".
+class BinOpRM_RF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ SDNode opnode>
+ : BinOpRM<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst),
+ [(set typeinfo.RegClass:$dst, EFLAGS,
+ (opnode typeinfo.RegClass:$src1, (typeinfo.LoadNode addr:$src2)))]>;
+
+// BinOpRM_RFF - Instructions like "adc reg, reg, [mem]".
+class BinOpRM_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ SDNode opnode>
+ : BinOpRM<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst),
+ [(set typeinfo.RegClass:$dst, EFLAGS,
+ (opnode typeinfo.RegClass:$src1, (typeinfo.LoadNode addr:$src2),
+ EFLAGS))], IIC_BIN_CARRY_MEM>;
+
+// BinOpRI - Instructions like "add reg, reg, imm".
+class BinOpRI<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ Format f, dag outlist, list<dag> pattern,
+ InstrItinClass itin = IIC_BIN_NONMEM>
+ : ITy<opcode, f, typeinfo, outlist,
+ (ins typeinfo.RegClass:$src1, typeinfo.ImmOperand:$src2),
+ mnemonic, "{$src2, $src1|$src1, $src2}", pattern, itin>,
+ Sched<[WriteALU]> {
+ let ImmT = typeinfo.ImmEncoding;
+}
+
+// BinOpRI_F - Instructions like "cmp reg, imm".
+class BinOpRI_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ SDPatternOperator opnode, Format f>
+ : BinOpRI<opcode, mnemonic, typeinfo, f, (outs),
+ [(set EFLAGS,
+ (opnode typeinfo.RegClass:$src1, typeinfo.ImmOperator:$src2))]>;
+
+// BinOpRI_RF - Instructions like "add reg, reg, imm".
+class BinOpRI_RF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ SDNode opnode, Format f>
+ : BinOpRI<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst),
+ [(set typeinfo.RegClass:$dst, EFLAGS,
+ (opnode typeinfo.RegClass:$src1, typeinfo.ImmOperator:$src2))]>;
+// BinOpRI_RFF - Instructions like "adc reg, reg, imm".
+class BinOpRI_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ SDNode opnode, Format f>
+ : BinOpRI<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst),
+ [(set typeinfo.RegClass:$dst, EFLAGS,
+ (opnode typeinfo.RegClass:$src1, typeinfo.ImmOperator:$src2,
+ EFLAGS))], IIC_BIN_CARRY_NONMEM>;
+
+// BinOpRI8 - Instructions like "add reg, reg, imm8".
+class BinOpRI8<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ Format f, dag outlist, list<dag> pattern,
+ InstrItinClass itin = IIC_BIN_NONMEM>
+ : ITy<opcode, f, typeinfo, outlist,
+ (ins typeinfo.RegClass:$src1, typeinfo.Imm8Operand:$src2),
+ mnemonic, "{$src2, $src1|$src1, $src2}", pattern, itin>,
+ Sched<[WriteALU]> {
+ let ImmT = Imm8; // Always 8-bit immediate.
+}
+
+// BinOpRI8_F - Instructions like "cmp reg, imm8".
+class BinOpRI8_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ SDPatternOperator opnode, Format f>
+ : BinOpRI8<opcode, mnemonic, typeinfo, f, (outs),
+ [(set EFLAGS,
+ (opnode typeinfo.RegClass:$src1, typeinfo.Imm8Operator:$src2))]>;
+
+// BinOpRI8_RF - Instructions like "add reg, reg, imm8".
+class BinOpRI8_RF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ SDPatternOperator opnode, Format f>
+ : BinOpRI8<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst),
+ [(set typeinfo.RegClass:$dst, EFLAGS,
+ (opnode typeinfo.RegClass:$src1, typeinfo.Imm8Operator:$src2))]>;
+
+// BinOpRI8_RFF - Instructions like "adc reg, reg, imm8".
+class BinOpRI8_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ SDPatternOperator opnode, Format f>
+ : BinOpRI8<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst),
+ [(set typeinfo.RegClass:$dst, EFLAGS,
+ (opnode typeinfo.RegClass:$src1, typeinfo.Imm8Operator:$src2,
+ EFLAGS))], IIC_BIN_CARRY_NONMEM>;
+
+// BinOpMR - Instructions like "add [mem], reg".
+class BinOpMR<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ list<dag> pattern, InstrItinClass itin = IIC_BIN_MEM>
+ : ITy<opcode, MRMDestMem, typeinfo,
+ (outs), (ins typeinfo.MemOperand:$dst, typeinfo.RegClass:$src),
+ mnemonic, "{$src, $dst|$dst, $src}", pattern, itin>,
+ Sched<[WriteALULd, WriteRMW]>;
+
+// BinOpMR_RMW - Instructions like "add [mem], reg".
+class BinOpMR_RMW<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ SDNode opnode>
+ : BinOpMR<opcode, mnemonic, typeinfo,
+ [(store (opnode (load addr:$dst), typeinfo.RegClass:$src), addr:$dst),
+ (implicit EFLAGS)]>;
+
+// BinOpMR_RMW_FF - Instructions like "adc [mem], reg".
+class BinOpMR_RMW_FF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ SDNode opnode>
+ : BinOpMR<opcode, mnemonic, typeinfo,
+ [(store (opnode (load addr:$dst), typeinfo.RegClass:$src, EFLAGS),
+ addr:$dst),
+ (implicit EFLAGS)], IIC_BIN_CARRY_MEM>;
+
+// BinOpMR_F - Instructions like "cmp [mem], reg".
+class BinOpMR_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ SDNode opnode>
+ : BinOpMR<opcode, mnemonic, typeinfo,
+ [(set EFLAGS, (opnode (load addr:$dst), typeinfo.RegClass:$src))]>;
+
+// BinOpMI - Instructions like "add [mem], imm".
+class BinOpMI<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ Format f, list<dag> pattern,
+ InstrItinClass itin = IIC_BIN_MEM>
+ : ITy<opcode, f, typeinfo,
+ (outs), (ins typeinfo.MemOperand:$dst, typeinfo.ImmOperand:$src),
+ mnemonic, "{$src, $dst|$dst, $src}", pattern, itin>,
+ Sched<[WriteALULd, WriteRMW]> {
+ let ImmT = typeinfo.ImmEncoding;
+}
+
+// BinOpMI_RMW - Instructions like "add [mem], imm".
+class BinOpMI_RMW<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ SDNode opnode, Format f>
+ : BinOpMI<opcode, mnemonic, typeinfo, f,
+ [(store (opnode (typeinfo.VT (load addr:$dst)),
+ typeinfo.ImmOperator:$src), addr:$dst),
+ (implicit EFLAGS)]>;
+// BinOpMI_RMW_FF - Instructions like "adc [mem], imm".
+class BinOpMI_RMW_FF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ SDNode opnode, Format f>
+ : BinOpMI<opcode, mnemonic, typeinfo, f,
+ [(store (opnode (typeinfo.VT (load addr:$dst)),
+ typeinfo.ImmOperator:$src, EFLAGS), addr:$dst),
+ (implicit EFLAGS)], IIC_BIN_CARRY_MEM>;
+
+// BinOpMI_F - Instructions like "cmp [mem], imm".
+class BinOpMI_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ SDPatternOperator opnode, Format f>
+ : BinOpMI<opcode, mnemonic, typeinfo, f,
+ [(set EFLAGS, (opnode (typeinfo.VT (load addr:$dst)),
+ typeinfo.ImmOperator:$src))]>;
+
+// BinOpMI8 - Instructions like "add [mem], imm8".
+class BinOpMI8<string mnemonic, X86TypeInfo typeinfo,
+ Format f, list<dag> pattern,
+ InstrItinClass itin = IIC_BIN_MEM>
+ : ITy<0x82, f, typeinfo,
+ (outs), (ins typeinfo.MemOperand:$dst, typeinfo.Imm8Operand:$src),
+ mnemonic, "{$src, $dst|$dst, $src}", pattern, itin>,
+ Sched<[WriteALULd, WriteRMW]> {
+ let ImmT = Imm8; // Always 8-bit immediate.
+}
+
+// BinOpMI8_RMW - Instructions like "add [mem], imm8".
+class BinOpMI8_RMW<string mnemonic, X86TypeInfo typeinfo,
+ SDPatternOperator opnode, Format f>
+ : BinOpMI8<mnemonic, typeinfo, f,
+ [(store (opnode (load addr:$dst),
+ typeinfo.Imm8Operator:$src), addr:$dst),
+ (implicit EFLAGS)]>;
+
+// BinOpMI8_RMW_FF - Instructions like "adc [mem], imm8".
+class BinOpMI8_RMW_FF<string mnemonic, X86TypeInfo typeinfo,
+ SDPatternOperator opnode, Format f>
+ : BinOpMI8<mnemonic, typeinfo, f,
+ [(store (opnode (load addr:$dst),
+ typeinfo.Imm8Operator:$src, EFLAGS), addr:$dst),
+ (implicit EFLAGS)], IIC_BIN_CARRY_MEM>;
+
+// BinOpMI8_F - Instructions like "cmp [mem], imm8".
+class BinOpMI8_F<string mnemonic, X86TypeInfo typeinfo,
+ SDPatternOperator opnode, Format f>
+ : BinOpMI8<mnemonic, typeinfo, f,
+ [(set EFLAGS, (opnode (load addr:$dst),
+ typeinfo.Imm8Operator:$src))]>;
+
+// BinOpAI - Instructions like "add %eax, %eax, imm", that imp-def EFLAGS.
+class BinOpAI<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ Register areg, string operands,
+ InstrItinClass itin = IIC_BIN_NONMEM>
+ : ITy<opcode, RawFrm, typeinfo,
+ (outs), (ins typeinfo.ImmOperand:$src),
+ mnemonic, operands, [], itin>, Sched<[WriteALU]> {
+ let ImmT = typeinfo.ImmEncoding;
+ let Uses = [areg];
+ let Defs = [areg, EFLAGS];
+ let hasSideEffects = 0;
+}
+
+// BinOpAI_RFF - Instructions like "adc %eax, %eax, imm", that implicitly define
+// and use EFLAGS.
+class BinOpAI_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ Register areg, string operands>
+ : BinOpAI<opcode, mnemonic, typeinfo, areg, operands,
+ IIC_BIN_CARRY_NONMEM> {
+ let Uses = [areg, EFLAGS];
+}
+
+// BinOpAI_F - Instructions like "cmp %eax, %eax, imm", that imp-def EFLAGS.
+class BinOpAI_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ Register areg, string operands>
+ : BinOpAI<opcode, mnemonic, typeinfo, areg, operands> {
+ let Defs = [EFLAGS];
+}
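+
+// The BinOpAI* classes model the short accumulator encodings; e.g.
+// (illustrative only) "add $100, %eax" can be encoded as opcode 0x05 followed
+// by an imm32, with no ModRM byte, instead of the generic 0x81 /0 form.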
+
+/// ArithBinOp_RF - This is an arithmetic binary operator where the pattern is
+/// defined with "(set GPR:$dst, EFLAGS, (...".
+///
+/// It would be nice to get rid of the second and third argument here, but
+/// tblgen can't handle dependent type references aggressively enough: PR8330
+multiclass ArithBinOp_RF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
+ string mnemonic, Format RegMRM, Format MemMRM,
+ SDNode opnodeflag, SDNode opnode,
+ bit CommutableRR, bit ConvertibleToThreeAddress> {
+ let Defs = [EFLAGS] in {
+ let Constraints = "$src1 = $dst" in {
+ let isCommutable = CommutableRR in {
+ def NAME#8rr : BinOpRR_RF<BaseOpc, mnemonic, Xi8 , opnodeflag>;
+ let isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
+ def NAME#16rr : BinOpRR_RF<BaseOpc, mnemonic, Xi16, opnodeflag>;
+ def NAME#32rr : BinOpRR_RF<BaseOpc, mnemonic, Xi32, opnodeflag>;
+ def NAME#64rr : BinOpRR_RF<BaseOpc, mnemonic, Xi64, opnodeflag>;
+ } // isConvertibleToThreeAddress
+ } // isCommutable
+
+ def NAME#8rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi8>;
+ def NAME#16rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi16>;
+ def NAME#32rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi32>;
+ def NAME#64rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi64>;
+
+ def NAME#8rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi8 , opnodeflag>;
+ def NAME#16rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi16, opnodeflag>;
+ def NAME#32rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi32, opnodeflag>;
+ def NAME#64rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi64, opnodeflag>;
+
+ def NAME#8ri : BinOpRI_RF<0x80, mnemonic, Xi8 , opnodeflag, RegMRM>;
+
+ let isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
+ // NOTE: These are order specific, we want the ri8 forms to be listed
+ // first so that they are slightly preferred to the ri forms.
+ def NAME#16ri8 : BinOpRI8_RF<0x82, mnemonic, Xi16, opnodeflag, RegMRM>;
+ def NAME#32ri8 : BinOpRI8_RF<0x82, mnemonic, Xi32, opnodeflag, RegMRM>;
+ def NAME#64ri8 : BinOpRI8_RF<0x82, mnemonic, Xi64, opnodeflag, RegMRM>;
+
+ def NAME#16ri : BinOpRI_RF<0x80, mnemonic, Xi16, opnodeflag, RegMRM>;
+ def NAME#32ri : BinOpRI_RF<0x80, mnemonic, Xi32, opnodeflag, RegMRM>;
+ def NAME#64ri32: BinOpRI_RF<0x80, mnemonic, Xi64, opnodeflag, RegMRM>;
+ }
+ } // Constraints = "$src1 = $dst"
+
+ def NAME#8mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi8 , opnode>;
+ def NAME#16mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi16, opnode>;
+ def NAME#32mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi32, opnode>;
+ def NAME#64mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi64, opnode>;
+
+ // NOTE: These are order specific, we want the mi8 forms to be listed
+ // first so that they are slightly preferred to the mi forms.
+ def NAME#16mi8 : BinOpMI8_RMW<mnemonic, Xi16, opnode, MemMRM>;
+ def NAME#32mi8 : BinOpMI8_RMW<mnemonic, Xi32, opnode, MemMRM>;
+ def NAME#64mi8 : BinOpMI8_RMW<mnemonic, Xi64, opnode, MemMRM>;
+
+ def NAME#8mi : BinOpMI_RMW<0x80, mnemonic, Xi8 , opnode, MemMRM>;
+ def NAME#16mi : BinOpMI_RMW<0x80, mnemonic, Xi16, opnode, MemMRM>;
+ def NAME#32mi : BinOpMI_RMW<0x80, mnemonic, Xi32, opnode, MemMRM>;
+ def NAME#64mi32 : BinOpMI_RMW<0x80, mnemonic, Xi64, opnode, MemMRM>;
+
+  // These are for the disassembler, since the 0x82 opcode behaves like 0x80
+  // in 32-bit mode but is invalid in 64-bit mode.
+ let Predicates = [Not64BitMode], isCodeGenOnly = 1, ForceDisassemble = 1,
+ hasSideEffects = 0 in {
+ let Constraints = "$src1 = $dst" in
+ def NAME#8ri8 : BinOpRI8_RF<0x82, mnemonic, Xi8, null_frag, RegMRM>;
+ let mayLoad = 1, mayStore = 1 in
+ def NAME#8mi8 : BinOpMI8_RMW<mnemonic, Xi8, null_frag, MemMRM>;
+ }
+ } // Defs = [EFLAGS]
+
+ def NAME#8i8 : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL,
+ "{$src, %al|al, $src}">;
+ def NAME#16i16 : BinOpAI<BaseOpc4, mnemonic, Xi16, AX,
+ "{$src, %ax|ax, $src}">;
+ def NAME#32i32 : BinOpAI<BaseOpc4, mnemonic, Xi32, EAX,
+ "{$src, %eax|eax, $src}">;
+ def NAME#64i32 : BinOpAI<BaseOpc4, mnemonic, Xi64, RAX,
+ "{$src, %rax|rax, $src}">;
+}
+
+/// ArithBinOp_RFF - This is an arithmetic binary operator where the pattern is
+/// defined with "(set GPR:$dst, EFLAGS, (node LHS, RHS, EFLAGS))" like ADC and
+/// SBB.
+///
+/// It would be nice to get rid of the second and third argument here, but
+/// tblgen can't handle dependent type references aggressively enough: PR8330
+multiclass ArithBinOp_RFF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
+ string mnemonic, Format RegMRM, Format MemMRM,
+ SDNode opnode, bit CommutableRR,
+ bit ConvertibleToThreeAddress> {
+ let Uses = [EFLAGS], Defs = [EFLAGS] in {
+ let Constraints = "$src1 = $dst" in {
+ let isCommutable = CommutableRR in {
+ def NAME#8rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi8 , opnode>;
+ let isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
+ def NAME#16rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi16, opnode>;
+ def NAME#32rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi32, opnode>;
+ def NAME#64rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi64, opnode>;
+ } // isConvertibleToThreeAddress
+ } // isCommutable
+
+ def NAME#8rr_REV : BinOpRR_RFF_Rev<BaseOpc2, mnemonic, Xi8>;
+ def NAME#16rr_REV : BinOpRR_RFF_Rev<BaseOpc2, mnemonic, Xi16>;
+ def NAME#32rr_REV : BinOpRR_RFF_Rev<BaseOpc2, mnemonic, Xi32>;
+ def NAME#64rr_REV : BinOpRR_RFF_Rev<BaseOpc2, mnemonic, Xi64>;
+
+ def NAME#8rm : BinOpRM_RFF<BaseOpc2, mnemonic, Xi8 , opnode>;
+ def NAME#16rm : BinOpRM_RFF<BaseOpc2, mnemonic, Xi16, opnode>;
+ def NAME#32rm : BinOpRM_RFF<BaseOpc2, mnemonic, Xi32, opnode>;
+ def NAME#64rm : BinOpRM_RFF<BaseOpc2, mnemonic, Xi64, opnode>;
+
+ def NAME#8ri : BinOpRI_RFF<0x80, mnemonic, Xi8 , opnode, RegMRM>;
+
+ let isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
+ // NOTE: These are order specific, we want the ri8 forms to be listed
+ // first so that they are slightly preferred to the ri forms.
+ def NAME#16ri8 : BinOpRI8_RFF<0x82, mnemonic, Xi16, opnode, RegMRM>;
+ def NAME#32ri8 : BinOpRI8_RFF<0x82, mnemonic, Xi32, opnode, RegMRM>;
+ def NAME#64ri8 : BinOpRI8_RFF<0x82, mnemonic, Xi64, opnode, RegMRM>;
+
+ def NAME#16ri : BinOpRI_RFF<0x80, mnemonic, Xi16, opnode, RegMRM>;
+ def NAME#32ri : BinOpRI_RFF<0x80, mnemonic, Xi32, opnode, RegMRM>;
+ def NAME#64ri32: BinOpRI_RFF<0x80, mnemonic, Xi64, opnode, RegMRM>;
+ }
+ } // Constraints = "$src1 = $dst"
+
+ def NAME#8mr : BinOpMR_RMW_FF<BaseOpc, mnemonic, Xi8 , opnode>;
+ def NAME#16mr : BinOpMR_RMW_FF<BaseOpc, mnemonic, Xi16, opnode>;
+ def NAME#32mr : BinOpMR_RMW_FF<BaseOpc, mnemonic, Xi32, opnode>;
+ def NAME#64mr : BinOpMR_RMW_FF<BaseOpc, mnemonic, Xi64, opnode>;
+
+ // NOTE: These are order specific, we want the mi8 forms to be listed
+ // first so that they are slightly preferred to the mi forms.
+ def NAME#16mi8 : BinOpMI8_RMW_FF<mnemonic, Xi16, opnode, MemMRM>;
+ def NAME#32mi8 : BinOpMI8_RMW_FF<mnemonic, Xi32, opnode, MemMRM>;
+ def NAME#64mi8 : BinOpMI8_RMW_FF<mnemonic, Xi64, opnode, MemMRM>;
+
+ def NAME#8mi : BinOpMI_RMW_FF<0x80, mnemonic, Xi8 , opnode, MemMRM>;
+ def NAME#16mi : BinOpMI_RMW_FF<0x80, mnemonic, Xi16, opnode, MemMRM>;
+ def NAME#32mi : BinOpMI_RMW_FF<0x80, mnemonic, Xi32, opnode, MemMRM>;
+ def NAME#64mi32 : BinOpMI_RMW_FF<0x80, mnemonic, Xi64, opnode, MemMRM>;
+
+  // These are for the disassembler, since the 0x82 opcode behaves like 0x80
+  // in 32-bit mode but is invalid in 64-bit mode.
+ let Predicates = [Not64BitMode], isCodeGenOnly = 1, ForceDisassemble = 1,
+ hasSideEffects = 0 in {
+ let Constraints = "$src1 = $dst" in
+ def NAME#8ri8 : BinOpRI8_RFF<0x82, mnemonic, Xi8, null_frag, RegMRM>;
+ let mayLoad = 1, mayStore = 1 in
+ def NAME#8mi8 : BinOpMI8_RMW_FF<mnemonic, Xi8, null_frag, MemMRM>;
+ }
+ } // Uses = [EFLAGS], Defs = [EFLAGS]
+
+ def NAME#8i8 : BinOpAI_RFF<BaseOpc4, mnemonic, Xi8 , AL,
+ "{$src, %al|al, $src}">;
+ def NAME#16i16 : BinOpAI_RFF<BaseOpc4, mnemonic, Xi16, AX,
+ "{$src, %ax|ax, $src}">;
+ def NAME#32i32 : BinOpAI_RFF<BaseOpc4, mnemonic, Xi32, EAX,
+ "{$src, %eax|eax, $src}">;
+ def NAME#64i32 : BinOpAI_RFF<BaseOpc4, mnemonic, Xi64, RAX,
+ "{$src, %rax|rax, $src}">;
+}
+
+/// ArithBinOp_F - This is an arithmetic binary operator where the pattern is
+/// defined with "(set EFLAGS, (...". It would be really nice to find a way
+/// to factor this with the other ArithBinOp_*.
+///
+multiclass ArithBinOp_F<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
+ string mnemonic, Format RegMRM, Format MemMRM,
+ SDNode opnode,
+ bit CommutableRR, bit ConvertibleToThreeAddress> {
+ let Defs = [EFLAGS] in {
+ let isCommutable = CommutableRR in {
+ def NAME#8rr : BinOpRR_F<BaseOpc, mnemonic, Xi8 , opnode>;
+ let isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
+ def NAME#16rr : BinOpRR_F<BaseOpc, mnemonic, Xi16, opnode>;
+ def NAME#32rr : BinOpRR_F<BaseOpc, mnemonic, Xi32, opnode>;
+ def NAME#64rr : BinOpRR_F<BaseOpc, mnemonic, Xi64, opnode>;
+ }
+ } // isCommutable
+
+ def NAME#8rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi8>;
+ def NAME#16rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi16>;
+ def NAME#32rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi32>;
+ def NAME#64rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi64>;
+
+ def NAME#8rm : BinOpRM_F<BaseOpc2, mnemonic, Xi8 , opnode>;
+ def NAME#16rm : BinOpRM_F<BaseOpc2, mnemonic, Xi16, opnode>;
+ def NAME#32rm : BinOpRM_F<BaseOpc2, mnemonic, Xi32, opnode>;
+ def NAME#64rm : BinOpRM_F<BaseOpc2, mnemonic, Xi64, opnode>;
+
+ def NAME#8ri : BinOpRI_F<0x80, mnemonic, Xi8 , opnode, RegMRM>;
+
+ let isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
+ // NOTE: These are order specific, we want the ri8 forms to be listed
+ // first so that they are slightly preferred to the ri forms.
+ def NAME#16ri8 : BinOpRI8_F<0x82, mnemonic, Xi16, opnode, RegMRM>;
+ def NAME#32ri8 : BinOpRI8_F<0x82, mnemonic, Xi32, opnode, RegMRM>;
+ def NAME#64ri8 : BinOpRI8_F<0x82, mnemonic, Xi64, opnode, RegMRM>;
+
+ def NAME#16ri : BinOpRI_F<0x80, mnemonic, Xi16, opnode, RegMRM>;
+ def NAME#32ri : BinOpRI_F<0x80, mnemonic, Xi32, opnode, RegMRM>;
+ def NAME#64ri32: BinOpRI_F<0x80, mnemonic, Xi64, opnode, RegMRM>;
+ }
+
+ def NAME#8mr : BinOpMR_F<BaseOpc, mnemonic, Xi8 , opnode>;
+ def NAME#16mr : BinOpMR_F<BaseOpc, mnemonic, Xi16, opnode>;
+ def NAME#32mr : BinOpMR_F<BaseOpc, mnemonic, Xi32, opnode>;
+ def NAME#64mr : BinOpMR_F<BaseOpc, mnemonic, Xi64, opnode>;
+
+ // NOTE: These are order specific, we want the mi8 forms to be listed
+ // first so that they are slightly preferred to the mi forms.
+ def NAME#16mi8 : BinOpMI8_F<mnemonic, Xi16, opnode, MemMRM>;
+ def NAME#32mi8 : BinOpMI8_F<mnemonic, Xi32, opnode, MemMRM>;
+ def NAME#64mi8 : BinOpMI8_F<mnemonic, Xi64, opnode, MemMRM>;
+
+ def NAME#8mi : BinOpMI_F<0x80, mnemonic, Xi8 , opnode, MemMRM>;
+ def NAME#16mi : BinOpMI_F<0x80, mnemonic, Xi16, opnode, MemMRM>;
+ def NAME#32mi : BinOpMI_F<0x80, mnemonic, Xi32, opnode, MemMRM>;
+ def NAME#64mi32 : BinOpMI_F<0x80, mnemonic, Xi64, opnode, MemMRM>;
+
+  // These are for the disassembler, since the 0x82 opcode behaves like 0x80
+  // in 32-bit mode but is invalid in 64-bit mode.
+ let Predicates = [Not64BitMode], isCodeGenOnly = 1, ForceDisassemble = 1,
+ hasSideEffects = 0 in {
+ def NAME#8ri8 : BinOpRI8_F<0x82, mnemonic, Xi8, null_frag, RegMRM>;
+ let mayLoad = 1 in
+ def NAME#8mi8 : BinOpMI8_F<mnemonic, Xi8, null_frag, MemMRM>;
+ }
+ } // Defs = [EFLAGS]
+
+ def NAME#8i8 : BinOpAI_F<BaseOpc4, mnemonic, Xi8 , AL,
+ "{$src, %al|al, $src}">;
+ def NAME#16i16 : BinOpAI_F<BaseOpc4, mnemonic, Xi16, AX,
+ "{$src, %ax|ax, $src}">;
+ def NAME#32i32 : BinOpAI_F<BaseOpc4, mnemonic, Xi32, EAX,
+ "{$src, %eax|eax, $src}">;
+ def NAME#64i32 : BinOpAI_F<BaseOpc4, mnemonic, Xi64, RAX,
+ "{$src, %rax|rax, $src}">;
+}
+
+
+defm AND : ArithBinOp_RF<0x20, 0x22, 0x24, "and", MRM4r, MRM4m,
+ X86and_flag, and, 1, 0>;
+defm OR : ArithBinOp_RF<0x08, 0x0A, 0x0C, "or", MRM1r, MRM1m,
+ X86or_flag, or, 1, 0>;
+defm XOR : ArithBinOp_RF<0x30, 0x32, 0x34, "xor", MRM6r, MRM6m,
+ X86xor_flag, xor, 1, 0>;
+defm ADD : ArithBinOp_RF<0x00, 0x02, 0x04, "add", MRM0r, MRM0m,
+ X86add_flag, add, 1, 1>;
+let isCompare = 1 in {
+defm SUB : ArithBinOp_RF<0x28, 0x2A, 0x2C, "sub", MRM5r, MRM5m,
+ X86sub_flag, sub, 0, 0>;
+}
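+
+// As a sketch of what one instantiation produces: the "defm ADD" above
+// expands to records such as ADD8rr, ADD32rr, ADD32rr_REV, ADD32rm, ADD32ri,
+// ADD32ri8, ADD32mr, ADD32mi, ADD32mi8 and ADD32i32, one per operand kind and
+// width.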
+
+// Arithmetic.
+defm ADC : ArithBinOp_RFF<0x10, 0x12, 0x14, "adc", MRM2r, MRM2m, X86adc_flag,
+ 1, 0>;
+defm SBB : ArithBinOp_RFF<0x18, 0x1A, 0x1C, "sbb", MRM3r, MRM3m, X86sbb_flag,
+ 0, 0>;
+
+let isCompare = 1 in {
+defm CMP : ArithBinOp_F<0x38, 0x3A, 0x3C, "cmp", MRM7r, MRM7m, X86cmp, 0, 0>;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Semantically, test instructions are similar to AND, except they don't
+// generate a result. From an encoding perspective, they are very different:
+// they don't have all the usual imm8 and REV forms, and are encoded into a
+// different space.
+def X86testpat : PatFrag<(ops node:$lhs, node:$rhs),
+ (X86cmp (and_su node:$lhs, node:$rhs), 0)>;
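+
+// That is, "test" is matched as a compare of (lhs & rhs) against zero, where
+// the and_su fragment (defined elsewhere) restricts the AND to a single use
+// so its result can be discarded; e.g. (illustrative only)
+// "testl %eax, %eax" sets ZF iff EAX == 0 without writing a result register.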
+
+let isCompare = 1 in {
+ let Defs = [EFLAGS] in {
+ let isCommutable = 1 in {
+ def TEST8rr : BinOpRR_F<0x84, "test", Xi8 , X86testpat>;
+ def TEST16rr : BinOpRR_F<0x84, "test", Xi16, X86testpat>;
+ def TEST32rr : BinOpRR_F<0x84, "test", Xi32, X86testpat>;
+ def TEST64rr : BinOpRR_F<0x84, "test", Xi64, X86testpat>;
+ } // isCommutable
+
+ def TEST8rm : BinOpRM_F<0x84, "test", Xi8 , X86testpat>;
+ def TEST16rm : BinOpRM_F<0x84, "test", Xi16, X86testpat>;
+ def TEST32rm : BinOpRM_F<0x84, "test", Xi32, X86testpat>;
+ def TEST64rm : BinOpRM_F<0x84, "test", Xi64, X86testpat>;
+
+ def TEST8ri : BinOpRI_F<0xF6, "test", Xi8 , X86testpat, MRM0r>;
+ def TEST16ri : BinOpRI_F<0xF6, "test", Xi16, X86testpat, MRM0r>;
+ def TEST32ri : BinOpRI_F<0xF6, "test", Xi32, X86testpat, MRM0r>;
+ def TEST64ri32 : BinOpRI_F<0xF6, "test", Xi64, X86testpat, MRM0r>;
+
+ def TEST8mi : BinOpMI_F<0xF6, "test", Xi8 , X86testpat, MRM0m>;
+ def TEST16mi : BinOpMI_F<0xF6, "test", Xi16, X86testpat, MRM0m>;
+ def TEST32mi : BinOpMI_F<0xF6, "test", Xi32, X86testpat, MRM0m>;
+ def TEST64mi32 : BinOpMI_F<0xF6, "test", Xi64, X86testpat, MRM0m>;
+
+ // When testing the result of EXTRACT_SUBREG sub_8bit_hi, make sure the
+ // register class is constrained to GR8_NOREX. This pseudo is explicitly
+ // marked side-effect free, since it doesn't have an isel pattern like
+ // other test instructions.
+ let isPseudo = 1, hasSideEffects = 0 in
+ def TEST8ri_NOREX : I<0, Pseudo, (outs), (ins GR8_NOREX:$src, i8imm:$mask),
+ "", [], IIC_BIN_NONMEM>, Sched<[WriteALU]>;
+ } // Defs = [EFLAGS]
+
+ def TEST8i8 : BinOpAI_F<0xA8, "test", Xi8 , AL,
+ "{$src, %al|al, $src}">;
+ def TEST16i16 : BinOpAI_F<0xA8, "test", Xi16, AX,
+ "{$src, %ax|ax, $src}">;
+ def TEST32i32 : BinOpAI_F<0xA8, "test", Xi32, EAX,
+ "{$src, %eax|eax, $src}">;
+ def TEST64i32 : BinOpAI_F<0xA8, "test", Xi64, RAX,
+ "{$src, %rax|rax, $src}">;
+} // isCompare
+
+//===----------------------------------------------------------------------===//
+// ANDN Instruction
+//
+multiclass bmi_andn<string mnemonic, RegisterClass RC, X86MemOperand x86memop,
+ PatFrag ld_frag> {
+ def rr : I<0xF2, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+ !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst, EFLAGS, (X86and_flag (not RC:$src1), RC:$src2))],
+ IIC_BIN_NONMEM>, Sched<[WriteALU]>;
+ def rm : I<0xF2, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+ !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst, EFLAGS,
+ (X86and_flag (not RC:$src1), (ld_frag addr:$src2)))], IIC_BIN_MEM>,
+ Sched<[WriteALULd, ReadAfterLd]>;
+}
+
+let Predicates = [HasBMI], Defs = [EFLAGS] in {
+ defm ANDN32 : bmi_andn<"andn{l}", GR32, i32mem, loadi32>, T8PS, VEX_4V;
+ defm ANDN64 : bmi_andn<"andn{q}", GR64, i64mem, loadi64>, T8PS, VEX_4V, VEX_W;
+}
+
+let Predicates = [HasBMI] in {
+ def : Pat<(and (not GR32:$src1), GR32:$src2),
+ (ANDN32rr GR32:$src1, GR32:$src2)>;
+ def : Pat<(and (not GR64:$src1), GR64:$src2),
+ (ANDN64rr GR64:$src1, GR64:$src2)>;
+ def : Pat<(and (not GR32:$src1), (loadi32 addr:$src2)),
+ (ANDN32rm GR32:$src1, addr:$src2)>;
+ def : Pat<(and (not GR64:$src1), (loadi64 addr:$src2)),
+ (ANDN64rm GR64:$src1, addr:$src2)>;
+}
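+
+// ANDN operand order, for reference (illustrative only): in AT&T syntax
+// "andnl %ebx, %ecx, %eax" computes EAX = ~ECX & EBX, i.e. the $src1
+// (VEX.vvvv) operand is the one that gets inverted.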
+
+//===----------------------------------------------------------------------===//
+// MULX Instruction
+//
+multiclass bmi_mulx<string mnemonic, RegisterClass RC, X86MemOperand x86memop> {
+let hasSideEffects = 0 in {
+ let isCommutable = 1 in
+ def rr : I<0xF6, MRMSrcReg, (outs RC:$dst1, RC:$dst2), (ins RC:$src),
+ !strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"),
+ [], IIC_MUL8>, T8XD, VEX_4V, Sched<[WriteIMul, WriteIMulH]>;
+
+ let mayLoad = 1 in
+ def rm : I<0xF6, MRMSrcMem, (outs RC:$dst1, RC:$dst2), (ins x86memop:$src),
+ !strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"),
+ [], IIC_MUL8>, T8XD, VEX_4V, Sched<[WriteIMulLd, WriteIMulH]>;
+}
+}
+
+let Predicates = [HasBMI2] in {
+ let Uses = [EDX] in
+ defm MULX32 : bmi_mulx<"mulx{l}", GR32, i32mem>;
+ let Uses = [RDX] in
+ defm MULX64 : bmi_mulx<"mulx{q}", GR64, i64mem>, VEX_W;
+}
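+
+// MULX, for reference: it multiplies the implicit EDX (or RDX) by the source
+// operand and writes the double-width product to the two destination
+// registers (high and low halves), without reading or writing EFLAGS -- hence
+// the empty patterns and hasSideEffects = 0 above.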
+
+//===----------------------------------------------------------------------===//
+// ADCX Instruction
+//
+let Predicates = [HasADX], Defs = [EFLAGS], Uses = [EFLAGS],
+ Constraints = "$src0 = $dst", AddedComplexity = 10 in {
+ let SchedRW = [WriteALU] in {
+ def ADCX32rr : I<0xF6, MRMSrcReg, (outs GR32:$dst),
+ (ins GR32:$src0, GR32:$src), "adcx{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, EFLAGS,
+ (X86adc_flag GR32:$src0, GR32:$src, EFLAGS))],
+ IIC_BIN_CARRY_NONMEM>, T8PD;
+ def ADCX64rr : RI<0xF6, MRMSrcReg, (outs GR64:$dst),
+ (ins GR64:$src0, GR64:$src), "adcx{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, EFLAGS,
+ (X86adc_flag GR64:$src0, GR64:$src, EFLAGS))],
+ IIC_BIN_CARRY_NONMEM>, T8PD;
+ } // SchedRW
+
+ let mayLoad = 1, SchedRW = [WriteALULd] in {
+ def ADCX32rm : I<0xF6, MRMSrcMem, (outs GR32:$dst),
+ (ins GR32:$src0, i32mem:$src), "adcx{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, EFLAGS,
+ (X86adc_flag GR32:$src0, (loadi32 addr:$src), EFLAGS))],
+ IIC_BIN_CARRY_MEM>, T8PD;
+
+ def ADCX64rm : RI<0xF6, MRMSrcMem, (outs GR64:$dst),
+ (ins GR64:$src0, i64mem:$src), "adcx{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, EFLAGS,
+ (X86adc_flag GR64:$src0, (loadi64 addr:$src), EFLAGS))],
+ IIC_BIN_CARRY_MEM>, T8PD;
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// ADOX Instruction
+//
+let Predicates = [HasADX], hasSideEffects = 0, Defs = [EFLAGS],
+ Uses = [EFLAGS] in {
+ let SchedRW = [WriteALU] in {
+ def ADOX32rr : I<0xF6, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+ "adox{l}\t{$src, $dst|$dst, $src}", [], IIC_BIN_NONMEM>, T8XS;
+
+ def ADOX64rr : RI<0xF6, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+ "adox{q}\t{$src, $dst|$dst, $src}", [], IIC_BIN_NONMEM>, T8XS;
+ } // SchedRW
+
+ let mayLoad = 1, SchedRW = [WriteALULd] in {
+ def ADOX32rm : I<0xF6, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+ "adox{l}\t{$src, $dst|$dst, $src}", [], IIC_BIN_MEM>, T8XS;
+
+ def ADOX64rm : RI<0xF6, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+ "adox{q}\t{$src, $dst|$dst, $src}", [], IIC_BIN_MEM>, T8XS;
+ }
+}
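+
+// ADCX and ADOX both perform add-with-carry, but ADCX reads/writes only CF
+// while ADOX reads/writes only OF, so two independent carry chains (e.g. in
+// multi-precision multiplication) can be interleaved without serializing on a
+// single flag.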
diff --git a/contrib/llvm/lib/Target/X86/X86InstrBuilder.h b/contrib/llvm/lib/Target/X86/X86InstrBuilder.h
new file mode 100644
index 0000000..787f15b
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrBuilder.h
@@ -0,0 +1,183 @@
+//===-- X86InstrBuilder.h - Functions to aid building x86 insts -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file exposes functions that may be used with BuildMI from the
+// MachineInstrBuilder.h file to handle X86'isms in a clean way.
+//
+// The BuildMem function may be used with the BuildMI function to add entire
+// memory references in a single, typed, function call. X86 memory references
+// can be very complex expressions (described in the README), so wrapping them
+// up behind an easier to use interface makes sense. Descriptions of the
+// functions are included below.
+//
+// For reference, the order of operands for memory references is:
+// (Operand), Base, Scale, Index, Displacement.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86INSTRBUILDER_H
+#define LLVM_LIB_TARGET_X86_X86INSTRBUILDER_H
+
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+
+namespace llvm {
+
+/// X86AddressMode - This struct holds a generalized full x86 address mode.
+/// The base register can be a frame index, which will eventually be replaced
+/// with BP or SP, with Disp adjusted accordingly. The displacement may
+/// also include the offset of a global value.
+struct X86AddressMode {
+ enum {
+ RegBase,
+ FrameIndexBase
+ } BaseType;
+
+ union {
+ unsigned Reg;
+ int FrameIndex;
+ } Base;
+
+ unsigned Scale;
+ unsigned IndexReg;
+ int Disp;
+ const GlobalValue *GV;
+ unsigned GVOpFlags;
+
+ X86AddressMode()
+ : BaseType(RegBase), Scale(1), IndexReg(0), Disp(0), GV(nullptr),
+ GVOpFlags(0) {
+ Base.Reg = 0;
+ }
+
+
+ void getFullAddress(SmallVectorImpl<MachineOperand> &MO) {
+ assert(Scale == 1 || Scale == 2 || Scale == 4 || Scale == 8);
+
+ if (BaseType == X86AddressMode::RegBase)
+ MO.push_back(MachineOperand::CreateReg(Base.Reg, false, false,
+ false, false, false, 0, false));
+ else {
+ assert(BaseType == X86AddressMode::FrameIndexBase);
+ MO.push_back(MachineOperand::CreateFI(Base.FrameIndex));
+ }
+
+ MO.push_back(MachineOperand::CreateImm(Scale));
+ MO.push_back(MachineOperand::CreateReg(IndexReg, false, false,
+ false, false, false, 0, false));
+
+ if (GV)
+ MO.push_back(MachineOperand::CreateGA(GV, Disp, GVOpFlags));
+ else
+ MO.push_back(MachineOperand::CreateImm(Disp));
+
+ MO.push_back(MachineOperand::CreateReg(0, false, false,
+ false, false, false, 0, false));
+ }
+};
+
+/// addDirectMem - This function is used to add a direct memory reference to the
+/// current instruction -- that is, a dereference of an address in a register,
+/// with no scale, index or displacement. An example is: DWORD PTR [EAX].
+///
+static inline const MachineInstrBuilder &
+addDirectMem(const MachineInstrBuilder &MIB, unsigned Reg) {
+ // Because memory references are always represented with five
+ // values, this adds: Reg, 1, NoReg, 0, NoReg to the instruction.
+ return MIB.addReg(Reg).addImm(1).addReg(0).addImm(0).addReg(0);
+}
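+
+// Example use (illustrative only; the opcode and registers are hypothetical):
+//   // Load 32 bits from [BaseReg] into DestReg.
+//   addDirectMem(BuildMI(MBB, MI, DL, TII.get(X86::MOV32rm), DestReg),
+//                BaseReg);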
+
+
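+/// addOffset - This function appends the remaining pieces of a memory
+/// reference (scale 1, no index register, displacement Offset, no segment)
+/// to an instruction whose base register or frame index has already been
+/// added.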
+static inline const MachineInstrBuilder &
+addOffset(const MachineInstrBuilder &MIB, int Offset) {
+ return MIB.addImm(1).addReg(0).addImm(Offset).addReg(0);
+}
+
+/// addRegOffset - This function is used to add a memory reference of the form
+/// [Reg + Offset], i.e., one with no scale or index, but with a
+/// displacement. An example is: DWORD PTR [EAX + 4].
+///
+static inline const MachineInstrBuilder &
+addRegOffset(const MachineInstrBuilder &MIB,
+ unsigned Reg, bool isKill, int Offset) {
+ return addOffset(MIB.addReg(Reg, getKillRegState(isKill)), Offset);
+}
+
+/// addRegReg - This function is used to add a memory reference of the form:
+/// [Reg + Reg].
+static inline const MachineInstrBuilder &addRegReg(const MachineInstrBuilder &MIB,
+ unsigned Reg1, bool isKill1,
+ unsigned Reg2, bool isKill2) {
+ return MIB.addReg(Reg1, getKillRegState(isKill1)).addImm(1)
+ .addReg(Reg2, getKillRegState(isKill2)).addImm(0).addReg(0);
+}
+
+static inline const MachineInstrBuilder &
+addFullAddress(const MachineInstrBuilder &MIB,
+ const X86AddressMode &AM) {
+ assert(AM.Scale == 1 || AM.Scale == 2 || AM.Scale == 4 || AM.Scale == 8);
+
+ if (AM.BaseType == X86AddressMode::RegBase)
+ MIB.addReg(AM.Base.Reg);
+ else {
+ assert(AM.BaseType == X86AddressMode::FrameIndexBase);
+ MIB.addFrameIndex(AM.Base.FrameIndex);
+ }
+
+ MIB.addImm(AM.Scale).addReg(AM.IndexReg);
+ if (AM.GV)
+ MIB.addGlobalAddress(AM.GV, AM.Disp, AM.GVOpFlags);
+ else
+ MIB.addImm(AM.Disp);
+
+ return MIB.addReg(0);
+}
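+
+// Example use (illustrative only; the register and offset values are
+// hypothetical):
+//   X86AddressMode AM;            // build [EAX + 4*ECX + 8]
+//   AM.Base.Reg = X86::EAX;
+//   AM.Scale = 4;
+//   AM.IndexReg = X86::ECX;
+//   AM.Disp = 8;
+//   addFullAddress(BuildMI(MBB, MI, DL, TII.get(X86::LEA32r), DestReg), AM);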
+
+/// addFrameReference - This function is used to add a reference to the base of
+/// an abstract object on the stack frame of the current function. The
+/// reference keeps the FrameIndex as its base register until it is resolved,
+/// and a constant offset can be specified as well.
+///
+static inline const MachineInstrBuilder &
+addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset = 0) {
+ MachineInstr *MI = MIB;
+ MachineFunction &MF = *MI->getParent()->getParent();
+ MachineFrameInfo &MFI = *MF.getFrameInfo();
+ const MCInstrDesc &MCID = MI->getDesc();
+ unsigned Flags = 0;
+ if (MCID.mayLoad())
+ Flags |= MachineMemOperand::MOLoad;
+ if (MCID.mayStore())
+ Flags |= MachineMemOperand::MOStore;
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(MF, FI, Offset), Flags,
+ MFI.getObjectSize(FI), MFI.getObjectAlignment(FI));
+ return addOffset(MIB.addFrameIndex(FI), Offset)
+ .addMemOperand(MMO);
+}
+
+/// addConstantPoolReference - This function is used to add a reference to the
+/// base of a constant value spilled to the per-function constant pool. The
+/// reference uses the abstract ConstantPoolIndex which is retained until
+/// either machine code emission or assembly output. In PIC mode on x86-32,
+/// the GlobalBaseReg parameter can be used to make this a
+/// GlobalBaseReg-relative reference.
+///
+static inline const MachineInstrBuilder &
+addConstantPoolReference(const MachineInstrBuilder &MIB, unsigned CPI,
+ unsigned GlobalBaseReg, unsigned char OpFlags) {
+ //FIXME: factor this
+ return MIB.addReg(GlobalBaseReg).addImm(1).addReg(0)
+ .addConstantPoolIndex(CPI, 0, OpFlags).addReg(0);
+}
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86InstrCMovSetCC.td b/contrib/llvm/lib/Target/X86/X86InstrCMovSetCC.td
new file mode 100644
index 0000000..c73c950
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrCMovSetCC.td
@@ -0,0 +1,112 @@
+//===-- X86InstrCMovSetCC.td - Conditional Move and SetCC --*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 conditional move and set on condition
+// instructions.
+//
+//===----------------------------------------------------------------------===//
+
+
+// CMOV instructions.
+multiclass CMOV<bits<8> opc, string Mnemonic, PatLeaf CondNode> {
+ let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst",
+ isCommutable = 1, SchedRW = [WriteALU] in {
+ def NAME#16rr
+ : I<opc, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+ !strconcat(Mnemonic, "{w}\t{$src2, $dst|$dst, $src2}"),
+ [(set GR16:$dst,
+ (X86cmov GR16:$src1, GR16:$src2, CondNode, EFLAGS))],
+ IIC_CMOV16_RR>, TB, OpSize16;
+ def NAME#32rr
+ : I<opc, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+ !strconcat(Mnemonic, "{l}\t{$src2, $dst|$dst, $src2}"),
+ [(set GR32:$dst,
+ (X86cmov GR32:$src1, GR32:$src2, CondNode, EFLAGS))],
+ IIC_CMOV32_RR>, TB, OpSize32;
+ def NAME#64rr
+ :RI<opc, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ !strconcat(Mnemonic, "{q}\t{$src2, $dst|$dst, $src2}"),
+ [(set GR64:$dst,
+ (X86cmov GR64:$src1, GR64:$src2, CondNode, EFLAGS))],
+ IIC_CMOV32_RR>, TB;
+ }
+
+ let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst",
+ SchedRW = [WriteALULd, ReadAfterLd] in {
+ def NAME#16rm
+ : I<opc, MRMSrcMem, (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
+ !strconcat(Mnemonic, "{w}\t{$src2, $dst|$dst, $src2}"),
+ [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+ CondNode, EFLAGS))], IIC_CMOV16_RM>,
+ TB, OpSize16;
+ def NAME#32rm
+ : I<opc, MRMSrcMem, (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+ !strconcat(Mnemonic, "{l}\t{$src2, $dst|$dst, $src2}"),
+ [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+ CondNode, EFLAGS))], IIC_CMOV32_RM>,
+ TB, OpSize32;
+ def NAME#64rm
+ :RI<opc, MRMSrcMem, (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
+ !strconcat(Mnemonic, "{q}\t{$src2, $dst|$dst, $src2}"),
+ [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+ CondNode, EFLAGS))], IIC_CMOV32_RM>, TB;
+ } // Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst"
+} // end multiclass
+
+
+// Conditional Moves.
+defm CMOVO : CMOV<0x40, "cmovo" , X86_COND_O>;
+defm CMOVNO : CMOV<0x41, "cmovno", X86_COND_NO>;
+defm CMOVB : CMOV<0x42, "cmovb" , X86_COND_B>;
+defm CMOVAE : CMOV<0x43, "cmovae", X86_COND_AE>;
+defm CMOVE : CMOV<0x44, "cmove" , X86_COND_E>;
+defm CMOVNE : CMOV<0x45, "cmovne", X86_COND_NE>;
+defm CMOVBE : CMOV<0x46, "cmovbe", X86_COND_BE>;
+defm CMOVA : CMOV<0x47, "cmova" , X86_COND_A>;
+defm CMOVS : CMOV<0x48, "cmovs" , X86_COND_S>;
+defm CMOVNS : CMOV<0x49, "cmovns", X86_COND_NS>;
+defm CMOVP : CMOV<0x4A, "cmovp" , X86_COND_P>;
+defm CMOVNP : CMOV<0x4B, "cmovnp", X86_COND_NP>;
+defm CMOVL : CMOV<0x4C, "cmovl" , X86_COND_L>;
+defm CMOVGE : CMOV<0x4D, "cmovge", X86_COND_GE>;
+defm CMOVLE : CMOV<0x4E, "cmovle", X86_COND_LE>;
+defm CMOVG : CMOV<0x4F, "cmovg" , X86_COND_G>;
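+
+// For reference (illustrative only): "cmovel %ebx, %eax" copies EBX into EAX
+// when ZF is set and leaves EAX unchanged otherwise. Note that the rm forms
+// read memory unconditionally, which is why folding a plain load into the
+// X86cmov pattern is safe.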
+
+
+// SetCC instructions.
+multiclass SETCC<bits<8> opc, string Mnemonic, PatLeaf OpNode> {
+ let Uses = [EFLAGS] in {
+ def r : I<opc, MRMXr, (outs GR8:$dst), (ins),
+ !strconcat(Mnemonic, "\t$dst"),
+ [(set GR8:$dst, (X86setcc OpNode, EFLAGS))],
+ IIC_SET_R>, TB, Sched<[WriteALU]>;
+ def m : I<opc, MRMXm, (outs), (ins i8mem:$dst),
+ !strconcat(Mnemonic, "\t$dst"),
+ [(store (X86setcc OpNode, EFLAGS), addr:$dst)],
+ IIC_SET_M>, TB, Sched<[WriteALU, WriteStore]>;
+ } // Uses = [EFLAGS]
+}
+
+defm SETO : SETCC<0x90, "seto", X86_COND_O>; // is overflow bit set
+defm SETNO : SETCC<0x91, "setno", X86_COND_NO>; // is overflow bit not set
+defm SETB : SETCC<0x92, "setb", X86_COND_B>; // unsigned less than
+defm SETAE : SETCC<0x93, "setae", X86_COND_AE>; // unsigned greater or equal
+defm SETE : SETCC<0x94, "sete", X86_COND_E>; // equal to
+defm SETNE : SETCC<0x95, "setne", X86_COND_NE>; // not equal to
+defm SETBE : SETCC<0x96, "setbe", X86_COND_BE>; // unsigned less than or equal
+defm SETA : SETCC<0x97, "seta", X86_COND_A>; // unsigned greater than
+defm SETS : SETCC<0x98, "sets", X86_COND_S>; // is signed bit set
+defm SETNS : SETCC<0x99, "setns", X86_COND_NS>; // is not signed
+defm SETP : SETCC<0x9A, "setp", X86_COND_P>; // is parity bit set
+defm SETNP : SETCC<0x9B, "setnp", X86_COND_NP>; // is parity bit not set
+defm SETL : SETCC<0x9C, "setl", X86_COND_L>; // signed less than
+defm SETGE : SETCC<0x9D, "setge", X86_COND_GE>; // signed greater or equal
+defm SETLE : SETCC<0x9E, "setle", X86_COND_LE>; // signed less than or equal
+defm SETG : SETCC<0x9F, "setg", X86_COND_G>; // signed greater than
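+
+// For reference (illustrative only): "sete %al" writes 1 to AL when ZF is set
+// and 0 otherwise; the 'm' forms store the same byte to memory instead.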
+
diff --git a/contrib/llvm/lib/Target/X86/X86InstrCompiler.td b/contrib/llvm/lib/Target/X86/X86InstrCompiler.td
new file mode 100644
index 0000000..96a29ca
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -0,0 +1,1864 @@
+//===- X86InstrCompiler.td - Compiler Pseudos and Patterns -*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the various pseudo instructions used by the compiler,
+// as well as Pat patterns used during instruction selection.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Pattern Matching Support
+
+def GetLo32XForm : SDNodeXForm<imm, [{
+ // Transformation function: get the low 32 bits.
+ return getI32Imm((unsigned)N->getZExtValue(), SDLoc(N));
+}]>;
+
+def GetLo8XForm : SDNodeXForm<imm, [{
+ // Transformation function: get the low 8 bits.
+ return getI8Imm((uint8_t)N->getZExtValue(), SDLoc(N));
+}]>;
+
+
+//===----------------------------------------------------------------------===//
+// Random Pseudo Instructions.
+
+// PIC base construction. This expands to code that looks like this:
+// call $next_inst
+//      popl %destreg
+let hasSideEffects = 0, isNotDuplicable = 1, Uses = [ESP] in
+ def MOVPC32r : Ii32<0xE8, Pseudo, (outs GR32:$reg), (ins i32imm:$label),
+ "", []>;
+
+
+// ADJCALLSTACKDOWN/UP implicitly use/def ESP because they may be expanded into
+// a stack adjustment and the codegen must know that they may modify the stack
+// pointer before prolog-epilog rewriting occurs.
+// Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
+// sub / add which can clobber EFLAGS.
+let Defs = [ESP, EFLAGS], Uses = [ESP] in {
+def ADJCALLSTACKDOWN32 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
+ "#ADJCALLSTACKDOWN",
+ []>,
+ Requires<[NotLP64]>;
+def ADJCALLSTACKUP32 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
+ "#ADJCALLSTACKUP",
+ [(X86callseq_end timm:$amt1, timm:$amt2)]>,
+ Requires<[NotLP64]>;
+}
+def : Pat<(X86callseq_start timm:$amt1),
+ (ADJCALLSTACKDOWN32 i32imm:$amt1, 0)>, Requires<[NotLP64]>;
+
+
+// ADJCALLSTACKDOWN/UP implicitly use/def RSP because they may be expanded into
+// a stack adjustment and the codegen must know that they may modify the stack
+// pointer before prolog-epilog rewriting occurs.
+// Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
+// sub / add which can clobber EFLAGS.
+let Defs = [RSP, EFLAGS], Uses = [RSP] in {
+def ADJCALLSTACKDOWN64 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
+ "#ADJCALLSTACKDOWN",
+ []>,
+ Requires<[IsLP64]>;
+def ADJCALLSTACKUP64 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
+ "#ADJCALLSTACKUP",
+ [(X86callseq_end timm:$amt1, timm:$amt2)]>,
+ Requires<[IsLP64]>;
+}
+def : Pat<(X86callseq_start timm:$amt1),
+ (ADJCALLSTACKDOWN64 i32imm:$amt1, 0)>, Requires<[IsLP64]>;
+
+
+// x86-64 va_start lowering magic.
+let usesCustomInserter = 1, Defs = [EFLAGS] in {
+def VASTART_SAVE_XMM_REGS : I<0, Pseudo,
+ (outs),
+ (ins GR8:$al,
+ i64imm:$regsavefi, i64imm:$offset,
+ variable_ops),
+ "#VASTART_SAVE_XMM_REGS $al, $regsavefi, $offset",
+ [(X86vastart_save_xmm_regs GR8:$al,
+ imm:$regsavefi,
+ imm:$offset),
+ (implicit EFLAGS)]>;
+
+// The VAARG_64 pseudo-instruction takes the address of the va_list,
+// and places the address of the next argument into a register.
+let Defs = [EFLAGS] in
+def VAARG_64 : I<0, Pseudo,
+ (outs GR64:$dst),
+ (ins i8mem:$ap, i32imm:$size, i8imm:$mode, i32imm:$align),
+ "#VAARG_64 $dst, $ap, $size, $mode, $align",
+ [(set GR64:$dst,
+ (X86vaarg64 addr:$ap, imm:$size, imm:$mode, imm:$align)),
+ (implicit EFLAGS)]>;
+
+// Dynamic stack allocation yields a _chkstk or _alloca call for all Windows
+// targets. These calls are needed to probe the stack when allocating more
+// than 4K bytes in one go. Touching the stack at 4K increments is necessary
+// to ensure that the guard pages used by the OS virtual memory manager are
+// allocated in the correct sequence.
+// The main point of having a separate instruction is the extra unmodelled
+// effects it has compared to an ordinary call, such as the change to the
+// stack pointer.
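+//
+// A rough sketch of the expansion (the exact probe helper depends on the
+// target environment, e.g. _chkstk for MSVC targets):
+//     movl  $size, %eax
+//     calll _chkstk        // on win32 the helper also adjusts ESP itself;
+//                          // on win64 it only probes, and the caller then
+//                          // subtracts the size from RSP.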
+
+let Defs = [EAX, ESP, EFLAGS], Uses = [ESP] in
+ def WIN_ALLOCA : I<0, Pseudo, (outs), (ins),
+ "# dynamic stack allocation",
+ [(X86WinAlloca)]>;
+
+// When using segmented stacks these are lowered into instructions which first
+// check if the current stacklet has enough free memory. If it does, memory is
+// allocated by bumping the stack pointer. Otherwise memory is allocated from
+// the heap.
+
+let Defs = [EAX, ESP, EFLAGS], Uses = [ESP] in
+def SEG_ALLOCA_32 : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$size),
+ "# variable sized alloca for segmented stacks",
+ [(set GR32:$dst,
+ (X86SegAlloca GR32:$size))]>,
+ Requires<[NotLP64]>;
+
+let Defs = [RAX, RSP, EFLAGS], Uses = [RSP] in
+def SEG_ALLOCA_64 : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$size),
+ "# variable sized alloca for segmented stacks",
+ [(set GR64:$dst,
+ (X86SegAlloca GR64:$size))]>,
+ Requires<[In64BitMode]>;
+}
+
+//===----------------------------------------------------------------------===//
+// EH Pseudo Instructions
+//
+let SchedRW = [WriteSystem] in {
+let isTerminator = 1, isReturn = 1, isBarrier = 1,
+ hasCtrlDep = 1, isCodeGenOnly = 1 in {
+def EH_RETURN : I<0xC3, RawFrm, (outs), (ins GR32:$addr),
+ "ret\t#eh_return, addr: $addr",
+ [(X86ehret GR32:$addr)], IIC_RET>, Sched<[WriteJumpLd]>;
+
+}
+
+let isTerminator = 1, isReturn = 1, isBarrier = 1,
+ hasCtrlDep = 1, isCodeGenOnly = 1 in {
+def EH_RETURN64 : I<0xC3, RawFrm, (outs), (ins GR64:$addr),
+ "ret\t#eh_return, addr: $addr",
+ [(X86ehret GR64:$addr)], IIC_RET>, Sched<[WriteJumpLd]>;
+
+}
+
+let isTerminator = 1, hasSideEffects = 1, isBarrier = 1, hasCtrlDep = 1,
+ isCodeGenOnly = 1, isReturn = 1 in {
+ def CLEANUPRET : I<0, Pseudo, (outs), (ins), "# CLEANUPRET", [(cleanupret)]>;
+
+ // CATCHRET needs a custom inserter for SEH.
+ let usesCustomInserter = 1 in
+ def CATCHRET : I<0, Pseudo, (outs), (ins brtarget32:$dst, brtarget32:$from),
+ "# CATCHRET",
+ [(catchret bb:$dst, bb:$from)]>;
+}
+
+let hasSideEffects = 1, hasCtrlDep = 1, isCodeGenOnly = 1,
+ usesCustomInserter = 1 in
+def CATCHPAD : I<0, Pseudo, (outs), (ins), "# CATCHPAD", [(catchpad)]>;
+
+// This instruction is responsible for re-establishing stack pointers after an
+// exception has been caught and we are rejoining normal control flow in the
+// parent function or funclet. It generally sets ESP and EBP, and optionally
+// ESI. It is only needed for 32-bit WinEH, as the runtime restores CSRs for us
+// elsewhere.
+let hasSideEffects = 1, hasCtrlDep = 1, isCodeGenOnly = 1 in
+def EH_RESTORE : I<0, Pseudo, (outs), (ins), "# EH_RESTORE", []>;
+
+let hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1,
+ usesCustomInserter = 1 in {
+ def EH_SjLj_SetJmp32 : I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$buf),
+ "#EH_SJLJ_SETJMP32",
+ [(set GR32:$dst, (X86eh_sjlj_setjmp addr:$buf))]>,
+ Requires<[Not64BitMode]>;
+ def EH_SjLj_SetJmp64 : I<0, Pseudo, (outs GR32:$dst), (ins i64mem:$buf),
+ "#EH_SJLJ_SETJMP64",
+ [(set GR32:$dst, (X86eh_sjlj_setjmp addr:$buf))]>,
+ Requires<[In64BitMode]>;
+ let isTerminator = 1 in {
+ def EH_SjLj_LongJmp32 : I<0, Pseudo, (outs), (ins i32mem:$buf),
+ "#EH_SJLJ_LONGJMP32",
+ [(X86eh_sjlj_longjmp addr:$buf)]>,
+ Requires<[Not64BitMode]>;
+ def EH_SjLj_LongJmp64 : I<0, Pseudo, (outs), (ins i64mem:$buf),
+ "#EH_SJLJ_LONGJMP64",
+ [(X86eh_sjlj_longjmp addr:$buf)]>,
+ Requires<[In64BitMode]>;
+ }
+}
+} // SchedRW
+
+let isBranch = 1, isTerminator = 1, isCodeGenOnly = 1 in {
+ def EH_SjLj_Setup : I<0, Pseudo, (outs), (ins brtarget:$dst),
+ "#EH_SjLj_Setup\t$dst", []>;
+}
+
+//===----------------------------------------------------------------------===//
+// Pseudo instructions used by unwind info.
+//
+let isPseudo = 1 in {
+ def SEH_PushReg : I<0, Pseudo, (outs), (ins i32imm:$reg),
+ "#SEH_PushReg $reg", []>;
+ def SEH_SaveReg : I<0, Pseudo, (outs), (ins i32imm:$reg, i32imm:$dst),
+ "#SEH_SaveReg $reg, $dst", []>;
+ def SEH_SaveXMM : I<0, Pseudo, (outs), (ins i32imm:$reg, i32imm:$dst),
+ "#SEH_SaveXMM $reg, $dst", []>;
+ def SEH_StackAlloc : I<0, Pseudo, (outs), (ins i32imm:$size),
+ "#SEH_StackAlloc $size", []>;
+ def SEH_SetFrame : I<0, Pseudo, (outs), (ins i32imm:$reg, i32imm:$offset),
+ "#SEH_SetFrame $reg, $offset", []>;
+ def SEH_PushFrame : I<0, Pseudo, (outs), (ins i1imm:$mode),
+ "#SEH_PushFrame $mode", []>;
+ def SEH_EndPrologue : I<0, Pseudo, (outs), (ins),
+ "#SEH_EndPrologue", []>;
+ def SEH_Epilogue : I<0, Pseudo, (outs), (ins),
+ "#SEH_Epilogue", []>;
+}
+
+//===----------------------------------------------------------------------===//
+// Pseudo instructions used by segmented stacks.
+//
+
+// This is lowered into a RET instruction by MCInstLower. We need
+// this so that we don't have to have a MachineBasicBlock which ends
+// with a RET and also has successors.
+let isPseudo = 1 in {
+def MORESTACK_RET: I<0, Pseudo, (outs), (ins),
+ "", []>;
+
+// This instruction is lowered to a RET followed by a MOV. The two
+// instructions are not generated on a higher level since then the
+// verifier sees a MachineBasicBlock ending with a non-terminator.
+def MORESTACK_RET_RESTORE_R10 : I<0, Pseudo, (outs), (ins),
+ "", []>;
+}
+
+//===----------------------------------------------------------------------===//
+// Alias Instructions
+//===----------------------------------------------------------------------===//
+
+// Alias instruction mapping movr0 to xor.
+// FIXME: remove when we can teach regalloc that xor reg, reg is ok.
+let Defs = [EFLAGS], isReMaterializable = 1, isAsCheapAsAMove = 1,
+ isPseudo = 1 in
+def MOV32r0 : I<0, Pseudo, (outs GR32:$dst), (ins), "",
+ [(set GR32:$dst, 0)], IIC_ALU_NONMEM>, Sched<[WriteZero]>;
+
+// Other widths can also make use of the 32-bit xor, which may have a smaller
+// encoding and avoid partial register updates.
+def : Pat<(i8 0), (EXTRACT_SUBREG (MOV32r0), sub_8bit)>;
+def : Pat<(i16 0), (EXTRACT_SUBREG (MOV32r0), sub_16bit)>;
+def : Pat<(i64 0), (SUBREG_TO_REG (i64 0), (MOV32r0), sub_32bit)> {
+ let AddedComplexity = 20;
+}
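+
+// For scale: "xorl %eax, %eax" is a 2-byte encoding, while "movl $0, %eax"
+// takes 5 bytes, and the xor also breaks any dependence on the previous
+// value of the register. MOV32r0 exists so this expansion can be deferred
+// until EFLAGS liveness is known (the xor clobbers EFLAGS).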
+
+let Predicates = [OptForSize, NotSlowIncDec, Not64BitMode],
+ AddedComplexity = 1 in {
+ // Pseudo instructions for materializing 1 and -1 using XOR+INC/DEC,
+ // which only require 3 bytes compared to MOV32ri which requires 5.
+ let Defs = [EFLAGS], isReMaterializable = 1, isPseudo = 1 in {
+ def MOV32r1 : I<0, Pseudo, (outs GR32:$dst), (ins), "",
+ [(set GR32:$dst, 1)]>;
+ def MOV32r_1 : I<0, Pseudo, (outs GR32:$dst), (ins), "",
+ [(set GR32:$dst, -1)]>;
+ }
+
+ // MOV16ri is 4 bytes, so the instructions above are smaller.
+ def : Pat<(i16 1), (EXTRACT_SUBREG (MOV32r1), sub_16bit)>;
+ def : Pat<(i16 -1), (EXTRACT_SUBREG (MOV32r_1), sub_16bit)>;
+}
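+
+// That is, in 32-bit mode "xorl %eax, %eax; incl %eax" is 2+1 bytes, versus
+// 5 bytes for "movl $1, %eax" (and 4 for MOV16ri, per the note above).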
+
+// Materialize an i64 constant whose top 32 bits are zero. This could
+// theoretically use MOV32ri with a SUBREG_TO_REG to represent the
+// zero-extension; however, that would make it more difficult to
+// rematerialize.
+let isReMaterializable = 1, isAsCheapAsAMove = 1,
+ isPseudo = 1, hasSideEffects = 0 in
+def MOV32ri64 : I<0, Pseudo, (outs GR32:$dst), (ins i64i32imm:$src), "", []>;
+
+// This 64-bit pseudo-move can be used both for a 64-bit constant that is
+// actually the zero-extension of a 32-bit constant and for labels in the
+// x86-64 small code model.
+def mov64imm32 : ComplexPattern<i64, 1, "selectMOV64Imm32", [imm, X86Wrapper]>;
+
+let AddedComplexity = 1 in
+def : Pat<(i64 mov64imm32:$src),
+ (SUBREG_TO_REG (i64 0), (MOV32ri64 mov64imm32:$src), sub_32bit)>;
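+
+// This works because every 32-bit mov implicitly zeros bits 63:32, so
+// "movl $imm, %eax" (5 bytes) produces the same 64-bit value as
+// "movabsq $imm, %rax" (10 bytes) whenever the constant zero-extends from
+// 32 bits.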
+
+// Use sbb to materialize carry bit.
+let Uses = [EFLAGS], Defs = [EFLAGS], isPseudo = 1, SchedRW = [WriteALU] in {
+// FIXME: These are pseudo ops that should be replaced with Pat<> patterns.
+// However, Pat<> can't replicate the destination reg into the inputs of the
+// result.
+def SETB_C8r : I<0, Pseudo, (outs GR8:$dst), (ins), "",
+ [(set GR8:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>;
+def SETB_C16r : I<0, Pseudo, (outs GR16:$dst), (ins), "",
+ [(set GR16:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>;
+def SETB_C32r : I<0, Pseudo, (outs GR32:$dst), (ins), "",
+ [(set GR32:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>;
+def SETB_C64r : I<0, Pseudo, (outs GR64:$dst), (ins), "",
+ [(set GR64:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>;
+} // Uses = [EFLAGS], Defs = [EFLAGS], isPseudo = 1, SchedRW = [WriteALU]
+
+
+def : Pat<(i16 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
+ (SETB_C16r)>;
+def : Pat<(i32 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
+ (SETB_C32r)>;
+def : Pat<(i64 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
+ (SETB_C64r)>;
+
+def : Pat<(i16 (sext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
+ (SETB_C16r)>;
+def : Pat<(i32 (sext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
+ (SETB_C32r)>;
+def : Pat<(i64 (sext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
+ (SETB_C64r)>;
+
+// We canonicalize 'setb' to "(and (sbb reg,reg), 1)" in the hope that the and
+// will be eliminated and that the sbb can be extended up to a wider type. When
+// this happens, it is great. However, if we are left with an 8-bit sbb and an
+// and, we might as well just match it as a setb.
+def : Pat<(and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1),
+ (SETBr)>;
+
+// (add OP, SETB) -> (adc OP, 0)
+def : Pat<(add (and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1), GR8:$op),
+ (ADC8ri GR8:$op, 0)>;
+def : Pat<(add (and (i32 (X86setcc_c X86_COND_B, EFLAGS)), 1), GR32:$op),
+ (ADC32ri8 GR32:$op, 0)>;
+def : Pat<(add (and (i64 (X86setcc_c X86_COND_B, EFLAGS)), 1), GR64:$op),
+ (ADC64ri8 GR64:$op, 0)>;
+
+// (sub OP, SETB) -> (sbb OP, 0)
+def : Pat<(sub GR8:$op, (and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1)),
+ (SBB8ri GR8:$op, 0)>;
+def : Pat<(sub GR32:$op, (and (i32 (X86setcc_c X86_COND_B, EFLAGS)), 1)),
+ (SBB32ri8 GR32:$op, 0)>;
+def : Pat<(sub GR64:$op, (and (i64 (X86setcc_c X86_COND_B, EFLAGS)), 1)),
+ (SBB64ri8 GR64:$op, 0)>;
+
+// (sub OP, SETCC_CARRY) -> (adc OP, 0)
+def : Pat<(sub GR8:$op, (i8 (X86setcc_c X86_COND_B, EFLAGS))),
+ (ADC8ri GR8:$op, 0)>;
+def : Pat<(sub GR32:$op, (i32 (X86setcc_c X86_COND_B, EFLAGS))),
+ (ADC32ri8 GR32:$op, 0)>;
+def : Pat<(sub GR64:$op, (i64 (X86setcc_c X86_COND_B, EFLAGS))),
+ (ADC64ri8 GR64:$op, 0)>;
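+
+// As a concrete example of the patterns above: after a compare sets CF,
+// "sbbl %eax, %eax" computes %eax - %eax - CF, materializing 0 or -1, and
+// "(add x, (setb))" becomes "adcl $0, x" -- the usual branch-free selection
+// of e.g. "x + (a < b)".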
+
+//===----------------------------------------------------------------------===//
+// String Pseudo Instructions
+//
+let SchedRW = [WriteMicrocoded] in {
+let Defs = [ECX,EDI,ESI], Uses = [ECX,EDI,ESI], isCodeGenOnly = 1 in {
+def REP_MOVSB_32 : I<0xA4, RawFrm, (outs), (ins), "{rep;movsb|rep movsb}",
+ [(X86rep_movs i8)], IIC_REP_MOVS>, REP,
+ Requires<[Not64BitMode]>;
+def REP_MOVSW_32 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsw|rep movsw}",
+ [(X86rep_movs i16)], IIC_REP_MOVS>, REP, OpSize16,
+ Requires<[Not64BitMode]>;
+def REP_MOVSD_32 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsl|rep movsd}",
+ [(X86rep_movs i32)], IIC_REP_MOVS>, REP, OpSize32,
+ Requires<[Not64BitMode]>;
+}
+
+let Defs = [RCX,RDI,RSI], Uses = [RCX,RDI,RSI], isCodeGenOnly = 1 in {
+def REP_MOVSB_64 : I<0xA4, RawFrm, (outs), (ins), "{rep;movsb|rep movsb}",
+ [(X86rep_movs i8)], IIC_REP_MOVS>, REP,
+ Requires<[In64BitMode]>;
+def REP_MOVSW_64 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsw|rep movsw}",
+ [(X86rep_movs i16)], IIC_REP_MOVS>, REP, OpSize16,
+ Requires<[In64BitMode]>;
+def REP_MOVSD_64 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsl|rep movsd}",
+ [(X86rep_movs i32)], IIC_REP_MOVS>, REP, OpSize32,
+ Requires<[In64BitMode]>;
+def REP_MOVSQ_64 : RI<0xA5, RawFrm, (outs), (ins), "{rep;movsq|rep movsq}",
+ [(X86rep_movs i64)], IIC_REP_MOVS>, REP,
+ Requires<[In64BitMode]>;
+}
+
+// FIXME: Should use "(X86rep_stos AL)" as the pattern.
+let Defs = [ECX,EDI], isCodeGenOnly = 1 in {
+ let Uses = [AL,ECX,EDI] in
+ def REP_STOSB_32 : I<0xAA, RawFrm, (outs), (ins), "{rep;stosb|rep stosb}",
+ [(X86rep_stos i8)], IIC_REP_STOS>, REP,
+ Requires<[Not64BitMode]>;
+ let Uses = [AX,ECX,EDI] in
+ def REP_STOSW_32 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosw|rep stosw}",
+ [(X86rep_stos i16)], IIC_REP_STOS>, REP, OpSize16,
+ Requires<[Not64BitMode]>;
+ let Uses = [EAX,ECX,EDI] in
+ def REP_STOSD_32 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosl|rep stosd}",
+ [(X86rep_stos i32)], IIC_REP_STOS>, REP, OpSize32,
+ Requires<[Not64BitMode]>;
+}
+
+let Defs = [RCX,RDI], isCodeGenOnly = 1 in {
+ let Uses = [AL,RCX,RDI] in
+ def REP_STOSB_64 : I<0xAA, RawFrm, (outs), (ins), "{rep;stosb|rep stosb}",
+ [(X86rep_stos i8)], IIC_REP_STOS>, REP,
+ Requires<[In64BitMode]>;
+ let Uses = [AX,RCX,RDI] in
+ def REP_STOSW_64 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosw|rep stosw}",
+ [(X86rep_stos i16)], IIC_REP_STOS>, REP, OpSize16,
+ Requires<[In64BitMode]>;
+ let Uses = [RAX,RCX,RDI] in
+ def REP_STOSD_64 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosl|rep stosd}",
+ [(X86rep_stos i32)], IIC_REP_STOS>, REP, OpSize32,
+ Requires<[In64BitMode]>;
+
+ let Uses = [RAX,RCX,RDI] in
+ def REP_STOSQ_64 : RI<0xAB, RawFrm, (outs), (ins), "{rep;stosq|rep stosq}",
+ [(X86rep_stos i64)], IIC_REP_STOS>, REP,
+ Requires<[In64BitMode]>;
+}
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Thread Local Storage Instructions
+//
+
+// ELF TLS Support
+// All calls clobber the non-callee saved registers. ESP is marked as
+// a use to prevent stack-pointer assignments that appear immediately
+// before calls from potentially appearing dead.
+let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, FP7,
+ ST0, ST1, ST2, ST3, ST4, ST5, ST6, ST7,
+ MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
+ XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
+ XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS],
+ Uses = [ESP] in {
+def TLS_addr32 : I<0, Pseudo, (outs), (ins i32mem:$sym),
+ "# TLS_addr32",
+ [(X86tlsaddr tls32addr:$sym)]>,
+ Requires<[Not64BitMode]>;
+def TLS_base_addr32 : I<0, Pseudo, (outs), (ins i32mem:$sym),
+ "# TLS_base_addr32",
+ [(X86tlsbaseaddr tls32baseaddr:$sym)]>,
+ Requires<[Not64BitMode]>;
+}
+
+// All calls clobber the non-callee saved registers. RSP is marked as
+// a use to prevent stack-pointer assignments that appear immediately
+// before calls from potentially appearing dead.
+let Defs = [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11,
+ FP0, FP1, FP2, FP3, FP4, FP5, FP6, FP7,
+ ST0, ST1, ST2, ST3, ST4, ST5, ST6, ST7,
+ MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
+ XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
+ XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS],
+ Uses = [RSP] in {
+def TLS_addr64 : I<0, Pseudo, (outs), (ins i64mem:$sym),
+ "# TLS_addr64",
+ [(X86tlsaddr tls64addr:$sym)]>,
+ Requires<[In64BitMode]>;
+def TLS_base_addr64 : I<0, Pseudo, (outs), (ins i64mem:$sym),
+ "# TLS_base_addr64",
+ [(X86tlsbaseaddr tls64baseaddr:$sym)]>,
+ Requires<[In64BitMode]>;
+}
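+
+// For reference, TLS_addr64 is lowered (in X86MCInstLower) to roughly the
+// canonical general-dynamic sequence the linker expects:
+//     data16 leaq x@tlsgd(%rip), %rdi
+//     data16 data16 rex64 callq __tls_get_addr@PLT
+// The padding prefixes keep the sequence at a fixed size so the linker can
+// rewrite it in place for the initial-exec or local-exec models.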
+
+// Darwin TLS Support
+// For i386, the address of the thunk is passed on the stack, on return the
+// address of the variable is in %eax. %ecx is trashed during the function
+// call. All other registers are preserved.
+let Defs = [EAX, ECX, EFLAGS],
+ Uses = [ESP],
+ usesCustomInserter = 1 in
+def TLSCall_32 : I<0, Pseudo, (outs), (ins i32mem:$sym),
+ "# TLSCall_32",
+ [(X86TLSCall addr:$sym)]>,
+ Requires<[Not64BitMode]>;
+
+// For x86_64, the address of the thunk is passed in %rdi, on return
+// the address of the variable is in %rax. All other registers are preserved.
+let Defs = [RAX, EFLAGS],
+ Uses = [RSP, RDI],
+ usesCustomInserter = 1 in
+def TLSCall_64 : I<0, Pseudo, (outs), (ins i64mem:$sym),
+ "# TLSCall_64",
+ [(X86TLSCall addr:$sym)]>,
+ Requires<[In64BitMode]>;
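+
+// A sketch of the resulting Mach-O TLV access sequence on x86-64:
+//     movq  _var@TLVP(%rip), %rdi
+//     callq *(%rdi)          // thunk returns the variable address in %rax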
+
+
+//===----------------------------------------------------------------------===//
+// Conditional Move Pseudo Instructions
+
+// CMOV* - Used to implement the SELECT DAG operation. Expanded after
+// instruction selection into a branch sequence.
+multiclass CMOVrr_PSEUDO<RegisterClass RC, ValueType VT> {
+ def CMOV#NAME : I<0, Pseudo,
+ (outs RC:$dst), (ins RC:$t, RC:$f, i8imm:$cond),
+ "#CMOV_"#NAME#" PSEUDO!",
+ [(set RC:$dst, (VT (X86cmov RC:$t, RC:$f, imm:$cond,
+ EFLAGS)))]>;
+}
+
+let usesCustomInserter = 1, Uses = [EFLAGS] in {
+  // X86 doesn't have 8-bit conditional moves. Use a custom inserter to
+  // emit control flow. An alternative is to mark i8 SELECT as Promote;
+  // however, that requires promoting the operands and can induce additional
+  // i8 register pressure.
+ defm _GR8 : CMOVrr_PSEUDO<GR8, i8>;
+
+ let Predicates = [NoCMov] in {
+ defm _GR32 : CMOVrr_PSEUDO<GR32, i32>;
+ defm _GR16 : CMOVrr_PSEUDO<GR16, i16>;
+ } // Predicates = [NoCMov]
+
+  // fcmov doesn't handle all possible EFLAGS, so provide a fallback if there
+  // is no SSE1/SSE2.
+ let Predicates = [FPStackf32] in
+ defm _RFP32 : CMOVrr_PSEUDO<RFP32, f32>;
+
+ let Predicates = [FPStackf64] in
+ defm _RFP64 : CMOVrr_PSEUDO<RFP64, f64>;
+
+ defm _RFP80 : CMOVrr_PSEUDO<RFP80, f80>;
+
+ defm _FR32 : CMOVrr_PSEUDO<FR32, f32>;
+ defm _FR64 : CMOVrr_PSEUDO<FR64, f64>;
+ defm _FR128 : CMOVrr_PSEUDO<FR128, f128>;
+ defm _V4F32 : CMOVrr_PSEUDO<VR128, v4f32>;
+ defm _V2F64 : CMOVrr_PSEUDO<VR128, v2f64>;
+ defm _V2I64 : CMOVrr_PSEUDO<VR128, v2i64>;
+ defm _V8F32 : CMOVrr_PSEUDO<VR256, v8f32>;
+ defm _V4F64 : CMOVrr_PSEUDO<VR256, v4f64>;
+ defm _V4I64 : CMOVrr_PSEUDO<VR256, v4i64>;
+ defm _V8I64 : CMOVrr_PSEUDO<VR512, v8i64>;
+ defm _V8F64 : CMOVrr_PSEUDO<VR512, v8f64>;
+ defm _V16F32 : CMOVrr_PSEUDO<VR512, v16f32>;
+ defm _V8I1 : CMOVrr_PSEUDO<VK8, v8i1>;
+ defm _V16I1 : CMOVrr_PSEUDO<VK16, v16i1>;
+ defm _V32I1 : CMOVrr_PSEUDO<VK32, v32i1>;
+ defm _V64I1 : CMOVrr_PSEUDO<VK64, v64i1>;
+} // usesCustomInserter = 1, Uses = [EFLAGS]
+
+//===----------------------------------------------------------------------===//
+// Normal-Instructions-With-Lock-Prefix Pseudo Instructions
+//===----------------------------------------------------------------------===//
+
+// FIXME: Use normal instructions and add lock prefix dynamically.
+
+// Memory barriers
+
+// TODO: Get this to fold the constant into the instruction.
+let isCodeGenOnly = 1, Defs = [EFLAGS] in
+def OR32mrLocked : I<0x09, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$zero),
+ "or{l}\t{$zero, $dst|$dst, $zero}", [],
+ IIC_ALU_MEM>, Requires<[Not64BitMode]>, OpSize32, LOCK,
+ Sched<[WriteALULd, WriteRMW]>;
+
+let hasSideEffects = 1 in
+def Int_MemBarrier : I<0, Pseudo, (outs), (ins),
+ "#MEMBARRIER",
+ [(X86MemBarrier)]>, Sched<[WriteLoad]>;
+
+// RegOpc corresponds to the mr version of the instruction
+// ImmOpc corresponds to the mi version of the instruction
+// ImmOpc8 corresponds to the mi8 version of the instruction
+// ImmMod corresponds to the instruction format of the mi and mi8 versions
+multiclass LOCK_ArithBinOp<bits<8> RegOpc, bits<8> ImmOpc, bits<8> ImmOpc8,
+ Format ImmMod, string mnemonic> {
+let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1,
+ SchedRW = [WriteALULd, WriteRMW] in {
+
+def NAME#8mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
+ RegOpc{3}, RegOpc{2}, RegOpc{1}, 0 },
+ MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src2),
+ !strconcat(mnemonic, "{b}\t",
+ "{$src2, $dst|$dst, $src2}"),
+ [], IIC_ALU_NONMEM>, LOCK;
+def NAME#16mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
+ RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 },
+ MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
+ !strconcat(mnemonic, "{w}\t",
+ "{$src2, $dst|$dst, $src2}"),
+ [], IIC_ALU_NONMEM>, OpSize16, LOCK;
+def NAME#32mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
+ RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 },
+ MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
+ !strconcat(mnemonic, "{l}\t",
+ "{$src2, $dst|$dst, $src2}"),
+ [], IIC_ALU_NONMEM>, OpSize32, LOCK;
+def NAME#64mr : RI<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
+ RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 },
+ MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
+ !strconcat(mnemonic, "{q}\t",
+ "{$src2, $dst|$dst, $src2}"),
+ [], IIC_ALU_NONMEM>, LOCK;
+
+def NAME#8mi : Ii8<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
+ ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 0 },
+ ImmMod, (outs), (ins i8mem :$dst, i8imm :$src2),
+ !strconcat(mnemonic, "{b}\t",
+ "{$src2, $dst|$dst, $src2}"),
+ [], IIC_ALU_MEM>, LOCK;
+
+def NAME#16mi : Ii16<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
+ ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 },
+ ImmMod, (outs), (ins i16mem :$dst, i16imm :$src2),
+ !strconcat(mnemonic, "{w}\t",
+ "{$src2, $dst|$dst, $src2}"),
+ [], IIC_ALU_MEM>, OpSize16, LOCK;
+
+def NAME#32mi : Ii32<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
+ ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 },
+ ImmMod, (outs), (ins i32mem :$dst, i32imm :$src2),
+ !strconcat(mnemonic, "{l}\t",
+ "{$src2, $dst|$dst, $src2}"),
+ [], IIC_ALU_MEM>, OpSize32, LOCK;
+
+def NAME#64mi32 : RIi32S<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
+ ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 },
+ ImmMod, (outs), (ins i64mem :$dst, i64i32imm :$src2),
+ !strconcat(mnemonic, "{q}\t",
+ "{$src2, $dst|$dst, $src2}"),
+ [], IIC_ALU_MEM>, LOCK;
+
+def NAME#16mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
+ ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
+ ImmMod, (outs), (ins i16mem :$dst, i16i8imm :$src2),
+ !strconcat(mnemonic, "{w}\t",
+ "{$src2, $dst|$dst, $src2}"),
+ [], IIC_ALU_MEM>, OpSize16, LOCK;
+def NAME#32mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
+ ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
+ ImmMod, (outs), (ins i32mem :$dst, i32i8imm :$src2),
+ !strconcat(mnemonic, "{l}\t",
+ "{$src2, $dst|$dst, $src2}"),
+ [], IIC_ALU_MEM>, OpSize32, LOCK;
+def NAME#64mi8 : RIi8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
+ ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
+ ImmMod, (outs), (ins i64mem :$dst, i64i8imm :$src2),
+ !strconcat(mnemonic, "{q}\t",
+ "{$src2, $dst|$dst, $src2}"),
+ [], IIC_ALU_MEM>, LOCK;
+
+}
+
+}
+
+defm LOCK_ADD : LOCK_ArithBinOp<0x00, 0x80, 0x83, MRM0m, "add">;
+defm LOCK_SUB : LOCK_ArithBinOp<0x28, 0x80, 0x83, MRM5m, "sub">;
+defm LOCK_OR : LOCK_ArithBinOp<0x08, 0x80, 0x83, MRM1m, "or">;
+defm LOCK_AND : LOCK_ArithBinOp<0x20, 0x80, 0x83, MRM4m, "and">;
+defm LOCK_XOR : LOCK_ArithBinOp<0x30, 0x80, 0x83, MRM6m, "xor">;
+
+// Optimized codegen when the non-memory output is not used.
+multiclass LOCK_ArithUnOp<bits<8> Opc8, bits<8> Opc, Format Form,
+ string mnemonic> {
+let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1,
+ SchedRW = [WriteALULd, WriteRMW] in {
+
+def NAME#8m : I<Opc8, Form, (outs), (ins i8mem :$dst),
+ !strconcat(mnemonic, "{b}\t$dst"),
+ [], IIC_UNARY_MEM>, LOCK;
+def NAME#16m : I<Opc, Form, (outs), (ins i16mem:$dst),
+ !strconcat(mnemonic, "{w}\t$dst"),
+ [], IIC_UNARY_MEM>, OpSize16, LOCK;
+def NAME#32m : I<Opc, Form, (outs), (ins i32mem:$dst),
+ !strconcat(mnemonic, "{l}\t$dst"),
+ [], IIC_UNARY_MEM>, OpSize32, LOCK;
+def NAME#64m : RI<Opc, Form, (outs), (ins i64mem:$dst),
+ !strconcat(mnemonic, "{q}\t$dst"),
+ [], IIC_UNARY_MEM>, LOCK;
+}
+}
+
+defm LOCK_INC : LOCK_ArithUnOp<0xFE, 0xFF, MRM0m, "inc">;
+defm LOCK_DEC : LOCK_ArithUnOp<0xFE, 0xFF, MRM1m, "dec">;
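+
+// These let e.g. an atomic increment whose result is unused be emitted as a
+// single "lock incl (mem)" (or "lock addl $n, (mem)") instead of a full
+// "lock xadd" with a dead register result.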
+
+// Atomic compare and swap.
+multiclass LCMPXCHG_UnOp<bits<8> Opc, Format Form, string mnemonic,
+ SDPatternOperator frag, X86MemOperand x86memop,
+ InstrItinClass itin> {
+let isCodeGenOnly = 1 in {
+ def NAME : I<Opc, Form, (outs), (ins x86memop:$ptr),
+ !strconcat(mnemonic, "\t$ptr"),
+ [(frag addr:$ptr)], itin>, TB, LOCK;
+}
+}
+
+multiclass LCMPXCHG_BinOp<bits<8> Opc8, bits<8> Opc, Format Form,
+ string mnemonic, SDPatternOperator frag,
+ InstrItinClass itin8, InstrItinClass itin> {
+let isCodeGenOnly = 1, SchedRW = [WriteALULd, WriteRMW] in {
+ let Defs = [AL, EFLAGS], Uses = [AL] in
+ def NAME#8 : I<Opc8, Form, (outs), (ins i8mem:$ptr, GR8:$swap),
+ !strconcat(mnemonic, "{b}\t{$swap, $ptr|$ptr, $swap}"),
+ [(frag addr:$ptr, GR8:$swap, 1)], itin8>, TB, LOCK;
+ let Defs = [AX, EFLAGS], Uses = [AX] in
+ def NAME#16 : I<Opc, Form, (outs), (ins i16mem:$ptr, GR16:$swap),
+ !strconcat(mnemonic, "{w}\t{$swap, $ptr|$ptr, $swap}"),
+ [(frag addr:$ptr, GR16:$swap, 2)], itin>, TB, OpSize16, LOCK;
+ let Defs = [EAX, EFLAGS], Uses = [EAX] in
+ def NAME#32 : I<Opc, Form, (outs), (ins i32mem:$ptr, GR32:$swap),
+ !strconcat(mnemonic, "{l}\t{$swap, $ptr|$ptr, $swap}"),
+ [(frag addr:$ptr, GR32:$swap, 4)], itin>, TB, OpSize32, LOCK;
+ let Defs = [RAX, EFLAGS], Uses = [RAX] in
+ def NAME#64 : RI<Opc, Form, (outs), (ins i64mem:$ptr, GR64:$swap),
+ !strconcat(mnemonic, "{q}\t{$swap, $ptr|$ptr, $swap}"),
+ [(frag addr:$ptr, GR64:$swap, 8)], itin>, TB, LOCK;
+}
+}
+
+let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX],
+ SchedRW = [WriteALULd, WriteRMW] in {
+defm LCMPXCHG8B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg8b",
+ X86cas8, i64mem,
+ IIC_CMPX_LOCK_8B>;
+}
+
+let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX],
+ Predicates = [HasCmpxchg16b], SchedRW = [WriteALULd, WriteRMW] in {
+defm LCMPXCHG16B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg16b",
+ X86cas16, i128mem,
+ IIC_CMPX_LOCK_16B>, REX_W;
+}
+
+defm LCMPXCHG : LCMPXCHG_BinOp<0xB0, 0xB1, MRMDestMem, "cmpxchg",
+ X86cas, IIC_CMPX_LOCK_8, IIC_CMPX_LOCK>;
+
+// Atomic exchange and add
+multiclass ATOMIC_LOAD_BINOP<bits<8> opc8, bits<8> opc, string mnemonic,
+ string frag,
+ InstrItinClass itin8, InstrItinClass itin> {
+ let Constraints = "$val = $dst", Defs = [EFLAGS], isCodeGenOnly = 1,
+ SchedRW = [WriteALULd, WriteRMW] in {
+ def NAME#8 : I<opc8, MRMSrcMem, (outs GR8:$dst),
+ (ins GR8:$val, i8mem:$ptr),
+ !strconcat(mnemonic, "{b}\t{$val, $ptr|$ptr, $val}"),
+ [(set GR8:$dst,
+ (!cast<PatFrag>(frag # "_8") addr:$ptr, GR8:$val))],
+ itin8>;
+ def NAME#16 : I<opc, MRMSrcMem, (outs GR16:$dst),
+ (ins GR16:$val, i16mem:$ptr),
+ !strconcat(mnemonic, "{w}\t{$val, $ptr|$ptr, $val}"),
+ [(set
+ GR16:$dst,
+ (!cast<PatFrag>(frag # "_16") addr:$ptr, GR16:$val))],
+ itin>, OpSize16;
+ def NAME#32 : I<opc, MRMSrcMem, (outs GR32:$dst),
+ (ins GR32:$val, i32mem:$ptr),
+ !strconcat(mnemonic, "{l}\t{$val, $ptr|$ptr, $val}"),
+ [(set
+ GR32:$dst,
+ (!cast<PatFrag>(frag # "_32") addr:$ptr, GR32:$val))],
+ itin>, OpSize32;
+ def NAME#64 : RI<opc, MRMSrcMem, (outs GR64:$dst),
+ (ins GR64:$val, i64mem:$ptr),
+ !strconcat(mnemonic, "{q}\t{$val, $ptr|$ptr, $val}"),
+ [(set
+ GR64:$dst,
+ (!cast<PatFrag>(frag # "_64") addr:$ptr, GR64:$val))],
+ itin>;
+ }
+}
+
+defm LXADD : ATOMIC_LOAD_BINOP<0xc0, 0xc1, "xadd", "atomic_load_add",
+ IIC_XADD_LOCK_MEM8, IIC_XADD_LOCK_MEM>,
+ TB, LOCK;
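+
+// LXADD is what an atomic load-add whose result *is* used selects to, e.g.:
+//     movl $1, %eax
+//     lock xaddl %eax, (%rdi)   // %eax receives the previous value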
+
+/* The following multiclass tries to make sure that in code like
+ *    x.store (immediate op x.load(acquire), release)
+ * and
+ *    x.store (register op x.load(acquire), release)
+ * an operation directly on memory is generated instead of wasting a register.
+ * It is not automatic, as atomic_store/load are only lowered to MOV
+ * instructions extremely late, to prevent them from being accidentally
+ * reordered in the backend (see the RELEASE_MOV* / ACQUIRE_MOV*
+ * pseudo-instructions below).
+ */
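+// For example, on x86 the release store and acquire load are plain MOVs, so
+//     x.store(x.load(std::memory_order_acquire) + 1,
+//             std::memory_order_release);
+// can be selected through RELEASE_ADD32mi into a single "addl $1, (mem)"
+// rather than a load, a register add, and a store.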
+multiclass RELEASE_BINOP_MI<SDNode op> {
+ def NAME#8mi : I<0, Pseudo, (outs), (ins i8mem:$dst, i8imm:$src),
+ "#BINOP "#NAME#"8mi PSEUDO!",
+ [(atomic_store_8 addr:$dst, (op
+ (atomic_load_8 addr:$dst), (i8 imm:$src)))]>;
+ def NAME#8mr : I<0, Pseudo, (outs), (ins i8mem:$dst, GR8:$src),
+ "#BINOP "#NAME#"8mr PSEUDO!",
+ [(atomic_store_8 addr:$dst, (op
+ (atomic_load_8 addr:$dst), GR8:$src))]>;
+  // NAME#16 is not generated, as 16-bit arithmetic instructions are
+  // considered costly and are avoided as far as possible by this backend
+  // anyway.
+ def NAME#32mi : I<0, Pseudo, (outs), (ins i32mem:$dst, i32imm:$src),
+ "#BINOP "#NAME#"32mi PSEUDO!",
+ [(atomic_store_32 addr:$dst, (op
+ (atomic_load_32 addr:$dst), (i32 imm:$src)))]>;
+ def NAME#32mr : I<0, Pseudo, (outs), (ins i32mem:$dst, GR32:$src),
+ "#BINOP "#NAME#"32mr PSEUDO!",
+ [(atomic_store_32 addr:$dst, (op
+ (atomic_load_32 addr:$dst), GR32:$src))]>;
+ def NAME#64mi32 : I<0, Pseudo, (outs), (ins i64mem:$dst, i64i32imm:$src),
+ "#BINOP "#NAME#"64mi32 PSEUDO!",
+ [(atomic_store_64 addr:$dst, (op
+ (atomic_load_64 addr:$dst), (i64immSExt32:$src)))]>;
+ def NAME#64mr : I<0, Pseudo, (outs), (ins i64mem:$dst, GR64:$src),
+ "#BINOP "#NAME#"64mr PSEUDO!",
+ [(atomic_store_64 addr:$dst, (op
+ (atomic_load_64 addr:$dst), GR64:$src))]>;
+}
+let Defs = [EFLAGS] in {
+ defm RELEASE_ADD : RELEASE_BINOP_MI<add>;
+ defm RELEASE_AND : RELEASE_BINOP_MI<and>;
+ defm RELEASE_OR : RELEASE_BINOP_MI<or>;
+ defm RELEASE_XOR : RELEASE_BINOP_MI<xor>;
+  // Note: we don't deal with sub, because subtractions of constants are
+  // optimized into additions before this code can run.
+}
+
+// Same as above, but for floating-point.
+// FIXME: imm version.
+// FIXME: Version that doesn't clobber $src, using AVX's VADDSS.
+// FIXME: This could also handle SIMD operations with *ps and *pd instructions.
+let usesCustomInserter = 1 in {
+multiclass RELEASE_FP_BINOP_MI<SDNode op> {
+ def NAME#32mr : I<0, Pseudo, (outs), (ins i32mem:$dst, FR32:$src),
+ "#BINOP "#NAME#"32mr PSEUDO!",
+ [(atomic_store_32 addr:$dst,
+ (i32 (bitconvert (op
+ (f32 (bitconvert (i32 (atomic_load_32 addr:$dst)))),
+ FR32:$src))))]>, Requires<[HasSSE1]>;
+ def NAME#64mr : I<0, Pseudo, (outs), (ins i64mem:$dst, FR64:$src),
+ "#BINOP "#NAME#"64mr PSEUDO!",
+ [(atomic_store_64 addr:$dst,
+ (i64 (bitconvert (op
+ (f64 (bitconvert (i64 (atomic_load_64 addr:$dst)))),
+ FR64:$src))))]>, Requires<[HasSSE2]>;
+}
+defm RELEASE_FADD : RELEASE_FP_BINOP_MI<fadd>;
+// FIXME: Add fsub, fmul, fdiv, ...
+}
+
+multiclass RELEASE_UNOP<dag dag8, dag dag16, dag dag32, dag dag64> {
+ def NAME#8m : I<0, Pseudo, (outs), (ins i8mem:$dst),
+ "#UNOP "#NAME#"8m PSEUDO!",
+ [(atomic_store_8 addr:$dst, dag8)]>;
+ def NAME#16m : I<0, Pseudo, (outs), (ins i16mem:$dst),
+ "#UNOP "#NAME#"16m PSEUDO!",
+ [(atomic_store_16 addr:$dst, dag16)]>;
+ def NAME#32m : I<0, Pseudo, (outs), (ins i32mem:$dst),
+ "#UNOP "#NAME#"32m PSEUDO!",
+ [(atomic_store_32 addr:$dst, dag32)]>;
+ def NAME#64m : I<0, Pseudo, (outs), (ins i64mem:$dst),
+ "#UNOP "#NAME#"64m PSEUDO!",
+ [(atomic_store_64 addr:$dst, dag64)]>;
+}
+
+let Defs = [EFLAGS] in {
+ defm RELEASE_INC : RELEASE_UNOP<
+ (add (atomic_load_8 addr:$dst), (i8 1)),
+ (add (atomic_load_16 addr:$dst), (i16 1)),
+ (add (atomic_load_32 addr:$dst), (i32 1)),
+ (add (atomic_load_64 addr:$dst), (i64 1))>, Requires<[NotSlowIncDec]>;
+ defm RELEASE_DEC : RELEASE_UNOP<
+ (add (atomic_load_8 addr:$dst), (i8 -1)),
+ (add (atomic_load_16 addr:$dst), (i16 -1)),
+ (add (atomic_load_32 addr:$dst), (i32 -1)),
+ (add (atomic_load_64 addr:$dst), (i64 -1))>, Requires<[NotSlowIncDec]>;
+}
+/*
+TODO: These don't work because the type inference of TableGen fails.
+TODO: find a way to fix it.
+let Defs = [EFLAGS] in {
+ defm RELEASE_NEG : RELEASE_UNOP<
+ (ineg (atomic_load_8 addr:$dst)),
+ (ineg (atomic_load_16 addr:$dst)),
+ (ineg (atomic_load_32 addr:$dst)),
+ (ineg (atomic_load_64 addr:$dst))>;
+}
+// NOT doesn't set flags.
+defm RELEASE_NOT : RELEASE_UNOP<
+ (not (atomic_load_8 addr:$dst)),
+ (not (atomic_load_16 addr:$dst)),
+ (not (atomic_load_32 addr:$dst)),
+ (not (atomic_load_64 addr:$dst))>;
+*/
+
+def RELEASE_MOV8mi : I<0, Pseudo, (outs), (ins i8mem:$dst, i8imm:$src),
+ "#RELEASE_MOV8mi PSEUDO!",
+ [(atomic_store_8 addr:$dst, (i8 imm:$src))]>;
+def RELEASE_MOV16mi : I<0, Pseudo, (outs), (ins i16mem:$dst, i16imm:$src),
+ "#RELEASE_MOV16mi PSEUDO!",
+ [(atomic_store_16 addr:$dst, (i16 imm:$src))]>;
+def RELEASE_MOV32mi : I<0, Pseudo, (outs), (ins i32mem:$dst, i32imm:$src),
+ "#RELEASE_MOV32mi PSEUDO!",
+ [(atomic_store_32 addr:$dst, (i32 imm:$src))]>;
+def RELEASE_MOV64mi32 : I<0, Pseudo, (outs), (ins i64mem:$dst, i64i32imm:$src),
+ "#RELEASE_MOV64mi32 PSEUDO!",
+ [(atomic_store_64 addr:$dst, i64immSExt32:$src)]>;
+
+def RELEASE_MOV8mr : I<0, Pseudo, (outs), (ins i8mem :$dst, GR8 :$src),
+ "#RELEASE_MOV8mr PSEUDO!",
+ [(atomic_store_8 addr:$dst, GR8 :$src)]>;
+def RELEASE_MOV16mr : I<0, Pseudo, (outs), (ins i16mem:$dst, GR16:$src),
+ "#RELEASE_MOV16mr PSEUDO!",
+ [(atomic_store_16 addr:$dst, GR16:$src)]>;
+def RELEASE_MOV32mr : I<0, Pseudo, (outs), (ins i32mem:$dst, GR32:$src),
+ "#RELEASE_MOV32mr PSEUDO!",
+ [(atomic_store_32 addr:$dst, GR32:$src)]>;
+def RELEASE_MOV64mr : I<0, Pseudo, (outs), (ins i64mem:$dst, GR64:$src),
+ "#RELEASE_MOV64mr PSEUDO!",
+ [(atomic_store_64 addr:$dst, GR64:$src)]>;
+
+def ACQUIRE_MOV8rm : I<0, Pseudo, (outs GR8 :$dst), (ins i8mem :$src),
+ "#ACQUIRE_MOV8rm PSEUDO!",
+ [(set GR8:$dst, (atomic_load_8 addr:$src))]>;
+def ACQUIRE_MOV16rm : I<0, Pseudo, (outs GR16:$dst), (ins i16mem:$src),
+ "#ACQUIRE_MOV16rm PSEUDO!",
+ [(set GR16:$dst, (atomic_load_16 addr:$src))]>;
+def ACQUIRE_MOV32rm : I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$src),
+ "#ACQUIRE_MOV32rm PSEUDO!",
+ [(set GR32:$dst, (atomic_load_32 addr:$src))]>;
+def ACQUIRE_MOV64rm : I<0, Pseudo, (outs GR64:$dst), (ins i64mem:$src),
+ "#ACQUIRE_MOV64rm PSEUDO!",
+ [(set GR64:$dst, (atomic_load_64 addr:$src))]>;
+
+//===----------------------------------------------------------------------===//
+// DAG Pattern Matching Rules
+//===----------------------------------------------------------------------===//
+
+// ConstantPool, GlobalAddress, ExternalSymbol, and JumpTable
+def : Pat<(i32 (X86Wrapper tconstpool :$dst)), (MOV32ri tconstpool :$dst)>;
+def : Pat<(i32 (X86Wrapper tjumptable :$dst)), (MOV32ri tjumptable :$dst)>;
+def : Pat<(i32 (X86Wrapper tglobaltlsaddr:$dst)),(MOV32ri tglobaltlsaddr:$dst)>;
+def : Pat<(i32 (X86Wrapper tglobaladdr :$dst)), (MOV32ri tglobaladdr :$dst)>;
+def : Pat<(i32 (X86Wrapper texternalsym:$dst)), (MOV32ri texternalsym:$dst)>;
+def : Pat<(i32 (X86Wrapper mcsym:$dst)), (MOV32ri mcsym:$dst)>;
+def : Pat<(i32 (X86Wrapper tblockaddress:$dst)), (MOV32ri tblockaddress:$dst)>;
+
+def : Pat<(add GR32:$src1, (X86Wrapper tconstpool:$src2)),
+ (ADD32ri GR32:$src1, tconstpool:$src2)>;
+def : Pat<(add GR32:$src1, (X86Wrapper tjumptable:$src2)),
+ (ADD32ri GR32:$src1, tjumptable:$src2)>;
+def : Pat<(add GR32:$src1, (X86Wrapper tglobaladdr :$src2)),
+ (ADD32ri GR32:$src1, tglobaladdr:$src2)>;
+def : Pat<(add GR32:$src1, (X86Wrapper texternalsym:$src2)),
+ (ADD32ri GR32:$src1, texternalsym:$src2)>;
+def : Pat<(add GR32:$src1, (X86Wrapper mcsym:$src2)),
+ (ADD32ri GR32:$src1, mcsym:$src2)>;
+def : Pat<(add GR32:$src1, (X86Wrapper tblockaddress:$src2)),
+ (ADD32ri GR32:$src1, tblockaddress:$src2)>;
+
+def : Pat<(store (i32 (X86Wrapper tglobaladdr:$src)), addr:$dst),
+ (MOV32mi addr:$dst, tglobaladdr:$src)>;
+def : Pat<(store (i32 (X86Wrapper texternalsym:$src)), addr:$dst),
+ (MOV32mi addr:$dst, texternalsym:$src)>;
+def : Pat<(store (i32 (X86Wrapper mcsym:$src)), addr:$dst),
+ (MOV32mi addr:$dst, mcsym:$src)>;
+def : Pat<(store (i32 (X86Wrapper tblockaddress:$src)), addr:$dst),
+ (MOV32mi addr:$dst, tblockaddress:$src)>;
+
+// ConstantPool, GlobalAddress, ExternalSymbol, and JumpTable, when not in the
+// small code model, should use 'movabs'. FIXME: This is really a hack; the
+// 'movabs' predicate should handle this sort of thing.
+def : Pat<(i64 (X86Wrapper tconstpool :$dst)),
+ (MOV64ri tconstpool :$dst)>, Requires<[FarData]>;
+def : Pat<(i64 (X86Wrapper tjumptable :$dst)),
+ (MOV64ri tjumptable :$dst)>, Requires<[FarData]>;
+def : Pat<(i64 (X86Wrapper tglobaladdr :$dst)),
+ (MOV64ri tglobaladdr :$dst)>, Requires<[FarData]>;
+def : Pat<(i64 (X86Wrapper texternalsym:$dst)),
+ (MOV64ri texternalsym:$dst)>, Requires<[FarData]>;
+def : Pat<(i64 (X86Wrapper mcsym:$dst)),
+ (MOV64ri mcsym:$dst)>, Requires<[FarData]>;
+def : Pat<(i64 (X86Wrapper tblockaddress:$dst)),
+ (MOV64ri tblockaddress:$dst)>, Requires<[FarData]>;
+
+// In the kernel code model, we can get the address of a label
+// into a register with 'movq'. FIXME: This is a hack; the 'imm' predicate of
+// MOV64ri32 should accept these.
+def : Pat<(i64 (X86Wrapper tconstpool :$dst)),
+ (MOV64ri32 tconstpool :$dst)>, Requires<[KernelCode]>;
+def : Pat<(i64 (X86Wrapper tjumptable :$dst)),
+ (MOV64ri32 tjumptable :$dst)>, Requires<[KernelCode]>;
+def : Pat<(i64 (X86Wrapper tglobaladdr :$dst)),
+ (MOV64ri32 tglobaladdr :$dst)>, Requires<[KernelCode]>;
+def : Pat<(i64 (X86Wrapper texternalsym:$dst)),
+ (MOV64ri32 texternalsym:$dst)>, Requires<[KernelCode]>;
+def : Pat<(i64 (X86Wrapper mcsym:$dst)),
+ (MOV64ri32 mcsym:$dst)>, Requires<[KernelCode]>;
+def : Pat<(i64 (X86Wrapper tblockaddress:$dst)),
+ (MOV64ri32 tblockaddress:$dst)>, Requires<[KernelCode]>;
+
+// In the small code model with -static, it is safe to store global addresses
+// directly as immediates. FIXME: This is really a hack; the 'imm' predicate
+// of MOV64mi32 should handle this sort of thing.
+def : Pat<(store (i64 (X86Wrapper tconstpool:$src)), addr:$dst),
+ (MOV64mi32 addr:$dst, tconstpool:$src)>,
+ Requires<[NearData, IsStatic]>;
+def : Pat<(store (i64 (X86Wrapper tjumptable:$src)), addr:$dst),
+ (MOV64mi32 addr:$dst, tjumptable:$src)>,
+ Requires<[NearData, IsStatic]>;
+def : Pat<(store (i64 (X86Wrapper tglobaladdr:$src)), addr:$dst),
+ (MOV64mi32 addr:$dst, tglobaladdr:$src)>,
+ Requires<[NearData, IsStatic]>;
+def : Pat<(store (i64 (X86Wrapper texternalsym:$src)), addr:$dst),
+ (MOV64mi32 addr:$dst, texternalsym:$src)>,
+ Requires<[NearData, IsStatic]>;
+def : Pat<(store (i64 (X86Wrapper mcsym:$src)), addr:$dst),
+ (MOV64mi32 addr:$dst, mcsym:$src)>,
+ Requires<[NearData, IsStatic]>;
+def : Pat<(store (i64 (X86Wrapper tblockaddress:$src)), addr:$dst),
+ (MOV64mi32 addr:$dst, tblockaddress:$src)>,
+ Requires<[NearData, IsStatic]>;
+
+def : Pat<(i32 (X86RecoverFrameAlloc mcsym:$dst)), (MOV32ri mcsym:$dst)>;
+def : Pat<(i64 (X86RecoverFrameAlloc mcsym:$dst)), (MOV64ri mcsym:$dst)>;
+
+// Calls
+
+// TLS has some funny stuff here...
+// This corresponds to movabs $foo@tpoff, %rax
+def : Pat<(i64 (X86Wrapper tglobaltlsaddr :$dst)),
+ (MOV64ri32 tglobaltlsaddr :$dst)>;
+// This corresponds to add $foo@tpoff, %rax
+def : Pat<(add GR64:$src1, (X86Wrapper tglobaltlsaddr :$dst)),
+ (ADD64ri32 GR64:$src1, tglobaltlsaddr :$dst)>;
+
+
+// Direct PC relative function call for small code model. 32-bit displacement
+// sign extended to 64-bit.
+def : Pat<(X86call (i64 tglobaladdr:$dst)),
+ (CALL64pcrel32 tglobaladdr:$dst)>;
+def : Pat<(X86call (i64 texternalsym:$dst)),
+ (CALL64pcrel32 texternalsym:$dst)>;
+
+// Tailcall stuff. The TCRETURN instructions execute after the epilog, so they
+// can never use callee-saved registers. That is the purpose of the GR64_TC
+// register classes.
+//
+// The only volatile register that is never used by the calling convention is
+// %r11. This happens when calling a vararg function with 6 arguments.
+//
+// Match an X86tcret that uses less than 7 volatile registers.
+def X86tcret_6regs : PatFrag<(ops node:$ptr, node:$off),
+ (X86tcret node:$ptr, node:$off), [{
+ // X86tcret args: (*chain, ptr, imm, regs..., glue)
+ unsigned NumRegs = 0;
+ for (unsigned i = 3, e = N->getNumOperands(); i != e; ++i)
+ if (isa<RegisterSDNode>(N->getOperand(i)) && ++NumRegs > 6)
+ return false;
+ return true;
+}]>;
+
+def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off),
+ (TCRETURNri ptr_rc_tailcall:$dst, imm:$off)>,
+ Requires<[Not64BitMode]>;
+
+// FIXME: This is disabled for 32-bit PIC mode because the global base
+// register which is part of the address mode may be assigned a
+// callee-saved register.
+def : Pat<(X86tcret (load addr:$dst), imm:$off),
+ (TCRETURNmi addr:$dst, imm:$off)>,
+ Requires<[Not64BitMode, IsNotPIC]>;
+
+def : Pat<(X86tcret (i32 tglobaladdr:$dst), imm:$off),
+ (TCRETURNdi tglobaladdr:$dst, imm:$off)>,
+ Requires<[NotLP64]>;
+
+def : Pat<(X86tcret (i32 texternalsym:$dst), imm:$off),
+ (TCRETURNdi texternalsym:$dst, imm:$off)>,
+ Requires<[NotLP64]>;
+
+def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off),
+ (TCRETURNri64 ptr_rc_tailcall:$dst, imm:$off)>,
+ Requires<[In64BitMode]>;
+
+// Don't fold loads into X86tcret requiring more than 6 regs.
+// There wouldn't be enough scratch registers for base+index.
+def : Pat<(X86tcret_6regs (load addr:$dst), imm:$off),
+ (TCRETURNmi64 addr:$dst, imm:$off)>,
+ Requires<[In64BitMode]>;
+
+def : Pat<(X86tcret (i64 tglobaladdr:$dst), imm:$off),
+ (TCRETURNdi64 tglobaladdr:$dst, imm:$off)>,
+ Requires<[IsLP64]>;
+
+def : Pat<(X86tcret (i64 texternalsym:$dst), imm:$off),
+ (TCRETURNdi64 texternalsym:$dst, imm:$off)>,
+ Requires<[IsLP64]>;
+
+// Normal calls, with various flavors of addresses.
+def : Pat<(X86call (i32 tglobaladdr:$dst)),
+ (CALLpcrel32 tglobaladdr:$dst)>;
+def : Pat<(X86call (i32 texternalsym:$dst)),
+ (CALLpcrel32 texternalsym:$dst)>;
+def : Pat<(X86call (i32 imm:$dst)),
+ (CALLpcrel32 imm:$dst)>, Requires<[CallImmAddr]>;
+
+// Comparisons.
+
+// TEST R,R is smaller than CMP R,0
+def : Pat<(X86cmp GR8:$src1, 0),
+ (TEST8rr GR8:$src1, GR8:$src1)>;
+def : Pat<(X86cmp GR16:$src1, 0),
+ (TEST16rr GR16:$src1, GR16:$src1)>;
+def : Pat<(X86cmp GR32:$src1, 0),
+ (TEST32rr GR32:$src1, GR32:$src1)>;
+def : Pat<(X86cmp GR64:$src1, 0),
+ (TEST64rr GR64:$src1, GR64:$src1)>;
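+
+// Concretely: "testl %eax, %eax" is 2 bytes while "cmpl $0, %eax" is 3, and
+// for a comparison against zero the two set the flags equivalently.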
+
+// Conditional moves with folded loads with operands swapped and conditions
+// inverted.
+multiclass CMOVmr<PatLeaf InvertedCond, Instruction Inst16, Instruction Inst32,
+ Instruction Inst64> {
+ let Predicates = [HasCMov] in {
+ def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, InvertedCond, EFLAGS),
+ (Inst16 GR16:$src2, addr:$src1)>;
+ def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, InvertedCond, EFLAGS),
+ (Inst32 GR32:$src2, addr:$src1)>;
+ def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, InvertedCond, EFLAGS),
+ (Inst64 GR64:$src2, addr:$src1)>;
+ }
+}
+
+defm : CMOVmr<X86_COND_B , CMOVAE16rm, CMOVAE32rm, CMOVAE64rm>;
+defm : CMOVmr<X86_COND_AE, CMOVB16rm , CMOVB32rm , CMOVB64rm>;
+defm : CMOVmr<X86_COND_E , CMOVNE16rm, CMOVNE32rm, CMOVNE64rm>;
+defm : CMOVmr<X86_COND_NE, CMOVE16rm , CMOVE32rm , CMOVE64rm>;
+defm : CMOVmr<X86_COND_BE, CMOVA16rm , CMOVA32rm , CMOVA64rm>;
+defm : CMOVmr<X86_COND_A , CMOVBE16rm, CMOVBE32rm, CMOVBE64rm>;
+defm : CMOVmr<X86_COND_L , CMOVGE16rm, CMOVGE32rm, CMOVGE64rm>;
+defm : CMOVmr<X86_COND_GE, CMOVL16rm , CMOVL32rm , CMOVL64rm>;
+defm : CMOVmr<X86_COND_LE, CMOVG16rm , CMOVG32rm , CMOVG64rm>;
+defm : CMOVmr<X86_COND_G , CMOVLE16rm, CMOVLE32rm, CMOVLE64rm>;
+defm : CMOVmr<X86_COND_P , CMOVNP16rm, CMOVNP32rm, CMOVNP64rm>;
+defm : CMOVmr<X86_COND_NP, CMOVP16rm , CMOVP32rm , CMOVP64rm>;
+defm : CMOVmr<X86_COND_S , CMOVNS16rm, CMOVNS32rm, CMOVNS64rm>;
+defm : CMOVmr<X86_COND_NS, CMOVS16rm , CMOVS32rm , CMOVS64rm>;
+defm : CMOVmr<X86_COND_O , CMOVNO16rm, CMOVNO32rm, CMOVNO64rm>;
+defm : CMOVmr<X86_COND_NO, CMOVO16rm , CMOVO32rm , CMOVO64rm>;
+
+// zextload bool -> zextload byte
+def : Pat<(zextloadi8i1 addr:$src), (AND8ri (MOV8rm addr:$src), (i8 1))>;
+def : Pat<(zextloadi16i1 addr:$src), (AND16ri8 (MOVZX16rm8 addr:$src), (i16 1))>;
+def : Pat<(zextloadi32i1 addr:$src), (AND32ri8 (MOVZX32rm8 addr:$src), (i32 1))>;
+def : Pat<(zextloadi64i1 addr:$src),
+ (SUBREG_TO_REG (i64 0),
+ (AND32ri8 (MOVZX32rm8 addr:$src), (i32 1)), sub_32bit)>;
+
+// extload bool -> extload byte
+// When extloading from 16-bit and smaller memory locations into 64-bit
+// registers, use zero-extending loads so that the entire 64-bit register is
+// defined, avoiding partial-register updates.
+
+def : Pat<(extloadi8i1 addr:$src), (MOV8rm addr:$src)>;
+def : Pat<(extloadi16i1 addr:$src), (MOVZX16rm8 addr:$src)>;
+def : Pat<(extloadi32i1 addr:$src), (MOVZX32rm8 addr:$src)>;
+def : Pat<(extloadi16i8 addr:$src), (MOVZX16rm8 addr:$src)>;
+def : Pat<(extloadi32i8 addr:$src), (MOVZX32rm8 addr:$src)>;
+def : Pat<(extloadi32i16 addr:$src), (MOVZX32rm16 addr:$src)>;
+
+// For other extloads, use subregs, so that the high contents of the register
+// are defined after an extload.
+def : Pat<(extloadi64i1 addr:$src),
+ (SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>;
+def : Pat<(extloadi64i8 addr:$src),
+ (SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>;
+def : Pat<(extloadi64i16 addr:$src),
+ (SUBREG_TO_REG (i64 0), (MOVZX32rm16 addr:$src), sub_32bit)>;
+def : Pat<(extloadi64i32 addr:$src),
+ (SUBREG_TO_REG (i64 0), (MOV32rm addr:$src), sub_32bit)>;
+
+// anyext. Define these to do an explicit zero-extend to
+// avoid partial-register updates.
+def : Pat<(i16 (anyext GR8 :$src)), (EXTRACT_SUBREG
+ (MOVZX32rr8 GR8 :$src), sub_16bit)>;
+def : Pat<(i32 (anyext GR8 :$src)), (MOVZX32rr8 GR8 :$src)>;
+
+// Except for i16 -> i32, since isel expects i16 ops to be promoted to i32.
+def : Pat<(i32 (anyext GR16:$src)),
+ (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR16:$src, sub_16bit)>;
+
+def : Pat<(i64 (anyext GR8 :$src)),
+ (SUBREG_TO_REG (i64 0), (MOVZX32rr8 GR8 :$src), sub_32bit)>;
+def : Pat<(i64 (anyext GR16:$src)),
+ (SUBREG_TO_REG (i64 0), (MOVZX32rr16 GR16 :$src), sub_32bit)>;
+def : Pat<(i64 (anyext GR32:$src)),
+ (SUBREG_TO_REG (i64 0), GR32:$src, sub_32bit)>;
+
+
+// Any instruction that defines a 32-bit result zeros the high half of the
+// 64-bit register, with a few exceptions: a truncate can be lowered to
+// EXTRACT_SUBREG, CopyFromReg may be copying from a truncate, AssertSext
+// carries no zero-extension guarantee, and x86's cmov doesn't do anything
+// if the condition is false. Any other 32-bit operation will zero-extend
+// up to 64 bits.
+def def32 : PatLeaf<(i32 GR32:$src), [{
+ return N->getOpcode() != ISD::TRUNCATE &&
+ N->getOpcode() != TargetOpcode::EXTRACT_SUBREG &&
+ N->getOpcode() != ISD::CopyFromReg &&
+ N->getOpcode() != ISD::AssertSext &&
+ N->getOpcode() != X86ISD::CMOV;
+}]>;
+
+// In the case of a 32-bit def that is known to implicitly zero-extend,
+// we can use a SUBREG_TO_REG.
+def : Pat<(i64 (zext def32:$src)),
+ (SUBREG_TO_REG (i64 0), GR32:$src, sub_32bit)>;
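+
+// No instruction is emitted for this zext at all: e.g. after
+// "addl %esi, %edi", the full %rdi already holds the zero-extended result,
+// so the SUBREG_TO_REG is purely a bookkeeping node.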
+
+//===----------------------------------------------------------------------===//
+// Pattern match OR as ADD
+//===----------------------------------------------------------------------===//
+
+// If safe, we prefer to pattern match OR as ADD at isel time, since ADD can
+// be 3-addressified into an LEA instruction to avoid copies. However, we
+// also want to finally emit these instructions as an OR at the end of the
+// code generator to make the generated code easier to read. To do this, we
+// select into "disjoint bits" pseudo ops.
+
+// Treat an 'or' node as an 'add' if the or'ed bits are known to be zero.
+def or_is_add : PatFrag<(ops node:$lhs, node:$rhs), (or node:$lhs, node:$rhs),[{
+ if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N->getOperand(1)))
+ return CurDAG->MaskedValueIsZero(N->getOperand(0), CN->getAPIntValue());
+
+ APInt KnownZero0, KnownOne0;
+ CurDAG->computeKnownBits(N->getOperand(0), KnownZero0, KnownOne0, 0);
+ APInt KnownZero1, KnownOne1;
+ CurDAG->computeKnownBits(N->getOperand(1), KnownZero1, KnownOne1, 0);
+ return (~KnownZero0 & ~KnownZero1) == 0;
+}]>;
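+
+// Example of a disjoint-bits 'or' this enables: in "(x << 3) | 4" the low
+// three bits of the shift result are known zero, so the 'or' is really an
+// 'add' and the whole expression can fold into one LEA, e.g.
+//     leal 4(,%rdi,8), %eax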
+
+
+// (or x1, x2) -> (add x1, x2) if the two operands are known not to share
+// bits. Try this before selecting to OR.
+let AddedComplexity = 5, SchedRW = [WriteALU] in {
+
+let isConvertibleToThreeAddress = 1,
+ Constraints = "$src1 = $dst", Defs = [EFLAGS] in {
+let isCommutable = 1 in {
+def ADD16rr_DB : I<0, Pseudo, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+ "", // orw/addw REG, REG
+ [(set GR16:$dst, (or_is_add GR16:$src1, GR16:$src2))]>;
+def ADD32rr_DB : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+ "", // orl/addl REG, REG
+ [(set GR32:$dst, (or_is_add GR32:$src1, GR32:$src2))]>;
+def ADD64rr_DB : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "", // orq/addq REG, REG
+ [(set GR64:$dst, (or_is_add GR64:$src1, GR64:$src2))]>;
+} // isCommutable
+
+// NOTE: These are order-specific; we want the ri8 forms to be listed
+// first so that they are slightly preferred to the ri forms.
+
+def ADD16ri8_DB : I<0, Pseudo,
+ (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2),
+ "", // orw/addw REG, imm8
+ [(set GR16:$dst,(or_is_add GR16:$src1,i16immSExt8:$src2))]>;
+def ADD16ri_DB : I<0, Pseudo, (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2),
+ "", // orw/addw REG, imm
+ [(set GR16:$dst, (or_is_add GR16:$src1, imm:$src2))]>;
+
+def ADD32ri8_DB : I<0, Pseudo,
+ (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2),
+ "", // orl/addl REG, imm8
+ [(set GR32:$dst,(or_is_add GR32:$src1,i32immSExt8:$src2))]>;
+def ADD32ri_DB : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$src1, i32imm:$src2),
+ "", // orl/addl REG, imm
+ [(set GR32:$dst, (or_is_add GR32:$src1, imm:$src2))]>;
+
+
+def ADD64ri8_DB : I<0, Pseudo,
+ (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2),
+ "", // orq/addq REG, imm8
+ [(set GR64:$dst, (or_is_add GR64:$src1,
+ i64immSExt8:$src2))]>;
+def ADD64ri32_DB : I<0, Pseudo,
+ (outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2),
+ "", // orq/addq REG, imm
+ [(set GR64:$dst, (or_is_add GR64:$src1,
+ i64immSExt32:$src2))]>;
+}
+} // AddedComplexity, SchedRW
+
+
+//===----------------------------------------------------------------------===//
+// Some peepholes
+//===----------------------------------------------------------------------===//
+
+// Odd encoding trick: -128 fits into an 8-bit immediate field while
+// +128 doesn't, so in this special case use a sub instead of an add.
+def : Pat<(add GR16:$src1, 128),
+ (SUB16ri8 GR16:$src1, -128)>;
+def : Pat<(store (add (loadi16 addr:$dst), 128), addr:$dst),
+ (SUB16mi8 addr:$dst, -128)>;
+
+def : Pat<(add GR32:$src1, 128),
+ (SUB32ri8 GR32:$src1, -128)>;
+def : Pat<(store (add (loadi32 addr:$dst), 128), addr:$dst),
+ (SUB32mi8 addr:$dst, -128)>;
+
+def : Pat<(add GR64:$src1, 128),
+ (SUB64ri8 GR64:$src1, -128)>;
+def : Pat<(store (add (loadi64 addr:$dst), 128), addr:$dst),
+ (SUB64mi8 addr:$dst, -128)>;
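+
+// Concretely: "addl $128, %eax" needs a 32-bit immediate (5-6 bytes), while
+// "subl $-128, %eax" uses the sign-extended 8-bit immediate form (83 /5,
+// 3 bytes).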
+
+// The same trick applies for 32-bit immediate fields in 64-bit
+// instructions.
+def : Pat<(add GR64:$src1, 0x0000000080000000),
+ (SUB64ri32 GR64:$src1, 0xffffffff80000000)>;
+def : Pat<(store (add (loadi64 addr:$dst), 0x0000000080000000), addr:$dst),
+ (SUB64mi32 addr:$dst, 0xffffffff80000000)>;
+
+// To avoid needing to materialize an immediate in a register, use a 32-bit
+// 'and' with implicit zero-extension instead of a 64-bit 'and' if the
+// immediate has at least 32 bits of leading zeros. If in addition the last
+// 32 bits can be represented with a sign extension of an 8-bit constant,
+// use that. This can also reduce instruction size by eliminating the need
+// for the REX prefix.
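+// For example, "andq $0x7f, %rax" (REX.W + 83 /4 ib, 4 bytes) can instead
+// be selected as "andl $0x7f, %eax" (83 /4 ib, 3 bytes); the 32-bit 'and'
+// zeros bits 63:32, which is exactly what the constant's leading zeros
+// require.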
+
+// AddedComplexity is needed to give priority over i64immSExt8 and i64immSExt32.
+let AddedComplexity = 1 in {
+def : Pat<(and GR64:$src, i64immZExt32SExt8:$imm),
+ (SUBREG_TO_REG
+ (i64 0),
+ (AND32ri8
+ (EXTRACT_SUBREG GR64:$src, sub_32bit),
+ (i32 (GetLo8XForm imm:$imm))),
+ sub_32bit)>;
+
+def : Pat<(and GR64:$src, i64immZExt32:$imm),
+ (SUBREG_TO_REG
+ (i64 0),
+ (AND32ri
+ (EXTRACT_SUBREG GR64:$src, sub_32bit),
+ (i32 (GetLo32XForm imm:$imm))),
+ sub_32bit)>;
+} // AddedComplexity = 1
+
+
+// AddedComplexity is needed due to the increased complexity of the
+// i64immZExt32SExt8 and i64immZExt32 patterns above. Applying this to all
+// the MOVZX patterns keeps them together in the DAGISel tables.
+let AddedComplexity = 1 in {
+// r & (2^16-1) ==> movz
+def : Pat<(and GR32:$src1, 0xffff),
+ (MOVZX32rr16 (EXTRACT_SUBREG GR32:$src1, sub_16bit))>;
+// r & (2^8-1) ==> movz
+def : Pat<(and GR32:$src1, 0xff),
+ (MOVZX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src1,
+ GR32_ABCD)),
+ sub_8bit))>,
+ Requires<[Not64BitMode]>;
+// r & (2^8-1) ==> movz
+def : Pat<(and GR16:$src1, 0xff),
+ (EXTRACT_SUBREG (MOVZX32rr8 (EXTRACT_SUBREG
+ (i16 (COPY_TO_REGCLASS GR16:$src1, GR16_ABCD)), sub_8bit)),
+ sub_16bit)>,
+ Requires<[Not64BitMode]>;
+
+// r & (2^32-1) ==> movz
+def : Pat<(and GR64:$src, 0x00000000FFFFFFFF),
+ (SUBREG_TO_REG (i64 0),
+ (MOV32rr (EXTRACT_SUBREG GR64:$src, sub_32bit)),
+ sub_32bit)>;
+// r & (2^16-1) ==> movz
+def : Pat<(and GR64:$src, 0xffff),
+ (SUBREG_TO_REG (i64 0),
+ (MOVZX32rr16 (i16 (EXTRACT_SUBREG GR64:$src, sub_16bit))),
+ sub_32bit)>;
+// r & (2^8-1) ==> movz
+def : Pat<(and GR64:$src, 0xff),
+ (SUBREG_TO_REG (i64 0),
+ (MOVZX32rr8 (i8 (EXTRACT_SUBREG GR64:$src, sub_8bit))),
+ sub_32bit)>;
+// r & (2^8-1) ==> movz
+def : Pat<(and GR32:$src1, 0xff),
+ (MOVZX32rr8 (EXTRACT_SUBREG GR32:$src1, sub_8bit))>,
+ Requires<[In64BitMode]>;
+// r & (2^8-1) ==> movz
+def : Pat<(and GR16:$src1, 0xff),
+ (EXTRACT_SUBREG (MOVZX32rr8 (i8
+ (EXTRACT_SUBREG GR16:$src1, sub_8bit))), sub_16bit)>,
+ Requires<[In64BitMode]>;
+} // AddedComplexity = 1
+
+
+// sext_inreg patterns
+def : Pat<(sext_inreg GR32:$src, i16),
+ (MOVSX32rr16 (EXTRACT_SUBREG GR32:$src, sub_16bit))>;
+def : Pat<(sext_inreg GR32:$src, i8),
+ (MOVSX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src,
+ GR32_ABCD)),
+ sub_8bit))>,
+ Requires<[Not64BitMode]>;
+
+def : Pat<(sext_inreg GR16:$src, i8),
+ (EXTRACT_SUBREG (i32 (MOVSX32rr8 (EXTRACT_SUBREG
+ (i32 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), sub_8bit))),
+ sub_16bit)>,
+ Requires<[Not64BitMode]>;
+
+def : Pat<(sext_inreg GR64:$src, i32),
+ (MOVSX64rr32 (EXTRACT_SUBREG GR64:$src, sub_32bit))>;
+def : Pat<(sext_inreg GR64:$src, i16),
+ (MOVSX64rr16 (EXTRACT_SUBREG GR64:$src, sub_16bit))>;
+def : Pat<(sext_inreg GR64:$src, i8),
+ (MOVSX64rr8 (EXTRACT_SUBREG GR64:$src, sub_8bit))>;
+def : Pat<(sext_inreg GR32:$src, i8),
+ (MOVSX32rr8 (EXTRACT_SUBREG GR32:$src, sub_8bit))>,
+ Requires<[In64BitMode]>;
+def : Pat<(sext_inreg GR16:$src, i8),
+ (EXTRACT_SUBREG (MOVSX32rr8
+ (EXTRACT_SUBREG GR16:$src, sub_8bit)), sub_16bit)>,
+ Requires<[In64BitMode]>;
+
+// sext, sext_load, zext, zext_load
+def: Pat<(i16 (sext GR8:$src)),
+ (EXTRACT_SUBREG (MOVSX32rr8 GR8:$src), sub_16bit)>;
+def: Pat<(sextloadi16i8 addr:$src),
+ (EXTRACT_SUBREG (MOVSX32rm8 addr:$src), sub_16bit)>;
+def: Pat<(i16 (zext GR8:$src)),
+ (EXTRACT_SUBREG (MOVZX32rr8 GR8:$src), sub_16bit)>;
+def: Pat<(zextloadi16i8 addr:$src),
+ (EXTRACT_SUBREG (MOVZX32rm8 addr:$src), sub_16bit)>;
+
+// trunc patterns
+def : Pat<(i16 (trunc GR32:$src)),
+ (EXTRACT_SUBREG GR32:$src, sub_16bit)>;
+def : Pat<(i8 (trunc GR32:$src)),
+ (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)),
+ sub_8bit)>,
+ Requires<[Not64BitMode]>;
+def : Pat<(i8 (trunc GR16:$src)),
+ (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
+ sub_8bit)>,
+ Requires<[Not64BitMode]>;
+def : Pat<(i32 (trunc GR64:$src)),
+ (EXTRACT_SUBREG GR64:$src, sub_32bit)>;
+def : Pat<(i16 (trunc GR64:$src)),
+ (EXTRACT_SUBREG GR64:$src, sub_16bit)>;
+def : Pat<(i8 (trunc GR64:$src)),
+ (EXTRACT_SUBREG GR64:$src, sub_8bit)>;
+def : Pat<(i8 (trunc GR32:$src)),
+ (EXTRACT_SUBREG GR32:$src, sub_8bit)>,
+ Requires<[In64BitMode]>;
+def : Pat<(i8 (trunc GR16:$src)),
+ (EXTRACT_SUBREG GR16:$src, sub_8bit)>,
+ Requires<[In64BitMode]>;
+
+// h-register tricks
+def : Pat<(i8 (trunc (srl_su GR16:$src, (i8 8)))),
+ (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
+ sub_8bit_hi)>,
+ Requires<[Not64BitMode]>;
+def : Pat<(i8 (trunc (srl_su GR32:$src, (i8 8)))),
+ (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)),
+ sub_8bit_hi)>,
+ Requires<[Not64BitMode]>;
+def : Pat<(srl GR16:$src, (i8 8)),
+ (EXTRACT_SUBREG
+ (MOVZX32rr8
+ (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
+ sub_8bit_hi)),
+ sub_16bit)>,
+ Requires<[Not64BitMode]>;
+def : Pat<(i32 (zext (srl_su GR16:$src, (i8 8)))),
+ (MOVZX32rr8 (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src,
+ GR16_ABCD)),
+ sub_8bit_hi))>,
+ Requires<[Not64BitMode]>;
+def : Pat<(i32 (anyext (srl_su GR16:$src, (i8 8)))),
+ (MOVZX32rr8 (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src,
+ GR16_ABCD)),
+ sub_8bit_hi))>,
+ Requires<[Not64BitMode]>;
+def : Pat<(and (srl_su GR32:$src, (i8 8)), (i32 255)),
+ (MOVZX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src,
+ GR32_ABCD)),
+ sub_8bit_hi))>,
+ Requires<[Not64BitMode]>;
+def : Pat<(srl (and_su GR32:$src, 0xff00), (i8 8)),
+ (MOVZX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src,
+ GR32_ABCD)),
+ sub_8bit_hi))>,
+ Requires<[Not64BitMode]>;
+
+// h-register tricks.
+// For now, be conservative on x86-64 and use an h-register extract only if the
+// value is immediately zero-extended or stored, which are somewhat common
+// cases. This uses a bunch of code to prevent a register requiring a REX prefix
+// from being allocated in the same instruction as the h register, as there's
+// currently no way to describe this requirement to the register allocator.
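+// For example (illustrative): for "(x >> 8) & 0xff" the patterns below
+// select a single movzbl-from-AH style extract of the second byte, rather
+// than a shift followed by a mask.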
+
+// h-register extract and zero-extend.
+def : Pat<(and (srl_su GR64:$src, (i8 8)), (i64 255)),
+ (SUBREG_TO_REG
+ (i64 0),
+ (MOVZX32_NOREXrr8
+ (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS GR64:$src, GR64_ABCD)),
+ sub_8bit_hi)),
+ sub_32bit)>;
+def : Pat<(and (srl_su GR32:$src, (i8 8)), (i32 255)),
+ (MOVZX32_NOREXrr8
+ (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)),
+ sub_8bit_hi))>,
+ Requires<[In64BitMode]>;
+def : Pat<(srl (and_su GR32:$src, 0xff00), (i8 8)),
+ (MOVZX32_NOREXrr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src,
+ GR32_ABCD)),
+ sub_8bit_hi))>,
+ Requires<[In64BitMode]>;
+def : Pat<(srl GR16:$src, (i8 8)),
+ (EXTRACT_SUBREG
+ (MOVZX32_NOREXrr8
+ (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
+ sub_8bit_hi)),
+ sub_16bit)>,
+ Requires<[In64BitMode]>;
+def : Pat<(i32 (zext (srl_su GR16:$src, (i8 8)))),
+ (MOVZX32_NOREXrr8
+ (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
+ sub_8bit_hi))>,
+ Requires<[In64BitMode]>;
+def : Pat<(i32 (anyext (srl_su GR16:$src, (i8 8)))),
+ (MOVZX32_NOREXrr8
+ (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
+ sub_8bit_hi))>,
+ Requires<[In64BitMode]>;
+def : Pat<(i64 (zext (srl_su GR16:$src, (i8 8)))),
+ (SUBREG_TO_REG
+ (i64 0),
+ (MOVZX32_NOREXrr8
+ (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
+ sub_8bit_hi)),
+ sub_32bit)>;
+def : Pat<(i64 (anyext (srl_su GR16:$src, (i8 8)))),
+ (SUBREG_TO_REG
+ (i64 0),
+ (MOVZX32_NOREXrr8
+ (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
+ sub_8bit_hi)),
+ sub_32bit)>;
+
+// h-register extract and store.
+def : Pat<(store (i8 (trunc_su (srl_su GR64:$src, (i8 8)))), addr:$dst),
+ (MOV8mr_NOREX
+ addr:$dst,
+ (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS GR64:$src, GR64_ABCD)),
+ sub_8bit_hi))>;
+def : Pat<(store (i8 (trunc_su (srl_su GR32:$src, (i8 8)))), addr:$dst),
+ (MOV8mr_NOREX
+ addr:$dst,
+ (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)),
+ sub_8bit_hi))>,
+ Requires<[In64BitMode]>;
+def : Pat<(store (i8 (trunc_su (srl_su GR16:$src, (i8 8)))), addr:$dst),
+ (MOV8mr_NOREX
+ addr:$dst,
+ (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
+ sub_8bit_hi))>,
+ Requires<[In64BitMode]>;
+
+
+// (shl x, 1) ==> (add x, x)
+// Note that if x is undef (immediate or otherwise), we could theoretically
+// end up with the two uses of x getting different values, producing a result
+// where the least significant bit is not 0. However, the probability of this
+// happening is considered low enough that this is officially not a
+// "real problem".
+def : Pat<(shl GR8 :$src1, (i8 1)), (ADD8rr GR8 :$src1, GR8 :$src1)>;
+def : Pat<(shl GR16:$src1, (i8 1)), (ADD16rr GR16:$src1, GR16:$src1)>;
+def : Pat<(shl GR32:$src1, (i8 1)), (ADD32rr GR32:$src1, GR32:$src1)>;
+def : Pat<(shl GR64:$src1, (i8 1)), (ADD64rr GR64:$src1, GR64:$src1)>;
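+// (Illustrative note, not from this patch: on many x86 cores an add can
+// issue on more execution ports than a shift, so the add form is at worst
+// neutral.)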
+
+// Helper imms that check if a mask doesn't change significant shift bits.
+def immShift32 : ImmLeaf<i8, [{
+ return countTrailingOnes<uint64_t>(Imm) >= 5;
+}]>;
+def immShift64 : ImmLeaf<i8, [{
+ return countTrailingOnes<uint64_t>(Imm) >= 6;
+}]>;
+
+// Shift amount is implicitly masked.
+multiclass MaskedShiftAmountPats<SDNode frag, string name> {
+ // (shift x (and y, 31)) ==> (shift x, y)
+ def : Pat<(frag GR8:$src1, (and CL, immShift32)),
+ (!cast<Instruction>(name # "8rCL") GR8:$src1)>;
+ def : Pat<(frag GR16:$src1, (and CL, immShift32)),
+ (!cast<Instruction>(name # "16rCL") GR16:$src1)>;
+ def : Pat<(frag GR32:$src1, (and CL, immShift32)),
+ (!cast<Instruction>(name # "32rCL") GR32:$src1)>;
+ def : Pat<(store (frag (loadi8 addr:$dst), (and CL, immShift32)), addr:$dst),
+ (!cast<Instruction>(name # "8mCL") addr:$dst)>;
+ def : Pat<(store (frag (loadi16 addr:$dst), (and CL, immShift32)), addr:$dst),
+ (!cast<Instruction>(name # "16mCL") addr:$dst)>;
+ def : Pat<(store (frag (loadi32 addr:$dst), (and CL, immShift32)), addr:$dst),
+ (!cast<Instruction>(name # "32mCL") addr:$dst)>;
+
+ // (shift x (and y, 63)) ==> (shift x, y)
+ def : Pat<(frag GR64:$src1, (and CL, immShift64)),
+ (!cast<Instruction>(name # "64rCL") GR64:$src1)>;
+ def : Pat<(store (frag (loadi64 addr:$dst), (and CL, immShift64)), addr:$dst),
+ (!cast<Instruction>(name # "64mCL") addr:$dst)>;
+}
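+// For example (a sketch, not part of this patch): in C,
+//   uint32_t f(uint32_t x, unsigned n) { return x << (n & 31); }
+// selects SHL32rCL directly, because the hardware already masks the CL
+// shift count to 5 bits, so the explicit "& 31" costs no instruction.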
+
+defm : MaskedShiftAmountPats<shl, "SHL">;
+defm : MaskedShiftAmountPats<srl, "SHR">;
+defm : MaskedShiftAmountPats<sra, "SAR">;
+defm : MaskedShiftAmountPats<rotl, "ROL">;
+defm : MaskedShiftAmountPats<rotr, "ROR">;
+
+// (anyext (setcc_carry)) -> (setcc_carry)
+def : Pat<(i16 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
+ (SETB_C16r)>;
+def : Pat<(i32 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
+ (SETB_C32r)>;
+def : Pat<(i32 (anyext (i16 (X86setcc_c X86_COND_B, EFLAGS)))),
+ (SETB_C32r)>;
+
+
+
+
+//===----------------------------------------------------------------------===//
+// EFLAGS-defining Patterns
+//===----------------------------------------------------------------------===//
+
+// add reg, reg
+def : Pat<(add GR8 :$src1, GR8 :$src2), (ADD8rr GR8 :$src1, GR8 :$src2)>;
+def : Pat<(add GR16:$src1, GR16:$src2), (ADD16rr GR16:$src1, GR16:$src2)>;
+def : Pat<(add GR32:$src1, GR32:$src2), (ADD32rr GR32:$src1, GR32:$src2)>;
+
+// add reg, mem
+def : Pat<(add GR8:$src1, (loadi8 addr:$src2)),
+ (ADD8rm GR8:$src1, addr:$src2)>;
+def : Pat<(add GR16:$src1, (loadi16 addr:$src2)),
+ (ADD16rm GR16:$src1, addr:$src2)>;
+def : Pat<(add GR32:$src1, (loadi32 addr:$src2)),
+ (ADD32rm GR32:$src1, addr:$src2)>;
+
+// add reg, imm
+def : Pat<(add GR8 :$src1, imm:$src2), (ADD8ri GR8:$src1 , imm:$src2)>;
+def : Pat<(add GR16:$src1, imm:$src2), (ADD16ri GR16:$src1, imm:$src2)>;
+def : Pat<(add GR32:$src1, imm:$src2), (ADD32ri GR32:$src1, imm:$src2)>;
+def : Pat<(add GR16:$src1, i16immSExt8:$src2),
+ (ADD16ri8 GR16:$src1, i16immSExt8:$src2)>;
+def : Pat<(add GR32:$src1, i32immSExt8:$src2),
+ (ADD32ri8 GR32:$src1, i32immSExt8:$src2)>;
+
+// sub reg, reg
+def : Pat<(sub GR8 :$src1, GR8 :$src2), (SUB8rr GR8 :$src1, GR8 :$src2)>;
+def : Pat<(sub GR16:$src1, GR16:$src2), (SUB16rr GR16:$src1, GR16:$src2)>;
+def : Pat<(sub GR32:$src1, GR32:$src2), (SUB32rr GR32:$src1, GR32:$src2)>;
+
+// sub reg, mem
+def : Pat<(sub GR8:$src1, (loadi8 addr:$src2)),
+ (SUB8rm GR8:$src1, addr:$src2)>;
+def : Pat<(sub GR16:$src1, (loadi16 addr:$src2)),
+ (SUB16rm GR16:$src1, addr:$src2)>;
+def : Pat<(sub GR32:$src1, (loadi32 addr:$src2)),
+ (SUB32rm GR32:$src1, addr:$src2)>;
+
+// sub reg, imm
+def : Pat<(sub GR8:$src1, imm:$src2),
+ (SUB8ri GR8:$src1, imm:$src2)>;
+def : Pat<(sub GR16:$src1, imm:$src2),
+ (SUB16ri GR16:$src1, imm:$src2)>;
+def : Pat<(sub GR32:$src1, imm:$src2),
+ (SUB32ri GR32:$src1, imm:$src2)>;
+def : Pat<(sub GR16:$src1, i16immSExt8:$src2),
+ (SUB16ri8 GR16:$src1, i16immSExt8:$src2)>;
+def : Pat<(sub GR32:$src1, i32immSExt8:$src2),
+ (SUB32ri8 GR32:$src1, i32immSExt8:$src2)>;
+
+// sub 0, reg
+def : Pat<(X86sub_flag 0, GR8 :$src), (NEG8r GR8 :$src)>;
+def : Pat<(X86sub_flag 0, GR16:$src), (NEG16r GR16:$src)>;
+def : Pat<(X86sub_flag 0, GR32:$src), (NEG32r GR32:$src)>;
+def : Pat<(X86sub_flag 0, GR64:$src), (NEG64r GR64:$src)>;
+
+// mul reg, reg
+def : Pat<(mul GR16:$src1, GR16:$src2),
+ (IMUL16rr GR16:$src1, GR16:$src2)>;
+def : Pat<(mul GR32:$src1, GR32:$src2),
+ (IMUL32rr GR32:$src1, GR32:$src2)>;
+
+// mul reg, mem
+def : Pat<(mul GR16:$src1, (loadi16 addr:$src2)),
+ (IMUL16rm GR16:$src1, addr:$src2)>;
+def : Pat<(mul GR32:$src1, (loadi32 addr:$src2)),
+ (IMUL32rm GR32:$src1, addr:$src2)>;
+
+// mul reg, imm
+def : Pat<(mul GR16:$src1, imm:$src2),
+ (IMUL16rri GR16:$src1, imm:$src2)>;
+def : Pat<(mul GR32:$src1, imm:$src2),
+ (IMUL32rri GR32:$src1, imm:$src2)>;
+def : Pat<(mul GR16:$src1, i16immSExt8:$src2),
+ (IMUL16rri8 GR16:$src1, i16immSExt8:$src2)>;
+def : Pat<(mul GR32:$src1, i32immSExt8:$src2),
+ (IMUL32rri8 GR32:$src1, i32immSExt8:$src2)>;
+
+// reg = mul mem, imm
+def : Pat<(mul (loadi16 addr:$src1), imm:$src2),
+ (IMUL16rmi addr:$src1, imm:$src2)>;
+def : Pat<(mul (loadi32 addr:$src1), imm:$src2),
+ (IMUL32rmi addr:$src1, imm:$src2)>;
+def : Pat<(mul (loadi16 addr:$src1), i16immSExt8:$src2),
+ (IMUL16rmi8 addr:$src1, i16immSExt8:$src2)>;
+def : Pat<(mul (loadi32 addr:$src1), i32immSExt8:$src2),
+ (IMUL32rmi8 addr:$src1, i32immSExt8:$src2)>;
+
+// Patterns for nodes that do not produce flags, for instructions that do.
+
+// addition
+def : Pat<(add GR64:$src1, GR64:$src2),
+ (ADD64rr GR64:$src1, GR64:$src2)>;
+def : Pat<(add GR64:$src1, i64immSExt8:$src2),
+ (ADD64ri8 GR64:$src1, i64immSExt8:$src2)>;
+def : Pat<(add GR64:$src1, i64immSExt32:$src2),
+ (ADD64ri32 GR64:$src1, i64immSExt32:$src2)>;
+def : Pat<(add GR64:$src1, (loadi64 addr:$src2)),
+ (ADD64rm GR64:$src1, addr:$src2)>;
+
+// subtraction
+def : Pat<(sub GR64:$src1, GR64:$src2),
+ (SUB64rr GR64:$src1, GR64:$src2)>;
+def : Pat<(sub GR64:$src1, (loadi64 addr:$src2)),
+ (SUB64rm GR64:$src1, addr:$src2)>;
+def : Pat<(sub GR64:$src1, i64immSExt8:$src2),
+ (SUB64ri8 GR64:$src1, i64immSExt8:$src2)>;
+def : Pat<(sub GR64:$src1, i64immSExt32:$src2),
+ (SUB64ri32 GR64:$src1, i64immSExt32:$src2)>;
+
+// Multiply
+def : Pat<(mul GR64:$src1, GR64:$src2),
+ (IMUL64rr GR64:$src1, GR64:$src2)>;
+def : Pat<(mul GR64:$src1, (loadi64 addr:$src2)),
+ (IMUL64rm GR64:$src1, addr:$src2)>;
+def : Pat<(mul GR64:$src1, i64immSExt8:$src2),
+ (IMUL64rri8 GR64:$src1, i64immSExt8:$src2)>;
+def : Pat<(mul GR64:$src1, i64immSExt32:$src2),
+ (IMUL64rri32 GR64:$src1, i64immSExt32:$src2)>;
+def : Pat<(mul (loadi64 addr:$src1), i64immSExt8:$src2),
+ (IMUL64rmi8 addr:$src1, i64immSExt8:$src2)>;
+def : Pat<(mul (loadi64 addr:$src1), i64immSExt32:$src2),
+ (IMUL64rmi32 addr:$src1, i64immSExt32:$src2)>;
+
+// Increment/Decrement reg.
+// Do not use INC/DEC if they are slow.
+let Predicates = [NotSlowIncDec] in {
+ def : Pat<(add GR8:$src, 1), (INC8r GR8:$src)>;
+ def : Pat<(add GR16:$src, 1), (INC16r GR16:$src)>;
+ def : Pat<(add GR32:$src, 1), (INC32r GR32:$src)>;
+ def : Pat<(add GR64:$src, 1), (INC64r GR64:$src)>;
+ def : Pat<(add GR8:$src, -1), (DEC8r GR8:$src)>;
+ def : Pat<(add GR16:$src, -1), (DEC16r GR16:$src)>;
+ def : Pat<(add GR32:$src, -1), (DEC32r GR32:$src)>;
+ def : Pat<(add GR64:$src, -1), (DEC64r GR64:$src)>;
+}
+
+// or reg/reg.
+def : Pat<(or GR8 :$src1, GR8 :$src2), (OR8rr GR8 :$src1, GR8 :$src2)>;
+def : Pat<(or GR16:$src1, GR16:$src2), (OR16rr GR16:$src1, GR16:$src2)>;
+def : Pat<(or GR32:$src1, GR32:$src2), (OR32rr GR32:$src1, GR32:$src2)>;
+def : Pat<(or GR64:$src1, GR64:$src2), (OR64rr GR64:$src1, GR64:$src2)>;
+
+// or reg/mem
+def : Pat<(or GR8:$src1, (loadi8 addr:$src2)),
+ (OR8rm GR8:$src1, addr:$src2)>;
+def : Pat<(or GR16:$src1, (loadi16 addr:$src2)),
+ (OR16rm GR16:$src1, addr:$src2)>;
+def : Pat<(or GR32:$src1, (loadi32 addr:$src2)),
+ (OR32rm GR32:$src1, addr:$src2)>;
+def : Pat<(or GR64:$src1, (loadi64 addr:$src2)),
+ (OR64rm GR64:$src1, addr:$src2)>;
+
+// or reg/imm
+def : Pat<(or GR8:$src1 , imm:$src2), (OR8ri GR8 :$src1, imm:$src2)>;
+def : Pat<(or GR16:$src1, imm:$src2), (OR16ri GR16:$src1, imm:$src2)>;
+def : Pat<(or GR32:$src1, imm:$src2), (OR32ri GR32:$src1, imm:$src2)>;
+def : Pat<(or GR16:$src1, i16immSExt8:$src2),
+ (OR16ri8 GR16:$src1, i16immSExt8:$src2)>;
+def : Pat<(or GR32:$src1, i32immSExt8:$src2),
+ (OR32ri8 GR32:$src1, i32immSExt8:$src2)>;
+def : Pat<(or GR64:$src1, i64immSExt8:$src2),
+ (OR64ri8 GR64:$src1, i64immSExt8:$src2)>;
+def : Pat<(or GR64:$src1, i64immSExt32:$src2),
+ (OR64ri32 GR64:$src1, i64immSExt32:$src2)>;
+
+// xor reg/reg
+def : Pat<(xor GR8 :$src1, GR8 :$src2), (XOR8rr GR8 :$src1, GR8 :$src2)>;
+def : Pat<(xor GR16:$src1, GR16:$src2), (XOR16rr GR16:$src1, GR16:$src2)>;
+def : Pat<(xor GR32:$src1, GR32:$src2), (XOR32rr GR32:$src1, GR32:$src2)>;
+def : Pat<(xor GR64:$src1, GR64:$src2), (XOR64rr GR64:$src1, GR64:$src2)>;
+
+// xor reg/mem
+def : Pat<(xor GR8:$src1, (loadi8 addr:$src2)),
+ (XOR8rm GR8:$src1, addr:$src2)>;
+def : Pat<(xor GR16:$src1, (loadi16 addr:$src2)),
+ (XOR16rm GR16:$src1, addr:$src2)>;
+def : Pat<(xor GR32:$src1, (loadi32 addr:$src2)),
+ (XOR32rm GR32:$src1, addr:$src2)>;
+def : Pat<(xor GR64:$src1, (loadi64 addr:$src2)),
+ (XOR64rm GR64:$src1, addr:$src2)>;
+
+// xor reg/imm
+def : Pat<(xor GR8:$src1, imm:$src2),
+ (XOR8ri GR8:$src1, imm:$src2)>;
+def : Pat<(xor GR16:$src1, imm:$src2),
+ (XOR16ri GR16:$src1, imm:$src2)>;
+def : Pat<(xor GR32:$src1, imm:$src2),
+ (XOR32ri GR32:$src1, imm:$src2)>;
+def : Pat<(xor GR16:$src1, i16immSExt8:$src2),
+ (XOR16ri8 GR16:$src1, i16immSExt8:$src2)>;
+def : Pat<(xor GR32:$src1, i32immSExt8:$src2),
+ (XOR32ri8 GR32:$src1, i32immSExt8:$src2)>;
+def : Pat<(xor GR64:$src1, i64immSExt8:$src2),
+ (XOR64ri8 GR64:$src1, i64immSExt8:$src2)>;
+def : Pat<(xor GR64:$src1, i64immSExt32:$src2),
+ (XOR64ri32 GR64:$src1, i64immSExt32:$src2)>;
+
+// and reg/reg
+def : Pat<(and GR8 :$src1, GR8 :$src2), (AND8rr GR8 :$src1, GR8 :$src2)>;
+def : Pat<(and GR16:$src1, GR16:$src2), (AND16rr GR16:$src1, GR16:$src2)>;
+def : Pat<(and GR32:$src1, GR32:$src2), (AND32rr GR32:$src1, GR32:$src2)>;
+def : Pat<(and GR64:$src1, GR64:$src2), (AND64rr GR64:$src1, GR64:$src2)>;
+
+// and reg/mem
+def : Pat<(and GR8:$src1, (loadi8 addr:$src2)),
+ (AND8rm GR8:$src1, addr:$src2)>;
+def : Pat<(and GR16:$src1, (loadi16 addr:$src2)),
+ (AND16rm GR16:$src1, addr:$src2)>;
+def : Pat<(and GR32:$src1, (loadi32 addr:$src2)),
+ (AND32rm GR32:$src1, addr:$src2)>;
+def : Pat<(and GR64:$src1, (loadi64 addr:$src2)),
+ (AND64rm GR64:$src1, addr:$src2)>;
+
+// and reg/imm
+def : Pat<(and GR8:$src1, imm:$src2),
+ (AND8ri GR8:$src1, imm:$src2)>;
+def : Pat<(and GR16:$src1, imm:$src2),
+ (AND16ri GR16:$src1, imm:$src2)>;
+def : Pat<(and GR32:$src1, imm:$src2),
+ (AND32ri GR32:$src1, imm:$src2)>;
+def : Pat<(and GR16:$src1, i16immSExt8:$src2),
+ (AND16ri8 GR16:$src1, i16immSExt8:$src2)>;
+def : Pat<(and GR32:$src1, i32immSExt8:$src2),
+ (AND32ri8 GR32:$src1, i32immSExt8:$src2)>;
+def : Pat<(and GR64:$src1, i64immSExt8:$src2),
+ (AND64ri8 GR64:$src1, i64immSExt8:$src2)>;
+def : Pat<(and GR64:$src1, i64immSExt32:$src2),
+ (AND64ri32 GR64:$src1, i64immSExt32:$src2)>;
+
+// Bit scan instruction patterns to match explicit zero-undef behavior.
+def : Pat<(cttz_zero_undef GR16:$src), (BSF16rr GR16:$src)>;
+def : Pat<(cttz_zero_undef GR32:$src), (BSF32rr GR32:$src)>;
+def : Pat<(cttz_zero_undef GR64:$src), (BSF64rr GR64:$src)>;
+def : Pat<(cttz_zero_undef (loadi16 addr:$src)), (BSF16rm addr:$src)>;
+def : Pat<(cttz_zero_undef (loadi32 addr:$src)), (BSF32rm addr:$src)>;
+def : Pat<(cttz_zero_undef (loadi64 addr:$src)), (BSF64rm addr:$src)>;
+
+// When HasMOVBE is enabled it is possible to get a non-legalized
+// register-register 16-bit bswap. This maps it to a ROL instruction.
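+// For example (illustrative): bswap of the 16-bit value 0xAABB is 0xBBAA,
+// which is exactly a rotate of the 16-bit register by 8 bits.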
+let Predicates = [HasMOVBE] in {
+ def : Pat<(bswap GR16:$src), (ROL16ri GR16:$src, (i8 8))>;
+}
diff --git a/contrib/llvm/lib/Target/X86/X86InstrControl.td b/contrib/llvm/lib/Target/X86/X86InstrControl.td
new file mode 100644
index 0000000..8c351a5
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrControl.td
@@ -0,0 +1,329 @@
+//===-- X86InstrControl.td - Control Flow Instructions -----*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 jump, return, call, and related instructions.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Control Flow Instructions.
+//
+
+// Return instructions.
+//
+// The X86retflag return instructions are variadic because we may add ST0 and
+// ST1 arguments when returning values on the x87 stack.
+let isTerminator = 1, isReturn = 1, isBarrier = 1,
+ hasCtrlDep = 1, FPForm = SpecialFP, SchedRW = [WriteJumpLd] in {
+ def RETL : I <0xC3, RawFrm, (outs), (ins variable_ops),
+ "ret{l}", [(X86retflag 0)], IIC_RET>, OpSize32,
+ Requires<[Not64BitMode]>;
+ def RETQ : I <0xC3, RawFrm, (outs), (ins variable_ops),
+ "ret{q}", [(X86retflag 0)], IIC_RET>, OpSize32,
+ Requires<[In64BitMode]>;
+ def RETW : I <0xC3, RawFrm, (outs), (ins),
+ "ret{w}",
+ [], IIC_RET>, OpSize16;
+ def RETIL : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt, variable_ops),
+ "ret{l}\t$amt",
+ [(X86retflag timm:$amt)], IIC_RET_IMM>, OpSize32,
+ Requires<[Not64BitMode]>;
+ def RETIQ : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt, variable_ops),
+ "ret{q}\t$amt",
+ [(X86retflag timm:$amt)], IIC_RET_IMM>, OpSize32,
+ Requires<[In64BitMode]>;
+ def RETIW : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt),
+ "ret{w}\t$amt",
+ [], IIC_RET_IMM>, OpSize16;
+ def LRETL : I <0xCB, RawFrm, (outs), (ins),
+ "{l}ret{l|f}", [], IIC_RET>, OpSize32;
+ def LRETQ : RI <0xCB, RawFrm, (outs), (ins),
+ "{l}ret{|f}q", [], IIC_RET>, Requires<[In64BitMode]>;
+ def LRETW : I <0xCB, RawFrm, (outs), (ins),
+ "{l}ret{w|f}", [], IIC_RET>, OpSize16;
+ def LRETIL : Ii16<0xCA, RawFrm, (outs), (ins i16imm:$amt),
+ "{l}ret{l|f}\t$amt", [], IIC_RET>, OpSize32;
+ def LRETIQ : RIi16<0xCA, RawFrm, (outs), (ins i16imm:$amt),
+ "{l}ret{|f}q\t$amt", [], IIC_RET>, Requires<[In64BitMode]>;
+ def LRETIW : Ii16<0xCA, RawFrm, (outs), (ins i16imm:$amt),
+ "{l}ret{w|f}\t$amt", [], IIC_RET>, OpSize16;
+
+ // The machine return-from-interrupt instructions. Sometimes we also need
+ // to perform a post-epilogue stack adjustment, so codegen emits the pseudo
+ // form, which expands to include an SP adjustment if necessary.
+ def IRET16 : I <0xcf, RawFrm, (outs), (ins), "iret{w}", [], IIC_IRET>,
+ OpSize16;
+ def IRET32 : I <0xcf, RawFrm, (outs), (ins), "iret{l|d}", [],
+ IIC_IRET>, OpSize32;
+ def IRET64 : RI <0xcf, RawFrm, (outs), (ins), "iretq", [],
+ IIC_IRET>, Requires<[In64BitMode]>;
+ let isCodeGenOnly = 1 in
+ def IRET : PseudoI<(outs), (ins i16imm:$adj), [(X86iret timm:$adj)]>;
+
+}
+
+// Unconditional branches.
+let isBarrier = 1, isBranch = 1, isTerminator = 1, SchedRW = [WriteJump] in {
+ def JMP_1 : Ii8PCRel<0xEB, RawFrm, (outs), (ins brtarget8:$dst),
+ "jmp\t$dst", [(br bb:$dst)], IIC_JMP_REL>;
+ let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in {
+ def JMP_2 : Ii16PCRel<0xE9, RawFrm, (outs), (ins brtarget16:$dst),
+ "jmp\t$dst", [], IIC_JMP_REL>, OpSize16;
+ def JMP_4 : Ii32PCRel<0xE9, RawFrm, (outs), (ins brtarget32:$dst),
+ "jmp\t$dst", [], IIC_JMP_REL>, OpSize32;
+ }
+}
+
+// Conditional Branches.
+let isBranch = 1, isTerminator = 1, Uses = [EFLAGS], SchedRW = [WriteJump] in {
+ multiclass ICBr<bits<8> opc1, bits<8> opc4, string asm, PatFrag Cond> {
+ def _1 : Ii8PCRel <opc1, RawFrm, (outs), (ins brtarget8:$dst), asm,
+ [(X86brcond bb:$dst, Cond, EFLAGS)], IIC_Jcc>;
+ let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in {
+ def _2 : Ii16PCRel<opc4, RawFrm, (outs), (ins brtarget16:$dst), asm,
+ [], IIC_Jcc>, OpSize16, TB;
+ def _4 : Ii32PCRel<opc4, RawFrm, (outs), (ins brtarget32:$dst), asm,
+ [], IIC_Jcc>, TB, OpSize32;
+ }
+ }
+}
+
+defm JO : ICBr<0x70, 0x80, "jo\t$dst" , X86_COND_O>;
+defm JNO : ICBr<0x71, 0x81, "jno\t$dst", X86_COND_NO>;
+defm JB : ICBr<0x72, 0x82, "jb\t$dst" , X86_COND_B>;
+defm JAE : ICBr<0x73, 0x83, "jae\t$dst", X86_COND_AE>;
+defm JE : ICBr<0x74, 0x84, "je\t$dst" , X86_COND_E>;
+defm JNE : ICBr<0x75, 0x85, "jne\t$dst", X86_COND_NE>;
+defm JBE : ICBr<0x76, 0x86, "jbe\t$dst", X86_COND_BE>;
+defm JA : ICBr<0x77, 0x87, "ja\t$dst" , X86_COND_A>;
+defm JS : ICBr<0x78, 0x88, "js\t$dst" , X86_COND_S>;
+defm JNS : ICBr<0x79, 0x89, "jns\t$dst", X86_COND_NS>;
+defm JP : ICBr<0x7A, 0x8A, "jp\t$dst" , X86_COND_P>;
+defm JNP : ICBr<0x7B, 0x8B, "jnp\t$dst", X86_COND_NP>;
+defm JL : ICBr<0x7C, 0x8C, "jl\t$dst" , X86_COND_L>;
+defm JGE : ICBr<0x7D, 0x8D, "jge\t$dst", X86_COND_GE>;
+defm JLE : ICBr<0x7E, 0x8E, "jle\t$dst", X86_COND_LE>;
+defm JG : ICBr<0x7F, 0x8F, "jg\t$dst" , X86_COND_G>;
+
+// jcxz/jecxz/jrcxz instructions.
+let isBranch = 1, isTerminator = 1, hasSideEffects = 0, SchedRW = [WriteJump] in {
+ // These are the 32-bit-mode versions of this instruction for the asmparser.
+ // In 32-bit mode, the form with the address-size prefix is jcxz and the
+ // unprefixed form is jecxz.
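+ // (Illustrative: all three share opcode 0xE3 and differ only in the 0x67
+ // address-size prefix; in 64-bit mode the unprefixed form is jrcxz.)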
+ let Uses = [CX] in
+ def JCXZ : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst),
+ "jcxz\t$dst", [], IIC_JCXZ>, AdSize16,
+ Requires<[Not64BitMode]>;
+ let Uses = [ECX] in
+ def JECXZ : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst),
+ "jecxz\t$dst", [], IIC_JCXZ>, AdSize32;
+
+ let Uses = [RCX] in
+ def JRCXZ : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst),
+ "jrcxz\t$dst", [], IIC_JCXZ>, AdSize64,
+ Requires<[In64BitMode]>;
+}
+
+// Indirect branches
+let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
+ def JMP16r : I<0xFF, MRM4r, (outs), (ins GR16:$dst), "jmp{w}\t{*}$dst",
+ [(brind GR16:$dst)], IIC_JMP_REG>, Requires<[Not64BitMode]>,
+ OpSize16, Sched<[WriteJump]>;
+ def JMP16m : I<0xFF, MRM4m, (outs), (ins i16mem:$dst), "jmp{w}\t{*}$dst",
+ [(brind (loadi16 addr:$dst))], IIC_JMP_MEM>,
+ Requires<[Not64BitMode]>, OpSize16, Sched<[WriteJumpLd]>;
+
+ def JMP32r : I<0xFF, MRM4r, (outs), (ins GR32:$dst), "jmp{l}\t{*}$dst",
+ [(brind GR32:$dst)], IIC_JMP_REG>, Requires<[Not64BitMode]>,
+ OpSize32, Sched<[WriteJump]>;
+ def JMP32m : I<0xFF, MRM4m, (outs), (ins i32mem:$dst), "jmp{l}\t{*}$dst",
+ [(brind (loadi32 addr:$dst))], IIC_JMP_MEM>,
+ Requires<[Not64BitMode]>, OpSize32, Sched<[WriteJumpLd]>;
+
+ def JMP64r : I<0xFF, MRM4r, (outs), (ins GR64:$dst), "jmp{q}\t{*}$dst",
+ [(brind GR64:$dst)], IIC_JMP_REG>, Requires<[In64BitMode]>,
+ Sched<[WriteJump]>;
+ def JMP64m : I<0xFF, MRM4m, (outs), (ins i64mem:$dst), "jmp{q}\t{*}$dst",
+ [(brind (loadi64 addr:$dst))], IIC_JMP_MEM>,
+ Requires<[In64BitMode]>, Sched<[WriteJumpLd]>;
+
+ let Predicates = [Not64BitMode] in {
+ def FARJMP16i : Iseg16<0xEA, RawFrmImm16, (outs),
+ (ins i16imm:$off, i16imm:$seg),
+ "ljmp{w}\t$seg, $off", [],
+ IIC_JMP_FAR_PTR>, OpSize16, Sched<[WriteJump]>;
+ def FARJMP32i : Iseg32<0xEA, RawFrmImm16, (outs),
+ (ins i32imm:$off, i16imm:$seg),
+ "ljmp{l}\t$seg, $off", [],
+ IIC_JMP_FAR_PTR>, OpSize32, Sched<[WriteJump]>;
+ }
+ def FARJMP64 : RI<0xFF, MRM5m, (outs), (ins opaque80mem:$dst),
+ "ljmp{q}\t{*}$dst", [], IIC_JMP_FAR_MEM>,
+ Sched<[WriteJump]>;
+
+ def FARJMP16m : I<0xFF, MRM5m, (outs), (ins opaque32mem:$dst),
+ "ljmp{w}\t{*}$dst", [], IIC_JMP_FAR_MEM>, OpSize16,
+ Sched<[WriteJumpLd]>;
+ def FARJMP32m : I<0xFF, MRM5m, (outs), (ins opaque48mem:$dst),
+ "ljmp{l}\t{*}$dst", [], IIC_JMP_FAR_MEM>, OpSize32,
+ Sched<[WriteJumpLd]>;
+}
+
+
+// Loop instructions
+let SchedRW = [WriteJump] in {
+def LOOP : Ii8PCRel<0xE2, RawFrm, (outs), (ins brtarget8:$dst), "loop\t$dst", [], IIC_LOOP>;
+def LOOPE : Ii8PCRel<0xE1, RawFrm, (outs), (ins brtarget8:$dst), "loope\t$dst", [], IIC_LOOPE>;
+def LOOPNE : Ii8PCRel<0xE0, RawFrm, (outs), (ins brtarget8:$dst), "loopne\t$dst", [], IIC_LOOPNE>;
+}
+
+//===----------------------------------------------------------------------===//
+// Call Instructions...
+//
+let isCall = 1 in
+ // All calls clobber the non-callee saved registers. ESP is marked as
+ // a use to prevent stack-pointer assignments that appear immediately
+ // before calls from potentially appearing dead. Uses for argument
+ // registers are added manually.
+ let Uses = [ESP] in {
+ def CALLpcrel32 : Ii32PCRel<0xE8, RawFrm,
+ (outs), (ins i32imm_pcrel:$dst),
+ "call{l}\t$dst", [], IIC_CALL_RI>, OpSize32,
+ Requires<[Not64BitMode]>, Sched<[WriteJump]>;
+ let hasSideEffects = 0 in
+ def CALLpcrel16 : Ii16PCRel<0xE8, RawFrm,
+ (outs), (ins i16imm_pcrel:$dst),
+ "call{w}\t$dst", [], IIC_CALL_RI>, OpSize16,
+ Sched<[WriteJump]>;
+ def CALL16r : I<0xFF, MRM2r, (outs), (ins GR16:$dst),
+ "call{w}\t{*}$dst", [(X86call GR16:$dst)], IIC_CALL_RI>,
+ OpSize16, Requires<[Not64BitMode]>, Sched<[WriteJump]>;
+ def CALL16m : I<0xFF, MRM2m, (outs), (ins i16mem:$dst),
+ "call{w}\t{*}$dst", [(X86call (loadi16 addr:$dst))],
+ IIC_CALL_MEM>, OpSize16,
+ Requires<[Not64BitMode,FavorMemIndirectCall]>,
+ Sched<[WriteJumpLd]>;
+ def CALL32r : I<0xFF, MRM2r, (outs), (ins GR32:$dst),
+ "call{l}\t{*}$dst", [(X86call GR32:$dst)], IIC_CALL_RI>,
+ OpSize32, Requires<[Not64BitMode]>, Sched<[WriteJump]>;
+ def CALL32m : I<0xFF, MRM2m, (outs), (ins i32mem:$dst),
+ "call{l}\t{*}$dst", [(X86call (loadi32 addr:$dst))],
+ IIC_CALL_MEM>, OpSize32,
+ Requires<[Not64BitMode,FavorMemIndirectCall]>,
+ Sched<[WriteJumpLd]>;
+
+ let Predicates = [Not64BitMode] in {
+ def FARCALL16i : Iseg16<0x9A, RawFrmImm16, (outs),
+ (ins i16imm:$off, i16imm:$seg),
+ "lcall{w}\t$seg, $off", [],
+ IIC_CALL_FAR_PTR>, OpSize16, Sched<[WriteJump]>;
+ def FARCALL32i : Iseg32<0x9A, RawFrmImm16, (outs),
+ (ins i32imm:$off, i16imm:$seg),
+ "lcall{l}\t$seg, $off", [],
+ IIC_CALL_FAR_PTR>, OpSize32, Sched<[WriteJump]>;
+ }
+
+ def FARCALL16m : I<0xFF, MRM3m, (outs), (ins opaque32mem:$dst),
+ "lcall{w}\t{*}$dst", [], IIC_CALL_FAR_MEM>, OpSize16,
+ Sched<[WriteJumpLd]>;
+ def FARCALL32m : I<0xFF, MRM3m, (outs), (ins opaque48mem:$dst),
+ "lcall{l}\t{*}$dst", [], IIC_CALL_FAR_MEM>, OpSize32,
+ Sched<[WriteJumpLd]>;
+ }
+
+
+// Tail call stuff.
+
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
+ isCodeGenOnly = 1, SchedRW = [WriteJumpLd] in
+ let Uses = [ESP] in {
+ def TCRETURNdi : PseudoI<(outs),
+ (ins i32imm_pcrel:$dst, i32imm:$offset), []>;
+ def TCRETURNri : PseudoI<(outs),
+ (ins ptr_rc_tailcall:$dst, i32imm:$offset), []>;
+ let mayLoad = 1 in
+ def TCRETURNmi : PseudoI<(outs),
+ (ins i32mem_TC:$dst, i32imm:$offset), []>;
+
+ // FIXME: These should be pseudo instructions that are lowered when going to
+ // MCInst.
+ def TAILJMPd : Ii32PCRel<0xE9, RawFrm, (outs),
+ (ins i32imm_pcrel:$dst),
+ "jmp\t$dst",
+ [], IIC_JMP_REL>;
+ def TAILJMPr : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst),
+ "", [], IIC_JMP_REG>; // FIXME: Remove encoding when JIT is dead.
+ let mayLoad = 1 in
+ def TAILJMPm : I<0xFF, MRM4m, (outs), (ins i32mem_TC:$dst),
+ "jmp{l}\t{*}$dst", [], IIC_JMP_MEM>;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Call Instructions (64-bit)...
+//
+
+// RSP is marked as a use to prevent stack-pointer assignments that appear
+// immediately before calls from potentially appearing dead. Uses for argument
+// registers are added manually.
+let isCall = 1, Uses = [RSP], SchedRW = [WriteJump] in {
+ // NOTE: this pattern doesn't match "X86call imm", because we do not know
+ // that the offset between an arbitrary immediate and the call will fit in
+ // the 32-bit pcrel field that we have.
+ def CALL64pcrel32 : Ii32PCRel<0xE8, RawFrm,
+ (outs), (ins i64i32imm_pcrel:$dst),
+ "call{q}\t$dst", [], IIC_CALL_RI>, OpSize32,
+ Requires<[In64BitMode]>;
+ def CALL64r : I<0xFF, MRM2r, (outs), (ins GR64:$dst),
+ "call{q}\t{*}$dst", [(X86call GR64:$dst)],
+ IIC_CALL_RI>,
+ Requires<[In64BitMode]>;
+ def CALL64m : I<0xFF, MRM2m, (outs), (ins i64mem:$dst),
+ "call{q}\t{*}$dst", [(X86call (loadi64 addr:$dst))],
+ IIC_CALL_MEM>,
+ Requires<[In64BitMode,FavorMemIndirectCall]>;
+
+ def FARCALL64 : RI<0xFF, MRM3m, (outs), (ins opaque80mem:$dst),
+ "lcall{q}\t{*}$dst", [], IIC_CALL_FAR_MEM>;
+}
+
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
+ isCodeGenOnly = 1, Uses = [RSP], usesCustomInserter = 1,
+ SchedRW = [WriteJump] in {
+ def TCRETURNdi64 : PseudoI<(outs),
+ (ins i64i32imm_pcrel:$dst, i32imm:$offset),
+ []>;
+ def TCRETURNri64 : PseudoI<(outs),
+ (ins ptr_rc_tailcall:$dst, i32imm:$offset), []>;
+ let mayLoad = 1 in
+ def TCRETURNmi64 : PseudoI<(outs),
+ (ins i64mem_TC:$dst, i32imm:$offset), []>;
+
+ def TAILJMPd64 : Ii32PCRel<0xE9, RawFrm, (outs), (ins i64i32imm_pcrel:$dst),
+ "jmp\t$dst", [], IIC_JMP_REL>;
+ def TAILJMPr64 : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst),
+ "jmp{q}\t{*}$dst", [], IIC_JMP_MEM>;
+
+ let mayLoad = 1 in
+ def TAILJMPm64 : I<0xFF, MRM4m, (outs), (ins i64mem_TC:$dst),
+ "jmp{q}\t{*}$dst", [], IIC_JMP_MEM>;
+
+ // Win64 wants jumps leaving the function to have a REX_W prefix.
+ let hasREX_WPrefix = 1 in {
+ def TAILJMPd64_REX : Ii32PCRel<0xE9, RawFrm, (outs),
+ (ins i64i32imm_pcrel:$dst),
+ "rex64 jmp\t$dst", [], IIC_JMP_REL>;
+ def TAILJMPr64_REX : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst),
+ "rex64 jmp{q}\t{*}$dst", [], IIC_JMP_MEM>;
+
+ let mayLoad = 1 in
+ def TAILJMPm64_REX : I<0xFF, MRM4m, (outs), (ins i64mem_TC:$dst),
+ "rex64 jmp{q}\t{*}$dst", [], IIC_JMP_MEM>;
+ }
+}
diff --git a/contrib/llvm/lib/Target/X86/X86InstrExtension.td b/contrib/llvm/lib/Target/X86/X86InstrExtension.td
new file mode 100644
index 0000000..c4b2d6d
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrExtension.td
@@ -0,0 +1,182 @@
+//===-- X86InstrExtension.td - Sign and Zero Extensions ----*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the sign and zero extension operations.
+//
+//===----------------------------------------------------------------------===//
+
+let hasSideEffects = 0 in {
+ let Defs = [AX], Uses = [AL] in
+ def CBW : I<0x98, RawFrm, (outs), (ins),
+ "{cbtw|cbw}", [], IIC_CBW>, OpSize16; // AX = signext(AL)
+ let Defs = [EAX], Uses = [AX] in
+ def CWDE : I<0x98, RawFrm, (outs), (ins),
+ "{cwtl|cwde}", [], IIC_CBW>, OpSize32; // EAX = signext(AX)
+
+ let Defs = [AX,DX], Uses = [AX] in
+ def CWD : I<0x99, RawFrm, (outs), (ins),
+ "{cwtd|cwd}", [], IIC_CBW>, OpSize16; // DX:AX = signext(AX)
+ let Defs = [EAX,EDX], Uses = [EAX] in
+ def CDQ : I<0x99, RawFrm, (outs), (ins),
+ "{cltd|cdq}", [], IIC_CBW>, OpSize32; // EDX:EAX = signext(EAX)
+
+
+ let Defs = [RAX], Uses = [EAX] in
+ def CDQE : RI<0x98, RawFrm, (outs), (ins),
+ "{cltq|cdqe}", [], IIC_CBW>; // RAX = signext(EAX)
+
+ let Defs = [RAX,RDX], Uses = [RAX] in
+ def CQO : RI<0x99, RawFrm, (outs), (ins),
+ "{cqto|cqo}", [], IIC_CBW>; // RDX:RAX = signext(RAX)
+}
+
+
+
+// Sign/Zero extenders
+let hasSideEffects = 0 in {
+def MOVSX16rr8 : I<0xBE, MRMSrcReg, (outs GR16:$dst), (ins GR8:$src),
+ "movs{bw|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVSX_R16_R8>,
+ TB, OpSize16, Sched<[WriteALU]>;
+let mayLoad = 1 in
+def MOVSX16rm8 : I<0xBE, MRMSrcMem, (outs GR16:$dst), (ins i8mem:$src),
+ "movs{bw|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVSX_R16_M8>,
+ TB, OpSize16, Sched<[WriteALULd]>;
+} // hasSideEffects = 0
+def MOVSX32rr8 : I<0xBE, MRMSrcReg, (outs GR32:$dst), (ins GR8:$src),
+ "movs{bl|x}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (sext GR8:$src))], IIC_MOVSX>, TB,
+ OpSize32, Sched<[WriteALU]>;
+def MOVSX32rm8 : I<0xBE, MRMSrcMem, (outs GR32:$dst), (ins i8mem :$src),
+ "movs{bl|x}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (sextloadi32i8 addr:$src))], IIC_MOVSX>, TB,
+ OpSize32, Sched<[WriteALULd]>;
+def MOVSX32rr16: I<0xBF, MRMSrcReg, (outs GR32:$dst), (ins GR16:$src),
+ "movs{wl|x}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (sext GR16:$src))], IIC_MOVSX>, TB,
+ OpSize32, Sched<[WriteALU]>;
+def MOVSX32rm16: I<0xBF, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src),
+ "movs{wl|x}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (sextloadi32i16 addr:$src))], IIC_MOVSX>,
+ OpSize32, TB, Sched<[WriteALULd]>;
+
+let hasSideEffects = 0 in {
+def MOVZX16rr8 : I<0xB6, MRMSrcReg, (outs GR16:$dst), (ins GR8:$src),
+ "movz{bw|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX_R16_R8>,
+ TB, OpSize16, Sched<[WriteALU]>;
+let mayLoad = 1 in
+def MOVZX16rm8 : I<0xB6, MRMSrcMem, (outs GR16:$dst), (ins i8mem:$src),
+ "movz{bw|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX_R16_M8>,
+ TB, OpSize16, Sched<[WriteALULd]>;
+} // hasSideEffects = 0
+def MOVZX32rr8 : I<0xB6, MRMSrcReg, (outs GR32:$dst), (ins GR8 :$src),
+ "movz{bl|x}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (zext GR8:$src))], IIC_MOVZX>, TB,
+ OpSize32, Sched<[WriteALU]>;
+def MOVZX32rm8 : I<0xB6, MRMSrcMem, (outs GR32:$dst), (ins i8mem :$src),
+ "movz{bl|x}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (zextloadi32i8 addr:$src))], IIC_MOVZX>, TB,
+ OpSize32, Sched<[WriteALULd]>;
+def MOVZX32rr16: I<0xB7, MRMSrcReg, (outs GR32:$dst), (ins GR16:$src),
+ "movz{wl|x}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (zext GR16:$src))], IIC_MOVZX>, TB,
+ OpSize32, Sched<[WriteALU]>;
+def MOVZX32rm16: I<0xB7, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src),
+ "movz{wl|x}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (zextloadi32i16 addr:$src))], IIC_MOVZX>,
+ TB, OpSize32, Sched<[WriteALULd]>;
+
+// These are the same as the regular MOVZX32rr8 and MOVZX32rm8
+// except that they use GR32_NOREX for the output operand register class
+// instead of GR32. This allows them to operate on h registers on x86-64.
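+// For example (illustrative): there is no encoding for "movzbl %ah, %r8d";
+// AH is addressable only in instructions without a REX prefix, and
+// GR32_NOREX keeps the destination within the non-REX registers.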
+let hasSideEffects = 0, isCodeGenOnly = 1 in {
+def MOVZX32_NOREXrr8 : I<0xB6, MRMSrcReg,
+ (outs GR32_NOREX:$dst), (ins GR8_NOREX:$src),
+ "movz{bl|x}\t{$src, $dst|$dst, $src} # NOREX",
+ [], IIC_MOVZX>, TB, Sched<[WriteALU]>;
+let mayLoad = 1 in
+def MOVZX32_NOREXrm8 : I<0xB6, MRMSrcMem,
+ (outs GR32_NOREX:$dst), (ins i8mem_NOREX:$src),
+ "movz{bl|x}\t{$src, $dst|$dst, $src} # NOREX",
+ [], IIC_MOVZX>, TB, Sched<[WriteALULd]>;
+
+def MOVSX32_NOREXrr8 : I<0xBE, MRMSrcReg,
+ (outs GR32_NOREX:$dst), (ins GR8_NOREX:$src),
+ "movs{bl|x}\t{$src, $dst|$dst, $src} # NOREX",
+ [], IIC_MOVSX>, TB, Sched<[WriteALU]>;
+let mayLoad = 1 in
+def MOVSX32_NOREXrm8 : I<0xBE, MRMSrcMem,
+ (outs GR32_NOREX:$dst), (ins i8mem_NOREX:$src),
+ "movs{bl|x}\t{$src, $dst|$dst, $src} # NOREX",
+ [], IIC_MOVSX>, TB, Sched<[WriteALULd]>;
+}
+
+// MOVSX64rr8 always has a REX prefix and it has an 8-bit register
+// operand, which makes it a rare instruction with an 8-bit register
+// operand that can never access an h register. If support for h registers
+// were generalized, this would require a special register class.
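+// For example (illustrative): with a REX prefix, 8-bit register encodings
+// 4-7 select SPL/BPL/SIL/DIL rather than AH/CH/DH/BH, so MOVSX64rr8 can
+// name SIL but never AH.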
+def MOVSX64rr8 : RI<0xBE, MRMSrcReg, (outs GR64:$dst), (ins GR8 :$src),
+ "movs{bq|x}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (sext GR8:$src))], IIC_MOVSX>, TB,
+ Sched<[WriteALU]>;
+def MOVSX64rm8 : RI<0xBE, MRMSrcMem, (outs GR64:$dst), (ins i8mem :$src),
+ "movs{bq|x}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (sextloadi64i8 addr:$src))], IIC_MOVSX>,
+ TB, Sched<[WriteALULd]>;
+def MOVSX64rr16: RI<0xBF, MRMSrcReg, (outs GR64:$dst), (ins GR16:$src),
+ "movs{wq|x}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (sext GR16:$src))], IIC_MOVSX>, TB,
+ Sched<[WriteALU]>;
+def MOVSX64rm16: RI<0xBF, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src),
+ "movs{wq|x}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (sextloadi64i16 addr:$src))], IIC_MOVSX>,
+ TB, Sched<[WriteALULd]>;
+def MOVSX64rr32: RI<0x63, MRMSrcReg, (outs GR64:$dst), (ins GR32:$src),
+ "movs{lq|xd}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (sext GR32:$src))], IIC_MOVSX>,
+ Sched<[WriteALU]>, Requires<[In64BitMode]>;
+def MOVSX64rm32: RI<0x63, MRMSrcMem, (outs GR64:$dst), (ins i32mem:$src),
+ "movs{lq|xd}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (sextloadi64i32 addr:$src))], IIC_MOVSX>,
+ Sched<[WriteALULd]>, Requires<[In64BitMode]>;
+
+// movzbq and movzwq encodings for the disassembler
+def MOVZX64rr8_Q : RI<0xB6, MRMSrcReg, (outs GR64:$dst), (ins GR8:$src),
+ "movz{bq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>,
+ TB, Sched<[WriteALU]>;
+def MOVZX64rm8_Q : RI<0xB6, MRMSrcMem, (outs GR64:$dst), (ins i8mem:$src),
+ "movz{bq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>,
+ TB, Sched<[WriteALULd]>;
+def MOVZX64rr16_Q : RI<0xB7, MRMSrcReg, (outs GR64:$dst), (ins GR16:$src),
+ "movz{wq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>,
+ TB, Sched<[WriteALU]>;
+def MOVZX64rm16_Q : RI<0xB7, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src),
+ "movz{wq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>,
+ TB, Sched<[WriteALULd]>;
+
+// 64-bit zero-extension patterns use SUBREG_TO_REG and an operation writing a
+// 32-bit register.
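+// For example (illustrative): "movzbl %cl, %eax" also clears bits 63:32 of
+// RAX, so the 32-bit MOVZX plus SUBREG_TO_REG performs the full i64
+// zero-extension with no extra instruction.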
+def : Pat<(i64 (zext GR8:$src)),
+ (SUBREG_TO_REG (i64 0), (MOVZX32rr8 GR8:$src), sub_32bit)>;
+def : Pat<(zextloadi64i8 addr:$src),
+ (SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>;
+
+def : Pat<(i64 (zext GR16:$src)),
+ (SUBREG_TO_REG (i64 0), (MOVZX32rr16 GR16:$src), sub_32bit)>;
+def : Pat<(zextloadi64i16 addr:$src),
+ (SUBREG_TO_REG (i64 0), (MOVZX32rm16 addr:$src), sub_32bit)>;
+
+// The preferred way to do 32-bit-to-64-bit zero extension on x86-64 is to use a
+// SUBREG_TO_REG to utilize implicit zero-extension; however, this isn't possible
+// when the 32-bit value is defined by a truncate or is copied from something
+// where the high bits aren't necessarily all zero. In such cases, we fall back
+// to these explicit zext instructions.
+def : Pat<(i64 (zext GR32:$src)),
+ (SUBREG_TO_REG (i64 0), (MOV32rr GR32:$src), sub_32bit)>;
+def : Pat<(i64 (zextloadi64i32 addr:$src)),
+ (SUBREG_TO_REG (i64 0), (MOV32rm addr:$src), sub_32bit)>;
diff --git a/contrib/llvm/lib/Target/X86/X86InstrFMA.td b/contrib/llvm/lib/Target/X86/X86InstrFMA.td
new file mode 100644
index 0000000..fd800cf
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrFMA.td
@@ -0,0 +1,441 @@
+//===-- X86InstrFMA.td - FMA Instruction Set ---------------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes FMA (Fused Multiply-Add) instructions.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// FMA3 - Intel 3 operand Fused Multiply-Add instructions
+//===----------------------------------------------------------------------===//
+
+// For all FMA opcodes declared in the fma3p_rm and fma3s_rm multiclasses
+// defined below, both the register and memory variants are commutable.
+// For the register form the commutable operands are 1, 2 and 3.
+// For the memory variant the folded operand must be operand 3; thus,
+// in that case, only operands 1 and 2 can be swapped.
+// Commuting some of the operands may require an opcode change.
+// FMA*213*:
+// operands 1 and 2 (memory & register forms): *213* --> *213*(no changes);
+// operands 1 and 3 (register forms only): *213* --> *231*;
+// operands 2 and 3 (register forms only): *213* --> *132*.
+// FMA*132*:
+// operands 1 and 2 (memory & register forms): *132* --> *231*;
+// operands 1 and 3 (register forms only): *132* --> *132*(no changes);
+// operands 2 and 3 (register forms only): *132* --> *213*.
+// FMA*231*:
+// operands 1 and 2 (memory & register forms): *231* --> *132*;
+// operands 1 and 3 (register forms only): *231* --> *213*;
+// operands 2 and 3 (register forms only): *231* --> *231*(no changes).
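+// For example (illustrative), swapping operands 1 and 3 of a 213 form:
+//   FMA*213* reg1, reg2, reg3; // reg2 * reg1 + reg3
+//   -->
+//   FMA*231* reg3, reg2, reg1; // reg2 * reg1 + reg3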
+
+let Constraints = "$src1 = $dst", hasSideEffects = 0, isCommutable = 1 in
+multiclass fma3p_rm<bits<8> opc, string OpcodeStr,
+ PatFrag MemFrag128, PatFrag MemFrag256,
+ ValueType OpVT128, ValueType OpVT256,
+ SDPatternOperator Op = null_frag> {
+ let usesCustomInserter = 1 in
+ def r : FMA3<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, VR128:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR128:$dst, (OpVT128 (Op VR128:$src2,
+ VR128:$src1, VR128:$src3)))]>;
+
+ let mayLoad = 1 in
+ def m : FMA3<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, f128mem:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR128:$dst, (OpVT128 (Op VR128:$src2, VR128:$src1,
+ (MemFrag128 addr:$src3))))]>;
+
+ let usesCustomInserter = 1 in
+ def rY : FMA3<opc, MRMSrcReg, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2, VR256:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR256:$dst, (OpVT256 (Op VR256:$src2, VR256:$src1,
+ VR256:$src3)))]>, VEX_L;
+
+ let mayLoad = 1 in
+ def mY : FMA3<opc, MRMSrcMem, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2, f256mem:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR256:$dst,
+ (OpVT256 (Op VR256:$src2, VR256:$src1,
+ (MemFrag256 addr:$src3))))]>, VEX_L;
+}
+
+multiclass fma3p_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
+ string OpcodeStr, string PackTy,
+ PatFrag MemFrag128, PatFrag MemFrag256,
+ SDNode Op, ValueType OpTy128, ValueType OpTy256> {
+ defm r213 : fma3p_rm<opc213,
+ !strconcat(OpcodeStr, "213", PackTy),
+ MemFrag128, MemFrag256, OpTy128, OpTy256, Op>;
+ defm r132 : fma3p_rm<opc132,
+ !strconcat(OpcodeStr, "132", PackTy),
+ MemFrag128, MemFrag256, OpTy128, OpTy256>;
+ defm r231 : fma3p_rm<opc231,
+ !strconcat(OpcodeStr, "231", PackTy),
+ MemFrag128, MemFrag256, OpTy128, OpTy256>;
+}
+
+// Fused Multiply-Add
+let ExeDomain = SSEPackedSingle in {
+ defm VFMADDPS : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "ps", loadv4f32,
+ loadv8f32, X86Fmadd, v4f32, v8f32>;
+ defm VFMSUBPS : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "ps", loadv4f32,
+ loadv8f32, X86Fmsub, v4f32, v8f32>;
+ defm VFMADDSUBPS : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "ps",
+ loadv4f32, loadv8f32, X86Fmaddsub,
+ v4f32, v8f32>;
+ defm VFMSUBADDPS : fma3p_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "ps",
+ loadv4f32, loadv8f32, X86Fmsubadd,
+ v4f32, v8f32>;
+}
+
+let ExeDomain = SSEPackedDouble in {
+ defm VFMADDPD : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "pd", loadv2f64,
+ loadv4f64, X86Fmadd, v2f64, v4f64>, VEX_W;
+ defm VFMSUBPD : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "pd", loadv2f64,
+ loadv4f64, X86Fmsub, v2f64, v4f64>, VEX_W;
+ defm VFMADDSUBPD : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "pd",
+ loadv2f64, loadv4f64, X86Fmaddsub,
+ v2f64, v4f64>, VEX_W;
+ defm VFMSUBADDPD : fma3p_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "pd",
+ loadv2f64, loadv4f64, X86Fmsubadd,
+ v2f64, v4f64>, VEX_W;
+}
+
+// Fused Negative Multiply-Add
+let ExeDomain = SSEPackedSingle in {
+ defm VFNMADDPS : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "ps", loadv4f32,
+ loadv8f32, X86Fnmadd, v4f32, v8f32>;
+ defm VFNMSUBPS : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "ps", loadv4f32,
+ loadv8f32, X86Fnmsub, v4f32, v8f32>;
+}
+let ExeDomain = SSEPackedDouble in {
+ defm VFNMADDPD : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "pd", loadv2f64,
+ loadv4f64, X86Fnmadd, v2f64, v4f64>, VEX_W;
+ defm VFNMSUBPD : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "pd",
+ loadv2f64, loadv4f64, X86Fnmsub, v2f64,
+ v4f64>, VEX_W;
+}
+
+// All source register operands of FMA opcodes defined in the fma3s_rm
+// multiclass can be commuted. In many cases such a commute transformation
+// requires an opcode adjustment; for example, commuting operands 1 and 2 of
+// the FMA*132 form
+// would require an opcode change to FMA*231:
+// FMA*132* reg1, reg2, reg3; // reg1 * reg3 + reg2;
+// -->
+// FMA*231* reg2, reg1, reg3; // reg1 * reg3 + reg2;
+// Please see more detailed comment at the very beginning of the section
+// defining FMA3 opcodes above.
+let Constraints = "$src1 = $dst", isCommutable = 1, hasSideEffects = 0 in
+multiclass fma3s_rm<bits<8> opc, string OpcodeStr,
+ X86MemOperand x86memop, RegisterClass RC,
+ SDPatternOperator OpNode = null_frag> {
+ let usesCustomInserter = 1 in
+ def r : FMA3<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, RC:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set RC:$dst, (OpNode RC:$src2, RC:$src1, RC:$src3))]>;
+
+ let mayLoad = 1 in
+ def m : FMA3<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, x86memop:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set RC:$dst,
+ (OpNode RC:$src2, RC:$src1, (load addr:$src3)))]>;
+}
+
+// These FMA*_Int instructions are defined specially for use when the scalar
+// FMA intrinsics are lowered to machine instructions; in that sense, they
+// are similar to the existing ADD*_Int, SUB*_Int, MUL*_Int, etc.
+// instructions.
+//
+// All of the FMA*_Int opcodes are defined as commutable here.
+// Commuting the 2nd and 3rd source register operands of FMAs is quite trivial
+// and the corresponding optimizations have been developed.
+// Commuting the 1st operand of FMA*_Int requires some additional analysis:
+// the commute optimization is legal only if all users of FMA*_Int use only
+// the lowest element of the FMA*_Int instruction. Even though such analysis
+// may not be implemented yet, we allow the routines doing the actual commute
+// transformation to decide whether a particular instruction is commutable.
+let Constraints = "$src1 = $dst", isCommutable = 1, isCodeGenOnly = 1,
+ hasSideEffects = 0 in
+multiclass fma3s_rm_int<bits<8> opc, string OpcodeStr,
+ Operand memopr, RegisterClass RC> {
+ def r_Int : FMA3<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, RC:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ []>;
+
+ let mayLoad = 1 in
+ def m_Int : FMA3<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, memopr:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ []>;
+}
+
+multiclass fma3s_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
+ string OpStr, string PackTy,
+ SDNode OpNode, RegisterClass RC,
+ X86MemOperand x86memop> {
+ defm r132 : fma3s_rm<opc132, !strconcat(OpStr, "132", PackTy), x86memop, RC>;
+ defm r213 : fma3s_rm<opc213, !strconcat(OpStr, "213", PackTy), x86memop, RC,
+ OpNode>;
+ defm r231 : fma3s_rm<opc231, !strconcat(OpStr, "231", PackTy), x86memop, RC>;
+}
+
+// The FMA 213 form is created for lowering of scalar FMA intrinsics
+// to machine instructions.
+// The FMA 132 form can be obtained trivially by commuting the 2nd and 3rd
+// operands of the FMA 213 form.
+// The FMA 231 form can be obtained only by commuting the 1st operand of the
+// 213 or 132 forms, which is legal only after special analysis of all uses
+// of the initial instruction. Such analysis does not exist yet, so the 231
+// form of the FMA*_Int instructions is introduced under the optimistic
+// assumption that the analysis will be implemented eventually.
+multiclass fma3s_int_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
+ string OpStr, string PackTy,
+ RegisterClass RC, Operand memop> {
+ defm r132 : fma3s_rm_int<opc132, !strconcat(OpStr, "132", PackTy),
+ memop, RC>;
+ defm r213 : fma3s_rm_int<opc213, !strconcat(OpStr, "213", PackTy),
+ memop, RC>;
+ defm r231 : fma3s_rm_int<opc231, !strconcat(OpStr, "231", PackTy),
+ memop, RC>;
+}
+
+multiclass fma3s<bits<8> opc132, bits<8> opc213, bits<8> opc231,
+ string OpStr, Intrinsic IntF32, Intrinsic IntF64,
+ SDNode OpNode> {
+ let ExeDomain = SSEPackedSingle in
+ defm SS : fma3s_forms<opc132, opc213, opc231, OpStr, "ss", OpNode,
+ FR32, f32mem>,
+ fma3s_int_forms<opc132, opc213, opc231, OpStr, "ss", VR128, ssmem>;
+
+ let ExeDomain = SSEPackedDouble in
+ defm SD : fma3s_forms<opc132, opc213, opc231, OpStr, "sd", OpNode,
+ FR64, f64mem>,
+ fma3s_int_forms<opc132, opc213, opc231, OpStr, "sd", VR128, sdmem>,
+ VEX_W;
+
+ // These patterns use the 123 ordering, instead of 213, even though
+ // they match the intrinsic to the 213 version of the instruction.
+ // This is because src1 is tied to dest, and the scalar intrinsics
+ // require the pass-through values to come from the first source
+ // operand, not the second.
+ def : Pat<(IntF32 VR128:$src1, VR128:$src2, VR128:$src3),
+ (COPY_TO_REGCLASS(!cast<Instruction>(NAME#"SSr213r_Int")
+ $src1, $src2, $src3), VR128)>;
+
+ def : Pat<(IntF64 VR128:$src1, VR128:$src2, VR128:$src3),
+ (COPY_TO_REGCLASS(!cast<Instruction>(NAME#"SDr213r_Int")
+ $src1, $src2, $src3), VR128)>;
+}
+
+defm VFMADD : fma3s<0x99, 0xA9, 0xB9, "vfmadd", int_x86_fma_vfmadd_ss,
+ int_x86_fma_vfmadd_sd, X86Fmadd>, VEX_LIG;
+defm VFMSUB : fma3s<0x9B, 0xAB, 0xBB, "vfmsub", int_x86_fma_vfmsub_ss,
+ int_x86_fma_vfmsub_sd, X86Fmsub>, VEX_LIG;
+
+defm VFNMADD : fma3s<0x9D, 0xAD, 0xBD, "vfnmadd", int_x86_fma_vfnmadd_ss,
+ int_x86_fma_vfnmadd_sd, X86Fnmadd>, VEX_LIG;
+defm VFNMSUB : fma3s<0x9F, 0xAF, 0xBF, "vfnmsub", int_x86_fma_vfnmsub_ss,
+ int_x86_fma_vfnmsub_sd, X86Fnmsub>, VEX_LIG;
+
+
+//===----------------------------------------------------------------------===//
+// FMA4 - AMD 4 operand Fused Multiply-Add instructions
+//===----------------------------------------------------------------------===//
+
+
+multiclass fma4s<bits<8> opc, string OpcodeStr, RegisterClass RC,
+ X86MemOperand x86memop, ValueType OpVT, SDNode OpNode,
+ PatFrag mem_frag> {
+ let isCommutable = 1 in
+ def rr : FMA4<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, RC:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set RC:$dst,
+ (OpVT (OpNode RC:$src1, RC:$src2, RC:$src3)))]>, VEX_W, VEX_LIG, MemOp4;
+ def rm : FMA4<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, x86memop:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set RC:$dst, (OpNode RC:$src1, RC:$src2,
+ (mem_frag addr:$src3)))]>, VEX_W, VEX_LIG, MemOp4;
+ def mr : FMA4<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop:$src2, RC:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set RC:$dst,
+ (OpNode RC:$src1, (mem_frag addr:$src2), RC:$src3))]>, VEX_LIG;
+// For disassembler
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
+ def rr_REV : FMA4<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, RC:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>,
+ VEX_LIG;
+}
+
+multiclass fma4s_int<bits<8> opc, string OpcodeStr, Operand memop,
+ ComplexPattern mem_cpat, Intrinsic Int> {
+let isCodeGenOnly = 1 in {
+ let isCommutable = 1 in
+ def rr_Int : FMA4<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, VR128:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR128:$dst,
+ (Int VR128:$src1, VR128:$src2, VR128:$src3))]>, VEX_W, VEX_LIG, MemOp4;
+ def rm_Int : FMA4<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, memop:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR128:$dst, (Int VR128:$src1, VR128:$src2,
+ mem_cpat:$src3))]>, VEX_W, VEX_LIG, MemOp4;
+ def mr_Int : FMA4<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, memop:$src2, VR128:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR128:$dst,
+ (Int VR128:$src1, mem_cpat:$src2, VR128:$src3))]>, VEX_LIG;
+} // isCodeGenOnly = 1
+}
+
+multiclass fma4p<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ ValueType OpVT128, ValueType OpVT256,
+ PatFrag ld_frag128, PatFrag ld_frag256> {
+ let isCommutable = 1 in
+ def rr : FMA4<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, VR128:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR128:$dst,
+ (OpVT128 (OpNode VR128:$src1, VR128:$src2, VR128:$src3)))]>,
+ VEX_W, MemOp4;
+ def rm : FMA4<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, f128mem:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR128:$dst, (OpNode VR128:$src1, VR128:$src2,
+ (ld_frag128 addr:$src3)))]>, VEX_W, MemOp4;
+ def mr : FMA4<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, f128mem:$src2, VR128:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR128:$dst,
+ (OpNode VR128:$src1, (ld_frag128 addr:$src2), VR128:$src3))]>;
+ let isCommutable = 1 in
+ def rrY : FMA4<opc, MRMSrcReg, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2, VR256:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR256:$dst,
+ (OpVT256 (OpNode VR256:$src1, VR256:$src2, VR256:$src3)))]>,
+ VEX_W, MemOp4, VEX_L;
+ def rmY : FMA4<opc, MRMSrcMem, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2, f256mem:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR256:$dst, (OpNode VR256:$src1, VR256:$src2,
+ (ld_frag256 addr:$src3)))]>, VEX_W, MemOp4, VEX_L;
+ def mrY : FMA4<opc, MRMSrcMem, (outs VR256:$dst),
+ (ins VR256:$src1, f256mem:$src2, VR256:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR256:$dst, (OpNode VR256:$src1,
+ (ld_frag256 addr:$src2), VR256:$src3))]>, VEX_L;
+// For disassembler
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
+ def rr_REV : FMA4<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, VR128:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>;
+ def rrY_REV : FMA4<opc, MRMSrcReg, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2, VR256:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>,
+ VEX_L;
+} // isCodeGenOnly = 1
+}
+
+let ExeDomain = SSEPackedSingle in {
+ // Scalar Instructions
+ defm VFMADDSS4 : fma4s<0x6A, "vfmaddss", FR32, f32mem, f32, X86Fmadd, loadf32>,
+ fma4s_int<0x6A, "vfmaddss", ssmem, sse_load_f32,
+ int_x86_fma_vfmadd_ss>;
+ defm VFMSUBSS4 : fma4s<0x6E, "vfmsubss", FR32, f32mem, f32, X86Fmsub, loadf32>,
+ fma4s_int<0x6E, "vfmsubss", ssmem, sse_load_f32,
+ int_x86_fma_vfmsub_ss>;
+ defm VFNMADDSS4 : fma4s<0x7A, "vfnmaddss", FR32, f32mem, f32,
+ X86Fnmadd, loadf32>,
+ fma4s_int<0x7A, "vfnmaddss", ssmem, sse_load_f32,
+ int_x86_fma_vfnmadd_ss>;
+ defm VFNMSUBSS4 : fma4s<0x7E, "vfnmsubss", FR32, f32mem, f32,
+ X86Fnmsub, loadf32>,
+ fma4s_int<0x7E, "vfnmsubss", ssmem, sse_load_f32,
+ int_x86_fma_vfnmsub_ss>;
+ // Packed Instructions
+ defm VFMADDPS4 : fma4p<0x68, "vfmaddps", X86Fmadd, v4f32, v8f32,
+ loadv4f32, loadv8f32>;
+ defm VFMSUBPS4 : fma4p<0x6C, "vfmsubps", X86Fmsub, v4f32, v8f32,
+ loadv4f32, loadv8f32>;
+ defm VFNMADDPS4 : fma4p<0x78, "vfnmaddps", X86Fnmadd, v4f32, v8f32,
+ loadv4f32, loadv8f32>;
+ defm VFNMSUBPS4 : fma4p<0x7C, "vfnmsubps", X86Fnmsub, v4f32, v8f32,
+ loadv4f32, loadv8f32>;
+ defm VFMADDSUBPS4 : fma4p<0x5C, "vfmaddsubps", X86Fmaddsub, v4f32, v8f32,
+ loadv4f32, loadv8f32>;
+ defm VFMSUBADDPS4 : fma4p<0x5E, "vfmsubaddps", X86Fmsubadd, v4f32, v8f32,
+ loadv4f32, loadv8f32>;
+}
+
+let ExeDomain = SSEPackedDouble in {
+ // Scalar Instructions
+ defm VFMADDSD4 : fma4s<0x6B, "vfmaddsd", FR64, f64mem, f64, X86Fmadd, loadf64>,
+ fma4s_int<0x6B, "vfmaddsd", sdmem, sse_load_f64,
+ int_x86_fma_vfmadd_sd>;
+ defm VFMSUBSD4 : fma4s<0x6F, "vfmsubsd", FR64, f64mem, f64, X86Fmsub, loadf64>,
+ fma4s_int<0x6F, "vfmsubsd", sdmem, sse_load_f64,
+ int_x86_fma_vfmsub_sd>;
+ defm VFNMADDSD4 : fma4s<0x7B, "vfnmaddsd", FR64, f64mem, f64,
+ X86Fnmadd, loadf64>,
+ fma4s_int<0x7B, "vfnmaddsd", sdmem, sse_load_f64,
+ int_x86_fma_vfnmadd_sd>;
+ defm VFNMSUBSD4 : fma4s<0x7F, "vfnmsubsd", FR64, f64mem, f64,
+ X86Fnmsub, loadf64>,
+ fma4s_int<0x7F, "vfnmsubsd", sdmem, sse_load_f64,
+ int_x86_fma_vfnmsub_sd>;
+ // Packed Instructions
+ defm VFMADDPD4 : fma4p<0x69, "vfmaddpd", X86Fmadd, v2f64, v4f64,
+ loadv2f64, loadv4f64>;
+ defm VFMSUBPD4 : fma4p<0x6D, "vfmsubpd", X86Fmsub, v2f64, v4f64,
+ loadv2f64, loadv4f64>;
+ defm VFNMADDPD4 : fma4p<0x79, "vfnmaddpd", X86Fnmadd, v2f64, v4f64,
+ loadv2f64, loadv4f64>;
+ defm VFNMSUBPD4 : fma4p<0x7D, "vfnmsubpd", X86Fnmsub, v2f64, v4f64,
+ loadv2f64, loadv4f64>;
+ defm VFMADDSUBPD4 : fma4p<0x5D, "vfmaddsubpd", X86Fmaddsub, v2f64, v4f64,
+ loadv2f64, loadv4f64>;
+ defm VFMSUBADDPD4 : fma4p<0x5F, "vfmsubaddpd", X86Fmsubadd, v2f64, v4f64,
+ loadv2f64, loadv4f64>;
+}
+
diff --git a/contrib/llvm/lib/Target/X86/X86InstrFPStack.td b/contrib/llvm/lib/Target/X86/X86InstrFPStack.td
new file mode 100644
index 0000000..03ae211
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrFPStack.td
@@ -0,0 +1,729 @@
+//===- X86InstrFPStack.td - FPU Instruction Set ------------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 x87 FPU instruction set, defining the
+// instructions, and properties of the instructions which are needed for code
+// generation, machine code emission, and analysis.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// FPStack specific DAG Nodes.
+//===----------------------------------------------------------------------===//
+
+def SDTX86FpGet2 : SDTypeProfile<2, 0, [SDTCisVT<0, f80>,
+ SDTCisVT<1, f80>]>;
+def SDTX86Fld : SDTypeProfile<1, 2, [SDTCisFP<0>,
+ SDTCisPtrTy<1>,
+ SDTCisVT<2, OtherVT>]>;
+def SDTX86Fst : SDTypeProfile<0, 3, [SDTCisFP<0>,
+ SDTCisPtrTy<1>,
+ SDTCisVT<2, OtherVT>]>;
+def SDTX86Fild : SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisPtrTy<1>,
+ SDTCisVT<2, OtherVT>]>;
+def SDTX86Fnstsw : SDTypeProfile<1, 1, [SDTCisVT<0, i16>, SDTCisVT<1, i16>]>;
+def SDTX86FpToIMem : SDTypeProfile<0, 2, [SDTCisFP<0>, SDTCisPtrTy<1>]>;
+
+def SDTX86CwdStore : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
+
+def X86fld : SDNode<"X86ISD::FLD", SDTX86Fld,
+ [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def X86fst : SDNode<"X86ISD::FST", SDTX86Fst,
+ [SDNPHasChain, SDNPInGlue, SDNPMayStore,
+ SDNPMemOperand]>;
+def X86fild : SDNode<"X86ISD::FILD", SDTX86Fild,
+ [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def X86fildflag : SDNode<"X86ISD::FILD_FLAG", SDTX86Fild,
+ [SDNPHasChain, SDNPOutGlue, SDNPMayLoad,
+ SDNPMemOperand]>;
+def X86fp_stsw : SDNode<"X86ISD::FNSTSW16r", SDTX86Fnstsw>;
+def X86fp_to_i16mem : SDNode<"X86ISD::FP_TO_INT16_IN_MEM", SDTX86FpToIMem,
+ [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def X86fp_to_i32mem : SDNode<"X86ISD::FP_TO_INT32_IN_MEM", SDTX86FpToIMem,
+ [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def X86fp_to_i64mem : SDNode<"X86ISD::FP_TO_INT64_IN_MEM", SDTX86FpToIMem,
+ [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def X86fp_cwd_get16 : SDNode<"X86ISD::FNSTCW16m", SDTX86CwdStore,
+ [SDNPHasChain, SDNPMayStore, SDNPSideEffect,
+ SDNPMemOperand]>;
+
+//===----------------------------------------------------------------------===//
+// FPStack pattern fragments
+//===----------------------------------------------------------------------===//
+
+def fpimm0 : PatLeaf<(fpimm), [{
+ return N->isExactlyValue(+0.0);
+}]>;
+
+def fpimmneg0 : PatLeaf<(fpimm), [{
+ return N->isExactlyValue(-0.0);
+}]>;
+
+def fpimm1 : PatLeaf<(fpimm), [{
+ return N->isExactlyValue(+1.0);
+}]>;
+
+def fpimmneg1 : PatLeaf<(fpimm), [{
+ return N->isExactlyValue(-1.0);
+}]>;
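+
+// Illustrative note (not upstream): these PatLeafs match literal FP
+// constants only, e.g. fpimm0 matches exactly +0.0. LD_Fp032 below uses
+// fpimm0 so that materializing 0.0 selects a constant-load pseudo (later
+// emitted as fldz), and the fpimmneg* leaves feed the fchs-based patterns
+// at the end of this file.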
+
+// Some 'special' instructions
+let usesCustomInserter = 1 in { // Expanded after instruction selection.
+ def FP32_TO_INT16_IN_MEM : PseudoI<(outs), (ins i16mem:$dst, RFP32:$src),
+ [(X86fp_to_i16mem RFP32:$src, addr:$dst)]>;
+ def FP32_TO_INT32_IN_MEM : PseudoI<(outs), (ins i32mem:$dst, RFP32:$src),
+ [(X86fp_to_i32mem RFP32:$src, addr:$dst)]>;
+ def FP32_TO_INT64_IN_MEM : PseudoI<(outs), (ins i64mem:$dst, RFP32:$src),
+ [(X86fp_to_i64mem RFP32:$src, addr:$dst)]>;
+ def FP64_TO_INT16_IN_MEM : PseudoI<(outs), (ins i16mem:$dst, RFP64:$src),
+ [(X86fp_to_i16mem RFP64:$src, addr:$dst)]>;
+ def FP64_TO_INT32_IN_MEM : PseudoI<(outs), (ins i32mem:$dst, RFP64:$src),
+ [(X86fp_to_i32mem RFP64:$src, addr:$dst)]>;
+ def FP64_TO_INT64_IN_MEM : PseudoI<(outs), (ins i64mem:$dst, RFP64:$src),
+ [(X86fp_to_i64mem RFP64:$src, addr:$dst)]>;
+ def FP80_TO_INT16_IN_MEM : PseudoI<(outs), (ins i16mem:$dst, RFP80:$src),
+ [(X86fp_to_i16mem RFP80:$src, addr:$dst)]>;
+ def FP80_TO_INT32_IN_MEM : PseudoI<(outs), (ins i32mem:$dst, RFP80:$src),
+ [(X86fp_to_i32mem RFP80:$src, addr:$dst)]>;
+ def FP80_TO_INT64_IN_MEM : PseudoI<(outs), (ins i64mem:$dst, RFP80:$src),
+ [(X86fp_to_i64mem RFP80:$src, addr:$dst)]>;
+}
+
+// All FP Stack operations are represented with four instructions here. The
+// first three instructions, generated by the instruction selector, use "RFP32",
+// "RFP64" or "RFP80" registers: traditional register files to reference 32-bit,
+// 64-bit or 80-bit floating point values. These sizes apply to the values,
+// not the registers, which are always 80 bits; RFP32, RFP64 and RFP80 can be
+// copied to each other without losing information. These instructions are all
+// pseudo instructions and use the "_Fp" suffix.
+// In some cases there are additional variants with a mixture of different
+// register sizes.
+// The fourth instruction is defined with FPI, which is the actual instruction
+// emitted by the assembler. These use "RST" registers, although frequently
+// the actual register(s) used are implicit. These are always 80 bits.
+// The FP stackifier pass converts one to the other after register allocation
+// occurs.
+//
+// Note that the FpI instruction should have instruction selection info (e.g.
+// a pattern) and the FPI instruction should have emission info (e.g. opcode
+// encoding and asm printing info).
+
+// FpIf32, FpIf64 - Floating Point Pseudo Instruction template.
+// f32 instructions can use SSE1 and are predicated on FPStackf32 == !SSE1.
+// f64 instructions can use SSE2 and are predicated on FPStackf64 == !SSE2.
+// f80 instructions cannot use SSE and use neither of these.
+class FpIf32<dag outs, dag ins, FPFormat fp, list<dag> pattern> :
+ FpI_<outs, ins, fp, pattern>, Requires<[FPStackf32]>;
+class FpIf64<dag outs, dag ins, FPFormat fp, list<dag> pattern> :
+ FpI_<outs, ins, fp, pattern>, Requires<[FPStackf64]>;
+
+// Factoring for arithmetic.
+multiclass FPBinary_rr<SDNode OpNode> {
+// Register op register -> register
+// These are separated out because they have no reversed form.
+def _Fp32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, RFP32:$src2), TwoArgFP,
+ [(set RFP32:$dst, (OpNode RFP32:$src1, RFP32:$src2))]>;
+def _Fp64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, RFP64:$src2), TwoArgFP,
+ [(set RFP64:$dst, (OpNode RFP64:$src1, RFP64:$src2))]>;
+def _Fp80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, RFP80:$src2), TwoArgFP,
+ [(set RFP80:$dst, (OpNode RFP80:$src1, RFP80:$src2))]>;
+}
+// The FopST0 series are not included here because of the irregularities
+// in where the 'r' goes in assembly output.
+// These instructions cannot address 80-bit memory.
+multiclass FPBinary<SDNode OpNode, Format fp, string asmstring,
+ bit Forward = 1> {
+// ST(0) = ST(0) + [mem]
+def _Fp32m : FpIf32<(outs RFP32:$dst),
+ (ins RFP32:$src1, f32mem:$src2), OneArgFPRW,
+ [!if(Forward,
+ (set RFP32:$dst,
+ (OpNode RFP32:$src1, (loadf32 addr:$src2))),
+ (set RFP32:$dst,
+ (OpNode (loadf32 addr:$src2), RFP32:$src1)))]>;
+def _Fp64m : FpIf64<(outs RFP64:$dst),
+ (ins RFP64:$src1, f64mem:$src2), OneArgFPRW,
+ [!if(Forward,
+ (set RFP64:$dst,
+ (OpNode RFP64:$src1, (loadf64 addr:$src2))),
+ (set RFP64:$dst,
+ (OpNode (loadf64 addr:$src2), RFP64:$src1)))]>;
+def _Fp64m32: FpIf64<(outs RFP64:$dst),
+ (ins RFP64:$src1, f32mem:$src2), OneArgFPRW,
+ [!if(Forward,
+ (set RFP64:$dst,
+ (OpNode RFP64:$src1, (f64 (extloadf32 addr:$src2)))),
+ (set RFP64:$dst,
+ (OpNode (f64 (extloadf32 addr:$src2)), RFP64:$src1)))]>;
+def _Fp80m32: FpI_<(outs RFP80:$dst),
+ (ins RFP80:$src1, f32mem:$src2), OneArgFPRW,
+ [!if(Forward,
+ (set RFP80:$dst,
+ (OpNode RFP80:$src1, (f80 (extloadf32 addr:$src2)))),
+ (set RFP80:$dst,
+ (OpNode (f80 (extloadf32 addr:$src2)), RFP80:$src1)))]>;
+def _Fp80m64: FpI_<(outs RFP80:$dst),
+ (ins RFP80:$src1, f64mem:$src2), OneArgFPRW,
+ [!if(Forward,
+ (set RFP80:$dst,
+ (OpNode RFP80:$src1, (f80 (extloadf64 addr:$src2)))),
+ (set RFP80:$dst,
+ (OpNode (f80 (extloadf64 addr:$src2)), RFP80:$src1)))]>;
+let mayLoad = 1 in
+def _F32m : FPI<0xD8, fp, (outs), (ins f32mem:$src),
+ !strconcat("f", asmstring, "{s}\t$src")>;
+let mayLoad = 1 in
+def _F64m : FPI<0xDC, fp, (outs), (ins f64mem:$src),
+ !strconcat("f", asmstring, "{l}\t$src")>;
+// ST(0) = ST(0) + [memint]
+def _FpI16m32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, i16mem:$src2),
+ OneArgFPRW,
+ [!if(Forward,
+ (set RFP32:$dst,
+ (OpNode RFP32:$src1, (X86fild addr:$src2, i16))),
+ (set RFP32:$dst,
+ (OpNode (X86fild addr:$src2, i16), RFP32:$src1)))]>;
+def _FpI32m32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, i32mem:$src2),
+ OneArgFPRW,
+ [!if(Forward,
+ (set RFP32:$dst,
+ (OpNode RFP32:$src1, (X86fild addr:$src2, i32))),
+ (set RFP32:$dst,
+ (OpNode (X86fild addr:$src2, i32), RFP32:$src1)))]>;
+def _FpI16m64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, i16mem:$src2),
+ OneArgFPRW,
+ [!if(Forward,
+ (set RFP64:$dst,
+ (OpNode RFP64:$src1, (X86fild addr:$src2, i16))),
+ (set RFP64:$dst,
+ (OpNode (X86fild addr:$src2, i16), RFP64:$src1)))]>;
+def _FpI32m64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, i32mem:$src2),
+ OneArgFPRW,
+ [!if(Forward,
+ (set RFP64:$dst,
+ (OpNode RFP64:$src1, (X86fild addr:$src2, i32))),
+ (set RFP64:$dst,
+ (OpNode (X86fild addr:$src2, i32), RFP64:$src1)))]>;
+def _FpI16m80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, i16mem:$src2),
+ OneArgFPRW,
+ [!if(Forward,
+ (set RFP80:$dst,
+ (OpNode RFP80:$src1, (X86fild addr:$src2, i16))),
+ (set RFP80:$dst,
+ (OpNode (X86fild addr:$src2, i16), RFP80:$src1)))]>;
+def _FpI32m80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, i32mem:$src2),
+ OneArgFPRW,
+ [!if(Forward,
+ (set RFP80:$dst,
+ (OpNode RFP80:$src1, (X86fild addr:$src2, i32))),
+ (set RFP80:$dst,
+ (OpNode (X86fild addr:$src2, i32), RFP80:$src1)))]>;
+let mayLoad = 1 in
+def _FI16m : FPI<0xDE, fp, (outs), (ins i16mem:$src),
+ !strconcat("fi", asmstring, "{s}\t$src")>;
+let mayLoad = 1 in
+def _FI32m : FPI<0xDA, fp, (outs), (ins i32mem:$src),
+ !strconcat("fi", asmstring, "{l}\t$src")>;
+}
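+
+// Illustrative note (not upstream): instantiating this multiclass as
+//   defm ADD : FPBinary<fadd, MRM0m, "add">;
+// yields the load-folding pseudos ADD_Fp32m/ADD_Fp64m/ADD_Fp64m32/... plus
+// the real memory forms ADD_F32m ("fadd{s}") and ADD_F64m ("fadd{l}") and
+// the integer forms ADD_FI16m/ADD_FI32m ("fiadd{s}"/"fiadd{l}"). The
+// Forward bit only swaps the operand order in the selection patterns; the
+// SUBR/DIVR reversed forms below pass Forward = 0.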
+
+let Defs = [FPSW] in {
+// FPBinary_rr just defines pseudo-instructions; there is no need to set
+// scheduling resources for them.
+defm ADD : FPBinary_rr<fadd>;
+defm SUB : FPBinary_rr<fsub>;
+defm MUL : FPBinary_rr<fmul>;
+defm DIV : FPBinary_rr<fdiv>;
+// Sets the scheduling resources for the actual NAME#_F<size>m definitions.
+let SchedRW = [WriteFAddLd] in {
+defm ADD : FPBinary<fadd, MRM0m, "add">;
+defm SUB : FPBinary<fsub, MRM4m, "sub">;
+defm SUBR : FPBinary<fsub, MRM5m, "subr", 0>;
+}
+let SchedRW = [WriteFMulLd] in {
+defm MUL : FPBinary<fmul, MRM1m, "mul">;
+}
+let SchedRW = [WriteFDivLd] in {
+defm DIV : FPBinary<fdiv, MRM6m, "div">;
+defm DIVR : FPBinary<fdiv, MRM7m, "divr", 0>;
+}
+}
+
+class FPST0rInst<Format fp, string asm>
+ : FPI<0xD8, fp, (outs), (ins RST:$op), asm>;
+class FPrST0Inst<Format fp, string asm>
+ : FPI<0xDC, fp, (outs), (ins RST:$op), asm>;
+class FPrST0PInst<Format fp, string asm>
+ : FPI<0xDE, fp, (outs), (ins RST:$op), asm>;
+
+// NOTE: GAS and apparently all other AT&T style assemblers have a broken notion
+// of some of the 'reverse' forms of the fsub and fdiv instructions. As such,
+// we have to put some 'r's in and take them out of weird places.
+let SchedRW = [WriteFAdd] in {
+def ADD_FST0r : FPST0rInst <MRM0r, "fadd\t$op">;
+def ADD_FrST0 : FPrST0Inst <MRM0r, "fadd\t{%st(0), $op|$op, st(0)}">;
+def ADD_FPrST0 : FPrST0PInst<MRM0r, "faddp\t$op">;
+def SUBR_FST0r : FPST0rInst <MRM5r, "fsubr\t$op">;
+def SUB_FrST0 : FPrST0Inst <MRM5r, "fsub{r}\t{%st(0), $op|$op, st(0)}">;
+def SUB_FPrST0 : FPrST0PInst<MRM5r, "fsub{r}p\t$op">;
+def SUB_FST0r : FPST0rInst <MRM4r, "fsub\t$op">;
+def SUBR_FrST0 : FPrST0Inst <MRM4r, "fsub{|r}\t{%st(0), $op|$op, st(0)}">;
+def SUBR_FPrST0 : FPrST0PInst<MRM4r, "fsub{|r}p\t$op">;
+} // SchedRW
+let SchedRW = [WriteFMul] in {
+def MUL_FST0r : FPST0rInst <MRM1r, "fmul\t$op">;
+def MUL_FrST0 : FPrST0Inst <MRM1r, "fmul\t{%st(0), $op|$op, st(0)}">;
+def MUL_FPrST0 : FPrST0PInst<MRM1r, "fmulp\t$op">;
+} // SchedRW
+let SchedRW = [WriteFDiv] in {
+def DIVR_FST0r : FPST0rInst <MRM7r, "fdivr\t$op">;
+def DIV_FrST0 : FPrST0Inst <MRM7r, "fdiv{r}\t{%st(0), $op|$op, st(0)}">;
+def DIV_FPrST0 : FPrST0PInst<MRM7r, "fdiv{r}p\t$op">;
+def DIV_FST0r : FPST0rInst <MRM6r, "fdiv\t$op">;
+def DIVR_FrST0 : FPrST0Inst <MRM6r, "fdiv{|r}\t{%st(0), $op|$op, st(0)}">;
+def DIVR_FPrST0 : FPrST0PInst<MRM6r, "fdiv{|r}p\t$op">;
+} // SchedRW
+
+def COM_FST0r : FPST0rInst <MRM2r, "fcom\t$op">;
+def COMP_FST0r : FPST0rInst <MRM3r, "fcomp\t$op">;
+
+// Unary operations.
+multiclass FPUnary<SDNode OpNode, Format fp, string asmstring> {
+def _Fp32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src), OneArgFPRW,
+ [(set RFP32:$dst, (OpNode RFP32:$src))]>;
+def _Fp64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src), OneArgFPRW,
+ [(set RFP64:$dst, (OpNode RFP64:$src))]>;
+def _Fp80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src), OneArgFPRW,
+ [(set RFP80:$dst, (OpNode RFP80:$src))]>;
+def _F : FPI<0xD9, fp, (outs), (ins), asmstring>;
+}
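+
+// Illustrative note (not upstream): e.g. the instantiation
+//   defm CHS : FPUnary<fneg, MRM_E0, "fchs">;
+// below produces the CHS_Fp32/CHS_Fp64/CHS_Fp80 pseudos that match fneg,
+// plus the real zero-operand CHS_F instruction (encoded D9 E0).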
+
+let Defs = [FPSW] in {
+defm CHS : FPUnary<fneg, MRM_E0, "fchs">;
+defm ABS : FPUnary<fabs, MRM_E1, "fabs">;
+let SchedRW = [WriteFSqrt] in {
+defm SQRT : FPUnary<fsqrt, MRM_FA, "fsqrt">;
+}
+defm SIN : FPUnary<fsin, MRM_FE, "fsin">;
+defm COS : FPUnary<fcos, MRM_FF, "fcos">;
+
+let hasSideEffects = 0 in {
+def TST_Fp32 : FpIf32<(outs), (ins RFP32:$src), OneArgFP, []>;
+def TST_Fp64 : FpIf64<(outs), (ins RFP64:$src), OneArgFP, []>;
+def TST_Fp80 : FpI_<(outs), (ins RFP80:$src), OneArgFP, []>;
+}
+def TST_F : FPI<0xD9, MRM_E4, (outs), (ins), "ftst">;
+} // Defs = [FPSW]
+
+// Versions of FP instructions that take a single memory operand. Added for
+// the disassembler; remove them as they become covered by patterns elsewhere.
+def FCOM32m : FPI<0xD8, MRM2m, (outs), (ins f32mem:$src), "fcom{s}\t$src">;
+def FCOMP32m : FPI<0xD8, MRM3m, (outs), (ins f32mem:$src), "fcomp{s}\t$src">;
+
+def FLDENVm : FPI<0xD9, MRM4m, (outs), (ins f32mem:$src), "fldenv\t$src">;
+def FSTENVm : FPI<0xD9, MRM6m, (outs f32mem:$dst), (ins), "fnstenv\t$dst">;
+
+def FICOM32m : FPI<0xDA, MRM2m, (outs), (ins i32mem:$src), "ficom{l}\t$src">;
+def FICOMP32m: FPI<0xDA, MRM3m, (outs), (ins i32mem:$src), "ficomp{l}\t$src">;
+
+def FCOM64m : FPI<0xDC, MRM2m, (outs), (ins f64mem:$src), "fcom{l}\t$src">;
+def FCOMP64m : FPI<0xDC, MRM3m, (outs), (ins f64mem:$src), "fcomp{l}\t$src">;
+
+def FRSTORm : FPI<0xDD, MRM4m, (outs f32mem:$dst), (ins), "frstor\t$dst">;
+def FSAVEm : FPI<0xDD, MRM6m, (outs f32mem:$dst), (ins), "fnsave\t$dst">;
+def FNSTSWm : FPI<0xDD, MRM7m, (outs i16mem:$dst), (ins), "fnstsw\t$dst">;
+
+def FICOM16m : FPI<0xDE, MRM2m, (outs), (ins i16mem:$src), "ficom{s}\t$src">;
+def FICOMP16m: FPI<0xDE, MRM3m, (outs), (ins i16mem:$src), "ficomp{s}\t$src">;
+
+def FBLDm : FPI<0xDF, MRM4m, (outs), (ins f80mem:$src), "fbld\t$src">;
+def FBSTPm : FPI<0xDF, MRM6m, (outs f80mem:$dst), (ins), "fbstp\t$dst">;
+
+// Floating point cmovs.
+class FpIf32CMov<dag outs, dag ins, FPFormat fp, list<dag> pattern> :
+ FpI_<outs, ins, fp, pattern>, Requires<[FPStackf32, HasCMov]>;
+class FpIf64CMov<dag outs, dag ins, FPFormat fp, list<dag> pattern> :
+ FpI_<outs, ins, fp, pattern>, Requires<[FPStackf64, HasCMov]>;
+
+multiclass FPCMov<PatLeaf cc> {
+ def _Fp32 : FpIf32CMov<(outs RFP32:$dst), (ins RFP32:$src1, RFP32:$src2),
+ CondMovFP,
+ [(set RFP32:$dst, (X86cmov RFP32:$src1, RFP32:$src2,
+ cc, EFLAGS))]>;
+ def _Fp64 : FpIf64CMov<(outs RFP64:$dst), (ins RFP64:$src1, RFP64:$src2),
+ CondMovFP,
+ [(set RFP64:$dst, (X86cmov RFP64:$src1, RFP64:$src2,
+ cc, EFLAGS))]>;
+ def _Fp80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, RFP80:$src2),
+ CondMovFP,
+ [(set RFP80:$dst, (X86cmov RFP80:$src1, RFP80:$src2,
+ cc, EFLAGS))]>,
+ Requires<[HasCMov]>;
+}
+
+let Defs = [FPSW] in {
+let Uses = [EFLAGS], Constraints = "$src1 = $dst" in {
+defm CMOVB : FPCMov<X86_COND_B>;
+defm CMOVBE : FPCMov<X86_COND_BE>;
+defm CMOVE : FPCMov<X86_COND_E>;
+defm CMOVP : FPCMov<X86_COND_P>;
+defm CMOVNB : FPCMov<X86_COND_AE>;
+defm CMOVNBE: FPCMov<X86_COND_A>;
+defm CMOVNE : FPCMov<X86_COND_NE>;
+defm CMOVNP : FPCMov<X86_COND_NP>;
+} // Uses = [EFLAGS], Constraints = "$src1 = $dst"
+
+let Predicates = [HasCMov] in {
+// These are not factored because there's no clean way to pass DA/DB.
+def CMOVB_F : FPI<0xDA, MRM0r, (outs), (ins RST:$op),
+ "fcmovb\t{$op, %st(0)|st(0), $op}">;
+def CMOVBE_F : FPI<0xDA, MRM2r, (outs), (ins RST:$op),
+ "fcmovbe\t{$op, %st(0)|st(0), $op}">;
+def CMOVE_F : FPI<0xDA, MRM1r, (outs), (ins RST:$op),
+ "fcmove\t{$op, %st(0)|st(0), $op}">;
+def CMOVP_F : FPI<0xDA, MRM3r, (outs), (ins RST:$op),
+ "fcmovu\t{$op, %st(0)|st(0), $op}">;
+def CMOVNB_F : FPI<0xDB, MRM0r, (outs), (ins RST:$op),
+ "fcmovnb\t{$op, %st(0)|st(0), $op}">;
+def CMOVNBE_F: FPI<0xDB, MRM2r, (outs), (ins RST:$op),
+ "fcmovnbe\t{$op, %st(0)|st(0), $op}">;
+def CMOVNE_F : FPI<0xDB, MRM1r, (outs), (ins RST:$op),
+ "fcmovne\t{$op, %st(0)|st(0), $op}">;
+def CMOVNP_F : FPI<0xDB, MRM3r, (outs), (ins RST:$op),
+ "fcmovnu\t{$op, %st(0)|st(0), $op}">;
+} // Predicates = [HasCMov]
+
+// Floating point loads & stores.
+let canFoldAsLoad = 1 in {
+def LD_Fp32m : FpIf32<(outs RFP32:$dst), (ins f32mem:$src), ZeroArgFP,
+ [(set RFP32:$dst, (loadf32 addr:$src))]>;
+let isReMaterializable = 1 in
+ def LD_Fp64m : FpIf64<(outs RFP64:$dst), (ins f64mem:$src), ZeroArgFP,
+ [(set RFP64:$dst, (loadf64 addr:$src))]>;
+def LD_Fp80m : FpI_<(outs RFP80:$dst), (ins f80mem:$src), ZeroArgFP,
+ [(set RFP80:$dst, (loadf80 addr:$src))]>;
+}
+def LD_Fp32m64 : FpIf64<(outs RFP64:$dst), (ins f32mem:$src), ZeroArgFP,
+ [(set RFP64:$dst, (f64 (extloadf32 addr:$src)))]>;
+def LD_Fp64m80 : FpI_<(outs RFP80:$dst), (ins f64mem:$src), ZeroArgFP,
+ [(set RFP80:$dst, (f80 (extloadf64 addr:$src)))]>;
+def LD_Fp32m80 : FpI_<(outs RFP80:$dst), (ins f32mem:$src), ZeroArgFP,
+ [(set RFP80:$dst, (f80 (extloadf32 addr:$src)))]>;
+def ILD_Fp16m32: FpIf32<(outs RFP32:$dst), (ins i16mem:$src), ZeroArgFP,
+ [(set RFP32:$dst, (X86fild addr:$src, i16))]>;
+def ILD_Fp32m32: FpIf32<(outs RFP32:$dst), (ins i32mem:$src), ZeroArgFP,
+ [(set RFP32:$dst, (X86fild addr:$src, i32))]>;
+def ILD_Fp64m32: FpIf32<(outs RFP32:$dst), (ins i64mem:$src), ZeroArgFP,
+ [(set RFP32:$dst, (X86fild addr:$src, i64))]>;
+def ILD_Fp16m64: FpIf64<(outs RFP64:$dst), (ins i16mem:$src), ZeroArgFP,
+ [(set RFP64:$dst, (X86fild addr:$src, i16))]>;
+def ILD_Fp32m64: FpIf64<(outs RFP64:$dst), (ins i32mem:$src), ZeroArgFP,
+ [(set RFP64:$dst, (X86fild addr:$src, i32))]>;
+def ILD_Fp64m64: FpIf64<(outs RFP64:$dst), (ins i64mem:$src), ZeroArgFP,
+ [(set RFP64:$dst, (X86fild addr:$src, i64))]>;
+def ILD_Fp16m80: FpI_<(outs RFP80:$dst), (ins i16mem:$src), ZeroArgFP,
+ [(set RFP80:$dst, (X86fild addr:$src, i16))]>;
+def ILD_Fp32m80: FpI_<(outs RFP80:$dst), (ins i32mem:$src), ZeroArgFP,
+ [(set RFP80:$dst, (X86fild addr:$src, i32))]>;
+def ILD_Fp64m80: FpI_<(outs RFP80:$dst), (ins i64mem:$src), ZeroArgFP,
+ [(set RFP80:$dst, (X86fild addr:$src, i64))]>;
+
+def ST_Fp32m : FpIf32<(outs), (ins f32mem:$op, RFP32:$src), OneArgFP,
+ [(store RFP32:$src, addr:$op)]>;
+def ST_Fp64m32 : FpIf64<(outs), (ins f32mem:$op, RFP64:$src), OneArgFP,
+ [(truncstoref32 RFP64:$src, addr:$op)]>;
+def ST_Fp64m : FpIf64<(outs), (ins f64mem:$op, RFP64:$src), OneArgFP,
+ [(store RFP64:$src, addr:$op)]>;
+def ST_Fp80m32 : FpI_<(outs), (ins f32mem:$op, RFP80:$src), OneArgFP,
+ [(truncstoref32 RFP80:$src, addr:$op)]>;
+def ST_Fp80m64 : FpI_<(outs), (ins f64mem:$op, RFP80:$src), OneArgFP,
+ [(truncstoref64 RFP80:$src, addr:$op)]>;
+// FST does not support 80-bit memory target; FSTP must be used.
+
+let mayStore = 1, hasSideEffects = 0 in {
+def ST_FpP32m : FpIf32<(outs), (ins f32mem:$op, RFP32:$src), OneArgFP, []>;
+def ST_FpP64m32 : FpIf64<(outs), (ins f32mem:$op, RFP64:$src), OneArgFP, []>;
+def ST_FpP64m : FpIf64<(outs), (ins f64mem:$op, RFP64:$src), OneArgFP, []>;
+def ST_FpP80m32 : FpI_<(outs), (ins f32mem:$op, RFP80:$src), OneArgFP, []>;
+def ST_FpP80m64 : FpI_<(outs), (ins f64mem:$op, RFP80:$src), OneArgFP, []>;
+}
+def ST_FpP80m : FpI_<(outs), (ins f80mem:$op, RFP80:$src), OneArgFP,
+ [(store RFP80:$src, addr:$op)]>;
+let mayStore = 1, hasSideEffects = 0 in {
+def IST_Fp16m32 : FpIf32<(outs), (ins i16mem:$op, RFP32:$src), OneArgFP, []>;
+def IST_Fp32m32 : FpIf32<(outs), (ins i32mem:$op, RFP32:$src), OneArgFP, []>;
+def IST_Fp64m32 : FpIf32<(outs), (ins i64mem:$op, RFP32:$src), OneArgFP, []>;
+def IST_Fp16m64 : FpIf64<(outs), (ins i16mem:$op, RFP64:$src), OneArgFP, []>;
+def IST_Fp32m64 : FpIf64<(outs), (ins i32mem:$op, RFP64:$src), OneArgFP, []>;
+def IST_Fp64m64 : FpIf64<(outs), (ins i64mem:$op, RFP64:$src), OneArgFP, []>;
+def IST_Fp16m80 : FpI_<(outs), (ins i16mem:$op, RFP80:$src), OneArgFP, []>;
+def IST_Fp32m80 : FpI_<(outs), (ins i32mem:$op, RFP80:$src), OneArgFP, []>;
+def IST_Fp64m80 : FpI_<(outs), (ins i64mem:$op, RFP80:$src), OneArgFP, []>;
+}
+
+let mayLoad = 1, SchedRW = [WriteLoad] in {
+def LD_F32m : FPI<0xD9, MRM0m, (outs), (ins f32mem:$src), "fld{s}\t$src",
+ IIC_FLD>;
+def LD_F64m : FPI<0xDD, MRM0m, (outs), (ins f64mem:$src), "fld{l}\t$src",
+ IIC_FLD>;
+def LD_F80m : FPI<0xDB, MRM5m, (outs), (ins f80mem:$src), "fld{t}\t$src",
+ IIC_FLD80>;
+def ILD_F16m : FPI<0xDF, MRM0m, (outs), (ins i16mem:$src), "fild{s}\t$src",
+ IIC_FILD>;
+def ILD_F32m : FPI<0xDB, MRM0m, (outs), (ins i32mem:$src), "fild{l}\t$src",
+ IIC_FILD>;
+def ILD_F64m : FPI<0xDF, MRM5m, (outs), (ins i64mem:$src), "fild{ll}\t$src",
+ IIC_FILD>;
+}
+let mayStore = 1, SchedRW = [WriteStore] in {
+def ST_F32m : FPI<0xD9, MRM2m, (outs), (ins f32mem:$dst), "fst{s}\t$dst",
+ IIC_FST>;
+def ST_F64m : FPI<0xDD, MRM2m, (outs), (ins f64mem:$dst), "fst{l}\t$dst",
+ IIC_FST>;
+def ST_FP32m : FPI<0xD9, MRM3m, (outs), (ins f32mem:$dst), "fstp{s}\t$dst",
+ IIC_FST>;
+def ST_FP64m : FPI<0xDD, MRM3m, (outs), (ins f64mem:$dst), "fstp{l}\t$dst",
+ IIC_FST>;
+def ST_FP80m : FPI<0xDB, MRM7m, (outs), (ins f80mem:$dst), "fstp{t}\t$dst",
+ IIC_FST80>;
+def IST_F16m : FPI<0xDF, MRM2m, (outs), (ins i16mem:$dst), "fist{s}\t$dst",
+ IIC_FIST>;
+def IST_F32m : FPI<0xDB, MRM2m, (outs), (ins i32mem:$dst), "fist{l}\t$dst",
+ IIC_FIST>;
+def IST_FP16m : FPI<0xDF, MRM3m, (outs), (ins i16mem:$dst), "fistp{s}\t$dst",
+ IIC_FIST>;
+def IST_FP32m : FPI<0xDB, MRM3m, (outs), (ins i32mem:$dst), "fistp{l}\t$dst",
+ IIC_FIST>;
+def IST_FP64m : FPI<0xDF, MRM7m, (outs), (ins i64mem:$dst), "fistp{ll}\t$dst",
+ IIC_FIST>;
+}
+
+// FISTTP requires SSE3 even though it's an FPStack op.
+let Predicates = [HasSSE3] in {
+def ISTT_Fp16m32 : FpI_<(outs), (ins i16mem:$op, RFP32:$src), OneArgFP,
+ [(X86fp_to_i16mem RFP32:$src, addr:$op)]>;
+def ISTT_Fp32m32 : FpI_<(outs), (ins i32mem:$op, RFP32:$src), OneArgFP,
+ [(X86fp_to_i32mem RFP32:$src, addr:$op)]>;
+def ISTT_Fp64m32 : FpI_<(outs), (ins i64mem:$op, RFP32:$src), OneArgFP,
+ [(X86fp_to_i64mem RFP32:$src, addr:$op)]>;
+def ISTT_Fp16m64 : FpI_<(outs), (ins i16mem:$op, RFP64:$src), OneArgFP,
+ [(X86fp_to_i16mem RFP64:$src, addr:$op)]>;
+def ISTT_Fp32m64 : FpI_<(outs), (ins i32mem:$op, RFP64:$src), OneArgFP,
+ [(X86fp_to_i32mem RFP64:$src, addr:$op)]>;
+def ISTT_Fp64m64 : FpI_<(outs), (ins i64mem:$op, RFP64:$src), OneArgFP,
+ [(X86fp_to_i64mem RFP64:$src, addr:$op)]>;
+def ISTT_Fp16m80 : FpI_<(outs), (ins i16mem:$op, RFP80:$src), OneArgFP,
+ [(X86fp_to_i16mem RFP80:$src, addr:$op)]>;
+def ISTT_Fp32m80 : FpI_<(outs), (ins i32mem:$op, RFP80:$src), OneArgFP,
+ [(X86fp_to_i32mem RFP80:$src, addr:$op)]>;
+def ISTT_Fp64m80 : FpI_<(outs), (ins i64mem:$op, RFP80:$src), OneArgFP,
+ [(X86fp_to_i64mem RFP80:$src, addr:$op)]>;
+} // Predicates = [HasSSE3]
+
+let mayStore = 1, SchedRW = [WriteStore] in {
+def ISTT_FP16m : FPI<0xDF, MRM1m, (outs), (ins i16mem:$dst), "fisttp{s}\t$dst",
+ IIC_FST>;
+def ISTT_FP32m : FPI<0xDB, MRM1m, (outs), (ins i32mem:$dst), "fisttp{l}\t$dst",
+ IIC_FST>;
+def ISTT_FP64m : FPI<0xDD, MRM1m, (outs), (ins i64mem:$dst),
+ "fisttp{ll}\t$dst", IIC_FST>;
+}
+
+// FP Stack manipulation instructions.
+let SchedRW = [WriteMove] in {
+def LD_Frr : FPI<0xD9, MRM0r, (outs), (ins RST:$op), "fld\t$op", IIC_FLD>;
+def ST_Frr : FPI<0xDD, MRM2r, (outs), (ins RST:$op), "fst\t$op", IIC_FST>;
+def ST_FPrr : FPI<0xDD, MRM3r, (outs), (ins RST:$op), "fstp\t$op", IIC_FST>;
+def XCH_F : FPI<0xD9, MRM1r, (outs), (ins RST:$op), "fxch\t$op", IIC_FXCH>;
+}
+
+// Floating point constant loads.
+let isReMaterializable = 1 in {
+def LD_Fp032 : FpIf32<(outs RFP32:$dst), (ins), ZeroArgFP,
+ [(set RFP32:$dst, fpimm0)]>;
+def LD_Fp132 : FpIf32<(outs RFP32:$dst), (ins), ZeroArgFP,
+ [(set RFP32:$dst, fpimm1)]>;
+def LD_Fp064 : FpIf64<(outs RFP64:$dst), (ins), ZeroArgFP,
+ [(set RFP64:$dst, fpimm0)]>;
+def LD_Fp164 : FpIf64<(outs RFP64:$dst), (ins), ZeroArgFP,
+ [(set RFP64:$dst, fpimm1)]>;
+def LD_Fp080 : FpI_<(outs RFP80:$dst), (ins), ZeroArgFP,
+ [(set RFP80:$dst, fpimm0)]>;
+def LD_Fp180 : FpI_<(outs RFP80:$dst), (ins), ZeroArgFP,
+ [(set RFP80:$dst, fpimm1)]>;
+}
+
+let SchedRW = [WriteZero] in {
+def LD_F0 : FPI<0xD9, MRM_EE, (outs), (ins), "fldz", IIC_FLDZ>;
+def LD_F1 : FPI<0xD9, MRM_E8, (outs), (ins), "fld1", IIC_FIST>;
+}
+
+// Floating point compares.
+let SchedRW = [WriteFAdd] in {
+def UCOM_Fpr32 : FpIf32<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP,
+ [(set FPSW, (trunc (X86cmp RFP32:$lhs, RFP32:$rhs)))]>;
+def UCOM_Fpr64 : FpIf64<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP,
+ [(set FPSW, (trunc (X86cmp RFP64:$lhs, RFP64:$rhs)))]>;
+def UCOM_Fpr80 : FpI_ <(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP,
+ [(set FPSW, (trunc (X86cmp RFP80:$lhs, RFP80:$rhs)))]>;
+} // SchedRW
+} // Defs = [FPSW]
+
+let SchedRW = [WriteFAdd] in {
+// CC = ST(0) cmp ST(i)
+let Defs = [EFLAGS, FPSW] in {
+def UCOM_FpIr32: FpIf32<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP,
+ [(set EFLAGS, (X86cmp RFP32:$lhs, RFP32:$rhs))]>;
+def UCOM_FpIr64: FpIf64<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP,
+ [(set EFLAGS, (X86cmp RFP64:$lhs, RFP64:$rhs))]>;
+def UCOM_FpIr80: FpI_<(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP,
+ [(set EFLAGS, (X86cmp RFP80:$lhs, RFP80:$rhs))]>;
+}
+
+let Defs = [FPSW], Uses = [ST0] in {
+def UCOM_Fr : FPI<0xDD, MRM4r, // FPSW = cmp ST(0) with ST(i)
+ (outs), (ins RST:$reg), "fucom\t$reg", IIC_FUCOM>;
+def UCOM_FPr : FPI<0xDD, MRM5r, // FPSW = cmp ST(0) with ST(i), pop
+ (outs), (ins RST:$reg), "fucomp\t$reg", IIC_FUCOM>;
+def UCOM_FPPr : FPI<0xDA, MRM_E9, // cmp ST(0) with ST(1), pop, pop
+ (outs), (ins), "fucompp", IIC_FUCOM>;
+}
+
+let Defs = [EFLAGS, FPSW], Uses = [ST0] in {
+def UCOM_FIr : FPI<0xDB, MRM5r, // CC = cmp ST(0) with ST(i)
+ (outs), (ins RST:$reg), "fucomi\t$reg", IIC_FUCOMI>;
+def UCOM_FIPr : FPI<0xDF, MRM5r, // CC = cmp ST(0) with ST(i), pop
+ (outs), (ins RST:$reg), "fucompi\t$reg", IIC_FUCOMI>;
+}
+
+let Defs = [EFLAGS, FPSW] in {
+def COM_FIr : FPI<0xDB, MRM6r, (outs), (ins RST:$reg),
+ "fcomi\t$reg", IIC_FCOMI>;
+def COM_FIPr : FPI<0xDF, MRM6r, (outs), (ins RST:$reg),
+ "fcompi\t$reg", IIC_FCOMI>;
+}
+} // SchedRW
+
+// Floating point flag ops.
+let SchedRW = [WriteALU] in {
+let Defs = [AX], Uses = [FPSW] in
+def FNSTSW16r : I<0xDF, MRM_E0, // AX = fp flags
+ (outs), (ins), "fnstsw\t{%ax|ax}",
+ [(set AX, (X86fp_stsw FPSW))], IIC_FNSTSW>;
+
+def FNSTCW16m : I<0xD9, MRM7m, // [mem16] = X87 control word
+ (outs), (ins i16mem:$dst), "fnstcw\t$dst",
+ [(X86fp_cwd_get16 addr:$dst)], IIC_FNSTCW>;
+} // SchedRW
+let mayLoad = 1 in
+def FLDCW16m : I<0xD9, MRM5m, // X87 control word = [mem16]
+ (outs), (ins i16mem:$dst), "fldcw\t$dst", [], IIC_FLDCW>,
+ Sched<[WriteLoad]>;
+
+// FPU control instructions
+let SchedRW = [WriteMicrocoded] in {
+let Defs = [FPSW] in
+def FNINIT : I<0xDB, MRM_E3, (outs), (ins), "fninit", [], IIC_FNINIT>;
+def FFREE : FPI<0xDD, MRM0r, (outs), (ins RST:$reg),
+ "ffree\t$reg", IIC_FFREE>;
+
+// Clear exceptions.
+let Defs = [FPSW] in
+def FNCLEX : I<0xDB, MRM_E2, (outs), (ins), "fnclex", [], IIC_FNCLEX>;
+} // SchedRW
+
+// Operandless floating-point instructions for the disassembler.
+let SchedRW = [WriteMicrocoded] in {
+def WAIT : I<0x9B, RawFrm, (outs), (ins), "wait", [], IIC_WAIT>;
+
+def FNOP : I<0xD9, MRM_D0, (outs), (ins), "fnop", [], IIC_FNOP>;
+def FXAM : I<0xD9, MRM_E5, (outs), (ins), "fxam", [], IIC_FXAM>;
+def FLDL2T : I<0xD9, MRM_E9, (outs), (ins), "fldl2t", [], IIC_FLDL>;
+def FLDL2E : I<0xD9, MRM_EA, (outs), (ins), "fldl2e", [], IIC_FLDL>;
+def FLDPI : I<0xD9, MRM_EB, (outs), (ins), "fldpi", [], IIC_FLDL>;
+def FLDLG2 : I<0xD9, MRM_EC, (outs), (ins), "fldlg2", [], IIC_FLDL>;
+def FLDLN2 : I<0xD9, MRM_ED, (outs), (ins), "fldln2", [], IIC_FLDL>;
+def F2XM1 : I<0xD9, MRM_F0, (outs), (ins), "f2xm1", [], IIC_F2XM1>;
+def FYL2X : I<0xD9, MRM_F1, (outs), (ins), "fyl2x", [], IIC_FYL2X>;
+def FPTAN : I<0xD9, MRM_F2, (outs), (ins), "fptan", [], IIC_FPTAN>;
+def FPATAN : I<0xD9, MRM_F3, (outs), (ins), "fpatan", [], IIC_FPATAN>;
+def FXTRACT : I<0xD9, MRM_F4, (outs), (ins), "fxtract", [], IIC_FXTRACT>;
+def FPREM1 : I<0xD9, MRM_F5, (outs), (ins), "fprem1", [], IIC_FPREM1>;
+def FDECSTP : I<0xD9, MRM_F6, (outs), (ins), "fdecstp", [], IIC_FPSTP>;
+def FINCSTP : I<0xD9, MRM_F7, (outs), (ins), "fincstp", [], IIC_FPSTP>;
+def FPREM : I<0xD9, MRM_F8, (outs), (ins), "fprem", [], IIC_FPREM>;
+def FYL2XP1 : I<0xD9, MRM_F9, (outs), (ins), "fyl2xp1", [], IIC_FYL2XP1>;
+def FSINCOS : I<0xD9, MRM_FB, (outs), (ins), "fsincos", [], IIC_FSINCOS>;
+def FRNDINT : I<0xD9, MRM_FC, (outs), (ins), "frndint", [], IIC_FRNDINT>;
+def FSCALE : I<0xD9, MRM_FD, (outs), (ins), "fscale", [], IIC_FSCALE>;
+def FCOMPP : I<0xDE, MRM_D9, (outs), (ins), "fcompp", [], IIC_FCOMPP>;
+
+let Predicates = [HasFXSR] in {
+ def FXSAVE : I<0xAE, MRM0m, (outs), (ins opaque512mem:$dst),
+ "fxsave\t$dst", [(int_x86_fxsave addr:$dst)], IIC_FXSAVE>, TB;
+ def FXSAVE64 : RI<0xAE, MRM0m, (outs), (ins opaque512mem:$dst),
+ "fxsave64\t$dst", [(int_x86_fxsave64 addr:$dst)],
+ IIC_FXSAVE>, TB, Requires<[In64BitMode]>;
+ def FXRSTOR : I<0xAE, MRM1m, (outs), (ins opaque512mem:$src),
+ "fxrstor\t$src", [(int_x86_fxrstor addr:$src)], IIC_FXRSTOR>, TB;
+ def FXRSTOR64 : RI<0xAE, MRM1m, (outs), (ins opaque512mem:$src),
+ "fxrstor64\t$src", [(int_x86_fxrstor64 addr:$src)],
+ IIC_FXRSTOR>, TB, Requires<[In64BitMode]>;
+} // Predicates = [HasFXSR]
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//===----------------------------------------------------------------------===//
+
+// Required for RET of f32 / f64 / f80 values.
+def : Pat<(X86fld addr:$src, f32), (LD_Fp32m addr:$src)>;
+def : Pat<(X86fld addr:$src, f64), (LD_Fp64m addr:$src)>;
+def : Pat<(X86fld addr:$src, f80), (LD_Fp80m addr:$src)>;
+
+// Required for CALL which return f32 / f64 / f80 values.
+def : Pat<(X86fst RFP32:$src, addr:$op, f32), (ST_Fp32m addr:$op, RFP32:$src)>;
+def : Pat<(X86fst RFP64:$src, addr:$op, f32), (ST_Fp64m32 addr:$op,
+ RFP64:$src)>;
+def : Pat<(X86fst RFP64:$src, addr:$op, f64), (ST_Fp64m addr:$op, RFP64:$src)>;
+def : Pat<(X86fst RFP80:$src, addr:$op, f32), (ST_Fp80m32 addr:$op,
+ RFP80:$src)>;
+def : Pat<(X86fst RFP80:$src, addr:$op, f64), (ST_Fp80m64 addr:$op,
+ RFP80:$src)>;
+def : Pat<(X86fst RFP80:$src, addr:$op, f80), (ST_FpP80m addr:$op,
+ RFP80:$src)>;
+
+// Floating point constant -0.0 and -1.0
+def : Pat<(f32 fpimmneg0), (CHS_Fp32 (LD_Fp032))>, Requires<[FPStackf32]>;
+def : Pat<(f32 fpimmneg1), (CHS_Fp32 (LD_Fp132))>, Requires<[FPStackf32]>;
+def : Pat<(f64 fpimmneg0), (CHS_Fp64 (LD_Fp064))>, Requires<[FPStackf64]>;
+def : Pat<(f64 fpimmneg1), (CHS_Fp64 (LD_Fp164))>, Requires<[FPStackf64]>;
+def : Pat<(f80 fpimmneg0), (CHS_Fp80 (LD_Fp080))>;
+def : Pat<(f80 fpimmneg1), (CHS_Fp80 (LD_Fp180))>;
+
+// Used to convert i64 to f64, since there isn't an SSE version.
+def : Pat<(X86fildflag addr:$src, i64), (ILD_Fp64m64 addr:$src)>;
+
+// FP extensions map onto simple pseudo-value conversions if they are to/from
+// the FP stack.
+def : Pat<(f64 (fextend RFP32:$src)), (COPY_TO_REGCLASS RFP32:$src, RFP64)>,
+ Requires<[FPStackf32]>;
+def : Pat<(f80 (fextend RFP32:$src)), (COPY_TO_REGCLASS RFP32:$src, RFP80)>,
+ Requires<[FPStackf32]>;
+def : Pat<(f80 (fextend RFP64:$src)), (COPY_TO_REGCLASS RFP64:$src, RFP80)>,
+ Requires<[FPStackf64]>;
+
+// FP truncations map onto simple pseudo-value conversions if they are to/from
+// the FP stack. We have validated that only value-preserving truncations make
+// it through isel.
+def : Pat<(f32 (fround RFP64:$src)), (COPY_TO_REGCLASS RFP64:$src, RFP32)>,
+ Requires<[FPStackf32]>;
+def : Pat<(f32 (fround RFP80:$src)), (COPY_TO_REGCLASS RFP80:$src, RFP32)>,
+ Requires<[FPStackf32]>;
+def : Pat<(f64 (fround RFP80:$src)), (COPY_TO_REGCLASS RFP80:$src, RFP64)>,
+ Requires<[FPStackf64]>;
diff --git a/contrib/llvm/lib/Target/X86/X86InstrFormats.td b/contrib/llvm/lib/Target/X86/X86InstrFormats.td
new file mode 100644
index 0000000..e2fa295
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrFormats.td
@@ -0,0 +1,948 @@
+//===-- X86InstrFormats.td - X86 Instruction Formats -------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// X86 Instruction Format Definitions.
+//
+
+// Format specifies the encoding used by the instruction. This is part of the
+// ad-hoc solution used to emit machine instruction encodings by our machine
+// code emitter.
+class Format<bits<7> val> {
+ bits<7> Value = val;
+}
+
+def Pseudo : Format<0>; def RawFrm : Format<1>;
+def AddRegFrm : Format<2>; def MRMDestReg : Format<3>;
+def MRMDestMem : Format<4>; def MRMSrcReg : Format<5>;
+def MRMSrcMem : Format<6>; def RawFrmMemOffs : Format<7>;
+def RawFrmSrc : Format<8>; def RawFrmDst : Format<9>;
+def RawFrmDstSrc: Format<10>;
+def RawFrmImm8 : Format<11>;
+def RawFrmImm16 : Format<12>;
+def MRMXr : Format<14>; def MRMXm : Format<15>;
+def MRM0r : Format<16>; def MRM1r : Format<17>; def MRM2r : Format<18>;
+def MRM3r : Format<19>; def MRM4r : Format<20>; def MRM5r : Format<21>;
+def MRM6r : Format<22>; def MRM7r : Format<23>;
+def MRM0m : Format<24>; def MRM1m : Format<25>; def MRM2m : Format<26>;
+def MRM3m : Format<27>; def MRM4m : Format<28>; def MRM5m : Format<29>;
+def MRM6m : Format<30>; def MRM7m : Format<31>;
+def MRM_C0 : Format<32>; def MRM_C1 : Format<33>; def MRM_C2 : Format<34>;
+def MRM_C3 : Format<35>; def MRM_C4 : Format<36>; def MRM_C5 : Format<37>;
+def MRM_C6 : Format<38>; def MRM_C7 : Format<39>; def MRM_C8 : Format<40>;
+def MRM_C9 : Format<41>; def MRM_CA : Format<42>; def MRM_CB : Format<43>;
+def MRM_CC : Format<44>; def MRM_CD : Format<45>; def MRM_CE : Format<46>;
+def MRM_CF : Format<47>; def MRM_D0 : Format<48>; def MRM_D1 : Format<49>;
+def MRM_D2 : Format<50>; def MRM_D3 : Format<51>; def MRM_D4 : Format<52>;
+def MRM_D5 : Format<53>; def MRM_D6 : Format<54>; def MRM_D7 : Format<55>;
+def MRM_D8 : Format<56>; def MRM_D9 : Format<57>; def MRM_DA : Format<58>;
+def MRM_DB : Format<59>; def MRM_DC : Format<60>; def MRM_DD : Format<61>;
+def MRM_DE : Format<62>; def MRM_DF : Format<63>; def MRM_E0 : Format<64>;
+def MRM_E1 : Format<65>; def MRM_E2 : Format<66>; def MRM_E3 : Format<67>;
+def MRM_E4 : Format<68>; def MRM_E5 : Format<69>; def MRM_E6 : Format<70>;
+def MRM_E7 : Format<71>; def MRM_E8 : Format<72>; def MRM_E9 : Format<73>;
+def MRM_EA : Format<74>; def MRM_EB : Format<75>; def MRM_EC : Format<76>;
+def MRM_ED : Format<77>; def MRM_EE : Format<78>; def MRM_EF : Format<79>;
+def MRM_F0 : Format<80>; def MRM_F1 : Format<81>; def MRM_F2 : Format<82>;
+def MRM_F3 : Format<83>; def MRM_F4 : Format<84>; def MRM_F5 : Format<85>;
+def MRM_F6 : Format<86>; def MRM_F7 : Format<87>; def MRM_F8 : Format<88>;
+def MRM_F9 : Format<89>; def MRM_FA : Format<90>; def MRM_FB : Format<91>;
+def MRM_FC : Format<92>; def MRM_FD : Format<93>; def MRM_FE : Format<94>;
+def MRM_FF : Format<95>;
+
+// ImmType - This specifies the immediate type used by an instruction. This is
+// part of the ad-hoc solution used to emit machine instruction encodings by our
+// machine code emitter.
+class ImmType<bits<4> val> {
+ bits<4> Value = val;
+}
+def NoImm : ImmType<0>;
+def Imm8 : ImmType<1>;
+def Imm8PCRel : ImmType<2>;
+def Imm16 : ImmType<3>;
+def Imm16PCRel : ImmType<4>;
+def Imm32 : ImmType<5>;
+def Imm32PCRel : ImmType<6>;
+def Imm32S : ImmType<7>;
+def Imm64 : ImmType<8>;
+
+// FPFormat - This specifies what form this FP instruction has. This is used by
+// the Floating-Point stackifier pass.
+class FPFormat<bits<3> val> {
+ bits<3> Value = val;
+}
+def NotFP : FPFormat<0>;
+def ZeroArgFP : FPFormat<1>;
+def OneArgFP : FPFormat<2>;
+def OneArgFPRW : FPFormat<3>;
+def TwoArgFP : FPFormat<4>;
+def CompareFP : FPFormat<5>;
+def CondMovFP : FPFormat<6>;
+def SpecialFP : FPFormat<7>;
+
+// Class specifying the SSE execution domain, used by the SSEDomainFix pass.
+// Keep in sync with tables in X86InstrInfo.cpp.
+class Domain<bits<2> val> {
+ bits<2> Value = val;
+}
+def GenericDomain : Domain<0>;
+def SSEPackedSingle : Domain<1>;
+def SSEPackedDouble : Domain<2>;
+def SSEPackedInt : Domain<3>;
+
+// Class specifying the vector form used to decompress the AVX-512
+// 8-bit displacement.
+class CD8VForm<bits<3> val> {
+ bits<3> Value = val;
+}
+def CD8VF : CD8VForm<0>; // v := VL
+def CD8VH : CD8VForm<1>; // v := VL/2
+def CD8VQ : CD8VForm<2>; // v := VL/4
+def CD8VO : CD8VForm<3>; // v := VL/8
+// The tuple (subvector) forms.
+def CD8VT1 : CD8VForm<4>; // v := 1
+def CD8VT2 : CD8VForm<5>; // v := 2
+def CD8VT4 : CD8VForm<6>; // v := 4
+def CD8VT8 : CD8VForm<7>; // v := 8
+
+// Class specifying the prefix used as an opcode extension.
+class Prefix<bits<3> val> {
+ bits<3> Value = val;
+}
+def NoPrfx : Prefix<0>;
+def PS : Prefix<1>;
+def PD : Prefix<2>;
+def XS : Prefix<3>;
+def XD : Prefix<4>;
+
+// Class specifying the opcode map.
+class Map<bits<3> val> {
+ bits<3> Value = val;
+}
+def OB : Map<0>;
+def TB : Map<1>;
+def T8 : Map<2>;
+def TA : Map<3>;
+def XOP8 : Map<4>;
+def XOP9 : Map<5>;
+def XOPA : Map<6>;
+
+// Class specifying the encoding
+class Encoding<bits<2> val> {
+ bits<2> Value = val;
+}
+def EncNormal : Encoding<0>;
+def EncVEX : Encoding<1>;
+def EncXOP : Encoding<2>;
+def EncEVEX : Encoding<3>;
+
+// Operand size for encodings that change based on mode.
+class OperandSize<bits<2> val> {
+ bits<2> Value = val;
+}
+def OpSizeFixed : OperandSize<0>; // Never needs a 0x66 prefix.
+def OpSize16 : OperandSize<1>; // Needs 0x66 prefix in 32-bit mode.
+def OpSize32 : OperandSize<2>; // Needs 0x66 prefix in 16-bit mode.
+
+// Address size for encodings that change based on mode.
+class AddressSize<bits<2> val> {
+ bits<2> Value = val;
+}
+def AdSizeX : AddressSize<0>; // Address size determined using addr operand.
+def AdSize16 : AddressSize<1>; // Encodes a 16-bit address.
+def AdSize32 : AddressSize<2>; // Encodes a 32-bit address.
+def AdSize64 : AddressSize<3>; // Encodes a 64-bit address.
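+
+// Illustrative note (not upstream): AdSize steers emission of the 0x67
+// address-size override prefix, e.g. an AdSize32 instruction assembled in
+// 16-bit mode gets the 0x67 prefix, mirroring how OpSize16/OpSize32 steer
+// the 0x66 operand-size prefix above.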
+
+// Prefix byte classes which are used to indicate to the ad-hoc machine code
+// emitter that various prefix bytes are required.
+class OpSize16 { OperandSize OpSize = OpSize16; }
+class OpSize32 { OperandSize OpSize = OpSize32; }
+class AdSize16 { AddressSize AdSize = AdSize16; }
+class AdSize32 { AddressSize AdSize = AdSize32; }
+class AdSize64 { AddressSize AdSize = AdSize64; }
+class REX_W { bit hasREX_WPrefix = 1; }
+class LOCK { bit hasLockPrefix = 1; }
+class REP { bit hasREPPrefix = 1; }
+class TB { Map OpMap = TB; }
+class T8 { Map OpMap = T8; }
+class TA { Map OpMap = TA; }
+class XOP8 { Map OpMap = XOP8; Prefix OpPrefix = PS; }
+class XOP9 { Map OpMap = XOP9; Prefix OpPrefix = PS; }
+class XOPA { Map OpMap = XOPA; Prefix OpPrefix = PS; }
+class OBXS { Prefix OpPrefix = XS; }
+class PS : TB { Prefix OpPrefix = PS; }
+class PD : TB { Prefix OpPrefix = PD; }
+class XD : TB { Prefix OpPrefix = XD; }
+class XS : TB { Prefix OpPrefix = XS; }
+class T8PS : T8 { Prefix OpPrefix = PS; }
+class T8PD : T8 { Prefix OpPrefix = PD; }
+class T8XD : T8 { Prefix OpPrefix = XD; }
+class T8XS : T8 { Prefix OpPrefix = XS; }
+class TAPS : TA { Prefix OpPrefix = PS; }
+class TAPD : TA { Prefix OpPrefix = PD; }
+class TAXD : TA { Prefix OpPrefix = XD; }
+class VEX { Encoding OpEnc = EncVEX; }
+class VEX_W { bit hasVEX_WPrefix = 1; }
+class VEX_4V : VEX { bit hasVEX_4V = 1; }
+class VEX_4VOp3 : VEX { bit hasVEX_4VOp3 = 1; }
+class VEX_I8IMM { bit hasVEX_i8ImmReg = 1; }
+class VEX_L { bit hasVEX_L = 1; }
+class VEX_LIG { bit ignoresVEX_L = 1; }
+class EVEX : VEX { Encoding OpEnc = EncEVEX; }
+class EVEX_4V : VEX_4V { Encoding OpEnc = EncEVEX; }
+class EVEX_K { bit hasEVEX_K = 1; }
+class EVEX_KZ : EVEX_K { bit hasEVEX_Z = 1; }
+class EVEX_B { bit hasEVEX_B = 1; }
+class EVEX_RC { bit hasEVEX_RC = 1; }
+class EVEX_V512 { bit hasEVEX_L2 = 1; bit hasVEX_L = 0; }
+class EVEX_V256 { bit hasEVEX_L2 = 0; bit hasVEX_L = 1; }
+class EVEX_V128 { bit hasEVEX_L2 = 0; bit hasVEX_L = 0; }
+
+// Specify AVX512 8-bit compressed displacement encoding based on the vector
+// element size in bits (8, 16, 32, 64) and the CDisp8 form.
+class EVEX_CD8<int esize, CD8VForm form> {
+ int CD8_EltSize = !srl(esize, 3);
+ bits<3> CD8_Form = form.Value;
+}
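+
+// Illustrative note (not upstream): EVEX_CD8<32, CD8VF> on a 512-bit
+// instruction gives CD8_EltSize = 4 bytes and the full-vector form, so the
+// disp8 scale computed below is 64 (the vector width in bytes); a one-byte
+// displacement then addresses multiples of 64.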
+
+class Has3DNow0F0FOpcode { bit has3DNow0F0FOpcode = 1; }
+class MemOp4 { bit hasMemOp4Prefix = 1; }
+class XOP { Encoding OpEnc = EncXOP; }
+class XOP_4V : XOP { bit hasVEX_4V = 1; }
+class XOP_4VOp3 : XOP { bit hasVEX_4VOp3 = 1; }
+
+class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
+ string AsmStr,
+ InstrItinClass itin,
+ Domain d = GenericDomain>
+ : Instruction {
+ let Namespace = "X86";
+
+ bits<8> Opcode = opcod;
+ Format Form = f;
+ bits<7> FormBits = Form.Value;
+ ImmType ImmT = i;
+
+ dag OutOperandList = outs;
+ dag InOperandList = ins;
+ string AsmString = AsmStr;
+
+ // If this is a pseudo instruction, mark it isCodeGenOnly.
+ let isCodeGenOnly = !eq(!cast<string>(f), "Pseudo");
+
+ let Itinerary = itin;
+
+ //
+ // Attributes specific to X86 instructions...
+ //
+ bit ForceDisassemble = 0; // Force instruction to disassemble even though it's
+                            // isCodeGenOnly. Needed to hide an ambiguous
+ // AsmString from the parser, but still disassemble.
+
+ OperandSize OpSize = OpSizeFixed; // Does this instruction's encoding change
+ // based on operand size of the mode?
+ bits<2> OpSizeBits = OpSize.Value;
+ AddressSize AdSize = AdSizeX; // Does this instruction's encoding change
+ // based on address size of the mode?
+ bits<2> AdSizeBits = AdSize.Value;
+
+ Prefix OpPrefix = NoPrfx; // Which prefix byte does this inst have?
+ bits<3> OpPrefixBits = OpPrefix.Value;
+ Map OpMap = OB; // Which opcode map does this inst have?
+ bits<3> OpMapBits = OpMap.Value;
+ bit hasREX_WPrefix = 0; // Does this inst require the REX.W prefix?
+ FPFormat FPForm = NotFP; // What flavor of FP instruction is this?
+ bit hasLockPrefix = 0; // Does this inst have a 0xF0 prefix?
+ Domain ExeDomain = d;
+ bit hasREPPrefix = 0; // Does this inst have a REP prefix?
+ Encoding OpEnc = EncNormal; // Encoding used by this instruction
+ bits<2> OpEncBits = OpEnc.Value;
+ bit hasVEX_WPrefix = 0; // Does this inst set the VEX_W field?
+ bit hasVEX_4V = 0; // Does this inst require the VEX.VVVV field?
+ bit hasVEX_4VOp3 = 0; // Does this inst require the VEX.VVVV field to
+ // encode the third operand?
+ bit hasVEX_i8ImmReg = 0; // Does this inst require the last source register
+                            // to be encoded in an immediate field?
+ bit hasVEX_L = 0; // Does this inst use large (256-bit) registers?
+  bit ignoresVEX_L = 0;     // Does this instruction ignore the L-bit?
+ bit hasEVEX_K = 0; // Does this inst require masking?
+ bit hasEVEX_Z = 0; // Does this inst set the EVEX_Z field?
+ bit hasEVEX_L2 = 0; // Does this inst set the EVEX_L2 field?
+ bit hasEVEX_B = 0; // Does this inst set the EVEX_B field?
+ bits<3> CD8_Form = 0; // Compressed disp8 form - vector-width.
+ // Declare it int rather than bits<4> so that all bits are defined when
+ // assigning to bits<7>.
+ int CD8_EltSize = 0; // Compressed disp8 form - element-size in bytes.
+  bit has3DNow0F0FOpcode = 0; // Wacky 3DNow! encoding?
+ bit hasMemOp4Prefix = 0; // Same bit as VEX_W, but used for swapping operands
+ bit hasEVEX_RC = 0; // Explicitly specified rounding control in FP instruction.
+
+ bits<2> EVEX_LL;
+ let EVEX_LL{0} = hasVEX_L;
+ let EVEX_LL{1} = hasEVEX_L2;
+ // Vector size in bytes.
+ bits<7> VectSize = !shl(16, EVEX_LL);
+
+ // The scaling factor for AVX512's compressed displacement is either
+ // - the size of a power-of-two number of elements or
+ // - the size of a single element for broadcasts or
+ // - the total vector size divided by a power-of-two number.
+ // Possible values are: 0 (non-AVX512 inst), 1, 2, 4, 8, 16, 32 and 64.
+ bits<7> CD8_Scale = !if (!eq (OpEnc.Value, EncEVEX.Value),
+ !if (CD8_Form{2},
+ !shl(CD8_EltSize, CD8_Form{1-0}),
+ !if (hasEVEX_B,
+ CD8_EltSize,
+ !srl(VectSize, CD8_Form{1-0}))), 0);
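+
+  // Worked examples (illustrative): EVEX_CD8<32, CD8VH> at 256 bits has
+  // CD8_Form{2} = 0, so CD8_Scale = VectSize >> 1 = 16; the tuple form
+  // EVEX_CD8<64, CD8VT2> has CD8_Form{2} = 1, so CD8_Scale =
+  // CD8_EltSize << 1 = 16; a non-tuple form with hasEVEX_B (broadcast) set
+  // scales by just CD8_EltSize.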
+
+ // TSFlags layout should be kept in sync with X86BaseInfo.h.
+ let TSFlags{6-0} = FormBits;
+ let TSFlags{8-7} = OpSizeBits;
+ let TSFlags{10-9} = AdSizeBits;
+ let TSFlags{13-11} = OpPrefixBits;
+ let TSFlags{16-14} = OpMapBits;
+ let TSFlags{17} = hasREX_WPrefix;
+ let TSFlags{21-18} = ImmT.Value;
+ let TSFlags{24-22} = FPForm.Value;
+ let TSFlags{25} = hasLockPrefix;
+ let TSFlags{26} = hasREPPrefix;
+ let TSFlags{28-27} = ExeDomain.Value;
+ let TSFlags{30-29} = OpEncBits;
+ let TSFlags{38-31} = Opcode;
+ let TSFlags{39} = hasVEX_WPrefix;
+ let TSFlags{40} = hasVEX_4V;
+ let TSFlags{41} = hasVEX_4VOp3;
+ let TSFlags{42} = hasVEX_i8ImmReg;
+ let TSFlags{43} = hasVEX_L;
+ let TSFlags{44} = ignoresVEX_L;
+ let TSFlags{45} = hasEVEX_K;
+ let TSFlags{46} = hasEVEX_Z;
+ let TSFlags{47} = hasEVEX_L2;
+ let TSFlags{48} = hasEVEX_B;
+ // If we run out of TSFlags bits, it's possible to encode this in 3 bits.
+ let TSFlags{55-49} = CD8_Scale;
+ let TSFlags{56} = has3DNow0F0FOpcode;
+ let TSFlags{57} = hasMemOp4Prefix;
+ let TSFlags{58} = hasEVEX_RC;
+}
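+
+// Illustrative note (not upstream): X86BaseInfo.h recovers these fields by
+// shifting and masking TSFlags, e.g. the 7-bit Format from bits 6-0 and
+// the 8-bit Opcode from bits 38-31, which is why the two layouts must be
+// kept in sync.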
+
+class PseudoI<dag oops, dag iops, list<dag> pattern>
+ : X86Inst<0, Pseudo, NoImm, oops, iops, "", NoItinerary> {
+ let Pattern = pattern;
+}
+
+class I<bits<8> o, Format f, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary,
+ Domain d = GenericDomain>
+ : X86Inst<o, f, NoImm, outs, ins, asm, itin, d> {
+ let Pattern = pattern;
+ let CodeSize = 3;
+}
+class Ii8 <bits<8> o, Format f, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary,
+ Domain d = GenericDomain>
+ : X86Inst<o, f, Imm8, outs, ins, asm, itin, d> {
+ let Pattern = pattern;
+ let CodeSize = 3;
+}
+class Ii8PCRel<bits<8> o, Format f, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : X86Inst<o, f, Imm8PCRel, outs, ins, asm, itin> {
+ let Pattern = pattern;
+ let CodeSize = 3;
+}
+class Ii16<bits<8> o, Format f, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : X86Inst<o, f, Imm16, outs, ins, asm, itin> {
+ let Pattern = pattern;
+ let CodeSize = 3;
+}
+class Ii32<bits<8> o, Format f, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : X86Inst<o, f, Imm32, outs, ins, asm, itin> {
+ let Pattern = pattern;
+ let CodeSize = 3;
+}
+class Ii32S<bits<8> o, Format f, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : X86Inst<o, f, Imm32S, outs, ins, asm, itin> {
+ let Pattern = pattern;
+ let CodeSize = 3;
+}
+
+class Ii16PCRel<bits<8> o, Format f, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : X86Inst<o, f, Imm16PCRel, outs, ins, asm, itin> {
+ let Pattern = pattern;
+ let CodeSize = 3;
+}
+
+class Ii32PCRel<bits<8> o, Format f, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : X86Inst<o, f, Imm32PCRel, outs, ins, asm, itin> {
+ let Pattern = pattern;
+ let CodeSize = 3;
+}
+
+// FPStack Instruction Templates:
+// FPI - Floating Point Instruction template.
+class FPI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, [], itin> {}
+
+// FpI_ - Floating Point Pseudo Instruction template. Not Predicated.
+class FpI_<dag outs, dag ins, FPFormat fp, list<dag> pattern,
+ InstrItinClass itin = NoItinerary>
+ : X86Inst<0, Pseudo, NoImm, outs, ins, "", itin> {
+ let FPForm = fp;
+ let Pattern = pattern;
+}
+
+// Templates for instructions that use a 16- or 32-bit segmented address as
+// their only operand: lcall (FAR CALL) and ljmp (FAR JMP)
+//
+// Iseg16 - 16-bit segment selector, 16-bit offset
+// Iseg32 - 16-bit segment selector, 32-bit offset
+
+class Iseg16 <bits<8> o, Format f, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : X86Inst<o, f, Imm16, outs, ins, asm, itin> {
+ let Pattern = pattern;
+ let CodeSize = 3;
+}
+
+class Iseg32 <bits<8> o, Format f, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : X86Inst<o, f, Imm32, outs, ins, asm, itin> {
+ let Pattern = pattern;
+ let CodeSize = 3;
+}
+
+// SI - SSE 1 & 2 scalar instructions
+class SI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary,
+ Domain d = GenericDomain>
+ : I<o, F, outs, ins, asm, pattern, itin, d> {
+ let Predicates = !if(!eq(OpEnc.Value, EncEVEX.Value), [HasAVX512],
+ !if(!eq(OpEnc.Value, EncVEX.Value), [UseAVX],
+ !if(!eq(OpPrefix.Value, XS.Value), [UseSSE1],
+ !if(!eq(OpPrefix.Value, XD.Value), [UseSSE2],
+ !if(!eq(OpPrefix.Value, PD.Value), [UseSSE2],
+ [UseSSE1])))));
+
+ // AVX instructions have a 'v' prefix in the mnemonic
+ let AsmString = !if(!eq(OpEnc.Value, EncEVEX.Value), !strconcat("v", asm),
+ !if(!eq(OpEnc.Value, EncVEX.Value), !strconcat("v", asm),
+ asm));
+}
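+
+// Illustrative note (not upstream): one SI definition serves SSE, AVX and
+// AVX-512. An SI instruction carrying the XS prefix gets
+// Predicates = [UseSSE1] and keeps its plain mnemonic, while the same
+// template tagged VEX or EVEX switches to [UseAVX] or [HasAVX512] and
+// gains the "v" mnemonic prefix via the AsmString !if chain above.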
+
+// SI_Int - SSE 1 & 2 scalar intrinsics - VEX form available on AVX512
+class SI_Int<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary,
+ Domain d = GenericDomain>
+ : I<o, F, outs, ins, asm, pattern, itin, d> {
+ let Predicates = !if(!eq(OpEnc.Value, EncEVEX.Value), [HasAVX512],
+ !if(!eq(OpEnc.Value, EncVEX.Value), [HasAVX],
+ !if(!eq(OpPrefix.Value, XS.Value), [UseSSE1],
+ !if(!eq(OpPrefix.Value, XD.Value), [UseSSE2],
+ !if(!eq(OpPrefix.Value, PD.Value), [UseSSE2],
+ [UseSSE1])))));
+
+ // AVX instructions have a 'v' prefix in the mnemonic
+ let AsmString = !if(!eq(OpEnc.Value, EncEVEX.Value), !strconcat("v", asm),
+ !if(!eq(OpEnc.Value, EncVEX.Value), !strconcat("v", asm),
+ asm));
+}
+// SIi8 - SSE 1 & 2 scalar instructions - VEX form available on AVX512
+class SIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern, itin> {
+ let Predicates = !if(!eq(OpEnc.Value, EncEVEX.Value), [HasAVX512],
+ !if(!eq(OpEnc.Value, EncVEX.Value), [HasAVX],
+ !if(!eq(OpPrefix.Value, XS.Value), [UseSSE1],
+ [UseSSE2])));
+
+ // AVX instructions have a 'v' prefix in the mnemonic
+ let AsmString = !if(!eq(OpEnc.Value, EncEVEX.Value), !strconcat("v", asm),
+ !if(!eq(OpEnc.Value, EncVEX.Value), !strconcat("v", asm),
+ asm));
+}
+
+// PI - SSE 1 & 2 packed instructions
+class PI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern,
+ InstrItinClass itin, Domain d>
+ : I<o, F, outs, ins, asm, pattern, itin, d> {
+ let Predicates = !if(!eq(OpEnc.Value, EncEVEX.Value), [HasAVX512],
+ !if(!eq(OpEnc.Value, EncVEX.Value), [HasAVX],
+ !if(!eq(OpPrefix.Value, PD.Value), [UseSSE2],
+ [UseSSE1])));
+
+ // AVX instructions have a 'v' prefix in the mnemonic
+ let AsmString = !if(!eq(OpEnc.Value, EncEVEX.Value), !strconcat("v", asm),
+ !if(!eq(OpEnc.Value, EncVEX.Value), !strconcat("v", asm),
+ asm));
+}
+
+// MMXPI - SSE 1 & 2 packed instructions with MMX operands
+class MMXPI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern,
+ InstrItinClass itin, Domain d>
+ : I<o, F, outs, ins, asm, pattern, itin, d> {
+ let Predicates = !if(!eq(OpPrefix.Value, PD.Value), [HasSSE2],
+ [HasSSE1]);
+}
+
+// PIi8 - SSE 1 & 2 packed instructions with immediate
+class PIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin, Domain d>
+ : Ii8<o, F, outs, ins, asm, pattern, itin, d> {
+ let Predicates = !if(!eq(OpEnc.Value, EncEVEX.Value), [HasAVX512],
+ !if(!eq(OpEnc.Value, EncVEX.Value), [HasAVX],
+ !if(!eq(OpPrefix.Value, PD.Value), [UseSSE2],
+ [UseSSE1])));
+
+ // AVX instructions have a 'v' prefix in the mnemonic
+ let AsmString = !if(!eq(OpEnc.Value, EncEVEX.Value), !strconcat("v", asm),
+ !if(!eq(OpEnc.Value, EncVEX.Value), !strconcat("v", asm),
+ asm));
+}
+
+// SSE1 Instruction Templates:
+//
+// SSI - SSE1 instructions with XS prefix.
+// PSI - SSE1 instructions with PS prefix.
+// PSIi8 - SSE1 instructions with ImmT == Imm8 and PS prefix.
+// VSSI - SSE1 instructions with XS prefix in AVX form.
+// VPSI - SSE1 instructions with PS prefix in AVX form, packed single.
+
+class SSI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[UseSSE1]>;
+class SSIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[UseSSE1]>;
+class PSI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, PS,
+ Requires<[UseSSE1]>;
+class PSIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, PS,
+ Requires<[UseSSE1]>;
+class VSSI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, XS,
+ Requires<[HasAVX]>;
+class VPSI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin, SSEPackedSingle>, PS,
+ Requires<[HasAVX]>;
+
+// SSE2 Instruction Templates:
+//
+// SDI - SSE2 instructions with XD prefix.
+// SDIi8 - SSE2 instructions with ImmT == Imm8 and XD prefix.
+// S2SI - SSE2 instructions with XS prefix.
+// SSDIi8 - SSE2 instructions with ImmT == Imm8 and XS prefix.
+// PDI - SSE2 instructions with PD prefix, packed double domain.
+// PDIi8 - SSE2 instructions with ImmT == Imm8 and PD prefix.
+// VSDI - SSE2 scalar instructions with XD prefix in AVX form.
+// VS2SI - SSE2 instructions with XS prefix in AVX form.
+// VPDI - SSE2 vector instructions with PD prefix in AVX form,
+// packed double domain.
+// VS2I - SSE2 scalar instructions with PD prefix in AVX form.
+// S2I - SSE2 scalar instructions with PD prefix.
+// MMXSDIi8 - SSE2 instructions with ImmT == Imm8 and XD prefix as well as
+// MMX operands.
+// MMXS2SIi8 - SSE2 instructions with ImmT == Imm8 and XS prefix as well as
+// MMX operands.
+
+class SDI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[UseSSE2]>;
+class SDIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[UseSSE2]>;
+class S2SI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[UseSSE2]>;
+class S2SIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+  : Ii8<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[UseSSE2]>;
+class PDI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, PD,
+ Requires<[UseSSE2]>;
+class PDIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, PD,
+ Requires<[UseSSE2]>;
+class VSDI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, XD,
+ Requires<[UseAVX]>;
+class VS2SI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, XS,
+ Requires<[HasAVX]>;
+class VPDI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin, SSEPackedDouble>,
+ PD, Requires<[HasAVX]>;
+class VS2I<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, PD,
+ Requires<[UseAVX]>;
+class S2I<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin>, PD, Requires<[UseSSE2]>;
+class MMXSDIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[HasSSE2]>;
+class MMXS2SIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+  : Ii8<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[HasSSE2]>;
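+
+// A hypothetical sketch: the same op declared with PDI instead picks up the
+// PD prefix, the SSEPackedDouble domain and the UseSSE2 predicate
+// (illustrative name/opcode only):
+//   def FOOPDrr : PDI<0x58, MRMSrcReg, (outs VR128:$dst),
+//                     (ins VR128:$src1, VR128:$src2),
+//                     "foopd\t{$src2, $dst|$dst, $src2}",
+//                     [(set VR128:$dst, (fadd VR128:$src1, VR128:$src2))]>;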
+
+// SSE3 Instruction Templates:
+//
+// S3I - SSE3 instructions with PD prefixes.
+// S3SI - SSE3 instructions with XS prefix.
+// S3DI - SSE3 instructions with XD prefix.
+
+class S3SI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, XS,
+ Requires<[UseSSE3]>;
+class S3DI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, XD,
+ Requires<[UseSSE3]>;
+class S3I<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, PD,
+ Requires<[UseSSE3]>;
+
+
+// SSSE3 Instruction Templates:
+//
+// SS38I - SSSE3 instructions with T8 prefix.
+// SS3AI - SSSE3 instructions with TA prefix.
+// MMXSS38I - SSSE3 instructions with T8 prefix and MMX operands.
+// MMXSS3AI - SSSE3 instructions with TA prefix and MMX operands.
+//
+// Note: SSSE3 instructions have 64-bit and 128-bit versions. The 64-bit
+// versions use the MMX registers and are grouped with the MMX classes; they
+// need to remain enabled even when AVX is enabled.
+
+class SS38I<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8PD,
+ Requires<[UseSSSE3]>;
+class SS3AI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD,
+ Requires<[UseSSSE3]>;
+class MMXSS38I<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8PS,
+ Requires<[HasSSSE3]>;
+class MMXSS3AI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPS,
+ Requires<[HasSSSE3]>;
+
+// SSE4.1 Instruction Templates:
+//
+// SS48I - SSE 4.1 instructions with T8 prefix.
+// SS4AIi8 - SSE 4.1 instructions with TA prefix and ImmT == Imm8.
+//
+class SS48I<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8PD,
+ Requires<[UseSSE41]>;
+class SS4AIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD,
+ Requires<[UseSSE41]>;
+
+// SSE4.2 Instruction Templates:
+//
+// SS428I - SSE 4.2 instructions with T8 prefix.
+class SS428I<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8PD,
+ Requires<[UseSSE42]>;
+
+// SS42FI - SSE 4.2 instructions with T8XD prefix.
+// NOTE: 'HasSSE42' is used because SS42FI is only used for the CRC32 insns.
+class SS42FI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin>, T8XD, Requires<[HasSSE42]>;
+
+// SS42AI - SSE 4.2 instructions with TA prefix.
+class SS42AI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD,
+ Requires<[UseSSE42]>;
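+
+// A hypothetical sketch of SS42AI in use (illustrative name/opcode only):
+// the TAPD prefix, Imm8 immediate and UseSSE42 predicate all come from the
+// template:
+//   def FOOSTRIrr : SS42AI<0x63, MRMSrcReg, (outs),
+//                     (ins VR128:$src1, VR128:$src2, u8imm:$src3),
+//                     "foostri\t{$src3, $src2, $src1|$src1, $src2, $src3}",
+//                     []>;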
+
+// AVX Instruction Templates:
+// Instructions introduced in AVX (no SSE equivalent forms)
+//
+// AVX8I - AVX instructions with T8PD prefix.
+// AVXAIi8 - AVX instructions with TAPD prefix and ImmT = Imm8.
+class AVX8I<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8PD,
+ Requires<[HasAVX]>;
+class AVXAIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD,
+ Requires<[HasAVX]>;
+
+// AVX2 Instruction Templates:
+// Instructions introduced in AVX2 (no SSE equivalent forms)
+//
+// AVX28I - AVX2 instructions with T8PD prefix.
+// AVX2AIi8 - AVX2 instructions with TAPD prefix and ImmT = Imm8.
+class AVX28I<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8PD,
+ Requires<[HasAVX2]>;
+class AVX2AIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD,
+ Requires<[HasAVX2]>;
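+
+// A hypothetical sketch: an AVX2-only integer op differs from its AVX
+// counterpart only in the predicate the template supplies (illustrative
+// name/opcode only):
+//   def VPFOOYrr : AVX28I<0x47, MRMSrcReg, (outs VR256:$dst),
+//                     (ins VR256:$src1, VR256:$src2),
+//                     "vpfoo\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+//                     []>, VEX_4V, VEX_L;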
+
+
+// AVX-512 Instruction Templates:
+// Instructions introduced in AVX-512 (no SSE equivalent forms)
+//
+// AVX5128I - AVX-512 instructions with T8PD prefix.
+// AVX512AIi8 - AVX-512 instructions with TAPD prefix and ImmT = Imm8.
+// AVX512PDI - AVX-512 instructions with PD, double packed.
+// AVX512PSI - AVX-512 instructions with PS, single packed.
+// AVX512XS8I - AVX-512 instructions with T8 and XS prefixes.
+// AVX512XSI - AVX-512 instructions with XS prefix, generic domain.
+// AVX512BI - AVX-512 instructions with PD, int packed domain.
+// AVX512XDI - AVX-512 instructions with XD prefix, int packed domain.
+
+class AVX5128I<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8PD,
+ Requires<[HasAVX512]>;
+class AVX5128IBase : T8PD {
+ Domain ExeDomain = SSEPackedInt;
+}
+class AVX512XS8I<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8XS,
+ Requires<[HasAVX512]>;
+class AVX512XSI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin>, XS,
+ Requires<[HasAVX512]>;
+class AVX512XDI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, XD,
+ Requires<[HasAVX512]>;
+class AVX512BI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, PD,
+ Requires<[HasAVX512]>;
+class AVX512BIBase : PD {
+ Domain ExeDomain = SSEPackedInt;
+}
+class AVX512BIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, PD,
+ Requires<[HasAVX512]>;
+class AVX512BIi8Base : PD {
+ Domain ExeDomain = SSEPackedInt;
+ ImmType ImmT = Imm8;
+}
+class AVX512XSIi8Base : XS {
+ Domain ExeDomain = SSEPackedInt;
+ ImmType ImmT = Imm8;
+}
+class AVX512XDIi8Base : XD {
+ Domain ExeDomain = SSEPackedInt;
+ ImmType ImmT = Imm8;
+}
+class AVX512PSIi8Base : PS {
+ Domain ExeDomain = SSEPackedSingle;
+ ImmType ImmT = Imm8;
+}
+class AVX512PDIi8Base : PD {
+ Domain ExeDomain = SSEPackedDouble;
+ ImmType ImmT = Imm8;
+}
+class AVX512AIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD,
+ Requires<[HasAVX512]>;
+class AVX512AIi8Base : TAPD {
+ Domain ExeDomain = SSEPackedInt;
+ ImmType ImmT = Imm8;
+}
+class AVX512Ii8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>,
+ Requires<[HasAVX512]>;
+class AVX512PDI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, PD,
+ Requires<[HasAVX512]>;
+class AVX512PSI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, PS,
+ Requires<[HasAVX512]>;
+class AVX512PIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, Domain d, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern, itin, d>, Requires<[HasAVX512]>;
+class AVX512PI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, Domain d, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin, d>, Requires<[HasAVX512]>;
+class AVX512FMA3<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag>pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin>, T8PD,
+ EVEX_4V, Requires<[HasAVX512]>;
+class AVX512FMA3Base : T8PD, EVEX_4V;
+
+class AVX512<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag>pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin>, Requires<[HasAVX512]>;
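+
+// A hypothetical sketch: the catch-all AVX512 template suits instructions
+// (e.g. mask-register ops) that fit no packed domain; only HasAVX512 is
+// implied (illustrative name/opcode only):
+//   def KFOOWrr : AVX512<0x41, MRMSrcReg, (outs VK16:$dst),
+//                     (ins VK16:$src1, VK16:$src2),
+//                     "kfoow\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+//                     []>, VEX_4V, VEX_L, PS;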
+
+// AES Instruction Templates:
+//
+// AES8I - AES instructions with T8PD prefix.
+// AESAI - AES instructions with TAPD prefix and ImmT == Imm8.
+// These use the same encodings as the SSE4.2 T8 and TA forms.
+class AES8I<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag>pattern, InstrItinClass itin = IIC_AES>
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8PD,
+ Requires<[HasAES]>;
+
+class AESAI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD,
+ Requires<[HasAES]>;
+
+// PCLMUL Instruction Templates
+class PCLMULIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag>pattern, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD,
+ Requires<[HasPCLMUL]>;
+
+class AVXPCLMULIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag>pattern, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD,
+ VEX_4V, Requires<[HasAVX, HasPCLMUL]>;
+
+// FMA3 Instruction Templates
+class FMA3<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag>pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin>, T8PD,
+ VEX_4V, FMASC, Requires<[HasFMA]>;
+
+// FMA4 Instruction Templates
+class FMA4<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag>pattern, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern, itin>, TAPD,
+ VEX_4V, VEX_I8IMM, FMASC, Requires<[HasFMA4]>;
+
+// XOP 2, 3 and 4 Operand Instruction Template
+class IXOP<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>,
+ XOP9, Requires<[HasXOP]>;
+
+// XOP 2, 3 and 4 Operand Instruction Templates with imm byte
+class IXOPi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>,
+ XOP8, Requires<[HasXOP]>;
+
+// XOP 5 operand instruction (VEX encoding!)
+class IXOP5<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag>pattern, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD,
+ VEX_4V, VEX_I8IMM, Requires<[HasXOP]>;
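+
+// A hypothetical sketch: IXOPi8 already carries the XOP8 map, the
+// SSEPackedDouble domain and the HasXOP predicate, so a rotate-by-immediate
+// reduces to (illustrative name/opcode only):
+//   def VPFOOri : IXOPi8<0xC0, MRMSrcReg, (outs VR128:$dst),
+//                     (ins VR128:$src, u8imm:$imm),
+//                     "vpfoo\t{$imm, $src, $dst|$dst, $src, $imm}",
+//                     [(set VR128:$dst,
+//                        (X86vproti VR128:$src, (i8 imm:$imm)))]>;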
+
+// X86-64 Instruction templates...
+//
+
+class RI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin>, REX_W;
+class RIi8 <bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern, itin>, REX_W;
+class RIi16 <bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : Ii16<o, F, outs, ins, asm, pattern, itin>, REX_W;
+class RIi32 <bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : Ii32<o, F, outs, ins, asm, pattern, itin>, REX_W;
+class RIi32S <bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : Ii32S<o, F, outs, ins, asm, pattern, itin>, REX_W;
+
+class RIi64<bits<8> o, Format f, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : X86Inst<o, f, Imm64, outs, ins, asm, itin>, REX_W {
+ let Pattern = pattern;
+ let CodeSize = 3;
+}
+
+class RIi64_NOREX<bits<8> o, Format f, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : X86Inst<o, f, Imm64, outs, ins, asm, itin> {
+ let Pattern = pattern;
+ let CodeSize = 3;
+}
+
+class RS2I<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : S2I<o, F, outs, ins, asm, pattern, itin>, REX_W;
+class VRS2I<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : VS2I<o, F, outs, ins, asm, pattern, itin>, VEX_W;
+
+// MMX Instruction templates
+//
+
+// MMXI - MMX instructions with TB prefix.
+// MMXI32 - MMX instructions with TB prefix valid only in 32-bit mode.
+// MMXI64 - MMX instructions with TB prefix valid only in 64-bit mode.
+// MMXRI - MMX instructions with TB prefix and REX.W.
+// MMX2I - MMX / SSE2 instructions with PD prefix.
+// MMXIi8 - MMX instructions with ImmT == Imm8 and PS prefix.
+// MMXID - MMX instructions with XD prefix.
+// MMXIS - MMX instructions with XS prefix.
+class MMXI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin>, PS, Requires<[HasMMX]>;
+class MMXI32<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin>, PS, Requires<[HasMMX,Not64BitMode]>;
+class MMXI64<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin>, PS, Requires<[HasMMX,In64BitMode]>;
+class MMXRI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin>, PS, REX_W, Requires<[HasMMX]>;
+class MMX2I<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin>, PD, Requires<[HasMMX]>;
+class MMXIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern, itin>, PS, Requires<[HasMMX]>;
+class MMXID<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[HasMMX]>;
+class MMXIS<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[HasMMX]>;
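+
+// A hypothetical sketch: a plain two-operand MMX ALU op built on MMXI needs
+// only its opcode and operands; PS and HasMMX come from the template
+// (illustrative name/opcode only):
+//   def MMX_PFOOrr : MMXI<0xFC, MRMSrcReg, (outs VR64:$dst),
+//                     (ins VR64:$src1, VR64:$src2),
+//                     "pfoo\t{$src2, $dst|$dst, $src2}", []>;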
diff --git a/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
new file mode 100644
index 0000000..829cedd
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -0,0 +1,1037 @@
+//===-- X86InstrFragmentsSIMD.td - x86 SIMD ISA ------------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides pattern fragments useful for SIMD instructions.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MMX specific DAG Nodes.
+//===----------------------------------------------------------------------===//
+
+// Low word of MMX to GPR.
+def MMX_X86movd2w : SDNode<"X86ISD::MMX_MOVD2W", SDTypeProfile<1, 1,
+ [SDTCisVT<0, i32>, SDTCisVT<1, x86mmx>]>>;
+// GPR to low word of MMX.
+def MMX_X86movw2d : SDNode<"X86ISD::MMX_MOVW2D", SDTypeProfile<1, 1,
+ [SDTCisVT<0, x86mmx>, SDTCisVT<1, i32>]>>;
+
+//===----------------------------------------------------------------------===//
+// MMX Pattern Fragments
+//===----------------------------------------------------------------------===//
+
+def load_mmx : PatFrag<(ops node:$ptr), (x86mmx (load node:$ptr))>;
+def load_mvmmx : PatFrag<(ops node:$ptr),
+ (x86mmx (MMX_X86movw2d (load node:$ptr)))>;
+def bc_mmx : PatFrag<(ops node:$in), (x86mmx (bitconvert node:$in))>;
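+
+// A hypothetical sketch of these fragments in use: the memory form of an MMX
+// instruction matches its operand as an x86mmx-typed load, e.g. with a
+// pattern along the lines of
+//   [(set VR64:$dst, (int_x86_mmx_padd_b VR64:$src1,
+//                                        (load_mmx addr:$src2)))]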
+
+//===----------------------------------------------------------------------===//
+// SSE specific DAG Nodes.
+//===----------------------------------------------------------------------===//
+
+def SDTX86VFCMP : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<1, 2>,
+ SDTCisFP<1>, SDTCisVT<3, i8>,
+ SDTCisVec<1>]>;
+def SDTX86CmpTestSae : SDTypeProfile<1, 3, [SDTCisVT<0, i32>,
+ SDTCisSameAs<1, 2>, SDTCisInt<3>]>;
+
+def X86fmin : SDNode<"X86ISD::FMIN", SDTFPBinOp>;
+def X86fmax : SDNode<"X86ISD::FMAX", SDTFPBinOp>;
+
+// Commutative and Associative FMIN and FMAX.
+def X86fminc : SDNode<"X86ISD::FMINC", SDTFPBinOp,
+ [SDNPCommutative, SDNPAssociative]>;
+def X86fmaxc : SDNode<"X86ISD::FMAXC", SDTFPBinOp,
+ [SDNPCommutative, SDNPAssociative]>;
+
+def X86fand : SDNode<"X86ISD::FAND", SDTFPBinOp,
+ [SDNPCommutative, SDNPAssociative]>;
+def X86for : SDNode<"X86ISD::FOR", SDTFPBinOp,
+ [SDNPCommutative, SDNPAssociative]>;
+def X86fxor : SDNode<"X86ISD::FXOR", SDTFPBinOp,
+ [SDNPCommutative, SDNPAssociative]>;
+def X86fandn : SDNode<"X86ISD::FANDN", SDTFPBinOp,
+ [SDNPCommutative, SDNPAssociative]>;
+def X86frsqrt : SDNode<"X86ISD::FRSQRT", SDTFPUnaryOp>;
+def X86frcp : SDNode<"X86ISD::FRCP", SDTFPUnaryOp>;
+def X86frsqrt14s: SDNode<"X86ISD::FRSQRT", SDTFPBinOp>;
+def X86frcp14s : SDNode<"X86ISD::FRCP", SDTFPBinOp>;
+def X86fgetsign: SDNode<"X86ISD::FGETSIGNx86",SDTFPToIntOp>;
+def X86fhadd : SDNode<"X86ISD::FHADD", SDTFPBinOp>;
+def X86fhsub : SDNode<"X86ISD::FHSUB", SDTFPBinOp>;
+def X86hadd : SDNode<"X86ISD::HADD", SDTIntBinOp>;
+def X86hsub : SDNode<"X86ISD::HSUB", SDTIntBinOp>;
+def X86comi : SDNode<"X86ISD::COMI", SDTX86CmpTest>;
+def X86comiSae : SDNode<"X86ISD::COMI", SDTX86CmpTestSae>;
+def X86ucomi : SDNode<"X86ISD::UCOMI", SDTX86CmpTest>;
+def X86ucomiSae: SDNode<"X86ISD::UCOMI", SDTX86CmpTestSae>;
+def X86cmps : SDNode<"X86ISD::FSETCC", SDTX86Cmps>;
+//def X86cmpsd : SDNode<"X86ISD::FSETCCsd", SDTX86Cmpsd>;
+def X86cvtdq2pd: SDNode<"X86ISD::CVTDQ2PD",
+ SDTypeProfile<1, 1, [SDTCisVT<0, v2f64>,
+ SDTCisVT<1, v4i32>]>>;
+def X86cvtudq2pd: SDNode<"X86ISD::CVTUDQ2PD",
+ SDTypeProfile<1, 1, [SDTCisVT<0, v2f64>,
+ SDTCisVT<1, v4i32>]>>;
+def X86pshufb : SDNode<"X86ISD::PSHUFB",
+ SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i8>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>]>>;
+def X86psadbw : SDNode<"X86ISD::PSADBW",
+ SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i64>,
+ SDTCVecEltisVT<1, i8>,
+ SDTCisSameSizeAs<0,1>,
+ SDTCisSameAs<1,2>]>>;
+def X86dbpsadbw : SDNode<"X86ISD::DBPSADBW",
+ SDTypeProfile<1, 3, [SDTCVecEltisVT<0, i16>,
+ SDTCVecEltisVT<1, i8>,
+ SDTCisSameSizeAs<0,1>,
+ SDTCisSameAs<1,2>, SDTCisInt<3>]>>;
+def X86andnp : SDNode<"X86ISD::ANDNP",
+ SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>]>>;
+def X86psign : SDNode<"X86ISD::PSIGN",
+ SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>]>>;
+def X86pextrb : SDNode<"X86ISD::PEXTRB",
+ SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisVT<1, v16i8>,
+ SDTCisPtrTy<2>]>>;
+def X86pextrw : SDNode<"X86ISD::PEXTRW",
+ SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisVT<1, v8i16>,
+ SDTCisPtrTy<2>]>>;
+def X86pinsrb : SDNode<"X86ISD::PINSRB",
+ SDTypeProfile<1, 3, [SDTCisVT<0, v16i8>, SDTCisSameAs<0,1>,
+ SDTCisVT<2, i32>, SDTCisPtrTy<3>]>>;
+def X86pinsrw : SDNode<"X86ISD::PINSRW",
+ SDTypeProfile<1, 3, [SDTCisVT<0, v8i16>, SDTCisSameAs<0,1>,
+ SDTCisVT<2, i32>, SDTCisPtrTy<3>]>>;
+def X86insertps : SDNode<"X86ISD::INSERTPS",
+ SDTypeProfile<1, 3, [SDTCisVT<0, v4f32>, SDTCisSameAs<0,1>,
+ SDTCisVT<2, v4f32>, SDTCisVT<3, i8>]>>;
+def X86vzmovl : SDNode<"X86ISD::VZEXT_MOVL",
+ SDTypeProfile<1, 1, [SDTCisSameAs<0,1>]>>;
+
+def X86vzload : SDNode<"X86ISD::VZEXT_LOAD", SDTLoad,
+ [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+
+def X86vzext : SDNode<"X86ISD::VZEXT",
+ SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCisInt<0>, SDTCisInt<1>,
+ SDTCisOpSmallerThanOp<1, 0>]>>;
+
+def X86vsext : SDNode<"X86ISD::VSEXT",
+ SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCisInt<0>, SDTCisInt<1>,
+ SDTCisOpSmallerThanOp<1, 0>]>>;
+
+def SDTVtrunc : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCisInt<0>, SDTCisInt<1>,
+ SDTCisOpSmallerThanOp<0, 1>]>;
+
+def X86vtrunc : SDNode<"X86ISD::VTRUNC", SDTVtrunc>;
+def X86vtruncs : SDNode<"X86ISD::VTRUNCS", SDTVtrunc>;
+def X86vtruncus : SDNode<"X86ISD::VTRUNCUS", SDTVtrunc>;
+
+def X86trunc : SDNode<"X86ISD::TRUNC",
+ SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisInt<1>,
+ SDTCisOpSmallerThanOp<0, 1>]>>;
+def X86vfpext : SDNode<"X86ISD::VFPEXT",
+ SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCisFP<0>, SDTCisFP<1>,
+ SDTCisOpSmallerThanOp<1, 0>]>>;
+def X86vfpround: SDNode<"X86ISD::VFPROUND",
+ SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCisFP<0>, SDTCisFP<1>,
+ SDTCisOpSmallerThanOp<0, 1>]>>;
+
+def X86fround: SDNode<"X86ISD::VFPROUND",
+ SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisFP<1>,SDTCisFP<2>,
+ SDTCVecEltisVT<0, f32>,
+ SDTCVecEltisVT<1, f64>,
+ SDTCVecEltisVT<2, f64>,
+ SDTCisOpSmallerThanOp<0, 1>]>>;
+def X86froundRnd: SDNode<"X86ISD::VFPROUND",
+ SDTypeProfile<1, 3, [SDTCisFP<0>, SDTCisFP<1>,SDTCisFP<2>,
+ SDTCVecEltisVT<0, f32>,
+ SDTCVecEltisVT<1, f64>,
+ SDTCVecEltisVT<2, f64>,
+ SDTCisOpSmallerThanOp<0, 1>,
+ SDTCisInt<3>]>>;
+
+def X86fpext : SDNode<"X86ISD::VFPEXT",
+ SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisFP<1>,SDTCisFP<2>,
+ SDTCVecEltisVT<0, f64>,
+ SDTCVecEltisVT<1, f32>,
+ SDTCVecEltisVT<2, f32>,
+ SDTCisOpSmallerThanOp<1, 0>]>>;
+
+def X86fpextRnd : SDNode<"X86ISD::VFPEXT",
+ SDTypeProfile<1, 3, [SDTCisFP<0>, SDTCisFP<1>,SDTCisFP<2>,
+ SDTCVecEltisVT<0, f64>,
+ SDTCVecEltisVT<1, f32>,
+ SDTCVecEltisVT<2, f32>,
+ SDTCisOpSmallerThanOp<1, 0>,
+ SDTCisInt<3>]>>;
+
+def X86vshldq : SDNode<"X86ISD::VSHLDQ", SDTIntShiftOp>;
+def X86vshrdq : SDNode<"X86ISD::VSRLDQ", SDTIntShiftOp>;
+def X86cmpp : SDNode<"X86ISD::CMPP", SDTX86VFCMP>;
+def X86pcmpeq : SDNode<"X86ISD::PCMPEQ", SDTIntBinOp, [SDNPCommutative]>;
+def X86pcmpgt : SDNode<"X86ISD::PCMPGT", SDTIntBinOp>;
+
+def X86IntCmpMask : SDTypeProfile<1, 2,
+ [SDTCisVec<0>, SDTCisSameAs<1, 2>, SDTCisInt<1>]>;
+def X86pcmpeqm : SDNode<"X86ISD::PCMPEQM", X86IntCmpMask, [SDNPCommutative]>;
+def X86pcmpgtm : SDNode<"X86ISD::PCMPGTM", X86IntCmpMask>;
+
+def X86CmpMaskCC :
+ SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCVecEltisVT<0, i1>,
+ SDTCisVec<1>, SDTCisSameAs<2, 1>,
+ SDTCisSameNumEltsAs<0, 1>, SDTCisVT<3, i8>]>;
+def X86CmpMaskCCRound :
+ SDTypeProfile<1, 4, [SDTCisVec<0>,SDTCVecEltisVT<0, i1>,
+ SDTCisVec<1>, SDTCisSameAs<2, 1>,
+ SDTCisSameNumEltsAs<0, 1>, SDTCisVT<3, i8>,
+ SDTCisInt<4>]>;
+def X86CmpMaskCCScalar :
+ SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>;
+
+def X86CmpMaskCCScalarRound :
+ SDTypeProfile<1, 4, [SDTCisInt<0>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>,
+ SDTCisInt<4>]>;
+
+def X86cmpm : SDNode<"X86ISD::CMPM", X86CmpMaskCC>;
+def X86cmpmRnd : SDNode<"X86ISD::CMPM_RND", X86CmpMaskCCRound>;
+def X86cmpmu : SDNode<"X86ISD::CMPMU", X86CmpMaskCC>;
+def X86cmpms : SDNode<"X86ISD::FSETCC", X86CmpMaskCCScalar>;
+def X86cmpmsRnd : SDNode<"X86ISD::FSETCC", X86CmpMaskCCScalarRound>;
+
+def X86vshl : SDNode<"X86ISD::VSHL",
+ SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisVec<2>]>>;
+def X86vsrl : SDNode<"X86ISD::VSRL",
+ SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisVec<2>]>>;
+def X86vsra : SDNode<"X86ISD::VSRA",
+ SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisVec<2>]>>;
+
+def X86vshli : SDNode<"X86ISD::VSHLI", SDTIntShiftOp>;
+def X86vsrli : SDNode<"X86ISD::VSRLI", SDTIntShiftOp>;
+def X86vsrai : SDNode<"X86ISD::VSRAI", SDTIntShiftOp>;
+
+def X86vprot : SDNode<"X86ISD::VPROT",
+ SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>]>>;
+def X86vproti : SDNode<"X86ISD::VPROTI",
+ SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisVT<2, i8>]>>;
+
+def X86vpshl : SDNode<"X86ISD::VPSHL",
+ SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>]>>;
+def X86vpsha : SDNode<"X86ISD::VPSHA",
+ SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>]>>;
+
+def X86vpcom : SDNode<"X86ISD::VPCOM",
+ SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>,
+ SDTCisVT<3, i8>]>>;
+def X86vpcomu : SDNode<"X86ISD::VPCOMU",
+ SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>,
+ SDTCisVT<3, i8>]>>;
+
+def SDTX86CmpPTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>,
+ SDTCisVec<1>,
+ SDTCisSameAs<2, 1>]>;
+def X86addus : SDNode<"X86ISD::ADDUS", SDTIntBinOp>;
+def X86subus : SDNode<"X86ISD::SUBUS", SDTIntBinOp>;
+def X86adds : SDNode<"X86ISD::ADDS", SDTIntBinOp>;
+def X86subs : SDNode<"X86ISD::SUBS", SDTIntBinOp>;
+def X86mulhrs : SDNode<"X86ISD::MULHRS" , SDTIntBinOp>;
+def X86avg : SDNode<"X86ISD::AVG" , SDTIntBinOp>;
+def X86ptest : SDNode<"X86ISD::PTEST", SDTX86CmpPTest>;
+def X86testp : SDNode<"X86ISD::TESTP", SDTX86CmpPTest>;
+def X86kortest : SDNode<"X86ISD::KORTEST", SDTX86CmpPTest>;
+def X86ktest : SDNode<"X86ISD::KTEST", SDTX86CmpPTest>;
+def X86testm : SDNode<"X86ISD::TESTM", SDTypeProfile<1, 2, [SDTCisVec<0>,
+ SDTCisVec<1>, SDTCisSameAs<2, 1>,
+ SDTCVecEltisVT<0, i1>,
+ SDTCisSameNumEltsAs<0, 1>]>>;
+def X86testnm : SDNode<"X86ISD::TESTNM", SDTypeProfile<1, 2, [SDTCisVec<0>,
+ SDTCisVec<1>, SDTCisSameAs<2, 1>,
+ SDTCVecEltisVT<0, i1>,
+ SDTCisSameNumEltsAs<0, 1>]>>;
+def X86select : SDNode<"X86ISD::SELECT" , SDTSelect>;
+
+def X86pmuludq : SDNode<"X86ISD::PMULUDQ",
+ SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i64>,
+ SDTCVecEltisVT<1, i32>,
+ SDTCisSameSizeAs<0,1>,
+ SDTCisSameAs<1,2>]>>;
+def X86pmuldq : SDNode<"X86ISD::PMULDQ",
+ SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i64>,
+ SDTCVecEltisVT<1, i32>,
+ SDTCisSameSizeAs<0,1>,
+ SDTCisSameAs<1,2>]>>;
+
+def X86extrqi : SDNode<"X86ISD::EXTRQI",
+ SDTypeProfile<1, 3, [SDTCisVT<0, v2i64>, SDTCisSameAs<0,1>,
+ SDTCisVT<2, i8>, SDTCisVT<3, i8>]>>;
+def X86insertqi : SDNode<"X86ISD::INSERTQI",
+ SDTypeProfile<1, 4, [SDTCisVT<0, v2i64>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<1,2>, SDTCisVT<3, i8>,
+ SDTCisVT<4, i8>]>>;
+
+// Specific shuffle nodes - At some point ISD::VECTOR_SHUFFLE will always get
+// translated into one of the target nodes below during lowering.
+// Note: this is a work in progress...
+def SDTShuff1Op : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>;
+def SDTShuff2Op : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>]>;
+
+def SDTShuff2OpM : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisSameSizeAs<0,2>,
+ SDTCisSameNumEltsAs<0,2>]>;
+def SDTShuff2OpI : SDTypeProfile<1, 2, [SDTCisVec<0>,
+ SDTCisSameAs<0,1>, SDTCisVT<2, i8>]>;
+def SDTShuff3OpI : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>, SDTCisVT<3, i8>]>;
+def SDTFPBinOpImmRound: SDTypeProfile<1, 4, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>, SDTCisInt<3>, SDTCisInt<4>]>;
+def SDTFPUnaryOpImmRound: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisInt<2>, SDTCisInt<3>]>;
+
+def SDTVBroadcast : SDTypeProfile<1, 1, [SDTCisVec<0>]>;
+def SDTVBroadcastm : SDTypeProfile<1, 1, [SDTCisVec<0>,
+ SDTCisInt<0>, SDTCisInt<1>]>;
+
+def SDTBlend : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<1,2>, SDTCisVT<3, i8>]>;
+
+def SDTTernlog : SDTypeProfile<1, 4, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>, SDTCisSameAs<0,3>,
+ SDTCisVT<4, i8>]>;
+
+def SDTFPBinOpRound : SDTypeProfile<1, 3, [ // fadd_round, fmul_round, etc.
+ SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisFP<0>, SDTCisInt<3>]>;
+
+def SDTFPUnaryOpRound : SDTypeProfile<1, 2, [ // fsqrt_round, fgetexp_round, etc.
+ SDTCisSameAs<0, 1>, SDTCisFP<0>, SDTCisInt<2>]>;
+
+def SDTFma : SDTypeProfile<1, 3, [SDTCisSameAs<0,1>,
+ SDTCisSameAs<1,2>, SDTCisSameAs<1,3>]>;
+def SDTFmaRound : SDTypeProfile<1, 4, [SDTCisSameAs<0,1>,
+ SDTCisSameAs<1,2>, SDTCisSameAs<1,3>, SDTCisInt<4>]>;
+def STDFp1SrcRm : SDTypeProfile<1, 2, [SDTCisSameAs<0,1>,
+ SDTCisVec<0>, SDTCisVT<2, i32>]>;
+def STDFp2SrcRm : SDTypeProfile<1, 3, [SDTCisSameAs<0,1>,
+ SDTCisVec<0>, SDTCisVT<3, i32>]>;
+def STDFp3SrcRm : SDTypeProfile<1, 4, [SDTCisSameAs<0,1>,
+ SDTCisVec<0>, SDTCisVT<3, i32>, SDTCisVT<4, i32>]>;
+
+def X86PAlignr : SDNode<"X86ISD::PALIGNR", SDTShuff3OpI>;
+def X86VAlign : SDNode<"X86ISD::VALIGN", SDTShuff3OpI>;
+
+def X86Abs : SDNode<"X86ISD::ABS", SDTIntUnaryOp>;
+def X86Conflict : SDNode<"X86ISD::CONFLICT", SDTIntUnaryOp>;
+
+def X86PShufd : SDNode<"X86ISD::PSHUFD", SDTShuff2OpI>;
+def X86PShufhw : SDNode<"X86ISD::PSHUFHW", SDTShuff2OpI>;
+def X86PShuflw : SDNode<"X86ISD::PSHUFLW", SDTShuff2OpI>;
+
+def X86Shufp : SDNode<"X86ISD::SHUFP", SDTShuff3OpI>;
+def X86Shuf128 : SDNode<"X86ISD::SHUF128", SDTShuff3OpI>;
+
+def X86Movddup : SDNode<"X86ISD::MOVDDUP", SDTShuff1Op>;
+def X86Movshdup : SDNode<"X86ISD::MOVSHDUP", SDTShuff1Op>;
+def X86Movsldup : SDNode<"X86ISD::MOVSLDUP", SDTShuff1Op>;
+
+def X86Movsd : SDNode<"X86ISD::MOVSD", SDTShuff2Op>;
+def X86Movss : SDNode<"X86ISD::MOVSS", SDTShuff2Op>;
+
+def X86Movlhps : SDNode<"X86ISD::MOVLHPS", SDTShuff2Op>;
+def X86Movlhpd : SDNode<"X86ISD::MOVLHPD", SDTShuff2Op>;
+def X86Movhlps : SDNode<"X86ISD::MOVHLPS", SDTShuff2Op>;
+
+def X86Movlps : SDNode<"X86ISD::MOVLPS", SDTShuff2Op>;
+def X86Movlpd : SDNode<"X86ISD::MOVLPD", SDTShuff2Op>;
+
+def SDTPack : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCisSameSizeAs<0,1>,
+ SDTCisSameAs<1,2>]>;
+def X86Packss : SDNode<"X86ISD::PACKSS", SDTPack>;
+def X86Packus : SDNode<"X86ISD::PACKUS", SDTPack>;
+
+def X86Unpckl : SDNode<"X86ISD::UNPCKL", SDTShuff2Op>;
+def X86Unpckh : SDNode<"X86ISD::UNPCKH", SDTShuff2Op>;
+
+def X86vpmaddubsw : SDNode<"X86ISD::VPMADDUBSW" , SDTPack>;
+def X86vpmaddwd : SDNode<"X86ISD::VPMADDWD" , SDTPack>;
+
+def X86VPermilpv : SDNode<"X86ISD::VPERMILPV", SDTShuff2OpM>;
+def X86VPermilpi : SDNode<"X86ISD::VPERMILPI", SDTShuff2OpI>;
+def X86VPermv : SDNode<"X86ISD::VPERMV",
+ SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisInt<1>,
+ SDTCisSameNumEltsAs<0,1>,
+ SDTCisSameSizeAs<0,1>,
+ SDTCisSameAs<0,2>]>>;
+def X86VPermi : SDNode<"X86ISD::VPERMI", SDTShuff2OpI>;
+def X86VPermt2 : SDNode<"X86ISD::VPERMV3",
+ SDTypeProfile<1, 3, [SDTCisVec<0>,
+ SDTCisSameAs<0,1>, SDTCisInt<2>,
+ SDTCisVec<2>, SDTCisSameNumEltsAs<0, 2>,
+ SDTCisSameSizeAs<0,2>,
+ SDTCisSameAs<0,3>]>, []>;
+
+def X86VPermi2X : SDNode<"X86ISD::VPERMIV3",
+ SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisInt<1>,
+ SDTCisVec<1>, SDTCisSameNumEltsAs<0, 1>,
+ SDTCisSameSizeAs<0,1>,
+ SDTCisSameAs<0,2>,
+ SDTCisSameAs<0,3>]>, []>;
+
+def X86vpternlog : SDNode<"X86ISD::VPTERNLOG", SDTTernlog>;
+
+def X86VPerm2x128 : SDNode<"X86ISD::VPERM2X128", SDTShuff3OpI>;
+
+def X86VFixupimm : SDNode<"X86ISD::VFIXUPIMM", SDTFPBinOpImmRound>;
+def X86VRange : SDNode<"X86ISD::VRANGE", SDTFPBinOpImmRound>;
+def X86VReduce : SDNode<"X86ISD::VREDUCE", SDTFPUnaryOpImmRound>;
+def X86VRndScale : SDNode<"X86ISD::VRNDSCALE", SDTFPUnaryOpImmRound>;
+def X86VGetMant : SDNode<"X86ISD::VGETMANT", SDTFPUnaryOpImmRound>;
+def X86Vfpclass : SDNode<"X86ISD::VFPCLASS",
+ SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCVecEltisVT<0, i1>,
+ SDTCisVec<1>, SDTCisFP<1>,
+ SDTCisSameNumEltsAs<0,1>,
+ SDTCisVT<2, i32>]>, []>;
+def X86Vfpclasss : SDNode<"X86ISD::VFPCLASSS",
+ SDTypeProfile<1, 2, [SDTCisVT<0, i1>,
+ SDTCisFP<1>, SDTCisVT<2, i32>]>,[]>;
+
+def X86SubVBroadcast : SDNode<"X86ISD::SUBV_BROADCAST",
+ SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCisSubVecOfVec<1, 0>]>, []>;
+// The SDTCisSubVecOfVec restriction cannot be applied to the 128-bit version
+// of VBROADCASTI32x2.
+def X86SubV32x2Broadcast : SDNode<"X86ISD::SUBV_BROADCAST",
+ SDTypeProfile<1, 1, [SDTCisVec<0>,
+ SDTCisSameAs<0,1>]>, []>;
+
+def X86VBroadcast : SDNode<"X86ISD::VBROADCAST", SDTVBroadcast>;
+def X86VBroadcastm : SDNode<"X86ISD::VBROADCASTM", SDTVBroadcastm>;
+def X86Vinsert : SDNode<"X86ISD::VINSERT", SDTypeProfile<1, 3,
+ [SDTCisSameAs<0, 1>, SDTCisEltOfVec<2, 1>,
+ SDTCisPtrTy<3>]>, []>;
+def X86Vextract : SDNode<"X86ISD::VEXTRACT", SDTypeProfile<1, 2,
+ [SDTCisEltOfVec<0, 1>, SDTCisVec<1>,
+ SDTCisPtrTy<2>]>, []>;
+
+def X86Blendi : SDNode<"X86ISD::BLENDI", SDTBlend>;
+
+def X86Addsub : SDNode<"X86ISD::ADDSUB", SDTFPBinOp>;
+
+def X86faddRnd : SDNode<"X86ISD::FADD_RND", SDTFPBinOpRound>;
+def X86fsubRnd : SDNode<"X86ISD::FSUB_RND", SDTFPBinOpRound>;
+def X86fmulRnd : SDNode<"X86ISD::FMUL_RND", SDTFPBinOpRound>;
+def X86fdivRnd : SDNode<"X86ISD::FDIV_RND", SDTFPBinOpRound>;
+def X86fmaxRnd : SDNode<"X86ISD::FMAX_RND", SDTFPBinOpRound>;
+def X86scalef : SDNode<"X86ISD::SCALEF", SDTFPBinOpRound>;
+def X86fminRnd : SDNode<"X86ISD::FMIN_RND", SDTFPBinOpRound>;
+def X86fsqrtRnd : SDNode<"X86ISD::FSQRT_RND", SDTFPUnaryOpRound>;
+def X86fsqrtRnds : SDNode<"X86ISD::FSQRT_RND", STDFp2SrcRm>;
+def X86fgetexpRnd : SDNode<"X86ISD::FGETEXP_RND", SDTFPUnaryOpRound>;
+def X86fgetexpRnds : SDNode<"X86ISD::FGETEXP_RND", STDFp2SrcRm>;
+
+def X86Fmadd : SDNode<"X86ISD::FMADD", SDTFma>;
+def X86Fnmadd : SDNode<"X86ISD::FNMADD", SDTFma>;
+def X86Fmsub : SDNode<"X86ISD::FMSUB", SDTFma>;
+def X86Fnmsub : SDNode<"X86ISD::FNMSUB", SDTFma>;
+def X86Fmaddsub : SDNode<"X86ISD::FMADDSUB", SDTFma>;
+def X86Fmsubadd : SDNode<"X86ISD::FMSUBADD", SDTFma>;
+
+def X86FmaddRnd : SDNode<"X86ISD::FMADD_RND", SDTFmaRound>;
+def X86FnmaddRnd : SDNode<"X86ISD::FNMADD_RND", SDTFmaRound>;
+def X86FmsubRnd : SDNode<"X86ISD::FMSUB_RND", SDTFmaRound>;
+def X86FnmsubRnd : SDNode<"X86ISD::FNMSUB_RND", SDTFmaRound>;
+def X86FmaddsubRnd : SDNode<"X86ISD::FMADDSUB_RND", SDTFmaRound>;
+def X86FmsubaddRnd : SDNode<"X86ISD::FMSUBADD_RND", SDTFmaRound>;
+
+def X86rsqrt28 : SDNode<"X86ISD::RSQRT28", STDFp1SrcRm>;
+def X86rcp28 : SDNode<"X86ISD::RCP28", STDFp1SrcRm>;
+def X86exp2 : SDNode<"X86ISD::EXP2", STDFp1SrcRm>;
+
+def X86rsqrt28s : SDNode<"X86ISD::RSQRT28", STDFp2SrcRm>;
+def X86rcp28s : SDNode<"X86ISD::RCP28", STDFp2SrcRm>;
+def X86RndScales : SDNode<"X86ISD::VRNDSCALE", STDFp3SrcRm>;
+def X86Reduces : SDNode<"X86ISD::VREDUCE", STDFp3SrcRm>;
+def X86GetMants : SDNode<"X86ISD::VGETMANT", STDFp3SrcRm>;
+
+def SDT_PCMPISTRI : SDTypeProfile<2, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
+ SDTCisVT<2, v16i8>, SDTCisVT<3, v16i8>,
+ SDTCisVT<4, i8>]>;
+def SDT_PCMPESTRI : SDTypeProfile<2, 5, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
+ SDTCisVT<2, v16i8>, SDTCisVT<3, i32>,
+ SDTCisVT<4, v16i8>, SDTCisVT<5, i32>,
+ SDTCisVT<6, i8>]>;
+
+def X86pcmpistri : SDNode<"X86ISD::PCMPISTRI", SDT_PCMPISTRI>;
+def X86pcmpestri : SDNode<"X86ISD::PCMPESTRI", SDT_PCMPESTRI>;
+
+def X86compress: SDNode<"X86ISD::COMPRESS", SDTypeProfile<1, 1,
+ [SDTCisSameAs<0, 1>, SDTCisVec<1>]>, []>;
+def X86expand : SDNode<"X86ISD::EXPAND", SDTypeProfile<1, 1,
+ [SDTCisSameAs<0, 1>, SDTCisVec<1>]>, []>;
+
+def SDTintToFPRound: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisFP<0>,
+ SDTCisSameAs<0,1>, SDTCisInt<2>,
+ SDTCisVT<3, i32>]>;
+
+def SDTDoubleToInt: SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCisInt<0>, SDTCVecEltisVT<1, f64>]>;
+def SDTFloatToInt: SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCisInt<0>, SDTCVecEltisVT<1, f32>]>;
+
+def SDTDoubleToIntRnd: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCisInt<0>, SDTCVecEltisVT<1, f64>]>;
+def SDTSDoubleToIntRnd: SDTypeProfile<1, 2, [SDTCisInt<0>,SDTCisFP<1>,
+ SDTCVecEltisVT<1, f64>, SDTCisInt<2>]>;
+def SDTFloatToIntRnd: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCisInt<0>, SDTCVecEltisVT<1, f32>]>;
+def SDTSFloatToIntRnd: SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisFP<1>,
+ SDTCVecEltisVT<1, f32>, SDTCisInt<2>]>;
+def SDTVintToFPRound: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCisFP<0>, SDTCVecEltisVT<1, i32>,
+ SDTCisInt<2>]>;
+def SDTVlongToFPRound: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCisFP<0>, SDTCVecEltisVT<1, i64>,
+ SDTCisInt<2>]>;
+
+def SDTVFPToIntRound: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCisFP<1>, SDTCVecEltisVT<0, i32>,
+ SDTCisInt<2>]>;
+def SDTVFPToLongRound: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCisFP<1>, SDTCVecEltisVT<0, i64>,
+ SDTCisInt<2>]>;
+
+// Scalar
+def X86SintToFpRnd : SDNode<"X86ISD::SINT_TO_FP_RND", SDTintToFPRound>;
+def X86UintToFpRnd : SDNode<"X86ISD::UINT_TO_FP_RND", SDTintToFPRound>;
+
+def X86cvttss2IntRnd : SDNode<"X86ISD::FP_TO_SINT_RND", SDTSFloatToIntRnd>;
+def X86cvttss2UIntRnd : SDNode<"X86ISD::FP_TO_UINT_RND", SDTSFloatToIntRnd>;
+def X86cvttsd2IntRnd : SDNode<"X86ISD::FP_TO_SINT_RND", SDTSDoubleToIntRnd>;
+def X86cvttsd2UIntRnd : SDNode<"X86ISD::FP_TO_UINT_RND", SDTSDoubleToIntRnd>;
+// Vector with rounding mode
+
+// cvtt fp-to-int stuff
+def X86VFpToSintRnd : SDNode<"ISD::FP_TO_SINT", SDTVFPToIntRound>;
+def X86VFpToUintRnd : SDNode<"ISD::FP_TO_UINT", SDTVFPToIntRound>;
+def X86VFpToSlongRnd : SDNode<"ISD::FP_TO_SINT", SDTVFPToLongRound>;
+def X86VFpToUlongRnd : SDNode<"ISD::FP_TO_UINT", SDTVFPToLongRound>;
+
+def X86VSintToFpRnd : SDNode<"ISD::SINT_TO_FP", SDTVintToFPRound>;
+def X86VUintToFpRnd : SDNode<"ISD::UINT_TO_FP", SDTVintToFPRound>;
+def X86VSlongToFpRnd : SDNode<"ISD::SINT_TO_FP", SDTVlongToFPRound>;
+def X86VUlongToFpRnd : SDNode<"ISD::UINT_TO_FP", SDTVlongToFPRound>;
+
+// cvt fp-to-int stuff
+def X86cvtps2IntRnd : SDNode<"X86ISD::FP_TO_SINT_RND", SDTFloatToIntRnd>;
+def X86cvtps2UIntRnd : SDNode<"X86ISD::FP_TO_UINT_RND", SDTFloatToIntRnd>;
+def X86cvtpd2IntRnd : SDNode<"X86ISD::FP_TO_SINT_RND", SDTDoubleToIntRnd>;
+def X86cvtpd2UIntRnd : SDNode<"X86ISD::FP_TO_UINT_RND", SDTDoubleToIntRnd>;
+
+// Vector without rounding mode
+def X86cvtps2Int : SDNode<"X86ISD::FP_TO_SINT_RND", SDTFloatToInt>;
+def X86cvtps2UInt : SDNode<"X86ISD::FP_TO_UINT_RND", SDTFloatToInt>;
+def X86cvtpd2Int : SDNode<"X86ISD::FP_TO_SINT_RND", SDTDoubleToInt>;
+def X86cvtpd2UInt : SDNode<"X86ISD::FP_TO_UINT_RND", SDTDoubleToInt>;
+
+def X86cvtph2ps : SDNode<"ISD::FP16_TO_FP",
+ SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCVecEltisVT<0, f32>,
+ SDTCVecEltisVT<1, i16>,
+ SDTCisFP<0>,
+ SDTCisVT<2, i32>]> >;
+
+def X86cvtps2ph : SDNode<"ISD::FP_TO_FP16",
+ SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCVecEltisVT<0, i16>,
+ SDTCVecEltisVT<1, f32>,
+ SDTCisFP<1>, SDTCisVT<2, i32>,
+ SDTCisVT<3, i32>]> >;
+def X86vfpextRnd : SDNode<"X86ISD::VFPEXT",
+ SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCisFP<0>, SDTCisFP<1>,
+ SDTCVecEltisVT<0, f64>,
+ SDTCVecEltisVT<1, f32>,
+ SDTCisOpSmallerThanOp<1, 0>,
+ SDTCisVT<2, i32>]>>;
+def X86vfproundRnd: SDNode<"X86ISD::VFPROUND",
+ SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCisFP<0>, SDTCisFP<1>,
+ SDTCVecEltisVT<0, f32>,
+ SDTCVecEltisVT<1, f64>,
+ SDTCisOpSmallerThanOp<0, 1>,
+ SDTCisVT<2, i32>]>>;
+
+def X86cvt2mask : SDNode<"X86ISD::CVT2MASK", SDTIntTruncOp>;
+
+//===----------------------------------------------------------------------===//
+// SSE Complex Patterns
+//===----------------------------------------------------------------------===//
+
+// These are 'extloads' from a scalar to the low element of a vector, zeroing
+// the top elements. These are used for the SSE 'ss' and 'sd' instruction
+// forms.
+def sse_load_f32 : ComplexPattern<v4f32, 5, "selectScalarSSELoad", [],
+ [SDNPHasChain, SDNPMayLoad, SDNPMemOperand,
+ SDNPWantRoot]>;
+def sse_load_f64 : ComplexPattern<v2f64, 5, "selectScalarSSELoad", [],
+ [SDNPHasChain, SDNPMayLoad, SDNPMemOperand,
+ SDNPWantRoot]>;
+
+def ssmem : Operand<v4f32> {
+ let PrintMethod = "printf32mem";
+ let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc_nosp, i32imm, i8imm);
+ let ParserMatchClass = X86Mem32AsmOperand;
+ let OperandType = "OPERAND_MEMORY";
+}
+def sdmem : Operand<v2f64> {
+ let PrintMethod = "printf64mem";
+ let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc_nosp, i32imm, i8imm);
+ let ParserMatchClass = X86Mem64AsmOperand;
+ let OperandType = "OPERAND_MEMORY";
+}
+
+//===----------------------------------------------------------------------===//
+// SSE pattern fragments
+//===----------------------------------------------------------------------===//
+
+// 128-bit load pattern fragments
+// NOTE: all 128-bit integer vector loads are promoted to v2i64
+def loadv4f32 : PatFrag<(ops node:$ptr), (v4f32 (load node:$ptr))>;
+def loadv2f64 : PatFrag<(ops node:$ptr), (v2f64 (load node:$ptr))>;
+def loadv2i64 : PatFrag<(ops node:$ptr), (v2i64 (load node:$ptr))>;
+
+// 256-bit load pattern fragments
+// NOTE: all 256-bit integer vector loads are promoted to v4i64
+def loadv8f32 : PatFrag<(ops node:$ptr), (v8f32 (load node:$ptr))>;
+def loadv4f64 : PatFrag<(ops node:$ptr), (v4f64 (load node:$ptr))>;
+def loadv4i64 : PatFrag<(ops node:$ptr), (v4i64 (load node:$ptr))>;
+
+// 512-bit load pattern fragments
+def loadv16f32 : PatFrag<(ops node:$ptr), (v16f32 (load node:$ptr))>;
+def loadv8f64 : PatFrag<(ops node:$ptr), (v8f64 (load node:$ptr))>;
+def loadv64i8 : PatFrag<(ops node:$ptr), (v64i8 (load node:$ptr))>;
+def loadv32i16 : PatFrag<(ops node:$ptr), (v32i16 (load node:$ptr))>;
+def loadv16i32 : PatFrag<(ops node:$ptr), (v16i32 (load node:$ptr))>;
+def loadv8i64 : PatFrag<(ops node:$ptr), (v8i64 (load node:$ptr))>;
+
+// 128-/256-/512-bit extload pattern fragments
+def extloadv2f32 : PatFrag<(ops node:$ptr), (v2f64 (extloadvf32 node:$ptr))>;
+def extloadv4f32 : PatFrag<(ops node:$ptr), (v4f64 (extloadvf32 node:$ptr))>;
+def extloadv8f32 : PatFrag<(ops node:$ptr), (v8f64 (extloadvf32 node:$ptr))>;
+
+// These are needed to match a scalar load that is used in a vector-only
+// math instruction such as the FP logical ops: andps, andnps, orps, xorps.
+// The memory operand is required to be a 128-bit load, so it must be converted
+// from a vector to a scalar.
+def loadf32_128 : PatFrag<(ops node:$ptr),
+ (f32 (extractelt (loadv4f32 node:$ptr), (iPTR 0)))>;
+def loadf64_128 : PatFrag<(ops node:$ptr),
+ (f64 (extractelt (loadv2f64 node:$ptr), (iPTR 0)))>;
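+
+// A hypothetical sketch of how these are used: a scalar FP logical op can be
+// selected onto the packed instruction by matching the scalar element of the
+// 128-bit load, roughly:
+//   def : Pat<(X86fand FR32:$src1, (loadf32_128 addr:$src2)),
+//             (COPY_TO_REGCLASS
+//               (ANDPSrm (COPY_TO_REGCLASS FR32:$src1, VR128), addr:$src2),
+//               FR32)>;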
+
+// Like 'store', but always requires 128-bit vector alignment.
+def alignedstore : PatFrag<(ops node:$val, node:$ptr),
+ (store node:$val, node:$ptr), [{
+ return cast<StoreSDNode>(N)->getAlignment() >= 16;
+}]>;
+
+// Like 'store', but always requires 256-bit vector alignment.
+def alignedstore256 : PatFrag<(ops node:$val, node:$ptr),
+ (store node:$val, node:$ptr), [{
+ return cast<StoreSDNode>(N)->getAlignment() >= 32;
+}]>;
+
+// Like 'store', but always requires 512-bit vector alignment.
+def alignedstore512 : PatFrag<(ops node:$val, node:$ptr),
+ (store node:$val, node:$ptr), [{
+ return cast<StoreSDNode>(N)->getAlignment() >= 64;
+}]>;
+
+// Like 'load', but always requires 128-bit vector alignment.
+def alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ return cast<LoadSDNode>(N)->getAlignment() >= 16;
+}]>;
+
+// Like 'X86vzload', but always requires 128-bit vector alignment.
+def alignedX86vzload : PatFrag<(ops node:$ptr), (X86vzload node:$ptr), [{
+ return cast<MemSDNode>(N)->getAlignment() >= 16;
+}]>;
+
+// Like 'load', but always requires 256-bit vector alignment.
+def alignedload256 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ return cast<LoadSDNode>(N)->getAlignment() >= 32;
+}]>;
+
+// Like 'load', but always requires 512-bit vector alignment.
+def alignedload512 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ return cast<LoadSDNode>(N)->getAlignment() >= 64;
+}]>;
+
+def alignedloadfsf32 : PatFrag<(ops node:$ptr),
+ (f32 (alignedload node:$ptr))>;
+def alignedloadfsf64 : PatFrag<(ops node:$ptr),
+ (f64 (alignedload node:$ptr))>;
+
+// 128-bit aligned load pattern fragments
+// NOTE: all 128-bit integer vector loads are promoted to v2i64
+def alignedloadv4f32 : PatFrag<(ops node:$ptr),
+ (v4f32 (alignedload node:$ptr))>;
+def alignedloadv2f64 : PatFrag<(ops node:$ptr),
+ (v2f64 (alignedload node:$ptr))>;
+def alignedloadv2i64 : PatFrag<(ops node:$ptr),
+ (v2i64 (alignedload node:$ptr))>;
+
+// 256-bit aligned load pattern fragments
+// NOTE: all 256-bit integer vector loads are promoted to v4i64
+def alignedloadv8f32 : PatFrag<(ops node:$ptr),
+ (v8f32 (alignedload256 node:$ptr))>;
+def alignedloadv4f64 : PatFrag<(ops node:$ptr),
+ (v4f64 (alignedload256 node:$ptr))>;
+def alignedloadv4i64 : PatFrag<(ops node:$ptr),
+ (v4i64 (alignedload256 node:$ptr))>;
+
+// 512-bit aligned load pattern fragments
+def alignedloadv16f32 : PatFrag<(ops node:$ptr),
+ (v16f32 (alignedload512 node:$ptr))>;
+def alignedloadv16i32 : PatFrag<(ops node:$ptr),
+ (v16i32 (alignedload512 node:$ptr))>;
+def alignedloadv8f64 : PatFrag<(ops node:$ptr),
+ (v8f64 (alignedload512 node:$ptr))>;
+def alignedloadv8i64 : PatFrag<(ops node:$ptr),
+ (v8i64 (alignedload512 node:$ptr))>;
+
+// Like 'load', but uses special alignment checks suitable for use in
+// memory operands in most SSE instructions, which are required to
+// be naturally aligned on some targets but not on others. If the subtarget
+// allows unaligned accesses, match any load, though this may require
+// setting a feature bit in the processor (on startup, for example).
+// Opteron 10h and later implement such a feature.
+def memop : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ return Subtarget->hasSSEUnalignedMem()
+ || cast<LoadSDNode>(N)->getAlignment() >= 16;
+}]>;
+
+def memopfsf32 : PatFrag<(ops node:$ptr), (f32 (memop node:$ptr))>;
+def memopfsf64 : PatFrag<(ops node:$ptr), (f64 (memop node:$ptr))>;
+
+// 128-bit memop pattern fragments
+// NOTE: all 128-bit integer vector loads are promoted to v2i64
+def memopv4f32 : PatFrag<(ops node:$ptr), (v4f32 (memop node:$ptr))>;
+def memopv2f64 : PatFrag<(ops node:$ptr), (v2f64 (memop node:$ptr))>;
+def memopv2i64 : PatFrag<(ops node:$ptr), (v2i64 (memop node:$ptr))>;
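+
+// A hypothetical sketch: the classic SSE rm-forms use memop fragments rather
+// than plain loads so an unaligned operand is only folded when the subtarget
+// permits it, e.g. a pattern along the lines of
+//   [(set VR128:$dst, (fadd VR128:$src1, (memopv4f32 addr:$src2)))]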
+
+// These are needed to match a scalar memop that is used in a vector-only
+// math instruction such as the FP logical ops: andps, andnps, orps, xorps.
+// The memory operand is required to be a 128-bit load, so it must be converted
+// from a vector to a scalar.
+def memopfsf32_128 : PatFrag<(ops node:$ptr),
+ (f32 (extractelt (memopv4f32 node:$ptr), (iPTR 0)))>;
+def memopfsf64_128 : PatFrag<(ops node:$ptr),
+ (f64 (extractelt (memopv2f64 node:$ptr), (iPTR 0)))>;
+
+
+// SSSE3 uses MMX registers for some instructions. Their memory operands
+// aren't guaranteed to be aligned on a 16-byte boundary.
+// FIXME: 8 byte alignment for mmx reads is not required
+def memop64 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ return cast<LoadSDNode>(N)->getAlignment() >= 8;
+}]>;
+
+def memopmmx : PatFrag<(ops node:$ptr), (x86mmx (memop64 node:$ptr))>;
+
+def mgatherv4i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (masked_gather node:$src1, node:$src2, node:$src3) , [{
+ if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N))
+ return (Mgt->getIndex().getValueType() == MVT::v4i32 ||
+ Mgt->getBasePtr().getValueType() == MVT::v4i32);
+ return false;
+}]>;
+
+def mgatherv8i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (masked_gather node:$src1, node:$src2, node:$src3) , [{
+ if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N))
+ return (Mgt->getIndex().getValueType() == MVT::v8i32 ||
+ Mgt->getBasePtr().getValueType() == MVT::v8i32);
+ return false;
+}]>;
+
+def mgatherv2i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (masked_gather node:$src1, node:$src2, node:$src3) , [{
+ if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N))
+ return (Mgt->getIndex().getValueType() == MVT::v2i64 ||
+ Mgt->getBasePtr().getValueType() == MVT::v2i64);
+ return false;
+}]>;
+def mgatherv4i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (masked_gather node:$src1, node:$src2, node:$src3) , [{
+ if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N))
+ return (Mgt->getIndex().getValueType() == MVT::v4i64 ||
+ Mgt->getBasePtr().getValueType() == MVT::v4i64);
+ return false;
+}]>;
+def mgatherv8i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (masked_gather node:$src1, node:$src2, node:$src3) , [{
+ if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N))
+ return (Mgt->getIndex().getValueType() == MVT::v8i64 ||
+ Mgt->getBasePtr().getValueType() == MVT::v8i64);
+ return false;
+}]>;
+def mgatherv16i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (masked_gather node:$src1, node:$src2, node:$src3) , [{
+ if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N))
+ return (Mgt->getIndex().getValueType() == MVT::v16i32 ||
+ Mgt->getBasePtr().getValueType() == MVT::v16i32);
+ return false;
+}]>;
+
+def mscatterv2i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (masked_scatter node:$src1, node:$src2, node:$src3) , [{
+ if (MaskedScatterSDNode *Sc = dyn_cast<MaskedScatterSDNode>(N))
+ return (Sc->getIndex().getValueType() == MVT::v2i64 ||
+ Sc->getBasePtr().getValueType() == MVT::v2i64);
+ return false;
+}]>;
+
+def mscatterv4i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (masked_scatter node:$src1, node:$src2, node:$src3) , [{
+ if (MaskedScatterSDNode *Sc = dyn_cast<MaskedScatterSDNode>(N))
+ return (Sc->getIndex().getValueType() == MVT::v4i32 ||
+ Sc->getBasePtr().getValueType() == MVT::v4i32);
+ return false;
+}]>;
+
+def mscatterv4i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (masked_scatter node:$src1, node:$src2, node:$src3) , [{
+ if (MaskedScatterSDNode *Sc = dyn_cast<MaskedScatterSDNode>(N))
+ return (Sc->getIndex().getValueType() == MVT::v4i64 ||
+ Sc->getBasePtr().getValueType() == MVT::v4i64);
+ return false;
+}]>;
+
+def mscatterv8i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (masked_scatter node:$src1, node:$src2, node:$src3) , [{
+ if (MaskedScatterSDNode *Sc = dyn_cast<MaskedScatterSDNode>(N))
+ return (Sc->getIndex().getValueType() == MVT::v8i32 ||
+ Sc->getBasePtr().getValueType() == MVT::v8i32);
+ return false;
+}]>;
+
+def mscatterv8i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (masked_scatter node:$src1, node:$src2, node:$src3) , [{
+ if (MaskedScatterSDNode *Sc = dyn_cast<MaskedScatterSDNode>(N))
+ return (Sc->getIndex().getValueType() == MVT::v8i64 ||
+ Sc->getBasePtr().getValueType() == MVT::v8i64);
+ return false;
+}]>;
+def mscatterv16i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (masked_scatter node:$src1, node:$src2, node:$src3) , [{
+ if (MaskedScatterSDNode *Sc = dyn_cast<MaskedScatterSDNode>(N))
+ return (Sc->getIndex().getValueType() == MVT::v16i32 ||
+ Sc->getBasePtr().getValueType() == MVT::v16i32);
+ return false;
+}]>;
+
+// 128-bit bitconvert pattern fragments
+def bc_v4f32 : PatFrag<(ops node:$in), (v4f32 (bitconvert node:$in))>;
+def bc_v2f64 : PatFrag<(ops node:$in), (v2f64 (bitconvert node:$in))>;
+def bc_v16i8 : PatFrag<(ops node:$in), (v16i8 (bitconvert node:$in))>;
+def bc_v8i16 : PatFrag<(ops node:$in), (v8i16 (bitconvert node:$in))>;
+def bc_v4i32 : PatFrag<(ops node:$in), (v4i32 (bitconvert node:$in))>;
+def bc_v2i64 : PatFrag<(ops node:$in), (v2i64 (bitconvert node:$in))>;
+
+// 256-bit bitconvert pattern fragments
+def bc_v32i8 : PatFrag<(ops node:$in), (v32i8 (bitconvert node:$in))>;
+def bc_v16i16 : PatFrag<(ops node:$in), (v16i16 (bitconvert node:$in))>;
+def bc_v8i32 : PatFrag<(ops node:$in), (v8i32 (bitconvert node:$in))>;
+def bc_v4i64 : PatFrag<(ops node:$in), (v4i64 (bitconvert node:$in))>;
+def bc_v8f32 : PatFrag<(ops node:$in), (v8f32 (bitconvert node:$in))>;
+
+// 512-bit bitconvert pattern fragments
+def bc_v16i32 : PatFrag<(ops node:$in), (v16i32 (bitconvert node:$in))>;
+def bc_v8i64 : PatFrag<(ops node:$in), (v8i64 (bitconvert node:$in))>;
+def bc_v8f64 : PatFrag<(ops node:$in), (v8f64 (bitconvert node:$in))>;
+def bc_v16f32 : PatFrag<(ops node:$in), (v16f32 (bitconvert node:$in))>;
+
+def vzmovl_v2i64 : PatFrag<(ops node:$src),
+ (bitconvert (v2i64 (X86vzmovl
+ (v2i64 (scalar_to_vector (loadi64 node:$src))))))>;
+def vzmovl_v4i32 : PatFrag<(ops node:$src),
+ (bitconvert (v4i32 (X86vzmovl
+ (v4i32 (scalar_to_vector (loadi32 node:$src))))))>;
+
+def vzload_v2i64 : PatFrag<(ops node:$src),
+ (bitconvert (v2i64 (X86vzload node:$src)))>;
+
+
+def fp32imm0 : PatLeaf<(f32 fpimm), [{
+ return N->isExactlyValue(+0.0);
+}]>;
+
+def I8Imm : SDNodeXForm<imm, [{
+ // Transformation function: get the low 8 bits.
+ return getI8Imm((uint8_t)N->getZExtValue(), SDLoc(N));
+}]>;
+
+def FROUND_NO_EXC : ImmLeaf<i32, [{ return Imm == 8; }]>;
+def FROUND_CURRENT : ImmLeaf<i32, [{
+ return Imm == X86::STATIC_ROUNDING::CUR_DIRECTION;
+}]>;
+
+// BYTE_imm - Transform bit immediates into byte immediates.
+def BYTE_imm : SDNodeXForm<imm, [{
+ // Transformation function: imm >> 3
+ return getI32Imm(N->getZExtValue() >> 3, SDLoc(N));
+}]>;
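+
+// A hypothetical sketch: a selection pattern for a byte-granular shift whose
+// DAG immediate is a bit count would run the immediate through BYTE_imm
+// (SomeBitCountOp is a placeholder, not a real node):
+//   def : Pat<(SomeBitCountOp VR128:$src, imm:$amt),
+//             (PSLLDQri VR128:$src, (BYTE_imm imm:$amt))>;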
+
+// EXTRACT_get_vextract128_imm xform function: convert extract_subvector index
+// to VEXTRACTF128/VEXTRACTI128 imm.
+def EXTRACT_get_vextract128_imm : SDNodeXForm<extract_subvector, [{
+ return getI8Imm(X86::getExtractVEXTRACT128Immediate(N), SDLoc(N));
+}]>;
+
+// INSERT_get_vinsert128_imm xform function: convert insert_subvector index to
+// VINSERTF128/VINSERTI128 imm.
+def INSERT_get_vinsert128_imm : SDNodeXForm<insert_subvector, [{
+ return getI8Imm(X86::getInsertVINSERT128Immediate(N), SDLoc(N));
+}]>;
+
+// EXTRACT_get_vextract256_imm xform function: convert extract_subvector index
+// to VEXTRACTF64x4 imm.
+def EXTRACT_get_vextract256_imm : SDNodeXForm<extract_subvector, [{
+ return getI8Imm(X86::getExtractVEXTRACT256Immediate(N), SDLoc(N));
+}]>;
+
+// INSERT_get_vinsert256_imm xform function: convert insert_subvector index to
+// VINSERTF64x4 imm.
+def INSERT_get_vinsert256_imm : SDNodeXForm<insert_subvector, [{
+ return getI8Imm(X86::getInsertVINSERT256Immediate(N), SDLoc(N));
+}]>;
+
+def vextract128_extract : PatFrag<(ops node:$bigvec, node:$index),
+ (extract_subvector node:$bigvec,
+ node:$index), [{
+ return X86::isVEXTRACT128Index(N);
+}], EXTRACT_get_vextract128_imm>;
+
+def vinsert128_insert : PatFrag<(ops node:$bigvec, node:$smallvec,
+ node:$index),
+ (insert_subvector node:$bigvec, node:$smallvec,
+ node:$index), [{
+ return X86::isVINSERT128Index(N);
+}], INSERT_get_vinsert128_imm>;
+
+
+def vextract256_extract : PatFrag<(ops node:$bigvec, node:$index),
+ (extract_subvector node:$bigvec,
+ node:$index), [{
+ return X86::isVEXTRACT256Index(N);
+}], EXTRACT_get_vextract256_imm>;
+
+def vinsert256_insert : PatFrag<(ops node:$bigvec, node:$smallvec,
+ node:$index),
+ (insert_subvector node:$bigvec, node:$smallvec,
+ node:$index), [{
+ return X86::isVINSERT256Index(N);
+}], INSERT_get_vinsert256_imm>;
+
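+// Masked load fragments.
+// The aligned variants only match when the load's alignment is at least the
+// full vector width (16/32/64 bytes); masked_load_unaligned matches any
+// masked load regardless of alignment.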
+def masked_load_aligned128 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (masked_load node:$src1, node:$src2, node:$src3), [{
+ if (auto *Load = dyn_cast<MaskedLoadSDNode>(N))
+ return Load->getAlignment() >= 16;
+ return false;
+}]>;
+
+def masked_load_aligned256 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (masked_load node:$src1, node:$src2, node:$src3), [{
+ if (auto *Load = dyn_cast<MaskedLoadSDNode>(N))
+ return Load->getAlignment() >= 32;
+ return false;
+}]>;
+
+def masked_load_aligned512 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (masked_load node:$src1, node:$src2, node:$src3), [{
+ if (auto *Load = dyn_cast<MaskedLoadSDNode>(N))
+ return Load->getAlignment() >= 64;
+ return false;
+}]>;
+
+def masked_load_unaligned : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (masked_load node:$src1, node:$src2, node:$src3), [{
+ return isa<MaskedLoadSDNode>(N);
+}]>;
+
+// Masked store fragments.
+// X86mstore can't be implemented in core DAG files because some targets
+// don't support vector types (llvm-tblgen would fail).
+def X86mstore : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (masked_store node:$src1, node:$src2, node:$src3), [{
+ return !cast<MaskedStoreSDNode>(N)->isTruncatingStore();
+}]>;
+
+def masked_store_aligned128 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (X86mstore node:$src1, node:$src2, node:$src3), [{
+ if (auto *Store = dyn_cast<MaskedStoreSDNode>(N))
+ return Store->getAlignment() >= 16;
+ return false;
+}]>;
+
+def masked_store_aligned256 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (X86mstore node:$src1, node:$src2, node:$src3), [{
+ if (auto *Store = dyn_cast<MaskedStoreSDNode>(N))
+ return Store->getAlignment() >= 32;
+ return false;
+}]>;
+
+def masked_store_aligned512 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (X86mstore node:$src1, node:$src2, node:$src3), [{
+ if (auto *Store = dyn_cast<MaskedStoreSDNode>(N))
+ return Store->getAlignment() >= 64;
+ return false;
+}]>;
+
+def masked_store_unaligned : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (X86mstore node:$src1, node:$src2, node:$src3), [{
+ return isa<MaskedStoreSDNode>(N);
+}]>;
+
+// Masked truncstore fragments.
+// X86mtruncstore can't be implemented in core DAG files because some targets
+// don't support vector types (llvm-tblgen would fail).
+def X86mtruncstore : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (masked_store node:$src1, node:$src2, node:$src3), [{
+ return cast<MaskedStoreSDNode>(N)->isTruncatingStore();
+}]>;
+def masked_truncstorevi8 :
+ PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (X86mtruncstore node:$src1, node:$src2, node:$src3), [{
+ return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8;
+}]>;
+def masked_truncstorevi16 :
+ PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (X86mtruncstore node:$src1, node:$src2, node:$src3), [{
+ return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16;
+}]>;
+def masked_truncstorevi32 :
+ PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (X86mtruncstore node:$src1, node:$src2, node:$src3), [{
+ return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32;
+}]>;
diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp b/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp
new file mode 100644
index 0000000..246804e
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -0,0 +1,7330 @@
+//===-- X86InstrInfo.cpp - X86 Instruction Information --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86InstrInfo.h"
+#include "X86.h"
+#include "X86InstrBuilder.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/StackMaps.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetOptions.h"
+#include <limits>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-instr-info"
+
+#define GET_INSTRINFO_CTOR_DTOR
+#include "X86GenInstrInfo.inc"
+
+static cl::opt<bool>
+NoFusing("disable-spill-fusing",
+ cl::desc("Disable fusing of spill code into instructions"));
+static cl::opt<bool>
+PrintFailedFusing("print-failed-fuse-candidates",
+ cl::desc("Print instructions that the allocator wants to"
+ " fuse, but the X86 backend currently can't"),
+ cl::Hidden);
+static cl::opt<bool>
+ReMatPICStubLoad("remat-pic-stub-load",
+ cl::desc("Re-materialize load from stub in PIC mode"),
+ cl::init(false), cl::Hidden);
+
+enum {
+ // Select which memory operand is being unfolded.
+ // (stored in bits 0 - 3)
+ TB_INDEX_0 = 0,
+ TB_INDEX_1 = 1,
+ TB_INDEX_2 = 2,
+ TB_INDEX_3 = 3,
+ TB_INDEX_4 = 4,
+ TB_INDEX_MASK = 0xf,
+
+ // Do not insert the reverse map (MemOp -> RegOp) into the table.
+ // This may be needed because there is a many -> one mapping.
+ TB_NO_REVERSE = 1 << 4,
+
+ // Do not insert the forward map (RegOp -> MemOp) into the table.
+ // This is needed for Native Client, which prohibits branch
+ // instructions from using a memory operand.
+ TB_NO_FORWARD = 1 << 5,
+
+ TB_FOLDED_LOAD = 1 << 6,
+ TB_FOLDED_STORE = 1 << 7,
+
+ // Minimum alignment required for load/store.
+ // Used for RegOp->MemOp conversion.
+ // (stored in bits 8 - 15)
+ TB_ALIGN_SHIFT = 8,
+ TB_ALIGN_NONE = 0 << TB_ALIGN_SHIFT,
+ TB_ALIGN_16 = 16 << TB_ALIGN_SHIFT,
+ TB_ALIGN_32 = 32 << TB_ALIGN_SHIFT,
+ TB_ALIGN_64 = 64 << TB_ALIGN_SHIFT,
+ TB_ALIGN_MASK = 0xff << TB_ALIGN_SHIFT
+};
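+
+// For example, an entry flagged TB_INDEX_1 | TB_FOLDED_LOAD | TB_ALIGN_16
+// encodes 0x1 | (1 << 6) | (16 << 8) == 0x1041; the minimum alignment is
+// recovered as (Flags & TB_ALIGN_MASK) >> TB_ALIGN_SHIFT.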
+
+struct X86MemoryFoldTableEntry {
+ uint16_t RegOp;
+ uint16_t MemOp;
+ uint16_t Flags;
+};
+
+// Pin the vtable to this file.
+void X86InstrInfo::anchor() {}
+
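+// The X86GenInstrInfo base-class constructor takes the call-frame setup and
+// destroy pseudo opcodes (which differ between LP64 and 32-bit/ILP32
+// targets) plus the catch-return pseudo.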
+X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
+ : X86GenInstrInfo((STI.isTarget64BitLP64() ? X86::ADJCALLSTACKDOWN64
+ : X86::ADJCALLSTACKDOWN32),
+ (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKUP64
+ : X86::ADJCALLSTACKUP32),
+ X86::CATCHRET),
+ Subtarget(STI), RI(STI.getTargetTriple()) {
+
+ static const X86MemoryFoldTableEntry MemoryFoldTable2Addr[] = {
+ { X86::ADC32ri, X86::ADC32mi, 0 },
+ { X86::ADC32ri8, X86::ADC32mi8, 0 },
+ { X86::ADC32rr, X86::ADC32mr, 0 },
+ { X86::ADC64ri32, X86::ADC64mi32, 0 },
+ { X86::ADC64ri8, X86::ADC64mi8, 0 },
+ { X86::ADC64rr, X86::ADC64mr, 0 },
+ { X86::ADD16ri, X86::ADD16mi, 0 },
+ { X86::ADD16ri8, X86::ADD16mi8, 0 },
+ { X86::ADD16ri_DB, X86::ADD16mi, TB_NO_REVERSE },
+ { X86::ADD16ri8_DB, X86::ADD16mi8, TB_NO_REVERSE },
+ { X86::ADD16rr, X86::ADD16mr, 0 },
+ { X86::ADD16rr_DB, X86::ADD16mr, TB_NO_REVERSE },
+ { X86::ADD32ri, X86::ADD32mi, 0 },
+ { X86::ADD32ri8, X86::ADD32mi8, 0 },
+ { X86::ADD32ri_DB, X86::ADD32mi, TB_NO_REVERSE },
+ { X86::ADD32ri8_DB, X86::ADD32mi8, TB_NO_REVERSE },
+ { X86::ADD32rr, X86::ADD32mr, 0 },
+ { X86::ADD32rr_DB, X86::ADD32mr, TB_NO_REVERSE },
+ { X86::ADD64ri32, X86::ADD64mi32, 0 },
+ { X86::ADD64ri8, X86::ADD64mi8, 0 },
+ { X86::ADD64ri32_DB,X86::ADD64mi32, TB_NO_REVERSE },
+ { X86::ADD64ri8_DB, X86::ADD64mi8, TB_NO_REVERSE },
+ { X86::ADD64rr, X86::ADD64mr, 0 },
+ { X86::ADD64rr_DB, X86::ADD64mr, TB_NO_REVERSE },
+ { X86::ADD8ri, X86::ADD8mi, 0 },
+ { X86::ADD8rr, X86::ADD8mr, 0 },
+ { X86::AND16ri, X86::AND16mi, 0 },
+ { X86::AND16ri8, X86::AND16mi8, 0 },
+ { X86::AND16rr, X86::AND16mr, 0 },
+ { X86::AND32ri, X86::AND32mi, 0 },
+ { X86::AND32ri8, X86::AND32mi8, 0 },
+ { X86::AND32rr, X86::AND32mr, 0 },
+ { X86::AND64ri32, X86::AND64mi32, 0 },
+ { X86::AND64ri8, X86::AND64mi8, 0 },
+ { X86::AND64rr, X86::AND64mr, 0 },
+ { X86::AND8ri, X86::AND8mi, 0 },
+ { X86::AND8rr, X86::AND8mr, 0 },
+ { X86::DEC16r, X86::DEC16m, 0 },
+ { X86::DEC32r, X86::DEC32m, 0 },
+ { X86::DEC64r, X86::DEC64m, 0 },
+ { X86::DEC8r, X86::DEC8m, 0 },
+ { X86::INC16r, X86::INC16m, 0 },
+ { X86::INC32r, X86::INC32m, 0 },
+ { X86::INC64r, X86::INC64m, 0 },
+ { X86::INC8r, X86::INC8m, 0 },
+ { X86::NEG16r, X86::NEG16m, 0 },
+ { X86::NEG32r, X86::NEG32m, 0 },
+ { X86::NEG64r, X86::NEG64m, 0 },
+ { X86::NEG8r, X86::NEG8m, 0 },
+ { X86::NOT16r, X86::NOT16m, 0 },
+ { X86::NOT32r, X86::NOT32m, 0 },
+ { X86::NOT64r, X86::NOT64m, 0 },
+ { X86::NOT8r, X86::NOT8m, 0 },
+ { X86::OR16ri, X86::OR16mi, 0 },
+ { X86::OR16ri8, X86::OR16mi8, 0 },
+ { X86::OR16rr, X86::OR16mr, 0 },
+ { X86::OR32ri, X86::OR32mi, 0 },
+ { X86::OR32ri8, X86::OR32mi8, 0 },
+ { X86::OR32rr, X86::OR32mr, 0 },
+ { X86::OR64ri32, X86::OR64mi32, 0 },
+ { X86::OR64ri8, X86::OR64mi8, 0 },
+ { X86::OR64rr, X86::OR64mr, 0 },
+ { X86::OR8ri, X86::OR8mi, 0 },
+ { X86::OR8rr, X86::OR8mr, 0 },
+ { X86::ROL16r1, X86::ROL16m1, 0 },
+ { X86::ROL16rCL, X86::ROL16mCL, 0 },
+ { X86::ROL16ri, X86::ROL16mi, 0 },
+ { X86::ROL32r1, X86::ROL32m1, 0 },
+ { X86::ROL32rCL, X86::ROL32mCL, 0 },
+ { X86::ROL32ri, X86::ROL32mi, 0 },
+ { X86::ROL64r1, X86::ROL64m1, 0 },
+ { X86::ROL64rCL, X86::ROL64mCL, 0 },
+ { X86::ROL64ri, X86::ROL64mi, 0 },
+ { X86::ROL8r1, X86::ROL8m1, 0 },
+ { X86::ROL8rCL, X86::ROL8mCL, 0 },
+ { X86::ROL8ri, X86::ROL8mi, 0 },
+ { X86::ROR16r1, X86::ROR16m1, 0 },
+ { X86::ROR16rCL, X86::ROR16mCL, 0 },
+ { X86::ROR16ri, X86::ROR16mi, 0 },
+ { X86::ROR32r1, X86::ROR32m1, 0 },
+ { X86::ROR32rCL, X86::ROR32mCL, 0 },
+ { X86::ROR32ri, X86::ROR32mi, 0 },
+ { X86::ROR64r1, X86::ROR64m1, 0 },
+ { X86::ROR64rCL, X86::ROR64mCL, 0 },
+ { X86::ROR64ri, X86::ROR64mi, 0 },
+ { X86::ROR8r1, X86::ROR8m1, 0 },
+ { X86::ROR8rCL, X86::ROR8mCL, 0 },
+ { X86::ROR8ri, X86::ROR8mi, 0 },
+ { X86::SAR16r1, X86::SAR16m1, 0 },
+ { X86::SAR16rCL, X86::SAR16mCL, 0 },
+ { X86::SAR16ri, X86::SAR16mi, 0 },
+ { X86::SAR32r1, X86::SAR32m1, 0 },
+ { X86::SAR32rCL, X86::SAR32mCL, 0 },
+ { X86::SAR32ri, X86::SAR32mi, 0 },
+ { X86::SAR64r1, X86::SAR64m1, 0 },
+ { X86::SAR64rCL, X86::SAR64mCL, 0 },
+ { X86::SAR64ri, X86::SAR64mi, 0 },
+ { X86::SAR8r1, X86::SAR8m1, 0 },
+ { X86::SAR8rCL, X86::SAR8mCL, 0 },
+ { X86::SAR8ri, X86::SAR8mi, 0 },
+ { X86::SBB32ri, X86::SBB32mi, 0 },
+ { X86::SBB32ri8, X86::SBB32mi8, 0 },
+ { X86::SBB32rr, X86::SBB32mr, 0 },
+ { X86::SBB64ri32, X86::SBB64mi32, 0 },
+ { X86::SBB64ri8, X86::SBB64mi8, 0 },
+ { X86::SBB64rr, X86::SBB64mr, 0 },
+ { X86::SHL16rCL, X86::SHL16mCL, 0 },
+ { X86::SHL16ri, X86::SHL16mi, 0 },
+ { X86::SHL32rCL, X86::SHL32mCL, 0 },
+ { X86::SHL32ri, X86::SHL32mi, 0 },
+ { X86::SHL64rCL, X86::SHL64mCL, 0 },
+ { X86::SHL64ri, X86::SHL64mi, 0 },
+ { X86::SHL8rCL, X86::SHL8mCL, 0 },
+ { X86::SHL8ri, X86::SHL8mi, 0 },
+ { X86::SHLD16rrCL, X86::SHLD16mrCL, 0 },
+ { X86::SHLD16rri8, X86::SHLD16mri8, 0 },
+ { X86::SHLD32rrCL, X86::SHLD32mrCL, 0 },
+ { X86::SHLD32rri8, X86::SHLD32mri8, 0 },
+ { X86::SHLD64rrCL, X86::SHLD64mrCL, 0 },
+ { X86::SHLD64rri8, X86::SHLD64mri8, 0 },
+ { X86::SHR16r1, X86::SHR16m1, 0 },
+ { X86::SHR16rCL, X86::SHR16mCL, 0 },
+ { X86::SHR16ri, X86::SHR16mi, 0 },
+ { X86::SHR32r1, X86::SHR32m1, 0 },
+ { X86::SHR32rCL, X86::SHR32mCL, 0 },
+ { X86::SHR32ri, X86::SHR32mi, 0 },
+ { X86::SHR64r1, X86::SHR64m1, 0 },
+ { X86::SHR64rCL, X86::SHR64mCL, 0 },
+ { X86::SHR64ri, X86::SHR64mi, 0 },
+ { X86::SHR8r1, X86::SHR8m1, 0 },
+ { X86::SHR8rCL, X86::SHR8mCL, 0 },
+ { X86::SHR8ri, X86::SHR8mi, 0 },
+ { X86::SHRD16rrCL, X86::SHRD16mrCL, 0 },
+ { X86::SHRD16rri8, X86::SHRD16mri8, 0 },
+ { X86::SHRD32rrCL, X86::SHRD32mrCL, 0 },
+ { X86::SHRD32rri8, X86::SHRD32mri8, 0 },
+ { X86::SHRD64rrCL, X86::SHRD64mrCL, 0 },
+ { X86::SHRD64rri8, X86::SHRD64mri8, 0 },
+ { X86::SUB16ri, X86::SUB16mi, 0 },
+ { X86::SUB16ri8, X86::SUB16mi8, 0 },
+ { X86::SUB16rr, X86::SUB16mr, 0 },
+ { X86::SUB32ri, X86::SUB32mi, 0 },
+ { X86::SUB32ri8, X86::SUB32mi8, 0 },
+ { X86::SUB32rr, X86::SUB32mr, 0 },
+ { X86::SUB64ri32, X86::SUB64mi32, 0 },
+ { X86::SUB64ri8, X86::SUB64mi8, 0 },
+ { X86::SUB64rr, X86::SUB64mr, 0 },
+ { X86::SUB8ri, X86::SUB8mi, 0 },
+ { X86::SUB8rr, X86::SUB8mr, 0 },
+ { X86::XOR16ri, X86::XOR16mi, 0 },
+ { X86::XOR16ri8, X86::XOR16mi8, 0 },
+ { X86::XOR16rr, X86::XOR16mr, 0 },
+ { X86::XOR32ri, X86::XOR32mi, 0 },
+ { X86::XOR32ri8, X86::XOR32mi8, 0 },
+ { X86::XOR32rr, X86::XOR32mr, 0 },
+ { X86::XOR64ri32, X86::XOR64mi32, 0 },
+ { X86::XOR64ri8, X86::XOR64mi8, 0 },
+ { X86::XOR64rr, X86::XOR64mr, 0 },
+ { X86::XOR8ri, X86::XOR8mi, 0 },
+ { X86::XOR8rr, X86::XOR8mr, 0 }
+ };
+
+ for (X86MemoryFoldTableEntry Entry : MemoryFoldTable2Addr) {
+ AddTableEntry(RegOp2MemOpTable2Addr, MemOp2RegOpTable,
+ Entry.RegOp, Entry.MemOp,
+ // Index 0, folded load and store, no alignment requirement.
+ Entry.Flags | TB_INDEX_0 | TB_FOLDED_LOAD | TB_FOLDED_STORE);
+ }
+
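+  // Table 0: the folded memory operand replaces operand 0, so each entry's
+  // TB_FOLDED_LOAD / TB_FOLDED_STORE flag records whether the fold acts as
+  // a load or a store.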
+ static const X86MemoryFoldTableEntry MemoryFoldTable0[] = {
+ { X86::BT16ri8, X86::BT16mi8, TB_FOLDED_LOAD },
+ { X86::BT32ri8, X86::BT32mi8, TB_FOLDED_LOAD },
+ { X86::BT64ri8, X86::BT64mi8, TB_FOLDED_LOAD },
+ { X86::CALL32r, X86::CALL32m, TB_FOLDED_LOAD },
+ { X86::CALL64r, X86::CALL64m, TB_FOLDED_LOAD },
+ { X86::CMP16ri, X86::CMP16mi, TB_FOLDED_LOAD },
+ { X86::CMP16ri8, X86::CMP16mi8, TB_FOLDED_LOAD },
+ { X86::CMP16rr, X86::CMP16mr, TB_FOLDED_LOAD },
+ { X86::CMP32ri, X86::CMP32mi, TB_FOLDED_LOAD },
+ { X86::CMP32ri8, X86::CMP32mi8, TB_FOLDED_LOAD },
+ { X86::CMP32rr, X86::CMP32mr, TB_FOLDED_LOAD },
+ { X86::CMP64ri32, X86::CMP64mi32, TB_FOLDED_LOAD },
+ { X86::CMP64ri8, X86::CMP64mi8, TB_FOLDED_LOAD },
+ { X86::CMP64rr, X86::CMP64mr, TB_FOLDED_LOAD },
+ { X86::CMP8ri, X86::CMP8mi, TB_FOLDED_LOAD },
+ { X86::CMP8rr, X86::CMP8mr, TB_FOLDED_LOAD },
+ { X86::DIV16r, X86::DIV16m, TB_FOLDED_LOAD },
+ { X86::DIV32r, X86::DIV32m, TB_FOLDED_LOAD },
+ { X86::DIV64r, X86::DIV64m, TB_FOLDED_LOAD },
+ { X86::DIV8r, X86::DIV8m, TB_FOLDED_LOAD },
+ { X86::EXTRACTPSrr, X86::EXTRACTPSmr, TB_FOLDED_STORE },
+ { X86::IDIV16r, X86::IDIV16m, TB_FOLDED_LOAD },
+ { X86::IDIV32r, X86::IDIV32m, TB_FOLDED_LOAD },
+ { X86::IDIV64r, X86::IDIV64m, TB_FOLDED_LOAD },
+ { X86::IDIV8r, X86::IDIV8m, TB_FOLDED_LOAD },
+ { X86::IMUL16r, X86::IMUL16m, TB_FOLDED_LOAD },
+ { X86::IMUL32r, X86::IMUL32m, TB_FOLDED_LOAD },
+ { X86::IMUL64r, X86::IMUL64m, TB_FOLDED_LOAD },
+ { X86::IMUL8r, X86::IMUL8m, TB_FOLDED_LOAD },
+ { X86::JMP32r, X86::JMP32m, TB_FOLDED_LOAD },
+ { X86::JMP64r, X86::JMP64m, TB_FOLDED_LOAD },
+ { X86::MOV16ri, X86::MOV16mi, TB_FOLDED_STORE },
+ { X86::MOV16rr, X86::MOV16mr, TB_FOLDED_STORE },
+ { X86::MOV32ri, X86::MOV32mi, TB_FOLDED_STORE },
+ { X86::MOV32rr, X86::MOV32mr, TB_FOLDED_STORE },
+ { X86::MOV64ri32, X86::MOV64mi32, TB_FOLDED_STORE },
+ { X86::MOV64rr, X86::MOV64mr, TB_FOLDED_STORE },
+ { X86::MOV8ri, X86::MOV8mi, TB_FOLDED_STORE },
+ { X86::MOV8rr, X86::MOV8mr, TB_FOLDED_STORE },
+ { X86::MOV8rr_NOREX, X86::MOV8mr_NOREX, TB_FOLDED_STORE },
+ { X86::MOVAPDrr, X86::MOVAPDmr, TB_FOLDED_STORE | TB_ALIGN_16 },
+ { X86::MOVAPSrr, X86::MOVAPSmr, TB_FOLDED_STORE | TB_ALIGN_16 },
+ { X86::MOVDQArr, X86::MOVDQAmr, TB_FOLDED_STORE | TB_ALIGN_16 },
+ { X86::MOVPDI2DIrr, X86::MOVPDI2DImr, TB_FOLDED_STORE },
+ { X86::MOVPQIto64rr,X86::MOVPQI2QImr, TB_FOLDED_STORE },
+ { X86::MOVSDto64rr, X86::MOVSDto64mr, TB_FOLDED_STORE },
+ { X86::MOVSS2DIrr, X86::MOVSS2DImr, TB_FOLDED_STORE },
+ { X86::MOVUPDrr, X86::MOVUPDmr, TB_FOLDED_STORE },
+ { X86::MOVUPSrr, X86::MOVUPSmr, TB_FOLDED_STORE },
+ { X86::MUL16r, X86::MUL16m, TB_FOLDED_LOAD },
+ { X86::MUL32r, X86::MUL32m, TB_FOLDED_LOAD },
+ { X86::MUL64r, X86::MUL64m, TB_FOLDED_LOAD },
+ { X86::MUL8r, X86::MUL8m, TB_FOLDED_LOAD },
+ { X86::PEXTRDrr, X86::PEXTRDmr, TB_FOLDED_STORE },
+ { X86::PEXTRQrr, X86::PEXTRQmr, TB_FOLDED_STORE },
+ { X86::PUSH16r, X86::PUSH16rmm, TB_FOLDED_LOAD },
+ { X86::PUSH32r, X86::PUSH32rmm, TB_FOLDED_LOAD },
+ { X86::PUSH64r, X86::PUSH64rmm, TB_FOLDED_LOAD },
+ { X86::SETAEr, X86::SETAEm, TB_FOLDED_STORE },
+ { X86::SETAr, X86::SETAm, TB_FOLDED_STORE },
+ { X86::SETBEr, X86::SETBEm, TB_FOLDED_STORE },
+ { X86::SETBr, X86::SETBm, TB_FOLDED_STORE },
+ { X86::SETEr, X86::SETEm, TB_FOLDED_STORE },
+ { X86::SETGEr, X86::SETGEm, TB_FOLDED_STORE },
+ { X86::SETGr, X86::SETGm, TB_FOLDED_STORE },
+ { X86::SETLEr, X86::SETLEm, TB_FOLDED_STORE },
+ { X86::SETLr, X86::SETLm, TB_FOLDED_STORE },
+ { X86::SETNEr, X86::SETNEm, TB_FOLDED_STORE },
+ { X86::SETNOr, X86::SETNOm, TB_FOLDED_STORE },
+ { X86::SETNPr, X86::SETNPm, TB_FOLDED_STORE },
+ { X86::SETNSr, X86::SETNSm, TB_FOLDED_STORE },
+ { X86::SETOr, X86::SETOm, TB_FOLDED_STORE },
+ { X86::SETPr, X86::SETPm, TB_FOLDED_STORE },
+ { X86::SETSr, X86::SETSm, TB_FOLDED_STORE },
+ { X86::TAILJMPr, X86::TAILJMPm, TB_FOLDED_LOAD },
+ { X86::TAILJMPr64, X86::TAILJMPm64, TB_FOLDED_LOAD },
+ { X86::TAILJMPr64_REX, X86::TAILJMPm64_REX, TB_FOLDED_LOAD },
+ { X86::TEST16ri, X86::TEST16mi, TB_FOLDED_LOAD },
+ { X86::TEST32ri, X86::TEST32mi, TB_FOLDED_LOAD },
+ { X86::TEST64ri32, X86::TEST64mi32, TB_FOLDED_LOAD },
+ { X86::TEST8ri, X86::TEST8mi, TB_FOLDED_LOAD },
+
+ // AVX 128-bit versions of foldable instructions
+ { X86::VEXTRACTPSrr,X86::VEXTRACTPSmr, TB_FOLDED_STORE },
+ { X86::VEXTRACTF128rr, X86::VEXTRACTF128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
+ { X86::VMOVAPDrr, X86::VMOVAPDmr, TB_FOLDED_STORE | TB_ALIGN_16 },
+ { X86::VMOVAPSrr, X86::VMOVAPSmr, TB_FOLDED_STORE | TB_ALIGN_16 },
+ { X86::VMOVDQArr, X86::VMOVDQAmr, TB_FOLDED_STORE | TB_ALIGN_16 },
+ { X86::VMOVPDI2DIrr,X86::VMOVPDI2DImr, TB_FOLDED_STORE },
+ { X86::VMOVPQIto64rr, X86::VMOVPQI2QImr,TB_FOLDED_STORE },
+ { X86::VMOVSDto64rr,X86::VMOVSDto64mr, TB_FOLDED_STORE },
+ { X86::VMOVSS2DIrr, X86::VMOVSS2DImr, TB_FOLDED_STORE },
+ { X86::VMOVUPDrr, X86::VMOVUPDmr, TB_FOLDED_STORE },
+ { X86::VMOVUPSrr, X86::VMOVUPSmr, TB_FOLDED_STORE },
+ { X86::VPEXTRDrr, X86::VPEXTRDmr, TB_FOLDED_STORE },
+ { X86::VPEXTRQrr, X86::VPEXTRQmr, TB_FOLDED_STORE },
+
+ // AVX 256-bit foldable instructions
+ { X86::VEXTRACTI128rr, X86::VEXTRACTI128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
+ { X86::VMOVAPDYrr, X86::VMOVAPDYmr, TB_FOLDED_STORE | TB_ALIGN_32 },
+ { X86::VMOVAPSYrr, X86::VMOVAPSYmr, TB_FOLDED_STORE | TB_ALIGN_32 },
+ { X86::VMOVDQAYrr, X86::VMOVDQAYmr, TB_FOLDED_STORE | TB_ALIGN_32 },
+ { X86::VMOVUPDYrr, X86::VMOVUPDYmr, TB_FOLDED_STORE },
+ { X86::VMOVUPSYrr, X86::VMOVUPSYmr, TB_FOLDED_STORE },
+
+ // AVX-512 foldable instructions
+ { X86::VMOVPDI2DIZrr, X86::VMOVPDI2DIZmr, TB_FOLDED_STORE },
+ { X86::VMOVAPDZrr, X86::VMOVAPDZmr, TB_FOLDED_STORE | TB_ALIGN_64 },
+ { X86::VMOVAPSZrr, X86::VMOVAPSZmr, TB_FOLDED_STORE | TB_ALIGN_64 },
+ { X86::VMOVDQA32Zrr, X86::VMOVDQA32Zmr, TB_FOLDED_STORE | TB_ALIGN_64 },
+ { X86::VMOVDQA64Zrr, X86::VMOVDQA64Zmr, TB_FOLDED_STORE | TB_ALIGN_64 },
+ { X86::VMOVUPDZrr, X86::VMOVUPDZmr, TB_FOLDED_STORE },
+ { X86::VMOVUPSZrr, X86::VMOVUPSZmr, TB_FOLDED_STORE },
+ { X86::VMOVDQU8Zrr, X86::VMOVDQU8Zmr, TB_FOLDED_STORE },
+ { X86::VMOVDQU16Zrr, X86::VMOVDQU16Zmr, TB_FOLDED_STORE },
+ { X86::VMOVDQU32Zrr, X86::VMOVDQU32Zmr, TB_FOLDED_STORE },
+ { X86::VMOVDQU64Zrr, X86::VMOVDQU64Zmr, TB_FOLDED_STORE },
+
+ // AVX-512 foldable instructions (256-bit versions)
+ { X86::VMOVAPDZ256rr, X86::VMOVAPDZ256mr, TB_FOLDED_STORE | TB_ALIGN_32 },
+ { X86::VMOVAPSZ256rr, X86::VMOVAPSZ256mr, TB_FOLDED_STORE | TB_ALIGN_32 },
+ { X86::VMOVDQA32Z256rr, X86::VMOVDQA32Z256mr, TB_FOLDED_STORE | TB_ALIGN_32 },
+ { X86::VMOVDQA64Z256rr, X86::VMOVDQA64Z256mr, TB_FOLDED_STORE | TB_ALIGN_32 },
+ { X86::VMOVUPDZ256rr, X86::VMOVUPDZ256mr, TB_FOLDED_STORE },
+ { X86::VMOVUPSZ256rr, X86::VMOVUPSZ256mr, TB_FOLDED_STORE },
+ { X86::VMOVDQU8Z256rr, X86::VMOVDQU8Z256mr, TB_FOLDED_STORE },
+ { X86::VMOVDQU16Z256rr, X86::VMOVDQU16Z256mr, TB_FOLDED_STORE },
+ { X86::VMOVDQU32Z256rr, X86::VMOVDQU32Z256mr, TB_FOLDED_STORE },
+ { X86::VMOVDQU64Z256rr, X86::VMOVDQU64Z256mr, TB_FOLDED_STORE },
+
+ // AVX-512 foldable instructions (128-bit versions)
+ { X86::VMOVAPDZ128rr, X86::VMOVAPDZ128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
+ { X86::VMOVAPSZ128rr, X86::VMOVAPSZ128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
+ { X86::VMOVDQA32Z128rr, X86::VMOVDQA32Z128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
+ { X86::VMOVDQA64Z128rr, X86::VMOVDQA64Z128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
+ { X86::VMOVUPDZ128rr, X86::VMOVUPDZ128mr, TB_FOLDED_STORE },
+ { X86::VMOVUPSZ128rr, X86::VMOVUPSZ128mr, TB_FOLDED_STORE },
+ { X86::VMOVDQU8Z128rr, X86::VMOVDQU8Z128mr, TB_FOLDED_STORE },
+ { X86::VMOVDQU16Z128rr, X86::VMOVDQU16Z128mr, TB_FOLDED_STORE },
+ { X86::VMOVDQU32Z128rr, X86::VMOVDQU32Z128mr, TB_FOLDED_STORE },
+ { X86::VMOVDQU64Z128rr, X86::VMOVDQU64Z128mr, TB_FOLDED_STORE },
+
+ // F16C foldable instructions
+ { X86::VCVTPS2PHrr, X86::VCVTPS2PHmr, TB_FOLDED_STORE },
+ { X86::VCVTPS2PHYrr, X86::VCVTPS2PHYmr, TB_FOLDED_STORE }
+ };
+
+ for (X86MemoryFoldTableEntry Entry : MemoryFoldTable0) {
+ AddTableEntry(RegOp2MemOpTable0, MemOp2RegOpTable,
+ Entry.RegOp, Entry.MemOp, TB_INDEX_0 | Entry.Flags);
+ }
+
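+  // Table 1: unary operations; the single source operand (index 1) is
+  // replaced by a load.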
+ static const X86MemoryFoldTableEntry MemoryFoldTable1[] = {
+ { X86::BSF16rr, X86::BSF16rm, 0 },
+ { X86::BSF32rr, X86::BSF32rm, 0 },
+ { X86::BSF64rr, X86::BSF64rm, 0 },
+ { X86::BSR16rr, X86::BSR16rm, 0 },
+ { X86::BSR32rr, X86::BSR32rm, 0 },
+ { X86::BSR64rr, X86::BSR64rm, 0 },
+ { X86::CMP16rr, X86::CMP16rm, 0 },
+ { X86::CMP32rr, X86::CMP32rm, 0 },
+ { X86::CMP64rr, X86::CMP64rm, 0 },
+ { X86::CMP8rr, X86::CMP8rm, 0 },
+ { X86::CVTSD2SSrr, X86::CVTSD2SSrm, 0 },
+ { X86::CVTSI2SD64rr, X86::CVTSI2SD64rm, 0 },
+ { X86::CVTSI2SDrr, X86::CVTSI2SDrm, 0 },
+ { X86::CVTSI2SS64rr, X86::CVTSI2SS64rm, 0 },
+ { X86::CVTSI2SSrr, X86::CVTSI2SSrm, 0 },
+ { X86::CVTSS2SDrr, X86::CVTSS2SDrm, 0 },
+ { X86::CVTTSD2SI64rr, X86::CVTTSD2SI64rm, 0 },
+ { X86::CVTTSD2SIrr, X86::CVTTSD2SIrm, 0 },
+ { X86::CVTTSS2SI64rr, X86::CVTTSS2SI64rm, 0 },
+ { X86::CVTTSS2SIrr, X86::CVTTSS2SIrm, 0 },
+ { X86::IMUL16rri, X86::IMUL16rmi, 0 },
+ { X86::IMUL16rri8, X86::IMUL16rmi8, 0 },
+ { X86::IMUL32rri, X86::IMUL32rmi, 0 },
+ { X86::IMUL32rri8, X86::IMUL32rmi8, 0 },
+ { X86::IMUL64rri32, X86::IMUL64rmi32, 0 },
+ { X86::IMUL64rri8, X86::IMUL64rmi8, 0 },
+ { X86::Int_COMISDrr, X86::Int_COMISDrm, 0 },
+ { X86::Int_COMISSrr, X86::Int_COMISSrm, 0 },
+ { X86::CVTSD2SI64rr, X86::CVTSD2SI64rm, 0 },
+ { X86::CVTSD2SIrr, X86::CVTSD2SIrm, 0 },
+ { X86::CVTSS2SI64rr, X86::CVTSS2SI64rm, 0 },
+ { X86::CVTSS2SIrr, X86::CVTSS2SIrm, 0 },
+ { X86::CVTDQ2PDrr, X86::CVTDQ2PDrm, TB_ALIGN_16 },
+ { X86::CVTDQ2PSrr, X86::CVTDQ2PSrm, TB_ALIGN_16 },
+ { X86::CVTPD2DQrr, X86::CVTPD2DQrm, TB_ALIGN_16 },
+ { X86::CVTPD2PSrr, X86::CVTPD2PSrm, TB_ALIGN_16 },
+ { X86::CVTPS2DQrr, X86::CVTPS2DQrm, TB_ALIGN_16 },
+ { X86::CVTPS2PDrr, X86::CVTPS2PDrm, TB_ALIGN_16 },
+ { X86::CVTTPD2DQrr, X86::CVTTPD2DQrm, TB_ALIGN_16 },
+ { X86::CVTTPS2DQrr, X86::CVTTPS2DQrm, TB_ALIGN_16 },
+ { X86::Int_CVTTSD2SI64rr,X86::Int_CVTTSD2SI64rm, 0 },
+ { X86::Int_CVTTSD2SIrr, X86::Int_CVTTSD2SIrm, 0 },
+ { X86::Int_CVTTSS2SI64rr,X86::Int_CVTTSS2SI64rm, 0 },
+ { X86::Int_CVTTSS2SIrr, X86::Int_CVTTSS2SIrm, 0 },
+ { X86::Int_UCOMISDrr, X86::Int_UCOMISDrm, 0 },
+ { X86::Int_UCOMISSrr, X86::Int_UCOMISSrm, 0 },
+ { X86::MOV16rr, X86::MOV16rm, 0 },
+ { X86::MOV32rr, X86::MOV32rm, 0 },
+ { X86::MOV64rr, X86::MOV64rm, 0 },
+ { X86::MOV64toPQIrr, X86::MOVQI2PQIrm, 0 },
+ { X86::MOV64toSDrr, X86::MOV64toSDrm, 0 },
+ { X86::MOV8rr, X86::MOV8rm, 0 },
+ { X86::MOVAPDrr, X86::MOVAPDrm, TB_ALIGN_16 },
+ { X86::MOVAPSrr, X86::MOVAPSrm, TB_ALIGN_16 },
+ { X86::MOVDDUPrr, X86::MOVDDUPrm, 0 },
+ { X86::MOVDI2PDIrr, X86::MOVDI2PDIrm, 0 },
+ { X86::MOVDI2SSrr, X86::MOVDI2SSrm, 0 },
+ { X86::MOVDQArr, X86::MOVDQArm, TB_ALIGN_16 },
+ { X86::MOVSHDUPrr, X86::MOVSHDUPrm, TB_ALIGN_16 },
+ { X86::MOVSLDUPrr, X86::MOVSLDUPrm, TB_ALIGN_16 },
+ { X86::MOVSX16rr8, X86::MOVSX16rm8, 0 },
+ { X86::MOVSX32rr16, X86::MOVSX32rm16, 0 },
+ { X86::MOVSX32rr8, X86::MOVSX32rm8, 0 },
+ { X86::MOVSX64rr16, X86::MOVSX64rm16, 0 },
+ { X86::MOVSX64rr32, X86::MOVSX64rm32, 0 },
+ { X86::MOVSX64rr8, X86::MOVSX64rm8, 0 },
+ { X86::MOVUPDrr, X86::MOVUPDrm, TB_ALIGN_16 },
+ { X86::MOVUPSrr, X86::MOVUPSrm, 0 },
+ { X86::MOVZPQILo2PQIrr, X86::MOVZPQILo2PQIrm, TB_ALIGN_16 },
+ { X86::MOVZX16rr8, X86::MOVZX16rm8, 0 },
+ { X86::MOVZX32rr16, X86::MOVZX32rm16, 0 },
+ { X86::MOVZX32_NOREXrr8, X86::MOVZX32_NOREXrm8, 0 },
+ { X86::MOVZX32rr8, X86::MOVZX32rm8, 0 },
+ { X86::PABSBrr128, X86::PABSBrm128, TB_ALIGN_16 },
+ { X86::PABSDrr128, X86::PABSDrm128, TB_ALIGN_16 },
+ { X86::PABSWrr128, X86::PABSWrm128, TB_ALIGN_16 },
+ { X86::PCMPESTRIrr, X86::PCMPESTRIrm, TB_ALIGN_16 },
+ { X86::PCMPESTRM128rr, X86::PCMPESTRM128rm, TB_ALIGN_16 },
+ { X86::PCMPISTRIrr, X86::PCMPISTRIrm, TB_ALIGN_16 },
+ { X86::PCMPISTRM128rr, X86::PCMPISTRM128rm, TB_ALIGN_16 },
+ { X86::PHMINPOSUWrr128, X86::PHMINPOSUWrm128, TB_ALIGN_16 },
+ { X86::PMOVSXBDrr, X86::PMOVSXBDrm, TB_ALIGN_16 },
+ { X86::PMOVSXBQrr, X86::PMOVSXBQrm, TB_ALIGN_16 },
+ { X86::PMOVSXBWrr, X86::PMOVSXBWrm, TB_ALIGN_16 },
+ { X86::PMOVSXDQrr, X86::PMOVSXDQrm, TB_ALIGN_16 },
+ { X86::PMOVSXWDrr, X86::PMOVSXWDrm, TB_ALIGN_16 },
+ { X86::PMOVSXWQrr, X86::PMOVSXWQrm, TB_ALIGN_16 },
+ { X86::PMOVZXBDrr, X86::PMOVZXBDrm, TB_ALIGN_16 },
+ { X86::PMOVZXBQrr, X86::PMOVZXBQrm, TB_ALIGN_16 },
+ { X86::PMOVZXBWrr, X86::PMOVZXBWrm, TB_ALIGN_16 },
+ { X86::PMOVZXDQrr, X86::PMOVZXDQrm, TB_ALIGN_16 },
+ { X86::PMOVZXWDrr, X86::PMOVZXWDrm, TB_ALIGN_16 },
+ { X86::PMOVZXWQrr, X86::PMOVZXWQrm, TB_ALIGN_16 },
+ { X86::PSHUFDri, X86::PSHUFDmi, TB_ALIGN_16 },
+ { X86::PSHUFHWri, X86::PSHUFHWmi, TB_ALIGN_16 },
+ { X86::PSHUFLWri, X86::PSHUFLWmi, TB_ALIGN_16 },
+ { X86::PTESTrr, X86::PTESTrm, TB_ALIGN_16 },
+ { X86::RCPPSr, X86::RCPPSm, TB_ALIGN_16 },
+ { X86::RCPSSr, X86::RCPSSm, 0 },
+ { X86::RCPSSr_Int, X86::RCPSSm_Int, 0 },
+ { X86::ROUNDPDr, X86::ROUNDPDm, TB_ALIGN_16 },
+ { X86::ROUNDPSr, X86::ROUNDPSm, TB_ALIGN_16 },
+ { X86::RSQRTPSr, X86::RSQRTPSm, TB_ALIGN_16 },
+ { X86::RSQRTSSr, X86::RSQRTSSm, 0 },
+ { X86::RSQRTSSr_Int, X86::RSQRTSSm_Int, 0 },
+ { X86::SQRTPDr, X86::SQRTPDm, TB_ALIGN_16 },
+ { X86::SQRTPSr, X86::SQRTPSm, TB_ALIGN_16 },
+ { X86::SQRTSDr, X86::SQRTSDm, 0 },
+ { X86::SQRTSDr_Int, X86::SQRTSDm_Int, 0 },
+ { X86::SQRTSSr, X86::SQRTSSm, 0 },
+ { X86::SQRTSSr_Int, X86::SQRTSSm_Int, 0 },
+ { X86::TEST16rr, X86::TEST16rm, 0 },
+ { X86::TEST32rr, X86::TEST32rm, 0 },
+ { X86::TEST64rr, X86::TEST64rm, 0 },
+ { X86::TEST8rr, X86::TEST8rm, 0 },
+ // FIXME: TEST*rr EAX,EAX ---> CMP [mem], 0
+ { X86::UCOMISDrr, X86::UCOMISDrm, 0 },
+ { X86::UCOMISSrr, X86::UCOMISSrm, 0 },
+
+ // MMX version of foldable instructions
+ { X86::MMX_CVTPD2PIirr, X86::MMX_CVTPD2PIirm, 0 },
+ { X86::MMX_CVTPI2PDirr, X86::MMX_CVTPI2PDirm, 0 },
+ { X86::MMX_CVTPS2PIirr, X86::MMX_CVTPS2PIirm, 0 },
+ { X86::MMX_CVTTPD2PIirr, X86::MMX_CVTTPD2PIirm, 0 },
+ { X86::MMX_CVTTPS2PIirr, X86::MMX_CVTTPS2PIirm, 0 },
+ { X86::MMX_MOVD64to64rr, X86::MMX_MOVQ64rm, 0 },
+ { X86::MMX_PABSBrr64, X86::MMX_PABSBrm64, 0 },
+ { X86::MMX_PABSDrr64, X86::MMX_PABSDrm64, 0 },
+ { X86::MMX_PABSWrr64, X86::MMX_PABSWrm64, 0 },
+ { X86::MMX_PSHUFWri, X86::MMX_PSHUFWmi, 0 },
+
+ // 3DNow! version of foldable instructions
+ { X86::PF2IDrr, X86::PF2IDrm, 0 },
+ { X86::PF2IWrr, X86::PF2IWrm, 0 },
+ { X86::PFRCPrr, X86::PFRCPrm, 0 },
+ { X86::PFRSQRTrr, X86::PFRSQRTrm, 0 },
+ { X86::PI2FDrr, X86::PI2FDrm, 0 },
+ { X86::PI2FWrr, X86::PI2FWrm, 0 },
+ { X86::PSWAPDrr, X86::PSWAPDrm, 0 },
+
+ // AVX 128-bit versions of foldable instructions
+ { X86::Int_VCOMISDrr, X86::Int_VCOMISDrm, 0 },
+ { X86::Int_VCOMISSrr, X86::Int_VCOMISSrm, 0 },
+ { X86::Int_VUCOMISDrr, X86::Int_VUCOMISDrm, 0 },
+ { X86::Int_VUCOMISSrr, X86::Int_VUCOMISSrm, 0 },
+ { X86::VCVTTSD2SI64rr, X86::VCVTTSD2SI64rm, 0 },
+ { X86::Int_VCVTTSD2SI64rr,X86::Int_VCVTTSD2SI64rm,0 },
+ { X86::VCVTTSD2SIrr, X86::VCVTTSD2SIrm, 0 },
+ { X86::Int_VCVTTSD2SIrr,X86::Int_VCVTTSD2SIrm, 0 },
+ { X86::VCVTTSS2SI64rr, X86::VCVTTSS2SI64rm, 0 },
+ { X86::Int_VCVTTSS2SI64rr,X86::Int_VCVTTSS2SI64rm,0 },
+ { X86::VCVTTSS2SIrr, X86::VCVTTSS2SIrm, 0 },
+ { X86::Int_VCVTTSS2SIrr,X86::Int_VCVTTSS2SIrm, 0 },
+ { X86::VCVTSD2SI64rr, X86::VCVTSD2SI64rm, 0 },
+ { X86::VCVTSD2SIrr, X86::VCVTSD2SIrm, 0 },
+ { X86::VCVTSS2SI64rr, X86::VCVTSS2SI64rm, 0 },
+ { X86::VCVTSS2SIrr, X86::VCVTSS2SIrm, 0 },
+ { X86::VCVTDQ2PDrr, X86::VCVTDQ2PDrm, 0 },
+ { X86::VCVTDQ2PSrr, X86::VCVTDQ2PSrm, 0 },
+ { X86::VCVTPD2DQrr, X86::VCVTPD2DQXrm, 0 },
+ { X86::VCVTPD2PSrr, X86::VCVTPD2PSXrm, 0 },
+ { X86::VCVTPS2DQrr, X86::VCVTPS2DQrm, 0 },
+ { X86::VCVTPS2PDrr, X86::VCVTPS2PDrm, 0 },
+ { X86::VCVTTPD2DQrr, X86::VCVTTPD2DQXrm, 0 },
+ { X86::VCVTTPS2DQrr, X86::VCVTTPS2DQrm, 0 },
+ { X86::VMOV64toPQIrr, X86::VMOVQI2PQIrm, 0 },
+ { X86::VMOV64toSDrr, X86::VMOV64toSDrm, 0 },
+ { X86::VMOVAPDrr, X86::VMOVAPDrm, TB_ALIGN_16 },
+ { X86::VMOVAPSrr, X86::VMOVAPSrm, TB_ALIGN_16 },
+ { X86::VMOVDDUPrr, X86::VMOVDDUPrm, 0 },
+ { X86::VMOVDI2PDIrr, X86::VMOVDI2PDIrm, 0 },
+ { X86::VMOVDI2SSrr, X86::VMOVDI2SSrm, 0 },
+ { X86::VMOVDQArr, X86::VMOVDQArm, TB_ALIGN_16 },
+ { X86::VMOVSLDUPrr, X86::VMOVSLDUPrm, 0 },
+ { X86::VMOVSHDUPrr, X86::VMOVSHDUPrm, 0 },
+ { X86::VMOVUPDrr, X86::VMOVUPDrm, 0 },
+ { X86::VMOVUPSrr, X86::VMOVUPSrm, 0 },
+ { X86::VMOVZPQILo2PQIrr,X86::VMOVZPQILo2PQIrm, TB_ALIGN_16 },
+ { X86::VPABSBrr128, X86::VPABSBrm128, 0 },
+ { X86::VPABSDrr128, X86::VPABSDrm128, 0 },
+ { X86::VPABSWrr128, X86::VPABSWrm128, 0 },
+ { X86::VPCMPESTRIrr, X86::VPCMPESTRIrm, 0 },
+ { X86::VPCMPESTRM128rr, X86::VPCMPESTRM128rm, 0 },
+ { X86::VPCMPISTRIrr, X86::VPCMPISTRIrm, 0 },
+ { X86::VPCMPISTRM128rr, X86::VPCMPISTRM128rm, 0 },
+ { X86::VPHMINPOSUWrr128, X86::VPHMINPOSUWrm128, 0 },
+ { X86::VPERMILPDri, X86::VPERMILPDmi, 0 },
+ { X86::VPERMILPSri, X86::VPERMILPSmi, 0 },
+ { X86::VPMOVSXBDrr, X86::VPMOVSXBDrm, 0 },
+ { X86::VPMOVSXBQrr, X86::VPMOVSXBQrm, 0 },
+ { X86::VPMOVSXBWrr, X86::VPMOVSXBWrm, 0 },
+ { X86::VPMOVSXDQrr, X86::VPMOVSXDQrm, 0 },
+ { X86::VPMOVSXWDrr, X86::VPMOVSXWDrm, 0 },
+ { X86::VPMOVSXWQrr, X86::VPMOVSXWQrm, 0 },
+ { X86::VPMOVZXBDrr, X86::VPMOVZXBDrm, 0 },
+ { X86::VPMOVZXBQrr, X86::VPMOVZXBQrm, 0 },
+ { X86::VPMOVZXBWrr, X86::VPMOVZXBWrm, 0 },
+ { X86::VPMOVZXDQrr, X86::VPMOVZXDQrm, 0 },
+ { X86::VPMOVZXWDrr, X86::VPMOVZXWDrm, 0 },
+ { X86::VPMOVZXWQrr, X86::VPMOVZXWQrm, 0 },
+ { X86::VPSHUFDri, X86::VPSHUFDmi, 0 },
+ { X86::VPSHUFHWri, X86::VPSHUFHWmi, 0 },
+ { X86::VPSHUFLWri, X86::VPSHUFLWmi, 0 },
+ { X86::VPTESTrr, X86::VPTESTrm, 0 },
+ { X86::VRCPPSr, X86::VRCPPSm, 0 },
+ { X86::VROUNDPDr, X86::VROUNDPDm, 0 },
+ { X86::VROUNDPSr, X86::VROUNDPSm, 0 },
+ { X86::VRSQRTPSr, X86::VRSQRTPSm, 0 },
+ { X86::VSQRTPDr, X86::VSQRTPDm, 0 },
+ { X86::VSQRTPSr, X86::VSQRTPSm, 0 },
+ { X86::VTESTPDrr, X86::VTESTPDrm, 0 },
+ { X86::VTESTPSrr, X86::VTESTPSrm, 0 },
+ { X86::VUCOMISDrr, X86::VUCOMISDrm, 0 },
+ { X86::VUCOMISSrr, X86::VUCOMISSrm, 0 },
+
+ // AVX 256-bit foldable instructions
+ { X86::VCVTDQ2PDYrr, X86::VCVTDQ2PDYrm, 0 },
+ { X86::VCVTDQ2PSYrr, X86::VCVTDQ2PSYrm, 0 },
+ { X86::VCVTPD2DQYrr, X86::VCVTPD2DQYrm, 0 },
+ { X86::VCVTPD2PSYrr, X86::VCVTPD2PSYrm, 0 },
+ { X86::VCVTPS2DQYrr, X86::VCVTPS2DQYrm, 0 },
+ { X86::VCVTPS2PDYrr, X86::VCVTPS2PDYrm, 0 },
+ { X86::VCVTTPD2DQYrr, X86::VCVTTPD2DQYrm, 0 },
+ { X86::VCVTTPS2DQYrr, X86::VCVTTPS2DQYrm, 0 },
+ { X86::VMOVAPDYrr, X86::VMOVAPDYrm, TB_ALIGN_32 },
+ { X86::VMOVAPSYrr, X86::VMOVAPSYrm, TB_ALIGN_32 },
+ { X86::VMOVDDUPYrr, X86::VMOVDDUPYrm, 0 },
+ { X86::VMOVDQAYrr, X86::VMOVDQAYrm, TB_ALIGN_32 },
+ { X86::VMOVSLDUPYrr, X86::VMOVSLDUPYrm, 0 },
+ { X86::VMOVSHDUPYrr, X86::VMOVSHDUPYrm, 0 },
+ { X86::VMOVUPDYrr, X86::VMOVUPDYrm, 0 },
+ { X86::VMOVUPSYrr, X86::VMOVUPSYrm, 0 },
+ { X86::VPERMILPDYri, X86::VPERMILPDYmi, 0 },
+ { X86::VPERMILPSYri, X86::VPERMILPSYmi, 0 },
+ { X86::VPTESTYrr, X86::VPTESTYrm, 0 },
+ { X86::VRCPPSYr, X86::VRCPPSYm, 0 },
+ { X86::VROUNDYPDr, X86::VROUNDYPDm, 0 },
+ { X86::VROUNDYPSr, X86::VROUNDYPSm, 0 },
+ { X86::VRSQRTPSYr, X86::VRSQRTPSYm, 0 },
+ { X86::VSQRTPDYr, X86::VSQRTPDYm, 0 },
+ { X86::VSQRTPSYr, X86::VSQRTPSYm, 0 },
+ { X86::VTESTPDYrr, X86::VTESTPDYrm, 0 },
+ { X86::VTESTPSYrr, X86::VTESTPSYrm, 0 },
+
+ // AVX2 foldable instructions
+
+ // VBROADCASTS{SD}rr register instructions were an AVX2 addition while the
+ // VBROADCASTS{SD}rm memory instructions were available from AVX1.
+ // TB_NO_REVERSE prevents unfolding from introducing an illegal instruction
+ // on AVX1 targets. The VPBROADCAST instructions are all AVX2 instructions
+ // so they don't need an equivalent limitation.
+ { X86::VBROADCASTSSrr, X86::VBROADCASTSSrm, TB_NO_REVERSE },
+ { X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrm, TB_NO_REVERSE },
+ { X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrm, TB_NO_REVERSE },
+ { X86::VPABSBrr256, X86::VPABSBrm256, 0 },
+ { X86::VPABSDrr256, X86::VPABSDrm256, 0 },
+ { X86::VPABSWrr256, X86::VPABSWrm256, 0 },
+ { X86::VPBROADCASTBrr, X86::VPBROADCASTBrm, 0 },
+ { X86::VPBROADCASTBYrr, X86::VPBROADCASTBYrm, 0 },
+ { X86::VPBROADCASTDrr, X86::VPBROADCASTDrm, 0 },
+ { X86::VPBROADCASTDYrr, X86::VPBROADCASTDYrm, 0 },
+ { X86::VPBROADCASTQrr, X86::VPBROADCASTQrm, 0 },
+ { X86::VPBROADCASTQYrr, X86::VPBROADCASTQYrm, 0 },
+ { X86::VPBROADCASTWrr, X86::VPBROADCASTWrm, 0 },
+ { X86::VPBROADCASTWYrr, X86::VPBROADCASTWYrm, 0 },
+ { X86::VPERMPDYri, X86::VPERMPDYmi, 0 },
+ { X86::VPERMQYri, X86::VPERMQYmi, 0 },
+ { X86::VPMOVSXBDYrr, X86::VPMOVSXBDYrm, 0 },
+ { X86::VPMOVSXBQYrr, X86::VPMOVSXBQYrm, 0 },
+ { X86::VPMOVSXBWYrr, X86::VPMOVSXBWYrm, 0 },
+ { X86::VPMOVSXDQYrr, X86::VPMOVSXDQYrm, 0 },
+ { X86::VPMOVSXWDYrr, X86::VPMOVSXWDYrm, 0 },
+ { X86::VPMOVSXWQYrr, X86::VPMOVSXWQYrm, 0 },
+ { X86::VPMOVZXBDYrr, X86::VPMOVZXBDYrm, 0 },
+ { X86::VPMOVZXBQYrr, X86::VPMOVZXBQYrm, 0 },
+ { X86::VPMOVZXBWYrr, X86::VPMOVZXBWYrm, 0 },
+ { X86::VPMOVZXDQYrr, X86::VPMOVZXDQYrm, 0 },
+ { X86::VPMOVZXWDYrr, X86::VPMOVZXWDYrm, 0 },
+ { X86::VPMOVZXWQYrr, X86::VPMOVZXWQYrm, 0 },
+ { X86::VPSHUFDYri, X86::VPSHUFDYmi, 0 },
+ { X86::VPSHUFHWYri, X86::VPSHUFHWYmi, 0 },
+ { X86::VPSHUFLWYri, X86::VPSHUFLWYmi, 0 },
+
+ // XOP foldable instructions
+ { X86::VFRCZPDrr, X86::VFRCZPDrm, 0 },
+ { X86::VFRCZPDrrY, X86::VFRCZPDrmY, 0 },
+ { X86::VFRCZPSrr, X86::VFRCZPSrm, 0 },
+ { X86::VFRCZPSrrY, X86::VFRCZPSrmY, 0 },
+ { X86::VFRCZSDrr, X86::VFRCZSDrm, 0 },
+ { X86::VFRCZSSrr, X86::VFRCZSSrm, 0 },
+ { X86::VPHADDBDrr, X86::VPHADDBDrm, 0 },
+ { X86::VPHADDBQrr, X86::VPHADDBQrm, 0 },
+ { X86::VPHADDBWrr, X86::VPHADDBWrm, 0 },
+ { X86::VPHADDDQrr, X86::VPHADDDQrm, 0 },
+ { X86::VPHADDWDrr, X86::VPHADDWDrm, 0 },
+ { X86::VPHADDWQrr, X86::VPHADDWQrm, 0 },
+ { X86::VPHADDUBDrr, X86::VPHADDUBDrm, 0 },
+ { X86::VPHADDUBQrr, X86::VPHADDUBQrm, 0 },
+ { X86::VPHADDUBWrr, X86::VPHADDUBWrm, 0 },
+ { X86::VPHADDUDQrr, X86::VPHADDUDQrm, 0 },
+ { X86::VPHADDUWDrr, X86::VPHADDUWDrm, 0 },
+ { X86::VPHADDUWQrr, X86::VPHADDUWQrm, 0 },
+ { X86::VPHSUBBWrr, X86::VPHSUBBWrm, 0 },
+ { X86::VPHSUBDQrr, X86::VPHSUBDQrm, 0 },
+ { X86::VPHSUBWDrr, X86::VPHSUBWDrm, 0 },
+ { X86::VPROTBri, X86::VPROTBmi, 0 },
+ { X86::VPROTBrr, X86::VPROTBmr, 0 },
+ { X86::VPROTDri, X86::VPROTDmi, 0 },
+ { X86::VPROTDrr, X86::VPROTDmr, 0 },
+ { X86::VPROTQri, X86::VPROTQmi, 0 },
+ { X86::VPROTQrr, X86::VPROTQmr, 0 },
+ { X86::VPROTWri, X86::VPROTWmi, 0 },
+ { X86::VPROTWrr, X86::VPROTWmr, 0 },
+ { X86::VPSHABrr, X86::VPSHABmr, 0 },
+ { X86::VPSHADrr, X86::VPSHADmr, 0 },
+ { X86::VPSHAQrr, X86::VPSHAQmr, 0 },
+ { X86::VPSHAWrr, X86::VPSHAWmr, 0 },
+ { X86::VPSHLBrr, X86::VPSHLBmr, 0 },
+ { X86::VPSHLDrr, X86::VPSHLDmr, 0 },
+ { X86::VPSHLQrr, X86::VPSHLQmr, 0 },
+ { X86::VPSHLWrr, X86::VPSHLWmr, 0 },
+
+ // BMI/BMI2/LZCNT/POPCNT/TBM foldable instructions
+ { X86::BEXTR32rr, X86::BEXTR32rm, 0 },
+ { X86::BEXTR64rr, X86::BEXTR64rm, 0 },
+ { X86::BEXTRI32ri, X86::BEXTRI32mi, 0 },
+ { X86::BEXTRI64ri, X86::BEXTRI64mi, 0 },
+ { X86::BLCFILL32rr, X86::BLCFILL32rm, 0 },
+ { X86::BLCFILL64rr, X86::BLCFILL64rm, 0 },
+ { X86::BLCI32rr, X86::BLCI32rm, 0 },
+ { X86::BLCI64rr, X86::BLCI64rm, 0 },
+ { X86::BLCIC32rr, X86::BLCIC32rm, 0 },
+ { X86::BLCIC64rr, X86::BLCIC64rm, 0 },
+ { X86::BLCMSK32rr, X86::BLCMSK32rm, 0 },
+ { X86::BLCMSK64rr, X86::BLCMSK64rm, 0 },
+ { X86::BLCS32rr, X86::BLCS32rm, 0 },
+ { X86::BLCS64rr, X86::BLCS64rm, 0 },
+ { X86::BLSFILL32rr, X86::BLSFILL32rm, 0 },
+ { X86::BLSFILL64rr, X86::BLSFILL64rm, 0 },
+ { X86::BLSI32rr, X86::BLSI32rm, 0 },
+ { X86::BLSI64rr, X86::BLSI64rm, 0 },
+ { X86::BLSIC32rr, X86::BLSIC32rm, 0 },
+ { X86::BLSIC64rr, X86::BLSIC64rm, 0 },
+ { X86::BLSMSK32rr, X86::BLSMSK32rm, 0 },
+ { X86::BLSMSK64rr, X86::BLSMSK64rm, 0 },
+ { X86::BLSR32rr, X86::BLSR32rm, 0 },
+ { X86::BLSR64rr, X86::BLSR64rm, 0 },
+ { X86::BZHI32rr, X86::BZHI32rm, 0 },
+ { X86::BZHI64rr, X86::BZHI64rm, 0 },
+ { X86::LZCNT16rr, X86::LZCNT16rm, 0 },
+ { X86::LZCNT32rr, X86::LZCNT32rm, 0 },
+ { X86::LZCNT64rr, X86::LZCNT64rm, 0 },
+ { X86::POPCNT16rr, X86::POPCNT16rm, 0 },
+ { X86::POPCNT32rr, X86::POPCNT32rm, 0 },
+ { X86::POPCNT64rr, X86::POPCNT64rm, 0 },
+ { X86::RORX32ri, X86::RORX32mi, 0 },
+ { X86::RORX64ri, X86::RORX64mi, 0 },
+ { X86::SARX32rr, X86::SARX32rm, 0 },
+ { X86::SARX64rr, X86::SARX64rm, 0 },
+ { X86::SHRX32rr, X86::SHRX32rm, 0 },
+ { X86::SHRX64rr, X86::SHRX64rm, 0 },
+ { X86::SHLX32rr, X86::SHLX32rm, 0 },
+ { X86::SHLX64rr, X86::SHLX64rm, 0 },
+ { X86::T1MSKC32rr, X86::T1MSKC32rm, 0 },
+ { X86::T1MSKC64rr, X86::T1MSKC64rm, 0 },
+ { X86::TZCNT16rr, X86::TZCNT16rm, 0 },
+ { X86::TZCNT32rr, X86::TZCNT32rm, 0 },
+ { X86::TZCNT64rr, X86::TZCNT64rm, 0 },
+ { X86::TZMSK32rr, X86::TZMSK32rm, 0 },
+ { X86::TZMSK64rr, X86::TZMSK64rm, 0 },
+
+ // AVX-512 foldable instructions
+ { X86::VMOV64toPQIZrr, X86::VMOVQI2PQIZrm, 0 },
+ { X86::VMOVDI2SSZrr, X86::VMOVDI2SSZrm, 0 },
+ { X86::VMOVAPDZrr, X86::VMOVAPDZrm, TB_ALIGN_64 },
+ { X86::VMOVAPSZrr, X86::VMOVAPSZrm, TB_ALIGN_64 },
+ { X86::VMOVDQA32Zrr, X86::VMOVDQA32Zrm, TB_ALIGN_64 },
+ { X86::VMOVDQA64Zrr, X86::VMOVDQA64Zrm, TB_ALIGN_64 },
+ { X86::VMOVDQU8Zrr, X86::VMOVDQU8Zrm, 0 },
+ { X86::VMOVDQU16Zrr, X86::VMOVDQU16Zrm, 0 },
+ { X86::VMOVDQU32Zrr, X86::VMOVDQU32Zrm, 0 },
+ { X86::VMOVDQU64Zrr, X86::VMOVDQU64Zrm, 0 },
+ { X86::VMOVUPDZrr, X86::VMOVUPDZrm, 0 },
+ { X86::VMOVUPSZrr, X86::VMOVUPSZrm, 0 },
+ { X86::VPABSDZrr, X86::VPABSDZrm, 0 },
+ { X86::VPABSQZrr, X86::VPABSQZrm, 0 },
+ { X86::VBROADCASTSSZr, X86::VBROADCASTSSZm, TB_NO_REVERSE },
+ { X86::VBROADCASTSDZr, X86::VBROADCASTSDZm, TB_NO_REVERSE },
+
+ // AVX-512 foldable instructions (256-bit versions)
+ { X86::VMOVAPDZ256rr, X86::VMOVAPDZ256rm, TB_ALIGN_32 },
+ { X86::VMOVAPSZ256rr, X86::VMOVAPSZ256rm, TB_ALIGN_32 },
+ { X86::VMOVDQA32Z256rr, X86::VMOVDQA32Z256rm, TB_ALIGN_32 },
+ { X86::VMOVDQA64Z256rr, X86::VMOVDQA64Z256rm, TB_ALIGN_32 },
+ { X86::VMOVDQU8Z256rr, X86::VMOVDQU8Z256rm, 0 },
+ { X86::VMOVDQU16Z256rr, X86::VMOVDQU16Z256rm, 0 },
+ { X86::VMOVDQU32Z256rr, X86::VMOVDQU32Z256rm, 0 },
+ { X86::VMOVDQU64Z256rr, X86::VMOVDQU64Z256rm, 0 },
+ { X86::VMOVUPDZ256rr, X86::VMOVUPDZ256rm, 0 },
+ { X86::VMOVUPSZ256rr, X86::VMOVUPSZ256rm, 0 },
+ { X86::VBROADCASTSSZ256r, X86::VBROADCASTSSZ256m, TB_NO_REVERSE },
+ { X86::VBROADCASTSDZ256r, X86::VBROADCASTSDZ256m, TB_NO_REVERSE },
+
+  // AVX-512 foldable instructions (128-bit versions)
+ { X86::VMOVAPDZ128rr, X86::VMOVAPDZ128rm, TB_ALIGN_16 },
+ { X86::VMOVAPSZ128rr, X86::VMOVAPSZ128rm, TB_ALIGN_16 },
+ { X86::VMOVDQA32Z128rr, X86::VMOVDQA32Z128rm, TB_ALIGN_16 },
+ { X86::VMOVDQA64Z128rr, X86::VMOVDQA64Z128rm, TB_ALIGN_16 },
+ { X86::VMOVDQU8Z128rr, X86::VMOVDQU8Z128rm, 0 },
+ { X86::VMOVDQU16Z128rr, X86::VMOVDQU16Z128rm, 0 },
+ { X86::VMOVDQU32Z128rr, X86::VMOVDQU32Z128rm, 0 },
+ { X86::VMOVDQU64Z128rr, X86::VMOVDQU64Z128rm, 0 },
+ { X86::VMOVUPDZ128rr, X86::VMOVUPDZ128rm, 0 },
+ { X86::VMOVUPSZ128rr, X86::VMOVUPSZ128rm, 0 },
+ { X86::VBROADCASTSSZ128r, X86::VBROADCASTSSZ128m, TB_NO_REVERSE },
+
+ // F16C foldable instructions
+ { X86::VCVTPH2PSrr, X86::VCVTPH2PSrm, 0 },
+ { X86::VCVTPH2PSYrr, X86::VCVTPH2PSYrm, 0 },
+
+ // AES foldable instructions
+ { X86::AESIMCrr, X86::AESIMCrm, TB_ALIGN_16 },
+ { X86::AESKEYGENASSIST128rr, X86::AESKEYGENASSIST128rm, TB_ALIGN_16 },
+ { X86::VAESIMCrr, X86::VAESIMCrm, 0 },
+ { X86::VAESKEYGENASSIST128rr, X86::VAESKEYGENASSIST128rm, 0 }
+ };
+
+ for (X86MemoryFoldTableEntry Entry : MemoryFoldTable1) {
+ AddTableEntry(RegOp2MemOpTable1, MemOp2RegOpTable,
+ Entry.RegOp, Entry.MemOp,
+ // Index 1, folded load
+ Entry.Flags | TB_INDEX_1 | TB_FOLDED_LOAD);
+ }
+
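+  // Table 2: binary operations; the second source operand (index 2) is
+  // replaced by a load.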
+ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = {
+ { X86::ADC32rr, X86::ADC32rm, 0 },
+ { X86::ADC64rr, X86::ADC64rm, 0 },
+ { X86::ADD16rr, X86::ADD16rm, 0 },
+ { X86::ADD16rr_DB, X86::ADD16rm, TB_NO_REVERSE },
+ { X86::ADD32rr, X86::ADD32rm, 0 },
+ { X86::ADD32rr_DB, X86::ADD32rm, TB_NO_REVERSE },
+ { X86::ADD64rr, X86::ADD64rm, 0 },
+ { X86::ADD64rr_DB, X86::ADD64rm, TB_NO_REVERSE },
+ { X86::ADD8rr, X86::ADD8rm, 0 },
+ { X86::ADDPDrr, X86::ADDPDrm, TB_ALIGN_16 },
+ { X86::ADDPSrr, X86::ADDPSrm, TB_ALIGN_16 },
+ { X86::ADDSDrr, X86::ADDSDrm, 0 },
+ { X86::ADDSDrr_Int, X86::ADDSDrm_Int, 0 },
+ { X86::ADDSSrr, X86::ADDSSrm, 0 },
+ { X86::ADDSSrr_Int, X86::ADDSSrm_Int, 0 },
+ { X86::ADDSUBPDrr, X86::ADDSUBPDrm, TB_ALIGN_16 },
+ { X86::ADDSUBPSrr, X86::ADDSUBPSrm, TB_ALIGN_16 },
+ { X86::AND16rr, X86::AND16rm, 0 },
+ { X86::AND32rr, X86::AND32rm, 0 },
+ { X86::AND64rr, X86::AND64rm, 0 },
+ { X86::AND8rr, X86::AND8rm, 0 },
+ { X86::ANDNPDrr, X86::ANDNPDrm, TB_ALIGN_16 },
+ { X86::ANDNPSrr, X86::ANDNPSrm, TB_ALIGN_16 },
+ { X86::ANDPDrr, X86::ANDPDrm, TB_ALIGN_16 },
+ { X86::ANDPSrr, X86::ANDPSrm, TB_ALIGN_16 },
+ { X86::BLENDPDrri, X86::BLENDPDrmi, TB_ALIGN_16 },
+ { X86::BLENDPSrri, X86::BLENDPSrmi, TB_ALIGN_16 },
+ { X86::BLENDVPDrr0, X86::BLENDVPDrm0, TB_ALIGN_16 },
+ { X86::BLENDVPSrr0, X86::BLENDVPSrm0, TB_ALIGN_16 },
+ { X86::CMOVA16rr, X86::CMOVA16rm, 0 },
+ { X86::CMOVA32rr, X86::CMOVA32rm, 0 },
+ { X86::CMOVA64rr, X86::CMOVA64rm, 0 },
+ { X86::CMOVAE16rr, X86::CMOVAE16rm, 0 },
+ { X86::CMOVAE32rr, X86::CMOVAE32rm, 0 },
+ { X86::CMOVAE64rr, X86::CMOVAE64rm, 0 },
+ { X86::CMOVB16rr, X86::CMOVB16rm, 0 },
+ { X86::CMOVB32rr, X86::CMOVB32rm, 0 },
+ { X86::CMOVB64rr, X86::CMOVB64rm, 0 },
+ { X86::CMOVBE16rr, X86::CMOVBE16rm, 0 },
+ { X86::CMOVBE32rr, X86::CMOVBE32rm, 0 },
+ { X86::CMOVBE64rr, X86::CMOVBE64rm, 0 },
+ { X86::CMOVE16rr, X86::CMOVE16rm, 0 },
+ { X86::CMOVE32rr, X86::CMOVE32rm, 0 },
+ { X86::CMOVE64rr, X86::CMOVE64rm, 0 },
+ { X86::CMOVG16rr, X86::CMOVG16rm, 0 },
+ { X86::CMOVG32rr, X86::CMOVG32rm, 0 },
+ { X86::CMOVG64rr, X86::CMOVG64rm, 0 },
+ { X86::CMOVGE16rr, X86::CMOVGE16rm, 0 },
+ { X86::CMOVGE32rr, X86::CMOVGE32rm, 0 },
+ { X86::CMOVGE64rr, X86::CMOVGE64rm, 0 },
+ { X86::CMOVL16rr, X86::CMOVL16rm, 0 },
+ { X86::CMOVL32rr, X86::CMOVL32rm, 0 },
+ { X86::CMOVL64rr, X86::CMOVL64rm, 0 },
+ { X86::CMOVLE16rr, X86::CMOVLE16rm, 0 },
+ { X86::CMOVLE32rr, X86::CMOVLE32rm, 0 },
+ { X86::CMOVLE64rr, X86::CMOVLE64rm, 0 },
+ { X86::CMOVNE16rr, X86::CMOVNE16rm, 0 },
+ { X86::CMOVNE32rr, X86::CMOVNE32rm, 0 },
+ { X86::CMOVNE64rr, X86::CMOVNE64rm, 0 },
+ { X86::CMOVNO16rr, X86::CMOVNO16rm, 0 },
+ { X86::CMOVNO32rr, X86::CMOVNO32rm, 0 },
+ { X86::CMOVNO64rr, X86::CMOVNO64rm, 0 },
+ { X86::CMOVNP16rr, X86::CMOVNP16rm, 0 },
+ { X86::CMOVNP32rr, X86::CMOVNP32rm, 0 },
+ { X86::CMOVNP64rr, X86::CMOVNP64rm, 0 },
+ { X86::CMOVNS16rr, X86::CMOVNS16rm, 0 },
+ { X86::CMOVNS32rr, X86::CMOVNS32rm, 0 },
+ { X86::CMOVNS64rr, X86::CMOVNS64rm, 0 },
+ { X86::CMOVO16rr, X86::CMOVO16rm, 0 },
+ { X86::CMOVO32rr, X86::CMOVO32rm, 0 },
+ { X86::CMOVO64rr, X86::CMOVO64rm, 0 },
+ { X86::CMOVP16rr, X86::CMOVP16rm, 0 },
+ { X86::CMOVP32rr, X86::CMOVP32rm, 0 },
+ { X86::CMOVP64rr, X86::CMOVP64rm, 0 },
+ { X86::CMOVS16rr, X86::CMOVS16rm, 0 },
+ { X86::CMOVS32rr, X86::CMOVS32rm, 0 },
+ { X86::CMOVS64rr, X86::CMOVS64rm, 0 },
+ { X86::CMPPDrri, X86::CMPPDrmi, TB_ALIGN_16 },
+ { X86::CMPPSrri, X86::CMPPSrmi, TB_ALIGN_16 },
+ { X86::CMPSDrr, X86::CMPSDrm, 0 },
+ { X86::CMPSSrr, X86::CMPSSrm, 0 },
+ { X86::CRC32r32r32, X86::CRC32r32m32, 0 },
+ { X86::CRC32r64r64, X86::CRC32r64m64, 0 },
+ { X86::DIVPDrr, X86::DIVPDrm, TB_ALIGN_16 },
+ { X86::DIVPSrr, X86::DIVPSrm, TB_ALIGN_16 },
+ { X86::DIVSDrr, X86::DIVSDrm, 0 },
+ { X86::DIVSDrr_Int, X86::DIVSDrm_Int, 0 },
+ { X86::DIVSSrr, X86::DIVSSrm, 0 },
+ { X86::DIVSSrr_Int, X86::DIVSSrm_Int, 0 },
+ { X86::DPPDrri, X86::DPPDrmi, TB_ALIGN_16 },
+ { X86::DPPSrri, X86::DPPSrmi, TB_ALIGN_16 },
+
+    // Do not fold Fs* scalar logical op loads because there are no scalar
+    // load variants for these instructions. When folded, the load would be
+    // required to be 128 bits wide, so the load size would not match.
+
+ { X86::FvANDNPDrr, X86::FvANDNPDrm, TB_ALIGN_16 },
+ { X86::FvANDNPSrr, X86::FvANDNPSrm, TB_ALIGN_16 },
+ { X86::FvANDPDrr, X86::FvANDPDrm, TB_ALIGN_16 },
+ { X86::FvANDPSrr, X86::FvANDPSrm, TB_ALIGN_16 },
+ { X86::FvORPDrr, X86::FvORPDrm, TB_ALIGN_16 },
+ { X86::FvORPSrr, X86::FvORPSrm, TB_ALIGN_16 },
+ { X86::FvXORPDrr, X86::FvXORPDrm, TB_ALIGN_16 },
+ { X86::FvXORPSrr, X86::FvXORPSrm, TB_ALIGN_16 },
+ { X86::HADDPDrr, X86::HADDPDrm, TB_ALIGN_16 },
+ { X86::HADDPSrr, X86::HADDPSrm, TB_ALIGN_16 },
+ { X86::HSUBPDrr, X86::HSUBPDrm, TB_ALIGN_16 },
+ { X86::HSUBPSrr, X86::HSUBPSrm, TB_ALIGN_16 },
+ { X86::IMUL16rr, X86::IMUL16rm, 0 },
+ { X86::IMUL32rr, X86::IMUL32rm, 0 },
+ { X86::IMUL64rr, X86::IMUL64rm, 0 },
+ { X86::Int_CMPSDrr, X86::Int_CMPSDrm, 0 },
+ { X86::Int_CMPSSrr, X86::Int_CMPSSrm, 0 },
+ { X86::Int_CVTSD2SSrr, X86::Int_CVTSD2SSrm, 0 },
+ { X86::Int_CVTSI2SD64rr,X86::Int_CVTSI2SD64rm, 0 },
+ { X86::Int_CVTSI2SDrr, X86::Int_CVTSI2SDrm, 0 },
+ { X86::Int_CVTSI2SS64rr,X86::Int_CVTSI2SS64rm, 0 },
+ { X86::Int_CVTSI2SSrr, X86::Int_CVTSI2SSrm, 0 },
+ { X86::Int_CVTSS2SDrr, X86::Int_CVTSS2SDrm, 0 },
+ { X86::MAXPDrr, X86::MAXPDrm, TB_ALIGN_16 },
+ { X86::MAXPSrr, X86::MAXPSrm, TB_ALIGN_16 },
+ { X86::MAXSDrr, X86::MAXSDrm, 0 },
+ { X86::MAXSDrr_Int, X86::MAXSDrm_Int, 0 },
+ { X86::MAXSSrr, X86::MAXSSrm, 0 },
+ { X86::MAXSSrr_Int, X86::MAXSSrm_Int, 0 },
+ { X86::MINPDrr, X86::MINPDrm, TB_ALIGN_16 },
+ { X86::MINPSrr, X86::MINPSrm, TB_ALIGN_16 },
+ { X86::MINSDrr, X86::MINSDrm, 0 },
+ { X86::MINSDrr_Int, X86::MINSDrm_Int, 0 },
+ { X86::MINSSrr, X86::MINSSrm, 0 },
+ { X86::MINSSrr_Int, X86::MINSSrm_Int, 0 },
+ { X86::MPSADBWrri, X86::MPSADBWrmi, TB_ALIGN_16 },
+ { X86::MULPDrr, X86::MULPDrm, TB_ALIGN_16 },
+ { X86::MULPSrr, X86::MULPSrm, TB_ALIGN_16 },
+ { X86::MULSDrr, X86::MULSDrm, 0 },
+ { X86::MULSDrr_Int, X86::MULSDrm_Int, 0 },
+ { X86::MULSSrr, X86::MULSSrm, 0 },
+ { X86::MULSSrr_Int, X86::MULSSrm_Int, 0 },
+ { X86::OR16rr, X86::OR16rm, 0 },
+ { X86::OR32rr, X86::OR32rm, 0 },
+ { X86::OR64rr, X86::OR64rm, 0 },
+ { X86::OR8rr, X86::OR8rm, 0 },
+ { X86::ORPDrr, X86::ORPDrm, TB_ALIGN_16 },
+ { X86::ORPSrr, X86::ORPSrm, TB_ALIGN_16 },
+ { X86::PACKSSDWrr, X86::PACKSSDWrm, TB_ALIGN_16 },
+ { X86::PACKSSWBrr, X86::PACKSSWBrm, TB_ALIGN_16 },
+ { X86::PACKUSDWrr, X86::PACKUSDWrm, TB_ALIGN_16 },
+ { X86::PACKUSWBrr, X86::PACKUSWBrm, TB_ALIGN_16 },
+ { X86::PADDBrr, X86::PADDBrm, TB_ALIGN_16 },
+ { X86::PADDDrr, X86::PADDDrm, TB_ALIGN_16 },
+ { X86::PADDQrr, X86::PADDQrm, TB_ALIGN_16 },
+ { X86::PADDSBrr, X86::PADDSBrm, TB_ALIGN_16 },
+ { X86::PADDSWrr, X86::PADDSWrm, TB_ALIGN_16 },
+ { X86::PADDUSBrr, X86::PADDUSBrm, TB_ALIGN_16 },
+ { X86::PADDUSWrr, X86::PADDUSWrm, TB_ALIGN_16 },
+ { X86::PADDWrr, X86::PADDWrm, TB_ALIGN_16 },
+ { X86::PALIGNR128rr, X86::PALIGNR128rm, TB_ALIGN_16 },
+ { X86::PANDNrr, X86::PANDNrm, TB_ALIGN_16 },
+ { X86::PANDrr, X86::PANDrm, TB_ALIGN_16 },
+ { X86::PAVGBrr, X86::PAVGBrm, TB_ALIGN_16 },
+ { X86::PAVGWrr, X86::PAVGWrm, TB_ALIGN_16 },
+ { X86::PBLENDVBrr0, X86::PBLENDVBrm0, TB_ALIGN_16 },
+ { X86::PBLENDWrri, X86::PBLENDWrmi, TB_ALIGN_16 },
+ { X86::PCLMULQDQrr, X86::PCLMULQDQrm, TB_ALIGN_16 },
+ { X86::PCMPEQBrr, X86::PCMPEQBrm, TB_ALIGN_16 },
+ { X86::PCMPEQDrr, X86::PCMPEQDrm, TB_ALIGN_16 },
+ { X86::PCMPEQQrr, X86::PCMPEQQrm, TB_ALIGN_16 },
+ { X86::PCMPEQWrr, X86::PCMPEQWrm, TB_ALIGN_16 },
+ { X86::PCMPGTBrr, X86::PCMPGTBrm, TB_ALIGN_16 },
+ { X86::PCMPGTDrr, X86::PCMPGTDrm, TB_ALIGN_16 },
+ { X86::PCMPGTQrr, X86::PCMPGTQrm, TB_ALIGN_16 },
+ { X86::PCMPGTWrr, X86::PCMPGTWrm, TB_ALIGN_16 },
+ { X86::PHADDDrr, X86::PHADDDrm, TB_ALIGN_16 },
+ { X86::PHADDWrr, X86::PHADDWrm, TB_ALIGN_16 },
+ { X86::PHADDSWrr128, X86::PHADDSWrm128, TB_ALIGN_16 },
+ { X86::PHSUBDrr, X86::PHSUBDrm, TB_ALIGN_16 },
+ { X86::PHSUBSWrr128, X86::PHSUBSWrm128, TB_ALIGN_16 },
+ { X86::PHSUBWrr, X86::PHSUBWrm, TB_ALIGN_16 },
+ { X86::PINSRBrr, X86::PINSRBrm, 0 },
+ { X86::PINSRDrr, X86::PINSRDrm, 0 },
+ { X86::PINSRQrr, X86::PINSRQrm, 0 },
+ { X86::PINSRWrri, X86::PINSRWrmi, 0 },
+ { X86::PMADDUBSWrr128, X86::PMADDUBSWrm128, TB_ALIGN_16 },
+ { X86::PMADDWDrr, X86::PMADDWDrm, TB_ALIGN_16 },
+ { X86::PMAXSWrr, X86::PMAXSWrm, TB_ALIGN_16 },
+ { X86::PMAXUBrr, X86::PMAXUBrm, TB_ALIGN_16 },
+ { X86::PMINSWrr, X86::PMINSWrm, TB_ALIGN_16 },
+ { X86::PMINUBrr, X86::PMINUBrm, TB_ALIGN_16 },
+ { X86::PMINSBrr, X86::PMINSBrm, TB_ALIGN_16 },
+ { X86::PMINSDrr, X86::PMINSDrm, TB_ALIGN_16 },
+ { X86::PMINUDrr, X86::PMINUDrm, TB_ALIGN_16 },
+ { X86::PMINUWrr, X86::PMINUWrm, TB_ALIGN_16 },
+ { X86::PMAXSBrr, X86::PMAXSBrm, TB_ALIGN_16 },
+ { X86::PMAXSDrr, X86::PMAXSDrm, TB_ALIGN_16 },
+ { X86::PMAXUDrr, X86::PMAXUDrm, TB_ALIGN_16 },
+ { X86::PMAXUWrr, X86::PMAXUWrm, TB_ALIGN_16 },
+ { X86::PMULDQrr, X86::PMULDQrm, TB_ALIGN_16 },
+ { X86::PMULHRSWrr128, X86::PMULHRSWrm128, TB_ALIGN_16 },
+ { X86::PMULHUWrr, X86::PMULHUWrm, TB_ALIGN_16 },
+ { X86::PMULHWrr, X86::PMULHWrm, TB_ALIGN_16 },
+ { X86::PMULLDrr, X86::PMULLDrm, TB_ALIGN_16 },
+ { X86::PMULLWrr, X86::PMULLWrm, TB_ALIGN_16 },
+ { X86::PMULUDQrr, X86::PMULUDQrm, TB_ALIGN_16 },
+ { X86::PORrr, X86::PORrm, TB_ALIGN_16 },
+ { X86::PSADBWrr, X86::PSADBWrm, TB_ALIGN_16 },
+ { X86::PSHUFBrr, X86::PSHUFBrm, TB_ALIGN_16 },
+ { X86::PSIGNBrr, X86::PSIGNBrm, TB_ALIGN_16 },
+ { X86::PSIGNWrr, X86::PSIGNWrm, TB_ALIGN_16 },
+ { X86::PSIGNDrr, X86::PSIGNDrm, TB_ALIGN_16 },
+ { X86::PSLLDrr, X86::PSLLDrm, TB_ALIGN_16 },
+ { X86::PSLLQrr, X86::PSLLQrm, TB_ALIGN_16 },
+ { X86::PSLLWrr, X86::PSLLWrm, TB_ALIGN_16 },
+ { X86::PSRADrr, X86::PSRADrm, TB_ALIGN_16 },
+ { X86::PSRAWrr, X86::PSRAWrm, TB_ALIGN_16 },
+ { X86::PSRLDrr, X86::PSRLDrm, TB_ALIGN_16 },
+ { X86::PSRLQrr, X86::PSRLQrm, TB_ALIGN_16 },
+ { X86::PSRLWrr, X86::PSRLWrm, TB_ALIGN_16 },
+ { X86::PSUBBrr, X86::PSUBBrm, TB_ALIGN_16 },
+ { X86::PSUBDrr, X86::PSUBDrm, TB_ALIGN_16 },
+ { X86::PSUBQrr, X86::PSUBQrm, TB_ALIGN_16 },
+ { X86::PSUBSBrr, X86::PSUBSBrm, TB_ALIGN_16 },
+ { X86::PSUBSWrr, X86::PSUBSWrm, TB_ALIGN_16 },
+ { X86::PSUBUSBrr, X86::PSUBUSBrm, TB_ALIGN_16 },
+ { X86::PSUBUSWrr, X86::PSUBUSWrm, TB_ALIGN_16 },
+ { X86::PSUBWrr, X86::PSUBWrm, TB_ALIGN_16 },
+ { X86::PUNPCKHBWrr, X86::PUNPCKHBWrm, TB_ALIGN_16 },
+ { X86::PUNPCKHDQrr, X86::PUNPCKHDQrm, TB_ALIGN_16 },
+ { X86::PUNPCKHQDQrr, X86::PUNPCKHQDQrm, TB_ALIGN_16 },
+ { X86::PUNPCKHWDrr, X86::PUNPCKHWDrm, TB_ALIGN_16 },
+ { X86::PUNPCKLBWrr, X86::PUNPCKLBWrm, TB_ALIGN_16 },
+ { X86::PUNPCKLDQrr, X86::PUNPCKLDQrm, TB_ALIGN_16 },
+ { X86::PUNPCKLQDQrr, X86::PUNPCKLQDQrm, TB_ALIGN_16 },
+ { X86::PUNPCKLWDrr, X86::PUNPCKLWDrm, TB_ALIGN_16 },
+ { X86::PXORrr, X86::PXORrm, TB_ALIGN_16 },
+ { X86::ROUNDSDr, X86::ROUNDSDm, 0 },
+ { X86::ROUNDSSr, X86::ROUNDSSm, 0 },
+ { X86::SBB32rr, X86::SBB32rm, 0 },
+ { X86::SBB64rr, X86::SBB64rm, 0 },
+ { X86::SHUFPDrri, X86::SHUFPDrmi, TB_ALIGN_16 },
+ { X86::SHUFPSrri, X86::SHUFPSrmi, TB_ALIGN_16 },
+ { X86::SUB16rr, X86::SUB16rm, 0 },
+ { X86::SUB32rr, X86::SUB32rm, 0 },
+ { X86::SUB64rr, X86::SUB64rm, 0 },
+ { X86::SUB8rr, X86::SUB8rm, 0 },
+ { X86::SUBPDrr, X86::SUBPDrm, TB_ALIGN_16 },
+ { X86::SUBPSrr, X86::SUBPSrm, TB_ALIGN_16 },
+ { X86::SUBSDrr, X86::SUBSDrm, 0 },
+ { X86::SUBSDrr_Int, X86::SUBSDrm_Int, 0 },
+ { X86::SUBSSrr, X86::SUBSSrm, 0 },
+ { X86::SUBSSrr_Int, X86::SUBSSrm_Int, 0 },
+ // FIXME: TEST*rr -> swapped operand of TEST*mr.
+ { X86::UNPCKHPDrr, X86::UNPCKHPDrm, TB_ALIGN_16 },
+ { X86::UNPCKHPSrr, X86::UNPCKHPSrm, TB_ALIGN_16 },
+ { X86::UNPCKLPDrr, X86::UNPCKLPDrm, TB_ALIGN_16 },
+ { X86::UNPCKLPSrr, X86::UNPCKLPSrm, TB_ALIGN_16 },
+ { X86::XOR16rr, X86::XOR16rm, 0 },
+ { X86::XOR32rr, X86::XOR32rm, 0 },
+ { X86::XOR64rr, X86::XOR64rm, 0 },
+ { X86::XOR8rr, X86::XOR8rm, 0 },
+ { X86::XORPDrr, X86::XORPDrm, TB_ALIGN_16 },
+ { X86::XORPSrr, X86::XORPSrm, TB_ALIGN_16 },
+
+ // MMX version of foldable instructions
+ { X86::MMX_CVTPI2PSirr, X86::MMX_CVTPI2PSirm, 0 },
+ { X86::MMX_PACKSSDWirr, X86::MMX_PACKSSDWirm, 0 },
+ { X86::MMX_PACKSSWBirr, X86::MMX_PACKSSWBirm, 0 },
+ { X86::MMX_PACKUSWBirr, X86::MMX_PACKUSWBirm, 0 },
+ { X86::MMX_PADDBirr, X86::MMX_PADDBirm, 0 },
+ { X86::MMX_PADDDirr, X86::MMX_PADDDirm, 0 },
+ { X86::MMX_PADDQirr, X86::MMX_PADDQirm, 0 },
+ { X86::MMX_PADDSBirr, X86::MMX_PADDSBirm, 0 },
+ { X86::MMX_PADDSWirr, X86::MMX_PADDSWirm, 0 },
+ { X86::MMX_PADDUSBirr, X86::MMX_PADDUSBirm, 0 },
+ { X86::MMX_PADDUSWirr, X86::MMX_PADDUSWirm, 0 },
+ { X86::MMX_PADDWirr, X86::MMX_PADDWirm, 0 },
+ { X86::MMX_PALIGNR64irr, X86::MMX_PALIGNR64irm, 0 },
+ { X86::MMX_PANDNirr, X86::MMX_PANDNirm, 0 },
+ { X86::MMX_PANDirr, X86::MMX_PANDirm, 0 },
+ { X86::MMX_PAVGBirr, X86::MMX_PAVGBirm, 0 },
+ { X86::MMX_PAVGWirr, X86::MMX_PAVGWirm, 0 },
+ { X86::MMX_PCMPEQBirr, X86::MMX_PCMPEQBirm, 0 },
+ { X86::MMX_PCMPEQDirr, X86::MMX_PCMPEQDirm, 0 },
+ { X86::MMX_PCMPEQWirr, X86::MMX_PCMPEQWirm, 0 },
+ { X86::MMX_PCMPGTBirr, X86::MMX_PCMPGTBirm, 0 },
+ { X86::MMX_PCMPGTDirr, X86::MMX_PCMPGTDirm, 0 },
+ { X86::MMX_PCMPGTWirr, X86::MMX_PCMPGTWirm, 0 },
+ { X86::MMX_PHADDSWrr64, X86::MMX_PHADDSWrm64, 0 },
+ { X86::MMX_PHADDWrr64, X86::MMX_PHADDWrm64, 0 },
+ { X86::MMX_PHADDrr64, X86::MMX_PHADDrm64, 0 },
+ { X86::MMX_PHSUBDrr64, X86::MMX_PHSUBDrm64, 0 },
+ { X86::MMX_PHSUBSWrr64, X86::MMX_PHSUBSWrm64, 0 },
+ { X86::MMX_PHSUBWrr64, X86::MMX_PHSUBWrm64, 0 },
+ { X86::MMX_PINSRWirri, X86::MMX_PINSRWirmi, 0 },
+ { X86::MMX_PMADDUBSWrr64, X86::MMX_PMADDUBSWrm64, 0 },
+ { X86::MMX_PMADDWDirr, X86::MMX_PMADDWDirm, 0 },
+ { X86::MMX_PMAXSWirr, X86::MMX_PMAXSWirm, 0 },
+ { X86::MMX_PMAXUBirr, X86::MMX_PMAXUBirm, 0 },
+ { X86::MMX_PMINSWirr, X86::MMX_PMINSWirm, 0 },
+ { X86::MMX_PMINUBirr, X86::MMX_PMINUBirm, 0 },
+ { X86::MMX_PMULHRSWrr64, X86::MMX_PMULHRSWrm64, 0 },
+ { X86::MMX_PMULHUWirr, X86::MMX_PMULHUWirm, 0 },
+ { X86::MMX_PMULHWirr, X86::MMX_PMULHWirm, 0 },
+ { X86::MMX_PMULLWirr, X86::MMX_PMULLWirm, 0 },
+ { X86::MMX_PMULUDQirr, X86::MMX_PMULUDQirm, 0 },
+ { X86::MMX_PORirr, X86::MMX_PORirm, 0 },
+ { X86::MMX_PSADBWirr, X86::MMX_PSADBWirm, 0 },
+ { X86::MMX_PSHUFBrr64, X86::MMX_PSHUFBrm64, 0 },
+ { X86::MMX_PSIGNBrr64, X86::MMX_PSIGNBrm64, 0 },
+ { X86::MMX_PSIGNDrr64, X86::MMX_PSIGNDrm64, 0 },
+ { X86::MMX_PSIGNWrr64, X86::MMX_PSIGNWrm64, 0 },
+ { X86::MMX_PSLLDrr, X86::MMX_PSLLDrm, 0 },
+ { X86::MMX_PSLLQrr, X86::MMX_PSLLQrm, 0 },
+ { X86::MMX_PSLLWrr, X86::MMX_PSLLWrm, 0 },
+ { X86::MMX_PSRADrr, X86::MMX_PSRADrm, 0 },
+ { X86::MMX_PSRAWrr, X86::MMX_PSRAWrm, 0 },
+ { X86::MMX_PSRLDrr, X86::MMX_PSRLDrm, 0 },
+ { X86::MMX_PSRLQrr, X86::MMX_PSRLQrm, 0 },
+ { X86::MMX_PSRLWrr, X86::MMX_PSRLWrm, 0 },
+ { X86::MMX_PSUBBirr, X86::MMX_PSUBBirm, 0 },
+ { X86::MMX_PSUBDirr, X86::MMX_PSUBDirm, 0 },
+ { X86::MMX_PSUBQirr, X86::MMX_PSUBQirm, 0 },
+ { X86::MMX_PSUBSBirr, X86::MMX_PSUBSBirm, 0 },
+ { X86::MMX_PSUBSWirr, X86::MMX_PSUBSWirm, 0 },
+ { X86::MMX_PSUBUSBirr, X86::MMX_PSUBUSBirm, 0 },
+ { X86::MMX_PSUBUSWirr, X86::MMX_PSUBUSWirm, 0 },
+ { X86::MMX_PSUBWirr, X86::MMX_PSUBWirm, 0 },
+ { X86::MMX_PUNPCKHBWirr, X86::MMX_PUNPCKHBWirm, 0 },
+ { X86::MMX_PUNPCKHDQirr, X86::MMX_PUNPCKHDQirm, 0 },
+ { X86::MMX_PUNPCKHWDirr, X86::MMX_PUNPCKHWDirm, 0 },
+ { X86::MMX_PUNPCKLBWirr, X86::MMX_PUNPCKLBWirm, 0 },
+ { X86::MMX_PUNPCKLDQirr, X86::MMX_PUNPCKLDQirm, 0 },
+ { X86::MMX_PUNPCKLWDirr, X86::MMX_PUNPCKLWDirm, 0 },
+ { X86::MMX_PXORirr, X86::MMX_PXORirm, 0 },
+
+ // 3DNow! versions of foldable instructions
+ { X86::PAVGUSBrr, X86::PAVGUSBrm, 0 },
+ { X86::PFACCrr, X86::PFACCrm, 0 },
+ { X86::PFADDrr, X86::PFADDrm, 0 },
+ { X86::PFCMPEQrr, X86::PFCMPEQrm, 0 },
+ { X86::PFCMPGErr, X86::PFCMPGErm, 0 },
+ { X86::PFCMPGTrr, X86::PFCMPGTrm, 0 },
+ { X86::PFMAXrr, X86::PFMAXrm, 0 },
+ { X86::PFMINrr, X86::PFMINrm, 0 },
+ { X86::PFMULrr, X86::PFMULrm, 0 },
+ { X86::PFNACCrr, X86::PFNACCrm, 0 },
+ { X86::PFPNACCrr, X86::PFPNACCrm, 0 },
+ { X86::PFRCPIT1rr, X86::PFRCPIT1rm, 0 },
+ { X86::PFRCPIT2rr, X86::PFRCPIT2rm, 0 },
+ { X86::PFRSQIT1rr, X86::PFRSQIT1rm, 0 },
+ { X86::PFSUBrr, X86::PFSUBrm, 0 },
+ { X86::PFSUBRrr, X86::PFSUBRrm, 0 },
+ { X86::PMULHRWrr, X86::PMULHRWrm, 0 },
+
+ // AVX 128-bit versions of foldable instructions
+ { X86::VCVTSD2SSrr, X86::VCVTSD2SSrm, 0 },
+ { X86::Int_VCVTSD2SSrr, X86::Int_VCVTSD2SSrm, 0 },
+ { X86::VCVTSI2SD64rr, X86::VCVTSI2SD64rm, 0 },
+ { X86::Int_VCVTSI2SD64rr, X86::Int_VCVTSI2SD64rm, 0 },
+ { X86::VCVTSI2SDrr, X86::VCVTSI2SDrm, 0 },
+ { X86::Int_VCVTSI2SDrr, X86::Int_VCVTSI2SDrm, 0 },
+ { X86::VCVTSI2SS64rr, X86::VCVTSI2SS64rm, 0 },
+ { X86::Int_VCVTSI2SS64rr, X86::Int_VCVTSI2SS64rm, 0 },
+ { X86::VCVTSI2SSrr, X86::VCVTSI2SSrm, 0 },
+ { X86::Int_VCVTSI2SSrr, X86::Int_VCVTSI2SSrm, 0 },
+ { X86::VCVTSS2SDrr, X86::VCVTSS2SDrm, 0 },
+ { X86::Int_VCVTSS2SDrr, X86::Int_VCVTSS2SDrm, 0 },
+ { X86::VRCPSSr, X86::VRCPSSm, 0 },
+ { X86::VRCPSSr_Int, X86::VRCPSSm_Int, 0 },
+ { X86::VRSQRTSSr, X86::VRSQRTSSm, 0 },
+ { X86::VRSQRTSSr_Int, X86::VRSQRTSSm_Int, 0 },
+ { X86::VSQRTSDr, X86::VSQRTSDm, 0 },
+ { X86::VSQRTSDr_Int, X86::VSQRTSDm_Int, 0 },
+ { X86::VSQRTSSr, X86::VSQRTSSm, 0 },
+ { X86::VSQRTSSr_Int, X86::VSQRTSSm_Int, 0 },
+ { X86::VADDPDrr, X86::VADDPDrm, 0 },
+ { X86::VADDPSrr, X86::VADDPSrm, 0 },
+ { X86::VADDSDrr, X86::VADDSDrm, 0 },
+ { X86::VADDSDrr_Int, X86::VADDSDrm_Int, 0 },
+ { X86::VADDSSrr, X86::VADDSSrm, 0 },
+ { X86::VADDSSrr_Int, X86::VADDSSrm_Int, 0 },
+ { X86::VADDSUBPDrr, X86::VADDSUBPDrm, 0 },
+ { X86::VADDSUBPSrr, X86::VADDSUBPSrm, 0 },
+ { X86::VANDNPDrr, X86::VANDNPDrm, 0 },
+ { X86::VANDNPSrr, X86::VANDNPSrm, 0 },
+ { X86::VANDPDrr, X86::VANDPDrm, 0 },
+ { X86::VANDPSrr, X86::VANDPSrm, 0 },
+ { X86::VBLENDPDrri, X86::VBLENDPDrmi, 0 },
+ { X86::VBLENDPSrri, X86::VBLENDPSrmi, 0 },
+ { X86::VBLENDVPDrr, X86::VBLENDVPDrm, 0 },
+ { X86::VBLENDVPSrr, X86::VBLENDVPSrm, 0 },
+ { X86::VCMPPDrri, X86::VCMPPDrmi, 0 },
+ { X86::VCMPPSrri, X86::VCMPPSrmi, 0 },
+ { X86::VCMPSDrr, X86::VCMPSDrm, 0 },
+ { X86::VCMPSSrr, X86::VCMPSSrm, 0 },
+ { X86::VDIVPDrr, X86::VDIVPDrm, 0 },
+ { X86::VDIVPSrr, X86::VDIVPSrm, 0 },
+ { X86::VDIVSDrr, X86::VDIVSDrm, 0 },
+ { X86::VDIVSDrr_Int, X86::VDIVSDrm_Int, 0 },
+ { X86::VDIVSSrr, X86::VDIVSSrm, 0 },
+ { X86::VDIVSSrr_Int, X86::VDIVSSrm_Int, 0 },
+ { X86::VDPPDrri, X86::VDPPDrmi, 0 },
+ { X86::VDPPSrri, X86::VDPPSrmi, 0 },
+ // Do not fold VFs* loads because there are no scalar load variants for
+ // these instructions: when folded, the load is required to be 128 bits,
+ // so the load size would not match. (The VFv* forms below are foldable.)
+ { X86::VFvANDNPDrr, X86::VFvANDNPDrm, 0 },
+ { X86::VFvANDNPSrr, X86::VFvANDNPSrm, 0 },
+ { X86::VFvANDPDrr, X86::VFvANDPDrm, 0 },
+ { X86::VFvANDPSrr, X86::VFvANDPSrm, 0 },
+ { X86::VFvORPDrr, X86::VFvORPDrm, 0 },
+ { X86::VFvORPSrr, X86::VFvORPSrm, 0 },
+ { X86::VFvXORPDrr, X86::VFvXORPDrm, 0 },
+ { X86::VFvXORPSrr, X86::VFvXORPSrm, 0 },
+ { X86::VHADDPDrr, X86::VHADDPDrm, 0 },
+ { X86::VHADDPSrr, X86::VHADDPSrm, 0 },
+ { X86::VHSUBPDrr, X86::VHSUBPDrm, 0 },
+ { X86::VHSUBPSrr, X86::VHSUBPSrm, 0 },
+ { X86::Int_VCMPSDrr, X86::Int_VCMPSDrm, 0 },
+ { X86::Int_VCMPSSrr, X86::Int_VCMPSSrm, 0 },
+ { X86::VMAXPDrr, X86::VMAXPDrm, 0 },
+ { X86::VMAXPSrr, X86::VMAXPSrm, 0 },
+ { X86::VMAXSDrr, X86::VMAXSDrm, 0 },
+ { X86::VMAXSDrr_Int, X86::VMAXSDrm_Int, 0 },
+ { X86::VMAXSSrr, X86::VMAXSSrm, 0 },
+ { X86::VMAXSSrr_Int, X86::VMAXSSrm_Int, 0 },
+ { X86::VMINPDrr, X86::VMINPDrm, 0 },
+ { X86::VMINPSrr, X86::VMINPSrm, 0 },
+ { X86::VMINSDrr, X86::VMINSDrm, 0 },
+ { X86::VMINSDrr_Int, X86::VMINSDrm_Int, 0 },
+ { X86::VMINSSrr, X86::VMINSSrm, 0 },
+ { X86::VMINSSrr_Int, X86::VMINSSrm_Int, 0 },
+ { X86::VMPSADBWrri, X86::VMPSADBWrmi, 0 },
+ { X86::VMULPDrr, X86::VMULPDrm, 0 },
+ { X86::VMULPSrr, X86::VMULPSrm, 0 },
+ { X86::VMULSDrr, X86::VMULSDrm, 0 },
+ { X86::VMULSDrr_Int, X86::VMULSDrm_Int, 0 },
+ { X86::VMULSSrr, X86::VMULSSrm, 0 },
+ { X86::VMULSSrr_Int, X86::VMULSSrm_Int, 0 },
+ { X86::VORPDrr, X86::VORPDrm, 0 },
+ { X86::VORPSrr, X86::VORPSrm, 0 },
+ { X86::VPACKSSDWrr, X86::VPACKSSDWrm, 0 },
+ { X86::VPACKSSWBrr, X86::VPACKSSWBrm, 0 },
+ { X86::VPACKUSDWrr, X86::VPACKUSDWrm, 0 },
+ { X86::VPACKUSWBrr, X86::VPACKUSWBrm, 0 },
+ { X86::VPADDBrr, X86::VPADDBrm, 0 },
+ { X86::VPADDDrr, X86::VPADDDrm, 0 },
+ { X86::VPADDQrr, X86::VPADDQrm, 0 },
+ { X86::VPADDSBrr, X86::VPADDSBrm, 0 },
+ { X86::VPADDSWrr, X86::VPADDSWrm, 0 },
+ { X86::VPADDUSBrr, X86::VPADDUSBrm, 0 },
+ { X86::VPADDUSWrr, X86::VPADDUSWrm, 0 },
+ { X86::VPADDWrr, X86::VPADDWrm, 0 },
+ { X86::VPALIGNR128rr, X86::VPALIGNR128rm, 0 },
+ { X86::VPANDNrr, X86::VPANDNrm, 0 },
+ { X86::VPANDrr, X86::VPANDrm, 0 },
+ { X86::VPAVGBrr, X86::VPAVGBrm, 0 },
+ { X86::VPAVGWrr, X86::VPAVGWrm, 0 },
+ { X86::VPBLENDVBrr, X86::VPBLENDVBrm, 0 },
+ { X86::VPBLENDWrri, X86::VPBLENDWrmi, 0 },
+ { X86::VPCLMULQDQrr, X86::VPCLMULQDQrm, 0 },
+ { X86::VPCMPEQBrr, X86::VPCMPEQBrm, 0 },
+ { X86::VPCMPEQDrr, X86::VPCMPEQDrm, 0 },
+ { X86::VPCMPEQQrr, X86::VPCMPEQQrm, 0 },
+ { X86::VPCMPEQWrr, X86::VPCMPEQWrm, 0 },
+ { X86::VPCMPGTBrr, X86::VPCMPGTBrm, 0 },
+ { X86::VPCMPGTDrr, X86::VPCMPGTDrm, 0 },
+ { X86::VPCMPGTQrr, X86::VPCMPGTQrm, 0 },
+ { X86::VPCMPGTWrr, X86::VPCMPGTWrm, 0 },
+ { X86::VPHADDDrr, X86::VPHADDDrm, 0 },
+ { X86::VPHADDSWrr128, X86::VPHADDSWrm128, 0 },
+ { X86::VPHADDWrr, X86::VPHADDWrm, 0 },
+ { X86::VPHSUBDrr, X86::VPHSUBDrm, 0 },
+ { X86::VPHSUBSWrr128, X86::VPHSUBSWrm128, 0 },
+ { X86::VPHSUBWrr, X86::VPHSUBWrm, 0 },
+ { X86::VPERMILPDrr, X86::VPERMILPDrm, 0 },
+ { X86::VPERMILPSrr, X86::VPERMILPSrm, 0 },
+ { X86::VPINSRBrr, X86::VPINSRBrm, 0 },
+ { X86::VPINSRDrr, X86::VPINSRDrm, 0 },
+ { X86::VPINSRQrr, X86::VPINSRQrm, 0 },
+ { X86::VPINSRWrri, X86::VPINSRWrmi, 0 },
+ { X86::VPMADDUBSWrr128, X86::VPMADDUBSWrm128, 0 },
+ { X86::VPMADDWDrr, X86::VPMADDWDrm, 0 },
+ { X86::VPMAXSWrr, X86::VPMAXSWrm, 0 },
+ { X86::VPMAXUBrr, X86::VPMAXUBrm, 0 },
+ { X86::VPMINSWrr, X86::VPMINSWrm, 0 },
+ { X86::VPMINUBrr, X86::VPMINUBrm, 0 },
+ { X86::VPMINSBrr, X86::VPMINSBrm, 0 },
+ { X86::VPMINSDrr, X86::VPMINSDrm, 0 },
+ { X86::VPMINUDrr, X86::VPMINUDrm, 0 },
+ { X86::VPMINUWrr, X86::VPMINUWrm, 0 },
+ { X86::VPMAXSBrr, X86::VPMAXSBrm, 0 },
+ { X86::VPMAXSDrr, X86::VPMAXSDrm, 0 },
+ { X86::VPMAXUDrr, X86::VPMAXUDrm, 0 },
+ { X86::VPMAXUWrr, X86::VPMAXUWrm, 0 },
+ { X86::VPMULDQrr, X86::VPMULDQrm, 0 },
+ { X86::VPMULHRSWrr128, X86::VPMULHRSWrm128, 0 },
+ { X86::VPMULHUWrr, X86::VPMULHUWrm, 0 },
+ { X86::VPMULHWrr, X86::VPMULHWrm, 0 },
+ { X86::VPMULLDrr, X86::VPMULLDrm, 0 },
+ { X86::VPMULLWrr, X86::VPMULLWrm, 0 },
+ { X86::VPMULUDQrr, X86::VPMULUDQrm, 0 },
+ { X86::VPORrr, X86::VPORrm, 0 },
+ { X86::VPSADBWrr, X86::VPSADBWrm, 0 },
+ { X86::VPSHUFBrr, X86::VPSHUFBrm, 0 },
+ { X86::VPSIGNBrr, X86::VPSIGNBrm, 0 },
+ { X86::VPSIGNWrr, X86::VPSIGNWrm, 0 },
+ { X86::VPSIGNDrr, X86::VPSIGNDrm, 0 },
+ { X86::VPSLLDrr, X86::VPSLLDrm, 0 },
+ { X86::VPSLLQrr, X86::VPSLLQrm, 0 },
+ { X86::VPSLLWrr, X86::VPSLLWrm, 0 },
+ { X86::VPSRADrr, X86::VPSRADrm, 0 },
+ { X86::VPSRAWrr, X86::VPSRAWrm, 0 },
+ { X86::VPSRLDrr, X86::VPSRLDrm, 0 },
+ { X86::VPSRLQrr, X86::VPSRLQrm, 0 },
+ { X86::VPSRLWrr, X86::VPSRLWrm, 0 },
+ { X86::VPSUBBrr, X86::VPSUBBrm, 0 },
+ { X86::VPSUBDrr, X86::VPSUBDrm, 0 },
+ { X86::VPSUBQrr, X86::VPSUBQrm, 0 },
+ { X86::VPSUBSBrr, X86::VPSUBSBrm, 0 },
+ { X86::VPSUBSWrr, X86::VPSUBSWrm, 0 },
+ { X86::VPSUBUSBrr, X86::VPSUBUSBrm, 0 },
+ { X86::VPSUBUSWrr, X86::VPSUBUSWrm, 0 },
+ { X86::VPSUBWrr, X86::VPSUBWrm, 0 },
+ { X86::VPUNPCKHBWrr, X86::VPUNPCKHBWrm, 0 },
+ { X86::VPUNPCKHDQrr, X86::VPUNPCKHDQrm, 0 },
+ { X86::VPUNPCKHQDQrr, X86::VPUNPCKHQDQrm, 0 },
+ { X86::VPUNPCKHWDrr, X86::VPUNPCKHWDrm, 0 },
+ { X86::VPUNPCKLBWrr, X86::VPUNPCKLBWrm, 0 },
+ { X86::VPUNPCKLDQrr, X86::VPUNPCKLDQrm, 0 },
+ { X86::VPUNPCKLQDQrr, X86::VPUNPCKLQDQrm, 0 },
+ { X86::VPUNPCKLWDrr, X86::VPUNPCKLWDrm, 0 },
+ { X86::VPXORrr, X86::VPXORrm, 0 },
+ { X86::VROUNDSDr, X86::VROUNDSDm, 0 },
+ { X86::VROUNDSSr, X86::VROUNDSSm, 0 },
+ { X86::VSHUFPDrri, X86::VSHUFPDrmi, 0 },
+ { X86::VSHUFPSrri, X86::VSHUFPSrmi, 0 },
+ { X86::VSUBPDrr, X86::VSUBPDrm, 0 },
+ { X86::VSUBPSrr, X86::VSUBPSrm, 0 },
+ { X86::VSUBSDrr, X86::VSUBSDrm, 0 },
+ { X86::VSUBSDrr_Int, X86::VSUBSDrm_Int, 0 },
+ { X86::VSUBSSrr, X86::VSUBSSrm, 0 },
+ { X86::VSUBSSrr_Int, X86::VSUBSSrm_Int, 0 },
+ { X86::VUNPCKHPDrr, X86::VUNPCKHPDrm, 0 },
+ { X86::VUNPCKHPSrr, X86::VUNPCKHPSrm, 0 },
+ { X86::VUNPCKLPDrr, X86::VUNPCKLPDrm, 0 },
+ { X86::VUNPCKLPSrr, X86::VUNPCKLPSrm, 0 },
+ { X86::VXORPDrr, X86::VXORPDrm, 0 },
+ { X86::VXORPSrr, X86::VXORPSrm, 0 },
+
+ // AVX 256-bit foldable instructions
+ { X86::VADDPDYrr, X86::VADDPDYrm, 0 },
+ { X86::VADDPSYrr, X86::VADDPSYrm, 0 },
+ { X86::VADDSUBPDYrr, X86::VADDSUBPDYrm, 0 },
+ { X86::VADDSUBPSYrr, X86::VADDSUBPSYrm, 0 },
+ { X86::VANDNPDYrr, X86::VANDNPDYrm, 0 },
+ { X86::VANDNPSYrr, X86::VANDNPSYrm, 0 },
+ { X86::VANDPDYrr, X86::VANDPDYrm, 0 },
+ { X86::VANDPSYrr, X86::VANDPSYrm, 0 },
+ { X86::VBLENDPDYrri, X86::VBLENDPDYrmi, 0 },
+ { X86::VBLENDPSYrri, X86::VBLENDPSYrmi, 0 },
+ { X86::VBLENDVPDYrr, X86::VBLENDVPDYrm, 0 },
+ { X86::VBLENDVPSYrr, X86::VBLENDVPSYrm, 0 },
+ { X86::VCMPPDYrri, X86::VCMPPDYrmi, 0 },
+ { X86::VCMPPSYrri, X86::VCMPPSYrmi, 0 },
+ { X86::VDIVPDYrr, X86::VDIVPDYrm, 0 },
+ { X86::VDIVPSYrr, X86::VDIVPSYrm, 0 },
+ { X86::VDPPSYrri, X86::VDPPSYrmi, 0 },
+ { X86::VHADDPDYrr, X86::VHADDPDYrm, 0 },
+ { X86::VHADDPSYrr, X86::VHADDPSYrm, 0 },
+ { X86::VHSUBPDYrr, X86::VHSUBPDYrm, 0 },
+ { X86::VHSUBPSYrr, X86::VHSUBPSYrm, 0 },
+ { X86::VINSERTF128rr, X86::VINSERTF128rm, 0 },
+ { X86::VMAXPDYrr, X86::VMAXPDYrm, 0 },
+ { X86::VMAXPSYrr, X86::VMAXPSYrm, 0 },
+ { X86::VMINPDYrr, X86::VMINPDYrm, 0 },
+ { X86::VMINPSYrr, X86::VMINPSYrm, 0 },
+ { X86::VMULPDYrr, X86::VMULPDYrm, 0 },
+ { X86::VMULPSYrr, X86::VMULPSYrm, 0 },
+ { X86::VORPDYrr, X86::VORPDYrm, 0 },
+ { X86::VORPSYrr, X86::VORPSYrm, 0 },
+ { X86::VPERM2F128rr, X86::VPERM2F128rm, 0 },
+ { X86::VPERMILPDYrr, X86::VPERMILPDYrm, 0 },
+ { X86::VPERMILPSYrr, X86::VPERMILPSYrm, 0 },
+ { X86::VSHUFPDYrri, X86::VSHUFPDYrmi, 0 },
+ { X86::VSHUFPSYrri, X86::VSHUFPSYrmi, 0 },
+ { X86::VSUBPDYrr, X86::VSUBPDYrm, 0 },
+ { X86::VSUBPSYrr, X86::VSUBPSYrm, 0 },
+ { X86::VUNPCKHPDYrr, X86::VUNPCKHPDYrm, 0 },
+ { X86::VUNPCKHPSYrr, X86::VUNPCKHPSYrm, 0 },
+ { X86::VUNPCKLPDYrr, X86::VUNPCKLPDYrm, 0 },
+ { X86::VUNPCKLPSYrr, X86::VUNPCKLPSYrm, 0 },
+ { X86::VXORPDYrr, X86::VXORPDYrm, 0 },
+ { X86::VXORPSYrr, X86::VXORPSYrm, 0 },
+
+ // AVX2 foldable instructions
+ { X86::VINSERTI128rr, X86::VINSERTI128rm, 0 },
+ { X86::VPACKSSDWYrr, X86::VPACKSSDWYrm, 0 },
+ { X86::VPACKSSWBYrr, X86::VPACKSSWBYrm, 0 },
+ { X86::VPACKUSDWYrr, X86::VPACKUSDWYrm, 0 },
+ { X86::VPACKUSWBYrr, X86::VPACKUSWBYrm, 0 },
+ { X86::VPADDBYrr, X86::VPADDBYrm, 0 },
+ { X86::VPADDDYrr, X86::VPADDDYrm, 0 },
+ { X86::VPADDQYrr, X86::VPADDQYrm, 0 },
+ { X86::VPADDSBYrr, X86::VPADDSBYrm, 0 },
+ { X86::VPADDSWYrr, X86::VPADDSWYrm, 0 },
+ { X86::VPADDUSBYrr, X86::VPADDUSBYrm, 0 },
+ { X86::VPADDUSWYrr, X86::VPADDUSWYrm, 0 },
+ { X86::VPADDWYrr, X86::VPADDWYrm, 0 },
+ { X86::VPALIGNR256rr, X86::VPALIGNR256rm, 0 },
+ { X86::VPANDNYrr, X86::VPANDNYrm, 0 },
+ { X86::VPANDYrr, X86::VPANDYrm, 0 },
+ { X86::VPAVGBYrr, X86::VPAVGBYrm, 0 },
+ { X86::VPAVGWYrr, X86::VPAVGWYrm, 0 },
+ { X86::VPBLENDDrri, X86::VPBLENDDrmi, 0 },
+ { X86::VPBLENDDYrri, X86::VPBLENDDYrmi, 0 },
+ { X86::VPBLENDVBYrr, X86::VPBLENDVBYrm, 0 },
+ { X86::VPBLENDWYrri, X86::VPBLENDWYrmi, 0 },
+ { X86::VPCMPEQBYrr, X86::VPCMPEQBYrm, 0 },
+ { X86::VPCMPEQDYrr, X86::VPCMPEQDYrm, 0 },
+ { X86::VPCMPEQQYrr, X86::VPCMPEQQYrm, 0 },
+ { X86::VPCMPEQWYrr, X86::VPCMPEQWYrm, 0 },
+ { X86::VPCMPGTBYrr, X86::VPCMPGTBYrm, 0 },
+ { X86::VPCMPGTDYrr, X86::VPCMPGTDYrm, 0 },
+ { X86::VPCMPGTQYrr, X86::VPCMPGTQYrm, 0 },
+ { X86::VPCMPGTWYrr, X86::VPCMPGTWYrm, 0 },
+ { X86::VPERM2I128rr, X86::VPERM2I128rm, 0 },
+ { X86::VPERMDYrr, X86::VPERMDYrm, 0 },
+ { X86::VPERMPSYrr, X86::VPERMPSYrm, 0 },
+ { X86::VPHADDDYrr, X86::VPHADDDYrm, 0 },
+ { X86::VPHADDSWrr256, X86::VPHADDSWrm256, 0 },
+ { X86::VPHADDWYrr, X86::VPHADDWYrm, 0 },
+ { X86::VPHSUBDYrr, X86::VPHSUBDYrm, 0 },
+ { X86::VPHSUBSWrr256, X86::VPHSUBSWrm256, 0 },
+ { X86::VPHSUBWYrr, X86::VPHSUBWYrm, 0 },
+ { X86::VPMADDUBSWrr256, X86::VPMADDUBSWrm256, 0 },
+ { X86::VPMADDWDYrr, X86::VPMADDWDYrm, 0 },
+ { X86::VPMAXSWYrr, X86::VPMAXSWYrm, 0 },
+ { X86::VPMAXUBYrr, X86::VPMAXUBYrm, 0 },
+ { X86::VPMINSWYrr, X86::VPMINSWYrm, 0 },
+ { X86::VPMINUBYrr, X86::VPMINUBYrm, 0 },
+ { X86::VPMINSBYrr, X86::VPMINSBYrm, 0 },
+ { X86::VPMINSDYrr, X86::VPMINSDYrm, 0 },
+ { X86::VPMINUDYrr, X86::VPMINUDYrm, 0 },
+ { X86::VPMINUWYrr, X86::VPMINUWYrm, 0 },
+ { X86::VPMAXSBYrr, X86::VPMAXSBYrm, 0 },
+ { X86::VPMAXSDYrr, X86::VPMAXSDYrm, 0 },
+ { X86::VPMAXUDYrr, X86::VPMAXUDYrm, 0 },
+ { X86::VPMAXUWYrr, X86::VPMAXUWYrm, 0 },
+ { X86::VMPSADBWYrri, X86::VMPSADBWYrmi, 0 },
+ { X86::VPMULDQYrr, X86::VPMULDQYrm, 0 },
+ { X86::VPMULHRSWrr256, X86::VPMULHRSWrm256, 0 },
+ { X86::VPMULHUWYrr, X86::VPMULHUWYrm, 0 },
+ { X86::VPMULHWYrr, X86::VPMULHWYrm, 0 },
+ { X86::VPMULLDYrr, X86::VPMULLDYrm, 0 },
+ { X86::VPMULLWYrr, X86::VPMULLWYrm, 0 },
+ { X86::VPMULUDQYrr, X86::VPMULUDQYrm, 0 },
+ { X86::VPORYrr, X86::VPORYrm, 0 },
+ { X86::VPSADBWYrr, X86::VPSADBWYrm, 0 },
+ { X86::VPSHUFBYrr, X86::VPSHUFBYrm, 0 },
+ { X86::VPSIGNBYrr, X86::VPSIGNBYrm, 0 },
+ { X86::VPSIGNWYrr, X86::VPSIGNWYrm, 0 },
+ { X86::VPSIGNDYrr, X86::VPSIGNDYrm, 0 },
+ { X86::VPSLLDYrr, X86::VPSLLDYrm, 0 },
+ { X86::VPSLLQYrr, X86::VPSLLQYrm, 0 },
+ { X86::VPSLLWYrr, X86::VPSLLWYrm, 0 },
+ { X86::VPSLLVDrr, X86::VPSLLVDrm, 0 },
+ { X86::VPSLLVDYrr, X86::VPSLLVDYrm, 0 },
+ { X86::VPSLLVQrr, X86::VPSLLVQrm, 0 },
+ { X86::VPSLLVQYrr, X86::VPSLLVQYrm, 0 },
+ { X86::VPSRADYrr, X86::VPSRADYrm, 0 },
+ { X86::VPSRAWYrr, X86::VPSRAWYrm, 0 },
+ { X86::VPSRAVDrr, X86::VPSRAVDrm, 0 },
+ { X86::VPSRAVDYrr, X86::VPSRAVDYrm, 0 },
+ { X86::VPSRLDYrr, X86::VPSRLDYrm, 0 },
+ { X86::VPSRLQYrr, X86::VPSRLQYrm, 0 },
+ { X86::VPSRLWYrr, X86::VPSRLWYrm, 0 },
+ { X86::VPSRLVDrr, X86::VPSRLVDrm, 0 },
+ { X86::VPSRLVDYrr, X86::VPSRLVDYrm, 0 },
+ { X86::VPSRLVQrr, X86::VPSRLVQrm, 0 },
+ { X86::VPSRLVQYrr, X86::VPSRLVQYrm, 0 },
+ { X86::VPSUBBYrr, X86::VPSUBBYrm, 0 },
+ { X86::VPSUBDYrr, X86::VPSUBDYrm, 0 },
+ { X86::VPSUBQYrr, X86::VPSUBQYrm, 0 },
+ { X86::VPSUBSBYrr, X86::VPSUBSBYrm, 0 },
+ { X86::VPSUBSWYrr, X86::VPSUBSWYrm, 0 },
+ { X86::VPSUBUSBYrr, X86::VPSUBUSBYrm, 0 },
+ { X86::VPSUBUSWYrr, X86::VPSUBUSWYrm, 0 },
+ { X86::VPSUBWYrr, X86::VPSUBWYrm, 0 },
+ { X86::VPUNPCKHBWYrr, X86::VPUNPCKHBWYrm, 0 },
+ { X86::VPUNPCKHDQYrr, X86::VPUNPCKHDQYrm, 0 },
+ { X86::VPUNPCKHQDQYrr, X86::VPUNPCKHQDQYrm, 0 },
+ { X86::VPUNPCKHWDYrr, X86::VPUNPCKHWDYrm, 0 },
+ { X86::VPUNPCKLBWYrr, X86::VPUNPCKLBWYrm, 0 },
+ { X86::VPUNPCKLDQYrr, X86::VPUNPCKLDQYrm, 0 },
+ { X86::VPUNPCKLQDQYrr, X86::VPUNPCKLQDQYrm, 0 },
+ { X86::VPUNPCKLWDYrr, X86::VPUNPCKLWDYrm, 0 },
+ { X86::VPXORYrr, X86::VPXORYrm, 0 },
+
+ // FMA4 foldable patterns
+ { X86::VFMADDSS4rr, X86::VFMADDSS4mr, TB_ALIGN_NONE },
+ { X86::VFMADDSD4rr, X86::VFMADDSD4mr, TB_ALIGN_NONE },
+ { X86::VFMADDPS4rr, X86::VFMADDPS4mr, TB_ALIGN_NONE },
+ { X86::VFMADDPD4rr, X86::VFMADDPD4mr, TB_ALIGN_NONE },
+ { X86::VFMADDPS4rrY, X86::VFMADDPS4mrY, TB_ALIGN_NONE },
+ { X86::VFMADDPD4rrY, X86::VFMADDPD4mrY, TB_ALIGN_NONE },
+ { X86::VFNMADDSS4rr, X86::VFNMADDSS4mr, TB_ALIGN_NONE },
+ { X86::VFNMADDSD4rr, X86::VFNMADDSD4mr, TB_ALIGN_NONE },
+ { X86::VFNMADDPS4rr, X86::VFNMADDPS4mr, TB_ALIGN_NONE },
+ { X86::VFNMADDPD4rr, X86::VFNMADDPD4mr, TB_ALIGN_NONE },
+ { X86::VFNMADDPS4rrY, X86::VFNMADDPS4mrY, TB_ALIGN_NONE },
+ { X86::VFNMADDPD4rrY, X86::VFNMADDPD4mrY, TB_ALIGN_NONE },
+ { X86::VFMSUBSS4rr, X86::VFMSUBSS4mr, TB_ALIGN_NONE },
+ { X86::VFMSUBSD4rr, X86::VFMSUBSD4mr, TB_ALIGN_NONE },
+ { X86::VFMSUBPS4rr, X86::VFMSUBPS4mr, TB_ALIGN_NONE },
+ { X86::VFMSUBPD4rr, X86::VFMSUBPD4mr, TB_ALIGN_NONE },
+ { X86::VFMSUBPS4rrY, X86::VFMSUBPS4mrY, TB_ALIGN_NONE },
+ { X86::VFMSUBPD4rrY, X86::VFMSUBPD4mrY, TB_ALIGN_NONE },
+ { X86::VFNMSUBSS4rr, X86::VFNMSUBSS4mr, TB_ALIGN_NONE },
+ { X86::VFNMSUBSD4rr, X86::VFNMSUBSD4mr, TB_ALIGN_NONE },
+ { X86::VFNMSUBPS4rr, X86::VFNMSUBPS4mr, TB_ALIGN_NONE },
+ { X86::VFNMSUBPD4rr, X86::VFNMSUBPD4mr, TB_ALIGN_NONE },
+ { X86::VFNMSUBPS4rrY, X86::VFNMSUBPS4mrY, TB_ALIGN_NONE },
+ { X86::VFNMSUBPD4rrY, X86::VFNMSUBPD4mrY, TB_ALIGN_NONE },
+ { X86::VFMADDSUBPS4rr, X86::VFMADDSUBPS4mr, TB_ALIGN_NONE },
+ { X86::VFMADDSUBPD4rr, X86::VFMADDSUBPD4mr, TB_ALIGN_NONE },
+ { X86::VFMADDSUBPS4rrY, X86::VFMADDSUBPS4mrY, TB_ALIGN_NONE },
+ { X86::VFMADDSUBPD4rrY, X86::VFMADDSUBPD4mrY, TB_ALIGN_NONE },
+ { X86::VFMSUBADDPS4rr, X86::VFMSUBADDPS4mr, TB_ALIGN_NONE },
+ { X86::VFMSUBADDPD4rr, X86::VFMSUBADDPD4mr, TB_ALIGN_NONE },
+ { X86::VFMSUBADDPS4rrY, X86::VFMSUBADDPS4mrY, TB_ALIGN_NONE },
+ { X86::VFMSUBADDPD4rrY, X86::VFMSUBADDPD4mrY, TB_ALIGN_NONE },
+
+ // XOP foldable instructions
+ { X86::VPCMOVrr, X86::VPCMOVmr, 0 },
+ { X86::VPCMOVrrY, X86::VPCMOVmrY, 0 },
+ { X86::VPCOMBri, X86::VPCOMBmi, 0 },
+ { X86::VPCOMDri, X86::VPCOMDmi, 0 },
+ { X86::VPCOMQri, X86::VPCOMQmi, 0 },
+ { X86::VPCOMWri, X86::VPCOMWmi, 0 },
+ { X86::VPCOMUBri, X86::VPCOMUBmi, 0 },
+ { X86::VPCOMUDri, X86::VPCOMUDmi, 0 },
+ { X86::VPCOMUQri, X86::VPCOMUQmi, 0 },
+ { X86::VPCOMUWri, X86::VPCOMUWmi, 0 },
+ { X86::VPERMIL2PDrr, X86::VPERMIL2PDmr, 0 },
+ { X86::VPERMIL2PDrrY, X86::VPERMIL2PDmrY, 0 },
+ { X86::VPERMIL2PSrr, X86::VPERMIL2PSmr, 0 },
+ { X86::VPERMIL2PSrrY, X86::VPERMIL2PSmrY, 0 },
+ { X86::VPMACSDDrr, X86::VPMACSDDrm, 0 },
+ { X86::VPMACSDQHrr, X86::VPMACSDQHrm, 0 },
+ { X86::VPMACSDQLrr, X86::VPMACSDQLrm, 0 },
+ { X86::VPMACSSDDrr, X86::VPMACSSDDrm, 0 },
+ { X86::VPMACSSDQHrr, X86::VPMACSSDQHrm, 0 },
+ { X86::VPMACSSDQLrr, X86::VPMACSSDQLrm, 0 },
+ { X86::VPMACSSWDrr, X86::VPMACSSWDrm, 0 },
+ { X86::VPMACSSWWrr, X86::VPMACSSWWrm, 0 },
+ { X86::VPMACSWDrr, X86::VPMACSWDrm, 0 },
+ { X86::VPMACSWWrr, X86::VPMACSWWrm, 0 },
+ { X86::VPMADCSSWDrr, X86::VPMADCSSWDrm, 0 },
+ { X86::VPMADCSWDrr, X86::VPMADCSWDrm, 0 },
+ { X86::VPPERMrr, X86::VPPERMmr, 0 },
+ { X86::VPROTBrr, X86::VPROTBrm, 0 },
+ { X86::VPROTDrr, X86::VPROTDrm, 0 },
+ { X86::VPROTQrr, X86::VPROTQrm, 0 },
+ { X86::VPROTWrr, X86::VPROTWrm, 0 },
+ { X86::VPSHABrr, X86::VPSHABrm, 0 },
+ { X86::VPSHADrr, X86::VPSHADrm, 0 },
+ { X86::VPSHAQrr, X86::VPSHAQrm, 0 },
+ { X86::VPSHAWrr, X86::VPSHAWrm, 0 },
+ { X86::VPSHLBrr, X86::VPSHLBrm, 0 },
+ { X86::VPSHLDrr, X86::VPSHLDrm, 0 },
+ { X86::VPSHLQrr, X86::VPSHLQrm, 0 },
+ { X86::VPSHLWrr, X86::VPSHLWrm, 0 },
+
+ // BMI/BMI2 foldable instructions
+ { X86::ANDN32rr, X86::ANDN32rm, 0 },
+ { X86::ANDN64rr, X86::ANDN64rm, 0 },
+ { X86::MULX32rr, X86::MULX32rm, 0 },
+ { X86::MULX64rr, X86::MULX64rm, 0 },
+ { X86::PDEP32rr, X86::PDEP32rm, 0 },
+ { X86::PDEP64rr, X86::PDEP64rm, 0 },
+ { X86::PEXT32rr, X86::PEXT32rm, 0 },
+ { X86::PEXT64rr, X86::PEXT64rm, 0 },
+
+ // ADX foldable instructions
+ { X86::ADCX32rr, X86::ADCX32rm, 0 },
+ { X86::ADCX64rr, X86::ADCX64rm, 0 },
+ { X86::ADOX32rr, X86::ADOX32rm, 0 },
+ { X86::ADOX64rr, X86::ADOX64rm, 0 },
+
+ // AVX-512 foldable instructions
+ { X86::VADDPSZrr, X86::VADDPSZrm, 0 },
+ { X86::VADDPDZrr, X86::VADDPDZrm, 0 },
+ { X86::VSUBPSZrr, X86::VSUBPSZrm, 0 },
+ { X86::VSUBPDZrr, X86::VSUBPDZrm, 0 },
+ { X86::VMULPSZrr, X86::VMULPSZrm, 0 },
+ { X86::VMULPDZrr, X86::VMULPDZrm, 0 },
+ { X86::VDIVPSZrr, X86::VDIVPSZrm, 0 },
+ { X86::VDIVPDZrr, X86::VDIVPDZrm, 0 },
+ { X86::VMINPSZrr, X86::VMINPSZrm, 0 },
+ { X86::VMINPDZrr, X86::VMINPDZrm, 0 },
+ { X86::VMAXPSZrr, X86::VMAXPSZrm, 0 },
+ { X86::VMAXPDZrr, X86::VMAXPDZrm, 0 },
+ { X86::VPADDDZrr, X86::VPADDDZrm, 0 },
+ { X86::VPADDQZrr, X86::VPADDQZrm, 0 },
+ { X86::VPERMPDZri, X86::VPERMPDZmi, 0 },
+ { X86::VPERMPSZrr, X86::VPERMPSZrm, 0 },
+ { X86::VPMAXSDZrr, X86::VPMAXSDZrm, 0 },
+ { X86::VPMAXSQZrr, X86::VPMAXSQZrm, 0 },
+ { X86::VPMAXUDZrr, X86::VPMAXUDZrm, 0 },
+ { X86::VPMAXUQZrr, X86::VPMAXUQZrm, 0 },
+ { X86::VPMINSDZrr, X86::VPMINSDZrm, 0 },
+ { X86::VPMINSQZrr, X86::VPMINSQZrm, 0 },
+ { X86::VPMINUDZrr, X86::VPMINUDZrm, 0 },
+ { X86::VPMINUQZrr, X86::VPMINUQZrm, 0 },
+ { X86::VPMULDQZrr, X86::VPMULDQZrm, 0 },
+ { X86::VPSLLVDZrr, X86::VPSLLVDZrm, 0 },
+ { X86::VPSLLVQZrr, X86::VPSLLVQZrm, 0 },
+ { X86::VPSRAVDZrr, X86::VPSRAVDZrm, 0 },
+ { X86::VPSRLVDZrr, X86::VPSRLVDZrm, 0 },
+ { X86::VPSRLVQZrr, X86::VPSRLVQZrm, 0 },
+ { X86::VPSUBDZrr, X86::VPSUBDZrm, 0 },
+ { X86::VPSUBQZrr, X86::VPSUBQZrm, 0 },
+ { X86::VSHUFPDZrri, X86::VSHUFPDZrmi, 0 },
+ { X86::VSHUFPSZrri, X86::VSHUFPSZrmi, 0 },
+ { X86::VALIGNQZrri, X86::VALIGNQZrmi, 0 },
+ { X86::VALIGNDZrri, X86::VALIGNDZrmi, 0 },
+ { X86::VPMULUDQZrr, X86::VPMULUDQZrm, 0 },
+ { X86::VBROADCASTSSZrkz, X86::VBROADCASTSSZmkz, TB_NO_REVERSE },
+ { X86::VBROADCASTSDZrkz, X86::VBROADCASTSDZmkz, TB_NO_REVERSE },
+
+ // AVX-512{F,VL} foldable masked broadcast instructions
+ { X86::VBROADCASTSSZ256rkz, X86::VBROADCASTSSZ256mkz, TB_NO_REVERSE },
+ { X86::VBROADCASTSDZ256rkz, X86::VBROADCASTSDZ256mkz, TB_NO_REVERSE },
+ { X86::VBROADCASTSSZ128rkz, X86::VBROADCASTSSZ128mkz, TB_NO_REVERSE },
+
+ // AVX-512{F,VL} foldable instructions
+ { X86::VADDPDZ128rr, X86::VADDPDZ128rm, 0 },
+ { X86::VADDPDZ256rr, X86::VADDPDZ256rm, 0 },
+ { X86::VADDPSZ128rr, X86::VADDPSZ128rm, 0 },
+ { X86::VADDPSZ256rr, X86::VADDPSZ256rm, 0 },
+
+ // AES foldable instructions
+ { X86::AESDECLASTrr, X86::AESDECLASTrm, TB_ALIGN_16 },
+ { X86::AESDECrr, X86::AESDECrm, TB_ALIGN_16 },
+ { X86::AESENCLASTrr, X86::AESENCLASTrm, TB_ALIGN_16 },
+ { X86::AESENCrr, X86::AESENCrm, TB_ALIGN_16 },
+ { X86::VAESDECLASTrr, X86::VAESDECLASTrm, 0 },
+ { X86::VAESDECrr, X86::VAESDECrm, 0 },
+ { X86::VAESENCLASTrr, X86::VAESENCLASTrm, 0 },
+ { X86::VAESENCrr, X86::VAESENCrm, 0 },
+
+ // SHA foldable instructions
+ { X86::SHA1MSG1rr, X86::SHA1MSG1rm, TB_ALIGN_16 },
+ { X86::SHA1MSG2rr, X86::SHA1MSG2rm, TB_ALIGN_16 },
+ { X86::SHA1NEXTErr, X86::SHA1NEXTErm, TB_ALIGN_16 },
+ { X86::SHA1RNDS4rri, X86::SHA1RNDS4rmi, TB_ALIGN_16 },
+ { X86::SHA256MSG1rr, X86::SHA256MSG1rm, TB_ALIGN_16 },
+ { X86::SHA256MSG2rr, X86::SHA256MSG2rm, TB_ALIGN_16 },
+ { X86::SHA256RNDS2rr, X86::SHA256RNDS2rm, TB_ALIGN_16 }
+ };
+
+ for (X86MemoryFoldTableEntry Entry : MemoryFoldTable2) {
+ AddTableEntry(RegOp2MemOpTable2, MemOp2RegOpTable,
+ Entry.RegOp, Entry.MemOp,
+ // Index 2, folded load
+ Entry.Flags | TB_INDEX_2 | TB_FOLDED_LOAD);
+ }
+
+ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
+ // FMA foldable instructions
+ { X86::VFMADDSSr231r, X86::VFMADDSSr231m, TB_ALIGN_NONE },
+ { X86::VFMADDSSr231r_Int, X86::VFMADDSSr231m_Int, TB_ALIGN_NONE },
+ { X86::VFMADDSDr231r, X86::VFMADDSDr231m, TB_ALIGN_NONE },
+ { X86::VFMADDSDr231r_Int, X86::VFMADDSDr231m_Int, TB_ALIGN_NONE },
+ { X86::VFMADDSSr132r, X86::VFMADDSSr132m, TB_ALIGN_NONE },
+ { X86::VFMADDSSr132r_Int, X86::VFMADDSSr132m_Int, TB_ALIGN_NONE },
+ { X86::VFMADDSDr132r, X86::VFMADDSDr132m, TB_ALIGN_NONE },
+ { X86::VFMADDSDr132r_Int, X86::VFMADDSDr132m_Int, TB_ALIGN_NONE },
+ { X86::VFMADDSSr213r, X86::VFMADDSSr213m, TB_ALIGN_NONE },
+ { X86::VFMADDSSr213r_Int, X86::VFMADDSSr213m_Int, TB_ALIGN_NONE },
+ { X86::VFMADDSDr213r, X86::VFMADDSDr213m, TB_ALIGN_NONE },
+ { X86::VFMADDSDr213r_Int, X86::VFMADDSDr213m_Int, TB_ALIGN_NONE },
+
+ { X86::VFMADDPSr231r, X86::VFMADDPSr231m, TB_ALIGN_NONE },
+ { X86::VFMADDPDr231r, X86::VFMADDPDr231m, TB_ALIGN_NONE },
+ { X86::VFMADDPSr132r, X86::VFMADDPSr132m, TB_ALIGN_NONE },
+ { X86::VFMADDPDr132r, X86::VFMADDPDr132m, TB_ALIGN_NONE },
+ { X86::VFMADDPSr213r, X86::VFMADDPSr213m, TB_ALIGN_NONE },
+ { X86::VFMADDPDr213r, X86::VFMADDPDr213m, TB_ALIGN_NONE },
+ { X86::VFMADDPSr231rY, X86::VFMADDPSr231mY, TB_ALIGN_NONE },
+ { X86::VFMADDPDr231rY, X86::VFMADDPDr231mY, TB_ALIGN_NONE },
+ { X86::VFMADDPSr132rY, X86::VFMADDPSr132mY, TB_ALIGN_NONE },
+ { X86::VFMADDPDr132rY, X86::VFMADDPDr132mY, TB_ALIGN_NONE },
+ { X86::VFMADDPSr213rY, X86::VFMADDPSr213mY, TB_ALIGN_NONE },
+ { X86::VFMADDPDr213rY, X86::VFMADDPDr213mY, TB_ALIGN_NONE },
+
+ { X86::VFNMADDSSr231r, X86::VFNMADDSSr231m, TB_ALIGN_NONE },
+ { X86::VFNMADDSSr231r_Int, X86::VFNMADDSSr231m_Int, TB_ALIGN_NONE },
+ { X86::VFNMADDSDr231r, X86::VFNMADDSDr231m, TB_ALIGN_NONE },
+ { X86::VFNMADDSDr231r_Int, X86::VFNMADDSDr231m_Int, TB_ALIGN_NONE },
+ { X86::VFNMADDSSr132r, X86::VFNMADDSSr132m, TB_ALIGN_NONE },
+ { X86::VFNMADDSSr132r_Int, X86::VFNMADDSSr132m_Int, TB_ALIGN_NONE },
+ { X86::VFNMADDSDr132r, X86::VFNMADDSDr132m, TB_ALIGN_NONE },
+ { X86::VFNMADDSDr132r_Int, X86::VFNMADDSDr132m_Int, TB_ALIGN_NONE },
+ { X86::VFNMADDSSr213r, X86::VFNMADDSSr213m, TB_ALIGN_NONE },
+ { X86::VFNMADDSSr213r_Int, X86::VFNMADDSSr213m_Int, TB_ALIGN_NONE },
+ { X86::VFNMADDSDr213r, X86::VFNMADDSDr213m, TB_ALIGN_NONE },
+ { X86::VFNMADDSDr213r_Int, X86::VFNMADDSDr213m_Int, TB_ALIGN_NONE },
+
+ { X86::VFNMADDPSr231r, X86::VFNMADDPSr231m, TB_ALIGN_NONE },
+ { X86::VFNMADDPDr231r, X86::VFNMADDPDr231m, TB_ALIGN_NONE },
+ { X86::VFNMADDPSr132r, X86::VFNMADDPSr132m, TB_ALIGN_NONE },
+ { X86::VFNMADDPDr132r, X86::VFNMADDPDr132m, TB_ALIGN_NONE },
+ { X86::VFNMADDPSr213r, X86::VFNMADDPSr213m, TB_ALIGN_NONE },
+ { X86::VFNMADDPDr213r, X86::VFNMADDPDr213m, TB_ALIGN_NONE },
+ { X86::VFNMADDPSr231rY, X86::VFNMADDPSr231mY, TB_ALIGN_NONE },
+ { X86::VFNMADDPDr231rY, X86::VFNMADDPDr231mY, TB_ALIGN_NONE },
+ { X86::VFNMADDPSr132rY, X86::VFNMADDPSr132mY, TB_ALIGN_NONE },
+ { X86::VFNMADDPDr132rY, X86::VFNMADDPDr132mY, TB_ALIGN_NONE },
+ { X86::VFNMADDPSr213rY, X86::VFNMADDPSr213mY, TB_ALIGN_NONE },
+ { X86::VFNMADDPDr213rY, X86::VFNMADDPDr213mY, TB_ALIGN_NONE },
+
+ { X86::VFMSUBSSr231r, X86::VFMSUBSSr231m, TB_ALIGN_NONE },
+ { X86::VFMSUBSSr231r_Int, X86::VFMSUBSSr231m_Int, TB_ALIGN_NONE },
+ { X86::VFMSUBSDr231r, X86::VFMSUBSDr231m, TB_ALIGN_NONE },
+ { X86::VFMSUBSDr231r_Int, X86::VFMSUBSDr231m_Int, TB_ALIGN_NONE },
+ { X86::VFMSUBSSr132r, X86::VFMSUBSSr132m, TB_ALIGN_NONE },
+ { X86::VFMSUBSSr132r_Int, X86::VFMSUBSSr132m_Int, TB_ALIGN_NONE },
+ { X86::VFMSUBSDr132r, X86::VFMSUBSDr132m, TB_ALIGN_NONE },
+ { X86::VFMSUBSDr132r_Int, X86::VFMSUBSDr132m_Int, TB_ALIGN_NONE },
+ { X86::VFMSUBSSr213r, X86::VFMSUBSSr213m, TB_ALIGN_NONE },
+ { X86::VFMSUBSSr213r_Int, X86::VFMSUBSSr213m_Int, TB_ALIGN_NONE },
+ { X86::VFMSUBSDr213r, X86::VFMSUBSDr213m, TB_ALIGN_NONE },
+ { X86::VFMSUBSDr213r_Int, X86::VFMSUBSDr213m_Int, TB_ALIGN_NONE },
+
+ { X86::VFMSUBPSr231r, X86::VFMSUBPSr231m, TB_ALIGN_NONE },
+ { X86::VFMSUBPDr231r, X86::VFMSUBPDr231m, TB_ALIGN_NONE },
+ { X86::VFMSUBPSr132r, X86::VFMSUBPSr132m, TB_ALIGN_NONE },
+ { X86::VFMSUBPDr132r, X86::VFMSUBPDr132m, TB_ALIGN_NONE },
+ { X86::VFMSUBPSr213r, X86::VFMSUBPSr213m, TB_ALIGN_NONE },
+ { X86::VFMSUBPDr213r, X86::VFMSUBPDr213m, TB_ALIGN_NONE },
+ { X86::VFMSUBPSr231rY, X86::VFMSUBPSr231mY, TB_ALIGN_NONE },
+ { X86::VFMSUBPDr231rY, X86::VFMSUBPDr231mY, TB_ALIGN_NONE },
+ { X86::VFMSUBPSr132rY, X86::VFMSUBPSr132mY, TB_ALIGN_NONE },
+ { X86::VFMSUBPDr132rY, X86::VFMSUBPDr132mY, TB_ALIGN_NONE },
+ { X86::VFMSUBPSr213rY, X86::VFMSUBPSr213mY, TB_ALIGN_NONE },
+ { X86::VFMSUBPDr213rY, X86::VFMSUBPDr213mY, TB_ALIGN_NONE },
+
+ { X86::VFNMSUBSSr231r, X86::VFNMSUBSSr231m, TB_ALIGN_NONE },
+ { X86::VFNMSUBSSr231r_Int, X86::VFNMSUBSSr231m_Int, TB_ALIGN_NONE },
+ { X86::VFNMSUBSDr231r, X86::VFNMSUBSDr231m, TB_ALIGN_NONE },
+ { X86::VFNMSUBSDr231r_Int, X86::VFNMSUBSDr231m_Int, TB_ALIGN_NONE },
+ { X86::VFNMSUBSSr132r, X86::VFNMSUBSSr132m, TB_ALIGN_NONE },
+ { X86::VFNMSUBSSr132r_Int, X86::VFNMSUBSSr132m_Int, TB_ALIGN_NONE },
+ { X86::VFNMSUBSDr132r, X86::VFNMSUBSDr132m, TB_ALIGN_NONE },
+ { X86::VFNMSUBSDr132r_Int, X86::VFNMSUBSDr132m_Int, TB_ALIGN_NONE },
+ { X86::VFNMSUBSSr213r, X86::VFNMSUBSSr213m, TB_ALIGN_NONE },
+ { X86::VFNMSUBSSr213r_Int, X86::VFNMSUBSSr213m_Int, TB_ALIGN_NONE },
+ { X86::VFNMSUBSDr213r, X86::VFNMSUBSDr213m, TB_ALIGN_NONE },
+ { X86::VFNMSUBSDr213r_Int, X86::VFNMSUBSDr213m_Int, TB_ALIGN_NONE },
+
+ { X86::VFNMSUBPSr231r, X86::VFNMSUBPSr231m, TB_ALIGN_NONE },
+ { X86::VFNMSUBPDr231r, X86::VFNMSUBPDr231m, TB_ALIGN_NONE },
+ { X86::VFNMSUBPSr132r, X86::VFNMSUBPSr132m, TB_ALIGN_NONE },
+ { X86::VFNMSUBPDr132r, X86::VFNMSUBPDr132m, TB_ALIGN_NONE },
+ { X86::VFNMSUBPSr213r, X86::VFNMSUBPSr213m, TB_ALIGN_NONE },
+ { X86::VFNMSUBPDr213r, X86::VFNMSUBPDr213m, TB_ALIGN_NONE },
+ { X86::VFNMSUBPSr231rY, X86::VFNMSUBPSr231mY, TB_ALIGN_NONE },
+ { X86::VFNMSUBPDr231rY, X86::VFNMSUBPDr231mY, TB_ALIGN_NONE },
+ { X86::VFNMSUBPSr132rY, X86::VFNMSUBPSr132mY, TB_ALIGN_NONE },
+ { X86::VFNMSUBPDr132rY, X86::VFNMSUBPDr132mY, TB_ALIGN_NONE },
+ { X86::VFNMSUBPSr213rY, X86::VFNMSUBPSr213mY, TB_ALIGN_NONE },
+ { X86::VFNMSUBPDr213rY, X86::VFNMSUBPDr213mY, TB_ALIGN_NONE },
+
+ { X86::VFMADDSUBPSr231r, X86::VFMADDSUBPSr231m, TB_ALIGN_NONE },
+ { X86::VFMADDSUBPDr231r, X86::VFMADDSUBPDr231m, TB_ALIGN_NONE },
+ { X86::VFMADDSUBPSr132r, X86::VFMADDSUBPSr132m, TB_ALIGN_NONE },
+ { X86::VFMADDSUBPDr132r, X86::VFMADDSUBPDr132m, TB_ALIGN_NONE },
+ { X86::VFMADDSUBPSr213r, X86::VFMADDSUBPSr213m, TB_ALIGN_NONE },
+ { X86::VFMADDSUBPDr213r, X86::VFMADDSUBPDr213m, TB_ALIGN_NONE },
+ { X86::VFMADDSUBPSr231rY, X86::VFMADDSUBPSr231mY, TB_ALIGN_NONE },
+ { X86::VFMADDSUBPDr231rY, X86::VFMADDSUBPDr231mY, TB_ALIGN_NONE },
+ { X86::VFMADDSUBPSr132rY, X86::VFMADDSUBPSr132mY, TB_ALIGN_NONE },
+ { X86::VFMADDSUBPDr132rY, X86::VFMADDSUBPDr132mY, TB_ALIGN_NONE },
+ { X86::VFMADDSUBPSr213rY, X86::VFMADDSUBPSr213mY, TB_ALIGN_NONE },
+ { X86::VFMADDSUBPDr213rY, X86::VFMADDSUBPDr213mY, TB_ALIGN_NONE },
+
+ { X86::VFMSUBADDPSr231r, X86::VFMSUBADDPSr231m, TB_ALIGN_NONE },
+ { X86::VFMSUBADDPDr231r, X86::VFMSUBADDPDr231m, TB_ALIGN_NONE },
+ { X86::VFMSUBADDPSr132r, X86::VFMSUBADDPSr132m, TB_ALIGN_NONE },
+ { X86::VFMSUBADDPDr132r, X86::VFMSUBADDPDr132m, TB_ALIGN_NONE },
+ { X86::VFMSUBADDPSr213r, X86::VFMSUBADDPSr213m, TB_ALIGN_NONE },
+ { X86::VFMSUBADDPDr213r, X86::VFMSUBADDPDr213m, TB_ALIGN_NONE },
+ { X86::VFMSUBADDPSr231rY, X86::VFMSUBADDPSr231mY, TB_ALIGN_NONE },
+ { X86::VFMSUBADDPDr231rY, X86::VFMSUBADDPDr231mY, TB_ALIGN_NONE },
+ { X86::VFMSUBADDPSr132rY, X86::VFMSUBADDPSr132mY, TB_ALIGN_NONE },
+ { X86::VFMSUBADDPDr132rY, X86::VFMSUBADDPDr132mY, TB_ALIGN_NONE },
+ { X86::VFMSUBADDPSr213rY, X86::VFMSUBADDPSr213mY, TB_ALIGN_NONE },
+ { X86::VFMSUBADDPDr213rY, X86::VFMSUBADDPDr213mY, TB_ALIGN_NONE },
+
+ // FMA4 foldable patterns
+ { X86::VFMADDSS4rr, X86::VFMADDSS4rm, TB_ALIGN_NONE },
+ { X86::VFMADDSD4rr, X86::VFMADDSD4rm, TB_ALIGN_NONE },
+ { X86::VFMADDPS4rr, X86::VFMADDPS4rm, TB_ALIGN_NONE },
+ { X86::VFMADDPD4rr, X86::VFMADDPD4rm, TB_ALIGN_NONE },
+ { X86::VFMADDPS4rrY, X86::VFMADDPS4rmY, TB_ALIGN_NONE },
+ { X86::VFMADDPD4rrY, X86::VFMADDPD4rmY, TB_ALIGN_NONE },
+ { X86::VFNMADDSS4rr, X86::VFNMADDSS4rm, TB_ALIGN_NONE },
+ { X86::VFNMADDSD4rr, X86::VFNMADDSD4rm, TB_ALIGN_NONE },
+ { X86::VFNMADDPS4rr, X86::VFNMADDPS4rm, TB_ALIGN_NONE },
+ { X86::VFNMADDPD4rr, X86::VFNMADDPD4rm, TB_ALIGN_NONE },
+ { X86::VFNMADDPS4rrY, X86::VFNMADDPS4rmY, TB_ALIGN_NONE },
+ { X86::VFNMADDPD4rrY, X86::VFNMADDPD4rmY, TB_ALIGN_NONE },
+ { X86::VFMSUBSS4rr, X86::VFMSUBSS4rm, TB_ALIGN_NONE },
+ { X86::VFMSUBSD4rr, X86::VFMSUBSD4rm, TB_ALIGN_NONE },
+ { X86::VFMSUBPS4rr, X86::VFMSUBPS4rm, TB_ALIGN_NONE },
+ { X86::VFMSUBPD4rr, X86::VFMSUBPD4rm, TB_ALIGN_NONE },
+ { X86::VFMSUBPS4rrY, X86::VFMSUBPS4rmY, TB_ALIGN_NONE },
+ { X86::VFMSUBPD4rrY, X86::VFMSUBPD4rmY, TB_ALIGN_NONE },
+ { X86::VFNMSUBSS4rr, X86::VFNMSUBSS4rm, TB_ALIGN_NONE },
+ { X86::VFNMSUBSD4rr, X86::VFNMSUBSD4rm, TB_ALIGN_NONE },
+ { X86::VFNMSUBPS4rr, X86::VFNMSUBPS4rm, TB_ALIGN_NONE },
+ { X86::VFNMSUBPD4rr, X86::VFNMSUBPD4rm, TB_ALIGN_NONE },
+ { X86::VFNMSUBPS4rrY, X86::VFNMSUBPS4rmY, TB_ALIGN_NONE },
+ { X86::VFNMSUBPD4rrY, X86::VFNMSUBPD4rmY, TB_ALIGN_NONE },
+ { X86::VFMADDSUBPS4rr, X86::VFMADDSUBPS4rm, TB_ALIGN_NONE },
+ { X86::VFMADDSUBPD4rr, X86::VFMADDSUBPD4rm, TB_ALIGN_NONE },
+ { X86::VFMADDSUBPS4rrY, X86::VFMADDSUBPS4rmY, TB_ALIGN_NONE },
+ { X86::VFMADDSUBPD4rrY, X86::VFMADDSUBPD4rmY, TB_ALIGN_NONE },
+ { X86::VFMSUBADDPS4rr, X86::VFMSUBADDPS4rm, TB_ALIGN_NONE },
+ { X86::VFMSUBADDPD4rr, X86::VFMSUBADDPD4rm, TB_ALIGN_NONE },
+ { X86::VFMSUBADDPS4rrY, X86::VFMSUBADDPS4rmY, TB_ALIGN_NONE },
+ { X86::VFMSUBADDPD4rrY, X86::VFMSUBADDPD4rmY, TB_ALIGN_NONE },
+
+ // XOP foldable instructions
+ { X86::VPCMOVrr, X86::VPCMOVrm, 0 },
+ { X86::VPCMOVrrY, X86::VPCMOVrmY, 0 },
+ { X86::VPERMIL2PDrr, X86::VPERMIL2PDrm, 0 },
+ { X86::VPERMIL2PDrrY, X86::VPERMIL2PDrmY, 0 },
+ { X86::VPERMIL2PSrr, X86::VPERMIL2PSrm, 0 },
+ { X86::VPERMIL2PSrrY, X86::VPERMIL2PSrmY, 0 },
+ { X86::VPPERMrr, X86::VPPERMrm, 0 },
+
+ // AVX-512 VPERMI, blend and masked broadcast instructions with 3 source
+ // operands.
+ { X86::VPERMI2Drr, X86::VPERMI2Drm, 0 },
+ { X86::VPERMI2Qrr, X86::VPERMI2Qrm, 0 },
+ { X86::VPERMI2PSrr, X86::VPERMI2PSrm, 0 },
+ { X86::VPERMI2PDrr, X86::VPERMI2PDrm, 0 },
+ { X86::VBLENDMPDZrr, X86::VBLENDMPDZrm, 0 },
+ { X86::VBLENDMPSZrr, X86::VBLENDMPSZrm, 0 },
+ { X86::VPBLENDMDZrr, X86::VPBLENDMDZrm, 0 },
+ { X86::VPBLENDMQZrr, X86::VPBLENDMQZrm, 0 },
+ { X86::VBROADCASTSSZrk, X86::VBROADCASTSSZmk, TB_NO_REVERSE },
+ { X86::VBROADCASTSDZrk, X86::VBROADCASTSDZmk, TB_NO_REVERSE },
+ { X86::VBROADCASTSSZ256rk, X86::VBROADCASTSSZ256mk, TB_NO_REVERSE },
+ { X86::VBROADCASTSDZ256rk, X86::VBROADCASTSDZ256mk, TB_NO_REVERSE },
+ { X86::VBROADCASTSSZ128rk, X86::VBROADCASTSSZ128mk, TB_NO_REVERSE },
+ // AVX-512 arithmetic instructions
+ { X86::VADDPSZrrkz, X86::VADDPSZrmkz, 0 },
+ { X86::VADDPDZrrkz, X86::VADDPDZrmkz, 0 },
+ { X86::VSUBPSZrrkz, X86::VSUBPSZrmkz, 0 },
+ { X86::VSUBPDZrrkz, X86::VSUBPDZrmkz, 0 },
+ { X86::VMULPSZrrkz, X86::VMULPSZrmkz, 0 },
+ { X86::VMULPDZrrkz, X86::VMULPDZrmkz, 0 },
+ { X86::VDIVPSZrrkz, X86::VDIVPSZrmkz, 0 },
+ { X86::VDIVPDZrrkz, X86::VDIVPDZrmkz, 0 },
+ { X86::VMINPSZrrkz, X86::VMINPSZrmkz, 0 },
+ { X86::VMINPDZrrkz, X86::VMINPDZrmkz, 0 },
+ { X86::VMAXPSZrrkz, X86::VMAXPSZrmkz, 0 },
+ { X86::VMAXPDZrrkz, X86::VMAXPDZrmkz, 0 },
+ // AVX-512{F,VL} arithmetic instructions 256-bit
+ { X86::VADDPSZ256rrkz, X86::VADDPSZ256rmkz, 0 },
+ { X86::VADDPDZ256rrkz, X86::VADDPDZ256rmkz, 0 },
+ { X86::VSUBPSZ256rrkz, X86::VSUBPSZ256rmkz, 0 },
+ { X86::VSUBPDZ256rrkz, X86::VSUBPDZ256rmkz, 0 },
+ { X86::VMULPSZ256rrkz, X86::VMULPSZ256rmkz, 0 },
+ { X86::VMULPDZ256rrkz, X86::VMULPDZ256rmkz, 0 },
+ { X86::VDIVPSZ256rrkz, X86::VDIVPSZ256rmkz, 0 },
+ { X86::VDIVPDZ256rrkz, X86::VDIVPDZ256rmkz, 0 },
+ { X86::VMINPSZ256rrkz, X86::VMINPSZ256rmkz, 0 },
+ { X86::VMINPDZ256rrkz, X86::VMINPDZ256rmkz, 0 },
+ { X86::VMAXPSZ256rrkz, X86::VMAXPSZ256rmkz, 0 },
+ { X86::VMAXPDZ256rrkz, X86::VMAXPDZ256rmkz, 0 },
+ // AVX-512{F,VL} arithmetic instructions 128-bit
+ { X86::VADDPSZ128rrkz, X86::VADDPSZ128rmkz, 0 },
+ { X86::VADDPDZ128rrkz, X86::VADDPDZ128rmkz, 0 },
+ { X86::VSUBPSZ128rrkz, X86::VSUBPSZ128rmkz, 0 },
+ { X86::VSUBPDZ128rrkz, X86::VSUBPDZ128rmkz, 0 },
+ { X86::VMULPSZ128rrkz, X86::VMULPSZ128rmkz, 0 },
+ { X86::VMULPDZ128rrkz, X86::VMULPDZ128rmkz, 0 },
+ { X86::VDIVPSZ128rrkz, X86::VDIVPSZ128rmkz, 0 },
+ { X86::VDIVPDZ128rrkz, X86::VDIVPDZ128rmkz, 0 },
+ { X86::VMINPSZ128rrkz, X86::VMINPSZ128rmkz, 0 },
+ { X86::VMINPDZ128rrkz, X86::VMINPDZ128rmkz, 0 },
+ { X86::VMAXPSZ128rrkz, X86::VMAXPSZ128rmkz, 0 },
+ { X86::VMAXPDZ128rrkz, X86::VMAXPDZ128rmkz, 0 }
+ };
+
+ for (X86MemoryFoldTableEntry Entry : MemoryFoldTable3) {
+ AddTableEntry(RegOp2MemOpTable3, MemOp2RegOpTable,
+ Entry.RegOp, Entry.MemOp,
+ // Index 3, folded load
+ Entry.Flags | TB_INDEX_3 | TB_FOLDED_LOAD);
+ }
+
+ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
+ // AVX-512 foldable instructions
+ { X86::VADDPSZrrk, X86::VADDPSZrmk, 0 },
+ { X86::VADDPDZrrk, X86::VADDPDZrmk, 0 },
+ { X86::VSUBPSZrrk, X86::VSUBPSZrmk, 0 },
+ { X86::VSUBPDZrrk, X86::VSUBPDZrmk, 0 },
+ { X86::VMULPSZrrk, X86::VMULPSZrmk, 0 },
+ { X86::VMULPDZrrk, X86::VMULPDZrmk, 0 },
+ { X86::VDIVPSZrrk, X86::VDIVPSZrmk, 0 },
+ { X86::VDIVPDZrrk, X86::VDIVPDZrmk, 0 },
+ { X86::VMINPSZrrk, X86::VMINPSZrmk, 0 },
+ { X86::VMINPDZrrk, X86::VMINPDZrmk, 0 },
+ { X86::VMAXPSZrrk, X86::VMAXPSZrmk, 0 },
+ { X86::VMAXPDZrrk, X86::VMAXPDZrmk, 0 },
+ // AVX-512{F,VL} foldable instructions 256-bit
+ { X86::VADDPSZ256rrk, X86::VADDPSZ256rmk, 0 },
+ { X86::VADDPDZ256rrk, X86::VADDPDZ256rmk, 0 },
+ { X86::VSUBPSZ256rrk, X86::VSUBPSZ256rmk, 0 },
+ { X86::VSUBPDZ256rrk, X86::VSUBPDZ256rmk, 0 },
+ { X86::VMULPSZ256rrk, X86::VMULPSZ256rmk, 0 },
+ { X86::VMULPDZ256rrk, X86::VMULPDZ256rmk, 0 },
+ { X86::VDIVPSZ256rrk, X86::VDIVPSZ256rmk, 0 },
+ { X86::VDIVPDZ256rrk, X86::VDIVPDZ256rmk, 0 },
+ { X86::VMINPSZ256rrk, X86::VMINPSZ256rmk, 0 },
+ { X86::VMINPDZ256rrk, X86::VMINPDZ256rmk, 0 },
+ { X86::VMAXPSZ256rrk, X86::VMAXPSZ256rmk, 0 },
+ { X86::VMAXPDZ256rrk, X86::VMAXPDZ256rmk, 0 },
+ // AVX-512{F,VL} foldable instructions 128-bit
+ { X86::VADDPSZ128rrk, X86::VADDPSZ128rmk, 0 },
+ { X86::VADDPDZ128rrk, X86::VADDPDZ128rmk, 0 },
+ { X86::VSUBPSZ128rrk, X86::VSUBPSZ128rmk, 0 },
+ { X86::VSUBPDZ128rrk, X86::VSUBPDZ128rmk, 0 },
+ { X86::VMULPSZ128rrk, X86::VMULPSZ128rmk, 0 },
+ { X86::VMULPDZ128rrk, X86::VMULPDZ128rmk, 0 },
+ { X86::VDIVPSZ128rrk, X86::VDIVPSZ128rmk, 0 },
+ { X86::VDIVPDZ128rrk, X86::VDIVPDZ128rmk, 0 },
+ { X86::VMINPSZ128rrk, X86::VMINPSZ128rmk, 0 },
+ { X86::VMINPDZ128rrk, X86::VMINPDZ128rmk, 0 },
+ { X86::VMAXPSZ128rrk, X86::VMAXPSZ128rmk, 0 },
+ { X86::VMAXPDZ128rrk, X86::VMAXPDZ128rmk, 0 }
+ };
+
+ for (X86MemoryFoldTableEntry Entry : MemoryFoldTable4) {
+ AddTableEntry(RegOp2MemOpTable4, MemOp2RegOpTable,
+ Entry.RegOp, Entry.MemOp,
+ // Index 4, folded load
+ Entry.Flags | TB_INDEX_4 | TB_FOLDED_LOAD);
+ }
+}
+
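+/// Record a register-form -> memory-form fold in R2MTable (unless
+/// TB_NO_FORWARD is set) and the inverse unfold mapping in M2RTable
+/// (unless TB_NO_REVERSE is set).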
+void
+X86InstrInfo::AddTableEntry(RegOp2MemOpTableType &R2MTable,
+ MemOp2RegOpTableType &M2RTable,
+ unsigned RegOp, unsigned MemOp, unsigned Flags) {
+ if ((Flags & TB_NO_FORWARD) == 0) {
+ assert(!R2MTable.count(RegOp) && "Duplicate entry!");
+ R2MTable[RegOp] = std::make_pair(MemOp, Flags);
+ }
+ if ((Flags & TB_NO_REVERSE) == 0) {
+ assert(!M2RTable.count(MemOp) &&
+ "Duplicated entries in unfolding maps?");
+ M2RTable[MemOp] = std::make_pair(RegOp, Flags);
+ }
+}
+
+bool
+X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
+ unsigned &SrcReg, unsigned &DstReg,
+ unsigned &SubIdx) const {
+ switch (MI.getOpcode()) {
+ default: break;
+ case X86::MOVSX16rr8:
+ case X86::MOVZX16rr8:
+ case X86::MOVSX32rr8:
+ case X86::MOVZX32rr8:
+ case X86::MOVSX64rr8:
+ if (!Subtarget.is64Bit())
+ // It's not always legal to reference the low 8 bits of the larger
+ // register in 32-bit mode.
+ return false;
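+ // Intentional fall-through: the 8-bit cases are handled by the shared
+ // code below.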
+ case X86::MOVSX32rr16:
+ case X86::MOVZX32rr16:
+ case X86::MOVSX64rr16:
+ case X86::MOVSX64rr32: {
+ if (MI.getOperand(0).getSubReg() || MI.getOperand(1).getSubReg())
+ // Be conservative.
+ return false;
+ SrcReg = MI.getOperand(1).getReg();
+ DstReg = MI.getOperand(0).getReg();
+ switch (MI.getOpcode()) {
+ default: llvm_unreachable("Unreachable!");
+ case X86::MOVSX16rr8:
+ case X86::MOVZX16rr8:
+ case X86::MOVSX32rr8:
+ case X86::MOVZX32rr8:
+ case X86::MOVSX64rr8:
+ SubIdx = X86::sub_8bit;
+ break;
+ case X86::MOVSX32rr16:
+ case X86::MOVZX32rr16:
+ case X86::MOVSX64rr16:
+ SubIdx = X86::sub_16bit;
+ break;
+ case X86::MOVSX64rr32:
+ SubIdx = X86::sub_32bit;
+ break;
+ }
+ return true;
+ }
+ }
+ return false;
+}
+
+int X86InstrInfo::getSPAdjust(const MachineInstr *MI) const {
+ const MachineFunction *MF = MI->getParent()->getParent();
+ const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering();
+
+ if (MI->getOpcode() == getCallFrameSetupOpcode() ||
+ MI->getOpcode() == getCallFrameDestroyOpcode()) {
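+ // Round the call frame size up to a multiple of the stack alignment.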
+ unsigned StackAlign = TFI->getStackAlignment();
+ int SPAdj = (MI->getOperand(0).getImm() + StackAlign - 1) / StackAlign *
+ StackAlign;
+
+ SPAdj -= MI->getOperand(1).getImm();
+
+ if (MI->getOpcode() == getCallFrameSetupOpcode())
+ return SPAdj;
+ else
+ return -SPAdj;
+ }
+
+ // To know whether a call adjusts the stack, we need information
+ // that is bound to the following ADJCALLSTACKUP pseudo.
+ // Look for the next ADJCALLSTACKUP that follows the call.
+ if (MI->isCall()) {
+ const MachineBasicBlock* MBB = MI->getParent();
+ auto I = ++MachineBasicBlock::const_iterator(MI);
+ for (auto E = MBB->end(); I != E; ++I) {
+ if (I->getOpcode() == getCallFrameDestroyOpcode() ||
+ I->isCall())
+ break;
+ }
+
+ // If we could not find a frame destroy opcode, then it has already
+ // been simplified, so we don't care.
+ if (I == MBB->end() || I->getOpcode() != getCallFrameDestroyOpcode())
+ return 0;
+
+ return -(I->getOperand(1).getImm());
+ }
+
+ // Currently we handle only PUSHes we can reasonably expect to see
+ // in call sequences.
+ switch (MI->getOpcode()) {
+ default:
+ return 0;
+ case X86::PUSH32i8:
+ case X86::PUSH32r:
+ case X86::PUSH32rmm:
+ case X86::PUSH32rmr:
+ case X86::PUSHi32:
+ return 4;
+ }
+}
+
+/// Return true and set FrameIndex if the specified operand and the operands
+/// that follow it form a reference to the stack frame.
+bool X86InstrInfo::isFrameOperand(const MachineInstr *MI, unsigned int Op,
+ int &FrameIndex) const {
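+ // Match a plain frame-index address: an FI base, a scale of 1, no index
+ // register, and a zero displacement.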
+ if (MI->getOperand(Op+X86::AddrBaseReg).isFI() &&
+ MI->getOperand(Op+X86::AddrScaleAmt).isImm() &&
+ MI->getOperand(Op+X86::AddrIndexReg).isReg() &&
+ MI->getOperand(Op+X86::AddrDisp).isImm() &&
+ MI->getOperand(Op+X86::AddrScaleAmt).getImm() == 1 &&
+ MI->getOperand(Op+X86::AddrIndexReg).getReg() == 0 &&
+ MI->getOperand(Op+X86::AddrDisp).getImm() == 0) {
+ FrameIndex = MI->getOperand(Op+X86::AddrBaseReg).getIndex();
+ return true;
+ }
+ return false;
+}
+
+static bool isFrameLoadOpcode(int Opcode) {
+ switch (Opcode) {
+ default:
+ return false;
+ case X86::MOV8rm:
+ case X86::MOV16rm:
+ case X86::MOV32rm:
+ case X86::MOV64rm:
+ case X86::LD_Fp64m:
+ case X86::MOVSSrm:
+ case X86::MOVSDrm:
+ case X86::MOVAPSrm:
+ case X86::MOVAPDrm:
+ case X86::MOVDQArm:
+ case X86::VMOVSSrm:
+ case X86::VMOVSDrm:
+ case X86::VMOVAPSrm:
+ case X86::VMOVAPDrm:
+ case X86::VMOVDQArm:
+ case X86::VMOVUPSYrm:
+ case X86::VMOVAPSYrm:
+ case X86::VMOVUPDYrm:
+ case X86::VMOVAPDYrm:
+ case X86::VMOVDQUYrm:
+ case X86::VMOVDQAYrm:
+ case X86::MMX_MOVD64rm:
+ case X86::MMX_MOVQ64rm:
+ case X86::VMOVAPSZrm:
+ case X86::VMOVUPSZrm:
+ return true;
+ }
+}
+
+static bool isFrameStoreOpcode(int Opcode) {
+ switch (Opcode) {
+ default: break;
+ case X86::MOV8mr:
+ case X86::MOV16mr:
+ case X86::MOV32mr:
+ case X86::MOV64mr:
+ case X86::ST_FpP64m:
+ case X86::MOVSSmr:
+ case X86::MOVSDmr:
+ case X86::MOVAPSmr:
+ case X86::MOVAPDmr:
+ case X86::MOVDQAmr:
+ case X86::VMOVSSmr:
+ case X86::VMOVSDmr:
+ case X86::VMOVAPSmr:
+ case X86::VMOVAPDmr:
+ case X86::VMOVDQAmr:
+ case X86::VMOVUPSYmr:
+ case X86::VMOVAPSYmr:
+ case X86::VMOVUPDYmr:
+ case X86::VMOVAPDYmr:
+ case X86::VMOVDQUYmr:
+ case X86::VMOVDQAYmr:
+ case X86::VMOVUPSZmr:
+ case X86::VMOVAPSZmr:
+ case X86::MMX_MOVD64mr:
+ case X86::MMX_MOVQ64mr:
+ case X86::MMX_MOVNTQmr:
+ return true;
+ }
+ return false;
+}
+
+unsigned X86InstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
+ int &FrameIndex) const {
+ if (isFrameLoadOpcode(MI->getOpcode()))
+ if (MI->getOperand(0).getSubReg() == 0 && isFrameOperand(MI, 1, FrameIndex))
+ return MI->getOperand(0).getReg();
+ return 0;
+}
+
+unsigned X86InstrInfo::isLoadFromStackSlotPostFE(const MachineInstr *MI,
+ int &FrameIndex) const {
+ if (isFrameLoadOpcode(MI->getOpcode())) {
+ unsigned Reg;
+ if ((Reg = isLoadFromStackSlot(MI, FrameIndex)))
+ return Reg;
+ // Check for post-frame index elimination operations
+ const MachineMemOperand *Dummy;
+ return hasLoadFromStackSlot(MI, Dummy, FrameIndex);
+ }
+ return 0;
+}
+
+unsigned X86InstrInfo::isStoreToStackSlot(const MachineInstr *MI,
+ int &FrameIndex) const {
+ if (isFrameStoreOpcode(MI->getOpcode()))
+ if (MI->getOperand(X86::AddrNumOperands).getSubReg() == 0 &&
+ isFrameOperand(MI, 0, FrameIndex))
+ return MI->getOperand(X86::AddrNumOperands).getReg();
+ return 0;
+}
+
+unsigned X86InstrInfo::isStoreToStackSlotPostFE(const MachineInstr *MI,
+ int &FrameIndex) const {
+ if (isFrameStoreOpcode(MI->getOpcode())) {
+ unsigned Reg;
+ if ((Reg = isStoreToStackSlot(MI, FrameIndex)))
+ return Reg;
+ // Check for post-frame index elimination operations
+ const MachineMemOperand *Dummy;
+ return hasStoreToStackSlot(MI, Dummy, FrameIndex);
+ }
+ return 0;
+}
+
+/// Return true if the register is a PIC base, i.e. defined by X86::MOVPC32r.
+static bool regIsPICBase(unsigned BaseReg, const MachineRegisterInfo &MRI) {
+ // Don't waste compile time scanning use-def chains of physregs.
+ if (!TargetRegisterInfo::isVirtualRegister(BaseReg))
+ return false;
+ bool isPICBase = false;
+ for (MachineRegisterInfo::def_instr_iterator I = MRI.def_instr_begin(BaseReg),
+ E = MRI.def_instr_end(); I != E; ++I) {
+ MachineInstr *DefMI = &*I;
+ if (DefMI->getOpcode() != X86::MOVPC32r)
+ return false;
+ assert(!isPICBase && "More than one PIC base?");
+ isPICBase = true;
+ }
+ return isPICBase;
+}
+
+bool
+X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI,
+ AliasAnalysis *AA) const {
+ switch (MI->getOpcode()) {
+ default: break;
+ case X86::MOV8rm:
+ case X86::MOV16rm:
+ case X86::MOV32rm:
+ case X86::MOV64rm:
+ case X86::LD_Fp64m:
+ case X86::MOVSSrm:
+ case X86::MOVSDrm:
+ case X86::MOVAPSrm:
+ case X86::MOVUPSrm:
+ case X86::MOVAPDrm:
+ case X86::MOVDQArm:
+ case X86::MOVDQUrm:
+ case X86::VMOVSSrm:
+ case X86::VMOVSDrm:
+ case X86::VMOVAPSrm:
+ case X86::VMOVUPSrm:
+ case X86::VMOVAPDrm:
+ case X86::VMOVDQArm:
+ case X86::VMOVDQUrm:
+ case X86::VMOVAPSYrm:
+ case X86::VMOVUPSYrm:
+ case X86::VMOVAPDYrm:
+ case X86::VMOVDQAYrm:
+ case X86::VMOVDQUYrm:
+ case X86::MMX_MOVD64rm:
+ case X86::MMX_MOVQ64rm:
+ case X86::FsVMOVAPSrm:
+ case X86::FsVMOVAPDrm:
+ case X86::FsMOVAPSrm:
+ case X86::FsMOVAPDrm:
+ // AVX-512
+ case X86::VMOVAPDZ128rm:
+ case X86::VMOVAPDZ256rm:
+ case X86::VMOVAPDZrm:
+ case X86::VMOVAPSZ128rm:
+ case X86::VMOVAPSZ256rm:
+ case X86::VMOVAPSZrm:
+ case X86::VMOVDQA32Z128rm:
+ case X86::VMOVDQA32Z256rm:
+ case X86::VMOVDQA32Zrm:
+ case X86::VMOVDQA64Z128rm:
+ case X86::VMOVDQA64Z256rm:
+ case X86::VMOVDQA64Zrm:
+ case X86::VMOVDQU16Z128rm:
+ case X86::VMOVDQU16Z256rm:
+ case X86::VMOVDQU16Zrm:
+ case X86::VMOVDQU32Z128rm:
+ case X86::VMOVDQU32Z256rm:
+ case X86::VMOVDQU32Zrm:
+ case X86::VMOVDQU64Z128rm:
+ case X86::VMOVDQU64Z256rm:
+ case X86::VMOVDQU64Zrm:
+ case X86::VMOVDQU8Z128rm:
+ case X86::VMOVDQU8Z256rm:
+ case X86::VMOVDQU8Zrm:
+ case X86::VMOVUPSZ128rm:
+ case X86::VMOVUPSZ256rm:
+ case X86::VMOVUPSZrm: {
+ // Loads from constant pools are trivially rematerializable.
+ if (MI->getOperand(1+X86::AddrBaseReg).isReg() &&
+ MI->getOperand(1+X86::AddrScaleAmt).isImm() &&
+ MI->getOperand(1+X86::AddrIndexReg).isReg() &&
+ MI->getOperand(1+X86::AddrIndexReg).getReg() == 0 &&
+ MI->isInvariantLoad(AA)) {
+ unsigned BaseReg = MI->getOperand(1+X86::AddrBaseReg).getReg();
+ if (BaseReg == 0 || BaseReg == X86::RIP)
+ return true;
+ // Allow re-materialization of a PIC stub load only when ReMatPICStubLoad
+ // is enabled.
+ if (!ReMatPICStubLoad && MI->getOperand(1+X86::AddrDisp).isGlobal())
+ return false;
+ const MachineFunction &MF = *MI->getParent()->getParent();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ return regIsPICBase(BaseReg, MRI);
+ }
+ return false;
+ }
+
+ case X86::LEA32r:
+ case X86::LEA64r: {
+ if (MI->getOperand(1+X86::AddrScaleAmt).isImm() &&
+ MI->getOperand(1+X86::AddrIndexReg).isReg() &&
+ MI->getOperand(1+X86::AddrIndexReg).getReg() == 0 &&
+ !MI->getOperand(1+X86::AddrDisp).isReg()) {
+ // lea fi#, lea GV, etc. are all rematerializable.
+ if (!MI->getOperand(1+X86::AddrBaseReg).isReg())
+ return true;
+ unsigned BaseReg = MI->getOperand(1+X86::AddrBaseReg).getReg();
+ if (BaseReg == 0)
+ return true;
+ // Allow re-materialization of lea PICBase + x.
+ const MachineFunction &MF = *MI->getParent()->getParent();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ return regIsPICBase(BaseReg, MRI);
+ }
+ return false;
+ }
+ }
+
+ // All other instructions marked M_REMATERIALIZABLE are always trivially
+ // rematerializable.
+ return true;
+}
+
+bool X86InstrInfo::isSafeToClobberEFLAGS(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const {
+ MachineBasicBlock::iterator E = MBB.end();
+
+ // To bound compile time, if we are not able to determine the safety after
+ // visiting 4 instructions in each direction, we conservatively assume it's
+ // not safe.
+ MachineBasicBlock::iterator Iter = I;
+ for (unsigned i = 0; Iter != E && i < 4; ++i) {
+ bool SeenDef = false;
+ for (unsigned j = 0, e = Iter->getNumOperands(); j != e; ++j) {
+ MachineOperand &MO = Iter->getOperand(j);
+ if (MO.isRegMask() && MO.clobbersPhysReg(X86::EFLAGS))
+ SeenDef = true;
+ if (!MO.isReg())
+ continue;
+ if (MO.getReg() == X86::EFLAGS) {
+ if (MO.isUse())
+ return false;
+ SeenDef = true;
+ }
+ }
+
+ if (SeenDef)
+ // This instruction defines EFLAGS, no need to look any further.
+ return true;
+ ++Iter;
+ // Skip over DBG_VALUE.
+ while (Iter != E && Iter->isDebugValue())
+ ++Iter;
+ }
+
+ // It is safe to clobber EFLAGS at the end of a block if no successor has it
+ // live in.
+ if (Iter == E) {
+ for (MachineBasicBlock *S : MBB.successors())
+ if (S->isLiveIn(X86::EFLAGS))
+ return false;
+ return true;
+ }
+
+ MachineBasicBlock::iterator B = MBB.begin();
+ Iter = I;
+ for (unsigned i = 0; i < 4; ++i) {
+ // If we make it to the beginning of the block, it's safe to clobber
+ // EFLAGS iff EFLAGS is not live-in.
+ if (Iter == B)
+ return !MBB.isLiveIn(X86::EFLAGS);
+
+ --Iter;
+ // Skip over DBG_VALUE.
+ while (Iter != B && Iter->isDebugValue())
+ --Iter;
+
+ bool SawKill = false;
+ for (unsigned j = 0, e = Iter->getNumOperands(); j != e; ++j) {
+ MachineOperand &MO = Iter->getOperand(j);
+ // A register mask may clobber EFLAGS, but we should still look for a
+ // live EFLAGS def.
+ if (MO.isRegMask() && MO.clobbersPhysReg(X86::EFLAGS))
+ SawKill = true;
+ if (MO.isReg() && MO.getReg() == X86::EFLAGS) {
+ if (MO.isDef()) return MO.isDead();
+ if (MO.isKill()) SawKill = true;
+ }
+ }
+
+ if (SawKill)
+ // This instruction kills EFLAGS and doesn't redefine it, so
+ // there's no need to look further.
+ return true;
+ }
+
+ // Conservative answer.
+ return false;
+}
+
+void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ unsigned DestReg, unsigned SubIdx,
+ const MachineInstr *Orig,
+ const TargetRegisterInfo &TRI) const {
+ bool ClobbersEFLAGS = false;
+ for (const MachineOperand &MO : Orig->operands()) {
+ if (MO.isReg() && MO.isDef() && MO.getReg() == X86::EFLAGS) {
+ ClobbersEFLAGS = true;
+ break;
+ }
+ }
+
+ if (ClobbersEFLAGS && !isSafeToClobberEFLAGS(MBB, I)) {
+ // The instruction clobbers EFLAGS. Re-materialize as MOV32ri to avoid side
+ // effects.
+ int Value;
+ switch (Orig->getOpcode()) {
+ case X86::MOV32r0: Value = 0; break;
+ case X86::MOV32r1: Value = 1; break;
+ case X86::MOV32r_1: Value = -1; break;
+ default:
+ llvm_unreachable("Unexpected instruction!");
+ }
+
+ DebugLoc DL = Orig->getDebugLoc();
+ BuildMI(MBB, I, DL, get(X86::MOV32ri)).addOperand(Orig->getOperand(0))
+ .addImm(Value);
+ } else {
+ MachineInstr *MI = MBB.getParent()->CloneMachineInstr(Orig);
+ MBB.insert(I, MI);
+ }
+
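+ // Point the def of the just-inserted instruction at DestReg:SubIdx.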
+ MachineInstr *NewMI = std::prev(I);
+ NewMI->substituteRegister(Orig->getOperand(0).getReg(), DestReg, SubIdx, TRI);
+}
+
+/// True if MI has a condition code def, e.g. EFLAGS, that is not marked dead.
+bool X86InstrInfo::hasLiveCondCodeDef(MachineInstr *MI) const {
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI->getOperand(i);
+ if (MO.isReg() && MO.isDef() &&
+ MO.getReg() == X86::EFLAGS && !MO.isDead()) {
+ return true;
+ }
+ }
+ return false;
+}
+
+/// Return the shift count for a machine operand, truncated the same way the
+/// hardware truncates it.
+inline static unsigned getTruncatedShiftCount(MachineInstr *MI,
+ unsigned ShiftAmtOperandIdx) {
+ // The shift count is six bits with the REX.W prefix and five bits without.
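+ // e.g. a 64-bit shift by 65 actually shifts by 65 & 63 == 1.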
+ unsigned ShiftCountMask = (MI->getDesc().TSFlags & X86II::REX_W) ? 63 : 31;
+ unsigned Imm = MI->getOperand(ShiftAmtOperandIdx).getImm();
+ return Imm & ShiftCountMask;
+}
+
+/// Check whether the given shift count can be represented by a LEA
+/// instruction's scale factor.
+inline static bool isTruncatedShiftCountForLEA(unsigned ShAmt) {
+ // Left shift instructions can be transformed into load-effective-address
+ // instructions if we can encode them appropriately.
+ // A LEA instruction utilizes a SIB byte to encode its scale factor.
+ // The SIB.scale field is two bits wide which means that we can encode any
+ // shift amount less than 4.
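+ // e.g. "shl $3" can become "lea (,%reg,8)", but "shl $4" cannot.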
+ return ShAmt < 4 && ShAmt > 0;
+}
+
+bool X86InstrInfo::classifyLEAReg(MachineInstr *MI, const MachineOperand &Src,
+ unsigned Opc, bool AllowSP,
+ unsigned &NewSrc, bool &isKill, bool &isUndef,
+ MachineOperand &ImplicitOp) const {
+ MachineFunction &MF = *MI->getParent()->getParent();
+ const TargetRegisterClass *RC;
+ if (AllowSP) {
+ RC = Opc != X86::LEA32r ? &X86::GR64RegClass : &X86::GR32RegClass;
+ } else {
+ RC = Opc != X86::LEA32r ?
+ &X86::GR64_NOSPRegClass : &X86::GR32_NOSPRegClass;
+ }
+ unsigned SrcReg = Src.getReg();
+
+ // For both LEA64 and LEA32 the register already has essentially the right
+ // type (32-bit or 64-bit); we may just need to forbid SP.
+ if (Opc != X86::LEA64_32r) {
+ NewSrc = SrcReg;
+ isKill = Src.isKill();
+ isUndef = Src.isUndef();
+
+ if (TargetRegisterInfo::isVirtualRegister(NewSrc) &&
+ !MF.getRegInfo().constrainRegClass(NewSrc, RC))
+ return false;
+
+ return true;
+ }
+
+ // This is for an LEA64_32r: the incoming registers are 32-bit, but one way
+ // or another we need to add 64-bit registers to the final MI.
+ if (TargetRegisterInfo::isPhysicalRegister(SrcReg)) {
+ ImplicitOp = Src;
+ ImplicitOp.setImplicit();
+
+ NewSrc = getX86SubSuperRegister(Src.getReg(), 64);
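+ // Query the liveness of the 64-bit super-register so we can attach
+ // accurate kill/undef flags below.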
+ MachineBasicBlock::LivenessQueryResult LQR =
+ MI->getParent()->computeRegisterLiveness(&getRegisterInfo(), NewSrc, MI);
+
+ switch (LQR) {
+ case MachineBasicBlock::LQR_Unknown:
+ // We can't give sane liveness flags to the instruction; abandon LEA
+ // formation.
+ return false;
+ case MachineBasicBlock::LQR_Live:
+ isKill = MI->killsRegister(SrcReg);
+ isUndef = false;
+ break;
+ default:
+ // The physreg itself is dead, so we have to use it as an <undef>.
+ isKill = false;
+ isUndef = true;
+ break;
+ }
+ } else {
+ // A virtual register of the wrong class: we have to create a temporary
+ // 64-bit vreg to feed into the LEA.
+ NewSrc = MF.getRegInfo().createVirtualRegister(RC);
+ BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+ get(TargetOpcode::COPY))
+ .addReg(NewSrc, RegState::Define | RegState::Undef, X86::sub_32bit)
+ .addOperand(Src);
+
+ // The copy is obviously going to be dead after we're done with it.
+ isKill = true;
+ isUndef = false;
+ }
+
+ // We've set all the parameters without issue.
+ return true;
+}
+
+/// Helper for convertToThreeAddress when 16-bit LEA is disabled: use a 32-bit
+/// LEA to form 3-address code by promoting to a 32-bit superregister and then
+/// truncating back down to a 16-bit subregister.
+MachineInstr *
+X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc,
+ MachineFunction::iterator &MFI,
+ MachineBasicBlock::iterator &MBBI,
+ LiveVariables *LV) const {
+ MachineInstr *MI = MBBI;
+ unsigned Dest = MI->getOperand(0).getReg();
+ unsigned Src = MI->getOperand(1).getReg();
+ bool isDead = MI->getOperand(0).isDead();
+ bool isKill = MI->getOperand(1).isKill();
+
+ MachineRegisterInfo &RegInfo = MFI->getParent()->getRegInfo();
+ unsigned leaOutReg = RegInfo.createVirtualRegister(&X86::GR32RegClass);
+ unsigned Opc, leaInReg;
+ if (Subtarget.is64Bit()) {
+ Opc = X86::LEA64_32r;
+ leaInReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
+ } else {
+ Opc = X86::LEA32r;
+ leaInReg = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
+ }
+
+  // Build and insert into an implicit UNDEF value. This is OK because
+  // we'll be shifting and then extracting the lower 16 bits.
+  // This has the potential to cause a partial register stall, e.g.:
+ // movw (%rbp,%rcx,2), %dx
+ // leal -65(%rdx), %esi
+ // But testing has shown this *does* help performance in 64-bit mode (at
+ // least on modern x86 machines).
+ BuildMI(*MFI, MBBI, MI->getDebugLoc(), get(X86::IMPLICIT_DEF), leaInReg);
+ MachineInstr *InsMI =
+ BuildMI(*MFI, MBBI, MI->getDebugLoc(), get(TargetOpcode::COPY))
+ .addReg(leaInReg, RegState::Define, X86::sub_16bit)
+ .addReg(Src, getKillRegState(isKill));
+
+ MachineInstrBuilder MIB = BuildMI(*MFI, MBBI, MI->getDebugLoc(),
+ get(Opc), leaOutReg);
+ switch (MIOpc) {
+ default: llvm_unreachable("Unreachable!");
+ case X86::SHL16ri: {
+ unsigned ShAmt = MI->getOperand(2).getImm();
+ MIB.addReg(0).addImm(1 << ShAmt)
+ .addReg(leaInReg, RegState::Kill).addImm(0).addReg(0);
+ break;
+ }
+ case X86::INC16r:
+ addRegOffset(MIB, leaInReg, true, 1);
+ break;
+ case X86::DEC16r:
+ addRegOffset(MIB, leaInReg, true, -1);
+ break;
+ case X86::ADD16ri:
+ case X86::ADD16ri8:
+ case X86::ADD16ri_DB:
+ case X86::ADD16ri8_DB:
+ addRegOffset(MIB, leaInReg, true, MI->getOperand(2).getImm());
+ break;
+ case X86::ADD16rr:
+ case X86::ADD16rr_DB: {
+ unsigned Src2 = MI->getOperand(2).getReg();
+ bool isKill2 = MI->getOperand(2).isKill();
+ unsigned leaInReg2 = 0;
+ MachineInstr *InsMI2 = nullptr;
+ if (Src == Src2) {
+      // ADD16rr %reg1028<kill>, %reg1028: the two sources are identical, so
+      // only a single insert_subreg is needed.
+ addRegReg(MIB, leaInReg, true, leaInReg, false);
+ } else {
+ if (Subtarget.is64Bit())
+ leaInReg2 = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
+ else
+ leaInReg2 = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
+      // Build and insert into an implicit UNDEF value. This is OK because
+      // we'll be shifting and then extracting the lower 16 bits.
+      BuildMI(*MFI, &*MIB, MI->getDebugLoc(), get(X86::IMPLICIT_DEF),
+              leaInReg2);
+ InsMI2 =
+ BuildMI(*MFI, &*MIB, MI->getDebugLoc(), get(TargetOpcode::COPY))
+ .addReg(leaInReg2, RegState::Define, X86::sub_16bit)
+ .addReg(Src2, getKillRegState(isKill2));
+ addRegReg(MIB, leaInReg, true, leaInReg2, true);
+ }
+ if (LV && isKill2 && InsMI2)
+ LV->replaceKillInstruction(Src2, MI, InsMI2);
+ break;
+ }
+ }
+
+ MachineInstr *NewMI = MIB;
+ MachineInstr *ExtMI =
+ BuildMI(*MFI, MBBI, MI->getDebugLoc(), get(TargetOpcode::COPY))
+ .addReg(Dest, RegState::Define | getDeadRegState(isDead))
+ .addReg(leaOutReg, RegState::Kill, X86::sub_16bit);
+
+ if (LV) {
+ // Update live variables
+ LV->getVarInfo(leaInReg).Kills.push_back(NewMI);
+ LV->getVarInfo(leaOutReg).Kills.push_back(ExtMI);
+ if (isKill)
+ LV->replaceKillInstruction(Src, MI, InsMI);
+ if (isDead)
+ LV->replaceKillInstruction(Dest, MI, ExtMI);
+ }
+
+ return ExtMI;
+}
+
+/// This method must be implemented by targets that
+/// set the M_CONVERTIBLE_TO_3_ADDR flag. When this flag is set, the target
+/// may be able to convert a two-address instruction into a true
+/// three-address instruction on demand. This allows the X86 target (for
+/// example) to convert ADD and SHL instructions into LEA instructions if they
+/// would require register copies due to two-addressness.
+///
+/// This method returns a null pointer if the transformation cannot be
+/// performed; otherwise it returns the new instruction.
+///
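+/// For example, "%a = ADD32rr %a, %b" ties %a as both source and destination;
+/// rewriting it as "%c = LEA32r [%a + %b]" puts the sum in a third register
+/// and leaves %a intact.
+///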
+MachineInstr *
+X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
+ MachineBasicBlock::iterator &MBBI,
+ LiveVariables *LV) const {
+ MachineInstr *MI = MBBI;
+
+  // The following opcodes also set the condition code register(s). Only
+  // convert them to an equivalent LEA if the condition code register defs
+  // are dead!
+ if (hasLiveCondCodeDef(MI))
+ return nullptr;
+
+ MachineFunction &MF = *MI->getParent()->getParent();
+  // All instructions handled here are two-address instructions; get the
+  // known operands.
+ const MachineOperand &Dest = MI->getOperand(0);
+ const MachineOperand &Src = MI->getOperand(1);
+
+ MachineInstr *NewMI = nullptr;
+  // FIXME: 16-bit LEAs are really slow on Athlons, but not bad on P4s. When
+  // we have better subtarget support, enable the 16-bit LEA generation here.
+  // 16-bit LEA is also slow on Core2.
+ bool DisableLEA16 = true;
+ bool is64Bit = Subtarget.is64Bit();
+
+ unsigned MIOpc = MI->getOpcode();
+ switch (MIOpc) {
+ default: return nullptr;
+ case X86::SHL64ri: {
+ assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!");
+ unsigned ShAmt = getTruncatedShiftCount(MI, 2);
+ if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr;
+
+ // LEA can't handle RSP.
+ if (TargetRegisterInfo::isVirtualRegister(Src.getReg()) &&
+ !MF.getRegInfo().constrainRegClass(Src.getReg(),
+ &X86::GR64_NOSPRegClass))
+ return nullptr;
+
+ NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::LEA64r))
+ .addOperand(Dest)
+ .addReg(0).addImm(1 << ShAmt).addOperand(Src).addImm(0).addReg(0);
+ break;
+ }
+ case X86::SHL32ri: {
+ assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!");
+ unsigned ShAmt = getTruncatedShiftCount(MI, 2);
+ if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr;
+
+ unsigned Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r;
+
+ // LEA can't handle ESP.
+ bool isKill, isUndef;
+ unsigned SrcReg;
+ MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
+ if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false,
+ SrcReg, isKill, isUndef, ImplicitOp))
+ return nullptr;
+
+ MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc))
+ .addOperand(Dest)
+ .addReg(0).addImm(1 << ShAmt)
+ .addReg(SrcReg, getKillRegState(isKill) | getUndefRegState(isUndef))
+ .addImm(0).addReg(0);
+ if (ImplicitOp.getReg() != 0)
+ MIB.addOperand(ImplicitOp);
+ NewMI = MIB;
+
+ break;
+ }
+ case X86::SHL16ri: {
+ assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!");
+ unsigned ShAmt = getTruncatedShiftCount(MI, 2);
+ if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr;
+
+ if (DisableLEA16)
+      return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV)
+                     : nullptr;
+ NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
+ .addOperand(Dest)
+ .addReg(0).addImm(1 << ShAmt).addOperand(Src).addImm(0).addReg(0);
+ break;
+ }
+ case X86::INC64r:
+ case X86::INC32r: {
+ assert(MI->getNumOperands() >= 2 && "Unknown inc instruction!");
+ unsigned Opc = MIOpc == X86::INC64r ? X86::LEA64r
+ : (is64Bit ? X86::LEA64_32r : X86::LEA32r);
+ bool isKill, isUndef;
+ unsigned SrcReg;
+ MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
+ if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false,
+ SrcReg, isKill, isUndef, ImplicitOp))
+ return nullptr;
+
+ MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc))
+ .addOperand(Dest)
+ .addReg(SrcReg, getKillRegState(isKill) | getUndefRegState(isUndef));
+ if (ImplicitOp.getReg() != 0)
+ MIB.addOperand(ImplicitOp);
+
+ NewMI = addOffset(MIB, 1);
+ break;
+ }
+ case X86::INC16r:
+ if (DisableLEA16)
+ return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV)
+ : nullptr;
+ assert(MI->getNumOperands() >= 2 && "Unknown inc instruction!");
+ NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
+ .addOperand(Dest).addOperand(Src), 1);
+ break;
+ case X86::DEC64r:
+ case X86::DEC32r: {
+ assert(MI->getNumOperands() >= 2 && "Unknown dec instruction!");
+ unsigned Opc = MIOpc == X86::DEC64r ? X86::LEA64r
+ : (is64Bit ? X86::LEA64_32r : X86::LEA32r);
+
+ bool isKill, isUndef;
+ unsigned SrcReg;
+ MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
+ if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false,
+ SrcReg, isKill, isUndef, ImplicitOp))
+ return nullptr;
+
+ MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc))
+ .addOperand(Dest)
+ .addReg(SrcReg, getUndefRegState(isUndef) | getKillRegState(isKill));
+ if (ImplicitOp.getReg() != 0)
+ MIB.addOperand(ImplicitOp);
+
+ NewMI = addOffset(MIB, -1);
+
+ break;
+ }
+ case X86::DEC16r:
+ if (DisableLEA16)
+ return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV)
+ : nullptr;
+ assert(MI->getNumOperands() >= 2 && "Unknown dec instruction!");
+ NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
+ .addOperand(Dest).addOperand(Src), -1);
+ break;
+ case X86::ADD64rr:
+ case X86::ADD64rr_DB:
+ case X86::ADD32rr:
+ case X86::ADD32rr_DB: {
+ assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
+ unsigned Opc;
+ if (MIOpc == X86::ADD64rr || MIOpc == X86::ADD64rr_DB)
+ Opc = X86::LEA64r;
+ else
+ Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r;
+
+ bool isKill, isUndef;
+ unsigned SrcReg;
+ MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
+ if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true,
+ SrcReg, isKill, isUndef, ImplicitOp))
+ return nullptr;
+
+ const MachineOperand &Src2 = MI->getOperand(2);
+ bool isKill2, isUndef2;
+ unsigned SrcReg2;
+ MachineOperand ImplicitOp2 = MachineOperand::CreateReg(0, false);
+ if (!classifyLEAReg(MI, Src2, Opc, /*AllowSP=*/ false,
+ SrcReg2, isKill2, isUndef2, ImplicitOp2))
+ return nullptr;
+
+ MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc))
+ .addOperand(Dest);
+ if (ImplicitOp.getReg() != 0)
+ MIB.addOperand(ImplicitOp);
+ if (ImplicitOp2.getReg() != 0)
+ MIB.addOperand(ImplicitOp2);
+
+ NewMI = addRegReg(MIB, SrcReg, isKill, SrcReg2, isKill2);
+
+ // Preserve undefness of the operands.
+ NewMI->getOperand(1).setIsUndef(isUndef);
+ NewMI->getOperand(3).setIsUndef(isUndef2);
+
+ if (LV && Src2.isKill())
+ LV->replaceKillInstruction(SrcReg2, MI, NewMI);
+ break;
+ }
+ case X86::ADD16rr:
+ case X86::ADD16rr_DB: {
+ if (DisableLEA16)
+ return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV)
+ : nullptr;
+ assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
+ unsigned Src2 = MI->getOperand(2).getReg();
+ bool isKill2 = MI->getOperand(2).isKill();
+ NewMI = addRegReg(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
+ .addOperand(Dest),
+ Src.getReg(), Src.isKill(), Src2, isKill2);
+
+ // Preserve undefness of the operands.
+ bool isUndef = MI->getOperand(1).isUndef();
+ bool isUndef2 = MI->getOperand(2).isUndef();
+ NewMI->getOperand(1).setIsUndef(isUndef);
+ NewMI->getOperand(3).setIsUndef(isUndef2);
+
+ if (LV && isKill2)
+ LV->replaceKillInstruction(Src2, MI, NewMI);
+ break;
+ }
+ case X86::ADD64ri32:
+ case X86::ADD64ri8:
+ case X86::ADD64ri32_DB:
+ case X86::ADD64ri8_DB:
+ assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
+ NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA64r))
+ .addOperand(Dest).addOperand(Src),
+ MI->getOperand(2).getImm());
+ break;
+ case X86::ADD32ri:
+ case X86::ADD32ri8:
+ case X86::ADD32ri_DB:
+ case X86::ADD32ri8_DB: {
+ assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
+ unsigned Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r;
+
+ bool isKill, isUndef;
+ unsigned SrcReg;
+ MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
+ if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true,
+ SrcReg, isKill, isUndef, ImplicitOp))
+ return nullptr;
+
+ MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc))
+ .addOperand(Dest)
+ .addReg(SrcReg, getUndefRegState(isUndef) | getKillRegState(isKill));
+ if (ImplicitOp.getReg() != 0)
+ MIB.addOperand(ImplicitOp);
+
+ NewMI = addOffset(MIB, MI->getOperand(2).getImm());
+ break;
+ }
+ case X86::ADD16ri:
+ case X86::ADD16ri8:
+ case X86::ADD16ri_DB:
+ case X86::ADD16ri8_DB:
+ if (DisableLEA16)
+ return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV)
+ : nullptr;
+ assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
+ NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
+ .addOperand(Dest).addOperand(Src),
+ MI->getOperand(2).getImm());
+ break;
+ }
+
+ if (!NewMI) return nullptr;
+
+ if (LV) { // Update live variables
+ if (Src.isKill())
+ LV->replaceKillInstruction(Src.getReg(), MI, NewMI);
+ if (Dest.isDead())
+ LV->replaceKillInstruction(Dest.getReg(), MI, NewMI);
+ }
+
+ MFI->insert(MBBI, NewMI); // Insert the new inst
+ return NewMI;
+}
+
+/// Returns true if the given instruction opcode is FMA3; otherwise returns
+/// false.
+/// The second parameter is optional and is used as a second return value from
+/// the function. It is set to true if the given instruction has an FMA3 opcode
+/// that is used for lowering of scalar FMA intrinsics, and to false otherwise.
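+/// For example, isFMA3(X86::VFMADDSSr213r) returns true and reports
+/// IsIntrinsic == false, while isFMA3(X86::VFMADDSSr213r_Int) returns true
+/// and reports IsIntrinsic == true.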
+static bool isFMA3(unsigned Opcode, bool *IsIntrinsic = nullptr) {
+ if (IsIntrinsic)
+ *IsIntrinsic = false;
+
+ switch (Opcode) {
+ case X86::VFMADDSDr132r: case X86::VFMADDSDr132m:
+ case X86::VFMADDSSr132r: case X86::VFMADDSSr132m:
+ case X86::VFMSUBSDr132r: case X86::VFMSUBSDr132m:
+ case X86::VFMSUBSSr132r: case X86::VFMSUBSSr132m:
+ case X86::VFNMADDSDr132r: case X86::VFNMADDSDr132m:
+ case X86::VFNMADDSSr132r: case X86::VFNMADDSSr132m:
+ case X86::VFNMSUBSDr132r: case X86::VFNMSUBSDr132m:
+ case X86::VFNMSUBSSr132r: case X86::VFNMSUBSSr132m:
+
+ case X86::VFMADDSDr213r: case X86::VFMADDSDr213m:
+ case X86::VFMADDSSr213r: case X86::VFMADDSSr213m:
+ case X86::VFMSUBSDr213r: case X86::VFMSUBSDr213m:
+ case X86::VFMSUBSSr213r: case X86::VFMSUBSSr213m:
+ case X86::VFNMADDSDr213r: case X86::VFNMADDSDr213m:
+ case X86::VFNMADDSSr213r: case X86::VFNMADDSSr213m:
+ case X86::VFNMSUBSDr213r: case X86::VFNMSUBSDr213m:
+ case X86::VFNMSUBSSr213r: case X86::VFNMSUBSSr213m:
+
+ case X86::VFMADDSDr231r: case X86::VFMADDSDr231m:
+ case X86::VFMADDSSr231r: case X86::VFMADDSSr231m:
+ case X86::VFMSUBSDr231r: case X86::VFMSUBSDr231m:
+ case X86::VFMSUBSSr231r: case X86::VFMSUBSSr231m:
+ case X86::VFNMADDSDr231r: case X86::VFNMADDSDr231m:
+ case X86::VFNMADDSSr231r: case X86::VFNMADDSSr231m:
+ case X86::VFNMSUBSDr231r: case X86::VFNMSUBSDr231m:
+ case X86::VFNMSUBSSr231r: case X86::VFNMSUBSSr231m:
+
+ case X86::VFMADDSUBPDr132r: case X86::VFMADDSUBPDr132m:
+ case X86::VFMADDSUBPSr132r: case X86::VFMADDSUBPSr132m:
+ case X86::VFMSUBADDPDr132r: case X86::VFMSUBADDPDr132m:
+ case X86::VFMSUBADDPSr132r: case X86::VFMSUBADDPSr132m:
+ case X86::VFMADDSUBPDr132rY: case X86::VFMADDSUBPDr132mY:
+ case X86::VFMADDSUBPSr132rY: case X86::VFMADDSUBPSr132mY:
+ case X86::VFMSUBADDPDr132rY: case X86::VFMSUBADDPDr132mY:
+ case X86::VFMSUBADDPSr132rY: case X86::VFMSUBADDPSr132mY:
+
+ case X86::VFMADDPDr132r: case X86::VFMADDPDr132m:
+ case X86::VFMADDPSr132r: case X86::VFMADDPSr132m:
+ case X86::VFMSUBPDr132r: case X86::VFMSUBPDr132m:
+ case X86::VFMSUBPSr132r: case X86::VFMSUBPSr132m:
+ case X86::VFNMADDPDr132r: case X86::VFNMADDPDr132m:
+ case X86::VFNMADDPSr132r: case X86::VFNMADDPSr132m:
+ case X86::VFNMSUBPDr132r: case X86::VFNMSUBPDr132m:
+ case X86::VFNMSUBPSr132r: case X86::VFNMSUBPSr132m:
+ case X86::VFMADDPDr132rY: case X86::VFMADDPDr132mY:
+ case X86::VFMADDPSr132rY: case X86::VFMADDPSr132mY:
+ case X86::VFMSUBPDr132rY: case X86::VFMSUBPDr132mY:
+ case X86::VFMSUBPSr132rY: case X86::VFMSUBPSr132mY:
+ case X86::VFNMADDPDr132rY: case X86::VFNMADDPDr132mY:
+ case X86::VFNMADDPSr132rY: case X86::VFNMADDPSr132mY:
+ case X86::VFNMSUBPDr132rY: case X86::VFNMSUBPDr132mY:
+ case X86::VFNMSUBPSr132rY: case X86::VFNMSUBPSr132mY:
+
+ case X86::VFMADDSUBPDr213r: case X86::VFMADDSUBPDr213m:
+ case X86::VFMADDSUBPSr213r: case X86::VFMADDSUBPSr213m:
+ case X86::VFMSUBADDPDr213r: case X86::VFMSUBADDPDr213m:
+ case X86::VFMSUBADDPSr213r: case X86::VFMSUBADDPSr213m:
+ case X86::VFMADDSUBPDr213rY: case X86::VFMADDSUBPDr213mY:
+ case X86::VFMADDSUBPSr213rY: case X86::VFMADDSUBPSr213mY:
+ case X86::VFMSUBADDPDr213rY: case X86::VFMSUBADDPDr213mY:
+ case X86::VFMSUBADDPSr213rY: case X86::VFMSUBADDPSr213mY:
+
+ case X86::VFMADDPDr213r: case X86::VFMADDPDr213m:
+ case X86::VFMADDPSr213r: case X86::VFMADDPSr213m:
+ case X86::VFMSUBPDr213r: case X86::VFMSUBPDr213m:
+ case X86::VFMSUBPSr213r: case X86::VFMSUBPSr213m:
+ case X86::VFNMADDPDr213r: case X86::VFNMADDPDr213m:
+ case X86::VFNMADDPSr213r: case X86::VFNMADDPSr213m:
+ case X86::VFNMSUBPDr213r: case X86::VFNMSUBPDr213m:
+ case X86::VFNMSUBPSr213r: case X86::VFNMSUBPSr213m:
+ case X86::VFMADDPDr213rY: case X86::VFMADDPDr213mY:
+ case X86::VFMADDPSr213rY: case X86::VFMADDPSr213mY:
+ case X86::VFMSUBPDr213rY: case X86::VFMSUBPDr213mY:
+ case X86::VFMSUBPSr213rY: case X86::VFMSUBPSr213mY:
+ case X86::VFNMADDPDr213rY: case X86::VFNMADDPDr213mY:
+ case X86::VFNMADDPSr213rY: case X86::VFNMADDPSr213mY:
+ case X86::VFNMSUBPDr213rY: case X86::VFNMSUBPDr213mY:
+ case X86::VFNMSUBPSr213rY: case X86::VFNMSUBPSr213mY:
+
+ case X86::VFMADDSUBPDr231r: case X86::VFMADDSUBPDr231m:
+ case X86::VFMADDSUBPSr231r: case X86::VFMADDSUBPSr231m:
+ case X86::VFMSUBADDPDr231r: case X86::VFMSUBADDPDr231m:
+ case X86::VFMSUBADDPSr231r: case X86::VFMSUBADDPSr231m:
+ case X86::VFMADDSUBPDr231rY: case X86::VFMADDSUBPDr231mY:
+ case X86::VFMADDSUBPSr231rY: case X86::VFMADDSUBPSr231mY:
+ case X86::VFMSUBADDPDr231rY: case X86::VFMSUBADDPDr231mY:
+ case X86::VFMSUBADDPSr231rY: case X86::VFMSUBADDPSr231mY:
+
+ case X86::VFMADDPDr231r: case X86::VFMADDPDr231m:
+ case X86::VFMADDPSr231r: case X86::VFMADDPSr231m:
+ case X86::VFMSUBPDr231r: case X86::VFMSUBPDr231m:
+ case X86::VFMSUBPSr231r: case X86::VFMSUBPSr231m:
+ case X86::VFNMADDPDr231r: case X86::VFNMADDPDr231m:
+ case X86::VFNMADDPSr231r: case X86::VFNMADDPSr231m:
+ case X86::VFNMSUBPDr231r: case X86::VFNMSUBPDr231m:
+ case X86::VFNMSUBPSr231r: case X86::VFNMSUBPSr231m:
+ case X86::VFMADDPDr231rY: case X86::VFMADDPDr231mY:
+ case X86::VFMADDPSr231rY: case X86::VFMADDPSr231mY:
+ case X86::VFMSUBPDr231rY: case X86::VFMSUBPDr231mY:
+ case X86::VFMSUBPSr231rY: case X86::VFMSUBPSr231mY:
+ case X86::VFNMADDPDr231rY: case X86::VFNMADDPDr231mY:
+ case X86::VFNMADDPSr231rY: case X86::VFNMADDPSr231mY:
+ case X86::VFNMSUBPDr231rY: case X86::VFNMSUBPDr231mY:
+ case X86::VFNMSUBPSr231rY: case X86::VFNMSUBPSr231mY:
+ return true;
+
+ case X86::VFMADDSDr132r_Int: case X86::VFMADDSDr132m_Int:
+ case X86::VFMADDSSr132r_Int: case X86::VFMADDSSr132m_Int:
+ case X86::VFMSUBSDr132r_Int: case X86::VFMSUBSDr132m_Int:
+ case X86::VFMSUBSSr132r_Int: case X86::VFMSUBSSr132m_Int:
+ case X86::VFNMADDSDr132r_Int: case X86::VFNMADDSDr132m_Int:
+ case X86::VFNMADDSSr132r_Int: case X86::VFNMADDSSr132m_Int:
+ case X86::VFNMSUBSDr132r_Int: case X86::VFNMSUBSDr132m_Int:
+ case X86::VFNMSUBSSr132r_Int: case X86::VFNMSUBSSr132m_Int:
+
+ case X86::VFMADDSDr213r_Int: case X86::VFMADDSDr213m_Int:
+ case X86::VFMADDSSr213r_Int: case X86::VFMADDSSr213m_Int:
+ case X86::VFMSUBSDr213r_Int: case X86::VFMSUBSDr213m_Int:
+ case X86::VFMSUBSSr213r_Int: case X86::VFMSUBSSr213m_Int:
+ case X86::VFNMADDSDr213r_Int: case X86::VFNMADDSDr213m_Int:
+ case X86::VFNMADDSSr213r_Int: case X86::VFNMADDSSr213m_Int:
+ case X86::VFNMSUBSDr213r_Int: case X86::VFNMSUBSDr213m_Int:
+ case X86::VFNMSUBSSr213r_Int: case X86::VFNMSUBSSr213m_Int:
+
+ case X86::VFMADDSDr231r_Int: case X86::VFMADDSDr231m_Int:
+ case X86::VFMADDSSr231r_Int: case X86::VFMADDSSr231m_Int:
+ case X86::VFMSUBSDr231r_Int: case X86::VFMSUBSDr231m_Int:
+ case X86::VFMSUBSSr231r_Int: case X86::VFMSUBSSr231m_Int:
+ case X86::VFNMADDSDr231r_Int: case X86::VFNMADDSDr231m_Int:
+ case X86::VFNMADDSSr231r_Int: case X86::VFNMADDSSr231m_Int:
+ case X86::VFNMSUBSDr231r_Int: case X86::VFNMSUBSDr231m_Int:
+ case X86::VFNMSUBSSr231r_Int: case X86::VFNMSUBSSr231m_Int:
+ if (IsIntrinsic)
+ *IsIntrinsic = true;
+ return true;
+ default:
+ return false;
+ }
+ llvm_unreachable("Opcode not handled by the switch");
+}
+
+MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr *MI,
+ bool NewMI,
+ unsigned OpIdx1,
+ unsigned OpIdx2) const {
+ switch (MI->getOpcode()) {
+ case X86::SHRD16rri8: // A = SHRD16rri8 B, C, I -> A = SHLD16rri8 C, B, (16-I)
+ case X86::SHLD16rri8: // A = SHLD16rri8 B, C, I -> A = SHRD16rri8 C, B, (16-I)
+ case X86::SHRD32rri8: // A = SHRD32rri8 B, C, I -> A = SHLD32rri8 C, B, (32-I)
+ case X86::SHLD32rri8: // A = SHLD32rri8 B, C, I -> A = SHRD32rri8 C, B, (32-I)
+ case X86::SHRD64rri8: // A = SHRD64rri8 B, C, I -> A = SHLD64rri8 C, B, (64-I)
+  case X86::SHLD64rri8: { // A = SHLD64rri8 B, C, I -> A = SHRD64rri8 C, B, (64-I)
+ unsigned Opc;
+ unsigned Size;
+ switch (MI->getOpcode()) {
+ default: llvm_unreachable("Unreachable!");
+ case X86::SHRD16rri8: Size = 16; Opc = X86::SHLD16rri8; break;
+ case X86::SHLD16rri8: Size = 16; Opc = X86::SHRD16rri8; break;
+ case X86::SHRD32rri8: Size = 32; Opc = X86::SHLD32rri8; break;
+ case X86::SHLD32rri8: Size = 32; Opc = X86::SHRD32rri8; break;
+ case X86::SHRD64rri8: Size = 64; Opc = X86::SHLD64rri8; break;
+ case X86::SHLD64rri8: Size = 64; Opc = X86::SHRD64rri8; break;
+ }
+ unsigned Amt = MI->getOperand(3).getImm();
+ if (NewMI) {
+ MachineFunction &MF = *MI->getParent()->getParent();
+ MI = MF.CloneMachineInstr(MI);
+ NewMI = false;
+ }
+ MI->setDesc(get(Opc));
+ MI->getOperand(3).setImm(Size-Amt);
+ return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
+ }
+ case X86::BLENDPDrri:
+ case X86::BLENDPSrri:
+ case X86::PBLENDWrri:
+ case X86::VBLENDPDrri:
+ case X86::VBLENDPSrri:
+ case X86::VBLENDPDYrri:
+ case X86::VBLENDPSYrri:
+ case X86::VPBLENDDrri:
+ case X86::VPBLENDWrri:
+ case X86::VPBLENDDYrri:
+  case X86::VPBLENDWYrri: {
+ unsigned Mask;
+ switch (MI->getOpcode()) {
+ default: llvm_unreachable("Unreachable!");
+ case X86::BLENDPDrri: Mask = 0x03; break;
+ case X86::BLENDPSrri: Mask = 0x0F; break;
+ case X86::PBLENDWrri: Mask = 0xFF; break;
+ case X86::VBLENDPDrri: Mask = 0x03; break;
+ case X86::VBLENDPSrri: Mask = 0x0F; break;
+ case X86::VBLENDPDYrri: Mask = 0x0F; break;
+ case X86::VBLENDPSYrri: Mask = 0xFF; break;
+ case X86::VPBLENDDrri: Mask = 0x0F; break;
+ case X86::VPBLENDWrri: Mask = 0xFF; break;
+ case X86::VPBLENDDYrri: Mask = 0xFF; break;
+ case X86::VPBLENDWYrri: Mask = 0xFF; break;
+ }
+ // Only the least significant bits of Imm are used.
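+    // Commuting the sources inverts the per-lane selection, so the new
+    // immediate is the complement within Mask; e.g. for BLENDPDrri
+    // (Mask == 0x03) an immediate of 0x02 becomes 0x01.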
+ unsigned Imm = MI->getOperand(3).getImm() & Mask;
+ if (NewMI) {
+ MachineFunction &MF = *MI->getParent()->getParent();
+ MI = MF.CloneMachineInstr(MI);
+ NewMI = false;
+ }
+ MI->getOperand(3).setImm(Mask ^ Imm);
+ return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
+ }
+ case X86::PCLMULQDQrr:
+  case X86::VPCLMULQDQrr: {
+ // SRC1 64bits = Imm[0] ? SRC1[127:64] : SRC1[63:0]
+ // SRC2 64bits = Imm[4] ? SRC2[127:64] : SRC2[63:0]
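+    // Commuting the sources swaps the roles of Imm[0] and Imm[4]; e.g. an
+    // immediate of 0x01 (high half of SRC1, low half of SRC2) becomes 0x10.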
+ unsigned Imm = MI->getOperand(3).getImm();
+ unsigned Src1Hi = Imm & 0x01;
+ unsigned Src2Hi = Imm & 0x10;
+ if (NewMI) {
+ MachineFunction &MF = *MI->getParent()->getParent();
+ MI = MF.CloneMachineInstr(MI);
+ NewMI = false;
+ }
+ MI->getOperand(3).setImm((Src1Hi << 4) | (Src2Hi >> 4));
+ return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
+ }
+ case X86::CMPPDrri:
+ case X86::CMPPSrri:
+ case X86::VCMPPDrri:
+ case X86::VCMPPSrri:
+ case X86::VCMPPDYrri:
+ case X86::VCMPPSYrri: {
+ // Float comparison can be safely commuted for
+ // Ordered/Unordered/Equal/NotEqual tests
+ unsigned Imm = MI->getOperand(3).getImm() & 0x7;
+ switch (Imm) {
+ case 0x00: // EQUAL
+ case 0x03: // UNORDERED
+ case 0x04: // NOT EQUAL
+ case 0x07: // ORDERED
+ if (NewMI) {
+ MachineFunction &MF = *MI->getParent()->getParent();
+ MI = MF.CloneMachineInstr(MI);
+ NewMI = false;
+ }
+ return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
+ default:
+ return nullptr;
+ }
+ }
+ case X86::VPCOMBri: case X86::VPCOMUBri:
+ case X86::VPCOMDri: case X86::VPCOMUDri:
+ case X86::VPCOMQri: case X86::VPCOMUQri:
+ case X86::VPCOMWri: case X86::VPCOMUWri: {
+ // Flip comparison mode immediate (if necessary).
+ unsigned Imm = MI->getOperand(3).getImm() & 0x7;
+ switch (Imm) {
+ case 0x00: Imm = 0x02; break; // LT -> GT
+ case 0x01: Imm = 0x03; break; // LE -> GE
+ case 0x02: Imm = 0x00; break; // GT -> LT
+ case 0x03: Imm = 0x01; break; // GE -> LE
+ case 0x04: // EQ
+ case 0x05: // NE
+ case 0x06: // FALSE
+ case 0x07: // TRUE
+ default:
+ break;
+ }
+ if (NewMI) {
+ MachineFunction &MF = *MI->getParent()->getParent();
+ MI = MF.CloneMachineInstr(MI);
+ NewMI = false;
+ }
+ MI->getOperand(3).setImm(Imm);
+ return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
+ }
+ case X86::CMOVB16rr: case X86::CMOVB32rr: case X86::CMOVB64rr:
+ case X86::CMOVAE16rr: case X86::CMOVAE32rr: case X86::CMOVAE64rr:
+ case X86::CMOVE16rr: case X86::CMOVE32rr: case X86::CMOVE64rr:
+ case X86::CMOVNE16rr: case X86::CMOVNE32rr: case X86::CMOVNE64rr:
+ case X86::CMOVBE16rr: case X86::CMOVBE32rr: case X86::CMOVBE64rr:
+ case X86::CMOVA16rr: case X86::CMOVA32rr: case X86::CMOVA64rr:
+ case X86::CMOVL16rr: case X86::CMOVL32rr: case X86::CMOVL64rr:
+ case X86::CMOVGE16rr: case X86::CMOVGE32rr: case X86::CMOVGE64rr:
+ case X86::CMOVLE16rr: case X86::CMOVLE32rr: case X86::CMOVLE64rr:
+ case X86::CMOVG16rr: case X86::CMOVG32rr: case X86::CMOVG64rr:
+ case X86::CMOVS16rr: case X86::CMOVS32rr: case X86::CMOVS64rr:
+ case X86::CMOVNS16rr: case X86::CMOVNS32rr: case X86::CMOVNS64rr:
+ case X86::CMOVP16rr: case X86::CMOVP32rr: case X86::CMOVP64rr:
+ case X86::CMOVNP16rr: case X86::CMOVNP32rr: case X86::CMOVNP64rr:
+ case X86::CMOVO16rr: case X86::CMOVO32rr: case X86::CMOVO64rr:
+ case X86::CMOVNO16rr: case X86::CMOVNO32rr: case X86::CMOVNO64rr: {
+ unsigned Opc;
+ switch (MI->getOpcode()) {
+ default: llvm_unreachable("Unreachable!");
+ case X86::CMOVB16rr: Opc = X86::CMOVAE16rr; break;
+ case X86::CMOVB32rr: Opc = X86::CMOVAE32rr; break;
+ case X86::CMOVB64rr: Opc = X86::CMOVAE64rr; break;
+ case X86::CMOVAE16rr: Opc = X86::CMOVB16rr; break;
+ case X86::CMOVAE32rr: Opc = X86::CMOVB32rr; break;
+ case X86::CMOVAE64rr: Opc = X86::CMOVB64rr; break;
+ case X86::CMOVE16rr: Opc = X86::CMOVNE16rr; break;
+ case X86::CMOVE32rr: Opc = X86::CMOVNE32rr; break;
+ case X86::CMOVE64rr: Opc = X86::CMOVNE64rr; break;
+ case X86::CMOVNE16rr: Opc = X86::CMOVE16rr; break;
+ case X86::CMOVNE32rr: Opc = X86::CMOVE32rr; break;
+ case X86::CMOVNE64rr: Opc = X86::CMOVE64rr; break;
+ case X86::CMOVBE16rr: Opc = X86::CMOVA16rr; break;
+ case X86::CMOVBE32rr: Opc = X86::CMOVA32rr; break;
+ case X86::CMOVBE64rr: Opc = X86::CMOVA64rr; break;
+ case X86::CMOVA16rr: Opc = X86::CMOVBE16rr; break;
+ case X86::CMOVA32rr: Opc = X86::CMOVBE32rr; break;
+ case X86::CMOVA64rr: Opc = X86::CMOVBE64rr; break;
+ case X86::CMOVL16rr: Opc = X86::CMOVGE16rr; break;
+ case X86::CMOVL32rr: Opc = X86::CMOVGE32rr; break;
+ case X86::CMOVL64rr: Opc = X86::CMOVGE64rr; break;
+ case X86::CMOVGE16rr: Opc = X86::CMOVL16rr; break;
+ case X86::CMOVGE32rr: Opc = X86::CMOVL32rr; break;
+ case X86::CMOVGE64rr: Opc = X86::CMOVL64rr; break;
+ case X86::CMOVLE16rr: Opc = X86::CMOVG16rr; break;
+ case X86::CMOVLE32rr: Opc = X86::CMOVG32rr; break;
+ case X86::CMOVLE64rr: Opc = X86::CMOVG64rr; break;
+ case X86::CMOVG16rr: Opc = X86::CMOVLE16rr; break;
+ case X86::CMOVG32rr: Opc = X86::CMOVLE32rr; break;
+ case X86::CMOVG64rr: Opc = X86::CMOVLE64rr; break;
+ case X86::CMOVS16rr: Opc = X86::CMOVNS16rr; break;
+ case X86::CMOVS32rr: Opc = X86::CMOVNS32rr; break;
+ case X86::CMOVS64rr: Opc = X86::CMOVNS64rr; break;
+ case X86::CMOVNS16rr: Opc = X86::CMOVS16rr; break;
+ case X86::CMOVNS32rr: Opc = X86::CMOVS32rr; break;
+ case X86::CMOVNS64rr: Opc = X86::CMOVS64rr; break;
+ case X86::CMOVP16rr: Opc = X86::CMOVNP16rr; break;
+ case X86::CMOVP32rr: Opc = X86::CMOVNP32rr; break;
+ case X86::CMOVP64rr: Opc = X86::CMOVNP64rr; break;
+ case X86::CMOVNP16rr: Opc = X86::CMOVP16rr; break;
+ case X86::CMOVNP32rr: Opc = X86::CMOVP32rr; break;
+ case X86::CMOVNP64rr: Opc = X86::CMOVP64rr; break;
+ case X86::CMOVO16rr: Opc = X86::CMOVNO16rr; break;
+ case X86::CMOVO32rr: Opc = X86::CMOVNO32rr; break;
+ case X86::CMOVO64rr: Opc = X86::CMOVNO64rr; break;
+ case X86::CMOVNO16rr: Opc = X86::CMOVO16rr; break;
+ case X86::CMOVNO32rr: Opc = X86::CMOVO32rr; break;
+ case X86::CMOVNO64rr: Opc = X86::CMOVO64rr; break;
+ }
+ if (NewMI) {
+ MachineFunction &MF = *MI->getParent()->getParent();
+ MI = MF.CloneMachineInstr(MI);
+ NewMI = false;
+ }
+ MI->setDesc(get(Opc));
+ // Fallthrough intended.
+ }
+ default:
+ if (isFMA3(MI->getOpcode())) {
+ unsigned Opc = getFMA3OpcodeToCommuteOperands(MI, OpIdx1, OpIdx2);
+ if (Opc == 0)
+ return nullptr;
+ if (NewMI) {
+ MachineFunction &MF = *MI->getParent()->getParent();
+ MI = MF.CloneMachineInstr(MI);
+ NewMI = false;
+ }
+ MI->setDesc(get(Opc));
+ }
+ return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
+ }
+}
+
+bool X86InstrInfo::findFMA3CommutedOpIndices(MachineInstr *MI,
+ unsigned &SrcOpIdx1,
+ unsigned &SrcOpIdx2) const {
+
+ unsigned RegOpsNum = isMem(MI, 3) ? 2 : 3;
+
+ // Only the first RegOpsNum operands are commutable.
+ // Also, the value 'CommuteAnyOperandIndex' is valid here as it means
+ // that the operand is not specified/fixed.
+ if (SrcOpIdx1 != CommuteAnyOperandIndex &&
+ (SrcOpIdx1 < 1 || SrcOpIdx1 > RegOpsNum))
+ return false;
+ if (SrcOpIdx2 != CommuteAnyOperandIndex &&
+ (SrcOpIdx2 < 1 || SrcOpIdx2 > RegOpsNum))
+ return false;
+
+ // Look for two different register operands assumed to be commutable
+ // regardless of the FMA opcode. The FMA opcode is adjusted later.
+ if (SrcOpIdx1 == CommuteAnyOperandIndex ||
+ SrcOpIdx2 == CommuteAnyOperandIndex) {
+ unsigned CommutableOpIdx1 = SrcOpIdx1;
+ unsigned CommutableOpIdx2 = SrcOpIdx2;
+
+    // At least one of the operands to be commuted is not specified and
+    // this method is free to choose appropriate commutable operands.
+    if (SrcOpIdx1 == SrcOpIdx2)
+      // Neither operand is fixed. By default, set one of the commutable
+      // operands to the last register operand of the instruction.
+      CommutableOpIdx2 = RegOpsNum;
+    else if (SrcOpIdx2 == CommuteAnyOperandIndex)
+      // Only one of the operands is not fixed.
+      CommutableOpIdx2 = SrcOpIdx1;
+
+ // CommutableOpIdx2 is well defined now. Let's choose another commutable
+ // operand and assign its index to CommutableOpIdx1.
+ unsigned Op2Reg = MI->getOperand(CommutableOpIdx2).getReg();
+    for (CommutableOpIdx1 = RegOpsNum; CommutableOpIdx1 > 0;
+         CommutableOpIdx1--) {
+      // The commuted operands must have different registers.
+      // Otherwise, the commute transformation does not change anything and
+      // is useless.
+ if (Op2Reg != MI->getOperand(CommutableOpIdx1).getReg())
+ break;
+ }
+
+ // No appropriate commutable operands were found.
+ if (CommutableOpIdx1 == 0)
+ return false;
+
+  // Assign the found pair of commutable indices to SrcOpIdx1 and SrcOpIdx2
+ // to return those values.
+ if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2,
+ CommutableOpIdx1, CommutableOpIdx2))
+ return false;
+ }
+
+  // Check if we can adjust the opcode to preserve the semantics when
+  // commuting the register operands.
+ return getFMA3OpcodeToCommuteOperands(MI, SrcOpIdx1, SrcOpIdx2) != 0;
+}
+
+unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands(MachineInstr *MI,
+ unsigned SrcOpIdx1,
+ unsigned SrcOpIdx2) const {
+ unsigned Opc = MI->getOpcode();
+
+  // Define the array that holds FMA opcodes in groups of three
+  // (the 132, 213 and 231 forms).
+ static const unsigned RegularOpcodeGroups[][3] = {
+ { X86::VFMADDSSr132r, X86::VFMADDSSr213r, X86::VFMADDSSr231r },
+ { X86::VFMADDSDr132r, X86::VFMADDSDr213r, X86::VFMADDSDr231r },
+ { X86::VFMADDPSr132r, X86::VFMADDPSr213r, X86::VFMADDPSr231r },
+ { X86::VFMADDPDr132r, X86::VFMADDPDr213r, X86::VFMADDPDr231r },
+ { X86::VFMADDPSr132rY, X86::VFMADDPSr213rY, X86::VFMADDPSr231rY },
+ { X86::VFMADDPDr132rY, X86::VFMADDPDr213rY, X86::VFMADDPDr231rY },
+ { X86::VFMADDSSr132m, X86::VFMADDSSr213m, X86::VFMADDSSr231m },
+ { X86::VFMADDSDr132m, X86::VFMADDSDr213m, X86::VFMADDSDr231m },
+ { X86::VFMADDPSr132m, X86::VFMADDPSr213m, X86::VFMADDPSr231m },
+ { X86::VFMADDPDr132m, X86::VFMADDPDr213m, X86::VFMADDPDr231m },
+ { X86::VFMADDPSr132mY, X86::VFMADDPSr213mY, X86::VFMADDPSr231mY },
+ { X86::VFMADDPDr132mY, X86::VFMADDPDr213mY, X86::VFMADDPDr231mY },
+
+ { X86::VFMSUBSSr132r, X86::VFMSUBSSr213r, X86::VFMSUBSSr231r },
+ { X86::VFMSUBSDr132r, X86::VFMSUBSDr213r, X86::VFMSUBSDr231r },
+ { X86::VFMSUBPSr132r, X86::VFMSUBPSr213r, X86::VFMSUBPSr231r },
+ { X86::VFMSUBPDr132r, X86::VFMSUBPDr213r, X86::VFMSUBPDr231r },
+ { X86::VFMSUBPSr132rY, X86::VFMSUBPSr213rY, X86::VFMSUBPSr231rY },
+ { X86::VFMSUBPDr132rY, X86::VFMSUBPDr213rY, X86::VFMSUBPDr231rY },
+ { X86::VFMSUBSSr132m, X86::VFMSUBSSr213m, X86::VFMSUBSSr231m },
+ { X86::VFMSUBSDr132m, X86::VFMSUBSDr213m, X86::VFMSUBSDr231m },
+ { X86::VFMSUBPSr132m, X86::VFMSUBPSr213m, X86::VFMSUBPSr231m },
+ { X86::VFMSUBPDr132m, X86::VFMSUBPDr213m, X86::VFMSUBPDr231m },
+ { X86::VFMSUBPSr132mY, X86::VFMSUBPSr213mY, X86::VFMSUBPSr231mY },
+ { X86::VFMSUBPDr132mY, X86::VFMSUBPDr213mY, X86::VFMSUBPDr231mY },
+
+ { X86::VFNMADDSSr132r, X86::VFNMADDSSr213r, X86::VFNMADDSSr231r },
+ { X86::VFNMADDSDr132r, X86::VFNMADDSDr213r, X86::VFNMADDSDr231r },
+ { X86::VFNMADDPSr132r, X86::VFNMADDPSr213r, X86::VFNMADDPSr231r },
+ { X86::VFNMADDPDr132r, X86::VFNMADDPDr213r, X86::VFNMADDPDr231r },
+ { X86::VFNMADDPSr132rY, X86::VFNMADDPSr213rY, X86::VFNMADDPSr231rY },
+ { X86::VFNMADDPDr132rY, X86::VFNMADDPDr213rY, X86::VFNMADDPDr231rY },
+ { X86::VFNMADDSSr132m, X86::VFNMADDSSr213m, X86::VFNMADDSSr231m },
+ { X86::VFNMADDSDr132m, X86::VFNMADDSDr213m, X86::VFNMADDSDr231m },
+ { X86::VFNMADDPSr132m, X86::VFNMADDPSr213m, X86::VFNMADDPSr231m },
+ { X86::VFNMADDPDr132m, X86::VFNMADDPDr213m, X86::VFNMADDPDr231m },
+ { X86::VFNMADDPSr132mY, X86::VFNMADDPSr213mY, X86::VFNMADDPSr231mY },
+ { X86::VFNMADDPDr132mY, X86::VFNMADDPDr213mY, X86::VFNMADDPDr231mY },
+
+ { X86::VFNMSUBSSr132r, X86::VFNMSUBSSr213r, X86::VFNMSUBSSr231r },
+ { X86::VFNMSUBSDr132r, X86::VFNMSUBSDr213r, X86::VFNMSUBSDr231r },
+ { X86::VFNMSUBPSr132r, X86::VFNMSUBPSr213r, X86::VFNMSUBPSr231r },
+ { X86::VFNMSUBPDr132r, X86::VFNMSUBPDr213r, X86::VFNMSUBPDr231r },
+ { X86::VFNMSUBPSr132rY, X86::VFNMSUBPSr213rY, X86::VFNMSUBPSr231rY },
+ { X86::VFNMSUBPDr132rY, X86::VFNMSUBPDr213rY, X86::VFNMSUBPDr231rY },
+ { X86::VFNMSUBSSr132m, X86::VFNMSUBSSr213m, X86::VFNMSUBSSr231m },
+ { X86::VFNMSUBSDr132m, X86::VFNMSUBSDr213m, X86::VFNMSUBSDr231m },
+ { X86::VFNMSUBPSr132m, X86::VFNMSUBPSr213m, X86::VFNMSUBPSr231m },
+ { X86::VFNMSUBPDr132m, X86::VFNMSUBPDr213m, X86::VFNMSUBPDr231m },
+ { X86::VFNMSUBPSr132mY, X86::VFNMSUBPSr213mY, X86::VFNMSUBPSr231mY },
+ { X86::VFNMSUBPDr132mY, X86::VFNMSUBPDr213mY, X86::VFNMSUBPDr231mY },
+
+ { X86::VFMADDSUBPSr132r, X86::VFMADDSUBPSr213r, X86::VFMADDSUBPSr231r },
+ { X86::VFMADDSUBPDr132r, X86::VFMADDSUBPDr213r, X86::VFMADDSUBPDr231r },
+ { X86::VFMADDSUBPSr132rY, X86::VFMADDSUBPSr213rY, X86::VFMADDSUBPSr231rY },
+ { X86::VFMADDSUBPDr132rY, X86::VFMADDSUBPDr213rY, X86::VFMADDSUBPDr231rY },
+ { X86::VFMADDSUBPSr132m, X86::VFMADDSUBPSr213m, X86::VFMADDSUBPSr231m },
+ { X86::VFMADDSUBPDr132m, X86::VFMADDSUBPDr213m, X86::VFMADDSUBPDr231m },
+ { X86::VFMADDSUBPSr132mY, X86::VFMADDSUBPSr213mY, X86::VFMADDSUBPSr231mY },
+ { X86::VFMADDSUBPDr132mY, X86::VFMADDSUBPDr213mY, X86::VFMADDSUBPDr231mY },
+
+ { X86::VFMSUBADDPSr132r, X86::VFMSUBADDPSr213r, X86::VFMSUBADDPSr231r },
+ { X86::VFMSUBADDPDr132r, X86::VFMSUBADDPDr213r, X86::VFMSUBADDPDr231r },
+ { X86::VFMSUBADDPSr132rY, X86::VFMSUBADDPSr213rY, X86::VFMSUBADDPSr231rY },
+ { X86::VFMSUBADDPDr132rY, X86::VFMSUBADDPDr213rY, X86::VFMSUBADDPDr231rY },
+ { X86::VFMSUBADDPSr132m, X86::VFMSUBADDPSr213m, X86::VFMSUBADDPSr231m },
+ { X86::VFMSUBADDPDr132m, X86::VFMSUBADDPDr213m, X86::VFMSUBADDPDr231m },
+ { X86::VFMSUBADDPSr132mY, X86::VFMSUBADDPSr213mY, X86::VFMSUBADDPSr231mY },
+ { X86::VFMSUBADDPDr132mY, X86::VFMSUBADDPDr213mY, X86::VFMSUBADDPDr231mY }
+ };
+
+  // Define the array that holds FMA*_Int opcodes in groups of three
+  // (the 132, 213 and 231 forms).
+ static const unsigned IntrinOpcodeGroups[][3] = {
+ { X86::VFMADDSSr132r_Int, X86::VFMADDSSr213r_Int, X86::VFMADDSSr231r_Int },
+ { X86::VFMADDSDr132r_Int, X86::VFMADDSDr213r_Int, X86::VFMADDSDr231r_Int },
+ { X86::VFMADDSSr132m_Int, X86::VFMADDSSr213m_Int, X86::VFMADDSSr231m_Int },
+ { X86::VFMADDSDr132m_Int, X86::VFMADDSDr213m_Int, X86::VFMADDSDr231m_Int },
+
+ { X86::VFMSUBSSr132r_Int, X86::VFMSUBSSr213r_Int, X86::VFMSUBSSr231r_Int },
+ { X86::VFMSUBSDr132r_Int, X86::VFMSUBSDr213r_Int, X86::VFMSUBSDr231r_Int },
+ { X86::VFMSUBSSr132m_Int, X86::VFMSUBSSr213m_Int, X86::VFMSUBSSr231m_Int },
+ { X86::VFMSUBSDr132m_Int, X86::VFMSUBSDr213m_Int, X86::VFMSUBSDr231m_Int },
+
+ { X86::VFNMADDSSr132r_Int, X86::VFNMADDSSr213r_Int, X86::VFNMADDSSr231r_Int },
+ { X86::VFNMADDSDr132r_Int, X86::VFNMADDSDr213r_Int, X86::VFNMADDSDr231r_Int },
+ { X86::VFNMADDSSr132m_Int, X86::VFNMADDSSr213m_Int, X86::VFNMADDSSr231m_Int },
+ { X86::VFNMADDSDr132m_Int, X86::VFNMADDSDr213m_Int, X86::VFNMADDSDr231m_Int },
+
+ { X86::VFNMSUBSSr132r_Int, X86::VFNMSUBSSr213r_Int, X86::VFNMSUBSSr231r_Int },
+ { X86::VFNMSUBSDr132r_Int, X86::VFNMSUBSDr213r_Int, X86::VFNMSUBSDr231r_Int },
+ { X86::VFNMSUBSSr132m_Int, X86::VFNMSUBSSr213m_Int, X86::VFNMSUBSSr231m_Int },
+ { X86::VFNMSUBSDr132m_Int, X86::VFNMSUBSDr213m_Int, X86::VFNMSUBSDr231m_Int },
+ };
+
+ const unsigned Form132Index = 0;
+ const unsigned Form213Index = 1;
+ const unsigned Form231Index = 2;
+ const unsigned FormsNum = 3;
+
+ bool IsIntrinOpcode;
+ isFMA3(Opc, &IsIntrinOpcode);
+
+ size_t GroupsNum;
+ const unsigned (*OpcodeGroups)[3];
+ if (IsIntrinOpcode) {
+ GroupsNum = array_lengthof(IntrinOpcodeGroups);
+ OpcodeGroups = IntrinOpcodeGroups;
+ } else {
+ GroupsNum = array_lengthof(RegularOpcodeGroups);
+ OpcodeGroups = RegularOpcodeGroups;
+ }
+
+ const unsigned *FoundOpcodesGroup = nullptr;
+ size_t FormIndex;
+
+ // Look for the input opcode in the corresponding opcodes table.
+ for (size_t GroupIndex = 0; GroupIndex < GroupsNum && !FoundOpcodesGroup;
+ ++GroupIndex) {
+ for (FormIndex = 0; FormIndex < FormsNum; ++FormIndex) {
+ if (OpcodeGroups[GroupIndex][FormIndex] == Opc) {
+ FoundOpcodesGroup = OpcodeGroups[GroupIndex];
+ break;
+ }
+ }
+ }
+
+  // The input opcode does not match any of the opcodes from the tables.
+ // The unsupported FMA opcode must be added to one of the two opcode groups
+ // defined above.
+ assert(FoundOpcodesGroup != nullptr && "Unexpected FMA3 opcode");
+
+  // Put the lowest index into SrcOpIdx1 to simplify the checks below.
+ if (SrcOpIdx1 > SrcOpIdx2)
+ std::swap(SrcOpIdx1, SrcOpIdx2);
+
+  // TODO: Commuting the 1st operand of FMA*_Int requires some additional
+  // analysis. The commute optimization is legal only if all users of FMA*_Int
+  // use only the lowest element of the FMA*_Int instruction. Such an analysis
+  // is not implemented yet, so just return 0 in that case.
+  // When that analysis becomes available, this will be the right place to
+  // call it.
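+  // (For the scalar FMA*_Int forms the elements above the lowest one are
+  // passed through from the first source operand, so commuting operand 1
+  // would change those elements.)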
+ if (IsIntrinOpcode && SrcOpIdx1 == 1)
+ return 0;
+
+ unsigned Case;
+ if (SrcOpIdx1 == 1 && SrcOpIdx2 == 2)
+ Case = 0;
+ else if (SrcOpIdx1 == 1 && SrcOpIdx2 == 3)
+ Case = 1;
+ else if (SrcOpIdx1 == 2 && SrcOpIdx2 == 3)
+ Case = 2;
+ else
+ return 0;
+
+  // Define the mapping from the input FMA form to the output FMA form that
+  // preserves the operation semantics after the operands are commuted.
+ static const unsigned FormMapping[][3] = {
+ // 0: SrcOpIdx1 == 1 && SrcOpIdx2 == 2;
+ // FMA132 A, C, b; ==> FMA231 C, A, b;
+ // FMA213 B, A, c; ==> FMA213 A, B, c;
+ // FMA231 C, A, b; ==> FMA132 A, C, b;
+ { Form231Index, Form213Index, Form132Index },
+ // 1: SrcOpIdx1 == 1 && SrcOpIdx2 == 3;
+ // FMA132 A, c, B; ==> FMA132 B, c, A;
+ // FMA213 B, a, C; ==> FMA231 C, a, B;
+ // FMA231 C, a, B; ==> FMA213 B, a, C;
+ { Form132Index, Form231Index, Form213Index },
+ // 2: SrcOpIdx1 == 2 && SrcOpIdx2 == 3;
+ // FMA132 a, C, B; ==> FMA213 a, B, C;
+ // FMA213 b, A, C; ==> FMA132 b, C, A;
+ // FMA231 c, A, B; ==> FMA231 c, B, A;
+ { Form213Index, Form132Index, Form231Index }
+ };
+
+ // Everything is ready, just adjust the FMA opcode and return it.
+ FormIndex = FormMapping[Case][FormIndex];
+ return FoundOpcodesGroup[FormIndex];
+}
+
+bool X86InstrInfo::findCommutedOpIndices(MachineInstr *MI,
+ unsigned &SrcOpIdx1,
+ unsigned &SrcOpIdx2) const {
+ switch (MI->getOpcode()) {
+ case X86::CMPPDrri:
+ case X86::CMPPSrri:
+ case X86::VCMPPDrri:
+ case X86::VCMPPSrri:
+ case X86::VCMPPDYrri:
+ case X86::VCMPPSYrri: {
+ // Float comparison can be safely commuted for
+ // Ordered/Unordered/Equal/NotEqual tests
+ unsigned Imm = MI->getOperand(3).getImm() & 0x7;
+ switch (Imm) {
+ case 0x00: // EQUAL
+ case 0x03: // UNORDERED
+ case 0x04: // NOT EQUAL
+ case 0x07: // ORDERED
+ // The indices of the commutable operands are 1 and 2.
+ // Assign them to the returned operand indices here.
+ return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 1, 2);
+ }
+ return false;
+ }
+ default:
+ if (isFMA3(MI->getOpcode()))
+ return findFMA3CommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
+ return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
+ }
+ return false;
+}
+
+static X86::CondCode getCondFromBranchOpc(unsigned BrOpc) {
+ switch (BrOpc) {
+ default: return X86::COND_INVALID;
+ case X86::JE_1: return X86::COND_E;
+ case X86::JNE_1: return X86::COND_NE;
+ case X86::JL_1: return X86::COND_L;
+ case X86::JLE_1: return X86::COND_LE;
+ case X86::JG_1: return X86::COND_G;
+ case X86::JGE_1: return X86::COND_GE;
+ case X86::JB_1: return X86::COND_B;
+ case X86::JBE_1: return X86::COND_BE;
+ case X86::JA_1: return X86::COND_A;
+ case X86::JAE_1: return X86::COND_AE;
+ case X86::JS_1: return X86::COND_S;
+ case X86::JNS_1: return X86::COND_NS;
+ case X86::JP_1: return X86::COND_P;
+ case X86::JNP_1: return X86::COND_NP;
+ case X86::JO_1: return X86::COND_O;
+ case X86::JNO_1: return X86::COND_NO;
+ }
+}
+
+/// Return condition code of a SET opcode.
+static X86::CondCode getCondFromSETOpc(unsigned Opc) {
+ switch (Opc) {
+ default: return X86::COND_INVALID;
+ case X86::SETAr: case X86::SETAm: return X86::COND_A;
+ case X86::SETAEr: case X86::SETAEm: return X86::COND_AE;
+ case X86::SETBr: case X86::SETBm: return X86::COND_B;
+ case X86::SETBEr: case X86::SETBEm: return X86::COND_BE;
+ case X86::SETEr: case X86::SETEm: return X86::COND_E;
+ case X86::SETGr: case X86::SETGm: return X86::COND_G;
+ case X86::SETGEr: case X86::SETGEm: return X86::COND_GE;
+ case X86::SETLr: case X86::SETLm: return X86::COND_L;
+ case X86::SETLEr: case X86::SETLEm: return X86::COND_LE;
+ case X86::SETNEr: case X86::SETNEm: return X86::COND_NE;
+ case X86::SETNOr: case X86::SETNOm: return X86::COND_NO;
+ case X86::SETNPr: case X86::SETNPm: return X86::COND_NP;
+ case X86::SETNSr: case X86::SETNSm: return X86::COND_NS;
+ case X86::SETOr: case X86::SETOm: return X86::COND_O;
+ case X86::SETPr: case X86::SETPm: return X86::COND_P;
+ case X86::SETSr: case X86::SETSm: return X86::COND_S;
+ }
+}
+
+/// Return condition code of a CMov opcode.
+X86::CondCode X86::getCondFromCMovOpc(unsigned Opc) {
+ switch (Opc) {
+ default: return X86::COND_INVALID;
+ case X86::CMOVA16rm: case X86::CMOVA16rr: case X86::CMOVA32rm:
+ case X86::CMOVA32rr: case X86::CMOVA64rm: case X86::CMOVA64rr:
+ return X86::COND_A;
+ case X86::CMOVAE16rm: case X86::CMOVAE16rr: case X86::CMOVAE32rm:
+ case X86::CMOVAE32rr: case X86::CMOVAE64rm: case X86::CMOVAE64rr:
+ return X86::COND_AE;
+ case X86::CMOVB16rm: case X86::CMOVB16rr: case X86::CMOVB32rm:
+ case X86::CMOVB32rr: case X86::CMOVB64rm: case X86::CMOVB64rr:
+ return X86::COND_B;
+ case X86::CMOVBE16rm: case X86::CMOVBE16rr: case X86::CMOVBE32rm:
+ case X86::CMOVBE32rr: case X86::CMOVBE64rm: case X86::CMOVBE64rr:
+ return X86::COND_BE;
+ case X86::CMOVE16rm: case X86::CMOVE16rr: case X86::CMOVE32rm:
+ case X86::CMOVE32rr: case X86::CMOVE64rm: case X86::CMOVE64rr:
+ return X86::COND_E;
+ case X86::CMOVG16rm: case X86::CMOVG16rr: case X86::CMOVG32rm:
+ case X86::CMOVG32rr: case X86::CMOVG64rm: case X86::CMOVG64rr:
+ return X86::COND_G;
+ case X86::CMOVGE16rm: case X86::CMOVGE16rr: case X86::CMOVGE32rm:
+ case X86::CMOVGE32rr: case X86::CMOVGE64rm: case X86::CMOVGE64rr:
+ return X86::COND_GE;
+ case X86::CMOVL16rm: case X86::CMOVL16rr: case X86::CMOVL32rm:
+ case X86::CMOVL32rr: case X86::CMOVL64rm: case X86::CMOVL64rr:
+ return X86::COND_L;
+ case X86::CMOVLE16rm: case X86::CMOVLE16rr: case X86::CMOVLE32rm:
+ case X86::CMOVLE32rr: case X86::CMOVLE64rm: case X86::CMOVLE64rr:
+ return X86::COND_LE;
+ case X86::CMOVNE16rm: case X86::CMOVNE16rr: case X86::CMOVNE32rm:
+ case X86::CMOVNE32rr: case X86::CMOVNE64rm: case X86::CMOVNE64rr:
+ return X86::COND_NE;
+ case X86::CMOVNO16rm: case X86::CMOVNO16rr: case X86::CMOVNO32rm:
+ case X86::CMOVNO32rr: case X86::CMOVNO64rm: case X86::CMOVNO64rr:
+ return X86::COND_NO;
+ case X86::CMOVNP16rm: case X86::CMOVNP16rr: case X86::CMOVNP32rm:
+ case X86::CMOVNP32rr: case X86::CMOVNP64rm: case X86::CMOVNP64rr:
+ return X86::COND_NP;
+ case X86::CMOVNS16rm: case X86::CMOVNS16rr: case X86::CMOVNS32rm:
+ case X86::CMOVNS32rr: case X86::CMOVNS64rm: case X86::CMOVNS64rr:
+ return X86::COND_NS;
+ case X86::CMOVO16rm: case X86::CMOVO16rr: case X86::CMOVO32rm:
+ case X86::CMOVO32rr: case X86::CMOVO64rm: case X86::CMOVO64rr:
+ return X86::COND_O;
+ case X86::CMOVP16rm: case X86::CMOVP16rr: case X86::CMOVP32rm:
+ case X86::CMOVP32rr: case X86::CMOVP64rm: case X86::CMOVP64rr:
+ return X86::COND_P;
+ case X86::CMOVS16rm: case X86::CMOVS16rr: case X86::CMOVS32rm:
+ case X86::CMOVS32rr: case X86::CMOVS64rm: case X86::CMOVS64rr:
+ return X86::COND_S;
+ }
+}
+
+unsigned X86::GetCondBranchFromCond(X86::CondCode CC) {
+ switch (CC) {
+ default: llvm_unreachable("Illegal condition code!");
+ case X86::COND_E: return X86::JE_1;
+ case X86::COND_NE: return X86::JNE_1;
+ case X86::COND_L: return X86::JL_1;
+ case X86::COND_LE: return X86::JLE_1;
+ case X86::COND_G: return X86::JG_1;
+ case X86::COND_GE: return X86::JGE_1;
+ case X86::COND_B: return X86::JB_1;
+ case X86::COND_BE: return X86::JBE_1;
+ case X86::COND_A: return X86::JA_1;
+ case X86::COND_AE: return X86::JAE_1;
+ case X86::COND_S: return X86::JS_1;
+ case X86::COND_NS: return X86::JNS_1;
+ case X86::COND_P: return X86::JP_1;
+ case X86::COND_NP: return X86::JNP_1;
+ case X86::COND_O: return X86::JO_1;
+ case X86::COND_NO: return X86::JNO_1;
+ }
+}
+
+/// Return the inverse of the specified condition,
+/// e.g. turning COND_E to COND_NE.
+X86::CondCode X86::GetOppositeBranchCondition(X86::CondCode CC) {
+ switch (CC) {
+ default: llvm_unreachable("Illegal condition code!");
+ case X86::COND_E: return X86::COND_NE;
+ case X86::COND_NE: return X86::COND_E;
+ case X86::COND_L: return X86::COND_GE;
+ case X86::COND_LE: return X86::COND_G;
+ case X86::COND_G: return X86::COND_LE;
+ case X86::COND_GE: return X86::COND_L;
+ case X86::COND_B: return X86::COND_AE;
+ case X86::COND_BE: return X86::COND_A;
+ case X86::COND_A: return X86::COND_BE;
+ case X86::COND_AE: return X86::COND_B;
+ case X86::COND_S: return X86::COND_NS;
+ case X86::COND_NS: return X86::COND_S;
+ case X86::COND_P: return X86::COND_NP;
+ case X86::COND_NP: return X86::COND_P;
+ case X86::COND_O: return X86::COND_NO;
+ case X86::COND_NO: return X86::COND_O;
+ }
+}
+
+/// Assuming the flags are set by MI(a,b), return the condition code if we
+/// modify the instructions such that flags are set by MI(b,a).
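+/// For example, "CMP %a, %b; JL" is equivalent to "CMP %b, %a; JG", so
+/// COND_L maps to COND_G.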
+static X86::CondCode getSwappedCondition(X86::CondCode CC) {
+ switch (CC) {
+ default: return X86::COND_INVALID;
+ case X86::COND_E: return X86::COND_E;
+ case X86::COND_NE: return X86::COND_NE;
+ case X86::COND_L: return X86::COND_G;
+ case X86::COND_LE: return X86::COND_GE;
+ case X86::COND_G: return X86::COND_L;
+ case X86::COND_GE: return X86::COND_LE;
+ case X86::COND_B: return X86::COND_A;
+ case X86::COND_BE: return X86::COND_AE;
+ case X86::COND_A: return X86::COND_B;
+ case X86::COND_AE: return X86::COND_BE;
+ }
+}
+
+/// Return a SET opcode for the given condition and whether it has
+/// a memory operand.
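+/// For example, getSETFromCond(X86::COND_E, false) yields X86::SETEr, and the
+/// memory form getSETFromCond(X86::COND_E, true) yields X86::SETEm.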
+unsigned X86::getSETFromCond(CondCode CC, bool HasMemoryOperand) {
+ static const uint16_t Opc[16][2] = {
+ { X86::SETAr, X86::SETAm },
+ { X86::SETAEr, X86::SETAEm },
+ { X86::SETBr, X86::SETBm },
+ { X86::SETBEr, X86::SETBEm },
+ { X86::SETEr, X86::SETEm },
+ { X86::SETGr, X86::SETGm },
+ { X86::SETGEr, X86::SETGEm },
+ { X86::SETLr, X86::SETLm },
+ { X86::SETLEr, X86::SETLEm },
+ { X86::SETNEr, X86::SETNEm },
+ { X86::SETNOr, X86::SETNOm },
+ { X86::SETNPr, X86::SETNPm },
+ { X86::SETNSr, X86::SETNSm },
+ { X86::SETOr, X86::SETOm },
+ { X86::SETPr, X86::SETPm },
+ { X86::SETSr, X86::SETSm }
+ };
+
+ assert(CC <= LAST_VALID_COND && "Can only handle standard cond codes");
+ return Opc[CC][HasMemoryOperand ? 1 : 0];
+}
+
+/// Return a cmov opcode for the given condition,
+/// register size in bytes, and operand type.
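+/// For example, getCMovFromCond(X86::COND_B, 4, false) yields X86::CMOVB32rr,
+/// and with HasMemoryOperand set it yields X86::CMOVB32rm.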
+unsigned X86::getCMovFromCond(CondCode CC, unsigned RegBytes,
+ bool HasMemoryOperand) {
+ static const uint16_t Opc[32][3] = {
+ { X86::CMOVA16rr, X86::CMOVA32rr, X86::CMOVA64rr },
+ { X86::CMOVAE16rr, X86::CMOVAE32rr, X86::CMOVAE64rr },
+ { X86::CMOVB16rr, X86::CMOVB32rr, X86::CMOVB64rr },
+ { X86::CMOVBE16rr, X86::CMOVBE32rr, X86::CMOVBE64rr },
+ { X86::CMOVE16rr, X86::CMOVE32rr, X86::CMOVE64rr },
+ { X86::CMOVG16rr, X86::CMOVG32rr, X86::CMOVG64rr },
+ { X86::CMOVGE16rr, X86::CMOVGE32rr, X86::CMOVGE64rr },
+ { X86::CMOVL16rr, X86::CMOVL32rr, X86::CMOVL64rr },
+ { X86::CMOVLE16rr, X86::CMOVLE32rr, X86::CMOVLE64rr },
+ { X86::CMOVNE16rr, X86::CMOVNE32rr, X86::CMOVNE64rr },
+ { X86::CMOVNO16rr, X86::CMOVNO32rr, X86::CMOVNO64rr },
+ { X86::CMOVNP16rr, X86::CMOVNP32rr, X86::CMOVNP64rr },
+ { X86::CMOVNS16rr, X86::CMOVNS32rr, X86::CMOVNS64rr },
+ { X86::CMOVO16rr, X86::CMOVO32rr, X86::CMOVO64rr },
+ { X86::CMOVP16rr, X86::CMOVP32rr, X86::CMOVP64rr },
+ { X86::CMOVS16rr, X86::CMOVS32rr, X86::CMOVS64rr },
+ { X86::CMOVA16rm, X86::CMOVA32rm, X86::CMOVA64rm },
+ { X86::CMOVAE16rm, X86::CMOVAE32rm, X86::CMOVAE64rm },
+ { X86::CMOVB16rm, X86::CMOVB32rm, X86::CMOVB64rm },
+ { X86::CMOVBE16rm, X86::CMOVBE32rm, X86::CMOVBE64rm },
+ { X86::CMOVE16rm, X86::CMOVE32rm, X86::CMOVE64rm },
+ { X86::CMOVG16rm, X86::CMOVG32rm, X86::CMOVG64rm },
+ { X86::CMOVGE16rm, X86::CMOVGE32rm, X86::CMOVGE64rm },
+ { X86::CMOVL16rm, X86::CMOVL32rm, X86::CMOVL64rm },
+ { X86::CMOVLE16rm, X86::CMOVLE32rm, X86::CMOVLE64rm },
+ { X86::CMOVNE16rm, X86::CMOVNE32rm, X86::CMOVNE64rm },
+ { X86::CMOVNO16rm, X86::CMOVNO32rm, X86::CMOVNO64rm },
+ { X86::CMOVNP16rm, X86::CMOVNP32rm, X86::CMOVNP64rm },
+ { X86::CMOVNS16rm, X86::CMOVNS32rm, X86::CMOVNS64rm },
+ { X86::CMOVO16rm, X86::CMOVO32rm, X86::CMOVO64rm },
+ { X86::CMOVP16rm, X86::CMOVP32rm, X86::CMOVP64rm },
+ { X86::CMOVS16rm, X86::CMOVS32rm, X86::CMOVS64rm }
+ };
+
+ assert(CC < 16 && "Can only handle standard cond codes");
+ unsigned Idx = HasMemoryOperand ? 16+CC : CC;
+ switch(RegBytes) {
+ default: llvm_unreachable("Illegal register size!");
+ case 2: return Opc[Idx][0];
+ case 4: return Opc[Idx][1];
+ case 8: return Opc[Idx][2];
+ }
+}
+
+bool X86InstrInfo::isUnpredicatedTerminator(const MachineInstr *MI) const {
+ if (!MI->isTerminator()) return false;
+
+ // Conditional branch is a special case.
+ if (MI->isBranch() && !MI->isBarrier())
+ return true;
+ if (!MI->isPredicable())
+ return true;
+ return !isPredicated(MI);
+}
+
+bool X86InstrInfo::AnalyzeBranchImpl(
+ MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ SmallVectorImpl<MachineInstr *> &CondBranches, bool AllowModify) const {
+
+ // Start from the bottom of the block and work up, examining the
+ // terminator instructions.
+ MachineBasicBlock::iterator I = MBB.end();
+ MachineBasicBlock::iterator UnCondBrIter = MBB.end();
+ while (I != MBB.begin()) {
+ --I;
+ if (I->isDebugValue())
+ continue;
+
+ // Working from the bottom, when we see a non-terminator instruction, we're
+ // done.
+ if (!isUnpredicatedTerminator(I))
+ break;
+
+ // A terminator that isn't a branch can't easily be handled by this
+ // analysis.
+ if (!I->isBranch())
+ return true;
+
+ // Handle unconditional branches.
+ if (I->getOpcode() == X86::JMP_1) {
+ UnCondBrIter = I;
+
+ if (!AllowModify) {
+ TBB = I->getOperand(0).getMBB();
+ continue;
+ }
+
+ // If the block has any instructions after a JMP, delete them.
+ while (std::next(I) != MBB.end())
+ std::next(I)->eraseFromParent();
+
+ Cond.clear();
+ FBB = nullptr;
+
+ // Delete the JMP if it's equivalent to a fall-through.
+ if (MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) {
+ TBB = nullptr;
+ I->eraseFromParent();
+ I = MBB.end();
+ UnCondBrIter = MBB.end();
+ continue;
+ }
+
+ // TBB is used to indicate the unconditional destination.
+ TBB = I->getOperand(0).getMBB();
+ continue;
+ }
+
+ // Handle conditional branches.
+ X86::CondCode BranchCode = getCondFromBranchOpc(I->getOpcode());
+ if (BranchCode == X86::COND_INVALID)
+ return true; // Can't handle indirect branch.
+
+ // Working from the bottom, handle the first conditional branch.
+ if (Cond.empty()) {
+ MachineBasicBlock *TargetBB = I->getOperand(0).getMBB();
+ if (AllowModify && UnCondBrIter != MBB.end() &&
+ MBB.isLayoutSuccessor(TargetBB)) {
+ // If we can modify the code and it ends in something like:
+ //
+ // jCC L1
+ // jmp L2
+ // L1:
+ // ...
+ // L2:
+ //
+ // Then we can change this to:
+ //
+ // jnCC L2
+ // L1:
+ // ...
+ // L2:
+ //
+        // which is a bit more efficient: we fall through into L1 and only
+        // take a branch when we actually need to reach L2.
+ BranchCode = GetOppositeBranchCondition(BranchCode);
+ unsigned JNCC = GetCondBranchFromCond(BranchCode);
+ MachineBasicBlock::iterator OldInst = I;
+
+ BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(JNCC))
+ .addMBB(UnCondBrIter->getOperand(0).getMBB());
+ BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(X86::JMP_1))
+ .addMBB(TargetBB);
+
+ OldInst->eraseFromParent();
+ UnCondBrIter->eraseFromParent();
+
+ // Restart the analysis.
+ UnCondBrIter = MBB.end();
+ I = MBB.end();
+ continue;
+ }
+
+ FBB = TBB;
+ TBB = I->getOperand(0).getMBB();
+ Cond.push_back(MachineOperand::CreateImm(BranchCode));
+ CondBranches.push_back(I);
+ continue;
+ }
+
+ // Handle subsequent conditional branches. Only handle the case where all
+ // conditional branches branch to the same destination and their condition
+ // opcodes fit one of the special multi-branch idioms.
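+    // For example, "jne L; jp L" with both branches targeting L encodes
+    // COND_NE_OR_P, the branch form of an unordered-or-unequal ('une')
+    // floating point comparison.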
+ assert(Cond.size() == 1);
+ assert(TBB);
+
+ // Only handle the case where all conditional branches branch to the same
+ // destination.
+ if (TBB != I->getOperand(0).getMBB())
+ return true;
+
+ // If the conditions are the same, we can leave them alone.
+ X86::CondCode OldBranchCode = (X86::CondCode)Cond[0].getImm();
+ if (OldBranchCode == BranchCode)
+ continue;
+
+ // If they differ, see if they fit one of the known patterns. Theoretically,
+ // we could handle more patterns here, but we shouldn't expect to see them
+ // if instruction selection has done a reasonable job.
+ if ((OldBranchCode == X86::COND_NP &&
+ BranchCode == X86::COND_E) ||
+ (OldBranchCode == X86::COND_E &&
+ BranchCode == X86::COND_NP))
+ BranchCode = X86::COND_NP_OR_E;
+ else if ((OldBranchCode == X86::COND_P &&
+ BranchCode == X86::COND_NE) ||
+ (OldBranchCode == X86::COND_NE &&
+ BranchCode == X86::COND_P))
+ BranchCode = X86::COND_NE_OR_P;
+ else
+ return true;
+
+ // Update the MachineOperand.
+ Cond[0].setImm(BranchCode);
+ CondBranches.push_back(I);
+ }
+
+ return false;
+}
+
+bool X86InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const {
+ SmallVector<MachineInstr *, 4> CondBranches;
+ return AnalyzeBranchImpl(MBB, TBB, FBB, Cond, CondBranches, AllowModify);
+}
+
+bool X86InstrInfo::AnalyzeBranchPredicate(MachineBasicBlock &MBB,
+ MachineBranchPredicate &MBP,
+ bool AllowModify) const {
+ using namespace std::placeholders;
+
+ SmallVector<MachineOperand, 4> Cond;
+ SmallVector<MachineInstr *, 4> CondBranches;
+ if (AnalyzeBranchImpl(MBB, MBP.TrueDest, MBP.FalseDest, Cond, CondBranches,
+ AllowModify))
+ return true;
+
+ if (Cond.size() != 1)
+ return true;
+
+ assert(MBP.TrueDest && "expected!");
+
+ if (!MBP.FalseDest)
+ MBP.FalseDest = MBB.getNextNode();
+
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+
+ MachineInstr *ConditionDef = nullptr;
+ bool SingleUseCondition = true;
+
+ for (auto I = std::next(MBB.rbegin()), E = MBB.rend(); I != E; ++I) {
+ if (I->modifiesRegister(X86::EFLAGS, TRI)) {
+ ConditionDef = &*I;
+ break;
+ }
+
+ if (I->readsRegister(X86::EFLAGS, TRI))
+ SingleUseCondition = false;
+ }
+
+ if (!ConditionDef)
+ return true;
+
+ if (SingleUseCondition) {
+ for (auto *Succ : MBB.successors())
+ if (Succ->isLiveIn(X86::EFLAGS))
+ SingleUseCondition = false;
+ }
+
+ MBP.ConditionDef = ConditionDef;
+ MBP.SingleUseCondition = SingleUseCondition;
+
+ // Currently we only recognize the simple pattern:
+ //
+ // test %reg, %reg
+ // je %label
+ //
+ const unsigned TestOpcode =
+ Subtarget.is64Bit() ? X86::TEST64rr : X86::TEST32rr;
+
+ if (ConditionDef->getOpcode() == TestOpcode &&
+ ConditionDef->getNumOperands() == 3 &&
+ ConditionDef->getOperand(0).isIdenticalTo(ConditionDef->getOperand(1)) &&
+ (Cond[0].getImm() == X86::COND_NE || Cond[0].getImm() == X86::COND_E)) {
+ MBP.LHS = ConditionDef->getOperand(0);
+ MBP.RHS = MachineOperand::CreateImm(0);
+ MBP.Predicate = Cond[0].getImm() == X86::COND_NE
+ ? MachineBranchPredicate::PRED_NE
+ : MachineBranchPredicate::PRED_EQ;
+ return false;
+ }
+
+ return true;
+}
+
+unsigned X86InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
+ MachineBasicBlock::iterator I = MBB.end();
+ unsigned Count = 0;
+
+ while (I != MBB.begin()) {
+ --I;
+ if (I->isDebugValue())
+ continue;
+ if (I->getOpcode() != X86::JMP_1 &&
+ getCondFromBranchOpc(I->getOpcode()) == X86::COND_INVALID)
+ break;
+ // Remove the branch.
+ I->eraseFromParent();
+ I = MBB.end();
+ ++Count;
+ }
+
+ return Count;
+}
+
+unsigned
+X86InstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
+ DebugLoc DL) const {
+ // Shouldn't be a fall through.
+ assert(TBB && "InsertBranch must not be told to insert a fallthrough");
+ assert((Cond.size() == 1 || Cond.size() == 0) &&
+ "X86 branch conditions have one component!");
+
+ if (Cond.empty()) {
+ // Unconditional branch?
+ assert(!FBB && "Unconditional branch with multiple successors!");
+ BuildMI(&MBB, DL, get(X86::JMP_1)).addMBB(TBB);
+ return 1;
+ }
+
+ // Conditional branch.
+ unsigned Count = 0;
+ X86::CondCode CC = (X86::CondCode)Cond[0].getImm();
+ switch (CC) {
+ case X86::COND_NP_OR_E:
+ // Synthesize NP_OR_E with two branches.
+ BuildMI(&MBB, DL, get(X86::JNP_1)).addMBB(TBB);
+ ++Count;
+ BuildMI(&MBB, DL, get(X86::JE_1)).addMBB(TBB);
+ ++Count;
+ break;
+ case X86::COND_NE_OR_P:
+ // Synthesize NE_OR_P with two branches.
+ BuildMI(&MBB, DL, get(X86::JNE_1)).addMBB(TBB);
+ ++Count;
+ BuildMI(&MBB, DL, get(X86::JP_1)).addMBB(TBB);
+ ++Count;
+ break;
+ default: {
+ unsigned Opc = GetCondBranchFromCond(CC);
+ BuildMI(&MBB, DL, get(Opc)).addMBB(TBB);
+ ++Count;
+ }
+ }
+ if (FBB) {
+ // Two-way conditional branch. Insert the second branch.
+ BuildMI(&MBB, DL, get(X86::JMP_1)).addMBB(FBB);
+ ++Count;
+ }
+ return Count;
+}
+
+bool X86InstrInfo::
+canInsertSelect(const MachineBasicBlock &MBB,
+ ArrayRef<MachineOperand> Cond,
+ unsigned TrueReg, unsigned FalseReg,
+ int &CondCycles, int &TrueCycles, int &FalseCycles) const {
+ // Not all subtargets have cmov instructions.
+ if (!Subtarget.hasCMov())
+ return false;
+ if (Cond.size() != 1)
+ return false;
+ // We cannot do the composite conditions, at least not in SSA form.
+ if ((X86::CondCode)Cond[0].getImm() > X86::COND_S)
+ return false;
+
+ // Check register classes.
+ const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ const TargetRegisterClass *RC =
+ RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
+ if (!RC)
+ return false;
+
+ // We have cmov instructions for 16-, 32-, and 64-bit general-purpose
+ // registers.
+ if (X86::GR16RegClass.hasSubClassEq(RC) ||
+ X86::GR32RegClass.hasSubClassEq(RC) ||
+ X86::GR64RegClass.hasSubClassEq(RC)) {
+ // This latency applies to Pentium M, Merom, Wolfdale, Nehalem, and Sandy
+ // Bridge. Probably Ivy Bridge as well.
+ CondCycles = 2;
+ TrueCycles = 2;
+ FalseCycles = 2;
+ return true;
+ }
+
+ // Can't do vectors.
+ return false;
+}
+
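+// As an illustrative sketch, selecting between two GR32 values on COND_NE
+// produces:
+//   %dst = CMOVNE32rr %false, %true
+// i.e. %dst is tied to the false value and is overwritten with the true value
+// when the condition holds (note that insertSelect adds FalseReg first).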
+void X86InstrInfo::insertSelect(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I, DebugLoc DL,
+ unsigned DstReg, ArrayRef<MachineOperand> Cond,
+ unsigned TrueReg, unsigned FalseReg) const {
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ assert(Cond.size() == 1 && "Invalid Cond array");
+ unsigned Opc = getCMovFromCond((X86::CondCode)Cond[0].getImm(),
+ MRI.getRegClass(DstReg)->getSize(),
+ false/*HasMemoryOperand*/);
+ BuildMI(MBB, I, DL, get(Opc), DstReg).addReg(FalseReg).addReg(TrueReg);
+}
+
+/// Test if the given register is a physical H register.
+static bool isHReg(unsigned Reg) {
+ return X86::GR8_ABCD_HRegClass.contains(Reg);
+}
+
+// Try to copy between VR128/VR64 and GR64 registers.
+static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg,
+ const X86Subtarget &Subtarget) {
+ // SrcReg(VR128) -> DestReg(GR64)
+ // SrcReg(VR64) -> DestReg(GR64)
+ // SrcReg(GR64) -> DestReg(VR128)
+ // SrcReg(GR64) -> DestReg(VR64)
+
+ bool HasAVX = Subtarget.hasAVX();
+ bool HasAVX512 = Subtarget.hasAVX512();
+ if (X86::GR64RegClass.contains(DestReg)) {
+ if (X86::VR128XRegClass.contains(SrcReg))
+ // Copy from a VR128 register to a GR64 register.
+ return HasAVX512 ? X86::VMOVPQIto64Zrr: (HasAVX ? X86::VMOVPQIto64rr :
+ X86::MOVPQIto64rr);
+ if (X86::VR64RegClass.contains(SrcReg))
+ // Copy from a VR64 register to a GR64 register.
+ return X86::MMX_MOVD64from64rr;
+ } else if (X86::GR64RegClass.contains(SrcReg)) {
+ // Copy from a GR64 register to a VR128 register.
+ if (X86::VR128XRegClass.contains(DestReg))
+ return HasAVX512 ? X86::VMOV64toPQIZrr: (HasAVX ? X86::VMOV64toPQIrr :
+ X86::MOV64toPQIrr);
+ // Copy from a GR64 register to a VR64 register.
+ if (X86::VR64RegClass.contains(DestReg))
+ return X86::MMX_MOVD64to64rr;
+ }
+
+ // SrcReg(FR32) -> DestReg(GR32)
+ // SrcReg(GR32) -> DestReg(FR32)
+
+ if (X86::GR32RegClass.contains(DestReg) && X86::FR32XRegClass.contains(SrcReg))
+ // Copy from a FR32 register to a GR32 register.
+ return HasAVX512 ? X86::VMOVSS2DIZrr : (HasAVX ? X86::VMOVSS2DIrr : X86::MOVSS2DIrr);
+
+ if (X86::FR32XRegClass.contains(DestReg) && X86::GR32RegClass.contains(SrcReg))
+ // Copy from a GR32 register to a FR32 register.
+ return HasAVX512 ? X86::VMOVDI2SSZrr : (HasAVX ? X86::VMOVDI2SSrr : X86::MOVDI2SSrr);
+ return 0;
+}
+
+static bool MaskRegClassContains(unsigned Reg) {
+ return X86::VK8RegClass.contains(Reg) ||
+ X86::VK16RegClass.contains(Reg) ||
+ X86::VK32RegClass.contains(Reg) ||
+ X86::VK64RegClass.contains(Reg) ||
+ X86::VK1RegClass.contains(Reg);
+}
+
+static bool GRRegClassContains(unsigned Reg) {
+ return X86::GR64RegClass.contains(Reg) ||
+ X86::GR32RegClass.contains(Reg) ||
+ X86::GR16RegClass.contains(Reg) ||
+ X86::GR8RegClass.contains(Reg);
+}
+
+static unsigned copyPhysRegOpcode_AVX512_DQ(unsigned &DestReg,
+ unsigned &SrcReg) {
+ if (MaskRegClassContains(SrcReg) && X86::GR8RegClass.contains(DestReg)) {
+ DestReg = getX86SubSuperRegister(DestReg, 32);
+ return X86::KMOVBrk;
+ }
+ if (MaskRegClassContains(DestReg) && X86::GR8RegClass.contains(SrcReg)) {
+ SrcReg = getX86SubSuperRegister(SrcReg, 32);
+ return X86::KMOVBkr;
+ }
+ return 0;
+}
+
+static unsigned copyPhysRegOpcode_AVX512_BW(unsigned &DestReg,
+ unsigned &SrcReg) {
+ if (MaskRegClassContains(SrcReg) && MaskRegClassContains(DestReg))
+ return X86::KMOVQkk;
+ if (MaskRegClassContains(SrcReg) && X86::GR32RegClass.contains(DestReg))
+ return X86::KMOVDrk;
+ if (MaskRegClassContains(SrcReg) && X86::GR64RegClass.contains(DestReg))
+ return X86::KMOVQrk;
+ if (MaskRegClassContains(DestReg) && X86::GR32RegClass.contains(SrcReg))
+ return X86::KMOVDkr;
+ if (MaskRegClassContains(DestReg) && X86::GR64RegClass.contains(SrcReg))
+ return X86::KMOVQkr;
+ return 0;
+}
+
+static unsigned copyPhysRegOpcode_AVX512(unsigned &DestReg, unsigned &SrcReg,
+ const X86Subtarget &Subtarget) {
+ if (Subtarget.hasDQI())
+ if (auto Opc = copyPhysRegOpcode_AVX512_DQ(DestReg, SrcReg))
+ return Opc;
+ if (Subtarget.hasBWI())
+ if (auto Opc = copyPhysRegOpcode_AVX512_BW(DestReg, SrcReg))
+ return Opc;
+ if (X86::VR128XRegClass.contains(DestReg, SrcReg) ||
+ X86::VR256XRegClass.contains(DestReg, SrcReg) ||
+ X86::VR512RegClass.contains(DestReg, SrcReg)) {
+ DestReg = get512BitSuperRegister(DestReg);
+ SrcReg = get512BitSuperRegister(SrcReg);
+ return X86::VMOVAPSZrr;
+ }
+ if (MaskRegClassContains(DestReg) && MaskRegClassContains(SrcReg))
+ return X86::KMOVWkk;
+ if (MaskRegClassContains(DestReg) && GRRegClassContains(SrcReg)) {
+ SrcReg = getX86SubSuperRegister(SrcReg, 32);
+ return X86::KMOVWkr;
+ }
+ if (GRRegClassContains(DestReg) && MaskRegClassContains(SrcReg)) {
+ DestReg = getX86SubSuperRegister(DestReg, 32);
+ return X86::KMOVWrk;
+ }
+ return 0;
+}
+
+void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI, DebugLoc DL,
+ unsigned DestReg, unsigned SrcReg,
+ bool KillSrc) const {
+ // First deal with the normal symmetric copies.
+ bool HasAVX = Subtarget.hasAVX();
+ bool HasAVX512 = Subtarget.hasAVX512();
+ unsigned Opc = 0;
+ if (X86::GR64RegClass.contains(DestReg, SrcReg))
+ Opc = X86::MOV64rr;
+ else if (X86::GR32RegClass.contains(DestReg, SrcReg))
+ Opc = X86::MOV32rr;
+ else if (X86::GR16RegClass.contains(DestReg, SrcReg))
+ Opc = X86::MOV16rr;
+ else if (X86::GR8RegClass.contains(DestReg, SrcReg)) {
+ // Copying to or from a physical H register on x86-64 requires a NOREX
+ // move. Otherwise use a normal move.
+ if ((isHReg(DestReg) || isHReg(SrcReg)) &&
+ Subtarget.is64Bit()) {
+ Opc = X86::MOV8rr_NOREX;
+ // Both operands must be encodable without a REX prefix.
+ assert(X86::GR8_NOREXRegClass.contains(SrcReg, DestReg) &&
+ "8-bit H register can not be copied outside GR8_NOREX");
+ } else
+ Opc = X86::MOV8rr;
+ }
+ else if (X86::VR64RegClass.contains(DestReg, SrcReg))
+ Opc = X86::MMX_MOVQ64rr;
+ else if (HasAVX512)
+ Opc = copyPhysRegOpcode_AVX512(DestReg, SrcReg, Subtarget);
+ else if (X86::VR128RegClass.contains(DestReg, SrcReg))
+ Opc = HasAVX ? X86::VMOVAPSrr : X86::MOVAPSrr;
+ else if (X86::VR256RegClass.contains(DestReg, SrcReg))
+ Opc = X86::VMOVAPSYrr;
+ if (!Opc)
+ Opc = CopyToFromAsymmetricReg(DestReg, SrcReg, Subtarget);
+
+ if (Opc) {
+ BuildMI(MBB, MI, DL, get(Opc), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ return;
+ }
+
+ bool FromEFLAGS = SrcReg == X86::EFLAGS;
+ bool ToEFLAGS = DestReg == X86::EFLAGS;
+ int Reg = FromEFLAGS ? DestReg : SrcReg;
+ bool is32 = X86::GR32RegClass.contains(Reg);
+ bool is64 = X86::GR64RegClass.contains(Reg);
+
+ if ((FromEFLAGS || ToEFLAGS) && (is32 || is64)) {
+ int Mov = is64 ? X86::MOV64rr : X86::MOV32rr;
+ int Push = is64 ? X86::PUSH64r : X86::PUSH32r;
+ int PushF = is64 ? X86::PUSHF64 : X86::PUSHF32;
+ int Pop = is64 ? X86::POP64r : X86::POP32r;
+ int PopF = is64 ? X86::POPF64 : X86::POPF32;
+ int AX = is64 ? X86::RAX : X86::EAX;
+
+ if (!Subtarget.hasLAHFSAHF()) {
+ assert(Subtarget.is64Bit() &&
+ "Not having LAHF/SAHF only happens on 64-bit.");
+ // Moving EFLAGS to/from another register requires a push and a pop.
+ // Notice that we have to adjust the stack if we don't want to clobber the
+ // first frame index. See X86FrameLowering.cpp - usesTheStack.
+ if (FromEFLAGS) {
+ BuildMI(MBB, MI, DL, get(PushF));
+ BuildMI(MBB, MI, DL, get(Pop), DestReg);
+ }
+ if (ToEFLAGS) {
+ BuildMI(MBB, MI, DL, get(Push))
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ BuildMI(MBB, MI, DL, get(PopF));
+ }
+ return;
+ }
+
+ // The flags need to be saved, but saving EFLAGS with PUSHF/POPF is
+ // inefficient. Instead:
+ // - Save the overflow flag OF into AL using SETO, and restore it using a
+ // signed 8-bit addition of AL and INT8_MAX.
+ // - Save/restore the bottom 8 EFLAGS bits (CF, PF, AF, ZF, SF) to/from AH
+ // using LAHF/SAHF.
+ // - When RAX/EAX is live and isn't the destination register, make sure it
+ // isn't clobbered by PUSH/POP'ing it before and after saving/restoring
+ // the flags.
+ // This approach is ~2.25x faster than using PUSHF/POPF.
+ //
+ // This is still somewhat inefficient because we don't know which flags are
+ // actually live inside EFLAGS. Were we able to do a single SETcc instead of
+ // SETO+LAHF / ADDB+SAHF the code could be 1.02x faster.
+ //
+ // PUSHF/POPF is also potentially incorrect because it affects other flags
+ // such as TF/IF/DF, which LLVM doesn't model.
+ //
+ // Notice that we have to adjust the stack if we don't want to clobber the
+ // first frame index.
+ // See X86ISelLowering.cpp - X86::hasCopyImplyingStackAdjustment.
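+ //
+ // As a rough illustration (not the verbatim emitted code), copying EFLAGS
+ // into a GR64 register with this scheme looks like:
+ //
+ //   pushq %rax        # only if RAX is live here
+ //   seto  %al
+ //   lahf
+ //   movq  %rax, %reg
+ //   popq  %rax        # only if RAX was live
+ //
+ // and the restore direction uses "movq %reg, %rax; addb $127, %al; sahf".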
+
+ bool AXDead = (Reg == AX) ||
+ (MachineBasicBlock::LQR_Dead ==
+ MBB.computeRegisterLiveness(&getRegisterInfo(), AX, MI));
+ if (!AXDead) {
+ // FIXME: If computeRegisterLiveness() reported LQR_Unknown then AX may
+ // actually be dead. This is not a problem for correctness as we are just
+ // (unnecessarily) saving+restoring a dead register. However the
+ // MachineVerifier expects operands that read from dead registers
+ // to be marked with the "undef" flag.
+ // An example of this can be found in
+ // test/CodeGen/X86/peephole-na-phys-copy-folding.ll and
+ // test/CodeGen/X86/cmpxchg-clobber-flags.ll when using
+ // -verify-machineinstrs.
+ BuildMI(MBB, MI, DL, get(Push)).addReg(AX, getKillRegState(true));
+ }
+ if (FromEFLAGS) {
+ BuildMI(MBB, MI, DL, get(X86::SETOr), X86::AL);
+ BuildMI(MBB, MI, DL, get(X86::LAHF));
+ BuildMI(MBB, MI, DL, get(Mov), Reg).addReg(AX);
+ }
+ if (ToEFLAGS) {
+ BuildMI(MBB, MI, DL, get(Mov), AX).addReg(Reg, getKillRegState(KillSrc));
+ BuildMI(MBB, MI, DL, get(X86::ADD8ri), X86::AL)
+ .addReg(X86::AL)
+ .addImm(INT8_MAX);
+ BuildMI(MBB, MI, DL, get(X86::SAHF));
+ }
+ if (!AXDead)
+ BuildMI(MBB, MI, DL, get(Pop), AX);
+ return;
+ }
+
+ DEBUG(dbgs() << "Cannot copy " << RI.getName(SrcReg)
+ << " to " << RI.getName(DestReg) << '\n');
+ llvm_unreachable("Cannot emit physreg copy instruction");
+}
+
+static unsigned getLoadStoreRegOpcode(unsigned Reg,
+ const TargetRegisterClass *RC,
+ bool isStackAligned,
+ const X86Subtarget &STI,
+ bool load) {
+ if (STI.hasAVX512()) {
+ if (X86::VK8RegClass.hasSubClassEq(RC) ||
+ X86::VK16RegClass.hasSubClassEq(RC))
+ return load ? X86::KMOVWkm : X86::KMOVWmk;
+ if (RC->getSize() == 4 && X86::FR32XRegClass.hasSubClassEq(RC))
+ return load ? X86::VMOVSSZrm : X86::VMOVSSZmr;
+ if (RC->getSize() == 8 && X86::FR64XRegClass.hasSubClassEq(RC))
+ return load ? X86::VMOVSDZrm : X86::VMOVSDZmr;
+ if (X86::VR512RegClass.hasSubClassEq(RC))
+ return load ? X86::VMOVUPSZrm : X86::VMOVUPSZmr;
+ }
+
+ bool HasAVX = STI.hasAVX();
+ switch (RC->getSize()) {
+ default:
+ llvm_unreachable("Unknown spill size");
+ case 1:
+ assert(X86::GR8RegClass.hasSubClassEq(RC) && "Unknown 1-byte regclass");
+ if (STI.is64Bit())
+ // Loading or storing a physical H register on x86-64 requires a NOREX
+ // move. Otherwise use a normal move.
+ if (isHReg(Reg) || X86::GR8_ABCD_HRegClass.hasSubClassEq(RC))
+ return load ? X86::MOV8rm_NOREX : X86::MOV8mr_NOREX;
+ return load ? X86::MOV8rm : X86::MOV8mr;
+ case 2:
+ assert(X86::GR16RegClass.hasSubClassEq(RC) && "Unknown 2-byte regclass");
+ return load ? X86::MOV16rm : X86::MOV16mr;
+ case 4:
+ if (X86::GR32RegClass.hasSubClassEq(RC))
+ return load ? X86::MOV32rm : X86::MOV32mr;
+ if (X86::FR32RegClass.hasSubClassEq(RC))
+ return load ?
+ (HasAVX ? X86::VMOVSSrm : X86::MOVSSrm) :
+ (HasAVX ? X86::VMOVSSmr : X86::MOVSSmr);
+ if (X86::RFP32RegClass.hasSubClassEq(RC))
+ return load ? X86::LD_Fp32m : X86::ST_Fp32m;
+ llvm_unreachable("Unknown 4-byte regclass");
+ case 8:
+ if (X86::GR64RegClass.hasSubClassEq(RC))
+ return load ? X86::MOV64rm : X86::MOV64mr;
+ if (X86::FR64RegClass.hasSubClassEq(RC))
+ return load ?
+ (HasAVX ? X86::VMOVSDrm : X86::MOVSDrm) :
+ (HasAVX ? X86::VMOVSDmr : X86::MOVSDmr);
+ if (X86::VR64RegClass.hasSubClassEq(RC))
+ return load ? X86::MMX_MOVQ64rm : X86::MMX_MOVQ64mr;
+ if (X86::RFP64RegClass.hasSubClassEq(RC))
+ return load ? X86::LD_Fp64m : X86::ST_Fp64m;
+ llvm_unreachable("Unknown 8-byte regclass");
+ case 10:
+ assert(X86::RFP80RegClass.hasSubClassEq(RC) && "Unknown 10-byte regclass");
+ return load ? X86::LD_Fp80m : X86::ST_FpP80m;
+ case 16: {
+ assert((X86::VR128RegClass.hasSubClassEq(RC) ||
+ X86::VR128XRegClass.hasSubClassEq(RC))&& "Unknown 16-byte regclass");
+ // If stack is realigned we can use aligned stores.
+ if (isStackAligned)
+ return load ?
+ (HasAVX ? X86::VMOVAPSrm : X86::MOVAPSrm) :
+ (HasAVX ? X86::VMOVAPSmr : X86::MOVAPSmr);
+ else
+ return load ?
+ (HasAVX ? X86::VMOVUPSrm : X86::MOVUPSrm) :
+ (HasAVX ? X86::VMOVUPSmr : X86::MOVUPSmr);
+ }
+ case 32:
+ assert((X86::VR256RegClass.hasSubClassEq(RC) ||
+ X86::VR256XRegClass.hasSubClassEq(RC)) && "Unknown 32-byte regclass");
+ // If stack is realigned we can use aligned stores.
+ if (isStackAligned)
+ return load ? X86::VMOVAPSYrm : X86::VMOVAPSYmr;
+ else
+ return load ? X86::VMOVUPSYrm : X86::VMOVUPSYmr;
+ case 64:
+ assert(X86::VR512RegClass.hasSubClassEq(RC) && "Unknown 64-byte regclass");
+ if (isStackAligned)
+ return load ? X86::VMOVAPSZrm : X86::VMOVAPSZmr;
+ else
+ return load ? X86::VMOVUPSZrm : X86::VMOVUPSZmr;
+ }
+}
+
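+// For example (illustrative only): "movl 24(%rsp), %eax" yields BaseReg = RSP
+// and Offset = 24, while "movl (%rax,%rbx,4), %ecx" is rejected because it
+// uses a scaled index register, as is any symbolic displacement.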
+bool X86InstrInfo::getMemOpBaseRegImmOfs(MachineInstr *MemOp, unsigned &BaseReg,
+ unsigned &Offset,
+ const TargetRegisterInfo *TRI) const {
+ const MCInstrDesc &Desc = MemOp->getDesc();
+ int MemRefBegin = X86II::getMemoryOperandNo(Desc.TSFlags, MemOp->getOpcode());
+ if (MemRefBegin < 0)
+ return false;
+
+ MemRefBegin += X86II::getOperandBias(Desc);
+
+ BaseReg = MemOp->getOperand(MemRefBegin + X86::AddrBaseReg).getReg();
+ if (MemOp->getOperand(MemRefBegin + X86::AddrScaleAmt).getImm() != 1)
+ return false;
+
+ if (MemOp->getOperand(MemRefBegin + X86::AddrIndexReg).getReg() !=
+ X86::NoRegister)
+ return false;
+
+ const MachineOperand &DispMO = MemOp->getOperand(MemRefBegin + X86::AddrDisp);
+
+ // Displacement can be symbolic
+ if (!DispMO.isImm())
+ return false;
+
+ Offset = DispMO.getImm();
+
+ // The index register was already verified to be absent above.
+ return true;
+}
+
+static unsigned getStoreRegOpcode(unsigned SrcReg,
+ const TargetRegisterClass *RC,
+ bool isStackAligned,
+ const X86Subtarget &STI) {
+ return getLoadStoreRegOpcode(SrcReg, RC, isStackAligned, STI, false);
+}
+
+static unsigned getLoadRegOpcode(unsigned DestReg,
+ const TargetRegisterClass *RC,
+ bool isStackAligned,
+ const X86Subtarget &STI) {
+ return getLoadStoreRegOpcode(DestReg, RC, isStackAligned, STI, true);
+}
+
+void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned SrcReg, bool isKill, int FrameIdx,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const {
+ const MachineFunction &MF = *MBB.getParent();
+ assert(MF.getFrameInfo()->getObjectSize(FrameIdx) >= RC->getSize() &&
+ "Stack slot too small for store");
+ unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16);
+ bool isAligned =
+ (Subtarget.getFrameLowering()->getStackAlignment() >= Alignment) ||
+ RI.canRealignStack(MF);
+ unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, Subtarget);
+ DebugLoc DL = MBB.findDebugLoc(MI);
+ addFrameReference(BuildMI(MBB, MI, DL, get(Opc)), FrameIdx)
+ .addReg(SrcReg, getKillRegState(isKill));
+}
+
+void X86InstrInfo::storeRegToAddr(MachineFunction &MF, unsigned SrcReg,
+ bool isKill,
+ SmallVectorImpl<MachineOperand> &Addr,
+ const TargetRegisterClass *RC,
+ MachineInstr::mmo_iterator MMOBegin,
+ MachineInstr::mmo_iterator MMOEnd,
+ SmallVectorImpl<MachineInstr*> &NewMIs) const {
+ unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16);
+ bool isAligned = MMOBegin != MMOEnd &&
+ (*MMOBegin)->getAlignment() >= Alignment;
+ unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, Subtarget);
+ DebugLoc DL;
+ MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc));
+ for (unsigned i = 0, e = Addr.size(); i != e; ++i)
+ MIB.addOperand(Addr[i]);
+ MIB.addReg(SrcReg, getKillRegState(isKill));
+ (*MIB).setMemRefs(MMOBegin, MMOEnd);
+ NewMIs.push_back(MIB);
+}
+
+void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned DestReg, int FrameIdx,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const {
+ const MachineFunction &MF = *MBB.getParent();
+ unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16);
+ bool isAligned =
+ (Subtarget.getFrameLowering()->getStackAlignment() >= Alignment) ||
+ RI.canRealignStack(MF);
+ unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, Subtarget);
+ DebugLoc DL = MBB.findDebugLoc(MI);
+ addFrameReference(BuildMI(MBB, MI, DL, get(Opc), DestReg), FrameIdx);
+}
+
+void X86InstrInfo::loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
+ SmallVectorImpl<MachineOperand> &Addr,
+ const TargetRegisterClass *RC,
+ MachineInstr::mmo_iterator MMOBegin,
+ MachineInstr::mmo_iterator MMOEnd,
+ SmallVectorImpl<MachineInstr*> &NewMIs) const {
+ unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16);
+ bool isAligned = MMOBegin != MMOEnd &&
+ (*MMOBegin)->getAlignment() >= Alignment;
+ unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, Subtarget);
+ DebugLoc DL;
+ MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc), DestReg);
+ for (unsigned i = 0, e = Addr.size(); i != e; ++i)
+ MIB.addOperand(Addr[i]);
+ (*MIB).setMemRefs(MMOBegin, MMOEnd);
+ NewMIs.push_back(MIB);
+}
+
+bool X86InstrInfo::
+analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, unsigned &SrcReg2,
+ int &CmpMask, int &CmpValue) const {
+ switch (MI->getOpcode()) {
+ default: break;
+ case X86::CMP64ri32:
+ case X86::CMP64ri8:
+ case X86::CMP32ri:
+ case X86::CMP32ri8:
+ case X86::CMP16ri:
+ case X86::CMP16ri8:
+ case X86::CMP8ri:
+ SrcReg = MI->getOperand(0).getReg();
+ SrcReg2 = 0;
+ CmpMask = ~0;
+ CmpValue = MI->getOperand(1).getImm();
+ return true;
+ // A SUB can be used to perform a comparison.
+ case X86::SUB64rm:
+ case X86::SUB32rm:
+ case X86::SUB16rm:
+ case X86::SUB8rm:
+ SrcReg = MI->getOperand(1).getReg();
+ SrcReg2 = 0;
+ CmpMask = ~0;
+ CmpValue = 0;
+ return true;
+ case X86::SUB64rr:
+ case X86::SUB32rr:
+ case X86::SUB16rr:
+ case X86::SUB8rr:
+ SrcReg = MI->getOperand(1).getReg();
+ SrcReg2 = MI->getOperand(2).getReg();
+ CmpMask = ~0;
+ CmpValue = 0;
+ return true;
+ case X86::SUB64ri32:
+ case X86::SUB64ri8:
+ case X86::SUB32ri:
+ case X86::SUB32ri8:
+ case X86::SUB16ri:
+ case X86::SUB16ri8:
+ case X86::SUB8ri:
+ SrcReg = MI->getOperand(1).getReg();
+ SrcReg2 = 0;
+ CmpMask = ~0;
+ CmpValue = MI->getOperand(2).getImm();
+ return true;
+ case X86::CMP64rr:
+ case X86::CMP32rr:
+ case X86::CMP16rr:
+ case X86::CMP8rr:
+ SrcReg = MI->getOperand(0).getReg();
+ SrcReg2 = MI->getOperand(1).getReg();
+ CmpMask = ~0;
+ CmpValue = 0;
+ return true;
+ case X86::TEST8rr:
+ case X86::TEST16rr:
+ case X86::TEST32rr:
+ case X86::TEST64rr:
+ SrcReg = MI->getOperand(0).getReg();
+ if (MI->getOperand(1).getReg() != SrcReg) return false;
+ // Compare against zero.
+ SrcReg2 = 0;
+ CmpMask = ~0;
+ CmpValue = 0;
+ return true;
+ }
+ return false;
+}
+
+/// Check whether the first instruction, whose only
+/// purpose is to update flags, can be made redundant.
+/// CMPrr can be made redundant by SUBrr if the operands are the same.
+/// This function can be extended later on.
+/// SrcReg, SrcReg2: register operands for FlagI.
+/// ImmValue: immediate for FlagI if it takes an immediate.
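+/// For example (sketch): in "subl %esi, %edi; ...; cmpl %esi, %edi" the CMP
+/// recomputes exactly the flags the SUB already produced, so it can be
+/// removed; with swapped operands, the users' condition codes are swapped
+/// instead (see optimizeCompareInstr).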
+inline static bool isRedundantFlagInstr(MachineInstr *FlagI, unsigned SrcReg,
+ unsigned SrcReg2, int ImmValue,
+ MachineInstr *OI) {
+ if (((FlagI->getOpcode() == X86::CMP64rr &&
+ OI->getOpcode() == X86::SUB64rr) ||
+ (FlagI->getOpcode() == X86::CMP32rr &&
+ OI->getOpcode() == X86::SUB32rr)||
+ (FlagI->getOpcode() == X86::CMP16rr &&
+ OI->getOpcode() == X86::SUB16rr)||
+ (FlagI->getOpcode() == X86::CMP8rr &&
+ OI->getOpcode() == X86::SUB8rr)) &&
+ ((OI->getOperand(1).getReg() == SrcReg &&
+ OI->getOperand(2).getReg() == SrcReg2) ||
+ (OI->getOperand(1).getReg() == SrcReg2 &&
+ OI->getOperand(2).getReg() == SrcReg)))
+ return true;
+
+ if (((FlagI->getOpcode() == X86::CMP64ri32 &&
+ OI->getOpcode() == X86::SUB64ri32) ||
+ (FlagI->getOpcode() == X86::CMP64ri8 &&
+ OI->getOpcode() == X86::SUB64ri8) ||
+ (FlagI->getOpcode() == X86::CMP32ri &&
+ OI->getOpcode() == X86::SUB32ri) ||
+ (FlagI->getOpcode() == X86::CMP32ri8 &&
+ OI->getOpcode() == X86::SUB32ri8) ||
+ (FlagI->getOpcode() == X86::CMP16ri &&
+ OI->getOpcode() == X86::SUB16ri) ||
+ (FlagI->getOpcode() == X86::CMP16ri8 &&
+ OI->getOpcode() == X86::SUB16ri8) ||
+ (FlagI->getOpcode() == X86::CMP8ri &&
+ OI->getOpcode() == X86::SUB8ri)) &&
+ OI->getOperand(1).getReg() == SrcReg &&
+ OI->getOperand(2).getImm() == ImmValue)
+ return true;
+ return false;
+}
+
+/// Check whether the definition can be converted
+/// to remove a comparison against zero.
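+/// For example (sketch): in "andl %ebx, %eax; testl %eax, %eax; je L" the AND
+/// already sets ZF (and SF) from its result, so the TEST is redundant and the
+/// JE can use the AND's flags directly.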
+inline static bool isDefConvertible(MachineInstr *MI) {
+ switch (MI->getOpcode()) {
+ default: return false;
+
+ // The shift instructions only modify ZF if their shift count is non-zero.
+ // N.B.: The processor truncates the shift count depending on the encoding.
+ case X86::SAR8ri: case X86::SAR16ri: case X86::SAR32ri:case X86::SAR64ri:
+ case X86::SHR8ri: case X86::SHR16ri: case X86::SHR32ri:case X86::SHR64ri:
+ return getTruncatedShiftCount(MI, 2) != 0;
+
+ // Some left shift instructions can be turned into LEA instructions but only
+ // if their flags aren't used. Avoid transforming such instructions.
+ case X86::SHL8ri: case X86::SHL16ri: case X86::SHL32ri:case X86::SHL64ri:{
+ unsigned ShAmt = getTruncatedShiftCount(MI, 2);
+ if (isTruncatedShiftCountForLEA(ShAmt)) return false;
+ return ShAmt != 0;
+ }
+
+ case X86::SHRD16rri8:case X86::SHRD32rri8:case X86::SHRD64rri8:
+ case X86::SHLD16rri8:case X86::SHLD32rri8:case X86::SHLD64rri8:
+ return getTruncatedShiftCount(MI, 3) != 0;
+
+ case X86::SUB64ri32: case X86::SUB64ri8: case X86::SUB32ri:
+ case X86::SUB32ri8: case X86::SUB16ri: case X86::SUB16ri8:
+ case X86::SUB8ri: case X86::SUB64rr: case X86::SUB32rr:
+ case X86::SUB16rr: case X86::SUB8rr: case X86::SUB64rm:
+ case X86::SUB32rm: case X86::SUB16rm: case X86::SUB8rm:
+ case X86::DEC64r: case X86::DEC32r: case X86::DEC16r: case X86::DEC8r:
+ case X86::ADD64ri32: case X86::ADD64ri8: case X86::ADD32ri:
+ case X86::ADD32ri8: case X86::ADD16ri: case X86::ADD16ri8:
+ case X86::ADD8ri: case X86::ADD64rr: case X86::ADD32rr:
+ case X86::ADD16rr: case X86::ADD8rr: case X86::ADD64rm:
+ case X86::ADD32rm: case X86::ADD16rm: case X86::ADD8rm:
+ case X86::INC64r: case X86::INC32r: case X86::INC16r: case X86::INC8r:
+ case X86::AND64ri32: case X86::AND64ri8: case X86::AND32ri:
+ case X86::AND32ri8: case X86::AND16ri: case X86::AND16ri8:
+ case X86::AND8ri: case X86::AND64rr: case X86::AND32rr:
+ case X86::AND16rr: case X86::AND8rr: case X86::AND64rm:
+ case X86::AND32rm: case X86::AND16rm: case X86::AND8rm:
+ case X86::XOR64ri32: case X86::XOR64ri8: case X86::XOR32ri:
+ case X86::XOR32ri8: case X86::XOR16ri: case X86::XOR16ri8:
+ case X86::XOR8ri: case X86::XOR64rr: case X86::XOR32rr:
+ case X86::XOR16rr: case X86::XOR8rr: case X86::XOR64rm:
+ case X86::XOR32rm: case X86::XOR16rm: case X86::XOR8rm:
+ case X86::OR64ri32: case X86::OR64ri8: case X86::OR32ri:
+ case X86::OR32ri8: case X86::OR16ri: case X86::OR16ri8:
+ case X86::OR8ri: case X86::OR64rr: case X86::OR32rr:
+ case X86::OR16rr: case X86::OR8rr: case X86::OR64rm:
+ case X86::OR32rm: case X86::OR16rm: case X86::OR8rm:
+ case X86::NEG8r: case X86::NEG16r: case X86::NEG32r: case X86::NEG64r:
+ case X86::SAR8r1: case X86::SAR16r1: case X86::SAR32r1:case X86::SAR64r1:
+ case X86::SHR8r1: case X86::SHR16r1: case X86::SHR32r1:case X86::SHR64r1:
+ case X86::SHL8r1: case X86::SHL16r1: case X86::SHL32r1:case X86::SHL64r1:
+ case X86::ADC32ri: case X86::ADC32ri8:
+ case X86::ADC32rr: case X86::ADC64ri32:
+ case X86::ADC64ri8: case X86::ADC64rr:
+ case X86::SBB32ri: case X86::SBB32ri8:
+ case X86::SBB32rr: case X86::SBB64ri32:
+ case X86::SBB64ri8: case X86::SBB64rr:
+ case X86::ANDN32rr: case X86::ANDN32rm:
+ case X86::ANDN64rr: case X86::ANDN64rm:
+ case X86::BEXTR32rr: case X86::BEXTR64rr:
+ case X86::BEXTR32rm: case X86::BEXTR64rm:
+ case X86::BLSI32rr: case X86::BLSI32rm:
+ case X86::BLSI64rr: case X86::BLSI64rm:
+ case X86::BLSMSK32rr:case X86::BLSMSK32rm:
+ case X86::BLSMSK64rr:case X86::BLSMSK64rm:
+ case X86::BLSR32rr: case X86::BLSR32rm:
+ case X86::BLSR64rr: case X86::BLSR64rm:
+ case X86::BZHI32rr: case X86::BZHI32rm:
+ case X86::BZHI64rr: case X86::BZHI64rm:
+ case X86::LZCNT16rr: case X86::LZCNT16rm:
+ case X86::LZCNT32rr: case X86::LZCNT32rm:
+ case X86::LZCNT64rr: case X86::LZCNT64rm:
+ case X86::POPCNT16rr:case X86::POPCNT16rm:
+ case X86::POPCNT32rr:case X86::POPCNT32rm:
+ case X86::POPCNT64rr:case X86::POPCNT64rm:
+ case X86::TZCNT16rr: case X86::TZCNT16rm:
+ case X86::TZCNT32rr: case X86::TZCNT32rm:
+ case X86::TZCNT64rr: case X86::TZCNT64rm:
+ return true;
+ }
+}
+
+/// Check whether the use can be converted to remove a comparison against zero.
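+/// For example (sketch): "popcntl %esi, %eax" sets ZF exactly when %esi is
+/// zero, so a later "testl %esi, %esi; je L" can drop the TEST and branch on
+/// POPCNT's ZF (COND_E); LZCNT and TZCNT set CF on a zero input, hence
+/// COND_B.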
+static X86::CondCode isUseDefConvertible(MachineInstr *MI) {
+ switch (MI->getOpcode()) {
+ default: return X86::COND_INVALID;
+ case X86::LZCNT16rr: case X86::LZCNT16rm:
+ case X86::LZCNT32rr: case X86::LZCNT32rm:
+ case X86::LZCNT64rr: case X86::LZCNT64rm:
+ return X86::COND_B;
+ case X86::POPCNT16rr:case X86::POPCNT16rm:
+ case X86::POPCNT32rr:case X86::POPCNT32rm:
+ case X86::POPCNT64rr:case X86::POPCNT64rm:
+ return X86::COND_E;
+ case X86::TZCNT16rr: case X86::TZCNT16rm:
+ case X86::TZCNT32rr: case X86::TZCNT32rm:
+ case X86::TZCNT64rr: case X86::TZCNT64rm:
+ return X86::COND_B;
+ }
+}
+
+/// Check if there exists an earlier instruction that
+/// operates on the same source operands and sets flags in the same way as
+/// Compare; remove Compare if possible.
+bool X86InstrInfo::
+optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
+ int CmpMask, int CmpValue,
+ const MachineRegisterInfo *MRI) const {
+ // Check whether we can replace SUB with CMP.
+ unsigned NewOpcode = 0;
+ switch (CmpInstr->getOpcode()) {
+ default: break;
+ case X86::SUB64ri32:
+ case X86::SUB64ri8:
+ case X86::SUB32ri:
+ case X86::SUB32ri8:
+ case X86::SUB16ri:
+ case X86::SUB16ri8:
+ case X86::SUB8ri:
+ case X86::SUB64rm:
+ case X86::SUB32rm:
+ case X86::SUB16rm:
+ case X86::SUB8rm:
+ case X86::SUB64rr:
+ case X86::SUB32rr:
+ case X86::SUB16rr:
+ case X86::SUB8rr: {
+ if (!MRI->use_nodbg_empty(CmpInstr->getOperand(0).getReg()))
+ return false;
+ // The destination register is not used, so we can replace SUB with CMP.
+ switch (CmpInstr->getOpcode()) {
+ default: llvm_unreachable("Unreachable!");
+ case X86::SUB64rm: NewOpcode = X86::CMP64rm; break;
+ case X86::SUB32rm: NewOpcode = X86::CMP32rm; break;
+ case X86::SUB16rm: NewOpcode = X86::CMP16rm; break;
+ case X86::SUB8rm: NewOpcode = X86::CMP8rm; break;
+ case X86::SUB64rr: NewOpcode = X86::CMP64rr; break;
+ case X86::SUB32rr: NewOpcode = X86::CMP32rr; break;
+ case X86::SUB16rr: NewOpcode = X86::CMP16rr; break;
+ case X86::SUB8rr: NewOpcode = X86::CMP8rr; break;
+ case X86::SUB64ri32: NewOpcode = X86::CMP64ri32; break;
+ case X86::SUB64ri8: NewOpcode = X86::CMP64ri8; break;
+ case X86::SUB32ri: NewOpcode = X86::CMP32ri; break;
+ case X86::SUB32ri8: NewOpcode = X86::CMP32ri8; break;
+ case X86::SUB16ri: NewOpcode = X86::CMP16ri; break;
+ case X86::SUB16ri8: NewOpcode = X86::CMP16ri8; break;
+ case X86::SUB8ri: NewOpcode = X86::CMP8ri; break;
+ }
+ CmpInstr->setDesc(get(NewOpcode));
+ CmpInstr->RemoveOperand(0);
+ // Fall through to optimize Cmp if Cmp is CMPrr or CMPri.
+ if (NewOpcode == X86::CMP64rm || NewOpcode == X86::CMP32rm ||
+ NewOpcode == X86::CMP16rm || NewOpcode == X86::CMP8rm)
+ return false;
+ }
+ }
+
+ // Get the unique definition of SrcReg.
+ MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg);
+ if (!MI) return false;
+
+ // I points at CmpInstr; Def points at the instruction defining SrcReg.
+ MachineBasicBlock::iterator I = CmpInstr, Def = MI;
+
+ // If we are comparing against zero, check whether we can use MI to update
+ // EFLAGS. If MI is not in the same BB as CmpInstr, do not optimize.
+ bool IsCmpZero = (SrcReg2 == 0 && CmpValue == 0);
+ if (IsCmpZero && MI->getParent() != CmpInstr->getParent())
+ return false;
+
+ // If we have a use of the source register between the def and our compare
+ // instruction we can eliminate the compare iff the use sets EFLAGS in the
+ // right way.
+ bool ShouldUpdateCC = false;
+ X86::CondCode NewCC = X86::COND_INVALID;
+ if (IsCmpZero && !isDefConvertible(MI)) {
+ // Scan forward from the def until we hit the use we're looking for or the
+ // compare instruction.
+ for (MachineBasicBlock::iterator J = MI;; ++J) {
+ // Do we have a convertible instruction?
+ NewCC = isUseDefConvertible(J);
+ if (NewCC != X86::COND_INVALID && J->getOperand(1).isReg() &&
+ J->getOperand(1).getReg() == SrcReg) {
+ assert(J->definesRegister(X86::EFLAGS) && "Must be an EFLAGS def!");
+ ShouldUpdateCC = true; // Update CC later on.
+ // This is not a def of SrcReg, but still a def of EFLAGS. Keep going
+ // with the new def.
+ MI = Def = J;
+ break;
+ }
+
+ if (J == I)
+ return false;
+ }
+ }
+
+ // We are searching for an earlier instruction that can make CmpInstr
+ // redundant; that instruction will be saved in Sub.
+ MachineInstr *Sub = nullptr;
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+
+ // We iterate backward, starting from the instruction before CmpInstr, and
+ // stop when we reach the definition of a source register or run out of
+ // instructions in the BB.
+ // RI points to the instruction before CmpInstr.
+ // If the definition is in this basic block, RE points to the definition;
+ // otherwise, RE is the rend of the basic block.
+ MachineBasicBlock::reverse_iterator
+ RI = MachineBasicBlock::reverse_iterator(I),
+ RE = CmpInstr->getParent() == MI->getParent() ?
+ MachineBasicBlock::reverse_iterator(++Def) /* points to MI */ :
+ CmpInstr->getParent()->rend();
+ MachineInstr *Movr0Inst = nullptr;
+ for (; RI != RE; ++RI) {
+ MachineInstr *Instr = &*RI;
+ // Check whether CmpInstr can be made redundant by the current instruction.
+ if (!IsCmpZero &&
+ isRedundantFlagInstr(CmpInstr, SrcReg, SrcReg2, CmpValue, Instr)) {
+ Sub = Instr;
+ break;
+ }
+
+ if (Instr->modifiesRegister(X86::EFLAGS, TRI) ||
+ Instr->readsRegister(X86::EFLAGS, TRI)) {
+ // This instruction modifies or uses EFLAGS.
+
+ // MOV32r0 etc. are implemented with xor, which clobbers the condition
+ // codes. They are safe to move up if the definition of EFLAGS is dead and
+ // earlier instructions do not read or write EFLAGS.
+ if (!Movr0Inst && Instr->getOpcode() == X86::MOV32r0 &&
+ Instr->registerDefIsDead(X86::EFLAGS, TRI)) {
+ Movr0Inst = Instr;
+ continue;
+ }
+
+ // We can't remove CmpInstr.
+ return false;
+ }
+ }
+
+ // Return false if no candidates exist.
+ if (!IsCmpZero && !Sub)
+ return false;
+
+ bool IsSwapped = (SrcReg2 != 0 && Sub->getOperand(1).getReg() == SrcReg2 &&
+ Sub->getOperand(2).getReg() == SrcReg);
+
+ // Scan forward from the instruction after CmpInstr for uses of EFLAGS.
+ // It is safe to remove CmpInstr if EFLAGS is redefined or killed.
+ // If we are done with the basic block, we need to check whether EFLAGS is
+ // live-out.
+ bool IsSafe = false;
+ SmallVector<std::pair<MachineInstr*, unsigned /*NewOpc*/>, 4> OpsToUpdate;
+ MachineBasicBlock::iterator E = CmpInstr->getParent()->end();
+ for (++I; I != E; ++I) {
+ const MachineInstr &Instr = *I;
+ bool ModifyEFLAGS = Instr.modifiesRegister(X86::EFLAGS, TRI);
+ bool UseEFLAGS = Instr.readsRegister(X86::EFLAGS, TRI);
+ // An instruction that updates EFLAGS without reading it ends the region
+ // in which CmpInstr's flags could still be observed.
+ if (!UseEFLAGS && ModifyEFLAGS) {
+ // It is safe to remove CmpInstr if EFLAGS is updated again.
+ IsSafe = true;
+ break;
+ }
+ if (!UseEFLAGS && !ModifyEFLAGS)
+ continue;
+
+ // EFLAGS is used by this instruction.
+ X86::CondCode OldCC = X86::COND_INVALID;
+ bool OpcIsSET = false;
+ if (IsCmpZero || IsSwapped) {
+ // We decode the condition code from the opcode.
+ if (Instr.isBranch())
+ OldCC = getCondFromBranchOpc(Instr.getOpcode());
+ else {
+ OldCC = getCondFromSETOpc(Instr.getOpcode());
+ if (OldCC != X86::COND_INVALID)
+ OpcIsSET = true;
+ else
+ OldCC = X86::getCondFromCMovOpc(Instr.getOpcode());
+ }
+ if (OldCC == X86::COND_INVALID) return false;
+ }
+ if (IsCmpZero) {
+ switch (OldCC) {
+ default: break;
+ case X86::COND_A: case X86::COND_AE:
+ case X86::COND_B: case X86::COND_BE:
+ case X86::COND_G: case X86::COND_GE:
+ case X86::COND_L: case X86::COND_LE:
+ case X86::COND_O: case X86::COND_NO:
+ // CF and OF are used, we can't perform this optimization.
+ return false;
+ }
+
+ // If we're updating the condition code, check whether we have to reverse
+ // the condition.
+ if (ShouldUpdateCC)
+ switch (OldCC) {
+ default:
+ return false;
+ case X86::COND_E:
+ break;
+ case X86::COND_NE:
+ NewCC = GetOppositeBranchCondition(NewCC);
+ break;
+ }
+ } else if (IsSwapped) {
+ // If we have SUB(r1, r2) and CMP(r2, r1), the condition code needs
+ // to be changed from r2 > r1 to r1 < r2, from r2 < r1 to r1 > r2, etc.
+ // We swap the condition code and synthesize the new opcode.
+ NewCC = getSwappedCondition(OldCC);
+ if (NewCC == X86::COND_INVALID) return false;
+ }
+
+ if ((ShouldUpdateCC || IsSwapped) && NewCC != OldCC) {
+ // Synthesize the new opcode.
+ bool HasMemoryOperand = Instr.hasOneMemOperand();
+ unsigned NewOpc;
+ if (Instr.isBranch())
+ NewOpc = GetCondBranchFromCond(NewCC);
+ else if (OpcIsSET)
+ NewOpc = getSETFromCond(NewCC, HasMemoryOperand);
+ else {
+ unsigned DstReg = Instr.getOperand(0).getReg();
+ NewOpc = getCMovFromCond(NewCC, MRI->getRegClass(DstReg)->getSize(),
+ HasMemoryOperand);
+ }
+
+ // Push the MachineInstr to OpsToUpdate.
+ // If it is safe to remove CmpInstr, the condition code of these
+ // instructions will be modified.
+ OpsToUpdate.push_back(std::make_pair(&*I, NewOpc));
+ }
+ if (ModifyEFLAGS || Instr.killsRegister(X86::EFLAGS, TRI)) {
+ // It is safe to remove CmpInstr if EFLAGS is updated again or killed.
+ IsSafe = true;
+ break;
+ }
+ }
+
+ // If EFLAGS is not killed nor re-defined, we should check whether it is
+ // live-out. If it is live-out, do not optimize.
+ if ((IsCmpZero || IsSwapped) && !IsSafe) {
+ MachineBasicBlock *MBB = CmpInstr->getParent();
+ for (MachineBasicBlock *Successor : MBB->successors())
+ if (Successor->isLiveIn(X86::EFLAGS))
+ return false;
+ }
+
+ // The instruction to be updated is either Sub or MI.
+ Sub = IsCmpZero ? MI : Sub;
+ // Move Movr0Inst to the appropriate place before Sub.
+ if (Movr0Inst) {
+ // Look backwards until we find a def that doesn't use the current EFLAGS.
+ Def = Sub;
+ MachineBasicBlock::reverse_iterator
+ InsertI = MachineBasicBlock::reverse_iterator(++Def),
+ InsertE = Sub->getParent()->rend();
+ for (; InsertI != InsertE; ++InsertI) {
+ MachineInstr *Instr = &*InsertI;
+ if (!Instr->readsRegister(X86::EFLAGS, TRI) &&
+ Instr->modifiesRegister(X86::EFLAGS, TRI)) {
+ Sub->getParent()->remove(Movr0Inst);
+ Instr->getParent()->insert(MachineBasicBlock::iterator(Instr),
+ Movr0Inst);
+ break;
+ }
+ }
+ if (InsertI == InsertE)
+ return false;
+ }
+
+ // Make sure the Sub instruction defines EFLAGS and mark the def as live.
+ unsigned i = 0, e = Sub->getNumOperands();
+ for (; i != e; ++i) {
+ MachineOperand &MO = Sub->getOperand(i);
+ if (MO.isReg() && MO.isDef() && MO.getReg() == X86::EFLAGS) {
+ MO.setIsDead(false);
+ break;
+ }
+ }
+ assert(i != e && "Unable to locate a def EFLAGS operand");
+
+ CmpInstr->eraseFromParent();
+
+ // Modify the condition code of instructions in OpsToUpdate.
+ for (auto &Op : OpsToUpdate)
+ Op.first->setDesc(get(Op.second));
+ return true;
+}
+
+/// Try to remove the load by folding it to a register
+/// operand at the use. We fold the load if it defines a virtual register,
+/// the virtual register is used exactly once in the same BB, and the
+/// instructions in between do not load or store and have no side effects.
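+/// For example (sketch), with %vreg1 having a single use:
+///   %vreg1 = MOV32rm <mem>
+///   %vreg2 = ADD32rr %vreg0, %vreg1
+/// is folded into:
+///   %vreg2 = ADD32rm %vreg0, <mem>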
+MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr *MI,
+ const MachineRegisterInfo *MRI,
+ unsigned &FoldAsLoadDefReg,
+ MachineInstr *&DefMI) const {
+ if (FoldAsLoadDefReg == 0)
+ return nullptr;
+ // To be conservative, if there exists another load, clear the load candidate.
+ if (MI->mayLoad()) {
+ FoldAsLoadDefReg = 0;
+ return nullptr;
+ }
+
+ // Check whether we can move DefMI here.
+ DefMI = MRI->getVRegDef(FoldAsLoadDefReg);
+ assert(DefMI);
+ bool SawStore = false;
+ if (!DefMI->isSafeToMove(nullptr, SawStore))
+ return nullptr;
+
+ // Collect information about virtual register operands of MI.
+ unsigned SrcOperandId = 0;
+ bool FoundSrcOperand = false;
+ for (unsigned i = 0, e = MI->getDesc().getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI->getOperand(i);
+ if (!MO.isReg())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (Reg != FoldAsLoadDefReg)
+ continue;
+ // Do not fold if we have a subreg use or a def or multiple uses.
+ if (MO.getSubReg() || MO.isDef() || FoundSrcOperand)
+ return nullptr;
+
+ SrcOperandId = i;
+ FoundSrcOperand = true;
+ }
+ if (!FoundSrcOperand)
+ return nullptr;
+
+ // Check whether we can fold the def into SrcOperandId.
+ if (MachineInstr *FoldMI = foldMemoryOperand(MI, SrcOperandId, DefMI)) {
+ FoldAsLoadDefReg = 0;
+ return FoldMI;
+ }
+
+ return nullptr;
+}
+
+/// Expand a single-def pseudo instruction to a two-addr
+/// instruction with two undef reads of the register being defined.
+/// This is used for mapping:
+/// %xmm4 = V_SET0
+/// to:
+/// %xmm4 = PXORrr %xmm4<undef>, %xmm4<undef>
+///
+static bool Expand2AddrUndef(MachineInstrBuilder &MIB,
+ const MCInstrDesc &Desc) {
+ assert(Desc.getNumOperands() == 3 && "Expected two-addr instruction.");
+ unsigned Reg = MIB->getOperand(0).getReg();
+ MIB->setDesc(Desc);
+
+ // MachineInstr::addOperand() will insert explicit operands before any
+ // implicit operands.
+ MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef);
+ // But we don't trust that.
+ assert(MIB->getOperand(1).getReg() == Reg &&
+ MIB->getOperand(2).getReg() == Reg && "Misplaced operand");
+ return true;
+}
+
+/// Expand a single-def pseudo instruction to a two-addr
+/// instruction with two %k0 reads.
+/// This is used for mapping:
+/// %k4 = K_SET1
+/// to:
+/// %k4 = KXNORrr %k0, %k0
+static bool Expand2AddrKreg(MachineInstrBuilder &MIB,
+ const MCInstrDesc &Desc, unsigned Reg) {
+ assert(Desc.getNumOperands() == 3 && "Expected two-addr instruction.");
+ MIB->setDesc(Desc);
+ MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef);
+ return true;
+}
+
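+// Expand MOV32r1 / MOV32r_1: materialize +/-1 without encoding an immediate.
+// As a sketch, "%eax = MOV32r1" becomes "xorl %eax, %eax; incl %eax", and the
+// MinusOne variant ends with decl instead.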
+static bool expandMOV32r1(MachineInstrBuilder &MIB, const TargetInstrInfo &TII,
+ bool MinusOne) {
+ MachineBasicBlock &MBB = *MIB->getParent();
+ DebugLoc DL = MIB->getDebugLoc();
+ unsigned Reg = MIB->getOperand(0).getReg();
+
+ // Insert the XOR.
+ BuildMI(MBB, MIB.getInstr(), DL, TII.get(X86::XOR32rr), Reg)
+ .addReg(Reg, RegState::Undef)
+ .addReg(Reg, RegState::Undef);
+
+ // Turn the pseudo into an INC or DEC.
+ MIB->setDesc(TII.get(MinusOne ? X86::DEC32r : X86::INC32r));
+ MIB.addReg(Reg);
+
+ return true;
+}
+
+// LoadStackGuard has so far only been implemented for 64-bit MachO. A
+// different code sequence is needed for other targets.
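+// As an illustrative sketch, "%reg = LOAD_STACK_GUARD" expands to roughly:
+//   movq <guard-global>@GOTPCREL(%rip), %reg
+//   movq (%reg), %reg
+// where the guard global is taken from the pseudo's memory operand.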
+static void expandLoadStackGuard(MachineInstrBuilder &MIB,
+ const TargetInstrInfo &TII) {
+ MachineBasicBlock &MBB = *MIB->getParent();
+ DebugLoc DL = MIB->getDebugLoc();
+ unsigned Reg = MIB->getOperand(0).getReg();
+ const GlobalValue *GV =
+ cast<GlobalValue>((*MIB->memoperands_begin())->getValue());
+ unsigned Flag = MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant;
+ MachineMemOperand *MMO = MBB.getParent()->getMachineMemOperand(
+ MachinePointerInfo::getGOT(*MBB.getParent()), Flag, 8, 8);
+ MachineBasicBlock::iterator I = MIB.getInstr();
+
+ BuildMI(MBB, I, DL, TII.get(X86::MOV64rm), Reg).addReg(X86::RIP).addImm(1)
+ .addReg(0).addGlobalAddress(GV, 0, X86II::MO_GOTPCREL).addReg(0)
+ .addMemOperand(MMO);
+ MIB->setDebugLoc(DL);
+ MIB->setDesc(TII.get(X86::MOV64rm));
+ MIB.addReg(Reg, RegState::Kill).addImm(1).addReg(0).addImm(0).addReg(0);
+}
+
+bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
+ bool HasAVX = Subtarget.hasAVX();
+ MachineInstrBuilder MIB(*MI->getParent()->getParent(), MI);
+ switch (MI->getOpcode()) {
+ case X86::MOV32r0:
+ return Expand2AddrUndef(MIB, get(X86::XOR32rr));
+ case X86::MOV32r1:
+ return expandMOV32r1(MIB, *this, /*MinusOne=*/ false);
+ case X86::MOV32r_1:
+ return expandMOV32r1(MIB, *this, /*MinusOne=*/ true);
+ case X86::SETB_C8r:
+ return Expand2AddrUndef(MIB, get(X86::SBB8rr));
+ case X86::SETB_C16r:
+ return Expand2AddrUndef(MIB, get(X86::SBB16rr));
+ case X86::SETB_C32r:
+ return Expand2AddrUndef(MIB, get(X86::SBB32rr));
+ case X86::SETB_C64r:
+ return Expand2AddrUndef(MIB, get(X86::SBB64rr));
+ case X86::V_SET0:
+ case X86::FsFLD0SS:
+ case X86::FsFLD0SD:
+ return Expand2AddrUndef(MIB, get(HasAVX ? X86::VXORPSrr : X86::XORPSrr));
+ case X86::AVX_SET0:
+ assert(HasAVX && "AVX not supported");
+ return Expand2AddrUndef(MIB, get(X86::VXORPSYrr));
+ case X86::AVX512_512_SET0:
+ return Expand2AddrUndef(MIB, get(X86::VPXORDZrr));
+ case X86::V_SETALLONES:
+ return Expand2AddrUndef(MIB, get(HasAVX ? X86::VPCMPEQDrr : X86::PCMPEQDrr));
+ case X86::AVX2_SETALLONES:
+ return Expand2AddrUndef(MIB, get(X86::VPCMPEQDYrr));
+ case X86::TEST8ri_NOREX:
+ MI->setDesc(get(X86::TEST8ri));
+ return true;
+ case X86::MOV32ri64:
+ MI->setDesc(get(X86::MOV32ri));
+ return true;
+
+ // KNL does not recognize dependency-breaking idioms for mask registers,
+ // so kxnor %k1, %k1, %k2 has a RAW dependence on %k1.
+ // Using %k0 as the undef input register is a performance heuristic based
+ // on the assumption that %k0 is used less frequently than the other mask
+ // registers, since it is not usable as a write mask.
+ // FIXME: A more advanced approach would be to choose the best input mask
+ // register based on context.
+ case X86::KSET0B:
+ case X86::KSET0W: return Expand2AddrKreg(MIB, get(X86::KXORWrr), X86::K0);
+ case X86::KSET0D: return Expand2AddrKreg(MIB, get(X86::KXORDrr), X86::K0);
+ case X86::KSET0Q: return Expand2AddrKreg(MIB, get(X86::KXORQrr), X86::K0);
+ case X86::KSET1B:
+ case X86::KSET1W: return Expand2AddrKreg(MIB, get(X86::KXNORWrr), X86::K0);
+ case X86::KSET1D: return Expand2AddrKreg(MIB, get(X86::KXNORDrr), X86::K0);
+ case X86::KSET1Q: return Expand2AddrKreg(MIB, get(X86::KXNORQrr), X86::K0);
+ case TargetOpcode::LOAD_STACK_GUARD:
+ expandLoadStackGuard(MIB, *this);
+ return true;
+ }
+ return false;
+}
+
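+// An X86 memory reference is five machine operands, in order: base register,
+// scale, index register, displacement, and segment (X86::AddrBaseReg through
+// X86::AddrSegmentReg). A bare frame-index reference arrives with fewer
+// operands, so the offset is appended as an immediate in that case.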
+static void addOperands(MachineInstrBuilder &MIB, ArrayRef<MachineOperand> MOs,
+ int PtrOffset = 0) {
+ unsigned NumAddrOps = MOs.size();
+
+ if (NumAddrOps < 4) {
+ // FrameIndex only - add an immediate offset (whether it is zero or not).
+ for (unsigned i = 0; i != NumAddrOps; ++i)
+ MIB.addOperand(MOs[i]);
+ addOffset(MIB, PtrOffset);
+ } else {
+ // General Memory Addressing - we need to add any offset to an existing
+ // offset.
+ assert(MOs.size() == 5 && "Unexpected memory operand list length");
+ for (unsigned i = 0; i != NumAddrOps; ++i) {
+ const MachineOperand &MO = MOs[i];
+ if (i == 3 && PtrOffset != 0) {
+ MIB.addDisp(MO, PtrOffset);
+ } else {
+ MIB.addOperand(MO);
+ }
+ }
+ }
+}
+
+static MachineInstr *FuseTwoAddrInst(MachineFunction &MF, unsigned Opcode,
+ ArrayRef<MachineOperand> MOs,
+ MachineBasicBlock::iterator InsertPt,
+ MachineInstr *MI,
+ const TargetInstrInfo &TII) {
+ // Create the base instruction with the memory operand as the first part.
+ // Omit the implicit operands, something BuildMI can't do.
+ MachineInstr *NewMI = MF.CreateMachineInstr(TII.get(Opcode),
+ MI->getDebugLoc(), true);
+ MachineInstrBuilder MIB(MF, NewMI);
+ addOperands(MIB, MOs);
+
+ // Loop over the rest of the ri operands, converting them over.
+ unsigned NumOps = MI->getDesc().getNumOperands()-2;
+ for (unsigned i = 0; i != NumOps; ++i) {
+ MachineOperand &MO = MI->getOperand(i+2);
+ MIB.addOperand(MO);
+ }
+ for (unsigned i = NumOps+2, e = MI->getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI->getOperand(i);
+ MIB.addOperand(MO);
+ }
+
+ MachineBasicBlock *MBB = InsertPt->getParent();
+ MBB->insert(InsertPt, NewMI);
+
+ return MIB;
+}
+
+static MachineInstr *FuseInst(MachineFunction &MF, unsigned Opcode,
+ unsigned OpNo, ArrayRef<MachineOperand> MOs,
+ MachineBasicBlock::iterator InsertPt,
+ MachineInstr *MI, const TargetInstrInfo &TII,
+ int PtrOffset = 0) {
+ // Omit the implicit operands, something BuildMI can't do.
+ MachineInstr *NewMI = MF.CreateMachineInstr(TII.get(Opcode),
+ MI->getDebugLoc(), true);
+ MachineInstrBuilder MIB(MF, NewMI);
+
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI->getOperand(i);
+ if (i == OpNo) {
+ assert(MO.isReg() && "Expected to fold into reg operand!");
+ addOperands(MIB, MOs, PtrOffset);
+ } else {
+ MIB.addOperand(MO);
+ }
+ }
+
+ MachineBasicBlock *MBB = InsertPt->getParent();
+ MBB->insert(InsertPt, NewMI);
+
+ return MIB;
+}
+
+static MachineInstr *MakeM0Inst(const TargetInstrInfo &TII, unsigned Opcode,
+ ArrayRef<MachineOperand> MOs,
+ MachineBasicBlock::iterator InsertPt,
+ MachineInstr *MI) {
+ MachineInstrBuilder MIB = BuildMI(*InsertPt->getParent(), InsertPt,
+ MI->getDebugLoc(), TII.get(Opcode));
+ addOperands(MIB, MOs);
+ return MIB.addImm(0);
+}
+
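+// Folds that require custom rewriting of the instruction. For INSERTPS,
+// instead of reloading a whole spilled source vector we can load just the
+// selected float: as a sketch, "insertps $0x60, %xmm1, %xmm0" with %xmm1
+// spilled at <addr> becomes "insertps $0x20, 4+<addr>, %xmm0" (the source
+// element index moves into the pointer offset and is cleared from the
+// immediate).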
+MachineInstr *X86InstrInfo::foldMemoryOperandCustom(
+ MachineFunction &MF, MachineInstr *MI, unsigned OpNum,
+ ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt,
+ unsigned Size, unsigned Align) const {
+ switch (MI->getOpcode()) {
+ case X86::INSERTPSrr:
+ case X86::VINSERTPSrr:
+ // Attempt to convert the load of inserted vector into a fold load
+ // of a single float.
+ if (OpNum == 2) {
+ unsigned Imm = MI->getOperand(MI->getNumOperands() - 1).getImm();
+ unsigned ZMask = Imm & 15;
+ unsigned DstIdx = (Imm >> 4) & 3;
+ unsigned SrcIdx = (Imm >> 6) & 3;
+
+ unsigned RCSize = getRegClass(MI->getDesc(), OpNum, &RI, MF)->getSize();
+ if (Size <= RCSize && 4 <= Align) {
+ int PtrOffset = SrcIdx * 4;
+ unsigned NewImm = (DstIdx << 4) | ZMask;
+ unsigned NewOpCode =
+ (MI->getOpcode() == X86::VINSERTPSrr ? X86::VINSERTPSrm
+ : X86::INSERTPSrm);
+ MachineInstr *NewMI =
+ FuseInst(MF, NewOpCode, OpNum, MOs, InsertPt, MI, *this, PtrOffset);
+ NewMI->getOperand(NewMI->getNumOperands() - 1).setImm(NewImm);
+ return NewMI;
+ }
+ }
+ break;
+ }
+
+ return nullptr;
+}
+
+MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
+ MachineFunction &MF, MachineInstr *MI, unsigned OpNum,
+ ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt,
+ unsigned Size, unsigned Align, bool AllowCommute) const {
+ const DenseMap<unsigned,
+ std::pair<unsigned,unsigned> > *OpcodeTablePtr = nullptr;
+ bool isCallRegIndirect = Subtarget.callRegIndirect();
+ bool isTwoAddrFold = false;
+
+ // For CPUs that favor the register form of a call or push,
+ // do not fold loads into calls or pushes, unless optimizing for size
+ // aggressively.
+ if (isCallRegIndirect && !MF.getFunction()->optForMinSize() &&
+ (MI->getOpcode() == X86::CALL32r || MI->getOpcode() == X86::CALL64r ||
+ MI->getOpcode() == X86::PUSH16r || MI->getOpcode() == X86::PUSH32r ||
+ MI->getOpcode() == X86::PUSH64r))
+ return nullptr;
+
+ unsigned NumOps = MI->getDesc().getNumOperands();
+ bool isTwoAddr = NumOps > 1 &&
+ MI->getDesc().getOperandConstraint(1, MCOI::TIED_TO) != -1;
+
+ // FIXME: AsmPrinter doesn't know how to handle
+ // X86II::MO_GOT_ABSOLUTE_ADDRESS after folding.
+ if (MI->getOpcode() == X86::ADD32ri &&
+ MI->getOperand(2).getTargetFlags() == X86II::MO_GOT_ABSOLUTE_ADDRESS)
+ return nullptr;
+
+ MachineInstr *NewMI = nullptr;
+
+ // Attempt to fold any custom cases we have.
+ if (MachineInstr *CustomMI =
+ foldMemoryOperandCustom(MF, MI, OpNum, MOs, InsertPt, Size, Align))
+ return CustomMI;
+
+ // Folding a memory location into the two-address part of a two-address
+ // instruction is different from folding it elsewhere. It requires
+ // replacing the *two* registers with the memory location.
+ if (isTwoAddr && NumOps >= 2 && OpNum < 2 &&
+ MI->getOperand(0).isReg() &&
+ MI->getOperand(1).isReg() &&
+ MI->getOperand(0).getReg() == MI->getOperand(1).getReg()) {
+ OpcodeTablePtr = &RegOp2MemOpTable2Addr;
+ isTwoAddrFold = true;
+ } else if (OpNum == 0) {
+ if (MI->getOpcode() == X86::MOV32r0) {
+ NewMI = MakeM0Inst(*this, X86::MOV32mi, MOs, InsertPt, MI);
+ if (NewMI)
+ return NewMI;
+ }
+
+ OpcodeTablePtr = &RegOp2MemOpTable0;
+ } else if (OpNum == 1) {
+ OpcodeTablePtr = &RegOp2MemOpTable1;
+ } else if (OpNum == 2) {
+ OpcodeTablePtr = &RegOp2MemOpTable2;
+ } else if (OpNum == 3) {
+ OpcodeTablePtr = &RegOp2MemOpTable3;
+ } else if (OpNum == 4) {
+ OpcodeTablePtr = &RegOp2MemOpTable4;
+ }
+
+ // If a fold table was selected...
+ if (OpcodeTablePtr) {
+ // Find the opcode to fuse.
+ DenseMap<unsigned, std::pair<unsigned,unsigned> >::const_iterator I =
+ OpcodeTablePtr->find(MI->getOpcode());
+ if (I != OpcodeTablePtr->end()) {
+ unsigned Opcode = I->second.first;
+ unsigned MinAlign = (I->second.second & TB_ALIGN_MASK) >> TB_ALIGN_SHIFT;
+ if (Align < MinAlign)
+ return nullptr;
+ bool NarrowToMOV32rm = false;
+ if (Size) {
+ unsigned RCSize = getRegClass(MI->getDesc(), OpNum, &RI, MF)->getSize();
+ if (Size < RCSize) {
+ // Check if it's safe to fold the load. If the size of the object is
+ // narrower than the load width, then it's not.
+ if (Opcode != X86::MOV64rm || RCSize != 8 || Size != 4)
+ return nullptr;
+ // If this is a 64-bit load, but the spill slot is 32 bits, then we can
+ // do a 32-bit load, which is implicitly zero-extended. This is likely
+ // due to live interval analysis remat'ing a load from a stack slot.
+ if (MI->getOperand(0).getSubReg() || MI->getOperand(1).getSubReg())
+ return nullptr;
+ Opcode = X86::MOV32rm;
+ NarrowToMOV32rm = true;
+ }
+ }
+
+ if (isTwoAddrFold)
+ NewMI = FuseTwoAddrInst(MF, Opcode, MOs, InsertPt, MI, *this);
+ else
+ NewMI = FuseInst(MF, Opcode, OpNum, MOs, InsertPt, MI, *this);
+
+ if (NarrowToMOV32rm) {
+ // This is the special case where we use a MOV32rm to load a 32-bit
+ // value and zero-extend the top bits. Change the destination register
+ // to a 32-bit one.
+ unsigned DstReg = NewMI->getOperand(0).getReg();
+ if (TargetRegisterInfo::isPhysicalRegister(DstReg))
+ NewMI->getOperand(0).setReg(RI.getSubReg(DstReg, X86::sub_32bit));
+ else
+ NewMI->getOperand(0).setSubReg(X86::sub_32bit);
+ }
+ return NewMI;
+ }
+ }
+
+ // If the instruction and target operand are commutable, commute the
+ // instruction and try again.
+ if (AllowCommute) {
+ unsigned CommuteOpIdx1 = OpNum, CommuteOpIdx2 = CommuteAnyOperandIndex;
+ if (findCommutedOpIndices(MI, CommuteOpIdx1, CommuteOpIdx2)) {
+ bool HasDef = MI->getDesc().getNumDefs();
+ unsigned Reg0 = HasDef ? MI->getOperand(0).getReg() : 0;
+ unsigned Reg1 = MI->getOperand(CommuteOpIdx1).getReg();
+ unsigned Reg2 = MI->getOperand(CommuteOpIdx2).getReg();
+ bool Tied1 =
+ 0 == MI->getDesc().getOperandConstraint(CommuteOpIdx1, MCOI::TIED_TO);
+ bool Tied2 =
+ 0 == MI->getDesc().getOperandConstraint(CommuteOpIdx2, MCOI::TIED_TO);
+
+ // If either of the commutable operands is tied to the destination,
+ // then we cannot commute and fold.
+ if ((HasDef && Reg0 == Reg1 && Tied1) ||
+ (HasDef && Reg0 == Reg2 && Tied2))
+ return nullptr;
+
+ MachineInstr *CommutedMI =
+ commuteInstruction(MI, false, CommuteOpIdx1, CommuteOpIdx2);
+ if (!CommutedMI) {
+ // Unable to commute.
+ return nullptr;
+ }
+ if (CommutedMI != MI) {
+ // New instruction. We can't fold from this.
+ CommutedMI->eraseFromParent();
+ return nullptr;
+ }
+
+ // Attempt to fold with the commuted version of the instruction.
+ NewMI = foldMemoryOperandImpl(MF, MI, CommuteOpIdx2, MOs, InsertPt,
+ Size, Align, /*AllowCommute=*/false);
+ if (NewMI)
+ return NewMI;
+
+ // Folding failed again - undo the commute before returning.
+ MachineInstr *UncommutedMI =
+ commuteInstruction(MI, false, CommuteOpIdx1, CommuteOpIdx2);
+ if (!UncommutedMI) {
+ // Unable to commute.
+ return nullptr;
+ }
+ if (UncommutedMI != MI) {
+ // New instruction. It doesn't need to be kept.
+ UncommutedMI->eraseFromParent();
+ return nullptr;
+ }
+
+ // Return here to prevent duplicate fuse failure report.
+ return nullptr;
+ }
+ }
+
+ // No fusion
+ if (PrintFailedFusing && !MI->isCopy())
+ dbgs() << "We failed to fuse operand " << OpNum << " in " << *MI;
+ return nullptr;
+}
+
+/// Return true for all instructions that only update
+/// the first 32 or 64 bits of the destination register and leave the rest
+/// unmodified. This can be used to avoid folding loads if the instructions
+/// only update part of the destination register, and the non-updated part is
+/// not needed. e.g. cvtss2sd, sqrtss. Unfolding the load from these
+/// instructions breaks the partial register dependency and it can improve
+/// performance. e.g.:
+///
+/// movss (%rdi), %xmm0
+/// cvtss2sd %xmm0, %xmm0
+///
+/// Instead of
+/// cvtss2sd (%rdi), %xmm0
+///
+/// FIXME: This should be turned into a TSFlags.
+///
+static bool hasPartialRegUpdate(unsigned Opcode) {
+ switch (Opcode) {
+ case X86::CVTSI2SSrr:
+ case X86::CVTSI2SSrm:
+ case X86::CVTSI2SS64rr:
+ case X86::CVTSI2SS64rm:
+ case X86::CVTSI2SDrr:
+ case X86::CVTSI2SDrm:
+ case X86::CVTSI2SD64rr:
+ case X86::CVTSI2SD64rm:
+ case X86::CVTSD2SSrr:
+ case X86::CVTSD2SSrm:
+ case X86::Int_CVTSD2SSrr:
+ case X86::Int_CVTSD2SSrm:
+ case X86::CVTSS2SDrr:
+ case X86::CVTSS2SDrm:
+ case X86::Int_CVTSS2SDrr:
+ case X86::Int_CVTSS2SDrm:
+ case X86::RCPSSr:
+ case X86::RCPSSm:
+ case X86::RCPSSr_Int:
+ case X86::RCPSSm_Int:
+ case X86::ROUNDSDr:
+ case X86::ROUNDSDm:
+ case X86::ROUNDSDr_Int:
+ case X86::ROUNDSSr:
+ case X86::ROUNDSSm:
+ case X86::ROUNDSSr_Int:
+ case X86::RSQRTSSr:
+ case X86::RSQRTSSm:
+ case X86::RSQRTSSr_Int:
+ case X86::RSQRTSSm_Int:
+ case X86::SQRTSSr:
+ case X86::SQRTSSm:
+ case X86::SQRTSSr_Int:
+ case X86::SQRTSSm_Int:
+ case X86::SQRTSDr:
+ case X86::SQRTSDm:
+ case X86::SQRTSDr_Int:
+ case X86::SQRTSDm_Int:
+ return true;
+ }
+
+ return false;
+}
+
+/// Inform the ExeDepsFix pass how many idle instructions we would like
+/// before a partial register update.
+unsigned X86InstrInfo::
+getPartialRegUpdateClearance(const MachineInstr *MI, unsigned OpNum,
+ const TargetRegisterInfo *TRI) const {
+ if (OpNum != 0 || !hasPartialRegUpdate(MI->getOpcode()))
+ return 0;
+
+ // If MI is marked as reading Reg, the partial register update is wanted.
+ const MachineOperand &MO = MI->getOperand(0);
+ unsigned Reg = MO.getReg();
+ if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ if (MO.readsReg() || MI->readsVirtualRegister(Reg))
+ return 0;
+ } else {
+ if (MI->readsRegister(Reg, TRI))
+ return 0;
+ }
+
+ // If any of the preceding 16 instructions are reading Reg, insert a
+ // dependency breaking instruction. The magic number is based on a few
+ // Nehalem experiments.
+ return 16;
+}
+
+// Return true for any instruction that copies the high bits of the first
+// source operand into the unused high bits of the destination operand.
+static bool hasUndefRegUpdate(unsigned Opcode) {
+ switch (Opcode) {
+ case X86::VCVTSI2SSrr:
+ case X86::VCVTSI2SSrm:
+ case X86::Int_VCVTSI2SSrr:
+ case X86::Int_VCVTSI2SSrm:
+ case X86::VCVTSI2SS64rr:
+ case X86::VCVTSI2SS64rm:
+ case X86::Int_VCVTSI2SS64rr:
+ case X86::Int_VCVTSI2SS64rm:
+ case X86::VCVTSI2SDrr:
+ case X86::VCVTSI2SDrm:
+ case X86::Int_VCVTSI2SDrr:
+ case X86::Int_VCVTSI2SDrm:
+ case X86::VCVTSI2SD64rr:
+ case X86::VCVTSI2SD64rm:
+ case X86::Int_VCVTSI2SD64rr:
+ case X86::Int_VCVTSI2SD64rm:
+ case X86::VCVTSD2SSrr:
+ case X86::VCVTSD2SSrm:
+ case X86::Int_VCVTSD2SSrr:
+ case X86::Int_VCVTSD2SSrm:
+ case X86::VCVTSS2SDrr:
+ case X86::VCVTSS2SDrm:
+ case X86::Int_VCVTSS2SDrr:
+ case X86::Int_VCVTSS2SDrm:
+ case X86::VRCPSSr:
+ case X86::VRCPSSm:
+ case X86::VRCPSSm_Int:
+ case X86::VROUNDSDr:
+ case X86::VROUNDSDm:
+ case X86::VROUNDSDr_Int:
+ case X86::VROUNDSSr:
+ case X86::VROUNDSSm:
+ case X86::VROUNDSSr_Int:
+ case X86::VRSQRTSSr:
+ case X86::VRSQRTSSm:
+ case X86::VRSQRTSSm_Int:
+ case X86::VSQRTSSr:
+ case X86::VSQRTSSm:
+ case X86::VSQRTSSm_Int:
+ case X86::VSQRTSDr:
+ case X86::VSQRTSDm:
+ case X86::VSQRTSDm_Int:
+ // AVX-512
+ case X86::VCVTSD2SSZrr:
+ case X86::VCVTSD2SSZrm:
+ case X86::VCVTSS2SDZrr:
+ case X86::VCVTSS2SDZrm:
+ return true;
+ }
+
+ return false;
+}
+
+/// Inform the ExeDepsFix pass how many idle instructions we would like before
+/// certain undef register reads.
+///
+/// This catches the VCVTSI2SD family of instructions:
+///
+/// vcvtsi2sdq %rax, %xmm0<undef>, %xmm14
+///
+/// We should be careful *not* to catch VXOR idioms which are presumably
+/// handled specially in the pipeline:
+///
+/// vxorps %xmm1<undef>, %xmm1<undef>, %xmm1
+///
+/// Like getPartialRegUpdateClearance, this makes a strong assumption that the
+/// high bits that are passed-through are not live.
+unsigned X86InstrInfo::
+getUndefRegClearance(const MachineInstr *MI, unsigned &OpNum,
+ const TargetRegisterInfo *TRI) const {
+ if (!hasUndefRegUpdate(MI->getOpcode()))
+ return 0;
+
+ // Set the OpNum parameter to the first source operand.
+ OpNum = 1;
+
+ const MachineOperand &MO = MI->getOperand(OpNum);
+ if (MO.isUndef() && TargetRegisterInfo::isPhysicalRegister(MO.getReg())) {
+ // Use the same magic number as getPartialRegUpdateClearance.
+ return 16;
+ }
+ return 0;
+}
+
+void X86InstrInfo::
+breakPartialRegDependency(MachineBasicBlock::iterator MI, unsigned OpNum,
+ const TargetRegisterInfo *TRI) const {
+ unsigned Reg = MI->getOperand(OpNum).getReg();
+ // If MI kills this register, the false dependence is already broken.
+ if (MI->killsRegister(Reg, TRI))
+ return;
+
+ if (X86::VR128RegClass.contains(Reg)) {
+ // These instructions are all floating point domain, so xorps is the best
+ // choice.
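+ // For example, before "sqrtss %xmm1, %xmm0" this emits
+ //   xorps %xmm0, %xmm0
+ // so the partial write to %xmm0 no longer depends on its stale contents.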
+ unsigned Opc = Subtarget.hasAVX() ? X86::VXORPSrr : X86::XORPSrr;
+ BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(Opc), Reg)
+ .addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef);
+ MI->addRegisterKilled(Reg, TRI, true);
+ } else if (X86::VR256RegClass.contains(Reg)) {
+ // Use vxorps to clear the full ymm register.
+ // It wants to read and write the xmm sub-register.
+ unsigned XReg = TRI->getSubReg(Reg, X86::sub_xmm);
+ BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(X86::VXORPSrr), XReg)
+ .addReg(XReg, RegState::Undef).addReg(XReg, RegState::Undef)
+ .addReg(Reg, RegState::ImplicitDefine);
+ MI->addRegisterKilled(Reg, TRI, true);
+ }
+}
+
+MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
+ MachineFunction &MF, MachineInstr *MI, ArrayRef<unsigned> Ops,
+ MachineBasicBlock::iterator InsertPt, int FrameIndex) const {
+ // Check switch flag
+ if (NoFusing)
+ return nullptr;
+
+ // Unless optimizing for size, don't fold, to avoid partial
+ // register update stalls.
+ if (!MF.getFunction()->optForSize() && hasPartialRegUpdate(MI->getOpcode()))
+ return nullptr;
+
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ unsigned Size = MFI->getObjectSize(FrameIndex);
+ unsigned Alignment = MFI->getObjectAlignment(FrameIndex);
+ // If the function stack isn't realigned we don't want to fold instructions
+ // that need increased alignment.
+ if (!RI.needsStackRealignment(MF))
+ Alignment =
+ std::min(Alignment, Subtarget.getFrameLowering()->getStackAlignment());
+ if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
+ unsigned NewOpc = 0;
+ unsigned RCSize = 0;
+ switch (MI->getOpcode()) {
+ default: return nullptr;
+ case X86::TEST8rr: NewOpc = X86::CMP8ri; RCSize = 1; break;
+ case X86::TEST16rr: NewOpc = X86::CMP16ri8; RCSize = 2; break;
+ case X86::TEST32rr: NewOpc = X86::CMP32ri8; RCSize = 4; break;
+ case X86::TEST64rr: NewOpc = X86::CMP64ri8; RCSize = 8; break;
+ }
+ // Check if it's safe to fold the load. If the size of the object is
+ // narrower than the load width, then it's not.
+ if (Size < RCSize)
+ return nullptr;
+ // Change to CMPXXri r, 0 first.
+ MI->setDesc(get(NewOpc));
+ MI->getOperand(1).ChangeToImmediate(0);
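+ // For example, "testl %eax, %eax" (TEST32rr) becomes "cmpl $0, %eax"
+ // (CMP32ri8) here, so the remaining register operand can be folded
+ // against the frame index below, yielding a memory compare against 0.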
+ } else if (Ops.size() != 1)
+ return nullptr;
+
+ return foldMemoryOperandImpl(MF, MI, Ops[0],
+ MachineOperand::CreateFI(FrameIndex), InsertPt,
+ Size, Alignment, /*AllowCommute=*/true);
+}
+
+/// Check if \p LoadMI is a partial register load that we can't fold into \p MI
+/// because the latter uses contents that wouldn't be defined in the folded
+/// version. For instance, this transformation isn't legal:
+/// movss (%rdi), %xmm0
+/// addps %xmm0, %xmm0
+/// ->
+/// addps (%rdi), %xmm0
+///
+/// But this one is:
+/// movss (%rdi), %xmm0
+/// addss %xmm0, %xmm0
+/// ->
+/// addss (%rdi), %xmm0
+///
+static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
+ const MachineInstr &UserMI,
+ const MachineFunction &MF) {
+ unsigned Opc = LoadMI.getOpcode();
+ unsigned UserOpc = UserMI.getOpcode();
+ unsigned RegSize =
+ MF.getRegInfo().getRegClass(LoadMI.getOperand(0).getReg())->getSize();
+
+ if ((Opc == X86::MOVSSrm || Opc == X86::VMOVSSrm) && RegSize > 4) {
+ // These instructions only load 32 bits, so we can't fold them if the
+ // destination register is wider than 32 bits (4 bytes) and the user
+ // instruction isn't scalar (SS).
+ switch (UserOpc) {
+ case X86::ADDSSrr_Int: case X86::VADDSSrr_Int:
+ case X86::DIVSSrr_Int: case X86::VDIVSSrr_Int:
+ case X86::MULSSrr_Int: case X86::VMULSSrr_Int:
+ case X86::SUBSSrr_Int: case X86::VSUBSSrr_Int:
+ case X86::VFMADDSSr132r_Int: case X86::VFNMADDSSr132r_Int:
+ case X86::VFMADDSSr213r_Int: case X86::VFNMADDSSr213r_Int:
+ case X86::VFMADDSSr231r_Int: case X86::VFNMADDSSr231r_Int:
+ case X86::VFMSUBSSr132r_Int: case X86::VFNMSUBSSr132r_Int:
+ case X86::VFMSUBSSr213r_Int: case X86::VFNMSUBSSr213r_Int:
+ case X86::VFMSUBSSr231r_Int: case X86::VFNMSUBSSr231r_Int:
+ return false;
+ default:
+ return true;
+ }
+ }
+
+ if ((Opc == X86::MOVSDrm || Opc == X86::VMOVSDrm) && RegSize > 8) {
+ // These instructions only load 64 bits, so we can't fold them if the
+ // destination register is wider than 64 bits (8 bytes) and the user
+ // instruction isn't scalar (SD).
+ switch (UserOpc) {
+ case X86::ADDSDrr_Int: case X86::VADDSDrr_Int:
+ case X86::DIVSDrr_Int: case X86::VDIVSDrr_Int:
+ case X86::MULSDrr_Int: case X86::VMULSDrr_Int:
+ case X86::SUBSDrr_Int: case X86::VSUBSDrr_Int:
+ case X86::VFMADDSDr132r_Int: case X86::VFNMADDSDr132r_Int:
+ case X86::VFMADDSDr213r_Int: case X86::VFNMADDSDr213r_Int:
+ case X86::VFMADDSDr231r_Int: case X86::VFNMADDSDr231r_Int:
+ case X86::VFMSUBSDr132r_Int: case X86::VFNMSUBSDr132r_Int:
+ case X86::VFMSUBSDr213r_Int: case X86::VFNMSUBSDr213r_Int:
+ case X86::VFMSUBSDr231r_Int: case X86::VFNMSUBSDr231r_Int:
+ return false;
+ default:
+ return true;
+ }
+ }
+
+ return false;
+}
+
+MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
+ MachineFunction &MF, MachineInstr *MI, ArrayRef<unsigned> Ops,
+ MachineBasicBlock::iterator InsertPt, MachineInstr *LoadMI) const {
+ // If loading from a FrameIndex, fold directly from the FrameIndex.
+ unsigned NumOps = LoadMI->getDesc().getNumOperands();
+ int FrameIndex;
+ if (isLoadFromStackSlot(LoadMI, FrameIndex)) {
+ if (isNonFoldablePartialRegisterLoad(*LoadMI, *MI, MF))
+ return nullptr;
+ return foldMemoryOperandImpl(MF, MI, Ops, InsertPt, FrameIndex);
+ }
+
+ // Check switch flag
+ if (NoFusing) return nullptr;
+
+ // Avoid partial register update stalls unless optimizing for size.
+ if (!MF.getFunction()->optForSize() && hasPartialRegUpdate(MI->getOpcode()))
+ return nullptr;
+
+ // Determine the alignment of the load.
+ unsigned Alignment = 0;
+ if (LoadMI->hasOneMemOperand())
+ Alignment = (*LoadMI->memoperands_begin())->getAlignment();
+ else
+ switch (LoadMI->getOpcode()) {
+ case X86::AVX2_SETALLONES:
+ case X86::AVX_SET0:
+ Alignment = 32;
+ break;
+ case X86::V_SET0:
+ case X86::V_SETALLONES:
+ Alignment = 16;
+ break;
+ case X86::FsFLD0SD:
+ Alignment = 8;
+ break;
+ case X86::FsFLD0SS:
+ Alignment = 4;
+ break;
+ default:
+ return nullptr;
+ }
+ if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
+ unsigned NewOpc = 0;
+ switch (MI->getOpcode()) {
+ default: return nullptr;
+ case X86::TEST8rr: NewOpc = X86::CMP8ri; break;
+ case X86::TEST16rr: NewOpc = X86::CMP16ri8; break;
+ case X86::TEST32rr: NewOpc = X86::CMP32ri8; break;
+ case X86::TEST64rr: NewOpc = X86::CMP64ri8; break;
+ }
+ // Change to CMPXXri r, 0 first.
+ MI->setDesc(get(NewOpc));
+ MI->getOperand(1).ChangeToImmediate(0);
+ } else if (Ops.size() != 1)
+ return nullptr;
+
+ // Make sure the subregisters match.
+ // Otherwise we risk changing the size of the load.
+ if (LoadMI->getOperand(0).getSubReg() != MI->getOperand(Ops[0]).getSubReg())
+ return nullptr;
+
+ SmallVector<MachineOperand, X86::AddrNumOperands> MOs;
+ switch (LoadMI->getOpcode()) {
+ case X86::V_SET0:
+ case X86::V_SETALLONES:
+ case X86::AVX2_SETALLONES:
+ case X86::AVX_SET0:
+ case X86::FsFLD0SD:
+ case X86::FsFLD0SS: {
+ // Folding a V_SET0 or V_SETALLONES as a load, to ease register pressure.
+ // Create a constant-pool entry and operands to load from it.
+
+ // The medium and large code models can't fold loads this way.
+ if (MF.getTarget().getCodeModel() != CodeModel::Small &&
+ MF.getTarget().getCodeModel() != CodeModel::Kernel)
+ return nullptr;
+
+ // x86-32 PIC requires a PIC base register for constant pools.
+ unsigned PICBase = 0;
+ if (MF.getTarget().getRelocationModel() == Reloc::PIC_) {
+ if (Subtarget.is64Bit())
+ PICBase = X86::RIP;
+ else
+ // FIXME: PICBase = getGlobalBaseReg(&MF);
+ // This doesn't work for several reasons.
+ // 1. GlobalBaseReg may have been spilled.
+ // 2. It may not be live at MI.
+ return nullptr;
+ }
+
+ // Create a constant-pool entry.
+ MachineConstantPool &MCP = *MF.getConstantPool();
+ Type *Ty;
+ unsigned Opc = LoadMI->getOpcode();
+ if (Opc == X86::FsFLD0SS)
+ Ty = Type::getFloatTy(MF.getFunction()->getContext());
+ else if (Opc == X86::FsFLD0SD)
+ Ty = Type::getDoubleTy(MF.getFunction()->getContext());
+ else if (Opc == X86::AVX2_SETALLONES || Opc == X86::AVX_SET0)
+ Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 8);
+ else
+ Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 4);
+
+ bool IsAllOnes = (Opc == X86::V_SETALLONES || Opc == X86::AVX2_SETALLONES);
+ const Constant *C = IsAllOnes ? Constant::getAllOnesValue(Ty) :
+ Constant::getNullValue(Ty);
+ unsigned CPI = MCP.getConstantPoolIndex(C, Alignment);
+
+ // Create operands to load from the constant pool entry.
+ MOs.push_back(MachineOperand::CreateReg(PICBase, false));
+ MOs.push_back(MachineOperand::CreateImm(1));
+ MOs.push_back(MachineOperand::CreateReg(0, false));
+ MOs.push_back(MachineOperand::CreateCPI(CPI, 0));
+ MOs.push_back(MachineOperand::CreateReg(0, false));
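+ // These five operands form the address [PICBase + CP#CPI]: scale 1, no
+ // index register, a constant-pool displacement, and no segment register.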
+ break;
+ }
+ default: {
+ if (isNonFoldablePartialRegisterLoad(*LoadMI, *MI, MF))
+ return nullptr;
+
+ // Folding a normal load. Just copy the load's address operands.
+ MOs.append(LoadMI->operands_begin() + NumOps - X86::AddrNumOperands,
+ LoadMI->operands_begin() + NumOps);
+ break;
+ }
+ }
+ return foldMemoryOperandImpl(MF, MI, Ops[0], MOs, InsertPt,
+ /*Size=*/0, Alignment, /*AllowCommute=*/true);
+}
+
+bool X86InstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
+ unsigned Reg, bool UnfoldLoad, bool UnfoldStore,
+ SmallVectorImpl<MachineInstr*> &NewMIs) const {
+ DenseMap<unsigned, std::pair<unsigned,unsigned> >::const_iterator I =
+ MemOp2RegOpTable.find(MI->getOpcode());
+ if (I == MemOp2RegOpTable.end())
+ return false;
+ unsigned Opc = I->second.first;
+ unsigned Index = I->second.second & TB_INDEX_MASK;
+ bool FoldedLoad = I->second.second & TB_FOLDED_LOAD;
+ bool FoldedStore = I->second.second & TB_FOLDED_STORE;
+ if (UnfoldLoad && !FoldedLoad)
+ return false;
+ UnfoldLoad &= FoldedLoad;
+ if (UnfoldStore && !FoldedStore)
+ return false;
+ UnfoldStore &= FoldedStore;
+
+ const MCInstrDesc &MCID = get(Opc);
+ const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI, MF);
+ // TODO: Check if 32-byte or greater accesses are slow too?
+ if (!MI->hasOneMemOperand() &&
+ RC == &X86::VR128RegClass &&
+ Subtarget.isUnalignedMem16Slow())
+ // Without memoperands, loadRegFromAddr and storeRegToAddr will
+ // conservatively assume the address is unaligned. That's bad for
+ // performance.
+ return false;
+ SmallVector<MachineOperand, X86::AddrNumOperands> AddrOps;
+ SmallVector<MachineOperand, 2> BeforeOps;
+ SmallVector<MachineOperand, 2> AfterOps;
+ SmallVector<MachineOperand, 4> ImpOps;
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ MachineOperand &Op = MI->getOperand(i);
+ if (i >= Index && i < Index + X86::AddrNumOperands)
+ AddrOps.push_back(Op);
+ else if (Op.isReg() && Op.isImplicit())
+ ImpOps.push_back(Op);
+ else if (i < Index)
+ BeforeOps.push_back(Op);
+ else if (i > Index)
+ AfterOps.push_back(Op);
+ }
+
+ // Emit the load instruction.
+ if (UnfoldLoad) {
+ std::pair<MachineInstr::mmo_iterator,
+ MachineInstr::mmo_iterator> MMOs =
+ MF.extractLoadMemRefs(MI->memoperands_begin(),
+ MI->memoperands_end());
+ loadRegFromAddr(MF, Reg, AddrOps, RC, MMOs.first, MMOs.second, NewMIs);
+ if (UnfoldStore) {
+ // Address operands cannot be marked isKill.
+ for (unsigned i = 1; i != 1 + X86::AddrNumOperands; ++i) {
+ MachineOperand &MO = NewMIs[0]->getOperand(i);
+ if (MO.isReg())
+ MO.setIsKill(false);
+ }
+ }
+ }
+
+ // Emit the data processing instruction.
+ MachineInstr *DataMI = MF.CreateMachineInstr(MCID, MI->getDebugLoc(), true);
+ MachineInstrBuilder MIB(MF, DataMI);
+
+ if (FoldedStore)
+ MIB.addReg(Reg, RegState::Define);
+ for (MachineOperand &BeforeOp : BeforeOps)
+ MIB.addOperand(BeforeOp);
+ if (FoldedLoad)
+ MIB.addReg(Reg);
+ for (MachineOperand &AfterOp : AfterOps)
+ MIB.addOperand(AfterOp);
+ for (MachineOperand &ImpOp : ImpOps) {
+ MIB.addReg(ImpOp.getReg(),
+ getDefRegState(ImpOp.isDef()) |
+ RegState::Implicit |
+ getKillRegState(ImpOp.isKill()) |
+ getDeadRegState(ImpOp.isDead()) |
+ getUndefRegState(ImpOp.isUndef()));
+ }
+ // Change CMP32ri r, 0 back to TEST32rr r, r, etc.
+ switch (DataMI->getOpcode()) {
+ default: break;
+ case X86::CMP64ri32:
+ case X86::CMP64ri8:
+ case X86::CMP32ri:
+ case X86::CMP32ri8:
+ case X86::CMP16ri:
+ case X86::CMP16ri8:
+ case X86::CMP8ri: {
+ MachineOperand &MO0 = DataMI->getOperand(0);
+ MachineOperand &MO1 = DataMI->getOperand(1);
+ if (MO1.getImm() == 0) {
+ unsigned NewOpc;
+ switch (DataMI->getOpcode()) {
+ default: llvm_unreachable("Unreachable!");
+ case X86::CMP64ri8:
+ case X86::CMP64ri32: NewOpc = X86::TEST64rr; break;
+ case X86::CMP32ri8:
+ case X86::CMP32ri: NewOpc = X86::TEST32rr; break;
+ case X86::CMP16ri8:
+ case X86::CMP16ri: NewOpc = X86::TEST16rr; break;
+ case X86::CMP8ri: NewOpc = X86::TEST8rr; break;
+ }
+ DataMI->setDesc(get(NewOpc));
+ MO1.ChangeToRegister(MO0.getReg(), false);
+ }
+ }
+ }
+ NewMIs.push_back(DataMI);
+
+ // Emit the store instruction.
+ if (UnfoldStore) {
+ const TargetRegisterClass *DstRC = getRegClass(MCID, 0, &RI, MF);
+ std::pair<MachineInstr::mmo_iterator,
+ MachineInstr::mmo_iterator> MMOs =
+ MF.extractStoreMemRefs(MI->memoperands_begin(),
+ MI->memoperands_end());
+ storeRegToAddr(MF, Reg, true, AddrOps, DstRC, MMOs.first, MMOs.second, NewMIs);
+ }
+
+ return true;
+}
+
+bool
+X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
+ SmallVectorImpl<SDNode*> &NewNodes) const {
+ if (!N->isMachineOpcode())
+ return false;
+
+ DenseMap<unsigned, std::pair<unsigned,unsigned> >::const_iterator I =
+ MemOp2RegOpTable.find(N->getMachineOpcode());
+ if (I == MemOp2RegOpTable.end())
+ return false;
+ unsigned Opc = I->second.first;
+ unsigned Index = I->second.second & TB_INDEX_MASK;
+ bool FoldedLoad = I->second.second & TB_FOLDED_LOAD;
+ bool FoldedStore = I->second.second & TB_FOLDED_STORE;
+ const MCInstrDesc &MCID = get(Opc);
+ MachineFunction &MF = DAG.getMachineFunction();
+ const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI, MF);
+ unsigned NumDefs = MCID.NumDefs;
+ std::vector<SDValue> AddrOps;
+ std::vector<SDValue> BeforeOps;
+ std::vector<SDValue> AfterOps;
+ SDLoc dl(N);
+ unsigned NumOps = N->getNumOperands();
+ for (unsigned i = 0; i != NumOps-1; ++i) {
+ SDValue Op = N->getOperand(i);
+ if (i >= Index-NumDefs && i < Index-NumDefs + X86::AddrNumOperands)
+ AddrOps.push_back(Op);
+ else if (i < Index-NumDefs)
+ BeforeOps.push_back(Op);
+ else if (i > Index-NumDefs)
+ AfterOps.push_back(Op);
+ }
+ SDValue Chain = N->getOperand(NumOps-1);
+ AddrOps.push_back(Chain);
+
+ // Emit the load instruction.
+ SDNode *Load = nullptr;
+ if (FoldedLoad) {
+ EVT VT = *RC->vt_begin();
+ std::pair<MachineInstr::mmo_iterator,
+ MachineInstr::mmo_iterator> MMOs =
+ MF.extractLoadMemRefs(cast<MachineSDNode>(N)->memoperands_begin(),
+ cast<MachineSDNode>(N)->memoperands_end());
+ if (!(*MMOs.first) &&
+ RC == &X86::VR128RegClass &&
+ Subtarget.isUnalignedMem16Slow())
+ // Do not introduce a slow unaligned load.
+ return false;
+ // FIXME: If a VR128 can have size 32, we should be checking if a 32-byte
+ // memory access is slow above.
+ unsigned Alignment = RC->getSize() == 32 ? 32 : 16;
+ bool isAligned = (*MMOs.first) &&
+ (*MMOs.first)->getAlignment() >= Alignment;
+ Load = DAG.getMachineNode(getLoadRegOpcode(0, RC, isAligned, Subtarget), dl,
+ VT, MVT::Other, AddrOps);
+ NewNodes.push_back(Load);
+
+ // Preserve memory reference information.
+ cast<MachineSDNode>(Load)->setMemRefs(MMOs.first, MMOs.second);
+ }
+
+ // Emit the data processing instruction.
+ std::vector<EVT> VTs;
+ const TargetRegisterClass *DstRC = nullptr;
+ if (MCID.getNumDefs() > 0) {
+ DstRC = getRegClass(MCID, 0, &RI, MF);
+ VTs.push_back(*DstRC->vt_begin());
+ }
+ for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) {
+ EVT VT = N->getValueType(i);
+ if (VT != MVT::Other && i >= (unsigned)MCID.getNumDefs())
+ VTs.push_back(VT);
+ }
+ if (Load)
+ BeforeOps.push_back(SDValue(Load, 0));
+ BeforeOps.insert(BeforeOps.end(), AfterOps.begin(), AfterOps.end());
+ SDNode *NewNode = DAG.getMachineNode(Opc, dl, VTs, BeforeOps);
+ NewNodes.push_back(NewNode);
+
+ // Emit the store instruction.
+ if (FoldedStore) {
+ AddrOps.pop_back();
+ AddrOps.push_back(SDValue(NewNode, 0));
+ AddrOps.push_back(Chain);
+ std::pair<MachineInstr::mmo_iterator,
+ MachineInstr::mmo_iterator> MMOs =
+ MF.extractStoreMemRefs(cast<MachineSDNode>(N)->memoperands_begin(),
+ cast<MachineSDNode>(N)->memoperands_end());
+ if (!(*MMOs.first) &&
+ RC == &X86::VR128RegClass &&
+ Subtarget.isUnalignedMem16Slow())
+ // Do not introduce a slow unaligned store.
+ return false;
+ // FIXME: If a VR128 can have size 32, we should be checking if a 32-byte
+ // memory access is slow above.
+ unsigned Alignment = RC->getSize() == 32 ? 32 : 16;
+ bool isAligned = (*MMOs.first) &&
+ (*MMOs.first)->getAlignment() >= Alignment;
+ SDNode *Store =
+ DAG.getMachineNode(getStoreRegOpcode(0, DstRC, isAligned, Subtarget),
+ dl, MVT::Other, AddrOps);
+ NewNodes.push_back(Store);
+
+ // Preserve memory reference information.
+ cast<MachineSDNode>(Store)->setMemRefs(MMOs.first, MMOs.second);
+ }
+
+ return true;
+}
+
+unsigned X86InstrInfo::getOpcodeAfterMemoryUnfold(unsigned Opc,
+ bool UnfoldLoad, bool UnfoldStore,
+ unsigned *LoadRegIndex) const {
+ DenseMap<unsigned, std::pair<unsigned,unsigned> >::const_iterator I =
+ MemOp2RegOpTable.find(Opc);
+ if (I == MemOp2RegOpTable.end())
+ return 0;
+ bool FoldedLoad = I->second.second & TB_FOLDED_LOAD;
+ bool FoldedStore = I->second.second & TB_FOLDED_STORE;
+ if (UnfoldLoad && !FoldedLoad)
+ return 0;
+ if (UnfoldStore && !FoldedStore)
+ return 0;
+ if (LoadRegIndex)
+ *LoadRegIndex = I->second.second & TB_INDEX_MASK;
+ return I->second.first;
+}
+
+bool
+X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
+ int64_t &Offset1, int64_t &Offset2) const {
+ if (!Load1->isMachineOpcode() || !Load2->isMachineOpcode())
+ return false;
+ unsigned Opc1 = Load1->getMachineOpcode();
+ unsigned Opc2 = Load2->getMachineOpcode();
+ switch (Opc1) {
+ default: return false;
+ case X86::MOV8rm:
+ case X86::MOV16rm:
+ case X86::MOV32rm:
+ case X86::MOV64rm:
+ case X86::LD_Fp32m:
+ case X86::LD_Fp64m:
+ case X86::LD_Fp80m:
+ case X86::MOVSSrm:
+ case X86::MOVSDrm:
+ case X86::MMX_MOVD64rm:
+ case X86::MMX_MOVQ64rm:
+ case X86::FsMOVAPSrm:
+ case X86::FsMOVAPDrm:
+ case X86::MOVAPSrm:
+ case X86::MOVUPSrm:
+ case X86::MOVAPDrm:
+ case X86::MOVDQArm:
+ case X86::MOVDQUrm:
+ // AVX load instructions
+ case X86::VMOVSSrm:
+ case X86::VMOVSDrm:
+ case X86::FsVMOVAPSrm:
+ case X86::FsVMOVAPDrm:
+ case X86::VMOVAPSrm:
+ case X86::VMOVUPSrm:
+ case X86::VMOVAPDrm:
+ case X86::VMOVDQArm:
+ case X86::VMOVDQUrm:
+ case X86::VMOVAPSYrm:
+ case X86::VMOVUPSYrm:
+ case X86::VMOVAPDYrm:
+ case X86::VMOVDQAYrm:
+ case X86::VMOVDQUYrm:
+ break;
+ }
+ switch (Opc2) {
+ default: return false;
+ case X86::MOV8rm:
+ case X86::MOV16rm:
+ case X86::MOV32rm:
+ case X86::MOV64rm:
+ case X86::LD_Fp32m:
+ case X86::LD_Fp64m:
+ case X86::LD_Fp80m:
+ case X86::MOVSSrm:
+ case X86::MOVSDrm:
+ case X86::MMX_MOVD64rm:
+ case X86::MMX_MOVQ64rm:
+ case X86::FsMOVAPSrm:
+ case X86::FsMOVAPDrm:
+ case X86::MOVAPSrm:
+ case X86::MOVUPSrm:
+ case X86::MOVAPDrm:
+ case X86::MOVDQArm:
+ case X86::MOVDQUrm:
+ // AVX load instructions
+ case X86::VMOVSSrm:
+ case X86::VMOVSDrm:
+ case X86::FsVMOVAPSrm:
+ case X86::FsVMOVAPDrm:
+ case X86::VMOVAPSrm:
+ case X86::VMOVUPSrm:
+ case X86::VMOVAPDrm:
+ case X86::VMOVDQArm:
+ case X86::VMOVDQUrm:
+ case X86::VMOVAPSYrm:
+ case X86::VMOVUPSYrm:
+ case X86::VMOVAPDYrm:
+ case X86::VMOVDQAYrm:
+ case X86::VMOVDQUYrm:
+ break;
+ }
+
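+ // Operand layout of these load nodes: 0 = Base, 1 = Scale, 2 = Index,
+ // 3 = Disp, 4 = Segment, 5 = Chain.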
+ // Check if chain operands and base addresses match.
+ if (Load1->getOperand(0) != Load2->getOperand(0) ||
+ Load1->getOperand(5) != Load2->getOperand(5))
+ return false;
+ // Segment operands should match as well.
+ if (Load1->getOperand(4) != Load2->getOperand(4))
+ return false;
+ // Scale should be 1, Index should be Reg0.
+ if (Load1->getOperand(1) == Load2->getOperand(1) &&
+ Load1->getOperand(2) == Load2->getOperand(2)) {
+ if (cast<ConstantSDNode>(Load1->getOperand(1))->getZExtValue() != 1)
+ return false;
+
+ // Now let's examine the displacements.
+ if (isa<ConstantSDNode>(Load1->getOperand(3)) &&
+ isa<ConstantSDNode>(Load2->getOperand(3))) {
+ Offset1 = cast<ConstantSDNode>(Load1->getOperand(3))->getSExtValue();
+ Offset2 = cast<ConstantSDNode>(Load2->getOperand(3))->getSExtValue();
+ return true;
+ }
+ }
+ return false;
+}
+
+bool X86InstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
+ int64_t Offset1, int64_t Offset2,
+ unsigned NumLoads) const {
+ assert(Offset2 > Offset1);
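+ // Only cluster loads that land close together, within roughly 512 bytes.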
+ if ((Offset2 - Offset1) / 8 > 64)
+ return false;
+
+ unsigned Opc1 = Load1->getMachineOpcode();
+ unsigned Opc2 = Load2->getMachineOpcode();
+ if (Opc1 != Opc2)
+ return false; // FIXME: overly conservative?
+
+ switch (Opc1) {
+ default: break;
+ case X86::LD_Fp32m:
+ case X86::LD_Fp64m:
+ case X86::LD_Fp80m:
+ case X86::MMX_MOVD64rm:
+ case X86::MMX_MOVQ64rm:
+ return false;
+ }
+
+ EVT VT = Load1->getValueType(0);
+ switch (VT.getSimpleVT().SimpleTy) {
+ default:
+ // XMM registers. In 64-bit mode we can be a bit more aggressive since we
+ // have 16 of them to play with.
+ if (Subtarget.is64Bit()) {
+ if (NumLoads >= 3)
+ return false;
+ } else if (NumLoads) {
+ return false;
+ }
+ break;
+ case MVT::i8:
+ case MVT::i16:
+ case MVT::i32:
+ case MVT::i64:
+ case MVT::f32:
+ case MVT::f64:
+ if (NumLoads)
+ return false;
+ break;
+ }
+
+ return true;
+}
+
+bool X86InstrInfo::shouldScheduleAdjacent(MachineInstr *First,
+ MachineInstr *Second) const {
+ // Check if this processor supports macro-fusion. Since this is a minor
+ // heuristic, we haven't specifically reserved a feature. hasAVX is a decent
+ // proxy for SandyBridge+.
+ if (!Subtarget.hasAVX())
+ return false;
+
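+ // Macro-fusion example: keeping
+ //   cmpl $16, %ecx
+ //   jae  .LBB0_3
+ // adjacent lets SandyBridge and later decoders fuse the compare and the
+ // branch into a single uop (the illustrative operands are arbitrary).
+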
+ enum {
+ FuseTest,
+ FuseCmp,
+ FuseInc
+ } FuseKind;
+
+ switch (Second->getOpcode()) {
+ default:
+ return false;
+ case X86::JE_1:
+ case X86::JNE_1:
+ case X86::JL_1:
+ case X86::JLE_1:
+ case X86::JG_1:
+ case X86::JGE_1:
+ FuseKind = FuseInc;
+ break;
+ case X86::JB_1:
+ case X86::JBE_1:
+ case X86::JA_1:
+ case X86::JAE_1:
+ FuseKind = FuseCmp;
+ break;
+ case X86::JS_1:
+ case X86::JNS_1:
+ case X86::JP_1:
+ case X86::JNP_1:
+ case X86::JO_1:
+ case X86::JNO_1:
+ FuseKind = FuseTest;
+ break;
+ }
+ switch (First->getOpcode()) {
+ default:
+ return false;
+ case X86::TEST8rr:
+ case X86::TEST16rr:
+ case X86::TEST32rr:
+ case X86::TEST64rr:
+ case X86::TEST8ri:
+ case X86::TEST16ri:
+ case X86::TEST32ri:
+ case X86::TEST32i32:
+ case X86::TEST64i32:
+ case X86::TEST64ri32:
+ case X86::TEST8rm:
+ case X86::TEST16rm:
+ case X86::TEST32rm:
+ case X86::TEST64rm:
+ case X86::TEST8ri_NOREX:
+ case X86::AND16i16:
+ case X86::AND16ri:
+ case X86::AND16ri8:
+ case X86::AND16rm:
+ case X86::AND16rr:
+ case X86::AND32i32:
+ case X86::AND32ri:
+ case X86::AND32ri8:
+ case X86::AND32rm:
+ case X86::AND32rr:
+ case X86::AND64i32:
+ case X86::AND64ri32:
+ case X86::AND64ri8:
+ case X86::AND64rm:
+ case X86::AND64rr:
+ case X86::AND8i8:
+ case X86::AND8ri:
+ case X86::AND8rm:
+ case X86::AND8rr:
+ return true;
+ case X86::CMP16i16:
+ case X86::CMP16ri:
+ case X86::CMP16ri8:
+ case X86::CMP16rm:
+ case X86::CMP16rr:
+ case X86::CMP32i32:
+ case X86::CMP32ri:
+ case X86::CMP32ri8:
+ case X86::CMP32rm:
+ case X86::CMP32rr:
+ case X86::CMP64i32:
+ case X86::CMP64ri32:
+ case X86::CMP64ri8:
+ case X86::CMP64rm:
+ case X86::CMP64rr:
+ case X86::CMP8i8:
+ case X86::CMP8ri:
+ case X86::CMP8rm:
+ case X86::CMP8rr:
+ case X86::ADD16i16:
+ case X86::ADD16ri:
+ case X86::ADD16ri8:
+ case X86::ADD16ri8_DB:
+ case X86::ADD16ri_DB:
+ case X86::ADD16rm:
+ case X86::ADD16rr:
+ case X86::ADD16rr_DB:
+ case X86::ADD32i32:
+ case X86::ADD32ri:
+ case X86::ADD32ri8:
+ case X86::ADD32ri8_DB:
+ case X86::ADD32ri_DB:
+ case X86::ADD32rm:
+ case X86::ADD32rr:
+ case X86::ADD32rr_DB:
+ case X86::ADD64i32:
+ case X86::ADD64ri32:
+ case X86::ADD64ri32_DB:
+ case X86::ADD64ri8:
+ case X86::ADD64ri8_DB:
+ case X86::ADD64rm:
+ case X86::ADD64rr:
+ case X86::ADD64rr_DB:
+ case X86::ADD8i8:
+ case X86::ADD8mi:
+ case X86::ADD8mr:
+ case X86::ADD8ri:
+ case X86::ADD8rm:
+ case X86::ADD8rr:
+ case X86::SUB16i16:
+ case X86::SUB16ri:
+ case X86::SUB16ri8:
+ case X86::SUB16rm:
+ case X86::SUB16rr:
+ case X86::SUB32i32:
+ case X86::SUB32ri:
+ case X86::SUB32ri8:
+ case X86::SUB32rm:
+ case X86::SUB32rr:
+ case X86::SUB64i32:
+ case X86::SUB64ri32:
+ case X86::SUB64ri8:
+ case X86::SUB64rm:
+ case X86::SUB64rr:
+ case X86::SUB8i8:
+ case X86::SUB8ri:
+ case X86::SUB8rm:
+ case X86::SUB8rr:
+ return FuseKind == FuseCmp || FuseKind == FuseInc;
+ case X86::INC16r:
+ case X86::INC32r:
+ case X86::INC64r:
+ case X86::INC8r:
+ case X86::DEC16r:
+ case X86::DEC32r:
+ case X86::DEC64r:
+ case X86::DEC8r:
+ return FuseKind == FuseInc;
+ }
+}
+
+bool X86InstrInfo::
+ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
+ assert(Cond.size() == 1 && "Invalid X86 branch condition!");
+ X86::CondCode CC = static_cast<X86::CondCode>(Cond[0].getImm());
+ if (CC == X86::COND_NE_OR_P || CC == X86::COND_NP_OR_E)
+ return true;
+ Cond[0].setImm(GetOppositeBranchCondition(CC));
+ return false;
+}
+
+bool X86InstrInfo::
+isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const {
+ // FIXME: Return false for x87 stack register classes for now. We can't
+ // allow any loads of these registers before FpGet_ST0_80.
+ return !(RC == &X86::CCRRegClass || RC == &X86::RFP32RegClass ||
+ RC == &X86::RFP64RegClass || RC == &X86::RFP80RegClass);
+}
+
+/// Return a virtual register initialized with the global base register
+/// value. Output instructions required to initialize the register in the
+/// function entry block, if necessary.
+///
+/// TODO: Eliminate this and move the code to X86MachineFunctionInfo.
+///
+unsigned X86InstrInfo::getGlobalBaseReg(MachineFunction *MF) const {
+ assert(!Subtarget.is64Bit() &&
+ "X86-64 PIC uses RIP relative addressing");
+
+ X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
+ unsigned GlobalBaseReg = X86FI->getGlobalBaseReg();
+ if (GlobalBaseReg != 0)
+ return GlobalBaseReg;
+
+ // Create the register. The code to initialize it is inserted
+ // later, by the CGBR pass (below).
+ MachineRegisterInfo &RegInfo = MF->getRegInfo();
+ GlobalBaseReg = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
+ X86FI->setGlobalBaseReg(GlobalBaseReg);
+ return GlobalBaseReg;
+}
+
+// These are the replaceable SSE instructions. Some of these have Int variants
+// that we don't include here. We don't want to replace instructions selected
+// by intrinsics.
+static const uint16_t ReplaceableInstrs[][3] = {
+ //PackedSingle PackedDouble PackedInt
+ { X86::MOVAPSmr, X86::MOVAPDmr, X86::MOVDQAmr },
+ { X86::MOVAPSrm, X86::MOVAPDrm, X86::MOVDQArm },
+ { X86::MOVAPSrr, X86::MOVAPDrr, X86::MOVDQArr },
+ { X86::MOVUPSmr, X86::MOVUPDmr, X86::MOVDQUmr },
+ { X86::MOVUPSrm, X86::MOVUPDrm, X86::MOVDQUrm },
+ { X86::MOVLPSmr, X86::MOVLPDmr, X86::MOVPQI2QImr },
+ { X86::MOVNTPSmr, X86::MOVNTPDmr, X86::MOVNTDQmr },
+ { X86::ANDNPSrm, X86::ANDNPDrm, X86::PANDNrm },
+ { X86::ANDNPSrr, X86::ANDNPDrr, X86::PANDNrr },
+ { X86::ANDPSrm, X86::ANDPDrm, X86::PANDrm },
+ { X86::ANDPSrr, X86::ANDPDrr, X86::PANDrr },
+ { X86::ORPSrm, X86::ORPDrm, X86::PORrm },
+ { X86::ORPSrr, X86::ORPDrr, X86::PORrr },
+ { X86::XORPSrm, X86::XORPDrm, X86::PXORrm },
+ { X86::XORPSrr, X86::XORPDrr, X86::PXORrr },
+ // AVX 128-bit support
+ { X86::VMOVAPSmr, X86::VMOVAPDmr, X86::VMOVDQAmr },
+ { X86::VMOVAPSrm, X86::VMOVAPDrm, X86::VMOVDQArm },
+ { X86::VMOVAPSrr, X86::VMOVAPDrr, X86::VMOVDQArr },
+ { X86::VMOVUPSmr, X86::VMOVUPDmr, X86::VMOVDQUmr },
+ { X86::VMOVUPSrm, X86::VMOVUPDrm, X86::VMOVDQUrm },
+ { X86::VMOVLPSmr, X86::VMOVLPDmr, X86::VMOVPQI2QImr },
+ { X86::VMOVNTPSmr, X86::VMOVNTPDmr, X86::VMOVNTDQmr },
+ { X86::VANDNPSrm, X86::VANDNPDrm, X86::VPANDNrm },
+ { X86::VANDNPSrr, X86::VANDNPDrr, X86::VPANDNrr },
+ { X86::VANDPSrm, X86::VANDPDrm, X86::VPANDrm },
+ { X86::VANDPSrr, X86::VANDPDrr, X86::VPANDrr },
+ { X86::VORPSrm, X86::VORPDrm, X86::VPORrm },
+ { X86::VORPSrr, X86::VORPDrr, X86::VPORrr },
+ { X86::VXORPSrm, X86::VXORPDrm, X86::VPXORrm },
+ { X86::VXORPSrr, X86::VXORPDrr, X86::VPXORrr },
+ // AVX 256-bit support
+ { X86::VMOVAPSYmr, X86::VMOVAPDYmr, X86::VMOVDQAYmr },
+ { X86::VMOVAPSYrm, X86::VMOVAPDYrm, X86::VMOVDQAYrm },
+ { X86::VMOVAPSYrr, X86::VMOVAPDYrr, X86::VMOVDQAYrr },
+ { X86::VMOVUPSYmr, X86::VMOVUPDYmr, X86::VMOVDQUYmr },
+ { X86::VMOVUPSYrm, X86::VMOVUPDYrm, X86::VMOVDQUYrm },
+ { X86::VMOVNTPSYmr, X86::VMOVNTPDYmr, X86::VMOVNTDQYmr }
+};
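+
+// For example, the row { X86::MOVAPSrr, X86::MOVAPDrr, X86::MOVDQArr } above
+// says the float-domain "movaps %xmm0, %xmm1" may be rewritten as "movapd"
+// or the integer-domain "movdqa" to avoid a domain-crossing penalty.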
+
+static const uint16_t ReplaceableInstrsAVX2[][3] = {
+ //PackedSingle PackedDouble PackedInt
+ { X86::VANDNPSYrm, X86::VANDNPDYrm, X86::VPANDNYrm },
+ { X86::VANDNPSYrr, X86::VANDNPDYrr, X86::VPANDNYrr },
+ { X86::VANDPSYrm, X86::VANDPDYrm, X86::VPANDYrm },
+ { X86::VANDPSYrr, X86::VANDPDYrr, X86::VPANDYrr },
+ { X86::VORPSYrm, X86::VORPDYrm, X86::VPORYrm },
+ { X86::VORPSYrr, X86::VORPDYrr, X86::VPORYrr },
+ { X86::VXORPSYrm, X86::VXORPDYrm, X86::VPXORYrm },
+ { X86::VXORPSYrr, X86::VXORPDYrr, X86::VPXORYrr },
+ { X86::VEXTRACTF128mr, X86::VEXTRACTF128mr, X86::VEXTRACTI128mr },
+ { X86::VEXTRACTF128rr, X86::VEXTRACTF128rr, X86::VEXTRACTI128rr },
+ { X86::VINSERTF128rm, X86::VINSERTF128rm, X86::VINSERTI128rm },
+ { X86::VINSERTF128rr, X86::VINSERTF128rr, X86::VINSERTI128rr },
+ { X86::VPERM2F128rm, X86::VPERM2F128rm, X86::VPERM2I128rm },
+ { X86::VPERM2F128rr, X86::VPERM2F128rr, X86::VPERM2I128rr },
+ { X86::VBROADCASTSSrm, X86::VBROADCASTSSrm, X86::VPBROADCASTDrm},
+ { X86::VBROADCASTSSrr, X86::VBROADCASTSSrr, X86::VPBROADCASTDrr},
+ { X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrr, X86::VPBROADCASTDYrr},
+ { X86::VBROADCASTSSYrm, X86::VBROADCASTSSYrm, X86::VPBROADCASTDYrm},
+ { X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrr, X86::VPBROADCASTQYrr},
+ { X86::VBROADCASTSDYrm, X86::VBROADCASTSDYrm, X86::VPBROADCASTQYrm}
+};
+
+// FIXME: Some shuffle and unpack instructions have equivalents in different
+// domains, but they require a bit more work than just switching opcodes.
+
+static const uint16_t *lookup(unsigned opcode, unsigned domain) {
+ for (const uint16_t (&Row)[3] : ReplaceableInstrs)
+ if (Row[domain-1] == opcode)
+ return Row;
+ return nullptr;
+}
+
+static const uint16_t *lookupAVX2(unsigned opcode, unsigned domain) {
+ for (const uint16_t (&Row)[3] : ReplaceableInstrsAVX2)
+ if (Row[domain-1] == opcode)
+ return Row;
+ return nullptr;
+}
+
+std::pair<uint16_t, uint16_t>
+X86InstrInfo::getExecutionDomain(const MachineInstr *MI) const {
+ uint16_t domain = (MI->getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
+ bool hasAVX2 = Subtarget.hasAVX2();
+ uint16_t validDomains = 0;
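+ // Bit d of validDomains marks execution domain d as legal: 0xe allows all
+ // three packed domains (PS=1, PD=2, PInt=3), while 0x6 excludes the packed
+ // integer domain for the AVX2-only rows when AVX2 is unavailable.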
+ if (domain && lookup(MI->getOpcode(), domain))
+ validDomains = 0xe;
+ else if (domain && lookupAVX2(MI->getOpcode(), domain))
+ validDomains = hasAVX2 ? 0xe : 0x6;
+ return std::make_pair(domain, validDomains);
+}
+
+void X86InstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const {
+ assert(Domain > 0 && Domain < 4 && "Invalid execution domain");
+ uint16_t dom = (MI->getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
+ assert(dom && "Not an SSE instruction");
+ const uint16_t *table = lookup(MI->getOpcode(), dom);
+ if (!table) { // try the other table
+ assert((Subtarget.hasAVX2() || Domain < 3) &&
+ "256-bit vector operations only available in AVX2");
+ table = lookupAVX2(MI->getOpcode(), dom);
+ }
+ assert(table && "Cannot change domain");
+ MI->setDesc(get(table[Domain-1]));
+}
+
+/// Provide the noop instruction to use: the one-byte NOP (0x90).
+void X86InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const {
+ NopInst.setOpcode(X86::NOOP);
+}
+
+// This code must remain in sync with getJumpInstrTableEntryBound in this class!
+// In particular, getJumpInstrTableEntryBound must always return an upper bound
+// on the encoding lengths of the instructions generated by
+// getUnconditionalBranch and getTrap.
+void X86InstrInfo::getUnconditionalBranch(
+ MCInst &Branch, const MCSymbolRefExpr *BranchTarget) const {
+ Branch.setOpcode(X86::JMP_1);
+ Branch.addOperand(MCOperand::createExpr(BranchTarget));
+}
+
+// This code must remain in sync with getJumpInstrTableEntryBound in this class!
+// In particular, getJumpInstrTableEntryBound must always return an upper bound
+// on the encoding lengths of the instructions generated by
+// getUnconditionalBranch and getTrap.
+void X86InstrInfo::getTrap(MCInst &MI) const {
+ MI.setOpcode(X86::TRAP);
+}
+
+// See getTrap and getUnconditionalBranch for conditions on the value returned
+// by this function.
+unsigned X86InstrInfo::getJumpInstrTableEntryBound() const {
+ // 5 bytes suffice: JMP_4 Symbol@PLT uses 1 byte (E9) for the opcode and 4
+ // bytes for the symbol offset, and TRAP is ud2, which is two bytes (0F 0B).
+ return 5;
+}
+
+bool X86InstrInfo::isHighLatencyDef(int opc) const {
+ switch (opc) {
+ default: return false;
+ case X86::DIVSDrm:
+ case X86::DIVSDrm_Int:
+ case X86::DIVSDrr:
+ case X86::DIVSDrr_Int:
+ case X86::DIVSSrm:
+ case X86::DIVSSrm_Int:
+ case X86::DIVSSrr:
+ case X86::DIVSSrr_Int:
+ case X86::SQRTPDm:
+ case X86::SQRTPDr:
+ case X86::SQRTPSm:
+ case X86::SQRTPSr:
+ case X86::SQRTSDm:
+ case X86::SQRTSDm_Int:
+ case X86::SQRTSDr:
+ case X86::SQRTSDr_Int:
+ case X86::SQRTSSm:
+ case X86::SQRTSSm_Int:
+ case X86::SQRTSSr:
+ case X86::SQRTSSr_Int:
+ // AVX instructions with high latency
+ case X86::VDIVSDrm:
+ case X86::VDIVSDrm_Int:
+ case X86::VDIVSDrr:
+ case X86::VDIVSDrr_Int:
+ case X86::VDIVSSrm:
+ case X86::VDIVSSrm_Int:
+ case X86::VDIVSSrr:
+ case X86::VDIVSSrr_Int:
+ case X86::VSQRTPDm:
+ case X86::VSQRTPDr:
+ case X86::VSQRTPSm:
+ case X86::VSQRTPSr:
+ case X86::VSQRTSDm:
+ case X86::VSQRTSDm_Int:
+ case X86::VSQRTSDr:
+ case X86::VSQRTSSm:
+ case X86::VSQRTSSm_Int:
+ case X86::VSQRTSSr:
+ case X86::VSQRTPDZm:
+ case X86::VSQRTPDZr:
+ case X86::VSQRTPSZm:
+ case X86::VSQRTPSZr:
+ case X86::VSQRTSDZm:
+ case X86::VSQRTSDZm_Int:
+ case X86::VSQRTSDZr:
+ case X86::VSQRTSSZm_Int:
+ case X86::VSQRTSSZr:
+ case X86::VSQRTSSZm:
+ case X86::VDIVSDZrm:
+ case X86::VDIVSDZrr:
+ case X86::VDIVSSZrm:
+ case X86::VDIVSSZrr:
+
+ case X86::VGATHERQPSZrm:
+ case X86::VGATHERQPDZrm:
+ case X86::VGATHERDPDZrm:
+ case X86::VGATHERDPSZrm:
+ case X86::VPGATHERQDZrm:
+ case X86::VPGATHERQQZrm:
+ case X86::VPGATHERDDZrm:
+ case X86::VPGATHERDQZrm:
+ case X86::VSCATTERQPDZmr:
+ case X86::VSCATTERQPSZmr:
+ case X86::VSCATTERDPDZmr:
+ case X86::VSCATTERDPSZmr:
+ case X86::VPSCATTERQDZmr:
+ case X86::VPSCATTERQQZmr:
+ case X86::VPSCATTERDDZmr:
+ case X86::VPSCATTERDQZmr:
+ return true;
+ }
+}
+
+bool X86InstrInfo::
+hasHighOperandLatency(const TargetSchedModel &SchedModel,
+ const MachineRegisterInfo *MRI,
+ const MachineInstr *DefMI, unsigned DefIdx,
+ const MachineInstr *UseMI, unsigned UseIdx) const {
+ return isHighLatencyDef(DefMI->getOpcode());
+}
+
+bool X86InstrInfo::hasReassociableOperands(const MachineInstr &Inst,
+ const MachineBasicBlock *MBB) const {
+ assert((Inst.getNumOperands() == 3 || Inst.getNumOperands() == 4) &&
+ "Reassociation needs binary operators");
+
+ // Integer binary math/logic instructions have a third source operand:
+ // the EFLAGS register. That operand must be both defined here and never
+ // used; i.e., it must be dead. If the EFLAGS operand is live, then we
+ // cannot change anything because rearranging the operands could affect other
+ // instructions that depend on the exact status flags (zero, sign, etc.)
+ // that are set by using these particular operands with this operation.
+ if (Inst.getNumOperands() == 4) {
+ assert(Inst.getOperand(3).isReg() &&
+ Inst.getOperand(3).getReg() == X86::EFLAGS &&
+ "Unexpected operand in reassociable instruction");
+ if (!Inst.getOperand(3).isDead())
+ return false;
+ }
+
+ return TargetInstrInfo::hasReassociableOperands(Inst, MBB);
+}
+
+// TODO: There are many more machine instruction opcodes to match:
+// 1. Other data types (integer, vectors)
+// 2. Other math / logic operations (xor, or)
+// 3. Other forms of the same operation (intrinsics and other variants)
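+// For example, reassociating ((a * b) * c) * d into (a * b) * (c * d)
+// shortens the critical path from three dependent multiplies to two; for
+// the FP opcodes matched below this is only legal under UnsafeFPMath.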
+bool X86InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst) const {
+ switch (Inst.getOpcode()) {
+ case X86::AND8rr:
+ case X86::AND16rr:
+ case X86::AND32rr:
+ case X86::AND64rr:
+ case X86::OR8rr:
+ case X86::OR16rr:
+ case X86::OR32rr:
+ case X86::OR64rr:
+ case X86::XOR8rr:
+ case X86::XOR16rr:
+ case X86::XOR32rr:
+ case X86::XOR64rr:
+ case X86::IMUL16rr:
+ case X86::IMUL32rr:
+ case X86::IMUL64rr:
+ case X86::PANDrr:
+ case X86::PORrr:
+ case X86::PXORrr:
+ case X86::VPANDrr:
+ case X86::VPANDYrr:
+ case X86::VPORrr:
+ case X86::VPORYrr:
+ case X86::VPXORrr:
+ case X86::VPXORYrr:
+ // Normal min/max instructions are not commutative because of NaN and signed
+ // zero semantics, but these are. Thus, there's no need to check for global
+ // relaxed math; the instructions themselves have the properties we need.
+ case X86::MAXCPDrr:
+ case X86::MAXCPSrr:
+ case X86::MAXCSDrr:
+ case X86::MAXCSSrr:
+ case X86::MINCPDrr:
+ case X86::MINCPSrr:
+ case X86::MINCSDrr:
+ case X86::MINCSSrr:
+ case X86::VMAXCPDrr:
+ case X86::VMAXCPSrr:
+ case X86::VMAXCPDYrr:
+ case X86::VMAXCPSYrr:
+ case X86::VMAXCSDrr:
+ case X86::VMAXCSSrr:
+ case X86::VMINCPDrr:
+ case X86::VMINCPSrr:
+ case X86::VMINCPDYrr:
+ case X86::VMINCPSYrr:
+ case X86::VMINCSDrr:
+ case X86::VMINCSSrr:
+ return true;
+ case X86::ADDPDrr:
+ case X86::ADDPSrr:
+ case X86::ADDSDrr:
+ case X86::ADDSSrr:
+ case X86::MULPDrr:
+ case X86::MULPSrr:
+ case X86::MULSDrr:
+ case X86::MULSSrr:
+ case X86::VADDPDrr:
+ case X86::VADDPSrr:
+ case X86::VADDPDYrr:
+ case X86::VADDPSYrr:
+ case X86::VADDSDrr:
+ case X86::VADDSSrr:
+ case X86::VMULPDrr:
+ case X86::VMULPSrr:
+ case X86::VMULPDYrr:
+ case X86::VMULPSYrr:
+ case X86::VMULSDrr:
+ case X86::VMULSSrr:
+ return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath;
+ default:
+ return false;
+ }
+}
+
+/// This is an architecture-specific helper function of reassociateOps.
+/// Set special operand attributes for new instructions after reassociation.
+void X86InstrInfo::setSpecialOperandAttr(MachineInstr &OldMI1,
+ MachineInstr &OldMI2,
+ MachineInstr &NewMI1,
+ MachineInstr &NewMI2) const {
+ // Integer instructions define an implicit EFLAGS source register operand as
+ // the third source (fourth total) operand.
+ if (OldMI1.getNumOperands() != 4 || OldMI2.getNumOperands() != 4)
+ return;
+
+ assert(NewMI1.getNumOperands() == 4 && NewMI2.getNumOperands() == 4 &&
+ "Unexpected instruction type for reassociation");
+
+ MachineOperand &OldOp1 = OldMI1.getOperand(3);
+ MachineOperand &OldOp2 = OldMI2.getOperand(3);
+ MachineOperand &NewOp1 = NewMI1.getOperand(3);
+ MachineOperand &NewOp2 = NewMI2.getOperand(3);
+
+ assert(OldOp1.isReg() && OldOp1.getReg() == X86::EFLAGS && OldOp1.isDead() &&
+ "Must have dead EFLAGS operand in reassociable instruction");
+ assert(OldOp2.isReg() && OldOp2.getReg() == X86::EFLAGS && OldOp2.isDead() &&
+ "Must have dead EFLAGS operand in reassociable instruction");
+
+ (void)OldOp1;
+ (void)OldOp2;
+
+ assert(NewOp1.isReg() && NewOp1.getReg() == X86::EFLAGS &&
+ "Unexpected operand in reassociable instruction");
+ assert(NewOp2.isReg() && NewOp2.getReg() == X86::EFLAGS &&
+ "Unexpected operand in reassociable instruction");
+
+ // Mark the new EFLAGS operands as dead to be helpful to subsequent iterations
+ // of this pass or other passes. The EFLAGS operands must be dead in these new
+ // instructions because the EFLAGS operands in the original instructions must
+ // be dead in order for reassociation to occur.
+ NewOp1.setIsDead();
+ NewOp2.setIsDead();
+}
+
+std::pair<unsigned, unsigned>
+X86InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
+ return std::make_pair(TF, 0u);
+}
+
+ArrayRef<std::pair<unsigned, const char *>>
+X86InstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
+ using namespace X86II;
+ static const std::pair<unsigned, const char *> TargetFlags[] = {
+ {MO_GOT_ABSOLUTE_ADDRESS, "x86-got-absolute-address"},
+ {MO_PIC_BASE_OFFSET, "x86-pic-base-offset"},
+ {MO_GOT, "x86-got"},
+ {MO_GOTOFF, "x86-gotoff"},
+ {MO_GOTPCREL, "x86-gotpcrel"},
+ {MO_PLT, "x86-plt"},
+ {MO_TLSGD, "x86-tlsgd"},
+ {MO_TLSLD, "x86-tlsld"},
+ {MO_TLSLDM, "x86-tlsldm"},
+ {MO_GOTTPOFF, "x86-gottpoff"},
+ {MO_INDNTPOFF, "x86-indntpoff"},
+ {MO_TPOFF, "x86-tpoff"},
+ {MO_DTPOFF, "x86-dtpoff"},
+ {MO_NTPOFF, "x86-ntpoff"},
+ {MO_GOTNTPOFF, "x86-gotntpoff"},
+ {MO_DLLIMPORT, "x86-dllimport"},
+ {MO_DARWIN_STUB, "x86-darwin-stub"},
+ {MO_DARWIN_NONLAZY, "x86-darwin-nonlazy"},
+ {MO_DARWIN_NONLAZY_PIC_BASE, "x86-darwin-nonlazy-pic-base"},
+ {MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE, "x86-darwin-hidden-nonlazy-pic-base"},
+ {MO_TLVP, "x86-tlvp"},
+ {MO_TLVP_PIC_BASE, "x86-tlvp-pic-base"},
+ {MO_SECREL, "x86-secrel"}};
+ return makeArrayRef(TargetFlags);
+}
+
+namespace {
+ /// Create Global Base Reg pass. This initializes the PIC
+ /// global base register for x86-32.
+ struct CGBR : public MachineFunctionPass {
+ static char ID;
+ CGBR() : MachineFunctionPass(ID) {}
+
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ const X86TargetMachine *TM =
+ static_cast<const X86TargetMachine *>(&MF.getTarget());
+ const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
+
+ // Don't do anything if this is 64-bit as 64-bit PIC
+ // uses RIP relative addressing.
+ if (STI.is64Bit())
+ return false;
+
+ // Only emit a global base reg in PIC mode.
+ if (TM->getRelocationModel() != Reloc::PIC_)
+ return false;
+
+ X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+ unsigned GlobalBaseReg = X86FI->getGlobalBaseReg();
+
+ // If we didn't need a GlobalBaseReg, don't insert code.
+ if (GlobalBaseReg == 0)
+ return false;
+
+ // Insert the set of GlobalBaseReg into the first MBB of the function
+ MachineBasicBlock &FirstMBB = MF.front();
+ MachineBasicBlock::iterator MBBI = FirstMBB.begin();
+ DebugLoc DL = FirstMBB.findDebugLoc(MBBI);
+ MachineRegisterInfo &RegInfo = MF.getRegInfo();
+ const X86InstrInfo *TII = STI.getInstrInfo();
+
+ unsigned PC;
+ if (STI.isPICStyleGOT())
+ PC = RegInfo.createVirtualRegister(&X86::GR32RegClass);
+ else
+ PC = GlobalBaseReg;
+
+ // The operand of MovePCtoStack is completely ignored by the asm printer;
+ // it's only used in JIT code emission as a displacement to the pc.
+ BuildMI(FirstMBB, MBBI, DL, TII->get(X86::MOVPC32r), PC).addImm(0);
+
+ // If we're using vanilla 'GOT' PIC style, we should use relative
+ // addressing not to the pc, but to the _GLOBAL_OFFSET_TABLE_ external.
+ if (STI.isPICStyleGOT()) {
+ // Generate addl $__GLOBAL_OFFSET_TABLE_ + [.-piclabel], %some_register
+ BuildMI(FirstMBB, MBBI, DL, TII->get(X86::ADD32ri), GlobalBaseReg)
+ .addReg(PC).addExternalSymbol("_GLOBAL_OFFSET_TABLE_",
+ X86II::MO_GOT_ABSOLUTE_ADDRESS);
+ }
+
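+ // The emitted prologue is, roughly:
+ //   calll .L0$pb
+ // .L0$pb:
+ //   popl %reg                                    (MOVPC32r expansion)
+ //   addl $_GLOBAL_OFFSET_TABLE_+(.-.L0$pb), %reg (GOT style only)
+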
+ return true;
+ }
+
+ const char *getPassName() const override {
+ return "X86 PIC Global Base Reg Initialization";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+ };
+}
+
+char CGBR::ID = 0;
+FunctionPass*
+llvm::createX86GlobalBaseRegPass() { return new CGBR(); }
+
+namespace {
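+ /// Clean up local-dynamic TLS accesses: each TLS_base_addr pseudo is a
+ /// call to __tls_get_addr, so this pass keeps the first result in a
+ /// virtual register and rewrites the dominated calls into plain copies.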
+ struct LDTLSCleanup : public MachineFunctionPass {
+ static char ID;
+ LDTLSCleanup() : MachineFunctionPass(ID) {}
+
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ X86MachineFunctionInfo* MFI = MF.getInfo<X86MachineFunctionInfo>();
+ if (MFI->getNumLocalDynamicTLSAccesses() < 2) {
+ // No point folding accesses if there aren't at least two.
+ return false;
+ }
+
+ MachineDominatorTree *DT = &getAnalysis<MachineDominatorTree>();
+ return VisitNode(DT->getRootNode(), 0);
+ }
+
+ // Visit the dominator subtree rooted at Node in pre-order.
+ // If TLSBaseAddrReg is non-zero, then use it to replace any
+ // TLS_base_addr instructions. Otherwise, create the register
+ // when the first such instruction is seen, and then use it
+ // as we encounter more instructions.
+ bool VisitNode(MachineDomTreeNode *Node, unsigned TLSBaseAddrReg) {
+ MachineBasicBlock *BB = Node->getBlock();
+ bool Changed = false;
+
+ // Traverse the current block.
+ for (MachineBasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;
+ ++I) {
+ switch (I->getOpcode()) {
+ case X86::TLS_base_addr32:
+ case X86::TLS_base_addr64:
+ if (TLSBaseAddrReg)
+ I = ReplaceTLSBaseAddrCall(I, TLSBaseAddrReg);
+ else
+ I = SetRegister(I, &TLSBaseAddrReg);
+ Changed = true;
+ break;
+ default:
+ break;
+ }
+ }
+
+ // Visit the children of this block in the dominator tree.
+ for (MachineDomTreeNode::iterator I = Node->begin(), E = Node->end();
+ I != E; ++I) {
+ Changed |= VisitNode(*I, TLSBaseAddrReg);
+ }
+
+ return Changed;
+ }
+
+ // Replace the TLS_base_addr instruction I with a copy from
+ // TLSBaseAddrReg, returning the new instruction.
+ MachineInstr *ReplaceTLSBaseAddrCall(MachineInstr *I,
+ unsigned TLSBaseAddrReg) {
+ MachineFunction *MF = I->getParent()->getParent();
+ const X86Subtarget &STI = MF->getSubtarget<X86Subtarget>();
+ const bool is64Bit = STI.is64Bit();
+ const X86InstrInfo *TII = STI.getInstrInfo();
+
+ // Insert a Copy from TLSBaseAddrReg to RAX/EAX.
+ MachineInstr *Copy = BuildMI(*I->getParent(), I, I->getDebugLoc(),
+ TII->get(TargetOpcode::COPY),
+ is64Bit ? X86::RAX : X86::EAX)
+ .addReg(TLSBaseAddrReg);
+
+ // Erase the TLS_base_addr instruction.
+ I->eraseFromParent();
+
+ return Copy;
+ }
+
+ // Create a virtual register in *TLSBaseAddrReg, and populate it by
+ // inserting a copy instruction after I. Returns the new instruction.
+ MachineInstr *SetRegister(MachineInstr *I, unsigned *TLSBaseAddrReg) {
+ MachineFunction *MF = I->getParent()->getParent();
+ const X86Subtarget &STI = MF->getSubtarget<X86Subtarget>();
+ const bool is64Bit = STI.is64Bit();
+ const X86InstrInfo *TII = STI.getInstrInfo();
+
+ // Create a virtual register for the TLS base address.
+ MachineRegisterInfo &RegInfo = MF->getRegInfo();
+ *TLSBaseAddrReg = RegInfo.createVirtualRegister(is64Bit
+ ? &X86::GR64RegClass
+ : &X86::GR32RegClass);
+
+ // Insert a copy from RAX/EAX to TLSBaseAddrReg.
+ MachineInstr *Next = I->getNextNode();
+ MachineInstr *Copy = BuildMI(*I->getParent(), Next, I->getDebugLoc(),
+ TII->get(TargetOpcode::COPY),
+ *TLSBaseAddrReg)
+ .addReg(is64Bit ? X86::RAX : X86::EAX);
+
+ return Copy;
+ }
+
+ const char *getPassName() const override {
+ return "Local Dynamic TLS Access Clean-up";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<MachineDominatorTree>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+ };
+}
+
+char LDTLSCleanup::ID = 0;
+FunctionPass*
+llvm::createCleanupLocalDynamicTLSPass() { return new LDTLSCleanup(); }
diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.h b/contrib/llvm/lib/Target/X86/X86InstrInfo.h
new file mode 100644
index 0000000..edd09d6
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.h
@@ -0,0 +1,571 @@
+//===-- X86InstrInfo.h - X86 Instruction Information ------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86INSTRINFO_H
+#define LLVM_LIB_TARGET_X86_X86INSTRINFO_H
+
+#include "MCTargetDesc/X86BaseInfo.h"
+#include "X86RegisterInfo.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+#define GET_INSTRINFO_HEADER
+#include "X86GenInstrInfo.inc"
+
+namespace llvm {
+ class X86RegisterInfo;
+ class X86Subtarget;
+
+namespace X86 {
+ // X86-specific condition codes. These correspond to X86_*_COND in
+ // X86InstrInfo.td. They must be kept in sync.
+ enum CondCode {
+ COND_A = 0,
+ COND_AE = 1,
+ COND_B = 2,
+ COND_BE = 3,
+ COND_E = 4,
+ COND_G = 5,
+ COND_GE = 6,
+ COND_L = 7,
+ COND_LE = 8,
+ COND_NE = 9,
+ COND_NO = 10,
+ COND_NP = 11,
+ COND_NS = 12,
+ COND_O = 13,
+ COND_P = 14,
+ COND_S = 15,
+ LAST_VALID_COND = COND_S,
+
+ // Artificial condition codes. These are used by AnalyzeBranch
+ // to indicate a block terminated with two conditional branches to
+ // the same location. This occurs in code using FCMP_OEQ or FCMP_UNE,
+ // which can't be represented on x86 with a single condition. These
+ // are never used in MachineInstrs.
+ COND_NE_OR_P,
+ COND_NP_OR_E,
+
+ COND_INVALID
+ };
+
+ // Turn condition code into conditional branch opcode.
+ unsigned GetCondBranchFromCond(CondCode CC);
+
+ /// \brief Return a set opcode for the given condition and whether it has
+ /// a memory operand.
+ unsigned getSETFromCond(CondCode CC, bool HasMemoryOperand = false);
+
+ /// \brief Return a cmov opcode for the given condition, register size in
+ /// bytes, and operand type.
+ unsigned getCMovFromCond(CondCode CC, unsigned RegBytes,
+ bool HasMemoryOperand = false);
+
+ // Turn CMov opcode into condition code.
+ CondCode getCondFromCMovOpc(unsigned Opc);
+
+ /// GetOppositeBranchCondition - Return the inverse of the specified cond,
+ /// e.g. turning COND_E to COND_NE.
+ CondCode GetOppositeBranchCondition(CondCode CC);
+} // end namespace X86
+
+
+/// isGlobalStubReference - Return true if the specified TargetFlag operand is
+/// a reference to a stub for a global, not the global itself.
+inline static bool isGlobalStubReference(unsigned char TargetFlag) {
+ switch (TargetFlag) {
+ case X86II::MO_DLLIMPORT: // dllimport stub.
+ case X86II::MO_GOTPCREL: // rip-relative GOT reference.
+ case X86II::MO_GOT: // normal GOT reference.
+ case X86II::MO_DARWIN_NONLAZY_PIC_BASE: // Normal $non_lazy_ptr ref.
+ case X86II::MO_DARWIN_NONLAZY: // Normal $non_lazy_ptr ref.
+ case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE: // Hidden $non_lazy_ptr ref.
+ return true;
+ default:
+ return false;
+ }
+}
+
+/// isGlobalRelativeToPICBase - Return true if the specified global value
+/// reference is relative to a 32-bit PIC base (X86ISD::GlobalBaseReg). If this
+/// is true, the addressing mode has the PIC base register added in (e.g. EBX).
+inline static bool isGlobalRelativeToPICBase(unsigned char TargetFlag) {
+ switch (TargetFlag) {
+ case X86II::MO_GOTOFF: // isPICStyleGOT: local global.
+ case X86II::MO_GOT: // isPICStyleGOT: other global.
+ case X86II::MO_PIC_BASE_OFFSET: // Darwin local global.
+ case X86II::MO_DARWIN_NONLAZY_PIC_BASE: // Darwin/32 external global.
+ case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE: // Darwin/32 hidden global.
+ case X86II::MO_TLVP: // ??? Pretty sure..
+ return true;
+ default:
+ return false;
+ }
+}
+
+inline static bool isScale(const MachineOperand &MO) {
+ return MO.isImm() &&
+ (MO.getImm() == 1 || MO.getImm() == 2 ||
+ MO.getImm() == 4 || MO.getImm() == 8);
+}
+
+inline static bool isLeaMem(const MachineInstr *MI, unsigned Op) {
+ if (MI->getOperand(Op).isFI()) return true;
+ return Op+X86::AddrSegmentReg <= MI->getNumOperands() &&
+ MI->getOperand(Op+X86::AddrBaseReg).isReg() &&
+ isScale(MI->getOperand(Op+X86::AddrScaleAmt)) &&
+ MI->getOperand(Op+X86::AddrIndexReg).isReg() &&
+ (MI->getOperand(Op+X86::AddrDisp).isImm() ||
+ MI->getOperand(Op+X86::AddrDisp).isGlobal() ||
+ MI->getOperand(Op+X86::AddrDisp).isCPI() ||
+ MI->getOperand(Op+X86::AddrDisp).isJTI());
+}
+
+inline static bool isMem(const MachineInstr *MI, unsigned Op) {
+ if (MI->getOperand(Op).isFI()) return true;
+ return Op+X86::AddrNumOperands <= MI->getNumOperands() &&
+ MI->getOperand(Op+X86::AddrSegmentReg).isReg() &&
+ isLeaMem(MI, Op);
+}
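+// For illustration (a sketch, not part of the interface): a reference such as
+// [rbx + 4*rcx + 16] occupies five consecutive operands starting at index Op,
+// which the helpers above address symbolically:
+//   MI->getOperand(Op + X86::AddrBaseReg)    // base register (rbx)
+//   MI->getOperand(Op + X86::AddrScaleAmt)   // scale immediate (4)
+//   MI->getOperand(Op + X86::AddrIndexReg)   // index register (rcx)
+//   MI->getOperand(Op + X86::AddrDisp)       // displacement (16)
+//   MI->getOperand(Op + X86::AddrSegmentReg) // segment register, if any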
+
+class X86InstrInfo final : public X86GenInstrInfo {
+ X86Subtarget &Subtarget;
+ const X86RegisterInfo RI;
+
+ /// RegOp2MemOpTable3Addr, RegOp2MemOpTable0, RegOp2MemOpTable1,
+ /// RegOp2MemOpTable2, RegOp2MemOpTable3 - Load / store folding opcode maps.
+ ///
+ typedef DenseMap<unsigned,
+ std::pair<unsigned, unsigned> > RegOp2MemOpTableType;
+ RegOp2MemOpTableType RegOp2MemOpTable2Addr;
+ RegOp2MemOpTableType RegOp2MemOpTable0;
+ RegOp2MemOpTableType RegOp2MemOpTable1;
+ RegOp2MemOpTableType RegOp2MemOpTable2;
+ RegOp2MemOpTableType RegOp2MemOpTable3;
+ RegOp2MemOpTableType RegOp2MemOpTable4;
+
+ /// MemOp2RegOpTable - Load / store unfolding opcode map.
+ ///
+ typedef DenseMap<unsigned,
+ std::pair<unsigned, unsigned> > MemOp2RegOpTableType;
+ MemOp2RegOpTableType MemOp2RegOpTable;
+
+ static void AddTableEntry(RegOp2MemOpTableType &R2MTable,
+ MemOp2RegOpTableType &M2RTable,
+ unsigned RegOp, unsigned MemOp, unsigned Flags);
+
+ virtual void anchor();
+
+ bool AnalyzeBranchImpl(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ SmallVectorImpl<MachineInstr *> &CondBranches,
+ bool AllowModify) const;
+
+public:
+ explicit X86InstrInfo(X86Subtarget &STI);
+
+ /// getRegisterInfo - TargetInstrInfo is a superset of MRegisterInfo. As
+ /// such, whenever a client has an instance of instruction info, it should
+ /// always be able to get register info as well (through this method).
+ ///
+ const X86RegisterInfo &getRegisterInfo() const { return RI; }
+
+ /// getSPAdjust - This returns the stack pointer adjustment made by
+ /// this instruction. For x86, we need to handle more complex call
+ /// sequences involving PUSHes.
+ int getSPAdjust(const MachineInstr *MI) const override;
+
+ /// isCoalescableExtInstr - Return true if the instruction is a "coalescable"
+ /// extension instruction. That is, it's like a copy where it's legal for the
+ /// source to overlap the destination. e.g. X86::MOVSX64rr32. If this returns
+ /// true, then it's expected the pre-extension value is available as a subreg
+ /// of the result register. This also returns the sub-register index in
+ /// SubIdx.
+ bool isCoalescableExtInstr(const MachineInstr &MI,
+ unsigned &SrcReg, unsigned &DstReg,
+ unsigned &SubIdx) const override;
+
+ unsigned isLoadFromStackSlot(const MachineInstr *MI,
+ int &FrameIndex) const override;
+ /// isLoadFromStackSlotPostFE - Check for post-frame ptr elimination
+ /// stack locations as well. This uses a heuristic so it isn't
+ /// reliable for correctness.
+ unsigned isLoadFromStackSlotPostFE(const MachineInstr *MI,
+ int &FrameIndex) const override;
+
+ unsigned isStoreToStackSlot(const MachineInstr *MI,
+ int &FrameIndex) const override;
+ /// isStoreToStackSlotPostFE - Check for post-frame ptr elimination
+ /// stack locations as well. This uses a heuristic so it isn't
+ /// reliable for correctness.
+ unsigned isStoreToStackSlotPostFE(const MachineInstr *MI,
+ int &FrameIndex) const override;
+
+ bool isReallyTriviallyReMaterializable(const MachineInstr *MI,
+ AliasAnalysis *AA) const override;
+ void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ unsigned DestReg, unsigned SubIdx,
+ const MachineInstr *Orig,
+ const TargetRegisterInfo &TRI) const override;
+
+ /// Given an operand within a MachineInstr, insert preceding code to put it
+ /// into the right format for a particular kind of LEA instruction. This may
+ /// involve using an appropriate super-register instead (with an implicit use
+ /// of the original) or creating a new virtual register and inserting COPY
+ /// instructions to get the data into the right class.
+ ///
+ /// Reference parameters are set to indicate how caller should add this
+ /// operand to the LEA instruction.
+ bool classifyLEAReg(MachineInstr *MI, const MachineOperand &Src,
+ unsigned LEAOpcode, bool AllowSP,
+ unsigned &NewSrc, bool &isKill,
+ bool &isUndef, MachineOperand &ImplicitOp) const;
+
+ /// convertToThreeAddress - This method must be implemented by targets that
+ /// set the M_CONVERTIBLE_TO_3_ADDR flag. When this flag is set, the target
+ /// may be able to convert a two-address instruction into a true
+ /// three-address instruction on demand. This allows the X86 target (for
+ /// example) to convert ADD and SHL instructions into LEA instructions if they
+ /// would require register copies due to two-addressness.
+ ///
+ /// This method returns a null pointer if the transformation cannot be
+ /// performed, otherwise it returns the new instruction.
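+ /// A sketch of the kind of rewrite performed (register details elided):
+ /// the two-address
+ ///   %reg = ADD32rr %reg, %other
+ /// may become the three-address
+ ///   %new = LEA32r %reg, 1, %other, 0, %noreg
+ /// which frees the destination from being tied to the first source.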
+ ///
+ MachineInstr *convertToThreeAddress(MachineFunction::iterator &MFI,
+ MachineBasicBlock::iterator &MBBI,
+ LiveVariables *LV) const override;
+
+ /// Returns true iff the routine could find two commutable operands in the
+ /// given machine instruction.
+ /// The 'SrcOpIdx1' and 'SrcOpIdx2' are INPUT and OUTPUT arguments. Their
+ /// input values may be redefined by this method only when they are not
+ /// pre-defined, which is designated by assigning them the special value
+ /// 'CommuteAnyOperandIndex'.
+ /// If both indices are pre-defined and refer to some operands, then the
+ /// method simply returns true if the corresponding operands are commutable
+ /// and returns false otherwise.
+ ///
+ /// For example, calling this method this way:
+ /// unsigned Op1 = 1, Op2 = CommuteAnyOperandIndex;
+ /// findCommutedOpIndices(MI, Op1, Op2);
+ /// can be interpreted as a query asking to find an operand that would be
+ /// commutable with operand #1.
+ bool findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1,
+ unsigned &SrcOpIdx2) const override;
+
+ /// Returns true if the routine could find two commutable operands
+ /// in the given FMA instruction. Otherwise, returns false.
+ ///
+ /// \p SrcOpIdx1 and \p SrcOpIdx2 are INPUT and OUTPUT arguments.
+ /// The output indices of the commuted operands are returned in these
+ /// arguments. Also, the input values of these arguments may be preset either
+ /// to indices of operands that must be commuted or be equal to a special
+ /// value 'CommuteAnyOperandIndex' which means that the corresponding
+ /// operand index is not set and this method is free to pick any of
+ /// available commutable operands.
+ ///
+ /// For example, calling this method this way:
+ /// unsigned Idx1 = 1, Idx2 = CommuteAnyOperandIndex;
+ /// findFMA3CommutedOpIndices(MI, Idx1, Idx2);
+ /// can be interpreted as a query asking if operand #1 can be swapped
+ /// with any other available operand (e.g. operand #2, operand #3, etc.).
+ ///
+ /// The returned FMA opcode may differ from the opcode in the given MI.
+ /// For example, commuting the operands #1 and #3 in the following FMA
+ /// FMA213 #1, #2, #3
+ /// results in an instruction with the adjusted opcode:
+ /// FMA231 #3, #2, #1
+ bool findFMA3CommutedOpIndices(MachineInstr *MI,
+ unsigned &SrcOpIdx1,
+ unsigned &SrcOpIdx2) const;
+
+ /// Returns an adjusted FMA opcode that must be used in an FMA instruction
+ /// that performs the same computations as the given MI but which has the
+ /// operands \p SrcOpIdx1 and \p SrcOpIdx2 commuted.
+ /// It may return 0 if it is unsafe to commute the operands.
+ ///
+ /// The returned FMA opcode may differ from the opcode in the given \p MI.
+ /// For example, commuting the operands #1 and #3 in the following FMA
+ /// FMA213 #1, #2, #3
+ /// results in an instruction with the adjusted opcode:
+ /// FMA231 #3, #2, #1
+ unsigned getFMA3OpcodeToCommuteOperands(MachineInstr *MI,
+ unsigned SrcOpIdx1,
+ unsigned SrcOpIdx2) const;
+
+ // Branch analysis.
+ bool isUnpredicatedTerminator(const MachineInstr* MI) const override;
+ bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const override;
+
+ bool getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg,
+ unsigned &Offset,
+ const TargetRegisterInfo *TRI) const override;
+ bool AnalyzeBranchPredicate(MachineBasicBlock &MBB,
+ TargetInstrInfo::MachineBranchPredicate &MBP,
+ bool AllowModify = false) const override;
+
+ unsigned RemoveBranch(MachineBasicBlock &MBB) const override;
+ unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
+ DebugLoc DL) const override;
+ bool canInsertSelect(const MachineBasicBlock&, ArrayRef<MachineOperand> Cond,
+ unsigned, unsigned, int&, int&, int&) const override;
+ void insertSelect(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI, DebugLoc DL,
+ unsigned DstReg, ArrayRef<MachineOperand> Cond,
+ unsigned TrueReg, unsigned FalseReg) const override;
+ void copyPhysReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI, DebugLoc DL,
+ unsigned DestReg, unsigned SrcReg,
+ bool KillSrc) const override;
+ void storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned SrcReg, bool isKill, int FrameIndex,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const override;
+
+ void storeRegToAddr(MachineFunction &MF, unsigned SrcReg, bool isKill,
+ SmallVectorImpl<MachineOperand> &Addr,
+ const TargetRegisterClass *RC,
+ MachineInstr::mmo_iterator MMOBegin,
+ MachineInstr::mmo_iterator MMOEnd,
+ SmallVectorImpl<MachineInstr*> &NewMIs) const;
+
+ void loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned DestReg, int FrameIndex,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const override;
+
+ void loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
+ SmallVectorImpl<MachineOperand> &Addr,
+ const TargetRegisterClass *RC,
+ MachineInstr::mmo_iterator MMOBegin,
+ MachineInstr::mmo_iterator MMOEnd,
+ SmallVectorImpl<MachineInstr*> &NewMIs) const;
+
+ bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override;
+
+ /// foldMemoryOperand - If this target supports it, fold a load or store of
+ /// the specified stack slot into the specified machine instruction for the
+ /// specified operand(s). If this is possible, the target should perform the
+ /// folding and return true, otherwise it should return false. If it folds
+ /// the instruction, it is likely that the MachineInstr the iterator
+ /// references has been changed.
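+ /// As an illustrative sketch (exact operands elided), folding a reload of
+ /// stack slot <FI> into an add turns
+ ///   %r = MOV32rm <FI> ; %d = ADD32rr %d, %r
+ /// into the single memory-operand form
+ ///   %d = ADD32rm %d, <FI>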
+ MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
+ ArrayRef<unsigned> Ops,
+ MachineBasicBlock::iterator InsertPt,
+ int FrameIndex) const override;
+
+ /// foldMemoryOperand - Same as the previous version except it allows folding
+ /// of any load and store from / to any address, not just from a specific
+ /// stack slot.
+ MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
+ ArrayRef<unsigned> Ops,
+ MachineBasicBlock::iterator InsertPt,
+ MachineInstr *LoadMI) const override;
+
+ /// unfoldMemoryOperand - Separate a single instruction which folded a load or
+ /// a store or a load and a store into two or more instructions. If this is
+ /// possible, returns true as well as the new instructions by reference.
+ bool unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
+ unsigned Reg, bool UnfoldLoad, bool UnfoldStore,
+ SmallVectorImpl<MachineInstr*> &NewMIs) const override;
+
+ bool unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
+ SmallVectorImpl<SDNode*> &NewNodes) const override;
+
+ /// getOpcodeAfterMemoryUnfold - Returns the opcode of the would-be new
+ /// instruction after load / store are unfolded from an instruction of the
+ /// specified opcode. It returns zero if the specified unfolding is not
+ /// possible. If LoadRegIndex is non-null, it is filled in with the operand
+ /// index of the operand which will hold the register holding the loaded
+ /// value.
+ unsigned getOpcodeAfterMemoryUnfold(unsigned Opc,
+ bool UnfoldLoad, bool UnfoldStore,
+ unsigned *LoadRegIndex = nullptr) const override;
+
+ /// areLoadsFromSameBasePtr - This is used by the pre-regalloc scheduler
+ /// to determine if two loads are loading from the same base address. It
+ /// should only return true if the base pointers are the same and the
+ /// only difference between the two addresses is the offset. It also returns
+ /// the offsets by reference.
+ bool areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, int64_t &Offset1,
+ int64_t &Offset2) const override;
+
+ /// shouldScheduleLoadsNear - This is used by the pre-regalloc scheduler to
+ /// determine (in conjunction with areLoadsFromSameBasePtr) if two loads should
+ /// be scheduled together. On some targets if two loads are loading from
+ /// addresses in the same cache line, it's better if they are scheduled
+ /// together. This function takes two integers that represent the load offsets
+ /// from the common base address. It returns true if it decides it's desirable
+ /// to schedule the two loads together. "NumLoads" is the number of loads that
+ /// have already been scheduled after Load1.
+ bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
+ int64_t Offset1, int64_t Offset2,
+ unsigned NumLoads) const override;
+
+ bool shouldScheduleAdjacent(MachineInstr *First,
+ MachineInstr *Second) const override;
+
+ void getNoopForMachoTarget(MCInst &NopInst) const override;
+
+ bool
+ ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
+
+ /// isSafeToMoveRegClassDefs - Return true if it's safe to move a machine
+ /// instruction that defines the specified register class.
+ bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const override;
+
+ /// isSafeToClobberEFLAGS - Return true if it's safe to insert an instruction
+ /// that would clobber the EFLAGS condition register. Note the result may be
+ /// conservative. If it cannot definitely determine the safety after visiting
+ /// a few instructions in each direction it assumes it's not safe.
+ bool isSafeToClobberEFLAGS(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const;
+
+ /// True if MI has a condition code def, e.g. EFLAGS, that is
+ /// not marked dead.
+ bool hasLiveCondCodeDef(MachineInstr *MI) const;
+
+ /// getGlobalBaseReg - Return a virtual register initialized with the
+ /// global base register value. Output instructions required to
+ /// initialize the register in the function entry block, if necessary.
+ ///
+ unsigned getGlobalBaseReg(MachineFunction *MF) const;
+
+ std::pair<uint16_t, uint16_t>
+ getExecutionDomain(const MachineInstr *MI) const override;
+
+ void setExecutionDomain(MachineInstr *MI, unsigned Domain) const override;
+
+ unsigned
+ getPartialRegUpdateClearance(const MachineInstr *MI, unsigned OpNum,
+ const TargetRegisterInfo *TRI) const override;
+ unsigned getUndefRegClearance(const MachineInstr *MI, unsigned &OpNum,
+ const TargetRegisterInfo *TRI) const override;
+ void breakPartialRegDependency(MachineBasicBlock::iterator MI, unsigned OpNum,
+ const TargetRegisterInfo *TRI) const override;
+
+ MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
+ unsigned OpNum,
+ ArrayRef<MachineOperand> MOs,
+ MachineBasicBlock::iterator InsertPt,
+ unsigned Size, unsigned Alignment,
+ bool AllowCommute) const;
+
+ void
+ getUnconditionalBranch(MCInst &Branch,
+ const MCSymbolRefExpr *BranchTarget) const override;
+
+ void getTrap(MCInst &MI) const override;
+
+ unsigned getJumpInstrTableEntryBound() const override;
+
+ bool isHighLatencyDef(int opc) const override;
+
+ bool hasHighOperandLatency(const TargetSchedModel &SchedModel,
+ const MachineRegisterInfo *MRI,
+ const MachineInstr *DefMI, unsigned DefIdx,
+ const MachineInstr *UseMI,
+ unsigned UseIdx) const override;
+
+ bool useMachineCombiner() const override {
+ return true;
+ }
+
+ bool isAssociativeAndCommutative(const MachineInstr &Inst) const override;
+
+ bool hasReassociableOperands(const MachineInstr &Inst,
+ const MachineBasicBlock *MBB) const override;
+
+ void setSpecialOperandAttr(MachineInstr &OldMI1, MachineInstr &OldMI2,
+ MachineInstr &NewMI1,
+ MachineInstr &NewMI2) const override;
+
+ /// analyzeCompare - For a comparison instruction, return the source
+ /// registers in SrcReg and SrcReg2 if it has two register operands, and the
+ /// value it compares against in CmpValue. Return true if the comparison
+ /// instruction can be analyzed.
+ bool analyzeCompare(const MachineInstr *MI, unsigned &SrcReg,
+ unsigned &SrcReg2, int &CmpMask,
+ int &CmpValue) const override;
+
+ /// optimizeCompareInstr - Check if there exists an earlier instruction that
+ /// operates on the same source operands and sets flags in the same way as
+ /// Compare; remove Compare if possible.
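+ /// For example (a sketch), given
+ ///   subl %esi, %edi
+ ///   cmpl $0, %edi
+ ///   je .LBB0_1
+ /// the SUB already sets the ZF bit that the JE consumes, so the CMP against
+ /// zero is redundant and can be removed.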
+ bool optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg,
+ unsigned SrcReg2, int CmpMask, int CmpValue,
+ const MachineRegisterInfo *MRI) const override;
+
+ /// optimizeLoadInstr - Try to remove the load by folding it into a register
+ /// operand at the use. We fold the load instructions if and only if the
+ /// def and use are in the same BB. We only look at one load and see
+ /// whether it can be folded into MI. FoldAsLoadDefReg is the virtual register
+ /// defined by the load we are trying to fold. DefMI is set to the machine
+ /// instruction that defines FoldAsLoadDefReg, and the function returns
+ /// the machine instruction generated by the folding.
+ MachineInstr *optimizeLoadInstr(MachineInstr *MI,
+ const MachineRegisterInfo *MRI,
+ unsigned &FoldAsLoadDefReg,
+ MachineInstr *&DefMI) const override;
+
+ std::pair<unsigned, unsigned>
+ decomposeMachineOperandsTargetFlags(unsigned TF) const override;
+
+ ArrayRef<std::pair<unsigned, const char *>>
+ getSerializableDirectMachineOperandTargetFlags() const override;
+
+protected:
+ /// Commutes the operands in the given instruction by changing the operands
+ /// order and/or changing the instruction's opcode and/or the immediate value
+ /// operand.
+ ///
+ /// The arguments 'CommuteOpIdx1' and 'CommuteOpIdx2' specify the operands
+ /// to be commuted.
+ ///
+ /// Do not call this method for a non-commutable instruction or for
+ /// non-commutable operands.
+ /// Even when the instruction is commutable, the method may still fail to
+ /// commute the operands; a null pointer is returned in such cases.
+ MachineInstr *commuteInstructionImpl(MachineInstr *MI, bool NewMI,
+ unsigned CommuteOpIdx1,
+ unsigned CommuteOpIdx2) const override;
+
+private:
+ MachineInstr *convertToThreeAddressWithLEA(unsigned MIOpc,
+ MachineFunction::iterator &MFI,
+ MachineBasicBlock::iterator &MBBI,
+ LiveVariables *LV) const;
+
+ /// Handles memory folding for special case instructions, for instance those
+ /// requiring custom manipulation of the address.
+ MachineInstr *foldMemoryOperandCustom(MachineFunction &MF, MachineInstr *MI,
+ unsigned OpNum,
+ ArrayRef<MachineOperand> MOs,
+ MachineBasicBlock::iterator InsertPt,
+ unsigned Size, unsigned Align) const;
+
+ /// isFrameOperand - Return true and the FrameIndex if the specified
+ /// operand and following operands form a reference to the stack frame.
+ bool isFrameOperand(const MachineInstr *MI, unsigned int Op,
+ int &FrameIndex) const;
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.td b/contrib/llvm/lib/Target/X86/X86InstrInfo.td
new file mode 100644
index 0000000..ea8e562
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.td
@@ -0,0 +1,3085 @@
+//===-- X86InstrInfo.td - Main X86 Instruction Definition --*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 instruction set, defining the instructions, and
+// properties of the instructions which are needed for code generation, machine
+// code emission, and analysis.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// X86 specific DAG Nodes.
+//
+
+def SDTIntShiftDOp: SDTypeProfile<1, 3,
+ [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
+ SDTCisInt<0>, SDTCisInt<3>]>;
+
+def SDTX86CmpTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisSameAs<1, 2>]>;
+
+def SDTX86Cmps : SDTypeProfile<1, 3, [SDTCisFP<0>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>;
+//def SDTX86Cmpss : SDTypeProfile<1, 3, [SDTCisVT<0, f32>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>;
+
+def SDTX86Cmov : SDTypeProfile<1, 4,
+ [SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>,
+ SDTCisVT<3, i8>, SDTCisVT<4, i32>]>;
+
+// Unary and binary operator instructions that set EFLAGS as a side-effect.
+def SDTUnaryArithWithFlags : SDTypeProfile<2, 1,
+ [SDTCisSameAs<0, 2>,
+ SDTCisInt<0>, SDTCisVT<1, i32>]>;
+
+def SDTBinaryArithWithFlags : SDTypeProfile<2, 2,
+ [SDTCisSameAs<0, 2>,
+ SDTCisSameAs<0, 3>,
+ SDTCisInt<0>, SDTCisVT<1, i32>]>;
+
+// SDTBinaryArithWithFlagsInOut - RES1, EFLAGS = op LHS, RHS, EFLAGS
+def SDTBinaryArithWithFlagsInOut : SDTypeProfile<2, 3,
+ [SDTCisSameAs<0, 2>,
+ SDTCisSameAs<0, 3>,
+ SDTCisInt<0>,
+ SDTCisVT<1, i32>,
+ SDTCisVT<4, i32>]>;
+// RES1, RES2, FLAGS = op LHS, RHS
+def SDT2ResultBinaryArithWithFlags : SDTypeProfile<3, 2,
+ [SDTCisSameAs<0, 1>,
+ SDTCisSameAs<0, 2>,
+ SDTCisSameAs<0, 3>,
+ SDTCisInt<0>, SDTCisVT<1, i32>]>;
+def SDTX86BrCond : SDTypeProfile<0, 3,
+ [SDTCisVT<0, OtherVT>,
+ SDTCisVT<1, i8>, SDTCisVT<2, i32>]>;
+
+def SDTX86SetCC : SDTypeProfile<1, 2,
+ [SDTCisVT<0, i8>,
+ SDTCisVT<1, i8>, SDTCisVT<2, i32>]>;
+def SDTX86SetCC_C : SDTypeProfile<1, 2,
+ [SDTCisInt<0>,
+ SDTCisVT<1, i8>, SDTCisVT<2, i32>]>;
+
+def SDTX86sahf : SDTypeProfile<1, 1, [SDTCisVT<0, i32>, SDTCisVT<1, i8>]>;
+
+def SDTX86rdrand : SDTypeProfile<2, 0, [SDTCisInt<0>, SDTCisVT<1, i32>]>;
+
+def SDTX86cas : SDTypeProfile<0, 3, [SDTCisPtrTy<0>, SDTCisInt<1>,
+ SDTCisVT<2, i8>]>;
+def SDTX86caspair : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
+
+def SDTX86atomicBinary : SDTypeProfile<2, 3, [SDTCisInt<0>, SDTCisInt<1>,
+ SDTCisPtrTy<2>, SDTCisInt<3>,SDTCisInt<4>]>;
+def SDTX86Ret : SDTypeProfile<0, -1, [SDTCisVT<0, i16>]>;
+
+def SDT_X86CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>]>;
+def SDT_X86CallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>,
+ SDTCisVT<1, i32>]>;
+
+def SDT_X86Call : SDTypeProfile<0, -1, [SDTCisVT<0, iPTR>]>;
+
+def SDT_X86VASTART_SAVE_XMM_REGS : SDTypeProfile<0, -1, [SDTCisVT<0, i8>,
+ SDTCisVT<1, iPTR>,
+ SDTCisVT<2, iPTR>]>;
+
+def SDT_X86VAARG_64 : SDTypeProfile<1, -1, [SDTCisPtrTy<0>,
+ SDTCisPtrTy<1>,
+ SDTCisVT<2, i32>,
+ SDTCisVT<3, i8>,
+ SDTCisVT<4, i32>]>;
+
+def SDTX86RepStr : SDTypeProfile<0, 1, [SDTCisVT<0, OtherVT>]>;
+
+def SDTX86Void : SDTypeProfile<0, 0, []>;
+
+def SDTX86Wrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>;
+
+def SDT_X86TLSADDR : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+
+def SDT_X86TLSBASEADDR : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+
+def SDT_X86TLSCALL : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+
+def SDT_X86SEG_ALLOCA : SDTypeProfile<1, 1, [SDTCisVT<0, iPTR>, SDTCisVT<1, iPTR>]>;
+
+def SDT_X86EHRET : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+
+def SDT_X86TCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisVT<1, i32>]>;
+
+def SDT_X86MEMBARRIER : SDTypeProfile<0, 0, []>;
+
+def X86MemBarrier : SDNode<"X86ISD::MEMBARRIER", SDT_X86MEMBARRIER,
+ [SDNPHasChain,SDNPSideEffect]>;
+def X86MFence : SDNode<"X86ISD::MFENCE", SDT_X86MEMBARRIER,
+ [SDNPHasChain]>;
+def X86SFence : SDNode<"X86ISD::SFENCE", SDT_X86MEMBARRIER,
+ [SDNPHasChain]>;
+def X86LFence : SDNode<"X86ISD::LFENCE", SDT_X86MEMBARRIER,
+ [SDNPHasChain]>;
+
+
+def X86bsf : SDNode<"X86ISD::BSF", SDTUnaryArithWithFlags>;
+def X86bsr : SDNode<"X86ISD::BSR", SDTUnaryArithWithFlags>;
+def X86shld : SDNode<"X86ISD::SHLD", SDTIntShiftDOp>;
+def X86shrd : SDNode<"X86ISD::SHRD", SDTIntShiftDOp>;
+
+def X86cmp : SDNode<"X86ISD::CMP" , SDTX86CmpTest>;
+def X86bt : SDNode<"X86ISD::BT", SDTX86CmpTest>;
+
+def X86cmov : SDNode<"X86ISD::CMOV", SDTX86Cmov>;
+def X86brcond : SDNode<"X86ISD::BRCOND", SDTX86BrCond,
+ [SDNPHasChain]>;
+def X86setcc : SDNode<"X86ISD::SETCC", SDTX86SetCC>;
+def X86setcc_c : SDNode<"X86ISD::SETCC_CARRY", SDTX86SetCC_C>;
+
+def X86sahf : SDNode<"X86ISD::SAHF", SDTX86sahf>;
+
+def X86rdrand : SDNode<"X86ISD::RDRAND", SDTX86rdrand,
+ [SDNPHasChain, SDNPSideEffect]>;
+
+def X86rdseed : SDNode<"X86ISD::RDSEED", SDTX86rdrand,
+ [SDNPHasChain, SDNPSideEffect]>;
+
+def X86cas : SDNode<"X86ISD::LCMPXCHG_DAG", SDTX86cas,
+ [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore,
+ SDNPMayLoad, SDNPMemOperand]>;
+def X86cas8 : SDNode<"X86ISD::LCMPXCHG8_DAG", SDTX86caspair,
+ [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore,
+ SDNPMayLoad, SDNPMemOperand]>;
+def X86cas16 : SDNode<"X86ISD::LCMPXCHG16_DAG", SDTX86caspair,
+ [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore,
+ SDNPMayLoad, SDNPMemOperand]>;
+
+def X86retflag : SDNode<"X86ISD::RET_FLAG", SDTX86Ret,
+ [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+def X86iret : SDNode<"X86ISD::IRET", SDTX86Ret,
+ [SDNPHasChain, SDNPOptInGlue]>;
+
+def X86vastart_save_xmm_regs :
+ SDNode<"X86ISD::VASTART_SAVE_XMM_REGS",
+ SDT_X86VASTART_SAVE_XMM_REGS,
+ [SDNPHasChain, SDNPVariadic]>;
+def X86vaarg64 :
+ SDNode<"X86ISD::VAARG_64", SDT_X86VAARG_64,
+ [SDNPHasChain, SDNPMayLoad, SDNPMayStore,
+ SDNPMemOperand]>;
+def X86callseq_start :
+ SDNode<"ISD::CALLSEQ_START", SDT_X86CallSeqStart,
+ [SDNPHasChain, SDNPOutGlue]>;
+def X86callseq_end :
+ SDNode<"ISD::CALLSEQ_END", SDT_X86CallSeqEnd,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+def X86call : SDNode<"X86ISD::CALL", SDT_X86Call,
+ [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue,
+ SDNPVariadic]>;
+
+def X86rep_stos: SDNode<"X86ISD::REP_STOS", SDTX86RepStr,
+ [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore]>;
+def X86rep_movs: SDNode<"X86ISD::REP_MOVS", SDTX86RepStr,
+ [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore,
+ SDNPMayLoad]>;
+
+def X86rdtsc : SDNode<"X86ISD::RDTSC_DAG", SDTX86Void,
+ [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>;
+def X86rdtscp : SDNode<"X86ISD::RDTSCP_DAG", SDTX86Void,
+ [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>;
+def X86rdpmc : SDNode<"X86ISD::RDPMC_DAG", SDTX86Void,
+ [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>;
+
+def X86Wrapper : SDNode<"X86ISD::Wrapper", SDTX86Wrapper>;
+def X86WrapperRIP : SDNode<"X86ISD::WrapperRIP", SDTX86Wrapper>;
+
+def X86RecoverFrameAlloc : SDNode<"ISD::LOCAL_RECOVER",
+ SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>,
+ SDTCisInt<1>]>>;
+
+def X86tlsaddr : SDNode<"X86ISD::TLSADDR", SDT_X86TLSADDR,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+def X86tlsbaseaddr : SDNode<"X86ISD::TLSBASEADDR", SDT_X86TLSBASEADDR,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+def X86ehret : SDNode<"X86ISD::EH_RETURN", SDT_X86EHRET,
+ [SDNPHasChain]>;
+
+def X86eh_sjlj_setjmp : SDNode<"X86ISD::EH_SJLJ_SETJMP",
+ SDTypeProfile<1, 1, [SDTCisInt<0>,
+ SDTCisPtrTy<1>]>,
+ [SDNPHasChain, SDNPSideEffect]>;
+def X86eh_sjlj_longjmp : SDNode<"X86ISD::EH_SJLJ_LONGJMP",
+ SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>,
+ [SDNPHasChain, SDNPSideEffect]>;
+
+def X86tcret : SDNode<"X86ISD::TC_RETURN", SDT_X86TCRET,
+ [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+
+def X86add_flag : SDNode<"X86ISD::ADD", SDTBinaryArithWithFlags,
+ [SDNPCommutative]>;
+def X86sub_flag : SDNode<"X86ISD::SUB", SDTBinaryArithWithFlags>;
+def X86smul_flag : SDNode<"X86ISD::SMUL", SDTBinaryArithWithFlags,
+ [SDNPCommutative]>;
+def X86umul_flag : SDNode<"X86ISD::UMUL", SDT2ResultBinaryArithWithFlags,
+ [SDNPCommutative]>;
+def X86adc_flag : SDNode<"X86ISD::ADC", SDTBinaryArithWithFlagsInOut>;
+def X86sbb_flag : SDNode<"X86ISD::SBB", SDTBinaryArithWithFlagsInOut>;
+
+def X86inc_flag : SDNode<"X86ISD::INC", SDTUnaryArithWithFlags>;
+def X86dec_flag : SDNode<"X86ISD::DEC", SDTUnaryArithWithFlags>;
+def X86or_flag : SDNode<"X86ISD::OR", SDTBinaryArithWithFlags,
+ [SDNPCommutative]>;
+def X86xor_flag : SDNode<"X86ISD::XOR", SDTBinaryArithWithFlags,
+ [SDNPCommutative]>;
+def X86and_flag : SDNode<"X86ISD::AND", SDTBinaryArithWithFlags,
+ [SDNPCommutative]>;
+
+def X86bextr : SDNode<"X86ISD::BEXTR", SDTIntBinOp>;
+
+def X86mul_imm : SDNode<"X86ISD::MUL_IMM", SDTIntBinOp>;
+
+def X86WinAlloca : SDNode<"X86ISD::WIN_ALLOCA", SDTX86Void,
+ [SDNPHasChain, SDNPInGlue, SDNPOutGlue]>;
+
+def X86SegAlloca : SDNode<"X86ISD::SEG_ALLOCA", SDT_X86SEG_ALLOCA,
+ [SDNPHasChain]>;
+
+def X86TLSCall : SDNode<"X86ISD::TLSCALL", SDT_X86TLSCALL,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+//===----------------------------------------------------------------------===//
+// X86 Operand Definitions.
+//
+
+// A version of ptr_rc which excludes SP, ESP, and RSP. This is used for
+// the index operand of an address, to conform to x86 encoding restrictions.
+def ptr_rc_nosp : PointerLikeRegClass<1>;
+
+// *mem - Operand definitions for the funky X86 addressing mode operands.
+//
+def X86MemAsmOperand : AsmOperandClass {
+ let Name = "Mem";
+}
+let RenderMethod = "addMemOperands" in {
+ def X86Mem8AsmOperand : AsmOperandClass { let Name = "Mem8"; }
+ def X86Mem16AsmOperand : AsmOperandClass { let Name = "Mem16"; }
+ def X86Mem32AsmOperand : AsmOperandClass { let Name = "Mem32"; }
+ def X86Mem64AsmOperand : AsmOperandClass { let Name = "Mem64"; }
+ def X86Mem80AsmOperand : AsmOperandClass { let Name = "Mem80"; }
+ def X86Mem128AsmOperand : AsmOperandClass { let Name = "Mem128"; }
+ def X86Mem256AsmOperand : AsmOperandClass { let Name = "Mem256"; }
+ def X86Mem512AsmOperand : AsmOperandClass { let Name = "Mem512"; }
+ // Gather mem operands
+ def X86MemVX32Operand : AsmOperandClass { let Name = "MemVX32"; }
+ def X86MemVY32Operand : AsmOperandClass { let Name = "MemVY32"; }
+ def X86MemVZ32Operand : AsmOperandClass { let Name = "MemVZ32"; }
+ def X86MemVX64Operand : AsmOperandClass { let Name = "MemVX64"; }
+ def X86MemVY64Operand : AsmOperandClass { let Name = "MemVY64"; }
+ def X86MemVZ64Operand : AsmOperandClass { let Name = "MemVZ64"; }
+ def X86MemVX32XOperand : AsmOperandClass { let Name = "MemVX32X"; }
+ def X86MemVY32XOperand : AsmOperandClass { let Name = "MemVY32X"; }
+ def X86MemVX64XOperand : AsmOperandClass { let Name = "MemVX64X"; }
+ def X86MemVY64XOperand : AsmOperandClass { let Name = "MemVY64X"; }
+}
+
+def X86AbsMemAsmOperand : AsmOperandClass {
+ let Name = "AbsMem";
+ let SuperClasses = [X86MemAsmOperand];
+}
+
+class X86MemOperand<string printMethod,
+ AsmOperandClass parserMatchClass = X86MemAsmOperand> : Operand<iPTR> {
+ let PrintMethod = printMethod;
+ let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc_nosp, i32imm, i8imm);
+ let ParserMatchClass = parserMatchClass;
+ let OperandType = "OPERAND_MEMORY";
+}
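+// Informally: the five MIOperandInfo slots above model an x86 address of the
+// form [base + scale*index + disp] plus an optional segment, in the order
+// (base, scale, index, displacement, segment); e.g. [rbx + 4*rcx + 16] maps
+// to (rbx, 4, rcx, 16, no segment).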
+
+// Gather mem operands
+class X86VMemOperand<RegisterClass RC, string printMethod,
+ AsmOperandClass parserMatchClass>
+ : X86MemOperand<printMethod, parserMatchClass> {
+ let MIOperandInfo = (ops ptr_rc, i8imm, RC, i32imm, i8imm);
+}
+
+def anymem : X86MemOperand<"printanymem">;
+
+def opaque32mem : X86MemOperand<"printopaquemem">;
+def opaque48mem : X86MemOperand<"printopaquemem">;
+def opaque80mem : X86MemOperand<"printopaquemem">;
+def opaque512mem : X86MemOperand<"printopaquemem">;
+
+def i8mem : X86MemOperand<"printi8mem", X86Mem8AsmOperand>;
+def i16mem : X86MemOperand<"printi16mem", X86Mem16AsmOperand>;
+def i32mem : X86MemOperand<"printi32mem", X86Mem32AsmOperand>;
+def i64mem : X86MemOperand<"printi64mem", X86Mem64AsmOperand>;
+def i128mem : X86MemOperand<"printi128mem", X86Mem128AsmOperand>;
+def i256mem : X86MemOperand<"printi256mem", X86Mem256AsmOperand>;
+def i512mem : X86MemOperand<"printi512mem", X86Mem512AsmOperand>;
+def f32mem : X86MemOperand<"printf32mem", X86Mem32AsmOperand>;
+def f64mem : X86MemOperand<"printf64mem", X86Mem64AsmOperand>;
+def f80mem : X86MemOperand<"printf80mem", X86Mem80AsmOperand>;
+def f128mem : X86MemOperand<"printf128mem", X86Mem128AsmOperand>;
+def f256mem : X86MemOperand<"printf256mem", X86Mem256AsmOperand>;
+def f512mem : X86MemOperand<"printf512mem", X86Mem512AsmOperand>;
+
+def v512mem : X86VMemOperand<VR512, "printf512mem", X86Mem512AsmOperand>;
+
+// Gather mem operands
+def vx32mem : X86VMemOperand<VR128, "printi32mem", X86MemVX32Operand>;
+def vy32mem : X86VMemOperand<VR256, "printi32mem", X86MemVY32Operand>;
+def vx64mem : X86VMemOperand<VR128, "printi64mem", X86MemVX64Operand>;
+def vy64mem : X86VMemOperand<VR256, "printi64mem", X86MemVY64Operand>;
+
+def vx32xmem : X86VMemOperand<VR128X, "printi32mem", X86MemVX32XOperand>;
+def vx64xmem : X86VMemOperand<VR128X, "printi32mem", X86MemVX64XOperand>;
+def vy32xmem : X86VMemOperand<VR256X, "printi32mem", X86MemVY32XOperand>;
+def vy64xmem : X86VMemOperand<VR256X, "printi64mem", X86MemVY64XOperand>;
+def vz32mem : X86VMemOperand<VR512, "printi32mem", X86MemVZ32Operand>;
+def vz64mem : X86VMemOperand<VR512, "printi64mem", X86MemVZ64Operand>;
+
+// A version of i8mem for use on x86-64 and x32 that uses a NOREX GPR instead
+// of a plain GPR, so that it doesn't potentially require a REX prefix.
+def ptr_rc_norex : PointerLikeRegClass<2>;
+def ptr_rc_norex_nosp : PointerLikeRegClass<3>;
+
+def i8mem_NOREX : Operand<iPTR> {
+ let PrintMethod = "printi8mem";
+ let MIOperandInfo = (ops ptr_rc_norex, i8imm, ptr_rc_norex_nosp, i32imm, i8imm);
+ let ParserMatchClass = X86Mem8AsmOperand;
+ let OperandType = "OPERAND_MEMORY";
+}
+
+// GPRs available for tailcall.
+// It represents GR32_TC, GR64_TC or GR64_TCW64.
+def ptr_rc_tailcall : PointerLikeRegClass<4>;
+
+// Special i32mem for addresses of load folding tail calls. These are not
+// allowed to use callee-saved registers since they must be scheduled
+// after callee-saved registers are popped.
+def i32mem_TC : Operand<i32> {
+ let PrintMethod = "printi32mem";
+ let MIOperandInfo = (ops ptr_rc_tailcall, i8imm, ptr_rc_tailcall,
+ i32imm, i8imm);
+ let ParserMatchClass = X86Mem32AsmOperand;
+ let OperandType = "OPERAND_MEMORY";
+}
+
+// Special i64mem for addresses of load folding tail calls. These are not
+// allowed to use callee-saved registers since they must be scheduled
+// after callee-saved registers are popped.
+def i64mem_TC : Operand<i64> {
+ let PrintMethod = "printi64mem";
+ let MIOperandInfo = (ops ptr_rc_tailcall, i8imm,
+ ptr_rc_tailcall, i32imm, i8imm);
+ let ParserMatchClass = X86Mem64AsmOperand;
+ let OperandType = "OPERAND_MEMORY";
+}
+
+let OperandType = "OPERAND_PCREL",
+ ParserMatchClass = X86AbsMemAsmOperand,
+ PrintMethod = "printPCRelImm" in {
+def i32imm_pcrel : Operand<i32>;
+def i16imm_pcrel : Operand<i16>;
+
+// Branch targets have OtherVT type and print as pc-relative values.
+def brtarget : Operand<OtherVT>;
+def brtarget8 : Operand<OtherVT>;
+
+}
+
+// Special parser to detect 16-bit mode and select a 16-bit displacement.
+def X86AbsMem16AsmOperand : AsmOperandClass {
+ let Name = "AbsMem16";
+ let RenderMethod = "addAbsMemOperands";
+ let SuperClasses = [X86AbsMemAsmOperand];
+}
+
+// Branch targets have OtherVT type and print as pc-relative values.
+let OperandType = "OPERAND_PCREL",
+ PrintMethod = "printPCRelImm" in {
+let ParserMatchClass = X86AbsMem16AsmOperand in
+ def brtarget16 : Operand<OtherVT>;
+let ParserMatchClass = X86AbsMemAsmOperand in
+ def brtarget32 : Operand<OtherVT>;
+}
+
+let RenderMethod = "addSrcIdxOperands" in {
+ def X86SrcIdx8Operand : AsmOperandClass {
+ let Name = "SrcIdx8";
+ let SuperClasses = [X86Mem8AsmOperand];
+ }
+ def X86SrcIdx16Operand : AsmOperandClass {
+ let Name = "SrcIdx16";
+ let SuperClasses = [X86Mem16AsmOperand];
+ }
+ def X86SrcIdx32Operand : AsmOperandClass {
+ let Name = "SrcIdx32";
+ let SuperClasses = [X86Mem32AsmOperand];
+ }
+ def X86SrcIdx64Operand : AsmOperandClass {
+ let Name = "SrcIdx64";
+ let SuperClasses = [X86Mem64AsmOperand];
+ }
+} // RenderMethod = "addSrcIdxOperands"
+
+let RenderMethod = "addDstIdxOperands" in {
+ def X86DstIdx8Operand : AsmOperandClass {
+ let Name = "DstIdx8";
+ let SuperClasses = [X86Mem8AsmOperand];
+ }
+ def X86DstIdx16Operand : AsmOperandClass {
+ let Name = "DstIdx16";
+ let SuperClasses = [X86Mem16AsmOperand];
+ }
+ def X86DstIdx32Operand : AsmOperandClass {
+ let Name = "DstIdx32";
+ let SuperClasses = [X86Mem32AsmOperand];
+ }
+ def X86DstIdx64Operand : AsmOperandClass {
+ let Name = "DstIdx64";
+ let SuperClasses = [X86Mem64AsmOperand];
+ }
+} // RenderMethod = "addDstIdxOperands"
+
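+// Informally, MemOffs<A>_<B> below names a direct memory offset with an A-bit
+// address and a B-bit access (as used by, e.g., the moffs forms of MOV); so
+// MemOffs16_8 is an 8-bit access through a 16-bit absolute offset.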
+let RenderMethod = "addMemOffsOperands" in {
+ def X86MemOffs16_8AsmOperand : AsmOperandClass {
+ let Name = "MemOffs16_8";
+ let SuperClasses = [X86Mem8AsmOperand];
+ }
+ def X86MemOffs16_16AsmOperand : AsmOperandClass {
+ let Name = "MemOffs16_16";
+ let SuperClasses = [X86Mem16AsmOperand];
+ }
+ def X86MemOffs16_32AsmOperand : AsmOperandClass {
+ let Name = "MemOffs16_32";
+ let SuperClasses = [X86Mem32AsmOperand];
+ }
+ def X86MemOffs32_8AsmOperand : AsmOperandClass {
+ let Name = "MemOffs32_8";
+ let SuperClasses = [X86Mem8AsmOperand];
+ }
+ def X86MemOffs32_16AsmOperand : AsmOperandClass {
+ let Name = "MemOffs32_16";
+ let SuperClasses = [X86Mem16AsmOperand];
+ }
+ def X86MemOffs32_32AsmOperand : AsmOperandClass {
+ let Name = "MemOffs32_32";
+ let SuperClasses = [X86Mem32AsmOperand];
+ }
+ def X86MemOffs32_64AsmOperand : AsmOperandClass {
+ let Name = "MemOffs32_64";
+ let SuperClasses = [X86Mem64AsmOperand];
+ }
+ def X86MemOffs64_8AsmOperand : AsmOperandClass {
+ let Name = "MemOffs64_8";
+ let SuperClasses = [X86Mem8AsmOperand];
+ }
+ def X86MemOffs64_16AsmOperand : AsmOperandClass {
+ let Name = "MemOffs64_16";
+ let SuperClasses = [X86Mem16AsmOperand];
+ }
+ def X86MemOffs64_32AsmOperand : AsmOperandClass {
+ let Name = "MemOffs64_32";
+ let SuperClasses = [X86Mem32AsmOperand];
+ }
+ def X86MemOffs64_64AsmOperand : AsmOperandClass {
+ let Name = "MemOffs64_64";
+ let SuperClasses = [X86Mem64AsmOperand];
+ }
+} // RenderMethod = "addMemOffsOperands"
+
+class X86SrcIdxOperand<string printMethod, AsmOperandClass parserMatchClass>
+ : X86MemOperand<printMethod, parserMatchClass> {
+ let MIOperandInfo = (ops ptr_rc, i8imm);
+}
+
+class X86DstIdxOperand<string printMethod, AsmOperandClass parserMatchClass>
+ : X86MemOperand<printMethod, parserMatchClass> {
+ let MIOperandInfo = (ops ptr_rc);
+}
+
+def srcidx8 : X86SrcIdxOperand<"printSrcIdx8", X86SrcIdx8Operand>;
+def srcidx16 : X86SrcIdxOperand<"printSrcIdx16", X86SrcIdx16Operand>;
+def srcidx32 : X86SrcIdxOperand<"printSrcIdx32", X86SrcIdx32Operand>;
+def srcidx64 : X86SrcIdxOperand<"printSrcIdx64", X86SrcIdx64Operand>;
+def dstidx8 : X86DstIdxOperand<"printDstIdx8", X86DstIdx8Operand>;
+def dstidx16 : X86DstIdxOperand<"printDstIdx16", X86DstIdx16Operand>;
+def dstidx32 : X86DstIdxOperand<"printDstIdx32", X86DstIdx32Operand>;
+def dstidx64 : X86DstIdxOperand<"printDstIdx64", X86DstIdx64Operand>;
+
+class X86MemOffsOperand<Operand immOperand, string printMethod,
+ AsmOperandClass parserMatchClass>
+ : X86MemOperand<printMethod, parserMatchClass> {
+ let MIOperandInfo = (ops immOperand, i8imm);
+}
+
+def offset16_8 : X86MemOffsOperand<i16imm, "printMemOffs8",
+ X86MemOffs16_8AsmOperand>;
+def offset16_16 : X86MemOffsOperand<i16imm, "printMemOffs16",
+ X86MemOffs16_16AsmOperand>;
+def offset16_32 : X86MemOffsOperand<i16imm, "printMemOffs32",
+ X86MemOffs16_32AsmOperand>;
+def offset32_8 : X86MemOffsOperand<i32imm, "printMemOffs8",
+ X86MemOffs32_8AsmOperand>;
+def offset32_16 : X86MemOffsOperand<i32imm, "printMemOffs16",
+ X86MemOffs32_16AsmOperand>;
+def offset32_32 : X86MemOffsOperand<i32imm, "printMemOffs32",
+ X86MemOffs32_32AsmOperand>;
+def offset32_64 : X86MemOffsOperand<i32imm, "printMemOffs64",
+ X86MemOffs32_64AsmOperand>;
+def offset64_8 : X86MemOffsOperand<i64imm, "printMemOffs8",
+ X86MemOffs64_8AsmOperand>;
+def offset64_16 : X86MemOffsOperand<i64imm, "printMemOffs16",
+ X86MemOffs64_16AsmOperand>;
+def offset64_32 : X86MemOffsOperand<i64imm, "printMemOffs32",
+ X86MemOffs64_32AsmOperand>;
+def offset64_64 : X86MemOffsOperand<i64imm, "printMemOffs64",
+ X86MemOffs64_64AsmOperand>;
+
+def SSECC : Operand<i8> {
+ let PrintMethod = "printSSEAVXCC";
+ let OperandType = "OPERAND_IMMEDIATE";
+}
+
+def i8immZExt3 : ImmLeaf<i8, [{
+ return Imm >= 0 && Imm < 8;
+}]>;
+
+def AVXCC : Operand<i8> {
+ let PrintMethod = "printSSEAVXCC";
+ let OperandType = "OPERAND_IMMEDIATE";
+}
+
+def i8immZExt5 : ImmLeaf<i8, [{
+ return Imm >= 0 && Imm < 32;
+}]>;
+
+def AVX512ICC : Operand<i8> {
+ let PrintMethod = "printSSEAVXCC";
+ let OperandType = "OPERAND_IMMEDIATE";
+}
+
+def XOPCC : Operand<i8> {
+ let PrintMethod = "printXOPCC";
+ let OperandType = "OPERAND_IMMEDIATE";
+}
+
+class ImmSExtAsmOperandClass : AsmOperandClass {
+ let SuperClasses = [ImmAsmOperand];
+ let RenderMethod = "addImmOperands";
+}
+
+def X86GR32orGR64AsmOperand : AsmOperandClass {
+ let Name = "GR32orGR64";
+}
+
+def GR32orGR64 : RegisterOperand<GR32> {
+ let ParserMatchClass = X86GR32orGR64AsmOperand;
+}
+def AVX512RCOperand : AsmOperandClass {
+ let Name = "AVX512RC";
+}
+def AVX512RC : Operand<i32> {
+ let PrintMethod = "printRoundingControl";
+ let OperandType = "OPERAND_IMMEDIATE";
+ let ParserMatchClass = AVX512RCOperand;
+}
+
+// Sign-extended immediate classes. We don't need to define the full lattice
+// here because there is no instruction with an ambiguity between ImmSExti64i32
+// and ImmSExti32i8.
+//
+// The strange ranges come from the fact that the assembler always works with
+// 64-bit immediates, but for a 16-bit target value we want to accept both "-1"
+// (which will be a -1ULL) and "0xFFFF" (-1 in 16 bits).
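+// As a worked example (informal): for a 16-bit "add $0xFF80, %ax" the
+// assembler sees the 64-bit immediate 0x000000000000FF80; as a 16-bit value
+// that is -128, which fits in a sign-extended 8-bit field, so the
+// ImmSExti16i8 class below accepts it via its middle range.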
+
+// [0, 0x7FFFFFFF] |
+// [0xFFFFFFFF80000000, 0xFFFFFFFFFFFFFFFF]
+def ImmSExti64i32AsmOperand : ImmSExtAsmOperandClass {
+ let Name = "ImmSExti64i32";
+}
+
+// [0, 0x0000007F] | [0x000000000000FF80, 0x000000000000FFFF] |
+// [0xFFFFFFFFFFFFFF80, 0xFFFFFFFFFFFFFFFF]
+def ImmSExti16i8AsmOperand : ImmSExtAsmOperandClass {
+ let Name = "ImmSExti16i8";
+ let SuperClasses = [ImmSExti64i32AsmOperand];
+}
+
+// [0, 0x0000007F] | [0x00000000FFFFFF80, 0x00000000FFFFFFFF] |
+// [0xFFFFFFFFFFFFFF80, 0xFFFFFFFFFFFFFFFF]
+def ImmSExti32i8AsmOperand : ImmSExtAsmOperandClass {
+ let Name = "ImmSExti32i8";
+}
+
+// [0, 0x0000007F] |
+// [0xFFFFFFFFFFFFFF80, 0xFFFFFFFFFFFFFFFF]
+def ImmSExti64i8AsmOperand : ImmSExtAsmOperandClass {
+ let Name = "ImmSExti64i8";
+ let SuperClasses = [ImmSExti16i8AsmOperand, ImmSExti32i8AsmOperand,
+ ImmSExti64i32AsmOperand];
+}
+
+// Unsigned immediate used by SSE/AVX instructions
+// [0, 0xFF]
+// [0xFFFFFFFFFFFFFF80, 0xFFFFFFFFFFFFFFFF]
+def ImmUnsignedi8AsmOperand : AsmOperandClass {
+ let Name = "ImmUnsignedi8";
+ let RenderMethod = "addImmOperands";
+}
+
+// Some more descriptive operand definitions.
+// 16-bits but only 8 bits are significant.
+def i16i8imm : Operand<i16> {
+ let ParserMatchClass = ImmSExti16i8AsmOperand;
+ let OperandType = "OPERAND_IMMEDIATE";
+}
+// 32-bits but only 8 bits are significant.
+def i32i8imm : Operand<i32> {
+ let ParserMatchClass = ImmSExti32i8AsmOperand;
+ let OperandType = "OPERAND_IMMEDIATE";
+}
+
+// 64-bits but only 32 bits are significant.
+def i64i32imm : Operand<i64> {
+ let ParserMatchClass = ImmSExti64i32AsmOperand;
+ let OperandType = "OPERAND_IMMEDIATE";
+}
+
+// 64-bits but only 8 bits are significant.
+def i64i8imm : Operand<i64> {
+ let ParserMatchClass = ImmSExti64i8AsmOperand;
+ let OperandType = "OPERAND_IMMEDIATE";
+}
+
+// Unsigned 8-bit immediate used by SSE/AVX instructions.
+def u8imm : Operand<i8> {
+ let PrintMethod = "printU8Imm";
+ let ParserMatchClass = ImmUnsignedi8AsmOperand;
+ let OperandType = "OPERAND_IMMEDIATE";
+}
+
+// 32-bit immediate but only 8-bits are significant and they are unsigned.
+// Used by some SSE/AVX instructions that use intrinsics.
+def i32u8imm : Operand<i32> {
+ let PrintMethod = "printU8Imm";
+ let ParserMatchClass = ImmUnsignedi8AsmOperand;
+ let OperandType = "OPERAND_IMMEDIATE";
+}
+
+// 64-bits but only 32 bits are significant, and those bits are treated as
+// being pc-relative.
+def i64i32imm_pcrel : Operand<i64> {
+ let PrintMethod = "printPCRelImm";
+ let ParserMatchClass = X86AbsMemAsmOperand;
+ let OperandType = "OPERAND_PCREL";
+}
+
+def lea64_32mem : Operand<i32> {
+ let PrintMethod = "printanymem";
+ let MIOperandInfo = (ops GR64, i8imm, GR64_NOSP, i32imm, i8imm);
+ let ParserMatchClass = X86MemAsmOperand;
+}
+
+// Memory operands that use 64-bit pointers in both ILP32 and LP64.
+def lea64mem : Operand<i64> {
+ let PrintMethod = "printanymem";
+ let MIOperandInfo = (ops GR64, i8imm, GR64_NOSP, i32imm, i8imm);
+ let ParserMatchClass = X86MemAsmOperand;
+}
+
+
+//===----------------------------------------------------------------------===//
+// X86 Complex Pattern Definitions.
+//
+
+// Define X86-specific addressing mode.
+def addr : ComplexPattern<iPTR, 5, "selectAddr", [], [SDNPWantParent]>;
+def lea32addr : ComplexPattern<i32, 5, "selectLEAAddr",
+ [add, sub, mul, X86mul_imm, shl, or, frameindex],
+ []>;
+// In 64-bit mode 32-bit LEAs can use RIP-relative addressing.
+def lea64_32addr : ComplexPattern<i32, 5, "selectLEA64_32Addr",
+ [add, sub, mul, X86mul_imm, shl, or,
+ frameindex, X86WrapperRIP],
+ []>;
+
+def tls32addr : ComplexPattern<i32, 5, "selectTLSADDRAddr",
+ [tglobaltlsaddr], []>;
+
+def tls32baseaddr : ComplexPattern<i32, 5, "selectTLSADDRAddr",
+ [tglobaltlsaddr], []>;
+
+def lea64addr : ComplexPattern<i64, 5, "selectLEAAddr",
+ [add, sub, mul, X86mul_imm, shl, or, frameindex,
+ X86WrapperRIP], []>;
+
+def tls64addr : ComplexPattern<i64, 5, "selectTLSADDRAddr",
+ [tglobaltlsaddr], []>;
+
+def tls64baseaddr : ComplexPattern<i64, 5, "selectTLSADDRAddr",
+ [tglobaltlsaddr], []>;
+
+def vectoraddr : ComplexPattern<iPTR, 5, "selectVectorAddr", [],[SDNPWantParent]>;
+
+//===----------------------------------------------------------------------===//
+// X86 Instruction Predicate Definitions.
+def HasCMov : Predicate<"Subtarget->hasCMov()">;
+def NoCMov : Predicate<"!Subtarget->hasCMov()">;
+
+def HasMMX : Predicate<"Subtarget->hasMMX()">;
+def Has3DNow : Predicate<"Subtarget->has3DNow()">;
+def Has3DNowA : Predicate<"Subtarget->has3DNowA()">;
+def HasSSE1 : Predicate<"Subtarget->hasSSE1()">;
+def UseSSE1 : Predicate<"Subtarget->hasSSE1() && !Subtarget->hasAVX()">;
+def HasSSE2 : Predicate<"Subtarget->hasSSE2()">;
+def UseSSE2 : Predicate<"Subtarget->hasSSE2() && !Subtarget->hasAVX()">;
+def HasSSE3 : Predicate<"Subtarget->hasSSE3()">;
+def UseSSE3 : Predicate<"Subtarget->hasSSE3() && !Subtarget->hasAVX()">;
+def HasSSSE3 : Predicate<"Subtarget->hasSSSE3()">;
+def UseSSSE3 : Predicate<"Subtarget->hasSSSE3() && !Subtarget->hasAVX()">;
+def HasSSE41 : Predicate<"Subtarget->hasSSE41()">;
+def NoSSE41 : Predicate<"!Subtarget->hasSSE41()">;
+def UseSSE41 : Predicate<"Subtarget->hasSSE41() && !Subtarget->hasAVX()">;
+def HasSSE42 : Predicate<"Subtarget->hasSSE42()">;
+def UseSSE42 : Predicate<"Subtarget->hasSSE42() && !Subtarget->hasAVX()">;
+def HasSSE4A : Predicate<"Subtarget->hasSSE4A()">;
+def HasAVX : Predicate<"Subtarget->hasAVX()">;
+def HasAVX2 : Predicate<"Subtarget->hasAVX2()">;
+def HasAVX1Only : Predicate<"Subtarget->hasAVX() && !Subtarget->hasAVX2()">;
+def HasAVX512 : Predicate<"Subtarget->hasAVX512()">,
+ AssemblerPredicate<"FeatureAVX512", "AVX-512 ISA">;
+def UseAVX : Predicate<"Subtarget->hasAVX() && !Subtarget->hasAVX512()">;
+def UseAVX2 : Predicate<"Subtarget->hasAVX2() && !Subtarget->hasAVX512()">;
+def NoAVX512 : Predicate<"!Subtarget->hasAVX512()">;
+def HasCDI : Predicate<"Subtarget->hasCDI()">,
+ AssemblerPredicate<"FeatureCDI", "AVX-512 CD ISA">;
+def HasPFI : Predicate<"Subtarget->hasPFI()">,
+ AssemblerPredicate<"FeaturePFI", "AVX-512 PF ISA">;
+def HasERI : Predicate<"Subtarget->hasERI()">,
+ AssemblerPredicate<"FeatureERI", "AVX-512 ER ISA">;
+def HasDQI : Predicate<"Subtarget->hasDQI()">,
+ AssemblerPredicate<"FeatureDQI", "AVX-512 DQ ISA">;
+def NoDQI : Predicate<"!Subtarget->hasDQI()">;
+def HasBWI : Predicate<"Subtarget->hasBWI()">,
+ AssemblerPredicate<"FeatureBWI", "AVX-512 BW ISA">;
+def NoBWI : Predicate<"!Subtarget->hasBWI()">;
+def HasVLX : Predicate<"Subtarget->hasVLX()">,
+ AssemblerPredicate<"FeatureVLX", "AVX-512 VL ISA">;
+def NoVLX : Predicate<"!Subtarget->hasVLX()">;
+def NoVLX_Or_NoBWI : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasBWI()">;
+def NoVLX_Or_NoDQI : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasDQI()">;
+def PKU : Predicate<"!Subtarget->hasPKU()">;
+
+def HasPOPCNT : Predicate<"Subtarget->hasPOPCNT()">;
+def HasAES : Predicate<"Subtarget->hasAES()">;
+def HasFXSR : Predicate<"Subtarget->hasFXSR()">;
+def HasXSAVE : Predicate<"Subtarget->hasXSAVE()">;
+def HasXSAVEOPT : Predicate<"Subtarget->hasXSAVEOPT()">;
+def HasXSAVEC : Predicate<"Subtarget->hasXSAVEC()">;
+def HasXSAVES : Predicate<"Subtarget->hasXSAVES()">;
+def HasPCLMUL : Predicate<"Subtarget->hasPCLMUL()">;
+def HasFMA : Predicate<"Subtarget->hasFMA()">;
+def UseFMAOnAVX : Predicate<"Subtarget->hasFMA() && !Subtarget->hasAVX512()">;
+def HasFMA4 : Predicate<"Subtarget->hasFMA4()">;
+def HasXOP : Predicate<"Subtarget->hasXOP()">;
+def HasTBM : Predicate<"Subtarget->hasTBM()">;
+def HasMOVBE : Predicate<"Subtarget->hasMOVBE()">;
+def HasRDRAND : Predicate<"Subtarget->hasRDRAND()">;
+def HasF16C : Predicate<"Subtarget->hasF16C()">;
+def HasFSGSBase : Predicate<"Subtarget->hasFSGSBase()">;
+def HasLZCNT : Predicate<"Subtarget->hasLZCNT()">;
+def HasBMI : Predicate<"Subtarget->hasBMI()">;
+def HasBMI2 : Predicate<"Subtarget->hasBMI2()">;
+def HasRTM : Predicate<"Subtarget->hasRTM()">;
+def HasHLE : Predicate<"Subtarget->hasHLE()">;
+def HasTSX : Predicate<"Subtarget->hasRTM() || Subtarget->hasHLE()">;
+def HasADX : Predicate<"Subtarget->hasADX()">;
+def HasSHA : Predicate<"Subtarget->hasSHA()">;
+def HasPRFCHW : Predicate<"Subtarget->hasPRFCHW()">;
+def HasRDSEED : Predicate<"Subtarget->hasRDSEED()">;
+def HasPrefetchW : Predicate<"Subtarget->hasPRFCHW()">;
+def HasLAHFSAHF : Predicate<"Subtarget->hasLAHFSAHF()">;
+def FPStackf32 : Predicate<"!Subtarget->hasSSE1()">;
+def FPStackf64 : Predicate<"!Subtarget->hasSSE2()">;
+def HasMPX : Predicate<"Subtarget->hasMPX()">;
+def HasCmpxchg16b: Predicate<"Subtarget->hasCmpxchg16b()">;
+def Not64BitMode : Predicate<"!Subtarget->is64Bit()">,
+ AssemblerPredicate<"!Mode64Bit", "Not 64-bit mode">;
+def In64BitMode : Predicate<"Subtarget->is64Bit()">,
+ AssemblerPredicate<"Mode64Bit", "64-bit mode">;
+def IsLP64 : Predicate<"Subtarget->isTarget64BitLP64()">;
+def NotLP64 : Predicate<"!Subtarget->isTarget64BitLP64()">;
+def In16BitMode : Predicate<"Subtarget->is16Bit()">,
+ AssemblerPredicate<"Mode16Bit", "16-bit mode">;
+def Not16BitMode : Predicate<"!Subtarget->is16Bit()">,
+ AssemblerPredicate<"!Mode16Bit", "Not 16-bit mode">;
+def In32BitMode : Predicate<"Subtarget->is32Bit()">,
+ AssemblerPredicate<"Mode32Bit", "32-bit mode">;
+def IsWin64 : Predicate<"Subtarget->isTargetWin64()">;
+def NotWin64 : Predicate<"!Subtarget->isTargetWin64()">;
+def IsPS4 : Predicate<"Subtarget->isTargetPS4()">;
+def NotPS4 : Predicate<"!Subtarget->isTargetPS4()">;
+def IsNaCl : Predicate<"Subtarget->isTargetNaCl()">;
+def NotNaCl : Predicate<"!Subtarget->isTargetNaCl()">;
+def SmallCode : Predicate<"TM.getCodeModel() == CodeModel::Small">;
+def KernelCode : Predicate<"TM.getCodeModel() == CodeModel::Kernel">;
+def FarData : Predicate<"TM.getCodeModel() != CodeModel::Small &&"
+ "TM.getCodeModel() != CodeModel::Kernel">;
+def NearData : Predicate<"TM.getCodeModel() == CodeModel::Small ||"
+ "TM.getCodeModel() == CodeModel::Kernel">;
+def IsStatic : Predicate<"TM.getRelocationModel() == Reloc::Static">;
+def IsNotPIC : Predicate<"TM.getRelocationModel() != Reloc::PIC_">;
+def OptForSize : Predicate<"OptForSize">;
+def OptForSpeed : Predicate<"!OptForSize">;
+def FastBTMem : Predicate<"!Subtarget->isBTMemSlow()">;
+def CallImmAddr : Predicate<"Subtarget->IsLegalToCallImmediateAddr(TM)">;
+def FavorMemIndirectCall : Predicate<"!Subtarget->callRegIndirect()">;
+def NotSlowIncDec : Predicate<"!Subtarget->slowIncDec()">;
+def HasFastMem32 : Predicate<"!Subtarget->isUnalignedMem32Slow()">;
+
+//===----------------------------------------------------------------------===//
+// X86 Instruction Format Definitions.
+//
+
+include "X86InstrFormats.td"
+
+//===----------------------------------------------------------------------===//
+// Pattern fragments.
+//
+
+// X86 specific condition code. These correspond to CondCode in
+// X86InstrInfo.h. They must be kept in sync.
+def X86_COND_A : PatLeaf<(i8 0)>; // alt. COND_NBE
+def X86_COND_AE : PatLeaf<(i8 1)>; // alt. COND_NC
+def X86_COND_B : PatLeaf<(i8 2)>; // alt. COND_C
+def X86_COND_BE : PatLeaf<(i8 3)>; // alt. COND_NA
+def X86_COND_E : PatLeaf<(i8 4)>; // alt. COND_Z
+def X86_COND_G : PatLeaf<(i8 5)>; // alt. COND_NLE
+def X86_COND_GE : PatLeaf<(i8 6)>; // alt. COND_NL
+def X86_COND_L : PatLeaf<(i8 7)>; // alt. COND_NGE
+def X86_COND_LE : PatLeaf<(i8 8)>; // alt. COND_NG
+def X86_COND_NE : PatLeaf<(i8 9)>; // alt. COND_NZ
+def X86_COND_NO : PatLeaf<(i8 10)>;
+def X86_COND_NP : PatLeaf<(i8 11)>; // alt. COND_PO
+def X86_COND_NS : PatLeaf<(i8 12)>;
+def X86_COND_O : PatLeaf<(i8 13)>;
+def X86_COND_P : PatLeaf<(i8 14)>; // alt. COND_PE
+def X86_COND_S : PatLeaf<(i8 15)>;
+
+// Predicate used to help when pattern matching LZCNT/TZCNT.
+def X86_COND_E_OR_NE : ImmLeaf<i8, [{
+ return (Imm == X86::COND_E) || (Imm == X86::COND_NE);
+}]>;
+
+
+def i16immSExt8 : ImmLeaf<i16, [{ return isInt<8>(Imm); }]>;
+def i32immSExt8 : ImmLeaf<i32, [{ return isInt<8>(Imm); }]>;
+def i64immSExt8 : ImmLeaf<i64, [{ return isInt<8>(Imm); }]>;
+
+// If we have multiple users of an immediate, it's much smaller to reuse
+// the register, rather than encode the immediate in every instruction.
+// This has the risk of increasing register pressure from stretched live
+// ranges; however, the immediates should be trivial for the RA to
+// rematerialize in the event of high register pressure.
+// TODO : This is currently enabled for stores and binary ops. There are more
+// cases for which this can be enabled, though this catches the bulk of the
+// issues.
+// TODO2 : This should really also be enabled under O2, but there's currently
+// an issue with RA where we don't pull the constants into their users
+// when we rematerialize them. I'll follow up on enabling O2 after we fix that
+// issue.
+// TODO3 : This is currently limited to single basic blocks (DAG creation
+// pulls block immediates to the top and merges them if necessary).
+// Eventually, it would be nice to allow ConstantHoisting to merge constants
+// globally for potentially added savings.
+//
+def imm8_su : PatLeaf<(i8 imm), [{
+ return !shouldAvoidImmediateInstFormsForSize(N);
+}]>;
+def imm16_su : PatLeaf<(i16 imm), [{
+ return !shouldAvoidImmediateInstFormsForSize(N);
+}]>;
+def imm32_su : PatLeaf<(i32 imm), [{
+ return !shouldAvoidImmediateInstFormsForSize(N);
+}]>;
+
+def i16immSExt8_su : PatLeaf<(i16immSExt8), [{
+ return !shouldAvoidImmediateInstFormsForSize(N);
+}]>;
+def i32immSExt8_su : PatLeaf<(i32immSExt8), [{
+ return !shouldAvoidImmediateInstFormsForSize(N);
+}]>;
+
+
+def i64immSExt32 : ImmLeaf<i64, [{ return isInt<32>(Imm); }]>;
+
+
+// i64immZExt32 predicate - True if the 64-bit immediate fits in a 32-bit
+// unsigned field.
+def i64immZExt32 : ImmLeaf<i64, [{ return isUInt<32>(Imm); }]>;
+
+def i64immZExt32SExt8 : ImmLeaf<i64, [{
+ return isUInt<32>(Imm) && isInt<8>(static_cast<int32_t>(Imm));
+}]>;
+
+// Helper fragments for loads.
+// It's always safe to treat an anyext i16 load as an i32 load if the i16 is
+// known to be 32-bit aligned or better. Ditto for i8 to i16.
+def loadi16 : PatFrag<(ops node:$ptr), (i16 (unindexedload node:$ptr)), [{
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+ ISD::LoadExtType ExtType = LD->getExtensionType();
+ if (ExtType == ISD::NON_EXTLOAD)
+ return true;
+ if (ExtType == ISD::EXTLOAD)
+ return LD->getAlignment() >= 2 && !LD->isVolatile();
+ return false;
+}]>;
+
+def loadi16_anyext : PatFrag<(ops node:$ptr), (i32 (unindexedload node:$ptr)),[{
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+ ISD::LoadExtType ExtType = LD->getExtensionType();
+ if (ExtType == ISD::EXTLOAD)
+ return LD->getAlignment() >= 2 && !LD->isVolatile();
+ return false;
+}]>;
+
+def loadi32 : PatFrag<(ops node:$ptr), (i32 (unindexedload node:$ptr)), [{
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+ ISD::LoadExtType ExtType = LD->getExtensionType();
+ if (ExtType == ISD::NON_EXTLOAD)
+ return true;
+ if (ExtType == ISD::EXTLOAD)
+ return LD->getAlignment() >= 4 && !LD->isVolatile();
+ return false;
+}]>;
+
+def loadi8 : PatFrag<(ops node:$ptr), (i8 (load node:$ptr))>;
+def loadi64 : PatFrag<(ops node:$ptr), (i64 (load node:$ptr))>;
+def loadf32 : PatFrag<(ops node:$ptr), (f32 (load node:$ptr))>;
+def loadf64 : PatFrag<(ops node:$ptr), (f64 (load node:$ptr))>;
+def loadf80 : PatFrag<(ops node:$ptr), (f80 (load node:$ptr))>;
+def loadf128 : PatFrag<(ops node:$ptr), (f128 (load node:$ptr))>;
+
+def sextloadi16i8 : PatFrag<(ops node:$ptr), (i16 (sextloadi8 node:$ptr))>;
+def sextloadi32i8 : PatFrag<(ops node:$ptr), (i32 (sextloadi8 node:$ptr))>;
+def sextloadi32i16 : PatFrag<(ops node:$ptr), (i32 (sextloadi16 node:$ptr))>;
+def sextloadi64i8 : PatFrag<(ops node:$ptr), (i64 (sextloadi8 node:$ptr))>;
+def sextloadi64i16 : PatFrag<(ops node:$ptr), (i64 (sextloadi16 node:$ptr))>;
+def sextloadi64i32 : PatFrag<(ops node:$ptr), (i64 (sextloadi32 node:$ptr))>;
+
+def zextloadi8i1 : PatFrag<(ops node:$ptr), (i8 (zextloadi1 node:$ptr))>;
+def zextloadi16i1 : PatFrag<(ops node:$ptr), (i16 (zextloadi1 node:$ptr))>;
+def zextloadi32i1 : PatFrag<(ops node:$ptr), (i32 (zextloadi1 node:$ptr))>;
+def zextloadi16i8 : PatFrag<(ops node:$ptr), (i16 (zextloadi8 node:$ptr))>;
+def zextloadi32i8 : PatFrag<(ops node:$ptr), (i32 (zextloadi8 node:$ptr))>;
+def zextloadi32i16 : PatFrag<(ops node:$ptr), (i32 (zextloadi16 node:$ptr))>;
+def zextloadi64i1 : PatFrag<(ops node:$ptr), (i64 (zextloadi1 node:$ptr))>;
+def zextloadi64i8 : PatFrag<(ops node:$ptr), (i64 (zextloadi8 node:$ptr))>;
+def zextloadi64i16 : PatFrag<(ops node:$ptr), (i64 (zextloadi16 node:$ptr))>;
+def zextloadi64i32 : PatFrag<(ops node:$ptr), (i64 (zextloadi32 node:$ptr))>;
+
+def extloadi8i1 : PatFrag<(ops node:$ptr), (i8 (extloadi1 node:$ptr))>;
+def extloadi16i1 : PatFrag<(ops node:$ptr), (i16 (extloadi1 node:$ptr))>;
+def extloadi32i1 : PatFrag<(ops node:$ptr), (i32 (extloadi1 node:$ptr))>;
+def extloadi16i8 : PatFrag<(ops node:$ptr), (i16 (extloadi8 node:$ptr))>;
+def extloadi32i8 : PatFrag<(ops node:$ptr), (i32 (extloadi8 node:$ptr))>;
+def extloadi32i16 : PatFrag<(ops node:$ptr), (i32 (extloadi16 node:$ptr))>;
+def extloadi64i1 : PatFrag<(ops node:$ptr), (i64 (extloadi1 node:$ptr))>;
+def extloadi64i8 : PatFrag<(ops node:$ptr), (i64 (extloadi8 node:$ptr))>;
+def extloadi64i16 : PatFrag<(ops node:$ptr), (i64 (extloadi16 node:$ptr))>;
+def extloadi64i32 : PatFrag<(ops node:$ptr), (i64 (extloadi32 node:$ptr))>;
+
+
+// An 'and' node with a single use.
+def and_su : PatFrag<(ops node:$lhs, node:$rhs), (and node:$lhs, node:$rhs), [{
+ return N->hasOneUse();
+}]>;
+// An 'srl' node with a single use.
+def srl_su : PatFrag<(ops node:$lhs, node:$rhs), (srl node:$lhs, node:$rhs), [{
+ return N->hasOneUse();
+}]>;
+// A 'trunc' node with a single use.
+def trunc_su : PatFrag<(ops node:$src), (trunc node:$src), [{
+ return N->hasOneUse();
+}]>;
+
+//===----------------------------------------------------------------------===//
+// Instruction list.
+//
+
+// Nop
+let hasSideEffects = 0, SchedRW = [WriteZero] in {
+ def NOOP : I<0x90, RawFrm, (outs), (ins), "nop", [], IIC_NOP>;
+ def NOOPW : I<0x1f, MRMXm, (outs), (ins i16mem:$zero),
+ "nop{w}\t$zero", [], IIC_NOP>, TB, OpSize16;
+ def NOOPL : I<0x1f, MRMXm, (outs), (ins i32mem:$zero),
+ "nop{l}\t$zero", [], IIC_NOP>, TB, OpSize32;
+}
+
+
+// Constructing a stack frame.
+def ENTER : Ii16<0xC8, RawFrmImm8, (outs), (ins i16imm:$len, i8imm:$lvl),
+ "enter\t$len, $lvl", [], IIC_ENTER>, Sched<[WriteMicrocoded]>;
+
+let SchedRW = [WriteALU] in {
+let Defs = [EBP, ESP], Uses = [EBP, ESP], mayLoad = 1, hasSideEffects=0 in
+def LEAVE : I<0xC9, RawFrm,
+ (outs), (ins), "leave", [], IIC_LEAVE>,
+ Requires<[Not64BitMode]>;
+
+let Defs = [RBP,RSP], Uses = [RBP,RSP], mayLoad = 1, hasSideEffects = 0 in
+def LEAVE64 : I<0xC9, RawFrm,
+ (outs), (ins), "leave", [], IIC_LEAVE>,
+ Requires<[In64BitMode]>;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Miscellaneous Instructions.
+//
+
+let Defs = [ESP], Uses = [ESP], hasSideEffects=0 in {
+let mayLoad = 1, SchedRW = [WriteLoad] in {
+def POP16r : I<0x58, AddRegFrm, (outs GR16:$reg), (ins), "pop{w}\t$reg", [],
+ IIC_POP_REG16>, OpSize16;
+def POP32r : I<0x58, AddRegFrm, (outs GR32:$reg), (ins), "pop{l}\t$reg", [],
+ IIC_POP_REG>, OpSize32, Requires<[Not64BitMode]>;
+def POP16rmr: I<0x8F, MRM0r, (outs GR16:$reg), (ins), "pop{w}\t$reg", [],
+ IIC_POP_REG>, OpSize16;
+def POP16rmm: I<0x8F, MRM0m, (outs), (ins i16mem:$dst), "pop{w}\t$dst", [],
+ IIC_POP_MEM>, OpSize16;
+def POP32rmr: I<0x8F, MRM0r, (outs GR32:$reg), (ins), "pop{l}\t$reg", [],
+ IIC_POP_REG>, OpSize32, Requires<[Not64BitMode]>;
+def POP32rmm: I<0x8F, MRM0m, (outs), (ins i32mem:$dst), "pop{l}\t$dst", [],
+ IIC_POP_MEM>, OpSize32, Requires<[Not64BitMode]>;
+} // mayLoad, SchedRW
+
+let mayStore = 1, SchedRW = [WriteStore] in {
+def PUSH16r : I<0x50, AddRegFrm, (outs), (ins GR16:$reg), "push{w}\t$reg",[],
+ IIC_PUSH_REG>, OpSize16;
+def PUSH32r : I<0x50, AddRegFrm, (outs), (ins GR32:$reg), "push{l}\t$reg",[],
+ IIC_PUSH_REG>, OpSize32, Requires<[Not64BitMode]>;
+def PUSH16rmr: I<0xFF, MRM6r, (outs), (ins GR16:$reg), "push{w}\t$reg",[],
+ IIC_PUSH_REG>, OpSize16;
+def PUSH32rmr: I<0xFF, MRM6r, (outs), (ins GR32:$reg), "push{l}\t$reg",[],
+ IIC_PUSH_REG>, OpSize32, Requires<[Not64BitMode]>;
+
+def PUSH16i8 : Ii8<0x6a, RawFrm, (outs), (ins i16i8imm:$imm),
+ "push{w}\t$imm", [], IIC_PUSH_IMM>, OpSize16;
+def PUSHi16 : Ii16<0x68, RawFrm, (outs), (ins i16imm:$imm),
+ "push{w}\t$imm", [], IIC_PUSH_IMM>, OpSize16;
+
+def PUSH32i8 : Ii8<0x6a, RawFrm, (outs), (ins i32i8imm:$imm),
+ "push{l}\t$imm", [], IIC_PUSH_IMM>, OpSize32,
+ Requires<[Not64BitMode]>;
+def PUSHi32 : Ii32<0x68, RawFrm, (outs), (ins i32imm:$imm),
+ "push{l}\t$imm", [], IIC_PUSH_IMM>, OpSize32,
+ Requires<[Not64BitMode]>;
+} // mayStore, SchedRW
+
+let mayLoad = 1, mayStore = 1, SchedRW = [WriteRMW] in {
+def PUSH16rmm: I<0xFF, MRM6m, (outs), (ins i16mem:$src), "push{w}\t$src",[],
+ IIC_PUSH_MEM>, OpSize16;
+def PUSH32rmm: I<0xFF, MRM6m, (outs), (ins i32mem:$src), "push{l}\t$src",[],
+ IIC_PUSH_MEM>, OpSize32, Requires<[Not64BitMode]>;
+} // mayLoad, mayStore, SchedRW
+
+}
+
+let mayLoad = 1, mayStore = 1, usesCustomInserter = 1,
+ SchedRW = [WriteRMW], Defs = [ESP] in {
+ let Uses = [ESP, EFLAGS] in
+ def RDFLAGS32 : PseudoI<(outs GR32:$dst), (ins),
+ [(set GR32:$dst, (int_x86_flags_read_u32))]>,
+ Requires<[Not64BitMode]>;
+
+ let Uses = [RSP, EFLAGS] in
+ def RDFLAGS64 : PseudoI<(outs GR64:$dst), (ins),
+ [(set GR64:$dst, (int_x86_flags_read_u64))]>,
+ Requires<[In64BitMode]>;
+}
+
+let mayLoad = 1, mayStore = 1, usesCustomInserter = 1,
+ SchedRW = [WriteRMW] in {
+ let Defs = [ESP, EFLAGS], Uses = [ESP] in
+ def WRFLAGS32 : PseudoI<(outs), (ins GR32:$src),
+ [(int_x86_flags_write_u32 GR32:$src)]>,
+ Requires<[Not64BitMode]>;
+
+ let Defs = [RSP, EFLAGS], Uses = [RSP] in
+ def WRFLAGS64 : PseudoI<(outs), (ins GR64:$src),
+ [(int_x86_flags_write_u64 GR64:$src)]>,
+ Requires<[In64BitMode]>;
+}
+
+let Defs = [ESP, EFLAGS], Uses = [ESP], mayLoad = 1, hasSideEffects=0,
+ SchedRW = [WriteLoad] in {
+def POPF16 : I<0x9D, RawFrm, (outs), (ins), "popf{w}", [], IIC_POP_F>,
+ OpSize16;
+def POPF32 : I<0x9D, RawFrm, (outs), (ins), "popf{l|d}", [], IIC_POP_FD>,
+ OpSize32, Requires<[Not64BitMode]>;
+}
+
+let Defs = [ESP], Uses = [ESP, EFLAGS], mayStore = 1, hasSideEffects=0,
+ SchedRW = [WriteStore] in {
+def PUSHF16 : I<0x9C, RawFrm, (outs), (ins), "pushf{w}", [], IIC_PUSH_F>,
+ OpSize16;
+def PUSHF32 : I<0x9C, RawFrm, (outs), (ins), "pushf{l|d}", [], IIC_PUSH_F>,
+ OpSize32, Requires<[Not64BitMode]>;
+}
+
+let Defs = [RSP], Uses = [RSP], hasSideEffects=0 in {
+let mayLoad = 1, SchedRW = [WriteLoad] in {
+def POP64r : I<0x58, AddRegFrm, (outs GR64:$reg), (ins), "pop{q}\t$reg", [],
+ IIC_POP_REG>, OpSize32, Requires<[In64BitMode]>;
+def POP64rmr: I<0x8F, MRM0r, (outs GR64:$reg), (ins), "pop{q}\t$reg", [],
+ IIC_POP_REG>, OpSize32, Requires<[In64BitMode]>;
+def POP64rmm: I<0x8F, MRM0m, (outs), (ins i64mem:$dst), "pop{q}\t$dst", [],
+ IIC_POP_MEM>, OpSize32, Requires<[In64BitMode]>;
+} // mayLoad, SchedRW
+let mayStore = 1, SchedRW = [WriteStore] in {
+def PUSH64r : I<0x50, AddRegFrm, (outs), (ins GR64:$reg), "push{q}\t$reg", [],
+ IIC_PUSH_REG>, OpSize32, Requires<[In64BitMode]>;
+def PUSH64rmr: I<0xFF, MRM6r, (outs), (ins GR64:$reg), "push{q}\t$reg", [],
+ IIC_PUSH_REG>, OpSize32, Requires<[In64BitMode]>;
+} // mayStore, SchedRW
+let mayLoad = 1, mayStore = 1, SchedRW = [WriteRMW] in {
+def PUSH64rmm: I<0xFF, MRM6m, (outs), (ins i64mem:$src), "push{q}\t$src", [],
+ IIC_PUSH_MEM>, OpSize32, Requires<[In64BitMode]>;
+} // mayLoad, mayStore, SchedRW
+}
+
+let Defs = [RSP], Uses = [RSP], hasSideEffects = 0, mayStore = 1,
+ SchedRW = [WriteStore] in {
+def PUSH64i8 : Ii8<0x6a, RawFrm, (outs), (ins i64i8imm:$imm),
+ "push{q}\t$imm", [], IIC_PUSH_IMM>, OpSize32,
+ Requires<[In64BitMode]>;
+def PUSH64i32 : Ii32S<0x68, RawFrm, (outs), (ins i64i32imm:$imm),
+ "push{q}\t$imm", [], IIC_PUSH_IMM>, OpSize32,
+ Requires<[In64BitMode]>;
+}
+
+let Defs = [RSP, EFLAGS], Uses = [RSP], mayLoad = 1, hasSideEffects=0 in
+def POPF64 : I<0x9D, RawFrm, (outs), (ins), "popfq", [], IIC_POP_FD>,
+ OpSize32, Requires<[In64BitMode]>, Sched<[WriteLoad]>;
+let Defs = [RSP], Uses = [RSP, EFLAGS], mayStore = 1, hasSideEffects=0 in
+def PUSHF64 : I<0x9C, RawFrm, (outs), (ins), "pushfq", [], IIC_PUSH_F>,
+ OpSize32, Requires<[In64BitMode]>, Sched<[WriteStore]>;
+
+let Defs = [EDI, ESI, EBP, EBX, EDX, ECX, EAX, ESP], Uses = [ESP],
+ mayLoad = 1, hasSideEffects = 0, SchedRW = [WriteLoad] in {
+def POPA32 : I<0x61, RawFrm, (outs), (ins), "popal", [], IIC_POP_A>,
+ OpSize32, Requires<[Not64BitMode]>;
+def POPA16 : I<0x61, RawFrm, (outs), (ins), "popaw", [], IIC_POP_A>,
+ OpSize16, Requires<[Not64BitMode]>;
+}
+let Defs = [ESP], Uses = [EDI, ESI, EBP, EBX, EDX, ECX, EAX, ESP],
+ mayStore = 1, hasSideEffects = 0, SchedRW = [WriteStore] in {
+def PUSHA32 : I<0x60, RawFrm, (outs), (ins), "pushal", [], IIC_PUSH_A>,
+ OpSize32, Requires<[Not64BitMode]>;
+def PUSHA16 : I<0x60, RawFrm, (outs), (ins), "pushaw", [], IIC_PUSH_A>,
+ OpSize16, Requires<[Not64BitMode]>;
+}
+
+let Constraints = "$src = $dst", SchedRW = [WriteALU] in {
+// GR32 = bswap GR32
+def BSWAP32r : I<0xC8, AddRegFrm,
+ (outs GR32:$dst), (ins GR32:$src),
+ "bswap{l}\t$dst",
+ [(set GR32:$dst, (bswap GR32:$src))], IIC_BSWAP>, OpSize32, TB;
+
+def BSWAP64r : RI<0xC8, AddRegFrm, (outs GR64:$dst), (ins GR64:$src),
+ "bswap{q}\t$dst",
+ [(set GR64:$dst, (bswap GR64:$src))], IIC_BSWAP>, TB;
+} // Constraints = "$src = $dst", SchedRW
+
+// Bit scan instructions.
+let Defs = [EFLAGS] in {
+def BSF16rr : I<0xBC, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+ "bsf{w}\t{$src, $dst|$dst, $src}",
+ [(set GR16:$dst, EFLAGS, (X86bsf GR16:$src))],
+ IIC_BIT_SCAN_REG>, PS, OpSize16, Sched<[WriteShift]>;
+def BSF16rm : I<0xBC, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+ "bsf{w}\t{$src, $dst|$dst, $src}",
+ [(set GR16:$dst, EFLAGS, (X86bsf (loadi16 addr:$src)))],
+ IIC_BIT_SCAN_MEM>, PS, OpSize16, Sched<[WriteShiftLd]>;
+def BSF32rr : I<0xBC, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+ "bsf{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, EFLAGS, (X86bsf GR32:$src))],
+ IIC_BIT_SCAN_REG>, PS, OpSize32, Sched<[WriteShift]>;
+def BSF32rm : I<0xBC, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+ "bsf{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, EFLAGS, (X86bsf (loadi32 addr:$src)))],
+ IIC_BIT_SCAN_MEM>, PS, OpSize32, Sched<[WriteShiftLd]>;
+def BSF64rr : RI<0xBC, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+ "bsf{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, EFLAGS, (X86bsf GR64:$src))],
+ IIC_BIT_SCAN_REG>, PS, Sched<[WriteShift]>;
+def BSF64rm : RI<0xBC, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+ "bsf{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, EFLAGS, (X86bsf (loadi64 addr:$src)))],
+ IIC_BIT_SCAN_MEM>, PS, Sched<[WriteShiftLd]>;
+
+def BSR16rr : I<0xBD, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+ "bsr{w}\t{$src, $dst|$dst, $src}",
+ [(set GR16:$dst, EFLAGS, (X86bsr GR16:$src))],
+ IIC_BIT_SCAN_REG>, PS, OpSize16, Sched<[WriteShift]>;
+def BSR16rm : I<0xBD, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+ "bsr{w}\t{$src, $dst|$dst, $src}",
+ [(set GR16:$dst, EFLAGS, (X86bsr (loadi16 addr:$src)))],
+ IIC_BIT_SCAN_MEM>, PS, OpSize16, Sched<[WriteShiftLd]>;
+def BSR32rr : I<0xBD, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+ "bsr{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, EFLAGS, (X86bsr GR32:$src))],
+ IIC_BIT_SCAN_REG>, PS, OpSize32, Sched<[WriteShift]>;
+def BSR32rm : I<0xBD, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+ "bsr{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, EFLAGS, (X86bsr (loadi32 addr:$src)))],
+ IIC_BIT_SCAN_MEM>, PS, OpSize32, Sched<[WriteShiftLd]>;
+def BSR64rr : RI<0xBD, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+ "bsr{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, EFLAGS, (X86bsr GR64:$src))],
+ IIC_BIT_SCAN_REG>, PS, Sched<[WriteShift]>;
+def BSR64rm : RI<0xBD, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+ "bsr{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, EFLAGS, (X86bsr (loadi64 addr:$src)))],
+ IIC_BIT_SCAN_MEM>, PS, Sched<[WriteShiftLd]>;
+} // Defs = [EFLAGS]
+
+let SchedRW = [WriteMicrocoded] in {
+// These use the DF flag in the EFLAGS register to inc or dec EDI and ESI.
+let Defs = [EDI,ESI], Uses = [EDI,ESI,EFLAGS] in {
+def MOVSB : I<0xA4, RawFrmDstSrc, (outs dstidx8:$dst), (ins srcidx8:$src),
+ "movsb\t{$src, $dst|$dst, $src}", [], IIC_MOVS>;
+def MOVSW : I<0xA5, RawFrmDstSrc, (outs dstidx16:$dst), (ins srcidx16:$src),
+ "movsw\t{$src, $dst|$dst, $src}", [], IIC_MOVS>, OpSize16;
+def MOVSL : I<0xA5, RawFrmDstSrc, (outs dstidx32:$dst), (ins srcidx32:$src),
+ "movs{l|d}\t{$src, $dst|$dst, $src}", [], IIC_MOVS>, OpSize32;
+def MOVSQ : RI<0xA5, RawFrmDstSrc, (outs dstidx64:$dst), (ins srcidx64:$src),
+ "movsq\t{$src, $dst|$dst, $src}", [], IIC_MOVS>;
+}
+
+// These use the DF flag in the EFLAGS register to inc or dec EDI.
+let Defs = [EDI], Uses = [AL,EDI,EFLAGS] in
+def STOSB : I<0xAA, RawFrmDst, (outs dstidx8:$dst), (ins),
+ "stosb\t{%al, $dst|$dst, al}", [], IIC_STOS>;
+let Defs = [EDI], Uses = [AX,EDI,EFLAGS] in
+def STOSW : I<0xAB, RawFrmDst, (outs dstidx16:$dst), (ins),
+ "stosw\t{%ax, $dst|$dst, ax}", [], IIC_STOS>, OpSize16;
+let Defs = [EDI], Uses = [EAX,EDI,EFLAGS] in
+def STOSL : I<0xAB, RawFrmDst, (outs dstidx32:$dst), (ins),
+ "stos{l|d}\t{%eax, $dst|$dst, eax}", [], IIC_STOS>, OpSize32;
+let Defs = [RCX,RDI], Uses = [RAX,RCX,RDI,EFLAGS] in
+def STOSQ : RI<0xAB, RawFrmDst, (outs dstidx64:$dst), (ins),
+ "stosq\t{%rax, $dst|$dst, rax}", [], IIC_STOS>;
+
+// These use the DF flag in the EFLAGS register to inc or dec EDI.
+let Defs = [EDI,EFLAGS], Uses = [AL,EDI,EFLAGS] in
+def SCASB : I<0xAE, RawFrmDst, (outs), (ins dstidx8:$dst),
+ "scasb\t{$dst, %al|al, $dst}", [], IIC_SCAS>;
+let Defs = [EDI,EFLAGS], Uses = [AX,EDI,EFLAGS] in
+def SCASW : I<0xAF, RawFrmDst, (outs), (ins dstidx16:$dst),
+ "scasw\t{$dst, %ax|ax, $dst}", [], IIC_SCAS>, OpSize16;
+let Defs = [EDI,EFLAGS], Uses = [EAX,EDI,EFLAGS] in
+def SCASL : I<0xAF, RawFrmDst, (outs), (ins dstidx32:$dst),
+ "scas{l|d}\t{$dst, %eax|eax, $dst}", [], IIC_SCAS>, OpSize32;
+let Defs = [EDI,EFLAGS], Uses = [RAX,EDI,EFLAGS] in
+def SCASQ : RI<0xAF, RawFrmDst, (outs), (ins dstidx64:$dst),
+ "scasq\t{$dst, %rax|rax, $dst}", [], IIC_SCAS>;
+
+// These use the DF flag in the EFLAGS register to inc or dec EDI and ESI.
+let Defs = [EDI,ESI,EFLAGS], Uses = [EDI,ESI,EFLAGS] in {
+def CMPSB : I<0xA6, RawFrmDstSrc, (outs), (ins dstidx8:$dst, srcidx8:$src),
+ "cmpsb\t{$dst, $src|$src, $dst}", [], IIC_CMPS>;
+def CMPSW : I<0xA7, RawFrmDstSrc, (outs), (ins dstidx16:$dst, srcidx16:$src),
+ "cmpsw\t{$dst, $src|$src, $dst}", [], IIC_CMPS>, OpSize16;
+def CMPSL : I<0xA7, RawFrmDstSrc, (outs), (ins dstidx32:$dst, srcidx32:$src),
+ "cmps{l|d}\t{$dst, $src|$src, $dst}", [], IIC_CMPS>, OpSize32;
+def CMPSQ : RI<0xA7, RawFrmDstSrc, (outs), (ins dstidx64:$dst, srcidx64:$src),
+ "cmpsq\t{$dst, $src|$src, $dst}", [], IIC_CMPS>;
+}
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Move Instructions.
+//
+let SchedRW = [WriteMove] in {
+let hasSideEffects = 0 in {
+def MOV8rr : I<0x88, MRMDestReg, (outs GR8 :$dst), (ins GR8 :$src),
+ "mov{b}\t{$src, $dst|$dst, $src}", [], IIC_MOV>;
+def MOV16rr : I<0x89, MRMDestReg, (outs GR16:$dst), (ins GR16:$src),
+ "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize16;
+def MOV32rr : I<0x89, MRMDestReg, (outs GR32:$dst), (ins GR32:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize32;
+def MOV64rr : RI<0x89, MRMDestReg, (outs GR64:$dst), (ins GR64:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV>;
+}
+
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
+def MOV8ri : Ii8 <0xB0, AddRegFrm, (outs GR8 :$dst), (ins i8imm :$src),
+ "mov{b}\t{$src, $dst|$dst, $src}",
+ [(set GR8:$dst, imm:$src)], IIC_MOV>;
+def MOV16ri : Ii16<0xB8, AddRegFrm, (outs GR16:$dst), (ins i16imm:$src),
+ "mov{w}\t{$src, $dst|$dst, $src}",
+ [(set GR16:$dst, imm:$src)], IIC_MOV>, OpSize16;
+def MOV32ri : Ii32<0xB8, AddRegFrm, (outs GR32:$dst), (ins i32imm:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, imm:$src)], IIC_MOV>, OpSize32;
+def MOV64ri32 : RIi32S<0xC7, MRM0r, (outs GR64:$dst), (ins i64i32imm:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, i64immSExt32:$src)], IIC_MOV>;
+}
+let isReMaterializable = 1 in {
+def MOV64ri : RIi64<0xB8, AddRegFrm, (outs GR64:$dst), (ins i64imm:$src),
+ "movabs{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, imm:$src)], IIC_MOV>;
+}
+
+// Longer forms that use a ModR/M byte. Needed for the disassembler.
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
+def MOV8ri_alt : Ii8 <0xC6, MRM0r, (outs GR8 :$dst), (ins i8imm :$src),
+ "mov{b}\t{$src, $dst|$dst, $src}", [], IIC_MOV>;
+def MOV16ri_alt : Ii16<0xC7, MRM0r, (outs GR16:$dst), (ins i16imm:$src),
+ "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize16;
+def MOV32ri_alt : Ii32<0xC7, MRM0r, (outs GR32:$dst), (ins i32imm:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize32;
+}
+} // SchedRW
+
+let SchedRW = [WriteStore] in {
+def MOV8mi : Ii8 <0xC6, MRM0m, (outs), (ins i8mem :$dst, i8imm :$src),
+ "mov{b}\t{$src, $dst|$dst, $src}",
+ [(store (i8 imm8_su:$src), addr:$dst)], IIC_MOV_MEM>;
+def MOV16mi : Ii16<0xC7, MRM0m, (outs), (ins i16mem:$dst, i16imm:$src),
+ "mov{w}\t{$src, $dst|$dst, $src}",
+ [(store (i16 imm16_su:$src), addr:$dst)], IIC_MOV_MEM>, OpSize16;
+def MOV32mi : Ii32<0xC7, MRM0m, (outs), (ins i32mem:$dst, i32imm:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}",
+ [(store (i32 imm32_su:$src), addr:$dst)], IIC_MOV_MEM>, OpSize32;
+def MOV64mi32 : RIi32S<0xC7, MRM0m, (outs), (ins i64mem:$dst, i64i32imm:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}",
+ [(store i64immSExt32:$src, addr:$dst)], IIC_MOV_MEM>;
+} // SchedRW
+
+let hasSideEffects = 0 in {
+
+/// Memory offset versions of moves. The immediate is an address-mode-sized
+/// offset from the segment base.
+let SchedRW = [WriteALU] in {
+let mayLoad = 1 in {
+let Defs = [AL] in
+def MOV8ao32 : Ii32<0xA0, RawFrmMemOffs, (outs), (ins offset32_8:$src),
+ "mov{b}\t{$src, %al|al, $src}", [], IIC_MOV_MEM>,
+ AdSize32;
+let Defs = [AX] in
+def MOV16ao32 : Ii32<0xA1, RawFrmMemOffs, (outs), (ins offset32_16:$src),
+ "mov{w}\t{$src, %ax|ax, $src}", [], IIC_MOV_MEM>,
+ OpSize16, AdSize32;
+let Defs = [EAX] in
+def MOV32ao32 : Ii32<0xA1, RawFrmMemOffs, (outs), (ins offset32_32:$src),
+ "mov{l}\t{$src, %eax|eax, $src}", [], IIC_MOV_MEM>,
+ OpSize32, AdSize32;
+let Defs = [RAX] in
+def MOV64ao32 : RIi32<0xA1, RawFrmMemOffs, (outs), (ins offset32_64:$src),
+ "mov{q}\t{$src, %rax|rax, $src}", [], IIC_MOV_MEM>,
+ AdSize32;
+
+let Defs = [AL] in
+def MOV8ao16 : Ii16<0xA0, RawFrmMemOffs, (outs), (ins offset16_8:$src),
+ "mov{b}\t{$src, %al|al, $src}", [], IIC_MOV_MEM>, AdSize16;
+let Defs = [AX] in
+def MOV16ao16 : Ii16<0xA1, RawFrmMemOffs, (outs), (ins offset16_16:$src),
+ "mov{w}\t{$src, %ax|ax, $src}", [], IIC_MOV_MEM>,
+ OpSize16, AdSize16;
+let Defs = [EAX] in
+def MOV32ao16 : Ii16<0xA1, RawFrmMemOffs, (outs), (ins offset16_32:$src),
+ "mov{l}\t{$src, %eax|eax, $src}", [], IIC_MOV_MEM>,
+ AdSize16, OpSize32;
+}
+let mayStore = 1 in {
+let Uses = [AL] in
+def MOV8o32a : Ii32<0xA2, RawFrmMemOffs, (outs offset32_8:$dst), (ins),
+ "mov{b}\t{%al, $dst|$dst, al}", [], IIC_MOV_MEM>, AdSize32;
+let Uses = [AX] in
+def MOV16o32a : Ii32<0xA3, RawFrmMemOffs, (outs offset32_16:$dst), (ins),
+ "mov{w}\t{%ax, $dst|$dst, ax}", [], IIC_MOV_MEM>,
+ OpSize16, AdSize32;
+let Uses = [EAX] in
+def MOV32o32a : Ii32<0xA3, RawFrmMemOffs, (outs offset32_32:$dst), (ins),
+ "mov{l}\t{%eax, $dst|$dst, eax}", [], IIC_MOV_MEM>,
+ OpSize32, AdSize32;
+let Uses = [RAX] in
+def MOV64o32a : RIi32<0xA3, RawFrmMemOffs, (outs offset32_64:$dst), (ins),
+ "mov{q}\t{%rax, $dst|$dst, rax}", [], IIC_MOV_MEM>,
+ AdSize32;
+
+let Uses = [AL] in
+def MOV8o16a : Ii16<0xA2, RawFrmMemOffs, (outs offset16_8:$dst), (ins),
+ "mov{b}\t{%al, $dst|$dst, al}", [], IIC_MOV_MEM>, AdSize16;
+let Uses = [AX] in
+def MOV16o16a : Ii16<0xA3, RawFrmMemOffs, (outs offset16_16:$dst), (ins),
+ "mov{w}\t{%ax, $dst|$dst, ax}", [], IIC_MOV_MEM>,
+ OpSize16, AdSize16;
+let Uses = [EAX] in
+def MOV32o16a : Ii16<0xA3, RawFrmMemOffs, (outs offset16_32:$dst), (ins),
+ "mov{l}\t{%eax, $dst|$dst, eax}", [], IIC_MOV_MEM>,
+ OpSize32, AdSize16;
+}
+}
+
+// These forms all have full 64-bit absolute addresses in their instructions
+// and use the movabs mnemonic to indicate this specific form.
+let mayLoad = 1 in {
+let Defs = [AL] in
+def MOV8ao64 : RIi64_NOREX<0xA0, RawFrmMemOffs, (outs), (ins offset64_8:$src),
+ "movabs{b}\t{$src, %al|al, $src}", []>, AdSize64;
+let Defs = [AX] in
+def MOV16ao64 : RIi64_NOREX<0xA1, RawFrmMemOffs, (outs), (ins offset64_16:$src),
+ "movabs{w}\t{$src, %ax|ax, $src}", []>, OpSize16, AdSize64;
+let Defs = [EAX] in
+def MOV32ao64 : RIi64_NOREX<0xA1, RawFrmMemOffs, (outs), (ins offset64_32:$src),
+ "movabs{l}\t{$src, %eax|eax, $src}", []>, OpSize32,
+ AdSize64;
+let Defs = [RAX] in
+def MOV64ao64 : RIi64<0xA1, RawFrmMemOffs, (outs), (ins offset64_64:$src),
+ "movabs{q}\t{$src, %rax|rax, $src}", []>, AdSize64;
+}
+
+let mayStore = 1 in {
+let Uses = [AL] in
+def MOV8o64a : RIi64_NOREX<0xA2, RawFrmMemOffs, (outs offset64_8:$dst), (ins),
+ "movabs{b}\t{%al, $dst|$dst, al}", []>, AdSize64;
+let Uses = [AX] in
+def MOV16o64a : RIi64_NOREX<0xA3, RawFrmMemOffs, (outs offset64_16:$dst), (ins),
+ "movabs{w}\t{%ax, $dst|$dst, ax}", []>, OpSize16, AdSize64;
+let Uses = [EAX] in
+def MOV32o64a : RIi64_NOREX<0xA3, RawFrmMemOffs, (outs offset64_32:$dst), (ins),
+ "movabs{l}\t{%eax, $dst|$dst, eax}", []>, OpSize32,
+ AdSize64;
+let Uses = [RAX] in
+def MOV64o64a : RIi64<0xA3, RawFrmMemOffs, (outs offset64_64:$dst), (ins),
+ "movabs{q}\t{%rax, $dst|$dst, rax}", []>, AdSize64;
+}
+} // hasSideEffects = 0
+
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
+ SchedRW = [WriteMove] in {
+def MOV8rr_REV : I<0x8A, MRMSrcReg, (outs GR8:$dst), (ins GR8:$src),
+ "mov{b}\t{$src, $dst|$dst, $src}", [], IIC_MOV>;
+def MOV16rr_REV : I<0x8B, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+ "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize16;
+def MOV32rr_REV : I<0x8B, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize32;
+def MOV64rr_REV : RI<0x8B, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV>;
+}
+
+let canFoldAsLoad = 1, isReMaterializable = 1, SchedRW = [WriteLoad] in {
+def MOV8rm : I<0x8A, MRMSrcMem, (outs GR8 :$dst), (ins i8mem :$src),
+ "mov{b}\t{$src, $dst|$dst, $src}",
+ [(set GR8:$dst, (loadi8 addr:$src))], IIC_MOV_MEM>;
+def MOV16rm : I<0x8B, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+ "mov{w}\t{$src, $dst|$dst, $src}",
+ [(set GR16:$dst, (loadi16 addr:$src))], IIC_MOV_MEM>, OpSize16;
+def MOV32rm : I<0x8B, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (loadi32 addr:$src))], IIC_MOV_MEM>, OpSize32;
+def MOV64rm : RI<0x8B, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (load addr:$src))], IIC_MOV_MEM>;
+}
+
+let SchedRW = [WriteStore] in {
+def MOV8mr : I<0x88, MRMDestMem, (outs), (ins i8mem :$dst, GR8 :$src),
+ "mov{b}\t{$src, $dst|$dst, $src}",
+ [(store GR8:$src, addr:$dst)], IIC_MOV_MEM>;
+def MOV16mr : I<0x89, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
+ "mov{w}\t{$src, $dst|$dst, $src}",
+ [(store GR16:$src, addr:$dst)], IIC_MOV_MEM>, OpSize16;
+def MOV32mr : I<0x89, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}",
+ [(store GR32:$src, addr:$dst)], IIC_MOV_MEM>, OpSize32;
+def MOV64mr : RI<0x89, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}",
+ [(store GR64:$src, addr:$dst)], IIC_MOV_MEM>;
+} // SchedRW
+
+// Versions of MOV8rr, MOV8mr, and MOV8rm that use i8mem_NOREX and GR8_NOREX so
+// that they can be used for copying and storing h registers, which can't be
+// encoded when a REX prefix is present.
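+// (Illustrative: "mov %ah, (%r8)" is unencodable, because addressing R8
+// requires a REX prefix while AH cannot be encoded once a REX prefix is
+// present; the _NOREX classes keep such combinations from forming.)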
+let isCodeGenOnly = 1 in {
+let hasSideEffects = 0 in
+def MOV8rr_NOREX : I<0x88, MRMDestReg,
+ (outs GR8_NOREX:$dst), (ins GR8_NOREX:$src),
+ "mov{b}\t{$src, $dst|$dst, $src} # NOREX", [], IIC_MOV>,
+ Sched<[WriteMove]>;
+let mayStore = 1, hasSideEffects = 0 in
+def MOV8mr_NOREX : I<0x88, MRMDestMem,
+ (outs), (ins i8mem_NOREX:$dst, GR8_NOREX:$src),
+ "mov{b}\t{$src, $dst|$dst, $src} # NOREX", [],
+ IIC_MOV_MEM>, Sched<[WriteStore]>;
+let mayLoad = 1, hasSideEffects = 0,
+ canFoldAsLoad = 1, isReMaterializable = 1 in
+def MOV8rm_NOREX : I<0x8A, MRMSrcMem,
+ (outs GR8_NOREX:$dst), (ins i8mem_NOREX:$src),
+ "mov{b}\t{$src, $dst|$dst, $src} # NOREX", [],
+ IIC_MOV_MEM>, Sched<[WriteLoad]>;
+}
+
+
+// Condition code ops, incl. set if equal/not equal/...
+let SchedRW = [WriteALU] in {
+let Defs = [EFLAGS], Uses = [AH] in
+def SAHF : I<0x9E, RawFrm, (outs), (ins), "sahf",
+ [(set EFLAGS, (X86sahf AH))], IIC_AHF>,
+ Requires<[HasLAHFSAHF]>;
+let Defs = [AH], Uses = [EFLAGS], hasSideEffects = 0 in
+def LAHF : I<0x9F, RawFrm, (outs), (ins), "lahf", [],
+ IIC_AHF>, // AH = flags
+ Requires<[HasLAHFSAHF]>;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Bit test instructions: BT, BTS, BTR, BTC.
+
+let Defs = [EFLAGS] in {
+let SchedRW = [WriteALU] in {
+def BT16rr : I<0xA3, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2),
+ "bt{w}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86bt GR16:$src1, GR16:$src2))], IIC_BT_RR>,
+ OpSize16, TB;
+def BT32rr : I<0xA3, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2),
+ "bt{l}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86bt GR32:$src1, GR32:$src2))], IIC_BT_RR>,
+ OpSize32, TB;
+def BT64rr : RI<0xA3, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2),
+ "bt{q}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86bt GR64:$src1, GR64:$src2))], IIC_BT_RR>, TB;
+} // SchedRW
+
+// Unlike with the register+register form, the memory+register form of the
+// bt instruction does not ignore the high bits of the index. From ISel's
+// perspective, this is pretty bizarre. Make these instructions disassembly
+// only for now.
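+// Illustrative sketch of the memory-form semantics (not from this patch):
+//
+//   movl $100, %eax
+//   btl  %eax, (%rdi)   # tests bit 4 of the dword at 12(%rdi); the index
+//                       # addresses the whole bit string based at (%rdi)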
+
+let mayLoad = 1, hasSideEffects = 0, SchedRW = [WriteALULd] in {
+ def BT16mr : I<0xA3, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2),
+ "bt{w}\t{$src2, $src1|$src1, $src2}",
+ // [(X86bt (loadi16 addr:$src1), GR16:$src2),
+ // (implicit EFLAGS)]
+ [], IIC_BT_MR
+ >, OpSize16, TB, Requires<[FastBTMem]>;
+ def BT32mr : I<0xA3, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2),
+ "bt{l}\t{$src2, $src1|$src1, $src2}",
+ // [(X86bt (loadi32 addr:$src1), GR32:$src2),
+ // (implicit EFLAGS)]
+ [], IIC_BT_MR
+ >, OpSize32, TB, Requires<[FastBTMem]>;
+ def BT64mr : RI<0xA3, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
+ "bt{q}\t{$src2, $src1|$src1, $src2}",
+ // [(X86bt (loadi64 addr:$src1), GR64:$src2),
+ // (implicit EFLAGS)]
+ [], IIC_BT_MR
+ >, TB;
+}
+
+let SchedRW = [WriteALU] in {
+def BT16ri8 : Ii8<0xBA, MRM4r, (outs), (ins GR16:$src1, i16i8imm:$src2),
+ "bt{w}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86bt GR16:$src1, i16immSExt8:$src2))],
+ IIC_BT_RI>, OpSize16, TB;
+def BT32ri8 : Ii8<0xBA, MRM4r, (outs), (ins GR32:$src1, i32i8imm:$src2),
+ "bt{l}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86bt GR32:$src1, i32immSExt8:$src2))],
+ IIC_BT_RI>, OpSize32, TB;
+def BT64ri8 : RIi8<0xBA, MRM4r, (outs), (ins GR64:$src1, i64i8imm:$src2),
+ "bt{q}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86bt GR64:$src1, i64immSExt8:$src2))],
+ IIC_BT_RI>, TB;
+} // SchedRW
+
+// Note that these instructions don't need FastBTMem because that
+// only applies when the other operand is in a register. When it's
+// an immediate, bt is still fast.
+let SchedRW = [WriteALU] in {
+def BT16mi8 : Ii8<0xBA, MRM4m, (outs), (ins i16mem:$src1, i16i8imm:$src2),
+ "bt{w}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86bt (loadi16 addr:$src1), i16immSExt8:$src2))
+ ], IIC_BT_MI>, OpSize16, TB;
+def BT32mi8 : Ii8<0xBA, MRM4m, (outs), (ins i32mem:$src1, i32i8imm:$src2),
+ "bt{l}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86bt (loadi32 addr:$src1), i32immSExt8:$src2))
+ ], IIC_BT_MI>, OpSize32, TB;
+def BT64mi8 : RIi8<0xBA, MRM4m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
+ "bt{q}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86bt (loadi64 addr:$src1),
+ i64immSExt8:$src2))], IIC_BT_MI>, TB;
+} // SchedRW
+
+let hasSideEffects = 0 in {
+let SchedRW = [WriteALU] in {
+def BTC16rr : I<0xBB, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2),
+ "btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>,
+ OpSize16, TB;
+def BTC32rr : I<0xBB, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2),
+ "btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>,
+ OpSize32, TB;
+def BTC64rr : RI<0xBB, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2),
+ "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB;
+} // SchedRW
+
+let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in {
+def BTC16mr : I<0xBB, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2),
+ "btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>,
+ OpSize16, TB;
+def BTC32mr : I<0xBB, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2),
+ "btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>,
+ OpSize32, TB;
+def BTC64mr : RI<0xBB, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
+ "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB;
+}
+
+let SchedRW = [WriteALU] in {
+def BTC16ri8 : Ii8<0xBA, MRM7r, (outs), (ins GR16:$src1, i16i8imm:$src2),
+ "btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>,
+ OpSize16, TB;
+def BTC32ri8 : Ii8<0xBA, MRM7r, (outs), (ins GR32:$src1, i32i8imm:$src2),
+ "btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>,
+ OpSize32, TB;
+def BTC64ri8 : RIi8<0xBA, MRM7r, (outs), (ins GR64:$src1, i64i8imm:$src2),
+ "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB;
+} // SchedRW
+
+let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in {
+def BTC16mi8 : Ii8<0xBA, MRM7m, (outs), (ins i16mem:$src1, i16i8imm:$src2),
+ "btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>,
+ OpSize16, TB;
+def BTC32mi8 : Ii8<0xBA, MRM7m, (outs), (ins i32mem:$src1, i32i8imm:$src2),
+ "btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>,
+ OpSize32, TB;
+def BTC64mi8 : RIi8<0xBA, MRM7m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
+ "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB;
+}
+
+let SchedRW = [WriteALU] in {
+def BTR16rr : I<0xB3, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2),
+ "btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>,
+ OpSize16, TB;
+def BTR32rr : I<0xB3, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2),
+ "btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>,
+ OpSize32, TB;
+def BTR64rr : RI<0xB3, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2),
+ "btr{q}\t{$src2, $src1|$src1, $src2}", []>, TB;
+} // SchedRW
+
+let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in {
+def BTR16mr : I<0xB3, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2),
+ "btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>,
+ OpSize16, TB;
+def BTR32mr : I<0xB3, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2),
+ "btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>,
+ OpSize32, TB;
+def BTR64mr : RI<0xB3, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
+ "btr{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB;
+}
+
+let SchedRW = [WriteALU] in {
+def BTR16ri8 : Ii8<0xBA, MRM6r, (outs), (ins GR16:$src1, i16i8imm:$src2),
+ "btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>,
+ OpSize16, TB;
+def BTR32ri8 : Ii8<0xBA, MRM6r, (outs), (ins GR32:$src1, i32i8imm:$src2),
+ "btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>,
+ OpSize32, TB;
+def BTR64ri8 : RIi8<0xBA, MRM6r, (outs), (ins GR64:$src1, i64i8imm:$src2),
+ "btr{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB;
+} // SchedRW
+
+let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in {
+def BTR16mi8 : Ii8<0xBA, MRM6m, (outs), (ins i16mem:$src1, i16i8imm:$src2),
+ "btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>,
+ OpSize16, TB;
+def BTR32mi8 : Ii8<0xBA, MRM6m, (outs), (ins i32mem:$src1, i32i8imm:$src2),
+ "btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>,
+ OpSize32, TB;
+def BTR64mi8 : RIi8<0xBA, MRM6m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
+ "btr{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB;
+}
+
+let SchedRW = [WriteALU] in {
+def BTS16rr : I<0xAB, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2),
+ "bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>,
+ OpSize16, TB;
+def BTS32rr : I<0xAB, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2),
+ "bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>,
+ OpSize32, TB;
+def BTS64rr : RI<0xAB, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2),
+ "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB;
+} // SchedRW
+
+let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in {
+def BTS16mr : I<0xAB, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2),
+ "bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>,
+ OpSize16, TB;
+def BTS32mr : I<0xAB, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2),
+ "bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>,
+ OpSize32, TB;
+def BTS64mr : RI<0xAB, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
+ "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB;
+}
+
+let SchedRW = [WriteALU] in {
+def BTS16ri8 : Ii8<0xBA, MRM5r, (outs), (ins GR16:$src1, i16i8imm:$src2),
+ "bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>,
+ OpSize16, TB;
+def BTS32ri8 : Ii8<0xBA, MRM5r, (outs), (ins GR32:$src1, i32i8imm:$src2),
+ "bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>,
+ OpSize32, TB;
+def BTS64ri8 : RIi8<0xBA, MRM5r, (outs), (ins GR64:$src1, i64i8imm:$src2),
+ "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB;
+} // SchedRW
+
+let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in {
+def BTS16mi8 : Ii8<0xBA, MRM5m, (outs), (ins i16mem:$src1, i16i8imm:$src2),
+ "bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>,
+ OpSize16, TB;
+def BTS32mi8 : Ii8<0xBA, MRM5m, (outs), (ins i32mem:$src1, i32i8imm:$src2),
+ "bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>,
+ OpSize32, TB;
+def BTS64mi8 : RIi8<0xBA, MRM5m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
+ "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB;
+}
+} // hasSideEffects = 0
+} // Defs = [EFLAGS]
+
+
+//===----------------------------------------------------------------------===//
+// Atomic support
+//
+
+// Atomic swap. These are just normal xchg instructions, but since a memory
+// operand is referenced, the operation is implicitly locked and atomicity
+// is ensured.
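+// Illustrative (not part of this patch): an atomic exchange therefore needs
+// no explicit lock prefix,
+//
+//   movl  %esi, %eax
+//   xchgl %eax, (%rdi)  # atomically swaps EAX with the dword at (%rdi)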
+multiclass ATOMIC_SWAP<bits<8> opc8, bits<8> opc, string mnemonic, string frag,
+ InstrItinClass itin> {
+ let Constraints = "$val = $dst", SchedRW = [WriteALULd, WriteRMW] in {
+ def NAME#8rm : I<opc8, MRMSrcMem, (outs GR8:$dst),
+ (ins GR8:$val, i8mem:$ptr),
+ !strconcat(mnemonic, "{b}\t{$val, $ptr|$ptr, $val}"),
+ [(set
+ GR8:$dst,
+ (!cast<PatFrag>(frag # "_8") addr:$ptr, GR8:$val))],
+ itin>;
+ def NAME#16rm : I<opc, MRMSrcMem, (outs GR16:$dst),
+ (ins GR16:$val, i16mem:$ptr),
+ !strconcat(mnemonic, "{w}\t{$val, $ptr|$ptr, $val}"),
+ [(set
+ GR16:$dst,
+ (!cast<PatFrag>(frag # "_16") addr:$ptr, GR16:$val))],
+ itin>, OpSize16;
+ def NAME#32rm : I<opc, MRMSrcMem, (outs GR32:$dst),
+ (ins GR32:$val, i32mem:$ptr),
+ !strconcat(mnemonic, "{l}\t{$val, $ptr|$ptr, $val}"),
+ [(set
+ GR32:$dst,
+ (!cast<PatFrag>(frag # "_32") addr:$ptr, GR32:$val))],
+ itin>, OpSize32;
+ def NAME#64rm : RI<opc, MRMSrcMem, (outs GR64:$dst),
+ (ins GR64:$val, i64mem:$ptr),
+ !strconcat(mnemonic, "{q}\t{$val, $ptr|$ptr, $val}"),
+ [(set
+ GR64:$dst,
+ (!cast<PatFrag>(frag # "_64") addr:$ptr, GR64:$val))],
+ itin>;
+ }
+}
+
+defm XCHG : ATOMIC_SWAP<0x86, 0x87, "xchg", "atomic_swap", IIC_XCHG_MEM>;
+
+// Swap between registers.
+let SchedRW = [WriteALU] in {
+let Constraints = "$val = $dst" in {
+def XCHG8rr : I<0x86, MRMSrcReg, (outs GR8:$dst), (ins GR8:$val, GR8:$src),
+ "xchg{b}\t{$val, $src|$src, $val}", [], IIC_XCHG_REG>;
+def XCHG16rr : I<0x87, MRMSrcReg, (outs GR16:$dst), (ins GR16:$val, GR16:$src),
+ "xchg{w}\t{$val, $src|$src, $val}", [], IIC_XCHG_REG>,
+ OpSize16;
+def XCHG32rr : I<0x87, MRMSrcReg, (outs GR32:$dst), (ins GR32:$val, GR32:$src),
+ "xchg{l}\t{$val, $src|$src, $val}", [], IIC_XCHG_REG>,
+ OpSize32;
+def XCHG64rr : RI<0x87, MRMSrcReg, (outs GR64:$dst), (ins GR64:$val,GR64:$src),
+ "xchg{q}\t{$val, $src|$src, $val}", [], IIC_XCHG_REG>;
+}
+
+// Swap between the accumulator (AX/EAX/RAX) and other registers.
+let Uses = [AX], Defs = [AX] in
+def XCHG16ar : I<0x90, AddRegFrm, (outs), (ins GR16:$src),
+ "xchg{w}\t{$src, %ax|ax, $src}", [], IIC_XCHG_REG>, OpSize16;
+let Uses = [EAX], Defs = [EAX] in
+def XCHG32ar : I<0x90, AddRegFrm, (outs), (ins GR32:$src),
+ "xchg{l}\t{$src, %eax|eax, $src}", [], IIC_XCHG_REG>,
+ OpSize32, Requires<[Not64BitMode]>;
+let Uses = [EAX], Defs = [EAX] in
+// Uses GR32_NOAX in 64-bit mode so that the 0x90 NOP encoding cannot be
+// chosen: in 64-bit mode, xchg %eax, %eax must clear the upper 32 bits of
+// RAX and is therefore not a NOP.
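+// (Illustrative: the encoder instead picks the two-byte ModRM form, 87 C0,
+// for "xchgl %eax, %eax" in 64-bit mode.)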
+def XCHG32ar64 : I<0x90, AddRegFrm, (outs), (ins GR32_NOAX:$src),
+ "xchg{l}\t{$src, %eax|eax, $src}", [], IIC_XCHG_REG>,
+ OpSize32, Requires<[In64BitMode]>;
+let Uses = [RAX], Defs = [RAX] in
+def XCHG64ar : RI<0x90, AddRegFrm, (outs), (ins GR64:$src),
+ "xchg{q}\t{$src, %rax|rax, $src}", [], IIC_XCHG_REG>;
+} // SchedRW
+
+let SchedRW = [WriteALU] in {
+def XADD8rr : I<0xC0, MRMDestReg, (outs GR8:$dst), (ins GR8:$src),
+ "xadd{b}\t{$src, $dst|$dst, $src}", [], IIC_XADD_REG>, TB;
+def XADD16rr : I<0xC1, MRMDestReg, (outs GR16:$dst), (ins GR16:$src),
+ "xadd{w}\t{$src, $dst|$dst, $src}", [], IIC_XADD_REG>, TB,
+ OpSize16;
+def XADD32rr : I<0xC1, MRMDestReg, (outs GR32:$dst), (ins GR32:$src),
+ "xadd{l}\t{$src, $dst|$dst, $src}", [], IIC_XADD_REG>, TB,
+ OpSize32;
+def XADD64rr : RI<0xC1, MRMDestReg, (outs GR64:$dst), (ins GR64:$src),
+ "xadd{q}\t{$src, $dst|$dst, $src}", [], IIC_XADD_REG>, TB;
+} // SchedRW
+
+let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in {
+def XADD8rm : I<0xC0, MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src),
+ "xadd{b}\t{$src, $dst|$dst, $src}", [], IIC_XADD_MEM>, TB;
+def XADD16rm : I<0xC1, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
+ "xadd{w}\t{$src, $dst|$dst, $src}", [], IIC_XADD_MEM>, TB,
+ OpSize16;
+def XADD32rm : I<0xC1, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
+ "xadd{l}\t{$src, $dst|$dst, $src}", [], IIC_XADD_MEM>, TB,
+ OpSize32;
+def XADD64rm : RI<0xC1, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
+ "xadd{q}\t{$src, $dst|$dst, $src}", [], IIC_XADD_MEM>, TB;
+
+}
+
+let SchedRW = [WriteALU] in {
+def CMPXCHG8rr : I<0xB0, MRMDestReg, (outs GR8:$dst), (ins GR8:$src),
+ "cmpxchg{b}\t{$src, $dst|$dst, $src}", [],
+ IIC_CMPXCHG_REG8>, TB;
+def CMPXCHG16rr : I<0xB1, MRMDestReg, (outs GR16:$dst), (ins GR16:$src),
+ "cmpxchg{w}\t{$src, $dst|$dst, $src}", [],
+ IIC_CMPXCHG_REG>, TB, OpSize16;
+def CMPXCHG32rr : I<0xB1, MRMDestReg, (outs GR32:$dst), (ins GR32:$src),
+ "cmpxchg{l}\t{$src, $dst|$dst, $src}", [],
+ IIC_CMPXCHG_REG>, TB, OpSize32;
+def CMPXCHG64rr : RI<0xB1, MRMDestReg, (outs GR64:$dst), (ins GR64:$src),
+ "cmpxchg{q}\t{$src, $dst|$dst, $src}", [],
+ IIC_CMPXCHG_REG>, TB;
+} // SchedRW
+
+let SchedRW = [WriteALULd, WriteRMW] in {
+let mayLoad = 1, mayStore = 1 in {
+def CMPXCHG8rm : I<0xB0, MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src),
+ "cmpxchg{b}\t{$src, $dst|$dst, $src}", [],
+ IIC_CMPXCHG_MEM8>, TB;
+def CMPXCHG16rm : I<0xB1, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
+ "cmpxchg{w}\t{$src, $dst|$dst, $src}", [],
+ IIC_CMPXCHG_MEM>, TB, OpSize16;
+def CMPXCHG32rm : I<0xB1, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
+ "cmpxchg{l}\t{$src, $dst|$dst, $src}", [],
+ IIC_CMPXCHG_MEM>, TB, OpSize32;
+def CMPXCHG64rm : RI<0xB1, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
+ "cmpxchg{q}\t{$src, $dst|$dst, $src}", [],
+ IIC_CMPXCHG_MEM>, TB;
+}
+
+let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX] in
+def CMPXCHG8B : I<0xC7, MRM1m, (outs), (ins i64mem:$dst),
+ "cmpxchg8b\t$dst", [], IIC_CMPXCHG_8B>, TB;
+
+let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX] in
+def CMPXCHG16B : RI<0xC7, MRM1m, (outs), (ins i128mem:$dst),
+ "cmpxchg16b\t$dst", [], IIC_CMPXCHG_16B>,
+ TB, Requires<[HasCmpxchg16b]>;
+} // SchedRW
+
+
+// Lock instruction prefix
+def LOCK_PREFIX : I<0xF0, RawFrm, (outs), (ins), "lock", []>;
+
+// Rex64 instruction prefix
+def REX64_PREFIX : I<0x48, RawFrm, (outs), (ins), "rex64", []>,
+ Requires<[In64BitMode]>;
+
+// Data16 instruction prefix
+def DATA16_PREFIX : I<0x66, RawFrm, (outs), (ins), "data16", []>;
+
+// Repeat string operation instruction prefixes.
+// These decrement ECX on each iteration; the DF flag in the EFLAGS register
+// controls whether the prefixed string instruction increments or decrements
+// its index registers.
+let Defs = [ECX], Uses = [ECX,EFLAGS] in {
+// Repeat (used with INS, OUTS, MOVS, LODS and STOS)
+def REP_PREFIX : I<0xF3, RawFrm, (outs), (ins), "rep", []>;
+// Repeat while not equal (used with CMPS and SCAS)
+def REPNE_PREFIX : I<0xF2, RawFrm, (outs), (ins), "repne", []>;
+}
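+// For example (illustrative), "rep movsb" repeats MOVSB until ECX reaches
+// zero, copying ECX bytes between the index registers:
+//
+//   movl $16, %ecx
+//   rep movsb            # copy 16 bytes from (%rsi) to (%rdi)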
+
+
+// String manipulation instructions
+let SchedRW = [WriteMicrocoded] in {
+// These use the DF flag in the EFLAGS register to inc or dec ESI.
+let Defs = [AL,ESI], Uses = [ESI,EFLAGS] in
+def LODSB : I<0xAC, RawFrmSrc, (outs), (ins srcidx8:$src),
+ "lodsb\t{$src, %al|al, $src}", [], IIC_LODS>;
+let Defs = [AX,ESI], Uses = [ESI,EFLAGS] in
+def LODSW : I<0xAD, RawFrmSrc, (outs), (ins srcidx16:$src),
+ "lodsw\t{$src, %ax|ax, $src}", [], IIC_LODS>, OpSize16;
+let Defs = [EAX,ESI], Uses = [ESI,EFLAGS] in
+def LODSL : I<0xAD, RawFrmSrc, (outs), (ins srcidx32:$src),
+ "lods{l|d}\t{$src, %eax|eax, $src}", [], IIC_LODS>, OpSize32;
+let Defs = [RAX,ESI], Uses = [ESI,EFLAGS] in
+def LODSQ : RI<0xAD, RawFrmSrc, (outs), (ins srcidx64:$src),
+ "lodsq\t{$src, %rax|rax, $src}", [], IIC_LODS>;
+}
+
+let SchedRW = [WriteSystem] in {
+// These use the DF flag in the EFLAGS register to inc or dec ESI.
+let Defs = [ESI], Uses = [DX,ESI,EFLAGS] in {
+def OUTSB : I<0x6E, RawFrmSrc, (outs), (ins srcidx8:$src),
+ "outsb\t{$src, %dx|dx, $src}", [], IIC_OUTS>;
+def OUTSW : I<0x6F, RawFrmSrc, (outs), (ins srcidx16:$src),
+ "outsw\t{$src, %dx|dx, $src}", [], IIC_OUTS>, OpSize16;
+def OUTSL : I<0x6F, RawFrmSrc, (outs), (ins srcidx32:$src),
+ "outs{l|d}\t{$src, %dx|dx, $src}", [], IIC_OUTS>, OpSize32;
+}
+
+// These use the DF flag in the EFLAGS register to inc or dec EDI.
+let Defs = [EDI], Uses = [DX,EDI,EFLAGS] in {
+def INSB : I<0x6C, RawFrmDst, (outs dstidx8:$dst), (ins),
+ "insb\t{%dx, $dst|$dst, dx}", [], IIC_INS>;
+def INSW : I<0x6D, RawFrmDst, (outs dstidx16:$dst), (ins),
+ "insw\t{%dx, $dst|$dst, dx}", [], IIC_INS>, OpSize16;
+def INSL : I<0x6D, RawFrmDst, (outs dstidx32:$dst), (ins),
+ "ins{l|d}\t{%dx, $dst|$dst, dx}", [], IIC_INS>, OpSize32;
+}
+}
+
+// Flag instructions
+let SchedRW = [WriteALU] in {
+def CLC : I<0xF8, RawFrm, (outs), (ins), "clc", [], IIC_CLC>;
+def STC : I<0xF9, RawFrm, (outs), (ins), "stc", [], IIC_STC>;
+def CLI : I<0xFA, RawFrm, (outs), (ins), "cli", [], IIC_CLI>;
+def STI : I<0xFB, RawFrm, (outs), (ins), "sti", [], IIC_STI>;
+def CLD : I<0xFC, RawFrm, (outs), (ins), "cld", [], IIC_CLD>;
+def STD : I<0xFD, RawFrm, (outs), (ins), "std", [], IIC_STD>;
+def CMC : I<0xF5, RawFrm, (outs), (ins), "cmc", [], IIC_CMC>;
+
+def CLTS : I<0x06, RawFrm, (outs), (ins), "clts", [], IIC_CLTS>, TB;
+}
+
+// Table lookup instructions
+let Uses = [AL,EBX], Defs = [AL], hasSideEffects = 0, mayLoad = 1 in
+def XLAT : I<0xD7, RawFrm, (outs), (ins), "xlatb", [], IIC_XLAT>,
+ Sched<[WriteLoad]>;
+
+let SchedRW = [WriteMicrocoded] in {
+// ASCII Adjust After Addition
+let Uses = [AL,EFLAGS], Defs = [AX,EFLAGS], hasSideEffects = 0 in
+def AAA : I<0x37, RawFrm, (outs), (ins), "aaa", [], IIC_AAA>,
+ Requires<[Not64BitMode]>;
+
+// ASCII Adjust AX Before Division
+let Uses = [AX], Defs = [AX,EFLAGS], hasSideEffects = 0 in
+def AAD8i8 : Ii8<0xD5, RawFrm, (outs), (ins i8imm:$src),
+ "aad\t$src", [], IIC_AAD>, Requires<[Not64BitMode]>;
+
+// ASCII Adjust AX After Multiply
+let Uses = [AL], Defs = [AX,EFLAGS], hasSideEffects = 0 in
+def AAM8i8 : Ii8<0xD4, RawFrm, (outs), (ins i8imm:$src),
+ "aam\t$src", [], IIC_AAM>, Requires<[Not64BitMode]>;
+
+// ASCII Adjust AL After Subtraction - sets AF and CF
+let Uses = [AL,EFLAGS], Defs = [AX,EFLAGS], hasSideEffects = 0 in
+def AAS : I<0x3F, RawFrm, (outs), (ins), "aas", [], IIC_AAS>,
+ Requires<[Not64BitMode]>;
+
+// Decimal Adjust AL after Addition
+let Uses = [AL,EFLAGS], Defs = [AL,EFLAGS], hasSideEffects = 0 in
+def DAA : I<0x27, RawFrm, (outs), (ins), "daa", [], IIC_DAA>,
+ Requires<[Not64BitMode]>;
+
+// Decimal Adjust AL after Subtraction
+let Uses = [AL,EFLAGS], Defs = [AL,EFLAGS], hasSideEffects = 0 in
+def DAS : I<0x2F, RawFrm, (outs), (ins), "das", [], IIC_DAS>,
+ Requires<[Not64BitMode]>;
+} // SchedRW
+
+let SchedRW = [WriteSystem] in {
+// Check Array Index Against Bounds
+def BOUNDS16rm : I<0x62, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+ "bound\t{$src, $dst|$dst, $src}", [], IIC_BOUND>, OpSize16,
+ Requires<[Not64BitMode]>;
+def BOUNDS32rm : I<0x62, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+ "bound\t{$src, $dst|$dst, $src}", [], IIC_BOUND>, OpSize32,
+ Requires<[Not64BitMode]>;
+
+// Adjust RPL Field of Segment Selector
+def ARPL16rr : I<0x63, MRMDestReg, (outs GR16:$dst), (ins GR16:$src),
+ "arpl\t{$src, $dst|$dst, $src}", [], IIC_ARPL_REG>,
+ Requires<[Not64BitMode]>;
+def ARPL16mr : I<0x63, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
+ "arpl\t{$src, $dst|$dst, $src}", [], IIC_ARPL_MEM>,
+ Requires<[Not64BitMode]>;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// MOVBE Instructions
+//
+let Predicates = [HasMOVBE] in {
+ let SchedRW = [WriteALULd] in {
+ def MOVBE16rm : I<0xF0, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+ "movbe{w}\t{$src, $dst|$dst, $src}",
+ [(set GR16:$dst, (bswap (loadi16 addr:$src)))], IIC_MOVBE>,
+ OpSize16, T8PS;
+ def MOVBE32rm : I<0xF0, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+ "movbe{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (bswap (loadi32 addr:$src)))], IIC_MOVBE>,
+ OpSize32, T8PS;
+ def MOVBE64rm : RI<0xF0, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+ "movbe{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (bswap (loadi64 addr:$src)))], IIC_MOVBE>,
+ T8PS;
+ }
+ let SchedRW = [WriteStore] in {
+ def MOVBE16mr : I<0xF1, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
+ "movbe{w}\t{$src, $dst|$dst, $src}",
+ [(store (bswap GR16:$src), addr:$dst)], IIC_MOVBE>,
+ OpSize16, T8PS;
+ def MOVBE32mr : I<0xF1, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
+ "movbe{l}\t{$src, $dst|$dst, $src}",
+ [(store (bswap GR32:$src), addr:$dst)], IIC_MOVBE>,
+ OpSize32, T8PS;
+ def MOVBE64mr : RI<0xF1, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
+ "movbe{q}\t{$src, $dst|$dst, $src}",
+ [(store (bswap GR64:$src), addr:$dst)], IIC_MOVBE>,
+ T8PS;
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// RDRAND Instruction
+//
+let Predicates = [HasRDRAND], Defs = [EFLAGS] in {
+ def RDRAND16r : I<0xC7, MRM6r, (outs GR16:$dst), (ins),
+ "rdrand{w}\t$dst",
+ [(set GR16:$dst, EFLAGS, (X86rdrand))]>, OpSize16, TB;
+ def RDRAND32r : I<0xC7, MRM6r, (outs GR32:$dst), (ins),
+ "rdrand{l}\t$dst",
+ [(set GR32:$dst, EFLAGS, (X86rdrand))]>, OpSize32, TB;
+ def RDRAND64r : RI<0xC7, MRM6r, (outs GR64:$dst), (ins),
+ "rdrand{q}\t$dst",
+ [(set GR64:$dst, EFLAGS, (X86rdrand))]>, TB;
+}
+
+//===----------------------------------------------------------------------===//
+// RDSEED Instruction
+//
+let Predicates = [HasRDSEED], Defs = [EFLAGS] in {
+ def RDSEED16r : I<0xC7, MRM7r, (outs GR16:$dst), (ins),
+ "rdseed{w}\t$dst",
+ [(set GR16:$dst, EFLAGS, (X86rdseed))]>, OpSize16, TB;
+ def RDSEED32r : I<0xC7, MRM7r, (outs GR32:$dst), (ins),
+ "rdseed{l}\t$dst",
+ [(set GR32:$dst, EFLAGS, (X86rdseed))]>, OpSize32, TB;
+ def RDSEED64r : RI<0xC7, MRM7r, (outs GR64:$dst), (ins),
+ "rdseed{q}\t$dst",
+ [(set GR64:$dst, EFLAGS, (X86rdseed))]>, TB;
+}
+
+//===----------------------------------------------------------------------===//
+// LZCNT Instruction
+//
+let Predicates = [HasLZCNT], Defs = [EFLAGS] in {
+ def LZCNT16rr : I<0xBD, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+ "lzcnt{w}\t{$src, $dst|$dst, $src}",
+ [(set GR16:$dst, (ctlz GR16:$src)), (implicit EFLAGS)]>, XS,
+ OpSize16;
+ def LZCNT16rm : I<0xBD, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+ "lzcnt{w}\t{$src, $dst|$dst, $src}",
+ [(set GR16:$dst, (ctlz (loadi16 addr:$src))),
+ (implicit EFLAGS)]>, XS, OpSize16;
+
+ def LZCNT32rr : I<0xBD, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+ "lzcnt{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (ctlz GR32:$src)), (implicit EFLAGS)]>, XS,
+ OpSize32;
+ def LZCNT32rm : I<0xBD, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+ "lzcnt{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (ctlz (loadi32 addr:$src))),
+ (implicit EFLAGS)]>, XS, OpSize32;
+
+ def LZCNT64rr : RI<0xBD, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+ "lzcnt{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (ctlz GR64:$src)), (implicit EFLAGS)]>,
+ XS;
+ def LZCNT64rm : RI<0xBD, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+ "lzcnt{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (ctlz (loadi64 addr:$src))),
+ (implicit EFLAGS)]>, XS;
+}
+
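+// The patterns below fold the idiom "x == 0 ? bitwidth : ctlz(x)" directly
+// into LZCNT, which already yields the operand width (16/32/64) for a zero
+// input, making the explicit select redundant.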
+let Predicates = [HasLZCNT] in {
+ def : Pat<(X86cmov (ctlz GR16:$src), (i16 16), (X86_COND_E_OR_NE),
+ (X86cmp GR16:$src, (i16 0))),
+ (LZCNT16rr GR16:$src)>;
+ def : Pat<(X86cmov (ctlz GR32:$src), (i32 32), (X86_COND_E_OR_NE),
+ (X86cmp GR32:$src, (i32 0))),
+ (LZCNT32rr GR32:$src)>;
+ def : Pat<(X86cmov (ctlz GR64:$src), (i64 64), (X86_COND_E_OR_NE),
+ (X86cmp GR64:$src, (i64 0))),
+ (LZCNT64rr GR64:$src)>;
+ def : Pat<(X86cmov (i16 16), (ctlz GR16:$src), (X86_COND_E_OR_NE),
+ (X86cmp GR16:$src, (i16 0))),
+ (LZCNT16rr GR16:$src)>;
+ def : Pat<(X86cmov (i32 32), (ctlz GR32:$src), (X86_COND_E_OR_NE),
+ (X86cmp GR32:$src, (i32 0))),
+ (LZCNT32rr GR32:$src)>;
+ def : Pat<(X86cmov (i64 64), (ctlz GR64:$src), (X86_COND_E_OR_NE),
+ (X86cmp GR64:$src, (i64 0))),
+ (LZCNT64rr GR64:$src)>;
+
+ def : Pat<(X86cmov (ctlz (loadi16 addr:$src)), (i16 16), (X86_COND_E_OR_NE),
+ (X86cmp (loadi16 addr:$src), (i16 0))),
+ (LZCNT16rm addr:$src)>;
+ def : Pat<(X86cmov (ctlz (loadi32 addr:$src)), (i32 32), (X86_COND_E_OR_NE),
+ (X86cmp (loadi32 addr:$src), (i32 0))),
+ (LZCNT32rm addr:$src)>;
+ def : Pat<(X86cmov (ctlz (loadi64 addr:$src)), (i64 64), (X86_COND_E_OR_NE),
+ (X86cmp (loadi64 addr:$src), (i64 0))),
+ (LZCNT64rm addr:$src)>;
+ def : Pat<(X86cmov (i16 16), (ctlz (loadi16 addr:$src)), (X86_COND_E_OR_NE),
+ (X86cmp (loadi16 addr:$src), (i16 0))),
+ (LZCNT16rm addr:$src)>;
+ def : Pat<(X86cmov (i32 32), (ctlz (loadi32 addr:$src)), (X86_COND_E_OR_NE),
+ (X86cmp (loadi32 addr:$src), (i32 0))),
+ (LZCNT32rm addr:$src)>;
+ def : Pat<(X86cmov (i64 64), (ctlz (loadi64 addr:$src)), (X86_COND_E_OR_NE),
+ (X86cmp (loadi64 addr:$src), (i64 0))),
+ (LZCNT64rm addr:$src)>;
+}
+
+//===----------------------------------------------------------------------===//
+// BMI Instructions
+//
+let Predicates = [HasBMI], Defs = [EFLAGS] in {
+ def TZCNT16rr : I<0xBC, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+ "tzcnt{w}\t{$src, $dst|$dst, $src}",
+ [(set GR16:$dst, (cttz GR16:$src)), (implicit EFLAGS)]>, XS,
+ OpSize16;
+ def TZCNT16rm : I<0xBC, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+ "tzcnt{w}\t{$src, $dst|$dst, $src}",
+ [(set GR16:$dst, (cttz (loadi16 addr:$src))),
+ (implicit EFLAGS)]>, XS, OpSize16;
+
+ def TZCNT32rr : I<0xBC, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+ "tzcnt{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (cttz GR32:$src)), (implicit EFLAGS)]>, XS,
+ OpSize32;
+ def TZCNT32rm : I<0xBC, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+ "tzcnt{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (cttz (loadi32 addr:$src))),
+ (implicit EFLAGS)]>, XS, OpSize32;
+
+ def TZCNT64rr : RI<0xBC, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+ "tzcnt{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (cttz GR64:$src)), (implicit EFLAGS)]>,
+ XS;
+ def TZCNT64rm : RI<0xBC, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+ "tzcnt{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (cttz (loadi64 addr:$src))),
+ (implicit EFLAGS)]>, XS;
+}
+
+multiclass bmi_bls<string mnemonic, Format RegMRM, Format MemMRM,
+ RegisterClass RC, X86MemOperand x86memop> {
+let hasSideEffects = 0 in {
+ def rr : I<0xF3, RegMRM, (outs RC:$dst), (ins RC:$src),
+ !strconcat(mnemonic, "\t{$src, $dst|$dst, $src}"),
+ []>, T8PS, VEX_4V;
+ let mayLoad = 1 in
+ def rm : I<0xF3, MemMRM, (outs RC:$dst), (ins x86memop:$src),
+ !strconcat(mnemonic, "\t{$src, $dst|$dst, $src}"),
+ []>, T8PS, VEX_4V;
+}
+}
+
+let Predicates = [HasBMI], Defs = [EFLAGS] in {
+ defm BLSR32 : bmi_bls<"blsr{l}", MRM1r, MRM1m, GR32, i32mem>;
+ defm BLSR64 : bmi_bls<"blsr{q}", MRM1r, MRM1m, GR64, i64mem>, VEX_W;
+ defm BLSMSK32 : bmi_bls<"blsmsk{l}", MRM2r, MRM2m, GR32, i32mem>;
+ defm BLSMSK64 : bmi_bls<"blsmsk{q}", MRM2r, MRM2m, GR64, i64mem>, VEX_W;
+ defm BLSI32 : bmi_bls<"blsi{l}", MRM3r, MRM3m, GR32, i32mem>;
+ defm BLSI64 : bmi_bls<"blsi{q}", MRM3r, MRM3m, GR64, i64mem>, VEX_W;
+}
+
+//===----------------------------------------------------------------------===//
+// Pattern fragments to auto-generate BMI instructions.
+//===----------------------------------------------------------------------===//
+
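+// The patterns below match the standard BMI bit-manipulation identities.
+// Worked example (illustrative) for x = 0b10100:
+//   blsr:   x & (x - 1) = 0b10000   (reset lowest set bit)
+//   blsmsk: x ^ (x - 1) = 0b00111   (mask up to lowest set bit)
+//   blsi:   x & -x      = 0b00100   (isolate lowest set bit)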
+let Predicates = [HasBMI] in {
+ // FIXME: patterns for the load versions are not implemented
+ def : Pat<(and GR32:$src, (add GR32:$src, -1)),
+ (BLSR32rr GR32:$src)>;
+ def : Pat<(and GR64:$src, (add GR64:$src, -1)),
+ (BLSR64rr GR64:$src)>;
+
+ def : Pat<(xor GR32:$src, (add GR32:$src, -1)),
+ (BLSMSK32rr GR32:$src)>;
+ def : Pat<(xor GR64:$src, (add GR64:$src, -1)),
+ (BLSMSK64rr GR64:$src)>;
+
+ def : Pat<(and GR32:$src, (ineg GR32:$src)),
+ (BLSI32rr GR32:$src)>;
+ def : Pat<(and GR64:$src, (ineg GR64:$src)),
+ (BLSI64rr GR64:$src)>;
+}
+
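+// As with LZCNT above, fold "x == 0 ? bitwidth : cttz(x)" directly into
+// TZCNT, which already yields the operand width for a zero input.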
+let Predicates = [HasBMI] in {
+ def : Pat<(X86cmov (cttz GR16:$src), (i16 16), (X86_COND_E_OR_NE),
+ (X86cmp GR16:$src, (i16 0))),
+ (TZCNT16rr GR16:$src)>;
+ def : Pat<(X86cmov (cttz GR32:$src), (i32 32), (X86_COND_E_OR_NE),
+ (X86cmp GR32:$src, (i32 0))),
+ (TZCNT32rr GR32:$src)>;
+ def : Pat<(X86cmov (cttz GR64:$src), (i64 64), (X86_COND_E_OR_NE),
+ (X86cmp GR64:$src, (i64 0))),
+ (TZCNT64rr GR64:$src)>;
+ def : Pat<(X86cmov (i16 16), (cttz GR16:$src), (X86_COND_E_OR_NE),
+ (X86cmp GR16:$src, (i16 0))),
+ (TZCNT16rr GR16:$src)>;
+ def : Pat<(X86cmov (i32 32), (cttz GR32:$src), (X86_COND_E_OR_NE),
+ (X86cmp GR32:$src, (i32 0))),
+ (TZCNT32rr GR32:$src)>;
+ def : Pat<(X86cmov (i64 64), (cttz GR64:$src), (X86_COND_E_OR_NE),
+ (X86cmp GR64:$src, (i64 0))),
+ (TZCNT64rr GR64:$src)>;
+
+ def : Pat<(X86cmov (cttz (loadi16 addr:$src)), (i16 16), (X86_COND_E_OR_NE),
+ (X86cmp (loadi16 addr:$src), (i16 0))),
+ (TZCNT16rm addr:$src)>;
+ def : Pat<(X86cmov (cttz (loadi32 addr:$src)), (i32 32), (X86_COND_E_OR_NE),
+ (X86cmp (loadi32 addr:$src), (i32 0))),
+ (TZCNT32rm addr:$src)>;
+ def : Pat<(X86cmov (cttz (loadi64 addr:$src)), (i64 64), (X86_COND_E_OR_NE),
+ (X86cmp (loadi64 addr:$src), (i64 0))),
+ (TZCNT64rm addr:$src)>;
+ def : Pat<(X86cmov (i16 16), (cttz (loadi16 addr:$src)), (X86_COND_E_OR_NE),
+ (X86cmp (loadi16 addr:$src), (i16 0))),
+ (TZCNT16rm addr:$src)>;
+ def : Pat<(X86cmov (i32 32), (cttz (loadi32 addr:$src)), (X86_COND_E_OR_NE),
+ (X86cmp (loadi32 addr:$src), (i32 0))),
+ (TZCNT32rm addr:$src)>;
+ def : Pat<(X86cmov (i64 64), (cttz (loadi64 addr:$src)), (X86_COND_E_OR_NE),
+ (X86cmp (loadi64 addr:$src), (i64 0))),
+ (TZCNT64rm addr:$src)>;
+}
+
+
+multiclass bmi_bextr_bzhi<bits<8> opc, string mnemonic, RegisterClass RC,
+ X86MemOperand x86memop, Intrinsic Int,
+ PatFrag ld_frag> {
+ def rr : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+ !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst, (Int RC:$src1, RC:$src2)), (implicit EFLAGS)]>,
+ T8PS, VEX_4VOp3;
+ def rm : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src1, RC:$src2),
+ !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst, (Int (ld_frag addr:$src1), RC:$src2)),
+ (implicit EFLAGS)]>, T8PS, VEX_4VOp3;
+}
+
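+// BEXTR extracts a contiguous bit field: bits [7:0] of the control operand
+// give the starting position and bits [15:8] the length. BZHI (BMI2, below)
+// instead zeroes all source bits from the index in the control's low byte up.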
+let Predicates = [HasBMI], Defs = [EFLAGS] in {
+ defm BEXTR32 : bmi_bextr_bzhi<0xF7, "bextr{l}", GR32, i32mem,
+ int_x86_bmi_bextr_32, loadi32>;
+ defm BEXTR64 : bmi_bextr_bzhi<0xF7, "bextr{q}", GR64, i64mem,
+ int_x86_bmi_bextr_64, loadi64>, VEX_W;
+}
+
+let Predicates = [HasBMI2], Defs = [EFLAGS] in {
+ defm BZHI32 : bmi_bextr_bzhi<0xF5, "bzhi{l}", GR32, i32mem,
+ int_x86_bmi_bzhi_32, loadi32>;
+ defm BZHI64 : bmi_bextr_bzhi<0xF5, "bzhi{q}", GR64, i64mem,
+ int_x86_bmi_bzhi_64, loadi64>, VEX_W;
+}
+
+
+def CountTrailingOnes : SDNodeXForm<imm, [{
+ // Count the trailing ones in the immediate.
+ return getI8Imm(countTrailingOnes(N->getZExtValue()), SDLoc(N));
+}]>;
+
+def BZHIMask : ImmLeaf<i64, [{
+ return isMask_64(Imm) && (countTrailingOnes<uint64_t>(Imm) > 32);
+}]>;
+
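+// Since BZHI zeroes every bit at or above the index in its control operand,
+// "and x, ((1 << n) - 1)" is equivalent to "bzhi x, n". The constant-mask
+// pattern below recovers n from the mask via CountTrailingOnes; masks of 32
+// or fewer ones are left to the ordinary immediate AND patterns.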
+let Predicates = [HasBMI2] in {
+ def : Pat<(and GR64:$src, BZHIMask:$mask),
+ (BZHI64rr GR64:$src,
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+ (MOV8ri (CountTrailingOnes imm:$mask)), sub_8bit))>;
+
+ def : Pat<(and GR32:$src, (add (shl 1, GR8:$lz), -1)),
+ (BZHI32rr GR32:$src,
+ (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$lz, sub_8bit))>;
+
+ def : Pat<(and (loadi32 addr:$src), (add (shl 1, GR8:$lz), -1)),
+ (BZHI32rm addr:$src,
+ (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$lz, sub_8bit))>;
+
+ def : Pat<(and GR64:$src, (add (shl 1, GR8:$lz), -1)),
+ (BZHI64rr GR64:$src,
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR8:$lz, sub_8bit))>;
+
+ def : Pat<(and (loadi64 addr:$src), (add (shl 1, GR8:$lz), -1)),
+ (BZHI64rm addr:$src,
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR8:$lz, sub_8bit))>;
+} // HasBMI2
+
+let Predicates = [HasBMI] in {
+ def : Pat<(X86bextr GR32:$src1, GR32:$src2),
+ (BEXTR32rr GR32:$src1, GR32:$src2)>;
+ def : Pat<(X86bextr (loadi32 addr:$src1), GR32:$src2),
+ (BEXTR32rm addr:$src1, GR32:$src2)>;
+ def : Pat<(X86bextr GR64:$src1, GR64:$src2),
+ (BEXTR64rr GR64:$src1, GR64:$src2)>;
+ def : Pat<(X86bextr (loadi64 addr:$src1), GR64:$src2),
+ (BEXTR64rm addr:$src1, GR64:$src2)>;
+} // HasBMI
+
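+// PDEP scatters the low-order bits of its first source into the positions of
+// the set bits of the mask operand; PEXT is the inverse, gathering the bits
+// selected by the mask down into the low-order bits of the destination.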
+multiclass bmi_pdep_pext<string mnemonic, RegisterClass RC,
+ X86MemOperand x86memop, Intrinsic Int,
+ PatFrag ld_frag> {
+ def rr : I<0xF5, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+ !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst, (Int RC:$src1, RC:$src2))]>,
+ VEX_4V;
+ def rm : I<0xF5, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+ !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst, (Int RC:$src1, (ld_frag addr:$src2)))]>, VEX_4V;
+}
+
+let Predicates = [HasBMI2] in {
+ defm PDEP32 : bmi_pdep_pext<"pdep{l}", GR32, i32mem,
+ int_x86_bmi_pdep_32, loadi32>, T8XD;
+ defm PDEP64 : bmi_pdep_pext<"pdep{q}", GR64, i64mem,
+ int_x86_bmi_pdep_64, loadi64>, T8XD, VEX_W;
+ defm PEXT32 : bmi_pdep_pext<"pext{l}", GR32, i32mem,
+ int_x86_bmi_pext_32, loadi32>, T8XS;
+ defm PEXT64 : bmi_pdep_pext<"pext{q}", GR64, i64mem,
+ int_x86_bmi_pext_64, loadi64>, T8XS, VEX_W;
+}
+
+//===----------------------------------------------------------------------===//
+// TBM Instructions
+//
+let Predicates = [HasTBM], Defs = [EFLAGS] in {
+
+multiclass tbm_ternary_imm_intr<bits<8> opc, RegisterClass RC, string OpcodeStr,
+ X86MemOperand x86memop, PatFrag ld_frag,
+ Intrinsic Int, Operand immtype,
+ SDPatternOperator immoperator> {
+ def ri : Ii32<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, immtype:$cntl),
+ !strconcat(OpcodeStr,
+ "\t{$cntl, $src1, $dst|$dst, $src1, $cntl}"),
+ [(set RC:$dst, (Int RC:$src1, immoperator:$cntl))]>,
+ XOP, XOPA;
+ def mi : Ii32<opc, MRMSrcMem, (outs RC:$dst),
+ (ins x86memop:$src1, immtype:$cntl),
+ !strconcat(OpcodeStr,
+ "\t{$cntl, $src1, $dst|$dst, $src1, $cntl}"),
+ [(set RC:$dst, (Int (ld_frag addr:$src1), immoperator:$cntl))]>,
+ XOP, XOPA;
+}
+
+defm BEXTRI32 : tbm_ternary_imm_intr<0x10, GR32, "bextr", i32mem, loadi32,
+ int_x86_tbm_bextri_u32, i32imm, imm>;
+let ImmT = Imm32S in
+defm BEXTRI64 : tbm_ternary_imm_intr<0x10, GR64, "bextr", i64mem, loadi64,
+ int_x86_tbm_bextri_u64, i64i32imm,
+ i64immSExt32>, VEX_W;
+
+multiclass tbm_binary_rm<bits<8> opc, Format FormReg, Format FormMem,
+ RegisterClass RC, string OpcodeStr,
+ X86MemOperand x86memop, PatFrag ld_frag> {
+let hasSideEffects = 0 in {
+ def rr : I<opc, FormReg, (outs RC:$dst), (ins RC:$src),
+ !strconcat(OpcodeStr,"\t{$src, $dst|$dst, $src}"),
+ []>, XOP_4V, XOP9;
+ let mayLoad = 1 in
+ def rm : I<opc, FormMem, (outs RC:$dst), (ins x86memop:$src),
+ !strconcat(OpcodeStr,"\t{$src, $dst|$dst, $src}"),
+ []>, XOP_4V, XOP9;
+}
+}
+
+multiclass tbm_binary_intr<bits<8> opc, string OpcodeStr,
+ Format FormReg, Format FormMem> {
+ defm NAME#32 : tbm_binary_rm<opc, FormReg, FormMem, GR32, OpcodeStr, i32mem,
+ loadi32>;
+ defm NAME#64 : tbm_binary_rm<opc, FormReg, FormMem, GR64, OpcodeStr, i64mem,
+ loadi64>, VEX_W;
+}
+
+defm BLCFILL : tbm_binary_intr<0x01, "blcfill", MRM1r, MRM1m>;
+defm BLCI : tbm_binary_intr<0x02, "blci", MRM6r, MRM6m>;
+defm BLCIC : tbm_binary_intr<0x01, "blcic", MRM5r, MRM5m>;
+defm BLCMSK : tbm_binary_intr<0x02, "blcmsk", MRM1r, MRM1m>;
+defm BLCS : tbm_binary_intr<0x01, "blcs", MRM3r, MRM3m>;
+defm BLSFILL : tbm_binary_intr<0x01, "blsfill", MRM2r, MRM2m>;
+defm BLSIC : tbm_binary_intr<0x01, "blsic", MRM6r, MRM6m>;
+defm T1MSKC : tbm_binary_intr<0x01, "t1mskc", MRM7r, MRM7m>;
+defm TZMSK : tbm_binary_intr<0x01, "tzmsk", MRM4r, MRM4m>;
+} // HasTBM, EFLAGS
+
+//===----------------------------------------------------------------------===//
+// MONITORX/MWAITX Instructions
+//
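+// Like MONITOR/MWAIT, these take their operands implicitly: MONITORX monitors
+// the address in rAX (extension flags in ECX, hints in EDX), and MWAITX waits
+// with hints in EAX, extensions in ECX, and an optional timeout in EBX.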
+let SchedRW = [WriteSystem] in {
+let Uses = [EAX, ECX, EDX] in
+def MONITORXrrr : I<0x01, MRM_FA, (outs), (ins), "monitorx", [],
+ IIC_SSE_MONITOR>, TB;
+let Uses = [ECX, EAX, EBX] in
+def MWAITXrr : I<0x01, MRM_FB, (outs), (ins), "mwaitx", [], IIC_SSE_MWAIT>,
+ TB;
+} // SchedRW
+
+def : InstAlias<"mwaitx\t{%eax, %ecx, %ebx|ebx, ecx, eax}", (MWAITXrr)>, Requires<[Not64BitMode]>;
+def : InstAlias<"mwaitx\t{%rax, %rcx, %rbx|rbx, rcx, rax}", (MWAITXrr)>, Requires<[In64BitMode]>;
+
+def : InstAlias<"monitorx\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITORXrrr)>,
+ Requires<[Not64BitMode]>;
+def : InstAlias<"monitorx\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITORXrrr)>,
+ Requires<[In64BitMode]>;
+
+//===----------------------------------------------------------------------===//
+// CLZERO Instruction
+//
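+// CLZERO zeroes the cache line whose address is held implicitly in rAX.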
+let Uses = [EAX] in
+def CLZEROr : I<0x01, MRM_FC, (outs), (ins), "clzero", []>, TB;
+
+//===----------------------------------------------------------------------===//
+// Pattern fragments to auto generate TBM instructions.
+//===----------------------------------------------------------------------===//
+
+let Predicates = [HasTBM] in {
+ def : Pat<(X86bextr GR32:$src1, (i32 imm:$src2)),
+ (BEXTRI32ri GR32:$src1, imm:$src2)>;
+ def : Pat<(X86bextr (loadi32 addr:$src1), (i32 imm:$src2)),
+ (BEXTRI32mi addr:$src1, imm:$src2)>;
+ def : Pat<(X86bextr GR64:$src1, i64immSExt32:$src2),
+ (BEXTRI64ri GR64:$src1, i64immSExt32:$src2)>;
+ def : Pat<(X86bextr (loadi64 addr:$src1), i64immSExt32:$src2),
+ (BEXTRI64mi addr:$src1, i64immSExt32:$src2)>;
+
+ // FIXME: patterns for the load versions are not implemented
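+  // For reference, the identities being matched below (analogous to the BMI
+  // lowest-set-bit patterns above):
+  //   blcfill: x & (x+1)    blci:  x | ~(x+1)    blcic:   ~x & (x+1)
+  //   blcmsk:  x ^ (x+1)    blcs:  x | (x+1)     blsfill: x | (x-1)
+  //   blsic:   ~x | (x-1)   t1mskc: ~x | (x+1)   tzmsk:   ~x & (x-1)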
+ def : Pat<(and GR32:$src, (add GR32:$src, 1)),
+ (BLCFILL32rr GR32:$src)>;
+ def : Pat<(and GR64:$src, (add GR64:$src, 1)),
+ (BLCFILL64rr GR64:$src)>;
+
+ def : Pat<(or GR32:$src, (not (add GR32:$src, 1))),
+ (BLCI32rr GR32:$src)>;
+ def : Pat<(or GR64:$src, (not (add GR64:$src, 1))),
+ (BLCI64rr GR64:$src)>;
+
+  // Extra patterns, since opt can canonicalize the BLCI forms above to these.
+ def : Pat<(or GR32:$src, (sub -2, GR32:$src)),
+ (BLCI32rr GR32:$src)>;
+ def : Pat<(or GR64:$src, (sub -2, GR64:$src)),
+ (BLCI64rr GR64:$src)>;
+
+ def : Pat<(and (not GR32:$src), (add GR32:$src, 1)),
+ (BLCIC32rr GR32:$src)>;
+ def : Pat<(and (not GR64:$src), (add GR64:$src, 1)),
+ (BLCIC64rr GR64:$src)>;
+
+ def : Pat<(xor GR32:$src, (add GR32:$src, 1)),
+ (BLCMSK32rr GR32:$src)>;
+ def : Pat<(xor GR64:$src, (add GR64:$src, 1)),
+ (BLCMSK64rr GR64:$src)>;
+
+ def : Pat<(or GR32:$src, (add GR32:$src, 1)),
+ (BLCS32rr GR32:$src)>;
+ def : Pat<(or GR64:$src, (add GR64:$src, 1)),
+ (BLCS64rr GR64:$src)>;
+
+ def : Pat<(or GR32:$src, (add GR32:$src, -1)),
+ (BLSFILL32rr GR32:$src)>;
+ def : Pat<(or GR64:$src, (add GR64:$src, -1)),
+ (BLSFILL64rr GR64:$src)>;
+
+ def : Pat<(or (not GR32:$src), (add GR32:$src, -1)),
+ (BLSIC32rr GR32:$src)>;
+ def : Pat<(or (not GR64:$src), (add GR64:$src, -1)),
+ (BLSIC64rr GR64:$src)>;
+
+ def : Pat<(or (not GR32:$src), (add GR32:$src, 1)),
+ (T1MSKC32rr GR32:$src)>;
+ def : Pat<(or (not GR64:$src), (add GR64:$src, 1)),
+ (T1MSKC64rr GR64:$src)>;
+
+ def : Pat<(and (not GR32:$src), (add GR32:$src, -1)),
+ (TZMSK32rr GR32:$src)>;
+ def : Pat<(and (not GR64:$src), (add GR64:$src, -1)),
+ (TZMSK64rr GR64:$src)>;
+} // HasTBM
+
+//===----------------------------------------------------------------------===//
+// Memory Instructions
+//
+
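+// CLFLUSHOPT is a weakly-ordered variant of CLFLUSH, CLWB writes a cache line
+// back without necessarily invalidating it, and PCOMMIT (later dropped from
+// the ISA) was intended to commit stores to persistent memory.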
+def CLFLUSHOPT : I<0xAE, MRM7m, (outs), (ins i8mem:$src),
+ "clflushopt\t$src", []>, PD;
+def CLWB : I<0xAE, MRM6m, (outs), (ins i8mem:$src), "clwb\t$src", []>, PD;
+def PCOMMIT : I<0xAE, MRM_F8, (outs), (ins), "pcommit", []>, PD;
+
+
+//===----------------------------------------------------------------------===//
+// Subsystems.
+//===----------------------------------------------------------------------===//
+
+include "X86InstrArithmetic.td"
+include "X86InstrCMovSetCC.td"
+include "X86InstrExtension.td"
+include "X86InstrControl.td"
+include "X86InstrShiftRotate.td"
+
+// X87 Floating Point Stack.
+include "X86InstrFPStack.td"
+
+// SIMD support (SSE, MMX and AVX)
+include "X86InstrFragmentsSIMD.td"
+
+// FMA - Fused Multiply-Add support (requires FMA)
+include "X86InstrFMA.td"
+
+// XOP
+include "X86InstrXOP.td"
+
+// SSE, MMX and 3DNow! vector support.
+include "X86InstrSSE.td"
+include "X86InstrAVX512.td"
+include "X86InstrMMX.td"
+include "X86Instr3DNow.td"
+
+// MPX instructions
+include "X86InstrMPX.td"
+
+include "X86InstrVMX.td"
+include "X86InstrSVM.td"
+
+include "X86InstrTSX.td"
+include "X86InstrSGX.td"
+
+// System instructions.
+include "X86InstrSystem.td"
+
+// Compiler Pseudo Instructions and Pat Patterns
+include "X86InstrCompiler.td"
+
+//===----------------------------------------------------------------------===//
+// Assembler Mnemonic Aliases
+//===----------------------------------------------------------------------===//
+
+def : MnemonicAlias<"call", "callw", "att">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"call", "calll", "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"call", "callq", "att">, Requires<[In64BitMode]>;
+
+def : MnemonicAlias<"cbw", "cbtw", "att">;
+def : MnemonicAlias<"cwde", "cwtl", "att">;
+def : MnemonicAlias<"cwd", "cwtd", "att">;
+def : MnemonicAlias<"cdq", "cltd", "att">;
+def : MnemonicAlias<"cdqe", "cltq", "att">;
+def : MnemonicAlias<"cqo", "cqto", "att">;
+
+// In 64-bit mode lret maps to lretl; it is not ambiguous with lretq.
+def : MnemonicAlias<"lret", "lretw", "att">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"lret", "lretl", "att">, Requires<[Not16BitMode]>;
+
+def : MnemonicAlias<"leavel", "leave", "att">, Requires<[Not64BitMode]>;
+def : MnemonicAlias<"leaveq", "leave", "att">, Requires<[In64BitMode]>;
+
+def : MnemonicAlias<"loopz", "loope">;
+def : MnemonicAlias<"loopnz", "loopne">;
+
+def : MnemonicAlias<"pop", "popw", "att">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"pop", "popl", "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"pop", "popq", "att">, Requires<[In64BitMode]>;
+def : MnemonicAlias<"popf", "popfw", "att">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"popf", "popfl", "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"popf", "popfq", "att">, Requires<[In64BitMode]>;
+def : MnemonicAlias<"popfd", "popfl", "att">;
+
+// FIXME: This is wrong for "push reg". "push %bx" should turn into pushw in
+// all modes. However, "push (addr)" and "push $42" should default to
+// pushl/pushq depending on the current mode. Similarly for "pop %bx".
+def : MnemonicAlias<"push", "pushw", "att">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"push", "pushl", "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"push", "pushq", "att">, Requires<[In64BitMode]>;
+def : MnemonicAlias<"pushf", "pushfw", "att">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"pushf", "pushfl", "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"pushf", "pushfq", "att">, Requires<[In64BitMode]>;
+def : MnemonicAlias<"pushfd", "pushfl", "att">;
+
+def : MnemonicAlias<"popad", "popal", "intel">, Requires<[Not64BitMode]>;
+def : MnemonicAlias<"pushad", "pushal", "intel">, Requires<[Not64BitMode]>;
+def : MnemonicAlias<"popa", "popaw", "intel">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"pusha", "pushaw", "intel">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"popa", "popal", "intel">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"pusha", "pushal", "intel">, Requires<[In32BitMode]>;
+
+def : MnemonicAlias<"popa", "popaw", "att">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"pusha", "pushaw", "att">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"popa", "popal", "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"pusha", "pushal", "att">, Requires<[In32BitMode]>;
+
+def : MnemonicAlias<"repe", "rep">;
+def : MnemonicAlias<"repz", "rep">;
+def : MnemonicAlias<"repnz", "repne">;
+
+def : MnemonicAlias<"ret", "retw", "att">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"ret", "retl", "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"ret", "retq", "att">, Requires<[In64BitMode]>;
+
+def : MnemonicAlias<"sal", "shl", "intel">;
+def : MnemonicAlias<"salb", "shlb", "att">;
+def : MnemonicAlias<"salw", "shlw", "att">;
+def : MnemonicAlias<"sall", "shll", "att">;
+def : MnemonicAlias<"salq", "shlq", "att">;
+
+def : MnemonicAlias<"smovb", "movsb", "att">;
+def : MnemonicAlias<"smovw", "movsw", "att">;
+def : MnemonicAlias<"smovl", "movsl", "att">;
+def : MnemonicAlias<"smovq", "movsq", "att">;
+
+def : MnemonicAlias<"ud2a", "ud2", "att">;
+def : MnemonicAlias<"verrw", "verr", "att">;
+
+// System instruction aliases.
+def : MnemonicAlias<"iret", "iretw", "att">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"iret", "iretl", "att">, Requires<[Not16BitMode]>;
+def : MnemonicAlias<"sysret", "sysretl", "att">;
+def : MnemonicAlias<"sysexit", "sysexitl", "att">;
+
+def : MnemonicAlias<"lgdt", "lgdtw", "att">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"lgdt", "lgdtl", "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"lgdt", "lgdtq", "att">, Requires<[In64BitMode]>;
+def : MnemonicAlias<"lidt", "lidtw", "att">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"lidt", "lidtl", "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"lidt", "lidtq", "att">, Requires<[In64BitMode]>;
+def : MnemonicAlias<"sgdt", "sgdtw", "att">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"sgdt", "sgdtl", "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"sgdt", "sgdtq", "att">, Requires<[In64BitMode]>;
+def : MnemonicAlias<"sidt", "sidtw", "att">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"sidt", "sidtl", "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"sidt", "sidtq", "att">, Requires<[In64BitMode]>;
+
+
+// Floating point stack aliases.
+def : MnemonicAlias<"fcmovz", "fcmove", "att">;
+def : MnemonicAlias<"fcmova", "fcmovnbe", "att">;
+def : MnemonicAlias<"fcmovnae", "fcmovb", "att">;
+def : MnemonicAlias<"fcmovna", "fcmovbe", "att">;
+def : MnemonicAlias<"fcmovae", "fcmovnb", "att">;
+def : MnemonicAlias<"fcomip", "fcompi">;
+def : MnemonicAlias<"fildq", "fildll", "att">;
+def : MnemonicAlias<"fistpq", "fistpll", "att">;
+def : MnemonicAlias<"fisttpq", "fisttpll", "att">;
+def : MnemonicAlias<"fldcww", "fldcw", "att">;
+def : MnemonicAlias<"fnstcww", "fnstcw", "att">;
+def : MnemonicAlias<"fnstsww", "fnstsw", "att">;
+def : MnemonicAlias<"fucomip", "fucompi">;
+def : MnemonicAlias<"fwait", "wait">;
+
+def : MnemonicAlias<"fxsaveq", "fxsave64", "att">;
+def : MnemonicAlias<"fxrstorq", "fxrstor64", "att">;
+def : MnemonicAlias<"xsaveq", "xsave64", "att">;
+def : MnemonicAlias<"xrstorq", "xrstor64", "att">;
+def : MnemonicAlias<"xsaveoptq", "xsaveopt64", "att">;
+def : MnemonicAlias<"xrstorsq", "xrstors64", "att">;
+def : MnemonicAlias<"xsavecq", "xsavec64", "att">;
+def : MnemonicAlias<"xsavesq", "xsaves64", "att">;
+
+class CondCodeAlias<string Prefix,string Suffix, string OldCond, string NewCond,
+ string VariantName>
+ : MnemonicAlias<!strconcat(Prefix, OldCond, Suffix),
+ !strconcat(Prefix, NewCond, Suffix), VariantName>;
+
+/// IntegerCondCodeMnemonicAlias - This multiclass defines a bunch of
+/// MnemonicAlias's that canonicalize the condition code in a mnemonic, for
+/// example "setz" -> "sete".
+multiclass IntegerCondCodeMnemonicAlias<string Prefix, string Suffix,
+ string V = ""> {
+ def C : CondCodeAlias<Prefix, Suffix, "c", "b", V>; // setc -> setb
+ def Z : CondCodeAlias<Prefix, Suffix, "z" , "e", V>; // setz -> sete
+ def NA : CondCodeAlias<Prefix, Suffix, "na", "be", V>; // setna -> setbe
+ def NB : CondCodeAlias<Prefix, Suffix, "nb", "ae", V>; // setnb -> setae
+ def NC : CondCodeAlias<Prefix, Suffix, "nc", "ae", V>; // setnc -> setae
+ def NG : CondCodeAlias<Prefix, Suffix, "ng", "le", V>; // setng -> setle
+ def NL : CondCodeAlias<Prefix, Suffix, "nl", "ge", V>; // setnl -> setge
+ def NZ : CondCodeAlias<Prefix, Suffix, "nz", "ne", V>; // setnz -> setne
+ def PE : CondCodeAlias<Prefix, Suffix, "pe", "p", V>; // setpe -> setp
+ def PO : CondCodeAlias<Prefix, Suffix, "po", "np", V>; // setpo -> setnp
+
+ def NAE : CondCodeAlias<Prefix, Suffix, "nae", "b", V>; // setnae -> setb
+ def NBE : CondCodeAlias<Prefix, Suffix, "nbe", "a", V>; // setnbe -> seta
+ def NGE : CondCodeAlias<Prefix, Suffix, "nge", "l", V>; // setnge -> setl
+ def NLE : CondCodeAlias<Prefix, Suffix, "nle", "g", V>; // setnle -> setg
+}
+
+// Aliases for set<CC>
+defm : IntegerCondCodeMnemonicAlias<"set", "">;
+// Aliases for j<CC>
+defm : IntegerCondCodeMnemonicAlias<"j", "">;
+// Aliases for cmov<CC>{w,l,q}
+defm : IntegerCondCodeMnemonicAlias<"cmov", "w", "att">;
+defm : IntegerCondCodeMnemonicAlias<"cmov", "l", "att">;
+defm : IntegerCondCodeMnemonicAlias<"cmov", "q", "att">;
+// No size suffix for intel-style asm.
+defm : IntegerCondCodeMnemonicAlias<"cmov", "", "intel">;
+
+
+//===----------------------------------------------------------------------===//
+// Assembler Instruction Aliases
+//===----------------------------------------------------------------------===//
+
+// aad/aam default to base 10 if no operand is specified.
+def : InstAlias<"aad", (AAD8i8 10)>, Requires<[Not64BitMode]>;
+def : InstAlias<"aam", (AAM8i8 10)>, Requires<[Not64BitMode]>;
+
+// Disambiguate the mem/imm form of bt-without-a-suffix as btl.
+// Likewise for btc/btr/bts.
+def : InstAlias<"bt {$imm, $mem|$mem, $imm}",
+ (BT32mi8 i32mem:$mem, i32i8imm:$imm), 0>;
+def : InstAlias<"btc {$imm, $mem|$mem, $imm}",
+ (BTC32mi8 i32mem:$mem, i32i8imm:$imm), 0>;
+def : InstAlias<"btr {$imm, $mem|$mem, $imm}",
+ (BTR32mi8 i32mem:$mem, i32i8imm:$imm), 0>;
+def : InstAlias<"bts {$imm, $mem|$mem, $imm}",
+ (BTS32mi8 i32mem:$mem, i32i8imm:$imm), 0>;
+
+// clr aliases.
+def : InstAlias<"clrb $reg", (XOR8rr GR8 :$reg, GR8 :$reg), 0>;
+def : InstAlias<"clrw $reg", (XOR16rr GR16:$reg, GR16:$reg), 0>;
+def : InstAlias<"clrl $reg", (XOR32rr GR32:$reg, GR32:$reg), 0>;
+def : InstAlias<"clrq $reg", (XOR64rr GR64:$reg, GR64:$reg), 0>;
+
+// lods aliases. Accept the destination being omitted because it's implicit
+// in the mnemonic, or the mnemonic suffix being omitted because it's implicit
+// in the destination.
+def : InstAlias<"lodsb $src", (LODSB srcidx8:$src), 0>;
+def : InstAlias<"lodsw $src", (LODSW srcidx16:$src), 0>;
+def : InstAlias<"lods{l|d} $src", (LODSL srcidx32:$src), 0>;
+def : InstAlias<"lodsq $src", (LODSQ srcidx64:$src), 0>, Requires<[In64BitMode]>;
+def : InstAlias<"lods {$src, %al|al, $src}", (LODSB srcidx8:$src), 0>;
+def : InstAlias<"lods {$src, %ax|ax, $src}", (LODSW srcidx16:$src), 0>;
+def : InstAlias<"lods {$src, %eax|eax, $src}", (LODSL srcidx32:$src), 0>;
+def : InstAlias<"lods {$src, %rax|rax, $src}", (LODSQ srcidx64:$src), 0>, Requires<[In64BitMode]>;
+
+// stos aliases. Accept the source being omitted because it's implicit in
+// the mnemonic, or the mnemonic suffix being omitted because it's implicit
+// in the source.
+def : InstAlias<"stosb $dst", (STOSB dstidx8:$dst), 0>;
+def : InstAlias<"stosw $dst", (STOSW dstidx16:$dst), 0>;
+def : InstAlias<"stos{l|d} $dst", (STOSL dstidx32:$dst), 0>;
+def : InstAlias<"stosq $dst", (STOSQ dstidx64:$dst), 0>, Requires<[In64BitMode]>;
+def : InstAlias<"stos {%al, $dst|$dst, al}", (STOSB dstidx8:$dst), 0>;
+def : InstAlias<"stos {%ax, $dst|$dst, ax}", (STOSW dstidx16:$dst), 0>;
+def : InstAlias<"stos {%eax, $dst|$dst, eax}", (STOSL dstidx32:$dst), 0>;
+def : InstAlias<"stos {%rax, $dst|$dst, rax}", (STOSQ dstidx64:$dst), 0>, Requires<[In64BitMode]>;
+
+// scas aliases. Accept the destination being omitted because it's implicit
+// in the mnemonic, or the mnemonic suffix being omitted because it's implicit
+// in the destination.
+def : InstAlias<"scasb $dst", (SCASB dstidx8:$dst), 0>;
+def : InstAlias<"scasw $dst", (SCASW dstidx16:$dst), 0>;
+def : InstAlias<"scas{l|d} $dst", (SCASL dstidx32:$dst), 0>;
+def : InstAlias<"scasq $dst", (SCASQ dstidx64:$dst), 0>, Requires<[In64BitMode]>;
+def : InstAlias<"scas {$dst, %al|al, $dst}", (SCASB dstidx8:$dst), 0>;
+def : InstAlias<"scas {$dst, %ax|ax, $dst}", (SCASW dstidx16:$dst), 0>;
+def : InstAlias<"scas {$dst, %eax|eax, $dst}", (SCASL dstidx32:$dst), 0>;
+def : InstAlias<"scas {$dst, %rax|rax, $dst}", (SCASQ dstidx64:$dst), 0>, Requires<[In64BitMode]>;
+
+// div and idiv aliases for explicit A register.
+def : InstAlias<"div{b}\t{$src, %al|al, $src}", (DIV8r GR8 :$src)>;
+def : InstAlias<"div{w}\t{$src, %ax|ax, $src}", (DIV16r GR16:$src)>;
+def : InstAlias<"div{l}\t{$src, %eax|eax, $src}", (DIV32r GR32:$src)>;
+def : InstAlias<"div{q}\t{$src, %rax|rax, $src}", (DIV64r GR64:$src)>;
+def : InstAlias<"div{b}\t{$src, %al|al, $src}", (DIV8m i8mem :$src)>;
+def : InstAlias<"div{w}\t{$src, %ax|ax, $src}", (DIV16m i16mem:$src)>;
+def : InstAlias<"div{l}\t{$src, %eax|eax, $src}", (DIV32m i32mem:$src)>;
+def : InstAlias<"div{q}\t{$src, %rax|rax, $src}", (DIV64m i64mem:$src)>;
+def : InstAlias<"idiv{b}\t{$src, %al|al, $src}", (IDIV8r GR8 :$src)>;
+def : InstAlias<"idiv{w}\t{$src, %ax|ax, $src}", (IDIV16r GR16:$src)>;
+def : InstAlias<"idiv{l}\t{$src, %eax|eax, $src}", (IDIV32r GR32:$src)>;
+def : InstAlias<"idiv{q}\t{$src, %rax|rax, $src}", (IDIV64r GR64:$src)>;
+def : InstAlias<"idiv{b}\t{$src, %al|al, $src}", (IDIV8m i8mem :$src)>;
+def : InstAlias<"idiv{w}\t{$src, %ax|ax, $src}", (IDIV16m i16mem:$src)>;
+def : InstAlias<"idiv{l}\t{$src, %eax|eax, $src}", (IDIV32m i32mem:$src)>;
+def : InstAlias<"idiv{q}\t{$src, %rax|rax, $src}", (IDIV64m i64mem:$src)>;
+
+
+
+// Various unary fpstack operations default to operating on ST1.
+// For example, "fxch" -> "fxch %st(1)"
+def : InstAlias<"faddp", (ADD_FPrST0 ST1), 0>;
+def : InstAlias<"fadd", (ADD_FPrST0 ST1), 0>;
+def : InstAlias<"fsub{|r}p", (SUBR_FPrST0 ST1), 0>;
+def : InstAlias<"fsub{r|}p", (SUB_FPrST0 ST1), 0>;
+def : InstAlias<"fmul", (MUL_FPrST0 ST1), 0>;
+def : InstAlias<"fmulp", (MUL_FPrST0 ST1), 0>;
+def : InstAlias<"fdiv{|r}p", (DIVR_FPrST0 ST1), 0>;
+def : InstAlias<"fdiv{r|}p", (DIV_FPrST0 ST1), 0>;
+def : InstAlias<"fxch", (XCH_F ST1), 0>;
+def : InstAlias<"fcom", (COM_FST0r ST1), 0>;
+def : InstAlias<"fcomp", (COMP_FST0r ST1), 0>;
+def : InstAlias<"fcomi", (COM_FIr ST1), 0>;
+def : InstAlias<"fcompi", (COM_FIPr ST1), 0>;
+def : InstAlias<"fucom", (UCOM_Fr ST1), 0>;
+def : InstAlias<"fucomp", (UCOM_FPr ST1), 0>;
+def : InstAlias<"fucomi", (UCOM_FIr ST1), 0>;
+def : InstAlias<"fucompi", (UCOM_FIPr ST1), 0>;
+
+// Handle fmul/fadd/fsub/fdiv instructions with explicitly written st(0) op.
+// For example, "fadd %st(4), %st(0)" -> "fadd %st(4)". We also disambiguate
+// instructions like "fadd %st(0), %st(0)" as "fadd %st(0)" for consistency with
+// gas.
+multiclass FpUnaryAlias<string Mnemonic, Instruction Inst, bit EmitAlias = 1> {
+ def : InstAlias<!strconcat(Mnemonic, "\t{$op, %st(0)|st(0), $op}"),
+ (Inst RST:$op), EmitAlias>;
+ def : InstAlias<!strconcat(Mnemonic, "\t{%st(0), %st(0)|st(0), st(0)}"),
+ (Inst ST0), EmitAlias>;
+}
+
+defm : FpUnaryAlias<"fadd", ADD_FST0r>;
+defm : FpUnaryAlias<"faddp", ADD_FPrST0, 0>;
+defm : FpUnaryAlias<"fsub", SUB_FST0r>;
+defm : FpUnaryAlias<"fsub{|r}p", SUBR_FPrST0>;
+defm : FpUnaryAlias<"fsubr", SUBR_FST0r>;
+defm : FpUnaryAlias<"fsub{r|}p", SUB_FPrST0>;
+defm : FpUnaryAlias<"fmul", MUL_FST0r>;
+defm : FpUnaryAlias<"fmulp", MUL_FPrST0>;
+defm : FpUnaryAlias<"fdiv", DIV_FST0r>;
+defm : FpUnaryAlias<"fdiv{|r}p", DIVR_FPrST0>;
+defm : FpUnaryAlias<"fdivr", DIVR_FST0r>;
+defm : FpUnaryAlias<"fdiv{r|}p", DIV_FPrST0>;
+defm : FpUnaryAlias<"fcomi", COM_FIr, 0>;
+defm : FpUnaryAlias<"fucomi", UCOM_FIr, 0>;
+defm : FpUnaryAlias<"fcompi", COM_FIPr>;
+defm : FpUnaryAlias<"fucompi", UCOM_FIPr>;
+
+
+// Handle "f{mulp,addp} st(0), $op" the same as "f{mulp,addp} $op", since they
+// commute. We also allow fdiv[r]p/fsubrp even though they don't commute,
+// solely because gas supports it.
+def : InstAlias<"faddp\t{%st(0), $op|$op, st(0)}", (ADD_FPrST0 RST:$op), 0>;
+def : InstAlias<"fmulp\t{%st(0), $op|$op, st(0)}", (MUL_FPrST0 RST:$op)>;
+def : InstAlias<"fsub{|r}p\t{%st(0), $op|$op, st(0)}", (SUBR_FPrST0 RST:$op)>;
+def : InstAlias<"fsub{r|}p\t{%st(0), $op|$op, st(0)}", (SUB_FPrST0 RST:$op)>;
+def : InstAlias<"fdiv{|r}p\t{%st(0), $op|$op, st(0)}", (DIVR_FPrST0 RST:$op)>;
+def : InstAlias<"fdiv{r|}p\t{%st(0), $op|$op, st(0)}", (DIV_FPrST0 RST:$op)>;
+
+// We accept "fnstsw %eax" even though it only writes %ax.
+def : InstAlias<"fnstsw\t{%eax|eax}", (FNSTSW16r)>;
+def : InstAlias<"fnstsw\t{%al|al}" , (FNSTSW16r)>;
+def : InstAlias<"fnstsw" , (FNSTSW16r)>;
+
+// lcall and ljmp aliases. This seems to be an odd mapping in 64-bit mode, but
+// it is compatible with what GAS does.
+def : InstAlias<"lcall $seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg), 0>, Requires<[Not16BitMode]>;
+def : InstAlias<"ljmp $seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg), 0>, Requires<[Not16BitMode]>;
+def : InstAlias<"lcall {*}$dst", (FARCALL32m opaque48mem:$dst), 0>, Requires<[Not16BitMode]>;
+def : InstAlias<"ljmp {*}$dst", (FARJMP32m opaque48mem:$dst), 0>, Requires<[Not16BitMode]>;
+def : InstAlias<"lcall $seg, $off", (FARCALL16i i16imm:$off, i16imm:$seg), 0>, Requires<[In16BitMode]>;
+def : InstAlias<"ljmp $seg, $off", (FARJMP16i i16imm:$off, i16imm:$seg), 0>, Requires<[In16BitMode]>;
+def : InstAlias<"lcall {*}$dst", (FARCALL16m opaque32mem:$dst), 0>, Requires<[In16BitMode]>;
+def : InstAlias<"ljmp {*}$dst", (FARJMP16m opaque32mem:$dst), 0>, Requires<[In16BitMode]>;
+
+def : InstAlias<"call {*}$dst", (CALL64m i64mem:$dst), 0>, Requires<[In64BitMode]>;
+def : InstAlias<"jmp {*}$dst", (JMP64m i64mem:$dst), 0>, Requires<[In64BitMode]>;
+def : InstAlias<"call {*}$dst", (CALL32m i32mem:$dst), 0>, Requires<[In32BitMode]>;
+def : InstAlias<"jmp {*}$dst", (JMP32m i32mem:$dst), 0>, Requires<[In32BitMode]>;
+def : InstAlias<"call {*}$dst", (CALL16m i16mem:$dst), 0>, Requires<[In16BitMode]>;
+def : InstAlias<"jmp {*}$dst", (JMP16m i16mem:$dst), 0>, Requires<[In16BitMode]>;
+
+
+// "imul <imm>, B" is an alias for "imul <imm>, B, B".
+def : InstAlias<"imul{w} {$imm, $r|$r, $imm}", (IMUL16rri GR16:$r, GR16:$r, i16imm:$imm), 0>;
+def : InstAlias<"imul{w} {$imm, $r|$r, $imm}", (IMUL16rri8 GR16:$r, GR16:$r, i16i8imm:$imm), 0>;
+def : InstAlias<"imul{l} {$imm, $r|$r, $imm}", (IMUL32rri GR32:$r, GR32:$r, i32imm:$imm), 0>;
+def : InstAlias<"imul{l} {$imm, $r|$r, $imm}", (IMUL32rri8 GR32:$r, GR32:$r, i32i8imm:$imm), 0>;
+def : InstAlias<"imul{q} {$imm, $r|$r, $imm}", (IMUL64rri32 GR64:$r, GR64:$r, i64i32imm:$imm), 0>;
+def : InstAlias<"imul{q} {$imm, $r|$r, $imm}", (IMUL64rri8 GR64:$r, GR64:$r, i64i8imm:$imm), 0>;
+
+// inb %dx -> inb %al, %dx
+def : InstAlias<"inb\t{%dx|dx}", (IN8rr), 0>;
+def : InstAlias<"inw\t{%dx|dx}", (IN16rr), 0>;
+def : InstAlias<"inl\t{%dx|dx}", (IN32rr), 0>;
+def : InstAlias<"inb\t$port", (IN8ri u8imm:$port), 0>;
+def : InstAlias<"inw\t$port", (IN16ri u8imm:$port), 0>;
+def : InstAlias<"inl\t$port", (IN32ri u8imm:$port), 0>;
+
+
+// jmp and call aliases for lcall and ljmp. jmp $42,$5 -> ljmp
+def : InstAlias<"call $seg, $off", (FARCALL16i i16imm:$off, i16imm:$seg)>, Requires<[In16BitMode]>;
+def : InstAlias<"jmp $seg, $off", (FARJMP16i i16imm:$off, i16imm:$seg)>, Requires<[In16BitMode]>;
+def : InstAlias<"call $seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg)>, Requires<[Not16BitMode]>;
+def : InstAlias<"jmp $seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg)>, Requires<[Not16BitMode]>;
+def : InstAlias<"callw $seg, $off", (FARCALL16i i16imm:$off, i16imm:$seg)>;
+def : InstAlias<"jmpw $seg, $off", (FARJMP16i i16imm:$off, i16imm:$seg)>;
+def : InstAlias<"calll $seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg)>;
+def : InstAlias<"jmpl $seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg)>;
+
+// Force mov without a suffix with a segment and mem to prefer the 'l' form of
+// the move. All segment/mem forms are equivalent; this one has the shortest
+// encoding.
+def : InstAlias<"mov {$mem, $seg|$seg, $mem}", (MOV32sm SEGMENT_REG:$seg, i32mem:$mem), 0>;
+def : InstAlias<"mov {$seg, $mem|$mem, $seg}", (MOV32ms i32mem:$mem, SEGMENT_REG:$seg), 0>;
+
+// Match 'movq <largeimm>, <reg>' as an alias for movabsq.
+def : InstAlias<"movq {$imm, $reg|$reg, $imm}", (MOV64ri GR64:$reg, i64imm:$imm), 0>;
+
+// Match 'movq GR64, MMX' as an alias for movd.
+def : InstAlias<"movq {$src, $dst|$dst, $src}",
+ (MMX_MOVD64to64rr VR64:$dst, GR64:$src), 0>;
+def : InstAlias<"movq {$src, $dst|$dst, $src}",
+ (MMX_MOVD64from64rr GR64:$dst, VR64:$src), 0>;
+
+// movsx aliases
+def : InstAlias<"movsx {$src, $dst|$dst, $src}", (MOVSX16rr8 GR16:$dst, GR8:$src), 0>;
+def : InstAlias<"movsx {$src, $dst|$dst, $src}", (MOVSX16rm8 GR16:$dst, i8mem:$src), 0>;
+def : InstAlias<"movsx {$src, $dst|$dst, $src}", (MOVSX32rr8 GR32:$dst, GR8:$src), 0>;
+def : InstAlias<"movsx {$src, $dst|$dst, $src}", (MOVSX32rr16 GR32:$dst, GR16:$src), 0>;
+def : InstAlias<"movsx {$src, $dst|$dst, $src}", (MOVSX64rr8 GR64:$dst, GR8:$src), 0>;
+def : InstAlias<"movsx {$src, $dst|$dst, $src}", (MOVSX64rr16 GR64:$dst, GR16:$src), 0>;
+def : InstAlias<"movsx {$src, $dst|$dst, $src}", (MOVSX64rr32 GR64:$dst, GR32:$src), 0>;
+
+// movzx aliases
+def : InstAlias<"movzx {$src, $dst|$dst, $src}", (MOVZX16rr8 GR16:$dst, GR8:$src), 0>;
+def : InstAlias<"movzx {$src, $dst|$dst, $src}", (MOVZX16rm8 GR16:$dst, i8mem:$src), 0>;
+def : InstAlias<"movzx {$src, $dst|$dst, $src}", (MOVZX32rr8 GR32:$dst, GR8:$src), 0>;
+def : InstAlias<"movzx {$src, $dst|$dst, $src}", (MOVZX32rr16 GR32:$dst, GR16:$src), 0>;
+def : InstAlias<"movzx {$src, $dst|$dst, $src}", (MOVZX64rr8_Q GR64:$dst, GR8:$src), 0>;
+def : InstAlias<"movzx {$src, $dst|$dst, $src}", (MOVZX64rr16_Q GR64:$dst, GR16:$src), 0>;
+// Note: No GR32->GR64 movzx form.
+
+// outb %dx -> outb %al, %dx
+def : InstAlias<"outb\t{%dx|dx}", (OUT8rr), 0>;
+def : InstAlias<"outw\t{%dx|dx}", (OUT16rr), 0>;
+def : InstAlias<"outl\t{%dx|dx}", (OUT32rr), 0>;
+def : InstAlias<"outb\t$port", (OUT8ir u8imm:$port), 0>;
+def : InstAlias<"outw\t$port", (OUT16ir u8imm:$port), 0>;
+def : InstAlias<"outl\t$port", (OUT32ir u8imm:$port), 0>;
+
+// 'sldt <mem>' can be encoded with either sldtw or sldtq with the same
+// effect (both store to a 16-bit mem). Force to sldtw to avoid ambiguity
+// errors, since its encoding is the most compact.
+def : InstAlias<"sldt $mem", (SLDT16m i16mem:$mem), 0>;
+
+// shld/shrd op,op -> shld op, op, CL
+def : InstAlias<"shld{w}\t{$r2, $r1|$r1, $r2}", (SHLD16rrCL GR16:$r1, GR16:$r2), 0>;
+def : InstAlias<"shld{l}\t{$r2, $r1|$r1, $r2}", (SHLD32rrCL GR32:$r1, GR32:$r2), 0>;
+def : InstAlias<"shld{q}\t{$r2, $r1|$r1, $r2}", (SHLD64rrCL GR64:$r1, GR64:$r2), 0>;
+def : InstAlias<"shrd{w}\t{$r2, $r1|$r1, $r2}", (SHRD16rrCL GR16:$r1, GR16:$r2), 0>;
+def : InstAlias<"shrd{l}\t{$r2, $r1|$r1, $r2}", (SHRD32rrCL GR32:$r1, GR32:$r2), 0>;
+def : InstAlias<"shrd{q}\t{$r2, $r1|$r1, $r2}", (SHRD64rrCL GR64:$r1, GR64:$r2), 0>;
+
+def : InstAlias<"shld{w}\t{$reg, $mem|$mem, $reg}", (SHLD16mrCL i16mem:$mem, GR16:$reg), 0>;
+def : InstAlias<"shld{l}\t{$reg, $mem|$mem, $reg}", (SHLD32mrCL i32mem:$mem, GR32:$reg), 0>;
+def : InstAlias<"shld{q}\t{$reg, $mem|$mem, $reg}", (SHLD64mrCL i64mem:$mem, GR64:$reg), 0>;
+def : InstAlias<"shrd{w}\t{$reg, $mem|$mem, $reg}", (SHRD16mrCL i16mem:$mem, GR16:$reg), 0>;
+def : InstAlias<"shrd{l}\t{$reg, $mem|$mem, $reg}", (SHRD32mrCL i32mem:$mem, GR32:$reg), 0>;
+def : InstAlias<"shrd{q}\t{$reg, $mem|$mem, $reg}", (SHRD64mrCL i64mem:$mem, GR64:$reg), 0>;
+
+/* FIXME: This is disabled because the asm matcher is currently incapable of
+ * matching a fixed immediate like $1.
+// "shl X, $1" is an alias for "shl X".
+multiclass ShiftRotateByOneAlias<string Mnemonic, string Opc> {
+ def : InstAlias<!strconcat(Mnemonic, "b $op, $$1"),
+ (!cast<Instruction>(!strconcat(Opc, "8r1")) GR8:$op)>;
+ def : InstAlias<!strconcat(Mnemonic, "w $op, $$1"),
+ (!cast<Instruction>(!strconcat(Opc, "16r1")) GR16:$op)>;
+ def : InstAlias<!strconcat(Mnemonic, "l $op, $$1"),
+ (!cast<Instruction>(!strconcat(Opc, "32r1")) GR32:$op)>;
+ def : InstAlias<!strconcat(Mnemonic, "q $op, $$1"),
+ (!cast<Instruction>(!strconcat(Opc, "64r1")) GR64:$op)>;
+ def : InstAlias<!strconcat(Mnemonic, "b $op, $$1"),
+ (!cast<Instruction>(!strconcat(Opc, "8m1")) i8mem:$op)>;
+ def : InstAlias<!strconcat(Mnemonic, "w $op, $$1"),
+ (!cast<Instruction>(!strconcat(Opc, "16m1")) i16mem:$op)>;
+ def : InstAlias<!strconcat(Mnemonic, "l $op, $$1"),
+ (!cast<Instruction>(!strconcat(Opc, "32m1")) i32mem:$op)>;
+ def : InstAlias<!strconcat(Mnemonic, "q $op, $$1"),
+ (!cast<Instruction>(!strconcat(Opc, "64m1")) i64mem:$op)>;
+}
+
+defm : ShiftRotateByOneAlias<"rcl", "RCL">;
+defm : ShiftRotateByOneAlias<"rcr", "RCR">;
+defm : ShiftRotateByOneAlias<"rol", "ROL">;
+defm : ShiftRotateByOneAlias<"ror", "ROR">;
+FIXME */
+
+// test: We accept "testX <reg>, <mem>" and "testX <mem>, <reg>" as synonyms.
+def : InstAlias<"test{b}\t{$val, $mem|$mem, $val}",
+ (TEST8rm GR8 :$val, i8mem :$mem), 0>;
+def : InstAlias<"test{w}\t{$val, $mem|$mem, $val}",
+ (TEST16rm GR16:$val, i16mem:$mem), 0>;
+def : InstAlias<"test{l}\t{$val, $mem|$mem, $val}",
+ (TEST32rm GR32:$val, i32mem:$mem), 0>;
+def : InstAlias<"test{q}\t{$val, $mem|$mem, $val}",
+ (TEST64rm GR64:$val, i64mem:$mem), 0>;
+
+// xchg: We accept "xchgX <reg>, <mem>" and "xchgX <mem>, <reg>" as synonyms.
+def : InstAlias<"xchg{b}\t{$mem, $val|$val, $mem}",
+ (XCHG8rm GR8 :$val, i8mem :$mem), 0>;
+def : InstAlias<"xchg{w}\t{$mem, $val|$val, $mem}",
+ (XCHG16rm GR16:$val, i16mem:$mem), 0>;
+def : InstAlias<"xchg{l}\t{$mem, $val|$val, $mem}",
+ (XCHG32rm GR32:$val, i32mem:$mem), 0>;
+def : InstAlias<"xchg{q}\t{$mem, $val|$val, $mem}",
+ (XCHG64rm GR64:$val, i64mem:$mem), 0>;
+
+// xchg: We accept "xchgX <reg>, %eax" and "xchgX %eax, <reg>" as synonyms.
+def : InstAlias<"xchg{w}\t{%ax, $src|$src, ax}", (XCHG16ar GR16:$src), 0>;
+def : InstAlias<"xchg{l}\t{%eax, $src|$src, eax}",
+ (XCHG32ar GR32:$src), 0>, Requires<[Not64BitMode]>;
+def : InstAlias<"xchg{l}\t{%eax, $src|$src, eax}",
+ (XCHG32ar64 GR32_NOAX:$src), 0>, Requires<[In64BitMode]>;
+def : InstAlias<"xchg{q}\t{%rax, $src|$src, rax}", (XCHG64ar GR64:$src), 0>;
+
+// These aliases exist to get the parser to prioritize matching 8-bit
+// immediate encodings over matching the implicit ax/eax/rax encodings. By
+// explicitly mentioning the A register here, these entries will be ordered
+// first due to the more explicit immediate type.
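+// For example, "addl $4, %eax" then matches ADD32ri8 (83 /0 ib, 3 bytes)
+// rather than the implicit-EAX form (05 id, 5 bytes).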
+def : InstAlias<"adc{w}\t{$imm, %ax|ax, $imm}", (ADC16ri8 AX, i16i8imm:$imm), 0>;
+def : InstAlias<"add{w}\t{$imm, %ax|ax, $imm}", (ADD16ri8 AX, i16i8imm:$imm), 0>;
+def : InstAlias<"and{w}\t{$imm, %ax|ax, $imm}", (AND16ri8 AX, i16i8imm:$imm), 0>;
+def : InstAlias<"cmp{w}\t{$imm, %ax|ax, $imm}", (CMP16ri8 AX, i16i8imm:$imm), 0>;
+def : InstAlias<"or{w}\t{$imm, %ax|ax, $imm}", (OR16ri8 AX, i16i8imm:$imm), 0>;
+def : InstAlias<"sbb{w}\t{$imm, %ax|ax, $imm}", (SBB16ri8 AX, i16i8imm:$imm), 0>;
+def : InstAlias<"sub{w}\t{$imm, %ax|ax, $imm}", (SUB16ri8 AX, i16i8imm:$imm), 0>;
+def : InstAlias<"xor{w}\t{$imm, %ax|ax, $imm}", (XOR16ri8 AX, i16i8imm:$imm), 0>;
+
+def : InstAlias<"adc{l}\t{$imm, %eax|eax, $imm}", (ADC32ri8 EAX, i32i8imm:$imm), 0>;
+def : InstAlias<"add{l}\t{$imm, %eax|eax, $imm}", (ADD32ri8 EAX, i32i8imm:$imm), 0>;
+def : InstAlias<"and{l}\t{$imm, %eax|eax, $imm}", (AND32ri8 EAX, i32i8imm:$imm), 0>;
+def : InstAlias<"cmp{l}\t{$imm, %eax|eax, $imm}", (CMP32ri8 EAX, i32i8imm:$imm), 0>;
+def : InstAlias<"or{l}\t{$imm, %eax|eax, $imm}", (OR32ri8 EAX, i32i8imm:$imm), 0>;
+def : InstAlias<"sbb{l}\t{$imm, %eax|eax, $imm}", (SBB32ri8 EAX, i32i8imm:$imm), 0>;
+def : InstAlias<"sub{l}\t{$imm, %eax|eax, $imm}", (SUB32ri8 EAX, i32i8imm:$imm), 0>;
+def : InstAlias<"xor{l}\t{$imm, %eax|eax, $imm}", (XOR32ri8 EAX, i32i8imm:$imm), 0>;
+
+def : InstAlias<"adc{q}\t{$imm, %rax|rax, $imm}", (ADC64ri8 RAX, i64i8imm:$imm), 0>;
+def : InstAlias<"add{q}\t{$imm, %rax|rax, $imm}", (ADD64ri8 RAX, i64i8imm:$imm), 0>;
+def : InstAlias<"and{q}\t{$imm, %rax|rax, $imm}", (AND64ri8 RAX, i64i8imm:$imm), 0>;
+def : InstAlias<"cmp{q}\t{$imm, %rax|rax, $imm}", (CMP64ri8 RAX, i64i8imm:$imm), 0>;
+def : InstAlias<"or{q}\t{$imm, %rax|rax, $imm}", (OR64ri8 RAX, i64i8imm:$imm), 0>;
+def : InstAlias<"sbb{q}\t{$imm, %rax|rax, $imm}", (SBB64ri8 RAX, i64i8imm:$imm), 0>;
+def : InstAlias<"sub{q}\t{$imm, %rax|rax, $imm}", (SUB64ri8 RAX, i64i8imm:$imm), 0>;
+def : InstAlias<"xor{q}\t{$imm, %rax|rax, $imm}", (XOR64ri8 RAX, i64i8imm:$imm), 0>;
diff --git a/contrib/llvm/lib/Target/X86/X86InstrMMX.td b/contrib/llvm/lib/Target/X86/X86InstrMMX.td
new file mode 100644
index 0000000..83f9b14
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrMMX.td
@@ -0,0 +1,674 @@
+//===-- X86InstrMMX.td - Describe the MMX Instruction Set --*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 MMX instruction set, defining the instructions,
+// and properties of the instructions which are needed for code generation,
+// machine code emission, and analysis.
+//
+// All instructions that use MMX should be in this file, even if they also use
+// SSE.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MMX Multiclasses
+//===----------------------------------------------------------------------===//
+
+let Sched = WriteVecALU in {
+def MMX_INTALU_ITINS : OpndItins<
+ IIC_MMX_ALU_RR, IIC_MMX_ALU_RM
+>;
+
+def MMX_INTALUQ_ITINS : OpndItins<
+ IIC_MMX_ALUQ_RR, IIC_MMX_ALUQ_RM
+>;
+
+def MMX_PHADDSUBW : OpndItins<
+ IIC_MMX_PHADDSUBW_RR, IIC_MMX_PHADDSUBW_RM
+>;
+
+def MMX_PHADDSUBD : OpndItins<
+ IIC_MMX_PHADDSUBD_RR, IIC_MMX_PHADDSUBD_RM
+>;
+}
+
+let Sched = WriteVecLogic in
+def MMX_INTALU_ITINS_VECLOGICSCHED : OpndItins<
+ IIC_MMX_ALU_RR, IIC_MMX_ALU_RM
+>;
+
+let Sched = WriteVecIMul in
+def MMX_PMUL_ITINS : OpndItins<
+ IIC_MMX_PMUL, IIC_MMX_PMUL
+>;
+
+let Sched = WriteVecIMul in {
+def MMX_PSADBW_ITINS : OpndItins<
+ IIC_MMX_PSADBW, IIC_MMX_PSADBW
+>;
+
+def MMX_MISC_FUNC_ITINS : OpndItins<
+ IIC_MMX_MISC_FUNC_MEM, IIC_MMX_MISC_FUNC_REG
+>;
+}
+
+def MMX_SHIFT_ITINS : ShiftOpndItins<
+ IIC_MMX_SHIFT_RR, IIC_MMX_SHIFT_RM, IIC_MMX_SHIFT_RI
+>;
+
+let Sched = WriteShuffle in {
+def MMX_UNPCK_H_ITINS : OpndItins<
+ IIC_MMX_UNPCK_H_RR, IIC_MMX_UNPCK_H_RM
+>;
+
+def MMX_UNPCK_L_ITINS : OpndItins<
+ IIC_MMX_UNPCK_L, IIC_MMX_UNPCK_L
+>;
+
+def MMX_PCK_ITINS : OpndItins<
+ IIC_MMX_PCK_RR, IIC_MMX_PCK_RM
+>;
+
+def MMX_PSHUF_ITINS : OpndItins<
+ IIC_MMX_PSHUF, IIC_MMX_PSHUF
+>;
+} // Sched
+
+let Sched = WriteCvtF2I in {
+def MMX_CVT_PD_ITINS : OpndItins<
+ IIC_MMX_CVT_PD_RR, IIC_MMX_CVT_PD_RM
+>;
+
+def MMX_CVT_PS_ITINS : OpndItins<
+ IIC_MMX_CVT_PS_RR, IIC_MMX_CVT_PS_RM
+>;
+}
+
+let Constraints = "$src1 = $dst" in {
+ // MMXI_binop_rm_int - Simple MMX binary operator based on intrinsic.
+ // When this is cleaned up, remove the FIXME from X86RecognizableInstr.cpp.
+ multiclass MMXI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId,
+ OpndItins itins, bit Commutable = 0> {
+ def irr : MMXI<opc, MRMSrcReg, (outs VR64:$dst),
+ (ins VR64:$src1, VR64:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2))], itins.rr>,
+ Sched<[itins.Sched]> {
+ let isCommutable = Commutable;
+ }
+ def irm : MMXI<opc, MRMSrcMem, (outs VR64:$dst),
+ (ins VR64:$src1, i64mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR64:$dst, (IntId VR64:$src1,
+ (bitconvert (load_mmx addr:$src2))))],
+ itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ }
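+  // Instantiations of this multiclass produce a register form suffixed "irr"
+  // and a load-folding form suffixed "irm" (e.g. MMX_PADDBirr/MMX_PADDBirm).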
+
+ multiclass MMXI_binop_rmi_int<bits<8> opc, bits<8> opc2, Format ImmForm,
+ string OpcodeStr, Intrinsic IntId,
+ Intrinsic IntId2, ShiftOpndItins itins> {
+ def rr : MMXI<opc, MRMSrcReg, (outs VR64:$dst),
+ (ins VR64:$src1, VR64:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2))], itins.rr>,
+ Sched<[WriteVecShift]>;
+ def rm : MMXI<opc, MRMSrcMem, (outs VR64:$dst),
+ (ins VR64:$src1, i64mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR64:$dst, (IntId VR64:$src1,
+ (bitconvert (load_mmx addr:$src2))))],
+ itins.rm>, Sched<[WriteVecShiftLd, ReadAfterLd]>;
+ def ri : MMXIi8<opc2, ImmForm, (outs VR64:$dst),
+ (ins VR64:$src1, i32u8imm:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR64:$dst, (IntId2 VR64:$src1, imm:$src2))], itins.ri>,
+ Sched<[WriteVecShift]>;
+ }
+}
+
+/// Unary MMX instructions requiring SSSE3.
+multiclass SS3I_unop_rm_int_mm<bits<8> opc, string OpcodeStr,
+ Intrinsic IntId64, OpndItins itins> {
+ def rr64 : MMXSS38I<opc, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR64:$dst, (IntId64 VR64:$src))], itins.rr>,
+ Sched<[itins.Sched]>;
+
+ def rm64 : MMXSS38I<opc, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR64:$dst,
+ (IntId64 (bitconvert (memopmmx addr:$src))))],
+ itins.rm>, Sched<[itins.Sched.Folded]>;
+}
+
+/// Binary MMX instructions requiring SSSE3.
+let ImmT = NoImm, Constraints = "$src1 = $dst" in {
+multiclass SS3I_binop_rm_int_mm<bits<8> opc, string OpcodeStr,
+ Intrinsic IntId64, OpndItins itins> {
+ let isCommutable = 0 in
+ def rr64 : MMXSS38I<opc, MRMSrcReg, (outs VR64:$dst),
+ (ins VR64:$src1, VR64:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR64:$dst, (IntId64 VR64:$src1, VR64:$src2))], itins.rr>,
+ Sched<[itins.Sched]>;
+ def rm64 : MMXSS38I<opc, MRMSrcMem, (outs VR64:$dst),
+ (ins VR64:$src1, i64mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR64:$dst,
+ (IntId64 VR64:$src1,
+ (bitconvert (memopmmx addr:$src2))))], itins.rm>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+}
+
+/// PALIGN MMX instructions (require SSSE3).
+multiclass ssse3_palign_mm<string asm, Intrinsic IntId> {
+ def R64irr : MMXSS3AI<0x0F, MRMSrcReg, (outs VR64:$dst),
+ (ins VR64:$src1, VR64:$src2, u8imm:$src3),
+ !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2, (i8 imm:$src3)))]>,
+ Sched<[WriteShuffle]>;
+ def R64irm : MMXSS3AI<0x0F, MRMSrcMem, (outs VR64:$dst),
+ (ins VR64:$src1, i64mem:$src2, u8imm:$src3),
+ !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR64:$dst, (IntId VR64:$src1,
+ (bitconvert (load_mmx addr:$src2)), (i8 imm:$src3)))]>,
+ Sched<[WriteShuffleLd, ReadAfterLd]>;
+}
+
+multiclass sse12_cvt_pint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
+ Intrinsic Int, X86MemOperand x86memop, PatFrag ld_frag,
+ string asm, OpndItins itins, Domain d> {
+ def irr : MMXPI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm,
+ [(set DstRC:$dst, (Int SrcRC:$src))], itins.rr, d>,
+ Sched<[itins.Sched]>;
+ def irm : MMXPI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm,
+ [(set DstRC:$dst, (Int (ld_frag addr:$src)))], itins.rm, d>,
+ Sched<[itins.Sched.Folded]>;
+}
+
+multiclass sse12_cvt_pint_3addr<bits<8> opc, RegisterClass SrcRC,
+ RegisterClass DstRC, Intrinsic Int, X86MemOperand x86memop,
+ PatFrag ld_frag, string asm, Domain d> {
+ def irr : MMXPI<opc, MRMSrcReg, (outs DstRC:$dst),
+ (ins DstRC:$src1, SrcRC:$src2), asm,
+ [(set DstRC:$dst, (Int DstRC:$src1, SrcRC:$src2))],
+ NoItinerary, d>, Sched<[WriteCvtI2F]>;
+ def irm : MMXPI<opc, MRMSrcMem, (outs DstRC:$dst),
+ (ins DstRC:$src1, x86memop:$src2), asm,
+ [(set DstRC:$dst, (Int DstRC:$src1, (ld_frag addr:$src2)))],
+ NoItinerary, d>, Sched<[WriteCvtI2FLd]>;
+}
+
+//===----------------------------------------------------------------------===//
+// MMX EMMS Instruction
+//===----------------------------------------------------------------------===//
+
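+// EMMS marks the x87 register stack empty (clears the FP tag word) so that
+// x87 code can follow MMX code; the intrinsic makes that state change
+// visible to the compiler.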
+def MMX_EMMS : MMXI<0x77, RawFrm, (outs), (ins), "emms",
+ [(int_x86_mmx_emms)], IIC_MMX_EMMS>;
+
+//===----------------------------------------------------------------------===//
+// MMX Scalar Instructions
+//===----------------------------------------------------------------------===//
+
+// Data Transfer Instructions
+def MMX_MOVD64rr : MMXI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR32:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set VR64:$dst,
+ (x86mmx (scalar_to_vector GR32:$src)))],
+ IIC_MMX_MOV_MM_RM>, Sched<[WriteMove]>;
+def MMX_MOVD64rm : MMXI<0x6E, MRMSrcMem, (outs VR64:$dst), (ins i32mem:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set VR64:$dst,
+ (x86mmx (scalar_to_vector (loadi32 addr:$src))))],
+ IIC_MMX_MOV_MM_RM>, Sched<[WriteLoad]>;
+
+let Predicates = [HasMMX] in {
+ let AddedComplexity = 15 in
+ def : Pat<(x86mmx (MMX_X86movw2d GR32:$src)),
+ (MMX_MOVD64rr GR32:$src)>;
+ let AddedComplexity = 20 in
+ def : Pat<(x86mmx (MMX_X86movw2d (loadi32 addr:$src))),
+ (MMX_MOVD64rm addr:$src)>;
+}
+
+let mayStore = 1 in
+def MMX_MOVD64mr : MMXI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR64:$src),
+ "movd\t{$src, $dst|$dst, $src}", [], IIC_MMX_MOV_MM_RM>,
+ Sched<[WriteStore]>;
+
+def MMX_MOVD64grr : MMXI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR64:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst,
+ (MMX_X86movd2w (x86mmx VR64:$src)))],
+ IIC_MMX_MOV_REG_MM>, Sched<[WriteMove]>;
+
+let isBitcast = 1 in
+def MMX_MOVD64to64rr : MMXRI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR64:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set VR64:$dst, (bitconvert GR64:$src))],
+ IIC_MMX_MOV_MM_RM>, Sched<[WriteMove]>;
+
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
+def MMX_MOVD64to64rm : MMXRI<0x6E, MRMSrcMem, (outs VR64:$dst),
+ (ins i64mem:$src), "movd\t{$src, $dst|$dst, $src}",
+ [], IIC_MMX_MOVQ_RM>, Sched<[WriteLoad]>;
+
+// These are 64-bit moves, but since the OS X assembler doesn't recognize a
+// register-register movq, we write them as movd.
+let SchedRW = [WriteMove], isBitcast = 1 in {
+def MMX_MOVD64from64rr : MMXRI<0x7E, MRMDestReg,
+ (outs GR64:$dst), (ins VR64:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst,
+ (bitconvert VR64:$src))], IIC_MMX_MOV_REG_MM>;
+let hasSideEffects = 0 in
+def MMX_MOVQ64rr : MMXI<0x6F, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src),
+ "movq\t{$src, $dst|$dst, $src}", [],
+ IIC_MMX_MOVQ_RR>;
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
+def MMX_MOVQ64rr_REV : MMXI<0x7F, MRMDestReg, (outs VR64:$dst), (ins VR64:$src),
+ "movq\t{$src, $dst|$dst, $src}", [],
+ IIC_MMX_MOVQ_RR>;
+}
+} // SchedRW
+
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
+def MMX_MOVD64from64rm : MMXRI<0x7E, MRMDestMem,
+ (outs i64mem:$dst), (ins VR64:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [], IIC_MMX_MOV_REG_MM>, Sched<[WriteStore]>;
+
+let SchedRW = [WriteLoad] in {
+let canFoldAsLoad = 1 in
+def MMX_MOVQ64rm : MMXI<0x6F, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(set VR64:$dst, (load_mmx addr:$src))],
+ IIC_MMX_MOVQ_RM>;
+} // SchedRW
+let SchedRW = [WriteStore] in
+def MMX_MOVQ64mr : MMXI<0x7F, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(store (x86mmx VR64:$src), addr:$dst)],
+ IIC_MMX_MOVQ_RM>;
+
+let SchedRW = [WriteMove] in {
+def MMX_MOVDQ2Qrr : MMXSDIi8<0xD6, MRMSrcReg, (outs VR64:$dst),
+ (ins VR128:$src), "movdq2q\t{$src, $dst|$dst, $src}",
+ [(set VR64:$dst,
+ (x86mmx (bitconvert
+ (i64 (extractelt (v2i64 VR128:$src),
+ (iPTR 0))))))],
+ IIC_MMX_MOVQ_RR>;
+
+def MMX_MOVQ2DQrr : MMXS2SIi8<0xD6, MRMSrcReg, (outs VR128:$dst),
+ (ins VR64:$src), "movq2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v2i64
+ (scalar_to_vector
+ (i64 (bitconvert (x86mmx VR64:$src))))))],
+ IIC_MMX_MOVQ_RR>;
+
+let isCodeGenOnly = 1, hasSideEffects = 1 in {
+def MMX_MOVQ2FR64rr: MMXS2SIi8<0xD6, MRMSrcReg, (outs FR64:$dst),
+ (ins VR64:$src), "movq2dq\t{$src, $dst|$dst, $src}",
+ [], IIC_MMX_MOVQ_RR>;
+
+def MMX_MOVFR642Qrr: MMXSDIi8<0xD6, MRMSrcReg, (outs VR64:$dst),
+ (ins FR64:$src), "movdq2q\t{$src, $dst|$dst, $src}",
+ [], IIC_MMX_MOVQ_RR>;
+}
+} // SchedRW
+
+let Predicates = [HasSSE1] in
+def MMX_MOVNTQmr : MMXI<0xE7, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src),
+ "movntq\t{$src, $dst|$dst, $src}",
+ [(int_x86_mmx_movnt_dq addr:$dst, VR64:$src)],
+ IIC_MMX_MOVQ_RM>, Sched<[WriteStore]>;
+
+let Predicates = [HasMMX] in {
+ let AddedComplexity = 15 in
+ // movd to MMX register zero-extends
+ def : Pat<(x86mmx (X86vzmovl (x86mmx (scalar_to_vector GR32:$src)))),
+ (MMX_MOVD64rr GR32:$src)>;
+ let AddedComplexity = 20 in
+ def : Pat<(x86mmx (X86vzmovl (x86mmx (scalar_to_vector (loadi32 addr:$src))))),
+ (MMX_MOVD64rm addr:$src)>;
+}
+
+// Arithmetic Instructions
+defm MMX_PABSB : SS3I_unop_rm_int_mm<0x1C, "pabsb", int_x86_ssse3_pabs_b,
+ MMX_INTALU_ITINS>;
+defm MMX_PABSW : SS3I_unop_rm_int_mm<0x1D, "pabsw", int_x86_ssse3_pabs_w,
+ MMX_INTALU_ITINS>;
+defm MMX_PABSD : SS3I_unop_rm_int_mm<0x1E, "pabsd", int_x86_ssse3_pabs_d,
+ MMX_INTALU_ITINS>;
+// -- Addition
+defm MMX_PADDB : MMXI_binop_rm_int<0xFC, "paddb", int_x86_mmx_padd_b,
+ MMX_INTALU_ITINS, 1>;
+defm MMX_PADDW : MMXI_binop_rm_int<0xFD, "paddw", int_x86_mmx_padd_w,
+ MMX_INTALU_ITINS, 1>;
+defm MMX_PADDD : MMXI_binop_rm_int<0xFE, "paddd", int_x86_mmx_padd_d,
+ MMX_INTALU_ITINS, 1>;
+let Predicates = [HasSSE2] in
+defm MMX_PADDQ : MMXI_binop_rm_int<0xD4, "paddq", int_x86_mmx_padd_q,
+ MMX_INTALUQ_ITINS, 1>;
+defm MMX_PADDSB : MMXI_binop_rm_int<0xEC, "paddsb" , int_x86_mmx_padds_b,
+ MMX_INTALU_ITINS, 1>;
+defm MMX_PADDSW : MMXI_binop_rm_int<0xED, "paddsw" , int_x86_mmx_padds_w,
+ MMX_INTALU_ITINS, 1>;
+
+defm MMX_PADDUSB : MMXI_binop_rm_int<0xDC, "paddusb", int_x86_mmx_paddus_b,
+ MMX_INTALU_ITINS, 1>;
+defm MMX_PADDUSW : MMXI_binop_rm_int<0xDD, "paddusw", int_x86_mmx_paddus_w,
+ MMX_INTALU_ITINS, 1>;
+
+defm MMX_PHADDW : SS3I_binop_rm_int_mm<0x01, "phaddw", int_x86_ssse3_phadd_w,
+ MMX_PHADDSUBW>;
+defm MMX_PHADD : SS3I_binop_rm_int_mm<0x02, "phaddd", int_x86_ssse3_phadd_d,
+ MMX_PHADDSUBD>;
+defm MMX_PHADDSW : SS3I_binop_rm_int_mm<0x03, "phaddsw",int_x86_ssse3_phadd_sw,
+ MMX_PHADDSUBW>;
+
+
+// -- Subtraction
+defm MMX_PSUBB : MMXI_binop_rm_int<0xF8, "psubb", int_x86_mmx_psub_b,
+ MMX_INTALU_ITINS>;
+defm MMX_PSUBW : MMXI_binop_rm_int<0xF9, "psubw", int_x86_mmx_psub_w,
+ MMX_INTALU_ITINS>;
+defm MMX_PSUBD : MMXI_binop_rm_int<0xFA, "psubd", int_x86_mmx_psub_d,
+ MMX_INTALU_ITINS>;
+let Predicates = [HasSSE2] in
+defm MMX_PSUBQ : MMXI_binop_rm_int<0xFB, "psubq", int_x86_mmx_psub_q,
+ MMX_INTALUQ_ITINS>;
+
+defm MMX_PSUBSB : MMXI_binop_rm_int<0xE8, "psubsb" , int_x86_mmx_psubs_b,
+ MMX_INTALU_ITINS>;
+defm MMX_PSUBSW : MMXI_binop_rm_int<0xE9, "psubsw" , int_x86_mmx_psubs_w,
+ MMX_INTALU_ITINS>;
+
+defm MMX_PSUBUSB : MMXI_binop_rm_int<0xD8, "psubusb", int_x86_mmx_psubus_b,
+ MMX_INTALU_ITINS>;
+defm MMX_PSUBUSW : MMXI_binop_rm_int<0xD9, "psubusw", int_x86_mmx_psubus_w,
+ MMX_INTALU_ITINS>;
+
+defm MMX_PHSUBW : SS3I_binop_rm_int_mm<0x05, "phsubw", int_x86_ssse3_phsub_w,
+ MMX_PHADDSUBW>;
+defm MMX_PHSUBD : SS3I_binop_rm_int_mm<0x06, "phsubd", int_x86_ssse3_phsub_d,
+ MMX_PHADDSUBD>;
+defm MMX_PHSUBSW : SS3I_binop_rm_int_mm<0x07, "phsubsw",int_x86_ssse3_phsub_sw,
+ MMX_PHADDSUBW>;
+
+// -- Multiplication
+defm MMX_PMULLW : MMXI_binop_rm_int<0xD5, "pmullw", int_x86_mmx_pmull_w,
+ MMX_PMUL_ITINS, 1>;
+
+defm MMX_PMULHW : MMXI_binop_rm_int<0xE5, "pmulhw", int_x86_mmx_pmulh_w,
+ MMX_PMUL_ITINS, 1>;
+let Predicates = [HasSSE1] in
+defm MMX_PMULHUW : MMXI_binop_rm_int<0xE4, "pmulhuw", int_x86_mmx_pmulhu_w,
+ MMX_PMUL_ITINS, 1>;
+let Predicates = [HasSSE2] in
+defm MMX_PMULUDQ : MMXI_binop_rm_int<0xF4, "pmuludq", int_x86_mmx_pmulu_dq,
+ MMX_PMUL_ITINS, 1>;
+let isCommutable = 1 in
+defm MMX_PMULHRSW : SS3I_binop_rm_int_mm<0x0B, "pmulhrsw",
+ int_x86_ssse3_pmul_hr_sw, MMX_PMUL_ITINS>;
+
+// -- Miscellanea
+defm MMX_PMADDWD : MMXI_binop_rm_int<0xF5, "pmaddwd", int_x86_mmx_pmadd_wd,
+ MMX_PMUL_ITINS, 1>;
+
+defm MMX_PMADDUBSW : SS3I_binop_rm_int_mm<0x04, "pmaddubsw",
+ int_x86_ssse3_pmadd_ub_sw, MMX_PMUL_ITINS>;
+let Predicates = [HasSSE1] in {
+defm MMX_PAVGB : MMXI_binop_rm_int<0xE0, "pavgb", int_x86_mmx_pavg_b,
+ MMX_MISC_FUNC_ITINS, 1>;
+defm MMX_PAVGW : MMXI_binop_rm_int<0xE3, "pavgw", int_x86_mmx_pavg_w,
+ MMX_MISC_FUNC_ITINS, 1>;
+
+defm MMX_PMINUB : MMXI_binop_rm_int<0xDA, "pminub", int_x86_mmx_pminu_b,
+ MMX_MISC_FUNC_ITINS, 1>;
+defm MMX_PMINSW : MMXI_binop_rm_int<0xEA, "pminsw", int_x86_mmx_pmins_w,
+ MMX_MISC_FUNC_ITINS, 1>;
+
+defm MMX_PMAXUB : MMXI_binop_rm_int<0xDE, "pmaxub", int_x86_mmx_pmaxu_b,
+ MMX_MISC_FUNC_ITINS, 1>;
+defm MMX_PMAXSW : MMXI_binop_rm_int<0xEE, "pmaxsw", int_x86_mmx_pmaxs_w,
+ MMX_MISC_FUNC_ITINS, 1>;
+
+defm MMX_PSADBW : MMXI_binop_rm_int<0xF6, "psadbw", int_x86_mmx_psad_bw,
+ MMX_PSADBW_ITINS, 1>;
+}
+
+defm MMX_PSIGNB : SS3I_binop_rm_int_mm<0x08, "psignb", int_x86_ssse3_psign_b,
+ MMX_MISC_FUNC_ITINS>;
+defm MMX_PSIGNW : SS3I_binop_rm_int_mm<0x09, "psignw", int_x86_ssse3_psign_w,
+ MMX_MISC_FUNC_ITINS>;
+defm MMX_PSIGND : SS3I_binop_rm_int_mm<0x0A, "psignd", int_x86_ssse3_psign_d,
+ MMX_MISC_FUNC_ITINS>;
+let Constraints = "$src1 = $dst" in
+ defm MMX_PALIGN : ssse3_palign_mm<"palignr", int_x86_mmx_palignr_b>;
+
+// Logical Instructions
+defm MMX_PAND : MMXI_binop_rm_int<0xDB, "pand", int_x86_mmx_pand,
+ MMX_INTALU_ITINS_VECLOGICSCHED, 1>;
+defm MMX_POR : MMXI_binop_rm_int<0xEB, "por" , int_x86_mmx_por,
+ MMX_INTALU_ITINS_VECLOGICSCHED, 1>;
+defm MMX_PXOR : MMXI_binop_rm_int<0xEF, "pxor", int_x86_mmx_pxor,
+ MMX_INTALU_ITINS_VECLOGICSCHED, 1>;
+defm MMX_PANDN : MMXI_binop_rm_int<0xDF, "pandn", int_x86_mmx_pandn,
+ MMX_INTALU_ITINS_VECLOGICSCHED>;
+
+// Shift Instructions
+defm MMX_PSRLW : MMXI_binop_rmi_int<0xD1, 0x71, MRM2r, "psrlw",
+ int_x86_mmx_psrl_w, int_x86_mmx_psrli_w,
+ MMX_SHIFT_ITINS>;
+defm MMX_PSRLD : MMXI_binop_rmi_int<0xD2, 0x72, MRM2r, "psrld",
+ int_x86_mmx_psrl_d, int_x86_mmx_psrli_d,
+ MMX_SHIFT_ITINS>;
+defm MMX_PSRLQ : MMXI_binop_rmi_int<0xD3, 0x73, MRM2r, "psrlq",
+ int_x86_mmx_psrl_q, int_x86_mmx_psrli_q,
+ MMX_SHIFT_ITINS>;
+
+def : Pat<(int_x86_mmx_psrl_w VR64:$src1, (load_mvmmx addr:$src2)),
+ (MMX_PSRLWrm VR64:$src1, addr:$src2)>;
+def : Pat<(int_x86_mmx_psrl_d VR64:$src1, (load_mvmmx addr:$src2)),
+ (MMX_PSRLDrm VR64:$src1, addr:$src2)>;
+def : Pat<(int_x86_mmx_psrl_q VR64:$src1, (load_mvmmx addr:$src2)),
+ (MMX_PSRLQrm VR64:$src1, addr:$src2)>;
+
+defm MMX_PSLLW : MMXI_binop_rmi_int<0xF1, 0x71, MRM6r, "psllw",
+ int_x86_mmx_psll_w, int_x86_mmx_pslli_w,
+ MMX_SHIFT_ITINS>;
+defm MMX_PSLLD : MMXI_binop_rmi_int<0xF2, 0x72, MRM6r, "pslld",
+ int_x86_mmx_psll_d, int_x86_mmx_pslli_d,
+ MMX_SHIFT_ITINS>;
+defm MMX_PSLLQ : MMXI_binop_rmi_int<0xF3, 0x73, MRM6r, "psllq",
+ int_x86_mmx_psll_q, int_x86_mmx_pslli_q,
+ MMX_SHIFT_ITINS>;
+
+def : Pat<(int_x86_mmx_psll_w VR64:$src1, (load_mvmmx addr:$src2)),
+ (MMX_PSLLWrm VR64:$src1, addr:$src2)>;
+def : Pat<(int_x86_mmx_psll_d VR64:$src1, (load_mvmmx addr:$src2)),
+ (MMX_PSLLDrm VR64:$src1, addr:$src2)>;
+def : Pat<(int_x86_mmx_psll_q VR64:$src1, (load_mvmmx addr:$src2)),
+ (MMX_PSLLQrm VR64:$src1, addr:$src2)>;
+
+defm MMX_PSRAW : MMXI_binop_rmi_int<0xE1, 0x71, MRM4r, "psraw",
+ int_x86_mmx_psra_w, int_x86_mmx_psrai_w,
+ MMX_SHIFT_ITINS>;
+defm MMX_PSRAD : MMXI_binop_rmi_int<0xE2, 0x72, MRM4r, "psrad",
+ int_x86_mmx_psra_d, int_x86_mmx_psrai_d,
+ MMX_SHIFT_ITINS>;
+
+def : Pat<(int_x86_mmx_psra_w VR64:$src1, (load_mvmmx addr:$src2)),
+ (MMX_PSRAWrm VR64:$src1, addr:$src2)>;
+def : Pat<(int_x86_mmx_psra_d VR64:$src1, (load_mvmmx addr:$src2)),
+ (MMX_PSRADrm VR64:$src1, addr:$src2)>;
+
+// Comparison Instructions
+defm MMX_PCMPEQB : MMXI_binop_rm_int<0x74, "pcmpeqb", int_x86_mmx_pcmpeq_b,
+ MMX_INTALU_ITINS>;
+defm MMX_PCMPEQW : MMXI_binop_rm_int<0x75, "pcmpeqw", int_x86_mmx_pcmpeq_w,
+ MMX_INTALU_ITINS>;
+defm MMX_PCMPEQD : MMXI_binop_rm_int<0x76, "pcmpeqd", int_x86_mmx_pcmpeq_d,
+ MMX_INTALU_ITINS>;
+
+defm MMX_PCMPGTB : MMXI_binop_rm_int<0x64, "pcmpgtb", int_x86_mmx_pcmpgt_b,
+ MMX_INTALU_ITINS>;
+defm MMX_PCMPGTW : MMXI_binop_rm_int<0x65, "pcmpgtw", int_x86_mmx_pcmpgt_w,
+ MMX_INTALU_ITINS>;
+defm MMX_PCMPGTD : MMXI_binop_rm_int<0x66, "pcmpgtd", int_x86_mmx_pcmpgt_d,
+ MMX_INTALU_ITINS>;
+
+// -- Unpack Instructions
+defm MMX_PUNPCKHBW : MMXI_binop_rm_int<0x68, "punpckhbw",
+ int_x86_mmx_punpckhbw,
+ MMX_UNPCK_H_ITINS>;
+defm MMX_PUNPCKHWD : MMXI_binop_rm_int<0x69, "punpckhwd",
+ int_x86_mmx_punpckhwd,
+ MMX_UNPCK_H_ITINS>;
+defm MMX_PUNPCKHDQ : MMXI_binop_rm_int<0x6A, "punpckhdq",
+ int_x86_mmx_punpckhdq,
+ MMX_UNPCK_H_ITINS>;
+defm MMX_PUNPCKLBW : MMXI_binop_rm_int<0x60, "punpcklbw",
+ int_x86_mmx_punpcklbw,
+ MMX_UNPCK_L_ITINS>;
+defm MMX_PUNPCKLWD : MMXI_binop_rm_int<0x61, "punpcklwd",
+ int_x86_mmx_punpcklwd,
+ MMX_UNPCK_L_ITINS>;
+defm MMX_PUNPCKLDQ : MMXI_binop_rm_int<0x62, "punpckldq",
+ int_x86_mmx_punpckldq,
+ MMX_UNPCK_L_ITINS>;
+
+// -- Pack Instructions
+defm MMX_PACKSSWB : MMXI_binop_rm_int<0x63, "packsswb", int_x86_mmx_packsswb,
+ MMX_PCK_ITINS>;
+defm MMX_PACKSSDW : MMXI_binop_rm_int<0x6B, "packssdw", int_x86_mmx_packssdw,
+ MMX_PCK_ITINS>;
+defm MMX_PACKUSWB : MMXI_binop_rm_int<0x67, "packuswb", int_x86_mmx_packuswb,
+ MMX_PCK_ITINS>;
+
+// -- Shuffle Instructions
+defm MMX_PSHUFB : SS3I_binop_rm_int_mm<0x00, "pshufb", int_x86_ssse3_pshuf_b,
+ MMX_PSHUF_ITINS>;
+
+def MMX_PSHUFWri : MMXIi8<0x70, MRMSrcReg,
+ (outs VR64:$dst), (ins VR64:$src1, u8imm:$src2),
+ "pshufw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR64:$dst,
+ (int_x86_sse_pshuf_w VR64:$src1, imm:$src2))],
+ IIC_MMX_PSHUF>, Sched<[WriteShuffle]>;
+def MMX_PSHUFWmi : MMXIi8<0x70, MRMSrcMem,
+ (outs VR64:$dst), (ins i64mem:$src1, u8imm:$src2),
+ "pshufw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR64:$dst,
+ (int_x86_sse_pshuf_w (load_mmx addr:$src1),
+ imm:$src2))],
+ IIC_MMX_PSHUF>, Sched<[WriteShuffleLd]>;
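+// For illustration: the immediate selects source words, two bits per
+// destination word, so "pshufw $0x1B, %mm0, %mm1" (0x1B = 0b00011011)
+// writes the four words of %mm0 into %mm1 in reverse order.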
+
+// -- Conversion Instructions
+defm MMX_CVTPS2PI : sse12_cvt_pint<0x2D, VR128, VR64, int_x86_sse_cvtps2pi,
+ f64mem, load, "cvtps2pi\t{$src, $dst|$dst, $src}",
+ MMX_CVT_PS_ITINS, SSEPackedSingle>, PS;
+defm MMX_CVTPD2PI : sse12_cvt_pint<0x2D, VR128, VR64, int_x86_sse_cvtpd2pi,
+ f128mem, memop, "cvtpd2pi\t{$src, $dst|$dst, $src}",
+ MMX_CVT_PD_ITINS, SSEPackedDouble>, PD;
+defm MMX_CVTTPS2PI : sse12_cvt_pint<0x2C, VR128, VR64, int_x86_sse_cvttps2pi,
+ f64mem, load, "cvttps2pi\t{$src, $dst|$dst, $src}",
+ MMX_CVT_PS_ITINS, SSEPackedSingle>, PS;
+defm MMX_CVTTPD2PI : sse12_cvt_pint<0x2C, VR128, VR64, int_x86_sse_cvttpd2pi,
+ f128mem, memop, "cvttpd2pi\t{$src, $dst|$dst, $src}",
+ MMX_CVT_PD_ITINS, SSEPackedDouble>, PD;
+defm MMX_CVTPI2PD : sse12_cvt_pint<0x2A, VR64, VR128, int_x86_sse_cvtpi2pd,
+ i64mem, load, "cvtpi2pd\t{$src, $dst|$dst, $src}",
+ MMX_CVT_PD_ITINS, SSEPackedDouble>, PD;
+let Constraints = "$src1 = $dst" in {
+ defm MMX_CVTPI2PS : sse12_cvt_pint_3addr<0x2A, VR64, VR128,
+ int_x86_sse_cvtpi2ps,
+ i64mem, load, "cvtpi2ps\t{$src2, $dst|$dst, $src2}",
+ SSEPackedSingle>, PS;
+}
+
+// Extract / Insert
+let Predicates = [HasSSE1] in
+def MMX_PEXTRWirri: MMXIi8<0xC5, MRMSrcReg,
+ (outs GR32orGR64:$dst), (ins VR64:$src1, i32u8imm:$src2),
+ "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR32orGR64:$dst, (int_x86_mmx_pextr_w VR64:$src1,
+ imm:$src2))],
+ IIC_MMX_PEXTR>, Sched<[WriteShuffle]>;
+let Constraints = "$src1 = $dst" in {
+let Predicates = [HasSSE1] in {
+ def MMX_PINSRWirri : MMXIi8<0xC4, MRMSrcReg,
+ (outs VR64:$dst),
+ (ins VR64:$src1, GR32orGR64:$src2, i32u8imm:$src3),
+ "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set VR64:$dst, (int_x86_mmx_pinsr_w VR64:$src1,
+ GR32orGR64:$src2, imm:$src3))],
+ IIC_MMX_PINSRW>, Sched<[WriteShuffle]>;
+
+ def MMX_PINSRWirmi : MMXIi8<0xC4, MRMSrcMem,
+ (outs VR64:$dst),
+ (ins VR64:$src1, i16mem:$src2, i32u8imm:$src3),
+ "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set VR64:$dst, (int_x86_mmx_pinsr_w VR64:$src1,
+ (i32 (anyext (loadi16 addr:$src2))),
+ imm:$src3))],
+ IIC_MMX_PINSRW>, Sched<[WriteShuffleLd, ReadAfterLd]>;
+}
+}
+
+// Mask creation
+let Predicates = [HasSSE1] in
+def MMX_PMOVMSKBrr : MMXI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
+ (ins VR64:$src),
+ "pmovmskb\t{$src, $dst|$dst, $src}",
+ [(set GR32orGR64:$dst,
+ (int_x86_mmx_pmovmskb VR64:$src))]>;
+
+
+// Low word of XMM to MMX.
+def MMX_X86movdq2q : SDNode<"X86ISD::MOVDQ2Q", SDTypeProfile<1, 1,
+ [SDTCisVT<0, x86mmx>, SDTCisVT<1, v2i64>]>>;
+
+def : Pat<(x86mmx (MMX_X86movdq2q VR128:$src)),
+ (x86mmx (MMX_MOVDQ2Qrr VR128:$src))>;
+
+def : Pat<(x86mmx (MMX_X86movdq2q (loadv2i64 addr:$src))),
+ (x86mmx (MMX_MOVQ64rm addr:$src))>;
+
+// Misc.
+let SchedRW = [WriteShuffle] in {
+let Uses = [EDI], Predicates = [HasSSE1,Not64BitMode] in
+def MMX_MASKMOVQ : MMXI32<0xF7, MRMSrcReg, (outs), (ins VR64:$src, VR64:$mask),
+ "maskmovq\t{$mask, $src|$src, $mask}",
+ [(int_x86_mmx_maskmovq VR64:$src, VR64:$mask, EDI)],
+ IIC_MMX_MASKMOV>;
+let Uses = [RDI], Predicates = [HasSSE1,In64BitMode] in
+def MMX_MASKMOVQ64: MMXI64<0xF7, MRMSrcReg, (outs), (ins VR64:$src, VR64:$mask),
+ "maskmovq\t{$mask, $src|$src, $mask}",
+ [(int_x86_mmx_maskmovq VR64:$src, VR64:$mask, RDI)],
+ IIC_MMX_MASKMOV>;
+}
+
+// 64-bit bit convert.
+let Predicates = [HasSSE2] in {
+def : Pat<(f64 (bitconvert (x86mmx VR64:$src))),
+ (MMX_MOVQ2FR64rr VR64:$src)>;
+def : Pat<(x86mmx (bitconvert (f64 FR64:$src))),
+ (MMX_MOVFR642Qrr FR64:$src)>;
+}
+
+
diff --git a/contrib/llvm/lib/Target/X86/X86InstrMPX.td b/contrib/llvm/lib/Target/X86/X86InstrMPX.td
new file mode 100644
index 0000000..31608cd
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrMPX.td
@@ -0,0 +1,70 @@
+//===-- X86InstrMPX.td - MPX Instruction Set ---------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 MPX instruction set, defining the
+// instructions and the instruction properties that are needed for code
+// generation, machine code emission, and analysis.
+//
+//===----------------------------------------------------------------------===//
+
+multiclass mpx_bound_make<bits<8> opc, string OpcodeStr> {
+ def 32rm: I<opc, MRMSrcMem, (outs BNDR:$dst), (ins i32mem:$src),
+ OpcodeStr#" \t{$src, $dst|$dst, $src}", []>,
+ Requires<[HasMPX, Not64BitMode]>;
+ def 64rm: RI<opc, MRMSrcMem, (outs BNDR:$dst), (ins i64mem:$src),
+ OpcodeStr#" \t{$src, $dst|$dst, $src}", []>,
+ Requires<[HasMPX, In64BitMode]>;
+}
+
+defm BNDMK : mpx_bound_make<0x1B, "bndmk">, XS;
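+// The defm above expands to BNDMK32rm and BNDMK64rm; a typical use is
+// (roughly) "bndmk (%rax,%rcx), %bnd0", which derives the lower and upper
+// bounds in %bnd0 from the effective-address computation.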
+
+multiclass mpx_bound_check<bits<8> opc, string OpcodeStr> {
+ def 32rm: I<opc, MRMSrcMem, (outs), (ins BNDR:$src1, i32mem:$src2),
+ OpcodeStr#" \t{$src2, $src1|$src1, $src2}", []>,
+ Requires<[HasMPX, Not64BitMode]>;
+ def 64rm: RI<opc, MRMSrcMem, (outs), (ins BNDR:$src1, i64mem:$src2),
+ OpcodeStr#" \t{$src2, $src1|$src1, $src2}", []>,
+ Requires<[HasMPX, In64BitMode]>;
+ def 32rr: I<opc, MRMSrcReg, (outs), (ins BNDR:$src1, GR32:$src2),
+ OpcodeStr#" \t{$src2, $src1|$src1, $src2}", []>,
+ Requires<[HasMPX, Not64BitMode]>;
+ def 64rr: RI<opc, MRMSrcReg, (outs), (ins BNDR:$src1, GR64:$src2),
+ OpcodeStr#" \t{$src2, $src1|$src1, $src2}", []>,
+ Requires<[HasMPX, In64BitMode]>;
+}
+defm BNDCL : mpx_bound_check<0x1A, "bndcl">, XS;
+defm BNDCU : mpx_bound_check<0x1A, "bndcu">, XD;
+defm BNDCN : mpx_bound_check<0x1B, "bndcn">, XD;
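+// Each defm above yields rr/rm variants for both modes (e.g. BNDCL64rr,
+// BNDCL64rm); a typical check is "bndcl (%rax), %bnd0", comparing an address
+// against the lower bound in %bnd0 (bndcu/bndcn check the upper bound).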
+
+def BNDMOVRMrr : I<0x1A, MRMSrcReg, (outs BNDR:$dst), (ins BNDR:$src),
+ "bndmov \t{$src, $dst|$dst, $src}", []>, PD,
+ Requires<[HasMPX]>;
+def BNDMOVRM32rm : I<0x1A, MRMSrcMem, (outs BNDR:$dst), (ins i64mem:$src),
+ "bndmov \t{$src, $dst|$dst, $src}", []>, PD,
+ Requires<[HasMPX, Not64BitMode]>;
+def BNDMOVRM64rm : RI<0x1A, MRMSrcMem, (outs BNDR:$dst), (ins i128mem:$src),
+ "bndmov \t{$src, $dst|$dst, $src}", []>, PD,
+ Requires<[HasMPX, In64BitMode]>;
+
+def BNDMOVMRrr : I<0x1B, MRMDestReg, (outs BNDR:$dst), (ins BNDR:$src),
+ "bndmov \t{$src, $dst|$dst, $src}", []>, PD,
+ Requires<[HasMPX]>;
+def BNDMOVMR32mr : I<0x1B, MRMDestMem, (outs i64mem:$dst), (ins BNDR:$src),
+ "bndmov \t{$src, $dst|$dst, $src}", []>, PD,
+ Requires<[HasMPX, Not64BitMode]>;
+def BNDMOVMR64mr : RI<0x1B, MRMDestMem, (outs i128mem:$dst), (ins BNDR:$src),
+ "bndmov \t{$src, $dst|$dst, $src}", []>, PD,
+ Requires<[HasMPX, In64BitMode]>;
+
+def BNDSTXmr: I<0x1B, MRMDestMem, (outs), (ins i64mem:$dst, BNDR:$src),
+ "bndstx \t{$src, $dst|$dst, $src}", []>, PS,
+ Requires<[HasMPX]>;
+def BNDLDXrm: I<0x1A, MRMSrcMem, (outs BNDR:$dst), (ins i64mem:$src),
+ "bndldx \t{$src, $dst|$dst, $src}", []>, PS,
+ Requires<[HasMPX]>;
diff --git a/contrib/llvm/lib/Target/X86/X86InstrSGX.td b/contrib/llvm/lib/Target/X86/X86InstrSGX.td
new file mode 100644
index 0000000..84119ad
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrSGX.td
@@ -0,0 +1,24 @@
+//===-- X86InstrSGX.td - SGX Instruction Set Extension -----*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the instructions that make up the Intel SGX instruction
+// set.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// SGX instructions
+
+// ENCLS - Execute an Enclave System Function of Specified Leaf Number
+def ENCLS : I<0x01, MRM_CF, (outs), (ins),
+ "encls", []>, TB;
+
+// ENCLU - Execute an Enclave User Function of Specified Leaf Number
+def ENCLU : I<0x01, MRM_D7, (outs), (ins),
+ "enclu", []>, TB;
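+// Both instructions dispatch on a leaf number in EAX (e.g. EAX = 2 selects
+// the EENTER leaf of ENCLU); no explicit operands are modeled here because
+// register usage varies per leaf function.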
diff --git a/contrib/llvm/lib/Target/X86/X86InstrSSE.td b/contrib/llvm/lib/Target/X86/X86InstrSSE.td
new file mode 100644
index 0000000..624b931
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrSSE.td
@@ -0,0 +1,8944 @@
+//===-- X86InstrSSE.td - SSE Instruction Set ---------------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 SSE instruction set, defining the instructions
+// and the instruction properties that are needed for code generation,
+// machine code emission, and analysis.
+//
+//===----------------------------------------------------------------------===//
+
+class OpndItins<InstrItinClass arg_rr, InstrItinClass arg_rm> {
+ InstrItinClass rr = arg_rr;
+ InstrItinClass rm = arg_rm;
+ // InstrSchedModel info.
+ X86FoldableSchedWrite Sched = WriteFAdd;
+}
+
+class SizeItins<OpndItins arg_s, OpndItins arg_d> {
+ OpndItins s = arg_s;
+ OpndItins d = arg_d;
+}
+
+
+class ShiftOpndItins<InstrItinClass arg_rr, InstrItinClass arg_rm,
+ InstrItinClass arg_ri> {
+ InstrItinClass rr = arg_rr;
+ InstrItinClass rm = arg_rm;
+ InstrItinClass ri = arg_ri;
+}
+
+
+// scalar
+let Sched = WriteFAdd in {
+def SSE_ALU_F32S : OpndItins<
+ IIC_SSE_ALU_F32S_RR, IIC_SSE_ALU_F32S_RM
+>;
+
+def SSE_ALU_F64S : OpndItins<
+ IIC_SSE_ALU_F64S_RR, IIC_SSE_ALU_F64S_RM
+>;
+}
+
+def SSE_ALU_ITINS_S : SizeItins<
+ SSE_ALU_F32S, SSE_ALU_F64S
+>;
+
+let Sched = WriteFMul in {
+def SSE_MUL_F32S : OpndItins<
+  IIC_SSE_MUL_F32S_RR, IIC_SSE_MUL_F32S_RM
+>;
+
+def SSE_MUL_F64S : OpndItins<
+ IIC_SSE_MUL_F64S_RR, IIC_SSE_MUL_F64S_RM
+>;
+}
+
+def SSE_MUL_ITINS_S : SizeItins<
+ SSE_MUL_F32S, SSE_MUL_F64S
+>;
+
+let Sched = WriteFDiv in {
+def SSE_DIV_F32S : OpndItins<
+  IIC_SSE_DIV_F32S_RR, IIC_SSE_DIV_F32S_RM
+>;
+
+def SSE_DIV_F64S : OpndItins<
+ IIC_SSE_DIV_F64S_RR, IIC_SSE_DIV_F64S_RM
+>;
+}
+
+def SSE_DIV_ITINS_S : SizeItins<
+ SSE_DIV_F32S, SSE_DIV_F64S
+>;
+
+// parallel
+let Sched = WriteFAdd in {
+def SSE_ALU_F32P : OpndItins<
+ IIC_SSE_ALU_F32P_RR, IIC_SSE_ALU_F32P_RM
+>;
+
+def SSE_ALU_F64P : OpndItins<
+ IIC_SSE_ALU_F64P_RR, IIC_SSE_ALU_F64P_RM
+>;
+}
+
+def SSE_ALU_ITINS_P : SizeItins<
+ SSE_ALU_F32P, SSE_ALU_F64P
+>;
+
+let Sched = WriteFMul in {
+def SSE_MUL_F32P : OpndItins<
+  IIC_SSE_MUL_F32P_RR, IIC_SSE_MUL_F32P_RM
+>;
+
+def SSE_MUL_F64P : OpndItins<
+ IIC_SSE_MUL_F64P_RR, IIC_SSE_MUL_F64P_RM
+>;
+}
+
+def SSE_MUL_ITINS_P : SizeItins<
+ SSE_MUL_F32P, SSE_MUL_F64P
+>;
+
+let Sched = WriteFDiv in {
+def SSE_DIV_F32P : OpndItins<
+  IIC_SSE_DIV_F32P_RR, IIC_SSE_DIV_F32P_RM
+>;
+
+def SSE_DIV_F64P : OpndItins<
+ IIC_SSE_DIV_F64P_RR, IIC_SSE_DIV_F64P_RM
+>;
+}
+
+def SSE_DIV_ITINS_P : SizeItins<
+ SSE_DIV_F32P, SSE_DIV_F64P
+>;
+
+let Sched = WriteVecLogic in
+def SSE_VEC_BIT_ITINS_P : OpndItins<
+ IIC_SSE_BIT_P_RR, IIC_SSE_BIT_P_RM
+>;
+
+def SSE_BIT_ITINS_P : OpndItins<
+ IIC_SSE_BIT_P_RR, IIC_SSE_BIT_P_RM
+>;
+
+let Sched = WriteVecALU in {
+def SSE_INTALU_ITINS_P : OpndItins<
+ IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM
+>;
+
+def SSE_INTALUQ_ITINS_P : OpndItins<
+ IIC_SSE_INTALUQ_P_RR, IIC_SSE_INTALUQ_P_RM
+>;
+}
+
+let Sched = WriteVecIMul in
+def SSE_INTMUL_ITINS_P : OpndItins<
+ IIC_SSE_INTMUL_P_RR, IIC_SSE_INTMUL_P_RM
+>;
+
+def SSE_INTSHIFT_ITINS_P : ShiftOpndItins<
+ IIC_SSE_INTSH_P_RR, IIC_SSE_INTSH_P_RM, IIC_SSE_INTSH_P_RI
+>;
+
+def SSE_MOVA_ITINS : OpndItins<
+ IIC_SSE_MOVA_P_RR, IIC_SSE_MOVA_P_RM
+>;
+
+def SSE_MOVU_ITINS : OpndItins<
+ IIC_SSE_MOVU_P_RR, IIC_SSE_MOVU_P_RM
+>;
+
+def SSE_DPPD_ITINS : OpndItins<
+ IIC_SSE_DPPD_RR, IIC_SSE_DPPD_RM
+>;
+
+def SSE_DPPS_ITINS : OpndItins<
+  IIC_SSE_DPPS_RR, IIC_SSE_DPPS_RM
+>;
+
+def DEFAULT_ITINS : OpndItins<
+ IIC_ALU_NONMEM, IIC_ALU_MEM
+>;
+
+def SSE_EXTRACT_ITINS : OpndItins<
+ IIC_SSE_EXTRACTPS_RR, IIC_SSE_EXTRACTPS_RM
+>;
+
+def SSE_INSERT_ITINS : OpndItins<
+ IIC_SSE_INSERTPS_RR, IIC_SSE_INSERTPS_RM
+>;
+
+let Sched = WriteMPSAD in
+def SSE_MPSADBW_ITINS : OpndItins<
+ IIC_SSE_MPSADBW_RR, IIC_SSE_MPSADBW_RM
+>;
+
+let Sched = WriteVecIMul in
+def SSE_PMULLD_ITINS : OpndItins<
+ IIC_SSE_PMULLD_RR, IIC_SSE_PMULLD_RM
+>;
+
+// Definitions for backward compatibility.
+// The instructions mapped onto these definitions use a different itinerary
+// than the actual scheduling model.
+let Sched = WriteShuffle in
+def DEFAULT_ITINS_SHUFFLESCHED : OpndItins<
+ IIC_ALU_NONMEM, IIC_ALU_MEM
+>;
+
+let Sched = WriteVecIMul in
+def DEFAULT_ITINS_VECIMULSCHED : OpndItins<
+ IIC_ALU_NONMEM, IIC_ALU_MEM
+>;
+
+let Sched = WriteShuffle in
+def SSE_INTALU_ITINS_SHUFF_P : OpndItins<
+ IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM
+>;
+
+let Sched = WriteMPSAD in
+def DEFAULT_ITINS_MPSADSCHED : OpndItins<
+ IIC_ALU_NONMEM, IIC_ALU_MEM
+>;
+
+let Sched = WriteFBlend in
+def DEFAULT_ITINS_FBLENDSCHED : OpndItins<
+ IIC_ALU_NONMEM, IIC_ALU_MEM
+>;
+
+let Sched = WriteBlend in
+def DEFAULT_ITINS_BLENDSCHED : OpndItins<
+ IIC_ALU_NONMEM, IIC_ALU_MEM
+>;
+
+let Sched = WriteVarBlend in
+def DEFAULT_ITINS_VARBLENDSCHED : OpndItins<
+ IIC_ALU_NONMEM, IIC_ALU_MEM
+>;
+
+let Sched = WriteFBlend in
+def SSE_INTALU_ITINS_FBLEND_P : OpndItins<
+ IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM
+>;
+
+let Sched = WriteBlend in
+def SSE_INTALU_ITINS_BLEND_P : OpndItins<
+ IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM
+>;
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 Instructions Classes
+//===----------------------------------------------------------------------===//
+
+/// sse12_fp_scalar - SSE 1 & 2 scalar instructions class
+multiclass sse12_fp_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ RegisterClass RC, X86MemOperand x86memop,
+ Domain d, OpndItins itins, bit Is2Addr = 1> {
+ let isCommutable = 1 in {
+ def rr : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (OpNode RC:$src1, RC:$src2))], itins.rr, d>,
+ Sched<[itins.Sched]>;
+ }
+ def rm : SI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (OpNode RC:$src1, (load addr:$src2)))], itins.rm, d>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
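+// A minimal sketch of how this multiclass is used (operand values here are
+// illustrative):
+//   defm ADD : sse12_fp_scalar<0x58, "addss", fadd, FR32, f32mem,
+//                              SSEPackedSingle, SSE_ALU_F32S>, XS;
+// which instantiates an ADDrr register form and a load-folding ADDrm form.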
+
+/// sse12_fp_scalar_int - SSE 1 & 2 scalar instructions intrinsics class
+multiclass sse12_fp_scalar_int<bits<8> opc, string OpcodeStr, RegisterClass RC,
+ string asm, string SSEVer, string FPSizeStr,
+ Operand memopr, ComplexPattern mem_cpat,
+ Domain d, OpndItins itins, bit Is2Addr = 1> {
+let isCodeGenOnly = 1 in {
+ def rr_Int : SI_Int<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+ !if(Is2Addr,
+ !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (!cast<Intrinsic>(
+ !strconcat("int_x86_sse", SSEVer, "_", OpcodeStr, FPSizeStr))
+ RC:$src1, RC:$src2))], itins.rr, d>,
+ Sched<[itins.Sched]>;
+ def rm_Int : SI_Int<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, memopr:$src2),
+ !if(Is2Addr,
+ !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (!cast<Intrinsic>(!strconcat("int_x86_sse",
+ SSEVer, "_", OpcodeStr, FPSizeStr))
+ RC:$src1, mem_cpat:$src2))], itins.rm, d>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+}
+
+/// sse12_fp_packed - SSE 1 & 2 packed instructions class
+multiclass sse12_fp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ RegisterClass RC, ValueType vt,
+ X86MemOperand x86memop, PatFrag mem_frag,
+ Domain d, OpndItins itins, bit Is2Addr = 1> {
+ let isCommutable = 1 in
+ def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], itins.rr, d>,
+ Sched<[itins.Sched]>;
+ let mayLoad = 1 in
+ def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (OpNode RC:$src1, (mem_frag addr:$src2)))],
+ itins.rm, d>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
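+// Likewise, a packed use such as (illustrative)
+//   defm ADDPS : sse12_fp_packed<0x58, "addps", fadd, VR128, v4f32, f128mem,
+//                                memopv4f32, SSEPackedSingle, SSE_ALU_F32P>
+// yields ADDPSrr plus an ADDPSrm form that folds its second operand from
+// memory.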
+
+/// sse12_fp_packed_logical_rm - SSE 1 & 2 packed instructions class
+multiclass sse12_fp_packed_logical_rm<bits<8> opc, RegisterClass RC, Domain d,
+ string OpcodeStr, X86MemOperand x86memop,
+ list<dag> pat_rr, list<dag> pat_rm,
+ bit Is2Addr = 1> {
+ let isCommutable = 1, hasSideEffects = 0 in
+ def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ pat_rr, NoItinerary, d>,
+ Sched<[WriteVecLogic]>;
+ def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ pat_rm, NoItinerary, d>,
+ Sched<[WriteVecLogicLd, ReadAfterLd]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Non-instruction patterns
+//===----------------------------------------------------------------------===//
+
+// A vector extract of the first f32/f64 position is a subregister copy
+def : Pat<(f32 (extractelt (v4f32 VR128:$src), (iPTR 0))),
+ (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32)>;
+def : Pat<(f64 (extractelt (v2f64 VR128:$src), (iPTR 0))),
+ (COPY_TO_REGCLASS (v2f64 VR128:$src), FR64)>;
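+// For example, the f32 case means reading the low lane of an XMM register
+// directly as an FR32 value (what _mm_cvtss_f32 does at the C level), so no
+// shuffle or move instruction is emitted.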
+
+// A 128-bit subvector extract from the first 256-bit vector position
+// is a subregister copy that needs no instruction.
+def : Pat<(v4i32 (extract_subvector (v8i32 VR256:$src), (iPTR 0))),
+ (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm))>;
+def : Pat<(v4f32 (extract_subvector (v8f32 VR256:$src), (iPTR 0))),
+ (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm))>;
+
+def : Pat<(v2i64 (extract_subvector (v4i64 VR256:$src), (iPTR 0))),
+ (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm))>;
+def : Pat<(v2f64 (extract_subvector (v4f64 VR256:$src), (iPTR 0))),
+ (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm))>;
+
+def : Pat<(v8i16 (extract_subvector (v16i16 VR256:$src), (iPTR 0))),
+ (v8i16 (EXTRACT_SUBREG (v16i16 VR256:$src), sub_xmm))>;
+def : Pat<(v16i8 (extract_subvector (v32i8 VR256:$src), (iPTR 0))),
+ (v16i8 (EXTRACT_SUBREG (v32i8 VR256:$src), sub_xmm))>;
+
+// A 128-bit subvector insert to the first 256-bit vector position
+// is a subregister copy that needs no instruction.
+let AddedComplexity = 25 in { // to give priority over vinsertf128rm
+def : Pat<(insert_subvector undef, (v2i64 VR128:$src), (iPTR 0)),
+ (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
+def : Pat<(insert_subvector undef, (v2f64 VR128:$src), (iPTR 0)),
+ (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
+def : Pat<(insert_subvector undef, (v4i32 VR128:$src), (iPTR 0)),
+ (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
+def : Pat<(insert_subvector undef, (v4f32 VR128:$src), (iPTR 0)),
+ (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
+def : Pat<(insert_subvector undef, (v8i16 VR128:$src), (iPTR 0)),
+ (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
+def : Pat<(insert_subvector undef, (v16i8 VR128:$src), (iPTR 0)),
+ (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
+}
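+// At the C level these correspond to no-op casts such as
+// _mm256_castps128_ps256, where the upper 128 bits are undefined; the
+// IMPLICIT_DEF models exactly that.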
+
+// Implicitly promote a 32-bit scalar to a vector.
+def : Pat<(v4f32 (scalar_to_vector FR32:$src)),
+ (COPY_TO_REGCLASS FR32:$src, VR128)>;
+def : Pat<(v8f32 (scalar_to_vector FR32:$src)),
+ (COPY_TO_REGCLASS FR32:$src, VR128)>;
+// Implicitly promote a 64-bit scalar to a vector.
+def : Pat<(v2f64 (scalar_to_vector FR64:$src)),
+ (COPY_TO_REGCLASS FR64:$src, VR128)>;
+def : Pat<(v4f64 (scalar_to_vector FR64:$src)),
+ (COPY_TO_REGCLASS FR64:$src, VR128)>;
+
+// Bitcasts between 128-bit vector types. Return the original type since
+// no instruction is needed for the conversion.
+let Predicates = [HasSSE2] in {
+ def : Pat<(v2i64 (bitconvert (v4i32 VR128:$src))), (v2i64 VR128:$src)>;
+ def : Pat<(v2i64 (bitconvert (v8i16 VR128:$src))), (v2i64 VR128:$src)>;
+ def : Pat<(v2i64 (bitconvert (v16i8 VR128:$src))), (v2i64 VR128:$src)>;
+ def : Pat<(v2i64 (bitconvert (v2f64 VR128:$src))), (v2i64 VR128:$src)>;
+ def : Pat<(v2i64 (bitconvert (v4f32 VR128:$src))), (v2i64 VR128:$src)>;
+ def : Pat<(v4i32 (bitconvert (v2i64 VR128:$src))), (v4i32 VR128:$src)>;
+ def : Pat<(v4i32 (bitconvert (v8i16 VR128:$src))), (v4i32 VR128:$src)>;
+ def : Pat<(v4i32 (bitconvert (v16i8 VR128:$src))), (v4i32 VR128:$src)>;
+ def : Pat<(v4i32 (bitconvert (v2f64 VR128:$src))), (v4i32 VR128:$src)>;
+ def : Pat<(v4i32 (bitconvert (v4f32 VR128:$src))), (v4i32 VR128:$src)>;
+ def : Pat<(v8i16 (bitconvert (v2i64 VR128:$src))), (v8i16 VR128:$src)>;
+ def : Pat<(v8i16 (bitconvert (v4i32 VR128:$src))), (v8i16 VR128:$src)>;
+ def : Pat<(v8i16 (bitconvert (v16i8 VR128:$src))), (v8i16 VR128:$src)>;
+ def : Pat<(v8i16 (bitconvert (v2f64 VR128:$src))), (v8i16 VR128:$src)>;
+ def : Pat<(v8i16 (bitconvert (v4f32 VR128:$src))), (v8i16 VR128:$src)>;
+ def : Pat<(v16i8 (bitconvert (v2i64 VR128:$src))), (v16i8 VR128:$src)>;
+ def : Pat<(v16i8 (bitconvert (v4i32 VR128:$src))), (v16i8 VR128:$src)>;
+ def : Pat<(v16i8 (bitconvert (v8i16 VR128:$src))), (v16i8 VR128:$src)>;
+ def : Pat<(v16i8 (bitconvert (v2f64 VR128:$src))), (v16i8 VR128:$src)>;
+ def : Pat<(v16i8 (bitconvert (v4f32 VR128:$src))), (v16i8 VR128:$src)>;
+ def : Pat<(v4f32 (bitconvert (v2i64 VR128:$src))), (v4f32 VR128:$src)>;
+ def : Pat<(v4f32 (bitconvert (v4i32 VR128:$src))), (v4f32 VR128:$src)>;
+ def : Pat<(v4f32 (bitconvert (v8i16 VR128:$src))), (v4f32 VR128:$src)>;
+ def : Pat<(v4f32 (bitconvert (v16i8 VR128:$src))), (v4f32 VR128:$src)>;
+ def : Pat<(v4f32 (bitconvert (v2f64 VR128:$src))), (v4f32 VR128:$src)>;
+ def : Pat<(v2f64 (bitconvert (v2i64 VR128:$src))), (v2f64 VR128:$src)>;
+ def : Pat<(v2f64 (bitconvert (v4i32 VR128:$src))), (v2f64 VR128:$src)>;
+ def : Pat<(v2f64 (bitconvert (v8i16 VR128:$src))), (v2f64 VR128:$src)>;
+ def : Pat<(v2f64 (bitconvert (v16i8 VR128:$src))), (v2f64 VR128:$src)>;
+ def : Pat<(v2f64 (bitconvert (v4f32 VR128:$src))), (v2f64 VR128:$src)>;
+ def : Pat<(f128 (bitconvert (i128 FR128:$src))), (f128 FR128:$src)>;
+ def : Pat<(i128 (bitconvert (f128 FR128:$src))), (i128 FR128:$src)>;
+}
+
+// Bitcasts between 256-bit vector types. Return the original type since
+// no instruction is needed for the conversion.
+let Predicates = [HasAVX] in {
+ def : Pat<(v4f64 (bitconvert (v8f32 VR256:$src))), (v4f64 VR256:$src)>;
+ def : Pat<(v4f64 (bitconvert (v8i32 VR256:$src))), (v4f64 VR256:$src)>;
+ def : Pat<(v4f64 (bitconvert (v4i64 VR256:$src))), (v4f64 VR256:$src)>;
+ def : Pat<(v4f64 (bitconvert (v16i16 VR256:$src))), (v4f64 VR256:$src)>;
+ def : Pat<(v4f64 (bitconvert (v32i8 VR256:$src))), (v4f64 VR256:$src)>;
+ def : Pat<(v8f32 (bitconvert (v8i32 VR256:$src))), (v8f32 VR256:$src)>;
+ def : Pat<(v8f32 (bitconvert (v4i64 VR256:$src))), (v8f32 VR256:$src)>;
+ def : Pat<(v8f32 (bitconvert (v4f64 VR256:$src))), (v8f32 VR256:$src)>;
+ def : Pat<(v8f32 (bitconvert (v32i8 VR256:$src))), (v8f32 VR256:$src)>;
+ def : Pat<(v8f32 (bitconvert (v16i16 VR256:$src))), (v8f32 VR256:$src)>;
+ def : Pat<(v4i64 (bitconvert (v8f32 VR256:$src))), (v4i64 VR256:$src)>;
+ def : Pat<(v4i64 (bitconvert (v8i32 VR256:$src))), (v4i64 VR256:$src)>;
+ def : Pat<(v4i64 (bitconvert (v4f64 VR256:$src))), (v4i64 VR256:$src)>;
+ def : Pat<(v4i64 (bitconvert (v32i8 VR256:$src))), (v4i64 VR256:$src)>;
+ def : Pat<(v4i64 (bitconvert (v16i16 VR256:$src))), (v4i64 VR256:$src)>;
+ def : Pat<(v32i8 (bitconvert (v4f64 VR256:$src))), (v32i8 VR256:$src)>;
+ def : Pat<(v32i8 (bitconvert (v4i64 VR256:$src))), (v32i8 VR256:$src)>;
+ def : Pat<(v32i8 (bitconvert (v8f32 VR256:$src))), (v32i8 VR256:$src)>;
+ def : Pat<(v32i8 (bitconvert (v8i32 VR256:$src))), (v32i8 VR256:$src)>;
+ def : Pat<(v32i8 (bitconvert (v16i16 VR256:$src))), (v32i8 VR256:$src)>;
+ def : Pat<(v8i32 (bitconvert (v32i8 VR256:$src))), (v8i32 VR256:$src)>;
+ def : Pat<(v8i32 (bitconvert (v16i16 VR256:$src))), (v8i32 VR256:$src)>;
+ def : Pat<(v8i32 (bitconvert (v8f32 VR256:$src))), (v8i32 VR256:$src)>;
+ def : Pat<(v8i32 (bitconvert (v4i64 VR256:$src))), (v8i32 VR256:$src)>;
+ def : Pat<(v8i32 (bitconvert (v4f64 VR256:$src))), (v8i32 VR256:$src)>;
+ def : Pat<(v16i16 (bitconvert (v8f32 VR256:$src))), (v16i16 VR256:$src)>;
+ def : Pat<(v16i16 (bitconvert (v8i32 VR256:$src))), (v16i16 VR256:$src)>;
+ def : Pat<(v16i16 (bitconvert (v4i64 VR256:$src))), (v16i16 VR256:$src)>;
+ def : Pat<(v16i16 (bitconvert (v4f64 VR256:$src))), (v16i16 VR256:$src)>;
+ def : Pat<(v16i16 (bitconvert (v32i8 VR256:$src))), (v16i16 VR256:$src)>;
+}
+
+// Alias instructions that map fld0 to xorps for sse or vxorps for avx.
+// This is expanded by ExpandPostRAPseudos.
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
+ isPseudo = 1, SchedRW = [WriteZero] in {
+ def FsFLD0SS : I<0, Pseudo, (outs FR32:$dst), (ins), "",
+ [(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1]>;
+ def FsFLD0SD : I<0, Pseudo, (outs FR64:$dst), (ins), "",
+ [(set FR64:$dst, fpimm0)]>, Requires<[HasSSE2]>;
+}
+
+//===----------------------------------------------------------------------===//
+// AVX & SSE - Zero/One Vectors
+//===----------------------------------------------------------------------===//
+
+// Alias instruction that maps zero vector to pxor / xorp* for sse.
+// This is expanded by ExpandPostRAPseudos to an xorps / vxorps, and then
+// swizzled by ExecutionDepsFix to pxor.
+// We set canFoldAsLoad because this can be converted to a constant-pool
+// load of an all-zeros value if folding it would be beneficial.
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
+ isPseudo = 1, SchedRW = [WriteZero] in {
+def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
+ [(set VR128:$dst, (v4f32 immAllZerosV))]>;
+}
+
+def : Pat<(v2f64 immAllZerosV), (V_SET0)>;
+def : Pat<(v4i32 immAllZerosV), (V_SET0)>;
+def : Pat<(v2i64 immAllZerosV), (V_SET0)>;
+def : Pat<(v8i16 immAllZerosV), (V_SET0)>;
+def : Pat<(v16i8 immAllZerosV), (V_SET0)>;
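+// V_SET0 is typically expanded to "xorps %xmm0, %xmm0" (or vxorps under
+// AVX), a zeroing idiom that breaks dependencies on the register's prior
+// value.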
+
+
+// The same as above, but for AVX. The 256-bit AVX1 ISA doesn't support packed
+// integer operations, and doesn't need them because on Sandy Bridge the
+// register is set to zero at the rename stage without using any execution
+// unit, so SET0PSY and SET0PDY can be used for vector int instructions
+// without penalty.
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
+ isPseudo = 1, Predicates = [HasAVX], SchedRW = [WriteZero] in {
+def AVX_SET0 : I<0, Pseudo, (outs VR256:$dst), (ins), "",
+ [(set VR256:$dst, (v8f32 immAllZerosV))]>;
+}
+
+let Predicates = [HasAVX] in
+ def : Pat<(v4f64 immAllZerosV), (AVX_SET0)>;
+
+let Predicates = [HasAVX2] in {
+ def : Pat<(v4i64 immAllZerosV), (AVX_SET0)>;
+ def : Pat<(v8i32 immAllZerosV), (AVX_SET0)>;
+ def : Pat<(v16i16 immAllZerosV), (AVX_SET0)>;
+ def : Pat<(v32i8 immAllZerosV), (AVX_SET0)>;
+}
+
+// AVX1 has no support for 256-bit integer instructions, but since the 128-bit
+// VPXOR instruction writes zero to its upper part, it's safe to build zeros.
+let Predicates = [HasAVX1Only] in {
+def : Pat<(v32i8 immAllZerosV), (SUBREG_TO_REG (i8 0), (V_SET0), sub_xmm)>;
+def : Pat<(bc_v32i8 (v8f32 immAllZerosV)),
+ (SUBREG_TO_REG (i8 0), (V_SET0), sub_xmm)>;
+
+def : Pat<(v16i16 immAllZerosV), (SUBREG_TO_REG (i16 0), (V_SET0), sub_xmm)>;
+def : Pat<(bc_v16i16 (v8f32 immAllZerosV)),
+ (SUBREG_TO_REG (i16 0), (V_SET0), sub_xmm)>;
+
+def : Pat<(v8i32 immAllZerosV), (SUBREG_TO_REG (i32 0), (V_SET0), sub_xmm)>;
+def : Pat<(bc_v8i32 (v8f32 immAllZerosV)),
+ (SUBREG_TO_REG (i32 0), (V_SET0), sub_xmm)>;
+
+def : Pat<(v4i64 immAllZerosV), (SUBREG_TO_REG (i64 0), (V_SET0), sub_xmm)>;
+def : Pat<(bc_v4i64 (v8f32 immAllZerosV)),
+ (SUBREG_TO_REG (i64 0), (V_SET0), sub_xmm)>;
+}
+
+// We set canFoldAsLoad because this can be converted to a constant-pool
+// load of an all-ones value if folding it would be beneficial.
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
+ isPseudo = 1, SchedRW = [WriteZero] in {
+ def V_SETALLONES : I<0, Pseudo, (outs VR128:$dst), (ins), "",
+ [(set VR128:$dst, (v4i32 immAllOnesV))]>;
+ let Predicates = [HasAVX2] in
+ def AVX2_SETALLONES : I<0, Pseudo, (outs VR256:$dst), (ins), "",
+ [(set VR256:$dst, (v8i32 immAllOnesV))]>;
+}
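+// These all-ones pseudos are normally expanded to "pcmpeqd %xmm0, %xmm0"
+// (vpcmpeqd for AVX2), another dependency-breaking idiom.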
+
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Move FP Scalar Instructions
+//
+// Move Instructions. Register-to-register movss/movsd is not used for FR32/64
+// register copies because it's a partial register update; register-to-register
+// movss/movsd is not modeled as an INSERT_SUBREG because INSERT_SUBREG requires
+// that the insert be implementable in terms of a copy, and, as just mentioned,
+// we don't use movss/movsd for copies.
+//===----------------------------------------------------------------------===//
+
+multiclass sse12_move_rr<RegisterClass RC, SDNode OpNode, ValueType vt,
+ X86MemOperand x86memop, string base_opc,
+ string asm_opr, Domain d = GenericDomain> {
+ def rr : SI<0x10, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, RC:$src2),
+ !strconcat(base_opc, asm_opr),
+ [(set VR128:$dst, (vt (OpNode VR128:$src1,
+ (scalar_to_vector RC:$src2))))],
+ IIC_SSE_MOV_S_RR, d>, Sched<[WriteFShuffle]>;
+
+ // For the disassembler
+ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
+ def rr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst),
+ (ins VR128:$src1, RC:$src2),
+ !strconcat(base_opc, asm_opr),
+ [], IIC_SSE_MOV_S_RR>, Sched<[WriteFShuffle]>;
+}
+
+multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt,
+ X86MemOperand x86memop, string OpcodeStr,
+ Domain d = GenericDomain> {
+ // AVX
+ defm V#NAME : sse12_move_rr<RC, OpNode, vt, x86memop, OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}", d>,
+ VEX_4V, VEX_LIG;
+
+ def V#NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR, d>,
+ VEX, VEX_LIG, Sched<[WriteStore]>;
+ // SSE1 & 2
+ let Constraints = "$src1 = $dst" in {
+ defm NAME : sse12_move_rr<RC, OpNode, vt, x86memop, OpcodeStr,
+ "\t{$src2, $dst|$dst, $src2}", d>;
+ }
+
+ def NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR, d>,
+ Sched<[WriteStore]>;
+}
+
+// Loading from memory automatically zeroing upper bits.
+multiclass sse12_move_rm<RegisterClass RC, X86MemOperand x86memop,
+ PatFrag mem_pat, string OpcodeStr,
+ Domain d = GenericDomain> {
+ def V#NAME#rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set RC:$dst, (mem_pat addr:$src))],
+ IIC_SSE_MOV_S_RM, d>, VEX, VEX_LIG, Sched<[WriteLoad]>;
+ def NAME#rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set RC:$dst, (mem_pat addr:$src))],
+ IIC_SSE_MOV_S_RM, d>, Sched<[WriteLoad]>;
+}
+
+defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss",
+ SSEPackedSingle>, XS;
+defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd",
+ SSEPackedDouble>, XD;
+
+let canFoldAsLoad = 1, isReMaterializable = 1 in {
+ defm MOVSS : sse12_move_rm<FR32, f32mem, loadf32, "movss",
+ SSEPackedSingle>, XS;
+
+ let AddedComplexity = 20 in
+ defm MOVSD : sse12_move_rm<FR64, f64mem, loadf64, "movsd",
+ SSEPackedDouble>, XD;
+}
+
+// Patterns
+let Predicates = [UseAVX] in {
+ let AddedComplexity = 20 in {
+ // MOVSSrm zeros the high parts of the register; represent this
+ // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
+ def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
+ (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
+ def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
+ (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
+ def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
+ (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
+
+ // MOVSDrm zeros the high parts of the register; represent this
+ // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
+ def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
+ (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
+ def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
+ (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
+ def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
+ (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
+ def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
+ (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
+ def : Pat<(v2f64 (X86vzload addr:$src)),
+ (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
+
+ // Represent the same patterns above but in the form they appear for
+ // 256-bit types
+ def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
+ (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))),
+ (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
+ def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
+ (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
+ (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
+ }
+
+ // Extract and store.
+ def : Pat<(store (f32 (extractelt (v4f32 VR128:$src), (iPTR 0))),
+ addr:$dst),
+ (VMOVSSmr addr:$dst, (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32))>;
+ def : Pat<(store (f64 (extractelt (v2f64 VR128:$src), (iPTR 0))),
+ addr:$dst),
+ (VMOVSDmr addr:$dst, (COPY_TO_REGCLASS (v2f64 VR128:$src), FR64))>;
+
+ // Shuffle with VMOVSS
+ def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)),
+ (VMOVSSrr (v4i32 VR128:$src1),
+ (COPY_TO_REGCLASS (v4i32 VR128:$src2), FR32))>;
+ def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
+ (VMOVSSrr (v4f32 VR128:$src1),
+ (COPY_TO_REGCLASS (v4f32 VR128:$src2), FR32))>;
+
+ // 256-bit variants
+ def : Pat<(v8i32 (X86Movss VR256:$src1, VR256:$src2)),
+ (SUBREG_TO_REG (i32 0),
+ (VMOVSSrr (EXTRACT_SUBREG (v8i32 VR256:$src1), sub_xmm),
+ (EXTRACT_SUBREG (v8i32 VR256:$src2), sub_xmm)),
+ sub_xmm)>;
+ def : Pat<(v8f32 (X86Movss VR256:$src1, VR256:$src2)),
+ (SUBREG_TO_REG (i32 0),
+ (VMOVSSrr (EXTRACT_SUBREG (v8f32 VR256:$src1), sub_xmm),
+ (EXTRACT_SUBREG (v8f32 VR256:$src2), sub_xmm)),
+ sub_xmm)>;
+
+ // Shuffle with VMOVSD
+ def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
+ (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+ def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
+ (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+ def : Pat<(v4f32 (X86Movsd VR128:$src1, VR128:$src2)),
+ (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+ def : Pat<(v4i32 (X86Movsd VR128:$src1, VR128:$src2)),
+ (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+
+ // 256-bit variants
+ def : Pat<(v4i64 (X86Movsd VR256:$src1, VR256:$src2)),
+ (SUBREG_TO_REG (i32 0),
+ (VMOVSDrr (EXTRACT_SUBREG (v4i64 VR256:$src1), sub_xmm),
+ (EXTRACT_SUBREG (v4i64 VR256:$src2), sub_xmm)),
+ sub_xmm)>;
+ def : Pat<(v4f64 (X86Movsd VR256:$src1, VR256:$src2)),
+ (SUBREG_TO_REG (i32 0),
+ (VMOVSDrr (EXTRACT_SUBREG (v4f64 VR256:$src1), sub_xmm),
+ (EXTRACT_SUBREG (v4f64 VR256:$src2), sub_xmm)),
+ sub_xmm)>;
+
+  // FIXME: Instead of an X86Movlps there should be an X86Movsd here; the
+  // problem is during lowering, where it's not possible to recognize the fold
+  // because it has two uses through a bitcast. One use disappears at isel time
+  // and the fold opportunity reappears.
+ def : Pat<(v2f64 (X86Movlpd VR128:$src1, VR128:$src2)),
+ (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+ def : Pat<(v2i64 (X86Movlpd VR128:$src1, VR128:$src2)),
+ (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+ def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)),
+ (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+ def : Pat<(v4i32 (X86Movlps VR128:$src1, VR128:$src2)),
+ (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+}
+
+let Predicates = [UseSSE1] in {
+ let Predicates = [NoSSE41], AddedComplexity = 15 in {
+ // Move scalar to XMM zero-extended, zeroing a VR128 then do a
+ // MOVSS to the lower bits.
+ def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))),
+ (MOVSSrr (v4f32 (V_SET0)), FR32:$src)>;
+ def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
+ (MOVSSrr (v4f32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>;
+ def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
+ (MOVSSrr (v4i32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>;
+ }
+
+ let AddedComplexity = 20 in {
+ // MOVSSrm already zeros the high parts of the register.
+ def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
+ (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
+ def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
+ (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
+ def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
+ (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
+ }
+
+ // Extract and store.
+ def : Pat<(store (f32 (extractelt (v4f32 VR128:$src), (iPTR 0))),
+ addr:$dst),
+ (MOVSSmr addr:$dst, (COPY_TO_REGCLASS VR128:$src, FR32))>;
+
+ // Shuffle with MOVSS
+ def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)),
+ (MOVSSrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR32))>;
+ def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
+ (MOVSSrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR32))>;
+}
+
+let Predicates = [UseSSE2] in {
+ let Predicates = [NoSSE41], AddedComplexity = 15 in {
+ // Move scalar to XMM zero-extended, zeroing a VR128 then do a
+ // MOVSD to the lower bits.
+ def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))),
+ (MOVSDrr (v2f64 (V_SET0)), FR64:$src)>;
+ }
+
+ let AddedComplexity = 20 in {
+ // MOVSDrm already zeros the high parts of the register.
+ def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
+ (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
+ def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
+ (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
+ def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
+ (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
+ def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
+ (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
+ def : Pat<(v2f64 (X86vzload addr:$src)),
+ (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
+ }
+
+ // Extract and store.
+ def : Pat<(store (f64 (extractelt (v2f64 VR128:$src), (iPTR 0))),
+ addr:$dst),
+ (MOVSDmr addr:$dst, (COPY_TO_REGCLASS VR128:$src, FR64))>;
+
+ // Shuffle with MOVSD
+ def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
+ (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+ def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
+ (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+ def : Pat<(v4f32 (X86Movsd VR128:$src1, VR128:$src2)),
+ (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+ def : Pat<(v4i32 (X86Movsd VR128:$src1, VR128:$src2)),
+ (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+
+  // FIXME: Instead of an X86Movlps there should be an X86Movsd here; the
+  // problem is during lowering, where it's not possible to recognize the fold
+  // because it has two uses through a bitcast. One use disappears at isel time
+  // and the fold opportunity reappears.
+ def : Pat<(v2f64 (X86Movlpd VR128:$src1, VR128:$src2)),
+ (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+ def : Pat<(v2i64 (X86Movlpd VR128:$src1, VR128:$src2)),
+ (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+ def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)),
+ (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+ def : Pat<(v4i32 (X86Movlps VR128:$src1, VR128:$src2)),
+ (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Move Aligned/Unaligned FP Instructions
+//===----------------------------------------------------------------------===//
+
+multiclass sse12_mov_packed<bits<8> opc, RegisterClass RC,
+ X86MemOperand x86memop, PatFrag ld_frag,
+ string asm, Domain d,
+ OpndItins itins,
+ bit IsReMaterializable = 1> {
+let hasSideEffects = 0 in
+ def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
+ !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], itins.rr, d>,
+ Sched<[WriteFShuffle]>;
+let canFoldAsLoad = 1, isReMaterializable = IsReMaterializable in
+ def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+ !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
+ [(set RC:$dst, (ld_frag addr:$src))], itins.rm, d>,
+ Sched<[WriteLoad]>;
+}
+
+let Predicates = [HasAVX, NoVLX] in {
+defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32,
+ "movaps", SSEPackedSingle, SSE_MOVA_ITINS>,
+ PS, VEX;
+defm VMOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64,
+ "movapd", SSEPackedDouble, SSE_MOVA_ITINS>,
+ PD, VEX;
+defm VMOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32,
+ "movups", SSEPackedSingle, SSE_MOVU_ITINS>,
+ PS, VEX;
+defm VMOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64,
+ "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>,
+ PD, VEX;
+
+defm VMOVAPSY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv8f32,
+ "movaps", SSEPackedSingle, SSE_MOVA_ITINS>,
+ PS, VEX, VEX_L;
+defm VMOVAPDY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv4f64,
+ "movapd", SSEPackedDouble, SSE_MOVA_ITINS>,
+ PD, VEX, VEX_L;
+defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32,
+ "movups", SSEPackedSingle, SSE_MOVU_ITINS>,
+ PS, VEX, VEX_L;
+defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64,
+ "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>,
+ PD, VEX, VEX_L;
+}
+
+let Predicates = [UseSSE1] in {
+defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32,
+ "movaps", SSEPackedSingle, SSE_MOVA_ITINS>,
+ PS;
+defm MOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32,
+ "movups", SSEPackedSingle, SSE_MOVU_ITINS>,
+ PS;
+}
+let Predicates = [UseSSE2] in {
+defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64,
+ "movapd", SSEPackedDouble, SSE_MOVA_ITINS>,
+ PD;
+defm MOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64,
+ "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>,
+ PD;
+}
+
+let SchedRW = [WriteStore], Predicates = [HasAVX, NoVLX] in {
+def VMOVAPSmr : VPSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+ "movaps\t{$src, $dst|$dst, $src}",
+ [(alignedstore (v4f32 VR128:$src), addr:$dst)],
+ IIC_SSE_MOVA_P_MR>, VEX;
+def VMOVAPDmr : VPDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+ "movapd\t{$src, $dst|$dst, $src}",
+ [(alignedstore (v2f64 VR128:$src), addr:$dst)],
+ IIC_SSE_MOVA_P_MR>, VEX;
+def VMOVUPSmr : VPSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+ "movups\t{$src, $dst|$dst, $src}",
+ [(store (v4f32 VR128:$src), addr:$dst)],
+ IIC_SSE_MOVU_P_MR>, VEX;
+def VMOVUPDmr : VPDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+ "movupd\t{$src, $dst|$dst, $src}",
+ [(store (v2f64 VR128:$src), addr:$dst)],
+ IIC_SSE_MOVU_P_MR>, VEX;
+def VMOVAPSYmr : VPSI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
+ "movaps\t{$src, $dst|$dst, $src}",
+ [(alignedstore256 (v8f32 VR256:$src), addr:$dst)],
+ IIC_SSE_MOVA_P_MR>, VEX, VEX_L;
+def VMOVAPDYmr : VPDI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
+ "movapd\t{$src, $dst|$dst, $src}",
+ [(alignedstore256 (v4f64 VR256:$src), addr:$dst)],
+ IIC_SSE_MOVA_P_MR>, VEX, VEX_L;
+def VMOVUPSYmr : VPSI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
+ "movups\t{$src, $dst|$dst, $src}",
+ [(store (v8f32 VR256:$src), addr:$dst)],
+ IIC_SSE_MOVU_P_MR>, VEX, VEX_L;
+def VMOVUPDYmr : VPDI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
+ "movupd\t{$src, $dst|$dst, $src}",
+ [(store (v4f64 VR256:$src), addr:$dst)],
+ IIC_SSE_MOVU_P_MR>, VEX, VEX_L;
+} // SchedRW
+
+// For disassembler
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
+ SchedRW = [WriteFShuffle] in {
+ def VMOVAPSrr_REV : VPSI<0x29, MRMDestReg, (outs VR128:$dst),
+ (ins VR128:$src),
+ "movaps\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_MOVA_P_RR>, VEX;
+ def VMOVAPDrr_REV : VPDI<0x29, MRMDestReg, (outs VR128:$dst),
+ (ins VR128:$src),
+ "movapd\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_MOVA_P_RR>, VEX;
+ def VMOVUPSrr_REV : VPSI<0x11, MRMDestReg, (outs VR128:$dst),
+ (ins VR128:$src),
+ "movups\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_MOVU_P_RR>, VEX;
+ def VMOVUPDrr_REV : VPDI<0x11, MRMDestReg, (outs VR128:$dst),
+ (ins VR128:$src),
+ "movupd\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_MOVU_P_RR>, VEX;
+ def VMOVAPSYrr_REV : VPSI<0x29, MRMDestReg, (outs VR256:$dst),
+ (ins VR256:$src),
+ "movaps\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_MOVA_P_RR>, VEX, VEX_L;
+ def VMOVAPDYrr_REV : VPDI<0x29, MRMDestReg, (outs VR256:$dst),
+ (ins VR256:$src),
+ "movapd\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_MOVA_P_RR>, VEX, VEX_L;
+ def VMOVUPSYrr_REV : VPSI<0x11, MRMDestReg, (outs VR256:$dst),
+ (ins VR256:$src),
+ "movups\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_MOVU_P_RR>, VEX, VEX_L;
+ def VMOVUPDYrr_REV : VPDI<0x11, MRMDestReg, (outs VR256:$dst),
+ (ins VR256:$src),
+ "movupd\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_MOVU_P_RR>, VEX, VEX_L;
+}
+
+def : Pat<(int_x86_avx_storeu_ps_256 addr:$dst, VR256:$src),
+ (VMOVUPSYmr addr:$dst, VR256:$src)>;
+def : Pat<(int_x86_avx_storeu_pd_256 addr:$dst, VR256:$src),
+ (VMOVUPDYmr addr:$dst, VR256:$src)>;
+
+let SchedRW = [WriteStore] in {
+def MOVAPSmr : PSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+ "movaps\t{$src, $dst|$dst, $src}",
+ [(alignedstore (v4f32 VR128:$src), addr:$dst)],
+ IIC_SSE_MOVA_P_MR>;
+def MOVAPDmr : PDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+ "movapd\t{$src, $dst|$dst, $src}",
+ [(alignedstore (v2f64 VR128:$src), addr:$dst)],
+ IIC_SSE_MOVA_P_MR>;
+def MOVUPSmr : PSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+ "movups\t{$src, $dst|$dst, $src}",
+ [(store (v4f32 VR128:$src), addr:$dst)],
+ IIC_SSE_MOVU_P_MR>;
+def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+ "movupd\t{$src, $dst|$dst, $src}",
+ [(store (v2f64 VR128:$src), addr:$dst)],
+ IIC_SSE_MOVU_P_MR>;
+} // SchedRW
+
+// For disassembler
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
+ SchedRW = [WriteFShuffle] in {
+ def MOVAPSrr_REV : PSI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+ "movaps\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_MOVA_P_RR>;
+ def MOVAPDrr_REV : PDI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+ "movapd\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_MOVA_P_RR>;
+ def MOVUPSrr_REV : PSI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+ "movups\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_MOVU_P_RR>;
+ def MOVUPDrr_REV : PDI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+ "movupd\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_MOVU_P_RR>;
+}
+
+let Predicates = [HasAVX] in {
+ def : Pat<(int_x86_sse_storeu_ps addr:$dst, VR128:$src),
+ (VMOVUPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(int_x86_sse2_storeu_pd addr:$dst, VR128:$src),
+ (VMOVUPDmr addr:$dst, VR128:$src)>;
+}
+
+let Predicates = [UseSSE1] in
+ def : Pat<(int_x86_sse_storeu_ps addr:$dst, VR128:$src),
+ (MOVUPSmr addr:$dst, VR128:$src)>;
+let Predicates = [UseSSE2] in
+ def : Pat<(int_x86_sse2_storeu_pd addr:$dst, VR128:$src),
+ (MOVUPDmr addr:$dst, VR128:$src)>;
+
+// Use vmovaps/vmovups for AVX integer load/store.
+let Predicates = [HasAVX, NoVLX] in {
+ // 128-bit load/store
+ def : Pat<(alignedloadv2i64 addr:$src),
+ (VMOVAPSrm addr:$src)>;
+ def : Pat<(loadv2i64 addr:$src),
+ (VMOVUPSrm addr:$src)>;
+
+ def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
+ (VMOVAPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
+ (VMOVAPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
+ (VMOVAPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
+ (VMOVAPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(store (v2i64 VR128:$src), addr:$dst),
+ (VMOVUPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(store (v4i32 VR128:$src), addr:$dst),
+ (VMOVUPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(store (v8i16 VR128:$src), addr:$dst),
+ (VMOVUPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(store (v16i8 VR128:$src), addr:$dst),
+ (VMOVUPSmr addr:$dst, VR128:$src)>;
+
+ // 256-bit load/store
+ def : Pat<(alignedloadv4i64 addr:$src),
+ (VMOVAPSYrm addr:$src)>;
+ def : Pat<(loadv4i64 addr:$src),
+ (VMOVUPSYrm addr:$src)>;
+ def : Pat<(alignedstore256 (v4i64 VR256:$src), addr:$dst),
+ (VMOVAPSYmr addr:$dst, VR256:$src)>;
+ def : Pat<(alignedstore256 (v8i32 VR256:$src), addr:$dst),
+ (VMOVAPSYmr addr:$dst, VR256:$src)>;
+ def : Pat<(alignedstore256 (v16i16 VR256:$src), addr:$dst),
+ (VMOVAPSYmr addr:$dst, VR256:$src)>;
+ def : Pat<(alignedstore256 (v32i8 VR256:$src), addr:$dst),
+ (VMOVAPSYmr addr:$dst, VR256:$src)>;
+ def : Pat<(store (v4i64 VR256:$src), addr:$dst),
+ (VMOVUPSYmr addr:$dst, VR256:$src)>;
+ def : Pat<(store (v8i32 VR256:$src), addr:$dst),
+ (VMOVUPSYmr addr:$dst, VR256:$src)>;
+ def : Pat<(store (v16i16 VR256:$src), addr:$dst),
+ (VMOVUPSYmr addr:$dst, VR256:$src)>;
+ def : Pat<(store (v32i8 VR256:$src), addr:$dst),
+ (VMOVUPSYmr addr:$dst, VR256:$src)>;
+
+  // Special patterns for storing subvector extracts of the lower 128 bits.
+  // It's cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr.
+ def : Pat<(alignedstore (v2f64 (extract_subvector
+ (v4f64 VR256:$src), (iPTR 0))), addr:$dst),
+ (VMOVAPDmr addr:$dst, (v2f64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+ def : Pat<(alignedstore (v4f32 (extract_subvector
+ (v8f32 VR256:$src), (iPTR 0))), addr:$dst),
+ (VMOVAPSmr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+ def : Pat<(alignedstore (v2i64 (extract_subvector
+ (v4i64 VR256:$src), (iPTR 0))), addr:$dst),
+ (VMOVAPDmr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+ def : Pat<(alignedstore (v4i32 (extract_subvector
+ (v8i32 VR256:$src), (iPTR 0))), addr:$dst),
+ (VMOVAPSmr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+ def : Pat<(alignedstore (v8i16 (extract_subvector
+ (v16i16 VR256:$src), (iPTR 0))), addr:$dst),
+ (VMOVAPSmr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+ def : Pat<(alignedstore (v16i8 (extract_subvector
+ (v32i8 VR256:$src), (iPTR 0))), addr:$dst),
+ (VMOVAPSmr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+
+ def : Pat<(store (v2f64 (extract_subvector
+ (v4f64 VR256:$src), (iPTR 0))), addr:$dst),
+ (VMOVUPDmr addr:$dst, (v2f64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+ def : Pat<(store (v4f32 (extract_subvector
+ (v8f32 VR256:$src), (iPTR 0))), addr:$dst),
+ (VMOVUPSmr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+ def : Pat<(store (v2i64 (extract_subvector
+ (v4i64 VR256:$src), (iPTR 0))), addr:$dst),
+ (VMOVUPDmr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+ def : Pat<(store (v4i32 (extract_subvector
+ (v8i32 VR256:$src), (iPTR 0))), addr:$dst),
+ (VMOVUPSmr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+ def : Pat<(store (v8i16 (extract_subvector
+ (v16i16 VR256:$src), (iPTR 0))), addr:$dst),
+ (VMOVUPSmr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+ def : Pat<(store (v16i8 (extract_subvector
+ (v32i8 VR256:$src), (iPTR 0))), addr:$dst),
+ (VMOVUPSmr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+}
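+// For example, storing the low 128 bits of a YMM register as
+// "vmovaps %xmm0, (%rax)" avoids the extract micro-op that VEXTRACTF128mr
+// would otherwise incur.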
+
+// Use movaps / movups for SSE integer load / store (one byte shorter).
+// The instructions selected below are then converted to MOVDQA/MOVDQU
+// during the SSE domain pass.
+let Predicates = [UseSSE1] in {
+ def : Pat<(alignedloadv2i64 addr:$src),
+ (MOVAPSrm addr:$src)>;
+ def : Pat<(loadv2i64 addr:$src),
+ (MOVUPSrm addr:$src)>;
+
+ def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
+ (MOVAPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
+ (MOVAPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
+ (MOVAPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
+ (MOVAPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(store (v2i64 VR128:$src), addr:$dst),
+ (MOVUPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(store (v4i32 VR128:$src), addr:$dst),
+ (MOVUPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(store (v8i16 VR128:$src), addr:$dst),
+ (MOVUPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(store (v16i8 VR128:$src), addr:$dst),
+ (MOVUPSmr addr:$dst, VR128:$src)>;
+}
+
+// Alias instruction to load FR32 or FR64 from f128mem using movaps. Upper
+// bits are disregarded. FIXME: Set encoding to pseudo!
+let canFoldAsLoad = 1, isReMaterializable = 1, SchedRW = [WriteLoad] in {
+let isCodeGenOnly = 1 in {
+ def FsVMOVAPSrm : VPSI<0x28, MRMSrcMem, (outs FR32:$dst), (ins f128mem:$src),
+ "movaps\t{$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (alignedloadfsf32 addr:$src))],
+ IIC_SSE_MOVA_P_RM>, VEX;
+ def FsVMOVAPDrm : VPDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src),
+ "movapd\t{$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (alignedloadfsf64 addr:$src))],
+ IIC_SSE_MOVA_P_RM>, VEX;
+ def FsMOVAPSrm : PSI<0x28, MRMSrcMem, (outs FR32:$dst), (ins f128mem:$src),
+ "movaps\t{$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (alignedloadfsf32 addr:$src))],
+ IIC_SSE_MOVA_P_RM>;
+ def FsMOVAPDrm : PDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src),
+ "movapd\t{$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (alignedloadfsf64 addr:$src))],
+ IIC_SSE_MOVA_P_RM>;
+}
+}
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Move Low packed FP Instructions
+//===----------------------------------------------------------------------===//
+
+multiclass sse12_mov_hilo_packed_base<bits<8> opc, SDNode psnode, SDNode pdnode,
+ string base_opc, string asm_opr,
+ InstrItinClass itin> {
+ def PSrm : PI<opc, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
+ !strconcat(base_opc, "s", asm_opr),
+ [(set VR128:$dst,
+ (psnode VR128:$src1,
+ (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))))],
+ itin, SSEPackedSingle>, PS,
+ Sched<[WriteFShuffleLd, ReadAfterLd]>;
+
+ def PDrm : PI<opc, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
+ !strconcat(base_opc, "d", asm_opr),
+ [(set VR128:$dst, (v2f64 (pdnode VR128:$src1,
+ (scalar_to_vector (loadf64 addr:$src2)))))],
+ itin, SSEPackedDouble>, PD,
+ Sched<[WriteFShuffleLd, ReadAfterLd]>;
+
+}
+
+multiclass sse12_mov_hilo_packed<bits<8> opc, SDNode psnode, SDNode pdnode,
+ string base_opc, InstrItinClass itin> {
+ let Predicates = [UseAVX] in
+ defm V#NAME : sse12_mov_hilo_packed_base<opc, psnode, pdnode, base_opc,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ itin>, VEX_4V;
+
+ let Constraints = "$src1 = $dst" in
+ defm NAME : sse12_mov_hilo_packed_base<opc, psnode, pdnode, base_opc,
+ "\t{$src2, $dst|$dst, $src2}",
+ itin>;
+}
+
+let AddedComplexity = 20 in {
+ defm MOVL : sse12_mov_hilo_packed<0x12, X86Movlps, X86Movlpd, "movlp",
+ IIC_SSE_MOV_LH>;
+}
+
+let SchedRW = [WriteStore] in {
+let Predicates = [UseAVX] in {
+def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+ "movlps\t{$src, $dst|$dst, $src}",
+ [(store (f64 (extractelt (bc_v2f64 (v4f32 VR128:$src)),
+ (iPTR 0))), addr:$dst)],
+ IIC_SSE_MOV_LH>, VEX;
+def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+ "movlpd\t{$src, $dst|$dst, $src}",
+ [(store (f64 (extractelt (v2f64 VR128:$src),
+ (iPTR 0))), addr:$dst)],
+ IIC_SSE_MOV_LH>, VEX;
+} // UseAVX
+def MOVLPSmr : PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+ "movlps\t{$src, $dst|$dst, $src}",
+ [(store (f64 (extractelt (bc_v2f64 (v4f32 VR128:$src)),
+ (iPTR 0))), addr:$dst)],
+ IIC_SSE_MOV_LH>;
+def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+ "movlpd\t{$src, $dst|$dst, $src}",
+ [(store (f64 (extractelt (v2f64 VR128:$src),
+ (iPTR 0))), addr:$dst)],
+ IIC_SSE_MOV_LH>;
+} // SchedRW
+
+let Predicates = [UseAVX] in {
+ // Shuffle with VMOVLPS
+ def : Pat<(v4f32 (X86Movlps VR128:$src1, (load addr:$src2))),
+ (VMOVLPSrm VR128:$src1, addr:$src2)>;
+ def : Pat<(v4i32 (X86Movlps VR128:$src1, (load addr:$src2))),
+ (VMOVLPSrm VR128:$src1, addr:$src2)>;
+
+ // Shuffle with VMOVLPD
+ def : Pat<(v2f64 (X86Movlpd VR128:$src1, (load addr:$src2))),
+ (VMOVLPDrm VR128:$src1, addr:$src2)>;
+ def : Pat<(v2i64 (X86Movlpd VR128:$src1, (load addr:$src2))),
+ (VMOVLPDrm VR128:$src1, addr:$src2)>;
+ def : Pat<(v2f64 (X86Movsd VR128:$src1,
+ (v2f64 (scalar_to_vector (loadf64 addr:$src2))))),
+ (VMOVLPDrm VR128:$src1, addr:$src2)>;
+
+ // Store patterns
+ def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)),
+ addr:$src1),
+ (VMOVLPSmr addr:$src1, VR128:$src2)>;
+ def : Pat<(store (v4i32 (X86Movlps
+ (bc_v4i32 (loadv2i64 addr:$src1)), VR128:$src2)), addr:$src1),
+ (VMOVLPSmr addr:$src1, VR128:$src2)>;
+ def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128:$src2)),
+ addr:$src1),
+ (VMOVLPDmr addr:$src1, VR128:$src2)>;
+ def : Pat<(store (v2i64 (X86Movlpd (load addr:$src1), VR128:$src2)),
+ addr:$src1),
+ (VMOVLPDmr addr:$src1, VR128:$src2)>;
+}
+
+let Predicates = [UseSSE1] in {
+ // (store (vector_shuffle (load addr), v2, <4, 5, 2, 3>), addr) using MOVLPS
+ def : Pat<(store (i64 (extractelt (bc_v2i64 (v4f32 VR128:$src2)),
+ (iPTR 0))), addr:$src1),
+ (MOVLPSmr addr:$src1, VR128:$src2)>;
+
+ // Shuffle with MOVLPS
+ def : Pat<(v4f32 (X86Movlps VR128:$src1, (load addr:$src2))),
+ (MOVLPSrm VR128:$src1, addr:$src2)>;
+ def : Pat<(v4i32 (X86Movlps VR128:$src1, (load addr:$src2))),
+ (MOVLPSrm VR128:$src1, addr:$src2)>;
+ def : Pat<(X86Movlps VR128:$src1,
+ (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))),
+ (MOVLPSrm VR128:$src1, addr:$src2)>;
+
+ // Store patterns
+ def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)),
+ addr:$src1),
+ (MOVLPSmr addr:$src1, VR128:$src2)>;
+ def : Pat<(store (v4i32 (X86Movlps
+ (bc_v4i32 (loadv2i64 addr:$src1)), VR128:$src2)),
+ addr:$src1),
+ (MOVLPSmr addr:$src1, VR128:$src2)>;
+}
+
+let Predicates = [UseSSE2] in {
+ // Shuffle with MOVLPD
+ def : Pat<(v2f64 (X86Movlpd VR128:$src1, (load addr:$src2))),
+ (MOVLPDrm VR128:$src1, addr:$src2)>;
+ def : Pat<(v2i64 (X86Movlpd VR128:$src1, (load addr:$src2))),
+ (MOVLPDrm VR128:$src1, addr:$src2)>;
+ def : Pat<(v2f64 (X86Movsd VR128:$src1,
+ (v2f64 (scalar_to_vector (loadf64 addr:$src2))))),
+ (MOVLPDrm VR128:$src1, addr:$src2)>;
+
+ // Store patterns
+ def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128:$src2)),
+ addr:$src1),
+ (MOVLPDmr addr:$src1, VR128:$src2)>;
+ def : Pat<(store (v2i64 (X86Movlpd (load addr:$src1), VR128:$src2)),
+ addr:$src1),
+ (MOVLPDmr addr:$src1, VR128:$src2)>;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Move Hi packed FP Instructions
+//===----------------------------------------------------------------------===//
+
+let AddedComplexity = 20 in {
+ defm MOVH : sse12_mov_hilo_packed<0x16, X86Movlhps, X86Movlhpd, "movhp",
+ IIC_SSE_MOV_LH>;
+}
+
+let SchedRW = [WriteStore] in {
+// A v2f64 extract of element 1 is always custom lowered to unpack high to
+// low and extract element 0, so the non-store version isn't too horrible.
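+//
+// For illustration (rough selection, not literal output): without a store,
+//   (f64 (extractelt (v2f64 %v), 1))
+// becomes something like
+//   unpckhpd %xmm0, %xmm0    ; move the high f64 into the low slot
+// followed by a use of the low element, while the store forms below fold
+// that unpack+extract-0 shape into a single movhps/movhpd to memory.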
+let Predicates = [UseAVX] in {
+def VMOVHPSmr : VPSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+ "movhps\t{$src, $dst|$dst, $src}",
+ [(store (f64 (extractelt
+ (X86Unpckh (bc_v2f64 (v4f32 VR128:$src)),
+ (bc_v2f64 (v4f32 VR128:$src))),
+ (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX;
+def VMOVHPDmr : VPDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+ "movhpd\t{$src, $dst|$dst, $src}",
+ [(store (f64 (extractelt
+ (v2f64 (X86Unpckh VR128:$src, VR128:$src)),
+ (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX;
+} // UseAVX
+def MOVHPSmr : PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+ "movhps\t{$src, $dst|$dst, $src}",
+ [(store (f64 (extractelt
+ (X86Unpckh (bc_v2f64 (v4f32 VR128:$src)),
+ (bc_v2f64 (v4f32 VR128:$src))),
+ (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>;
+def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+ "movhpd\t{$src, $dst|$dst, $src}",
+ [(store (f64 (extractelt
+ (v2f64 (X86Unpckh VR128:$src, VR128:$src)),
+ (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>;
+} // SchedRW
+
+let Predicates = [UseAVX] in {
+ // VMOVHPS patterns
+ def : Pat<(X86Movlhps VR128:$src1,
+ (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))),
+ (VMOVHPSrm VR128:$src1, addr:$src2)>;
+ def : Pat<(X86Movlhps VR128:$src1,
+ (bc_v4i32 (v2i64 (X86vzload addr:$src2)))),
+ (VMOVHPSrm VR128:$src1, addr:$src2)>;
+
+ // VMOVHPD patterns
+
+ // FIXME: Instead of X86Unpckl, there should be an X86Movlhpd here; the
+ // problem is during lowering, where it's not possible to recognize the
+ // load fold because it has two uses through a bitcast. One use disappears
+ // at isel time and the fold opportunity reappears.
+ def : Pat<(v2f64 (X86Unpckl VR128:$src1,
+ (scalar_to_vector (loadf64 addr:$src2)))),
+ (VMOVHPDrm VR128:$src1, addr:$src2)>;
+ // Also handle an i64 load because that may get selected as a faster way to
+ // load the data.
+ def : Pat<(v2f64 (X86Unpckl VR128:$src1,
+ (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))),
+ (VMOVHPDrm VR128:$src1, addr:$src2)>;
+
+ def : Pat<(store (f64 (extractelt
+ (v2f64 (X86VPermilpi VR128:$src, (i8 1))),
+ (iPTR 0))), addr:$dst),
+ (VMOVHPDmr addr:$dst, VR128:$src)>;
+}
+
+let Predicates = [UseSSE1] in {
+ // MOVHPS patterns
+ def : Pat<(X86Movlhps VR128:$src1,
+ (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))),
+ (MOVHPSrm VR128:$src1, addr:$src2)>;
+ def : Pat<(X86Movlhps VR128:$src1,
+ (bc_v4f32 (v2i64 (X86vzload addr:$src2)))),
+ (MOVHPSrm VR128:$src1, addr:$src2)>;
+}
+
+let Predicates = [UseSSE2] in {
+ // MOVHPD patterns
+
+ // FIXME: Instead of X86Unpckl, there should be an X86Movlhpd here; the
+ // problem is during lowering, where it's not possible to recognize the
+ // load fold because it has two uses through a bitcast. One use disappears
+ // at isel time and the fold opportunity reappears.
+ def : Pat<(v2f64 (X86Unpckl VR128:$src1,
+ (scalar_to_vector (loadf64 addr:$src2)))),
+ (MOVHPDrm VR128:$src1, addr:$src2)>;
+ // Also handle an i64 load because that may get selected as a faster way to
+ // load the data.
+ def : Pat<(v2f64 (X86Unpckl VR128:$src1,
+ (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))),
+ (MOVHPDrm VR128:$src1, addr:$src2)>;
+
+ def : Pat<(store (f64 (extractelt
+ (v2f64 (X86Shufp VR128:$src, VR128:$src, (i8 1))),
+ (iPTR 0))), addr:$dst),
+ (MOVHPDmr addr:$dst, VR128:$src)>;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Move Low to High and High to Low packed FP Instructions
+//===----------------------------------------------------------------------===//
+
+let AddedComplexity = 20, Predicates = [UseAVX] in {
+ def VMOVLHPSrr : VPSI<0x16, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ "movlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst,
+ (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))],
+ IIC_SSE_MOV_LH>,
+ VEX_4V, Sched<[WriteFShuffle]>;
+ def VMOVHLPSrr : VPSI<0x12, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ "movhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst,
+ (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))],
+ IIC_SSE_MOV_LH>,
+ VEX_4V, Sched<[WriteFShuffle]>;
+}
+let Constraints = "$src1 = $dst", AddedComplexity = 20 in {
+ def MOVLHPSrr : PSI<0x16, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ "movlhps\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))],
+ IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>;
+ def MOVHLPSrr : PSI<0x12, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ "movhlps\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))],
+ IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>;
+}
+
+let Predicates = [UseAVX] in {
+ // MOVLHPS patterns
+ def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)),
+ (VMOVLHPSrr VR128:$src1, VR128:$src2)>;
+ def : Pat<(v2i64 (X86Movlhps VR128:$src1, VR128:$src2)),
+ (VMOVLHPSrr (v2i64 VR128:$src1), VR128:$src2)>;
+
+ // MOVHLPS patterns
+ def : Pat<(v4i32 (X86Movhlps VR128:$src1, VR128:$src2)),
+ (VMOVHLPSrr VR128:$src1, VR128:$src2)>;
+}
+
+let Predicates = [UseSSE1] in {
+ // MOVLHPS patterns
+ def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)),
+ (MOVLHPSrr VR128:$src1, VR128:$src2)>;
+ def : Pat<(v2i64 (X86Movlhps VR128:$src1, VR128:$src2)),
+ (MOVLHPSrr (v2i64 VR128:$src1), VR128:$src2)>;
+
+ // MOVHLPS patterns
+ def : Pat<(v4i32 (X86Movhlps VR128:$src1, VR128:$src2)),
+ (MOVHLPSrr VR128:$src1, VR128:$src2)>;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Conversion Instructions
+//===----------------------------------------------------------------------===//
+
+def SSE_CVT_PD : OpndItins<
+ IIC_SSE_CVT_PD_RR, IIC_SSE_CVT_PD_RM
+>;
+
+let Sched = WriteCvtI2F in
+def SSE_CVT_PS : OpndItins<
+ IIC_SSE_CVT_PS_RR, IIC_SSE_CVT_PS_RM
+>;
+
+let Sched = WriteCvtI2F in
+def SSE_CVT_Scalar : OpndItins<
+ IIC_SSE_CVT_Scalar_RR, IIC_SSE_CVT_Scalar_RM
+>;
+
+let Sched = WriteCvtF2I in
+def SSE_CVT_SS2SI_32 : OpndItins<
+ IIC_SSE_CVT_SS2SI32_RR, IIC_SSE_CVT_SS2SI32_RM
+>;
+
+let Sched = WriteCvtF2I in
+def SSE_CVT_SS2SI_64 : OpndItins<
+ IIC_SSE_CVT_SS2SI64_RR, IIC_SSE_CVT_SS2SI64_RM
+>;
+
+let Sched = WriteCvtF2I in
+def SSE_CVT_SD2SI : OpndItins<
+ IIC_SSE_CVT_SD2SI_RR, IIC_SSE_CVT_SD2SI_RM
+>;
+
+// FIXME: We probably want to match the rm form only when optimizing for
+// size, to avoid false dependencies (see sse_fp_unop_s for details).
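+// For example (illustrative): when the load is folded,
+//   cvtsi2ssl (%rax), %xmm0
+// only writes the low 32 bits of %xmm0, so it carries a dependency on the
+// previous value of %xmm0 even though that value is irrelevant here.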
+multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
+ SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag,
+ string asm, OpndItins itins> {
+ def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm,
+ [(set DstRC:$dst, (OpNode SrcRC:$src))],
+ itins.rr>, Sched<[itins.Sched]>;
+ def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm,
+ [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))],
+ itins.rm>, Sched<[itins.Sched.Folded]>;
+}
+
+multiclass sse12_cvt_p<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
+ X86MemOperand x86memop, string asm, Domain d,
+ OpndItins itins> {
+let hasSideEffects = 0 in {
+ def rr : I<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm,
+ [], itins.rr, d>, Sched<[itins.Sched]>;
+ let mayLoad = 1 in
+ def rm : I<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm,
+ [], itins.rm, d>, Sched<[itins.Sched.Folded]>;
+}
+}
+
+// FIXME: We probably want to match the rm form only when optimizing for
+// size, to avoid false dependencies (see sse_fp_unop_s for details).
+multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
+ X86MemOperand x86memop, string asm> {
+let hasSideEffects = 0, Predicates = [UseAVX] in {
+ def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src),
+ !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
+ Sched<[WriteCvtI2F]>;
+ let mayLoad = 1 in
+ def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst),
+ (ins DstRC:$src1, x86memop:$src),
+ !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
+ Sched<[WriteCvtI2FLd, ReadAfterLd]>;
+} // hasSideEffects = 0
+}
+
+let Predicates = [UseAVX] in {
+defm VCVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32,
+ "cvttss2si\t{$src, $dst|$dst, $src}",
+ SSE_CVT_SS2SI_32>,
+ XS, VEX, VEX_LIG;
+defm VCVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32,
+ "cvttss2si\t{$src, $dst|$dst, $src}",
+ SSE_CVT_SS2SI_64>,
+ XS, VEX, VEX_W, VEX_LIG;
+defm VCVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64,
+ "cvttsd2si\t{$src, $dst|$dst, $src}",
+ SSE_CVT_SD2SI>,
+ XD, VEX, VEX_LIG;
+defm VCVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64,
+ "cvttsd2si\t{$src, $dst|$dst, $src}",
+ SSE_CVT_SD2SI>,
+ XD, VEX, VEX_W, VEX_LIG;
+
+def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
+ (VCVTTSS2SIrr GR32:$dst, FR32:$src), 0>;
+def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
+ (VCVTTSS2SIrm GR32:$dst, f32mem:$src), 0>;
+def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
+ (VCVTTSD2SIrr GR32:$dst, FR64:$src), 0>;
+def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
+ (VCVTTSD2SIrm GR32:$dst, f64mem:$src), 0>;
+def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
+ (VCVTTSS2SI64rr GR64:$dst, FR32:$src), 0>;
+def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
+ (VCVTTSS2SI64rm GR64:$dst, f32mem:$src), 0>;
+def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
+ (VCVTTSD2SI64rr GR64:$dst, FR64:$src), 0>;
+def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
+ (VCVTTSD2SI64rm GR64:$dst, f64mem:$src), 0>;
+}
+// The assembler can recognize rr 64-bit instructions by seeing an rxx
+// register, but the same isn't true when only memory operands are used.
+// Provide explicit "l" and "q" assembly forms to address this where
+// appropriate.
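+// For example, in AT&T syntax the plain memory form is ambiguous:
+//   vcvtsi2ss  (%rax), %xmm1, %xmm0    ; 32- or 64-bit integer source?
+//   vcvtsi2ssl (%rax), %xmm1, %xmm0    ; explicitly 32-bit
+//   vcvtsi2ssq (%rax), %xmm1, %xmm0    ; explicitly 64-bit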
+defm VCVTSI2SS : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss{l}">,
+ XS, VEX_4V, VEX_LIG;
+defm VCVTSI2SS64 : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ss{q}">,
+ XS, VEX_4V, VEX_W, VEX_LIG;
+defm VCVTSI2SD : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd{l}">,
+ XD, VEX_4V, VEX_LIG;
+defm VCVTSI2SD64 : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd{q}">,
+ XD, VEX_4V, VEX_W, VEX_LIG;
+
+let Predicates = [UseAVX] in {
+ def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
+ (VCVTSI2SSrm FR64:$dst, FR64:$src1, i32mem:$src), 0>;
+ def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}",
+ (VCVTSI2SDrm FR64:$dst, FR64:$src1, i32mem:$src), 0>;
+
+ def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))),
+ (VCVTSI2SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
+ def : Pat<(f32 (sint_to_fp (loadi64 addr:$src))),
+ (VCVTSI2SS64rm (f32 (IMPLICIT_DEF)), addr:$src)>;
+ def : Pat<(f64 (sint_to_fp (loadi32 addr:$src))),
+ (VCVTSI2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>;
+ def : Pat<(f64 (sint_to_fp (loadi64 addr:$src))),
+ (VCVTSI2SD64rm (f64 (IMPLICIT_DEF)), addr:$src)>;
+
+ def : Pat<(f32 (sint_to_fp GR32:$src)),
+ (VCVTSI2SSrr (f32 (IMPLICIT_DEF)), GR32:$src)>;
+ def : Pat<(f32 (sint_to_fp GR64:$src)),
+ (VCVTSI2SS64rr (f32 (IMPLICIT_DEF)), GR64:$src)>;
+ def : Pat<(f64 (sint_to_fp GR32:$src)),
+ (VCVTSI2SDrr (f64 (IMPLICIT_DEF)), GR32:$src)>;
+ def : Pat<(f64 (sint_to_fp GR64:$src)),
+ (VCVTSI2SD64rr (f64 (IMPLICIT_DEF)), GR64:$src)>;
+}
+
+defm CVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32,
+ "cvttss2si\t{$src, $dst|$dst, $src}",
+ SSE_CVT_SS2SI_32>, XS;
+defm CVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32,
+ "cvttss2si\t{$src, $dst|$dst, $src}",
+ SSE_CVT_SS2SI_64>, XS, REX_W;
+defm CVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64,
+ "cvttsd2si\t{$src, $dst|$dst, $src}",
+ SSE_CVT_SD2SI>, XD;
+defm CVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64,
+ "cvttsd2si\t{$src, $dst|$dst, $src}",
+ SSE_CVT_SD2SI>, XD, REX_W;
+defm CVTSI2SS : sse12_cvt_s<0x2A, GR32, FR32, sint_to_fp, i32mem, loadi32,
+ "cvtsi2ss{l}\t{$src, $dst|$dst, $src}",
+ SSE_CVT_Scalar>, XS;
+defm CVTSI2SS64 : sse12_cvt_s<0x2A, GR64, FR32, sint_to_fp, i64mem, loadi64,
+ "cvtsi2ss{q}\t{$src, $dst|$dst, $src}",
+ SSE_CVT_Scalar>, XS, REX_W;
+defm CVTSI2SD : sse12_cvt_s<0x2A, GR32, FR64, sint_to_fp, i32mem, loadi32,
+ "cvtsi2sd{l}\t{$src, $dst|$dst, $src}",
+ SSE_CVT_Scalar>, XD;
+defm CVTSI2SD64 : sse12_cvt_s<0x2A, GR64, FR64, sint_to_fp, i64mem, loadi64,
+ "cvtsi2sd{q}\t{$src, $dst|$dst, $src}",
+ SSE_CVT_Scalar>, XD, REX_W;
+
+def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
+ (CVTTSS2SIrr GR32:$dst, FR32:$src), 0>;
+def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
+ (CVTTSS2SIrm GR32:$dst, f32mem:$src), 0>;
+def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
+ (CVTTSD2SIrr GR32:$dst, FR64:$src), 0>;
+def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
+ (CVTTSD2SIrm GR32:$dst, f64mem:$src), 0>;
+def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
+ (CVTTSS2SI64rr GR64:$dst, FR32:$src), 0>;
+def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
+ (CVTTSS2SI64rm GR64:$dst, f32mem:$src), 0>;
+def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
+ (CVTTSD2SI64rr GR64:$dst, FR64:$src), 0>;
+def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
+ (CVTTSD2SI64rm GR64:$dst, f64mem:$src), 0>;
+
+def : InstAlias<"cvtsi2ss\t{$src, $dst|$dst, $src}",
+ (CVTSI2SSrm FR64:$dst, i32mem:$src), 0>;
+def : InstAlias<"cvtsi2sd\t{$src, $dst|$dst, $src}",
+ (CVTSI2SDrm FR64:$dst, i32mem:$src), 0>;
+
+// Conversion Instructions Intrinsics - Match intrinsics which expect MM
+// and/or XMM operand(s).
+
+// FIXME: We probably want to match the rm form only when optimizing for
+// size, to avoid false dependencies (see sse_fp_unop_s for details).
+multiclass sse12_cvt_sint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
+ Intrinsic Int, Operand memop, ComplexPattern mem_cpat,
+ string asm, OpndItins itins> {
+ def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
+ !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
+ [(set DstRC:$dst, (Int SrcRC:$src))], itins.rr>,
+ Sched<[itins.Sched]>;
+ def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src),
+ !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
+ [(set DstRC:$dst, (Int mem_cpat:$src))], itins.rm>,
+ Sched<[itins.Sched.Folded]>;
+}
+
+multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC,
+ RegisterClass DstRC, Intrinsic Int, X86MemOperand x86memop,
+ PatFrag ld_frag, string asm, OpndItins itins,
+ bit Is2Addr = 1> {
+ def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src2),
+ !if(Is2Addr,
+ !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set DstRC:$dst, (Int DstRC:$src1, SrcRC:$src2))],
+ itins.rr>, Sched<[itins.Sched]>;
+ def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst),
+ (ins DstRC:$src1, x86memop:$src2),
+ !if(Is2Addr,
+ !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set DstRC:$dst, (Int DstRC:$src1, (ld_frag addr:$src2)))],
+ itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+
+let Predicates = [UseAVX] in {
+defm VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32,
+ int_x86_sse2_cvtsd2si, sdmem, sse_load_f64, "cvtsd2si",
+ SSE_CVT_SD2SI>, XD, VEX, VEX_LIG;
+defm VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64,
+ int_x86_sse2_cvtsd2si64, sdmem, sse_load_f64, "cvtsd2si",
+ SSE_CVT_SD2SI>, XD, VEX, VEX_W, VEX_LIG;
+}
+defm CVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse2_cvtsd2si,
+ sdmem, sse_load_f64, "cvtsd2si", SSE_CVT_SD2SI>, XD;
+defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse2_cvtsd2si64,
+ sdmem, sse_load_f64, "cvtsd2si", SSE_CVT_SD2SI>, XD, REX_W;
+
+
+let isCodeGenOnly = 1 in {
+ let Predicates = [UseAVX] in {
+ defm Int_VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
+ int_x86_sse_cvtsi2ss, i32mem, loadi32, "cvtsi2ss{l}",
+ SSE_CVT_Scalar, 0>, XS, VEX_4V;
+ defm Int_VCVTSI2SS64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
+ int_x86_sse_cvtsi642ss, i64mem, loadi64, "cvtsi2ss{q}",
+ SSE_CVT_Scalar, 0>, XS, VEX_4V,
+ VEX_W;
+ defm Int_VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
+ int_x86_sse2_cvtsi2sd, i32mem, loadi32, "cvtsi2sd{l}",
+ SSE_CVT_Scalar, 0>, XD, VEX_4V;
+ defm Int_VCVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
+ int_x86_sse2_cvtsi642sd, i64mem, loadi64, "cvtsi2sd{q}",
+ SSE_CVT_Scalar, 0>, XD,
+ VEX_4V, VEX_W;
+ }
+ let Constraints = "$src1 = $dst" in {
+ defm Int_CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
+ int_x86_sse_cvtsi2ss, i32mem, loadi32,
+ "cvtsi2ss{l}", SSE_CVT_Scalar>, XS;
+ defm Int_CVTSI2SS64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
+ int_x86_sse_cvtsi642ss, i64mem, loadi64,
+ "cvtsi2ss{q}", SSE_CVT_Scalar>, XS, REX_W;
+ defm Int_CVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
+ int_x86_sse2_cvtsi2sd, i32mem, loadi32,
+ "cvtsi2sd{l}", SSE_CVT_Scalar>, XD;
+ defm Int_CVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
+ int_x86_sse2_cvtsi642sd, i64mem, loadi64,
+ "cvtsi2sd{q}", SSE_CVT_Scalar>, XD, REX_W;
+ }
+} // isCodeGenOnly = 1
+
+/// SSE 1 Only
+
+// Aliases for intrinsics
+let isCodeGenOnly = 1 in {
+let Predicates = [UseAVX] in {
+defm Int_VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si,
+ ssmem, sse_load_f32, "cvttss2si",
+ SSE_CVT_SS2SI_32>, XS, VEX;
+defm Int_VCVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
+ int_x86_sse_cvttss2si64, ssmem, sse_load_f32,
+ "cvttss2si", SSE_CVT_SS2SI_64>,
+ XS, VEX, VEX_W;
+defm Int_VCVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si,
+ sdmem, sse_load_f64, "cvttsd2si",
+ SSE_CVT_SD2SI>, XD, VEX;
+defm Int_VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
+ int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64,
+ "cvttsd2si", SSE_CVT_SD2SI>,
+ XD, VEX, VEX_W;
+}
+defm Int_CVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si,
+ ssmem, sse_load_f32, "cvttss2si",
+ SSE_CVT_SS2SI_32>, XS;
+defm Int_CVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
+ int_x86_sse_cvttss2si64, ssmem, sse_load_f32,
+ "cvttss2si", SSE_CVT_SS2SI_64>, XS, REX_W;
+defm Int_CVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si,
+ sdmem, sse_load_f64, "cvttsd2si",
+ SSE_CVT_SD2SI>, XD;
+defm Int_CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
+ int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64,
+ "cvttsd2si", SSE_CVT_SD2SI>, XD, REX_W;
+} // isCodeGenOnly = 1
+
+let Predicates = [UseAVX] in {
+defm VCVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si,
+ ssmem, sse_load_f32, "cvtss2si",
+ SSE_CVT_SS2SI_32>, XS, VEX, VEX_LIG;
+defm VCVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64,
+ ssmem, sse_load_f32, "cvtss2si",
+ SSE_CVT_SS2SI_64>, XS, VEX, VEX_W, VEX_LIG;
+}
+defm CVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si,
+ ssmem, sse_load_f32, "cvtss2si",
+ SSE_CVT_SS2SI_32>, XS;
+defm CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64,
+ ssmem, sse_load_f32, "cvtss2si",
+ SSE_CVT_SS2SI_64>, XS, REX_W;
+
+defm VCVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, i128mem,
+ "vcvtdq2ps\t{$src, $dst|$dst, $src}",
+ SSEPackedSingle, SSE_CVT_PS>,
+ PS, VEX, Requires<[HasAVX]>;
+defm VCVTDQ2PSY : sse12_cvt_p<0x5B, VR256, VR256, i256mem,
+ "vcvtdq2ps\t{$src, $dst|$dst, $src}",
+ SSEPackedSingle, SSE_CVT_PS>,
+ PS, VEX, VEX_L, Requires<[HasAVX]>;
+
+defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, i128mem,
+ "cvtdq2ps\t{$src, $dst|$dst, $src}",
+ SSEPackedSingle, SSE_CVT_PS>,
+ PS, Requires<[UseSSE2]>;
+
+let Predicates = [UseAVX] in {
+def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
+ (VCVTSS2SIrr GR32:$dst, VR128:$src), 0>;
+def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
+ (VCVTSS2SIrm GR32:$dst, ssmem:$src), 0>;
+def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
+ (VCVTSD2SIrr GR32:$dst, VR128:$src), 0>;
+def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
+ (VCVTSD2SIrm GR32:$dst, sdmem:$src), 0>;
+def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
+ (VCVTSS2SI64rr GR64:$dst, VR128:$src), 0>;
+def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
+ (VCVTSS2SI64rm GR64:$dst, ssmem:$src), 0>;
+def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
+ (VCVTSD2SI64rr GR64:$dst, VR128:$src), 0>;
+def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
+ (VCVTSD2SI64rm GR64:$dst, sdmem:$src), 0>;
+}
+
+def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
+ (CVTSS2SIrr GR32:$dst, VR128:$src), 0>;
+def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
+ (CVTSS2SIrm GR32:$dst, ssmem:$src), 0>;
+def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
+ (CVTSD2SIrr GR32:$dst, VR128:$src), 0>;
+def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
+ (CVTSD2SIrm GR32:$dst, sdmem:$src), 0>;
+def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
+ (CVTSS2SI64rr GR64:$dst, VR128:$src), 0>;
+def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
+ (CVTSS2SI64rm GR64:$dst, ssmem:$src), 0>;
+def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
+ (CVTSD2SI64rr GR64:$dst, VR128:$src), 0>;
+def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
+ (CVTSD2SI64rm GR64:$dst, sdmem:$src), 0>;
+
+/// SSE 2 Only
+
+// Convert scalar double to scalar single
+let hasSideEffects = 0, Predicates = [UseAVX] in {
+def VCVTSD2SSrr : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst),
+ (ins FR64:$src1, FR64:$src2),
+ "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [],
+ IIC_SSE_CVT_Scalar_RR>, VEX_4V, VEX_LIG,
+ Sched<[WriteCvtF2F]>;
+let mayLoad = 1 in
+def VCVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst),
+ (ins FR64:$src1, f64mem:$src2),
+ "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [], IIC_SSE_CVT_Scalar_RM>,
+ XD, Requires<[HasAVX, OptForSize]>, VEX_4V, VEX_LIG,
+ Sched<[WriteCvtF2FLd, ReadAfterLd]>;
+}
+
+def : Pat<(f32 (fround FR64:$src)), (VCVTSD2SSrr FR64:$src, FR64:$src)>,
+ Requires<[UseAVX]>;
+
+def CVTSD2SSrr : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
+ "cvtsd2ss\t{$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (fround FR64:$src))],
+ IIC_SSE_CVT_Scalar_RR>, Sched<[WriteCvtF2F]>;
+def CVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src),
+ "cvtsd2ss\t{$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (fround (loadf64 addr:$src)))],
+ IIC_SSE_CVT_Scalar_RM>,
+ XD,
+ Requires<[UseSSE2, OptForSize]>, Sched<[WriteCvtF2FLd]>;
+
+let isCodeGenOnly = 1 in {
+def Int_VCVTSD2SSrr: I<0x5A, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst,
+ (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))],
+ IIC_SSE_CVT_Scalar_RR>, XD, VEX_4V, Requires<[HasAVX]>,
+ Sched<[WriteCvtF2F]>;
+def Int_VCVTSD2SSrm: I<0x5A, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
+ "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst, (int_x86_sse2_cvtsd2ss
+ VR128:$src1, sse_load_f64:$src2))],
+ IIC_SSE_CVT_Scalar_RM>, XD, VEX_4V, Requires<[HasAVX]>,
+ Sched<[WriteCvtF2FLd, ReadAfterLd]>;
+
+let Constraints = "$src1 = $dst" in {
+def Int_CVTSD2SSrr: I<0x5A, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))],
+ IIC_SSE_CVT_Scalar_RR>, XD, Requires<[UseSSE2]>,
+ Sched<[WriteCvtF2F]>;
+def Int_CVTSD2SSrm: I<0x5A, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
+ "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (int_x86_sse2_cvtsd2ss
+ VR128:$src1, sse_load_f64:$src2))],
+ IIC_SSE_CVT_Scalar_RM>, XD, Requires<[UseSSE2]>,
+ Sched<[WriteCvtF2FLd, ReadAfterLd]>;
+}
+} // isCodeGenOnly = 1
+
+// Convert scalar single to scalar double
+// SSE2 instructions with XS prefix
+let hasSideEffects = 0, Predicates = [UseAVX] in {
+def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst),
+ (ins FR32:$src1, FR32:$src2),
+ "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [], IIC_SSE_CVT_Scalar_RR>,
+ XS, Requires<[HasAVX]>, VEX_4V, VEX_LIG,
+ Sched<[WriteCvtF2F]>;
+let mayLoad = 1 in
+def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst),
+ (ins FR32:$src1, f32mem:$src2),
+ "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [], IIC_SSE_CVT_Scalar_RM>,
+ XS, VEX_4V, VEX_LIG, Requires<[HasAVX, OptForSize]>,
+ Sched<[WriteCvtF2FLd, ReadAfterLd]>;
+}
+
+def : Pat<(f64 (fextend FR32:$src)),
+ (VCVTSS2SDrr FR32:$src, FR32:$src)>, Requires<[UseAVX]>;
+def : Pat<(fextend (loadf32 addr:$src)),
+ (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX]>;
+
+def : Pat<(extloadf32 addr:$src),
+ (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>,
+ Requires<[UseAVX, OptForSize]>;
+def : Pat<(extloadf32 addr:$src),
+ (VCVTSS2SDrr (f32 (IMPLICIT_DEF)), (VMOVSSrm addr:$src))>,
+ Requires<[UseAVX, OptForSpeed]>;
+
+def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
+ "cvtss2sd\t{$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (fextend FR32:$src))],
+ IIC_SSE_CVT_Scalar_RR>, XS,
+ Requires<[UseSSE2]>, Sched<[WriteCvtF2F]>;
+def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src),
+ "cvtss2sd\t{$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (extloadf32 addr:$src))],
+ IIC_SSE_CVT_Scalar_RM>, XS,
+ Requires<[UseSSE2, OptForSize]>, Sched<[WriteCvtF2FLd]>;
+
+// extload f32 -> f64.  This matches load+fextend because we have a hack in
+// the isel (PreprocessForFPConvert) that can introduce loads after dag
+// combine.  Since these loads aren't folded into the fextend, we have to
+// match them explicitly here.
+def : Pat<(fextend (loadf32 addr:$src)),
+ (CVTSS2SDrm addr:$src)>, Requires<[UseSSE2]>;
+def : Pat<(extloadf32 addr:$src),
+ (CVTSS2SDrr (MOVSSrm addr:$src))>, Requires<[UseSSE2, OptForSpeed]>;
+
+let isCodeGenOnly = 1 in {
+def Int_VCVTSS2SDrr: I<0x5A, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst,
+ (int_x86_sse2_cvtss2sd VR128:$src1, VR128:$src2))],
+ IIC_SSE_CVT_Scalar_RR>, XS, VEX_4V, Requires<[HasAVX]>,
+ Sched<[WriteCvtF2F]>;
+def Int_VCVTSS2SDrm: I<0x5A, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
+ "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst,
+ (int_x86_sse2_cvtss2sd VR128:$src1, sse_load_f32:$src2))],
+ IIC_SSE_CVT_Scalar_RM>, XS, VEX_4V, Requires<[HasAVX]>,
+ Sched<[WriteCvtF2FLd, ReadAfterLd]>;
+let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix
+def Int_CVTSS2SDrr: I<0x5A, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "cvtss2sd\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (int_x86_sse2_cvtss2sd VR128:$src1, VR128:$src2))],
+ IIC_SSE_CVT_Scalar_RR>, XS, Requires<[UseSSE2]>,
+ Sched<[WriteCvtF2F]>;
+def Int_CVTSS2SDrm: I<0x5A, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
+ "cvtss2sd\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (int_x86_sse2_cvtss2sd VR128:$src1, sse_load_f32:$src2))],
+ IIC_SSE_CVT_Scalar_RM>, XS, Requires<[UseSSE2]>,
+ Sched<[WriteCvtF2FLd, ReadAfterLd]>;
+}
+} // isCodeGenOnly = 1
+
+// Convert packed single/double fp to doubleword
+def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvtps2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))],
+ IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>;
+def VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "cvtps2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (int_x86_sse2_cvtps2dq (loadv4f32 addr:$src)))],
+ IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>;
+def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
+ "cvtps2dq\t{$src, $dst|$dst, $src}",
+ [(set VR256:$dst,
+ (int_x86_avx_cvt_ps2dq_256 VR256:$src))],
+ IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
+def VCVTPS2DQYrm : VPDI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
+ "cvtps2dq\t{$src, $dst|$dst, $src}",
+ [(set VR256:$dst,
+ (int_x86_avx_cvt_ps2dq_256 (loadv8f32 addr:$src)))],
+ IIC_SSE_CVT_PS_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
+def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvtps2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))],
+ IIC_SSE_CVT_PS_RR>, Sched<[WriteCvtF2I]>;
+def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "cvtps2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (int_x86_sse2_cvtps2dq (memopv4f32 addr:$src)))],
+ IIC_SSE_CVT_PS_RM>, Sched<[WriteCvtF2ILd]>;
+
+
+// Convert Packed Double FP to Packed DW Integers
+let Predicates = [HasAVX] in {
+// The assembler can recognize rr 256-bit instructions by seeing a ymm
+// register, but the same isn't true when using memory operands instead.
+// Provide explicit "x" and "y" suffixed assembly forms to address this.
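+// For example, the register forms are self-describing:
+//   vcvtpd2dq %xmm1, %xmm0     ; 128-bit source
+//   vcvtpd2dq %ymm1, %xmm0     ; 256-bit source
+// but "vcvtpd2dq (%rax), %xmm0" could be either width, hence the explicit
+// vcvtpd2dqx / vcvtpd2dqy spellings defined below.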
+def VCVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "vcvtpd2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))]>,
+ VEX, Sched<[WriteCvtF2I]>;
+
+// XMM only
+def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
+ (VCVTPD2DQrr VR128:$dst, VR128:$src), 0>;
+def VCVTPD2DQXrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "vcvtpd2dqx\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (int_x86_sse2_cvtpd2dq (loadv2f64 addr:$src)))]>, VEX,
+ Sched<[WriteCvtF2ILd]>;
+
+// YMM only
+def VCVTPD2DQYrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
+ "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (int_x86_avx_cvt_pd2dq_256 VR256:$src))]>, VEX, VEX_L,
+ Sched<[WriteCvtF2I]>;
+def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
+ "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (int_x86_avx_cvt_pd2dq_256 (loadv4f64 addr:$src)))]>,
+ VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
+def : InstAlias<"vcvtpd2dq\t{$src, $dst|$dst, $src}",
+ (VCVTPD2DQYrr VR128:$dst, VR256:$src), 0>;
+}
+
+def CVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "cvtpd2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (int_x86_sse2_cvtpd2dq (memopv2f64 addr:$src)))],
+ IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtF2ILd]>;
+def CVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvtpd2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))],
+ IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtF2I]>;
+
+// Convert packed single/double fp to doubleword, with truncation
+// SSE2 packed instructions with XS prefix
+def VCVTTPS2DQrr : VS2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvttps2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (int_x86_sse2_cvttps2dq VR128:$src))],
+ IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>;
+def VCVTTPS2DQrm : VS2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "cvttps2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvttps2dq
+ (loadv4f32 addr:$src)))],
+ IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>;
+def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
+ "cvttps2dq\t{$src, $dst|$dst, $src}",
+ [(set VR256:$dst,
+ (int_x86_avx_cvtt_ps2dq_256 VR256:$src))],
+ IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
+def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
+ "cvttps2dq\t{$src, $dst|$dst, $src}",
+ [(set VR256:$dst, (int_x86_avx_cvtt_ps2dq_256
+ (loadv8f32 addr:$src)))],
+ IIC_SSE_CVT_PS_RM>, VEX, VEX_L,
+ Sched<[WriteCvtF2ILd]>;
+
+def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvttps2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvttps2dq VR128:$src))],
+ IIC_SSE_CVT_PS_RR>, Sched<[WriteCvtF2I]>;
+def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "cvttps2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (int_x86_sse2_cvttps2dq (memopv4f32 addr:$src)))],
+ IIC_SSE_CVT_PS_RM>, Sched<[WriteCvtF2ILd]>;
+
+let Predicates = [HasAVX] in {
+ def : Pat<(int_x86_sse2_cvtdq2ps VR128:$src),
+ (VCVTDQ2PSrr VR128:$src)>;
+ def : Pat<(int_x86_sse2_cvtdq2ps (bc_v4i32 (loadv2i64 addr:$src))),
+ (VCVTDQ2PSrm addr:$src)>;
+}
+
+let Predicates = [HasAVX, NoVLX] in {
+ def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))),
+ (VCVTDQ2PSrr VR128:$src)>;
+ def : Pat<(v4f32 (sint_to_fp (bc_v4i32 (loadv2i64 addr:$src)))),
+ (VCVTDQ2PSrm addr:$src)>;
+
+ def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))),
+ (VCVTTPS2DQrr VR128:$src)>;
+ def : Pat<(v4i32 (fp_to_sint (loadv4f32 addr:$src))),
+ (VCVTTPS2DQrm addr:$src)>;
+
+ def : Pat<(v8f32 (sint_to_fp (v8i32 VR256:$src))),
+ (VCVTDQ2PSYrr VR256:$src)>;
+ def : Pat<(v8f32 (sint_to_fp (bc_v8i32 (loadv4i64 addr:$src)))),
+ (VCVTDQ2PSYrm addr:$src)>;
+
+ def : Pat<(v8i32 (fp_to_sint (v8f32 VR256:$src))),
+ (VCVTTPS2DQYrr VR256:$src)>;
+ def : Pat<(v8i32 (fp_to_sint (loadv8f32 addr:$src))),
+ (VCVTTPS2DQYrm addr:$src)>;
+}
+
+let Predicates = [UseSSE2] in {
+ def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))),
+ (CVTDQ2PSrr VR128:$src)>;
+ def : Pat<(v4f32 (sint_to_fp (bc_v4i32 (memopv2i64 addr:$src)))),
+ (CVTDQ2PSrm addr:$src)>;
+
+ def : Pat<(int_x86_sse2_cvtdq2ps VR128:$src),
+ (CVTDQ2PSrr VR128:$src)>;
+ def : Pat<(int_x86_sse2_cvtdq2ps (bc_v4i32 (memopv2i64 addr:$src))),
+ (CVTDQ2PSrm addr:$src)>;
+
+ def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))),
+ (CVTTPS2DQrr VR128:$src)>;
+ def : Pat<(v4i32 (fp_to_sint (memopv4f32 addr:$src))),
+ (CVTTPS2DQrm addr:$src)>;
+}
+
+def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvttpd2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (int_x86_sse2_cvttpd2dq VR128:$src))],
+ IIC_SSE_CVT_PD_RR>, VEX, Sched<[WriteCvtF2I]>;
+
+// The assembler can recognize rr 256-bit instructions by seeing a ymm
+// register, but the same isn't true when using memory operands instead.
+// Provide explicit "x" and "y" suffixed assembly forms to address this.
+
+// XMM only
+def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}",
+ (VCVTTPD2DQrr VR128:$dst, VR128:$src), 0>;
+def VCVTTPD2DQXrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "cvttpd2dqx\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvttpd2dq
+ (loadv2f64 addr:$src)))],
+ IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2ILd]>;
+
+// YMM only
+def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
+ "cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (int_x86_avx_cvtt_pd2dq_256 VR256:$src))],
+ IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
+def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
+ "cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (int_x86_avx_cvtt_pd2dq_256 (loadv4f64 addr:$src)))],
+ IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
+def : InstAlias<"vcvttpd2dq\t{$src, $dst|$dst, $src}",
+ (VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0>;
+
+let Predicates = [HasAVX, NoVLX] in {
+ def : Pat<(v4i32 (fp_to_sint (v4f64 VR256:$src))),
+ (VCVTTPD2DQYrr VR256:$src)>;
+ def : Pat<(v4i32 (fp_to_sint (loadv4f64 addr:$src))),
+ (VCVTTPD2DQYrm addr:$src)>;
+} // Predicates = [HasAVX, NoVLX]
+
+def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvttpd2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvttpd2dq VR128:$src))],
+ IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtF2I]>;
+def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src),
+ "cvttpd2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvttpd2dq
+ (memopv2f64 addr:$src)))],
+ IIC_SSE_CVT_PD_RM>,
+ Sched<[WriteCvtF2ILd]>;
+
+// Convert packed single to packed double
+let Predicates = [HasAVX] in {
+ // SSE2 instructions without OpSize prefix
+def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "vcvtps2pd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))],
+ IIC_SSE_CVT_PD_RR>, PS, VEX, Sched<[WriteCvtF2F]>;
+def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
+ "vcvtps2pd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))],
+ IIC_SSE_CVT_PD_RM>, PS, VEX, Sched<[WriteCvtF2FLd]>;
+def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
+ "vcvtps2pd\t{$src, $dst|$dst, $src}",
+ [(set VR256:$dst,
+ (int_x86_avx_cvt_ps2_pd_256 VR128:$src))],
+ IIC_SSE_CVT_PD_RR>, PS, VEX, VEX_L, Sched<[WriteCvtF2F]>;
+def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src),
+ "vcvtps2pd\t{$src, $dst|$dst, $src}",
+ [(set VR256:$dst,
+ (int_x86_avx_cvt_ps2_pd_256 (loadv4f32 addr:$src)))],
+ IIC_SSE_CVT_PD_RM>, PS, VEX, VEX_L, Sched<[WriteCvtF2FLd]>;
+}
+
+let Predicates = [UseSSE2] in {
+def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvtps2pd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))],
+ IIC_SSE_CVT_PD_RR>, PS, Sched<[WriteCvtF2F]>;
+def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
+ "cvtps2pd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))],
+ IIC_SSE_CVT_PD_RM>, PS, Sched<[WriteCvtF2FLd]>;
+}
+
+// Convert Packed DW Integers to Packed Double FP
+let Predicates = [HasAVX] in {
+let hasSideEffects = 0, mayLoad = 1 in
+def VCVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+ "vcvtdq2pd\t{$src, $dst|$dst, $src}",
+ []>, VEX, Sched<[WriteCvtI2FLd]>;
+def VCVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "vcvtdq2pd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (int_x86_sse2_cvtdq2pd VR128:$src))]>, VEX,
+ Sched<[WriteCvtI2F]>;
+def VCVTDQ2PDYrm : S2SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src),
+ "vcvtdq2pd\t{$src, $dst|$dst, $src}",
+ [(set VR256:$dst,
+ (int_x86_avx_cvtdq2_pd_256
+ (bitconvert (loadv2i64 addr:$src))))]>, VEX, VEX_L,
+ Sched<[WriteCvtI2FLd]>;
+def VCVTDQ2PDYrr : S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
+ "vcvtdq2pd\t{$src, $dst|$dst, $src}",
+ [(set VR256:$dst,
+ (int_x86_avx_cvtdq2_pd_256 VR128:$src))]>, VEX, VEX_L,
+ Sched<[WriteCvtI2F]>;
+}
+
+let hasSideEffects = 0, mayLoad = 1 in
+def CVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+ "cvtdq2pd\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtI2FLd]>;
+def CVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvtdq2pd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvtdq2pd VR128:$src))],
+ IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtI2F]>;
+
+// AVX register conversion intrinsics
+let Predicates = [HasAVX] in {
+ def : Pat<(v2f64 (X86cvtdq2pd (v4i32 VR128:$src))),
+ (VCVTDQ2PDrr VR128:$src)>;
+ def : Pat<(v2f64 (X86cvtdq2pd (bc_v4i32 (loadv2i64 addr:$src)))),
+ (VCVTDQ2PDrm addr:$src)>;
+
+ def : Pat<(v4f64 (sint_to_fp (v4i32 VR128:$src))),
+ (VCVTDQ2PDYrr VR128:$src)>;
+ def : Pat<(v4f64 (sint_to_fp (bc_v4i32 (loadv2i64 addr:$src)))),
+ (VCVTDQ2PDYrm addr:$src)>;
+} // Predicates = [HasAVX]
+
+// SSE2 register conversion intrinsics
+let Predicates = [HasSSE2] in {
+ def : Pat<(v2f64 (X86cvtdq2pd (v4i32 VR128:$src))),
+ (CVTDQ2PDrr VR128:$src)>;
+ def : Pat<(v2f64 (X86cvtdq2pd (bc_v4i32 (loadv2i64 addr:$src)))),
+ (CVTDQ2PDrm addr:$src)>;
+} // Predicates = [HasSSE2]
+
+// Convert packed double to packed single
+// The assembler can recognize rr 256-bit instructions by seeing a ymm
+// register, but the same isn't true when using memory operands instead.
+// Provide explicit "x" and "y" suffixed assembly forms to address this.
+def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvtpd2ps\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))],
+ IIC_SSE_CVT_PD_RR>, VEX, Sched<[WriteCvtF2F]>;
+
+// XMM only
+def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}",
+ (VCVTPD2PSrr VR128:$dst, VR128:$src), 0>;
+def VCVTPD2PSXrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "cvtpd2psx\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (int_x86_sse2_cvtpd2ps (loadv2f64 addr:$src)))],
+ IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2FLd]>;
+
+// YMM only
+def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
+ "cvtpd2ps{y}\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (int_x86_avx_cvt_pd2_ps_256 VR256:$src))],
+ IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2F]>;
+def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
+ "cvtpd2ps{y}\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (int_x86_avx_cvt_pd2_ps_256 (loadv4f64 addr:$src)))],
+ IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2FLd]>;
+def : InstAlias<"vcvtpd2ps\t{$src, $dst|$dst, $src}",
+ (VCVTPD2PSYrr VR128:$dst, VR256:$src), 0>;
+
+def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvtpd2ps\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))],
+ IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtF2F]>;
+def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "cvtpd2ps\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (int_x86_sse2_cvtpd2ps (memopv2f64 addr:$src)))],
+ IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtF2FLd]>;
+
+
+// AVX 256-bit register conversion intrinsics
+// FIXME: Migrate SSE conversion intrinsic matching to use patterns as below
+// whenever possible, to avoid declaring two versions of each one.
+let Predicates = [HasAVX] in {
+ def : Pat<(int_x86_avx_cvtdq2_ps_256 VR256:$src),
+ (VCVTDQ2PSYrr VR256:$src)>;
+ def : Pat<(int_x86_avx_cvtdq2_ps_256 (bitconvert (loadv4i64 addr:$src))),
+ (VCVTDQ2PSYrm addr:$src)>;
+}
+
+let Predicates = [HasAVX, NoVLX] in {
+ // Match fround and fextend for 128/256-bit conversions
+ def : Pat<(v4f32 (X86vfpround (v2f64 VR128:$src))),
+ (VCVTPD2PSrr VR128:$src)>;
+ def : Pat<(v4f32 (X86vfpround (loadv2f64 addr:$src))),
+ (VCVTPD2PSXrm addr:$src)>;
+ def : Pat<(v4f32 (fround (v4f64 VR256:$src))),
+ (VCVTPD2PSYrr VR256:$src)>;
+ def : Pat<(v4f32 (fround (loadv4f64 addr:$src))),
+ (VCVTPD2PSYrm addr:$src)>;
+
+ def : Pat<(v2f64 (X86vfpext (v4f32 VR128:$src))),
+ (VCVTPS2PDrr VR128:$src)>;
+ def : Pat<(v4f64 (fextend (v4f32 VR128:$src))),
+ (VCVTPS2PDYrr VR128:$src)>;
+ def : Pat<(v4f64 (extloadv4f32 addr:$src)),
+ (VCVTPS2PDYrm addr:$src)>;
+}
+
+let Predicates = [UseSSE2] in {
+ // Match fround and fextend for 128 conversions
+ def : Pat<(v4f32 (X86vfpround (v2f64 VR128:$src))),
+ (CVTPD2PSrr VR128:$src)>;
+ def : Pat<(v4f32 (X86vfpround (memopv2f64 addr:$src))),
+ (CVTPD2PSrm addr:$src)>;
+
+ def : Pat<(v2f64 (X86vfpext (v4f32 VR128:$src))),
+ (CVTPS2PDrr VR128:$src)>;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Compare Instructions
+//===----------------------------------------------------------------------===//
+
+// sse12_cmp_scalar - sse 1 & 2 compare scalar instructions
+multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop,
+ Operand CC, SDNode OpNode, ValueType VT,
+ PatFrag ld_frag, string asm, string asm_alt,
+ OpndItins itins, ImmLeaf immLeaf> {
+ def rr : SIi8<0xC2, MRMSrcReg,
+ (outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm,
+ [(set RC:$dst, (OpNode (VT RC:$src1), RC:$src2, immLeaf:$cc))],
+ itins.rr>, Sched<[itins.Sched]>;
+ def rm : SIi8<0xC2, MRMSrcMem,
+ (outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm,
+ [(set RC:$dst, (OpNode (VT RC:$src1),
+ (ld_frag addr:$src2), immLeaf:$cc))],
+ itins.rm>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
+
+ // Accept explicit immediate argument form instead of comparison code.
+ let isAsmParserOnly = 1, hasSideEffects = 0 in {
+ def rr_alt : SIi8<0xC2, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, u8imm:$cc), asm_alt, [],
+ IIC_SSE_ALU_F32S_RR>, Sched<[itins.Sched]>;
+ let mayLoad = 1 in
+ def rm_alt : SIi8<0xC2, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm_alt, [],
+ IIC_SSE_ALU_F32S_RM>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ }
+}
+
+defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, AVXCC, X86cmps, f32, loadf32,
+ "cmp${cc}ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ "cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
+ SSE_ALU_F32S, i8immZExt5>, XS, VEX_4V, VEX_LIG;
+defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, AVXCC, X86cmps, f64, loadf64,
+ "cmp${cc}sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ "cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
+ SSE_ALU_F32S, i8immZExt5>, // same latency as 32-bit compare
+ XD, VEX_4V, VEX_LIG;
+
+let Constraints = "$src1 = $dst" in {
+ defm CMPSS : sse12_cmp_scalar<FR32, f32mem, SSECC, X86cmps, f32, loadf32,
+ "cmp${cc}ss\t{$src2, $dst|$dst, $src2}",
+ "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}", SSE_ALU_F32S,
+ i8immZExt3>, XS;
+ defm CMPSD : sse12_cmp_scalar<FR64, f64mem, SSECC, X86cmps, f64, loadf64,
+ "cmp${cc}sd\t{$src2, $dst|$dst, $src2}",
+ "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
+ SSE_ALU_F64S, i8immZExt3>, XD;
+}
+
+multiclass sse12_cmp_scalar_int<X86MemOperand x86memop, Operand CC,
+ Intrinsic Int, string asm, OpndItins itins,
+ ImmLeaf immLeaf> {
+ def rr : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src, CC:$cc), asm,
+ [(set VR128:$dst, (Int VR128:$src1,
+ VR128:$src, immLeaf:$cc))],
+ itins.rr>,
+ Sched<[itins.Sched]>;
+ def rm : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, x86memop:$src, CC:$cc), asm,
+ [(set VR128:$dst, (Int VR128:$src1,
+ (load addr:$src), immLeaf:$cc))],
+ itins.rm>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+
+let isCodeGenOnly = 1 in {
+ // Aliases to match intrinsics which expect XMM operand(s).
+ defm Int_VCMPSS : sse12_cmp_scalar_int<f32mem, AVXCC, int_x86_sse_cmp_ss,
+ "cmp${cc}ss\t{$src, $src1, $dst|$dst, $src1, $src}",
+ SSE_ALU_F32S, i8immZExt5>,
+ XS, VEX_4V;
+ defm Int_VCMPSD : sse12_cmp_scalar_int<f64mem, AVXCC, int_x86_sse2_cmp_sd,
+ "cmp${cc}sd\t{$src, $src1, $dst|$dst, $src1, $src}",
+ SSE_ALU_F32S, i8immZExt5>, // same latency as f32
+ XD, VEX_4V;
+ let Constraints = "$src1 = $dst" in {
+ defm Int_CMPSS : sse12_cmp_scalar_int<f32mem, SSECC, int_x86_sse_cmp_ss,
+ "cmp${cc}ss\t{$src, $dst|$dst, $src}",
+ SSE_ALU_F32S, i8immZExt3>, XS;
+ defm Int_CMPSD : sse12_cmp_scalar_int<f64mem, SSECC, int_x86_sse2_cmp_sd,
+ "cmp${cc}sd\t{$src, $dst|$dst, $src}",
+ SSE_ALU_F64S, i8immZExt3>,
+ XD;
+}
+}
+
+
+// sse12_ord_cmp - Unordered/Ordered scalar fp compare and set EFLAGS
+multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDNode OpNode,
+ ValueType vt, X86MemOperand x86memop,
+ PatFrag ld_frag, string OpcodeStr> {
+ def rr: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
+ [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))],
+ IIC_SSE_COMIS_RR>,
+ Sched<[WriteFAdd]>;
+ def rm: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
+ [(set EFLAGS, (OpNode (vt RC:$src1),
+ (ld_frag addr:$src2)))],
+ IIC_SSE_COMIS_RM>,
+ Sched<[WriteFAddLd, ReadAfterLd]>;
+}
+
+let Defs = [EFLAGS] in {
+ defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
+ "ucomiss">, PS, VEX, VEX_LIG;
+ defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64,
+ "ucomisd">, PD, VEX, VEX_LIG;
+ let Pattern = []<dag> in {
+ defm VCOMISS : sse12_ord_cmp<0x2F, FR32, undef, f32, f32mem, loadf32,
+ "comiss">, PS, VEX, VEX_LIG;
+ defm VCOMISD : sse12_ord_cmp<0x2F, FR64, undef, f64, f64mem, loadf64,
+ "comisd">, PD, VEX, VEX_LIG;
+ }
+
+ let isCodeGenOnly = 1 in {
+ defm Int_VUCOMISS : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem,
+ load, "ucomiss">, PS, VEX;
+ defm Int_VUCOMISD : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, f128mem,
+ load, "ucomisd">, PD, VEX;
+
+ defm Int_VCOMISS : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem,
+ load, "comiss">, PS, VEX;
+ defm Int_VCOMISD : sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem,
+ load, "comisd">, PD, VEX;
+ }
+ defm UCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
+ "ucomiss">, PS;
+ defm UCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64,
+ "ucomisd">, PD;
+
+ let Pattern = []<dag> in {
+ defm COMISS : sse12_ord_cmp<0x2F, FR32, undef, f32, f32mem, loadf32,
+ "comiss">, PS;
+ defm COMISD : sse12_ord_cmp<0x2F, FR64, undef, f64, f64mem, loadf64,
+ "comisd">, PD;
+ }
+
+ let isCodeGenOnly = 1 in {
+ defm Int_UCOMISS : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem,
+ load, "ucomiss">, PS;
+ defm Int_UCOMISD : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, f128mem,
+ load, "ucomisd">, PD;
+
+ defm Int_COMISS : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem, load,
+ "comiss">, PS;
+ defm Int_COMISD : sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem, load,
+ "comisd">, PD;
+ }
+} // Defs = [EFLAGS]
+
+// sse12_cmp_packed - sse 1 & 2 compare packed instructions
+multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop,
+ Operand CC, Intrinsic Int, string asm,
+ string asm_alt, Domain d, ImmLeaf immLeaf,
+ PatFrag ld_frag, OpndItins itins = SSE_ALU_F32P> {
+ let isCommutable = 1 in
+ def rri : PIi8<0xC2, MRMSrcReg,
+ (outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm,
+ [(set RC:$dst, (Int RC:$src1, RC:$src2, immLeaf:$cc))],
+ itins.rr, d>,
+ Sched<[WriteFAdd]>;
+ def rmi : PIi8<0xC2, MRMSrcMem,
+ (outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm,
+ [(set RC:$dst, (Int RC:$src1, (ld_frag addr:$src2), immLeaf:$cc))],
+ itins.rm, d>,
+ Sched<[WriteFAddLd, ReadAfterLd]>;
+
+ // Accept explicit immediate argument form instead of comparison code.
+ let isAsmParserOnly = 1, hasSideEffects = 0 in {
+ def rri_alt : PIi8<0xC2, MRMSrcReg,
+ (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc),
+ asm_alt, [], itins.rr, d>, Sched<[WriteFAdd]>;
+ let mayLoad = 1 in
+ def rmi_alt : PIi8<0xC2, MRMSrcMem,
+ (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc),
+ asm_alt, [], itins.rm, d>,
+ Sched<[WriteFAddLd, ReadAfterLd]>;
+ }
+}
+
+defm VCMPPS : sse12_cmp_packed<VR128, f128mem, AVXCC, int_x86_sse_cmp_ps,
+ "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
+ SSEPackedSingle, i8immZExt5, loadv4f32>, PS, VEX_4V;
+defm VCMPPD : sse12_cmp_packed<VR128, f128mem, AVXCC, int_x86_sse2_cmp_pd,
+ "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
+ SSEPackedDouble, i8immZExt5, loadv2f64>, PD, VEX_4V;
+defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, AVXCC, int_x86_avx_cmp_ps_256,
+ "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
+ SSEPackedSingle, i8immZExt5, loadv8f32>, PS, VEX_4V, VEX_L;
+defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, AVXCC, int_x86_avx_cmp_pd_256,
+ "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
+ SSEPackedDouble, i8immZExt5, loadv4f64>, PD, VEX_4V, VEX_L;
+let Constraints = "$src1 = $dst" in {
+ defm CMPPS : sse12_cmp_packed<VR128, f128mem, SSECC, int_x86_sse_cmp_ps,
+ "cmp${cc}ps\t{$src2, $dst|$dst, $src2}",
+ "cmpps\t{$cc, $src2, $dst|$dst, $src2, $cc}",
+ SSEPackedSingle, i8immZExt5, memopv4f32, SSE_ALU_F32P>, PS;
+ defm CMPPD : sse12_cmp_packed<VR128, f128mem, SSECC, int_x86_sse2_cmp_pd,
+ "cmp${cc}pd\t{$src2, $dst|$dst, $src2}",
+ "cmppd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
+ SSEPackedDouble, i8immZExt5, memopv2f64, SSE_ALU_F64P>, PD;
+}
+
+let Predicates = [HasAVX] in {
+def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), VR128:$src2, imm:$cc)),
+ (VCMPPSrri (v4f32 VR128:$src1), (v4f32 VR128:$src2), imm:$cc)>;
+def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), (loadv4f32 addr:$src2), imm:$cc)),
+ (VCMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>;
+def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), VR128:$src2, imm:$cc)),
+ (VCMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>;
+def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), (loadv2f64 addr:$src2), imm:$cc)),
+ (VCMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>;
+
+def : Pat<(v8i32 (X86cmpp (v8f32 VR256:$src1), VR256:$src2, imm:$cc)),
+ (VCMPPSYrri (v8f32 VR256:$src1), (v8f32 VR256:$src2), imm:$cc)>;
+def : Pat<(v8i32 (X86cmpp (v8f32 VR256:$src1), (loadv8f32 addr:$src2), imm:$cc)),
+ (VCMPPSYrmi (v8f32 VR256:$src1), addr:$src2, imm:$cc)>;
+def : Pat<(v4i64 (X86cmpp (v4f64 VR256:$src1), VR256:$src2, imm:$cc)),
+ (VCMPPDYrri VR256:$src1, VR256:$src2, imm:$cc)>;
+def : Pat<(v4i64 (X86cmpp (v4f64 VR256:$src1), (loadv4f64 addr:$src2), imm:$cc)),
+ (VCMPPDYrmi VR256:$src1, addr:$src2, imm:$cc)>;
+}
+
+let Predicates = [UseSSE1] in {
+def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), VR128:$src2, imm:$cc)),
+ (CMPPSrri (v4f32 VR128:$src1), (v4f32 VR128:$src2), imm:$cc)>;
+def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), (memopv4f32 addr:$src2), imm:$cc)),
+ (CMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>;
+}
+
+let Predicates = [UseSSE2] in {
+def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), VR128:$src2, imm:$cc)),
+ (CMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>;
+def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), (memopv2f64 addr:$src2), imm:$cc)),
+ (CMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Shuffle Instructions
+//===----------------------------------------------------------------------===//
+
+/// sse12_shuffle - sse 1 & 2 fp shuffle instructions
+multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop,
+ ValueType vt, string asm, PatFrag mem_frag,
+ Domain d> {
+ def rmi : PIi8<0xC6, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop:$src2, u8imm:$src3), asm,
+ [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2),
+ (i8 imm:$src3))))], IIC_SSE_SHUFP, d>,
+ Sched<[WriteFShuffleLd, ReadAfterLd]>;
+ def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, u8imm:$src3), asm,
+ [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2,
+ (i8 imm:$src3))))], IIC_SSE_SHUFP, d>,
+ Sched<[WriteFShuffle]>;
+}
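+
+// Illustrative immediate decoding: each 2-bit field of $src3 selects one
+// element. The low two fields pick dst[0] and dst[1] from $src1; the high
+// two pick dst[2] and dst[3] from $src2. For example, with imm 0x4E:
+//   dst = { src1[2], src1[3], src2[0], src2[1] }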
+
+let Predicates = [HasAVX, NoVLX] in {
+ defm VSHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
+ "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ loadv4f32, SSEPackedSingle>, PS, VEX_4V;
+ defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32,
+ "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ loadv8f32, SSEPackedSingle>, PS, VEX_4V, VEX_L;
+ defm VSHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
+ "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ loadv2f64, SSEPackedDouble>, PD, VEX_4V;
+ defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64,
+ "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ loadv4f64, SSEPackedDouble>, PD, VEX_4V, VEX_L;
+}
+let Constraints = "$src1 = $dst" in {
+ defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
+ "shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ memopv4f32, SSEPackedSingle>, PS;
+ defm SHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
+ "shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ memopv2f64, SSEPackedDouble>, PD;
+}
+
+let Predicates = [HasAVX, NoVLX] in {
+ def : Pat<(v4i32 (X86Shufp VR128:$src1,
+ (bc_v4i32 (loadv2i64 addr:$src2)), (i8 imm:$imm))),
+ (VSHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>;
+ def : Pat<(v4i32 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+ (VSHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>;
+
+ def : Pat<(v2i64 (X86Shufp VR128:$src1,
+ (loadv2i64 addr:$src2), (i8 imm:$imm))),
+ (VSHUFPDrmi VR128:$src1, addr:$src2, imm:$imm)>;
+ def : Pat<(v2i64 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+ (VSHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>;
+
+ // 256-bit patterns
+ def : Pat<(v8i32 (X86Shufp VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+ (VSHUFPSYrri VR256:$src1, VR256:$src2, imm:$imm)>;
+ def : Pat<(v8i32 (X86Shufp VR256:$src1,
+ (bc_v8i32 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
+ (VSHUFPSYrmi VR256:$src1, addr:$src2, imm:$imm)>;
+
+ def : Pat<(v4i64 (X86Shufp VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+ (VSHUFPDYrri VR256:$src1, VR256:$src2, imm:$imm)>;
+ def : Pat<(v4i64 (X86Shufp VR256:$src1,
+ (loadv4i64 addr:$src2), (i8 imm:$imm))),
+ (VSHUFPDYrmi VR256:$src1, addr:$src2, imm:$imm)>;
+}
+
+let Predicates = [UseSSE1] in {
+ def : Pat<(v4i32 (X86Shufp VR128:$src1,
+ (bc_v4i32 (memopv2i64 addr:$src2)), (i8 imm:$imm))),
+ (SHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>;
+ def : Pat<(v4i32 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+ (SHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>;
+}
+
+let Predicates = [UseSSE2] in {
+ // Generic SHUFPD patterns
+ def : Pat<(v2i64 (X86Shufp VR128:$src1,
+ (memopv2i64 addr:$src2), (i8 imm:$imm))),
+ (SHUFPDrmi VR128:$src1, addr:$src2, imm:$imm)>;
+ def : Pat<(v2i64 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+ (SHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Unpack FP Instructions
+//===----------------------------------------------------------------------===//
+
+/// sse12_unpack_interleave - sse 1 & 2 fp unpack and interleave
+multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt,
+ PatFrag mem_frag, RegisterClass RC,
+ X86MemOperand x86memop, string asm,
+ Domain d> {
+ def rr : PI<opc, MRMSrcReg,
+ (outs RC:$dst), (ins RC:$src1, RC:$src2),
+ asm, [(set RC:$dst,
+ (vt (OpNode RC:$src1, RC:$src2)))],
+ IIC_SSE_UNPCK, d>, Sched<[WriteFShuffle]>;
+ def rm : PI<opc, MRMSrcMem,
+ (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+ asm, [(set RC:$dst,
+ (vt (OpNode RC:$src1,
+ (mem_frag addr:$src2))))],
+ IIC_SSE_UNPCK, d>,
+ Sched<[WriteFShuffleLd, ReadAfterLd]>;
+}
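+
+// Illustrative semantics: unpcklps interleaves the low halves,
+//   dst = { src1[0], src2[0], src1[1], src2[1] }
+// and unpckhps the high halves,
+//   dst = { src1[2], src2[2], src1[3], src2[3] }
+// The pd forms do the same with one f64 element per half.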
+
+let Predicates = [HasAVX, NoVLX] in {
+defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, loadv4f32,
+ VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ SSEPackedSingle>, PS, VEX_4V;
+defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, loadv2f64,
+ VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ SSEPackedDouble>, PD, VEX_4V;
+defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, loadv4f32,
+ VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ SSEPackedSingle>, PS, VEX_4V;
+defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, loadv2f64,
+ VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ SSEPackedDouble>, PD, VEX_4V;
+
+defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, loadv8f32,
+ VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ SSEPackedSingle>, PS, VEX_4V, VEX_L;
+defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, loadv4f64,
+ VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ SSEPackedDouble>, PD, VEX_4V, VEX_L;
+defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, loadv8f32,
+ VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ SSEPackedSingle>, PS, VEX_4V, VEX_L;
+defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, loadv4f64,
+ VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ SSEPackedDouble>, PD, VEX_4V, VEX_L;
+}// Predicates = [HasAVX, NoVLX]
+let Constraints = "$src1 = $dst" in {
+ defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memopv4f32,
+ VR128, f128mem, "unpckhps\t{$src2, $dst|$dst, $src2}",
+ SSEPackedSingle>, PS;
+ defm UNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memopv2f64,
+ VR128, f128mem, "unpckhpd\t{$src2, $dst|$dst, $src2}",
+ SSEPackedDouble>, PD;
+ defm UNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memopv4f32,
+ VR128, f128mem, "unpcklps\t{$src2, $dst|$dst, $src2}",
+ SSEPackedSingle>, PS;
+ defm UNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memopv2f64,
+ VR128, f128mem, "unpcklpd\t{$src2, $dst|$dst, $src2}",
+ SSEPackedDouble>, PD;
+} // Constraints = "$src1 = $dst"
+
+let Predicates = [HasAVX1Only] in {
+ def : Pat<(v8i32 (X86Unpckl VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))),
+ (VUNPCKLPSYrm VR256:$src1, addr:$src2)>;
+ def : Pat<(v8i32 (X86Unpckl VR256:$src1, VR256:$src2)),
+ (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>;
+ def : Pat<(v8i32 (X86Unpckh VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))),
+ (VUNPCKHPSYrm VR256:$src1, addr:$src2)>;
+ def : Pat<(v8i32 (X86Unpckh VR256:$src1, VR256:$src2)),
+ (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>;
+
+ def : Pat<(v4i64 (X86Unpckl VR256:$src1, (loadv4i64 addr:$src2))),
+ (VUNPCKLPDYrm VR256:$src1, addr:$src2)>;
+ def : Pat<(v4i64 (X86Unpckl VR256:$src1, VR256:$src2)),
+ (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>;
+ def : Pat<(v4i64 (X86Unpckh VR256:$src1, (loadv4i64 addr:$src2))),
+ (VUNPCKHPDYrm VR256:$src1, addr:$src2)>;
+ def : Pat<(v4i64 (X86Unpckh VR256:$src1, VR256:$src2)),
+ (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Extract Floating-Point Sign Mask
+//===----------------------------------------------------------------------===//
+
+/// sse12_extr_sign_mask - sse 1 & 2 extract floating-point sign mask
+multiclass sse12_extr_sign_mask<RegisterClass RC, Intrinsic Int, string asm,
+ Domain d> {
+ def rr : PI<0x50, MRMSrcReg, (outs GR32orGR64:$dst), (ins RC:$src),
+ !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
+ [(set GR32orGR64:$dst, (Int RC:$src))], IIC_SSE_MOVMSK, d>,
+ Sched<[WriteVecLogic]>;
+}
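+
+// Illustrative usage: movmskps packs the sign bit of each f32 lane into the
+// low bits of a GPR, so C code such as
+//   int m = _mm_movemask_ps(v);   // bit i = sign of lane i
+// can test lane signs with (m & (1 << i)).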
+
+let Predicates = [HasAVX] in {
+ defm VMOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps,
+ "movmskps", SSEPackedSingle>, PS, VEX;
+ defm VMOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd,
+ "movmskpd", SSEPackedDouble>, PD, VEX;
+ defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, int_x86_avx_movmsk_ps_256,
+ "movmskps", SSEPackedSingle>, PS,
+ VEX, VEX_L;
+ defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, int_x86_avx_movmsk_pd_256,
+ "movmskpd", SSEPackedDouble>, PD,
+ VEX, VEX_L;
+
+ def : Pat<(i32 (X86fgetsign FR32:$src)),
+ (VMOVMSKPSrr (COPY_TO_REGCLASS FR32:$src, VR128))>;
+ def : Pat<(i64 (X86fgetsign FR32:$src)),
+ (SUBREG_TO_REG (i64 0),
+ (VMOVMSKPSrr (COPY_TO_REGCLASS FR32:$src, VR128)), sub_32bit)>;
+ def : Pat<(i32 (X86fgetsign FR64:$src)),
+ (VMOVMSKPDrr (COPY_TO_REGCLASS FR64:$src, VR128))>;
+ def : Pat<(i64 (X86fgetsign FR64:$src)),
+ (SUBREG_TO_REG (i64 0),
+ (VMOVMSKPDrr (COPY_TO_REGCLASS FR64:$src, VR128)), sub_32bit)>;
+}
+
+defm MOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps, "movmskps",
+ SSEPackedSingle>, PS;
+defm MOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd, "movmskpd",
+ SSEPackedDouble>, PD;
+
+def : Pat<(i32 (X86fgetsign FR32:$src)),
+ (MOVMSKPSrr (COPY_TO_REGCLASS FR32:$src, VR128))>,
+ Requires<[UseSSE1]>;
+def : Pat<(i64 (X86fgetsign FR32:$src)),
+ (SUBREG_TO_REG (i64 0),
+ (MOVMSKPSrr (COPY_TO_REGCLASS FR32:$src, VR128)), sub_32bit)>,
+ Requires<[UseSSE1]>;
+def : Pat<(i32 (X86fgetsign FR64:$src)),
+ (MOVMSKPDrr (COPY_TO_REGCLASS FR64:$src, VR128))>,
+ Requires<[UseSSE2]>;
+def : Pat<(i64 (X86fgetsign FR64:$src)),
+ (SUBREG_TO_REG (i64 0),
+ (MOVMSKPDrr (COPY_TO_REGCLASS FR64:$src, VR128)), sub_32bit)>,
+ Requires<[UseSSE2]>;
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Packed Integer Logical Instructions
+//===---------------------------------------------------------------------===//
+
+let ExeDomain = SSEPackedInt in { // SSE integer instructions
+
+/// PDI_binop_rm - Simple SSE2 binary operator.
+multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
+ X86MemOperand x86memop, OpndItins itins,
+ bit IsCommutable, bit Is2Addr> {
+ let isCommutable = IsCommutable in
+ def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))], itins.rr>,
+ Sched<[itins.Sched]>;
+ def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (OpVT (OpNode RC:$src1,
+ (bitconvert (memop_frag addr:$src2)))))],
+ itins.rm>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+} // ExeDomain = SSEPackedInt
+
+multiclass PDI_binop_all<bits<8> opc, string OpcodeStr, SDNode Opcode,
+ ValueType OpVT128, ValueType OpVT256,
+ OpndItins itins, bit IsCommutable = 0, Predicate prd> {
+let Predicates = [HasAVX, prd] in
+ defm V#NAME : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT128,
+ VR128, loadv2i64, i128mem, itins, IsCommutable, 0>, VEX_4V;
+
+let Constraints = "$src1 = $dst" in
+ defm NAME : PDI_binop_rm<opc, OpcodeStr, Opcode, OpVT128, VR128,
+ memopv2i64, i128mem, itins, IsCommutable, 1>;
+
+let Predicates = [HasAVX2, prd] in
+ defm V#NAME#Y : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode,
+ OpVT256, VR256, loadv4i64, i256mem, itins,
+ IsCommutable, 0>, VEX_4V, VEX_L;
+}
+
+// These are defined here to satisfy pattern-ordering requirements with the
+// FP versions that follow.
+
+defm PAND : PDI_binop_all<0xDB, "pand", and, v2i64, v4i64,
+ SSE_VEC_BIT_ITINS_P, 1, NoVLX>;
+defm POR : PDI_binop_all<0xEB, "por", or, v2i64, v4i64,
+ SSE_VEC_BIT_ITINS_P, 1, NoVLX>;
+defm PXOR : PDI_binop_all<0xEF, "pxor", xor, v2i64, v4i64,
+ SSE_VEC_BIT_ITINS_P, 1, NoVLX>;
+defm PANDN : PDI_binop_all<0xDF, "pandn", X86andnp, v2i64, v4i64,
+ SSE_VEC_BIT_ITINS_P, 0, NoVLX>;
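+
+// Note the pandn operand order: X86andnp computes (NOT src1) AND src2, so
+// "pandn %xmm1, %xmm0" leaves ~xmm0 & xmm1 in xmm0.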
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Logical Instructions
+//===----------------------------------------------------------------------===//
+
+// Multiclass for scalars using the X86 logical operation aliases for FP.
+multiclass sse12_fp_packed_scalar_logical_alias<
+ bits<8> opc, string OpcodeStr, SDNode OpNode, OpndItins itins> {
+ defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
+ FR32, f32, f128mem, loadf32_128, SSEPackedSingle, itins, 0>,
+ PS, VEX_4V;
+
+ defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
+ FR64, f64, f128mem, loadf64_128, SSEPackedDouble, itins, 0>,
+ PD, VEX_4V;
+
+ let Constraints = "$src1 = $dst" in {
+ defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, FR32,
+ f32, f128mem, memopfsf32_128, SSEPackedSingle, itins>, PS;
+
+ defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, FR64,
+ f64, f128mem, memopfsf64_128, SSEPackedDouble, itins>, PD;
+ }
+}
+
+let isCodeGenOnly = 1 in {
+ defm FsAND : sse12_fp_packed_scalar_logical_alias<0x54, "and", X86fand,
+ SSE_BIT_ITINS_P>;
+ defm FsOR : sse12_fp_packed_scalar_logical_alias<0x56, "or", X86for,
+ SSE_BIT_ITINS_P>;
+ defm FsXOR : sse12_fp_packed_scalar_logical_alias<0x57, "xor", X86fxor,
+ SSE_BIT_ITINS_P>;
+
+ let isCommutable = 0 in
+ defm FsANDN : sse12_fp_packed_scalar_logical_alias<0x55, "andn", X86fandn,
+ SSE_BIT_ITINS_P>;
+}
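+
+// These scalar aliases exist because operations such as fabs and fneg are
+// lowered to X86fand / X86fxor with a constant sign-bit mask, reusing the
+// packed andps/xorps encodings on FR32/FR64.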
+
+// Multiclass for vectors using the X86 logical operation aliases for FP.
+multiclass sse12_fp_packed_vector_logical_alias<
+ bits<8> opc, string OpcodeStr, SDNode OpNode, OpndItins itins> {
+ let Predicates = [HasAVX, NoVLX_Or_NoDQI] in {
+ defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
+ VR128, v4f32, f128mem, loadv4f32, SSEPackedSingle, itins, 0>,
+ PS, VEX_4V;
+
+ defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
+ VR128, v2f64, f128mem, loadv2f64, SSEPackedDouble, itins, 0>,
+ PD, VEX_4V;
+
+ defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
+ VR256, v8f32, f256mem, loadv8f32, SSEPackedSingle, itins, 0>,
+ PS, VEX_4V, VEX_L;
+
+ defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
+ VR256, v4f64, f256mem, loadv4f64, SSEPackedDouble, itins, 0>,
+ PD, VEX_4V, VEX_L;
+ }
+
+ let Constraints = "$src1 = $dst" in {
+ defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128,
+ v4f32, f128mem, memopv4f32, SSEPackedSingle, itins>,
+ PS;
+
+ defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128,
+ v2f64, f128mem, memopv2f64, SSEPackedDouble, itins>,
+ PD;
+ }
+}
+
+let isCodeGenOnly = 1 in {
+ defm FvAND : sse12_fp_packed_vector_logical_alias<0x54, "and", X86fand,
+ SSE_BIT_ITINS_P>;
+ defm FvOR : sse12_fp_packed_vector_logical_alias<0x56, "or", X86for,
+ SSE_BIT_ITINS_P>;
+ defm FvXOR : sse12_fp_packed_vector_logical_alias<0x57, "xor", X86fxor,
+ SSE_BIT_ITINS_P>;
+
+ let isCommutable = 0 in
+ defm FvANDN : sse12_fp_packed_vector_logical_alias<0x55, "andn", X86fandn,
+ SSE_BIT_ITINS_P>;
+}
+
+/// sse12_fp_packed_logical - SSE 1 & 2 packed FP logical ops
+///
+multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
+ SDNode OpNode> {
+ let Predicates = [HasAVX, NoVLX] in {
+ defm V#NAME#PSY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedSingle,
+ !strconcat(OpcodeStr, "ps"), f256mem,
+ [(set VR256:$dst, (v4i64 (OpNode VR256:$src1, VR256:$src2)))],
+ [(set VR256:$dst, (OpNode (bc_v4i64 (v8f32 VR256:$src1)),
+ (loadv4i64 addr:$src2)))], 0>, PS, VEX_4V, VEX_L;
+
+ defm V#NAME#PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble,
+ !strconcat(OpcodeStr, "pd"), f256mem,
+ [(set VR256:$dst, (OpNode (bc_v4i64 (v4f64 VR256:$src1)),
+ (bc_v4i64 (v4f64 VR256:$src2))))],
+ [(set VR256:$dst, (OpNode (bc_v4i64 (v4f64 VR256:$src1)),
+ (loadv4i64 addr:$src2)))], 0>,
+ PD, VEX_4V, VEX_L;
+
+ // With AVX there is no need to add a pattern for the 128-bit logical rr ps
+ // form: those ops are all promoted to v2i64, and the patterns are covered
+ // by the int version. The pattern is needed only for SSE, because v2i64 is
+ // supported on SSE2 but not on SSE1.
+ defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
+ !strconcat(OpcodeStr, "ps"), f128mem, [],
+ [(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)),
+ (loadv2i64 addr:$src2)))], 0>, PS, VEX_4V;
+
+ defm V#NAME#PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
+ !strconcat(OpcodeStr, "pd"), f128mem,
+ [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
+ (bc_v2i64 (v2f64 VR128:$src2))))],
+ [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
+ (loadv2i64 addr:$src2)))], 0>,
+ PD, VEX_4V;
+ }
+
+ let Constraints = "$src1 = $dst" in {
+ defm PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
+ !strconcat(OpcodeStr, "ps"), f128mem,
+ [(set VR128:$dst, (v2i64 (OpNode VR128:$src1, VR128:$src2)))],
+ [(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)),
+ (memopv2i64 addr:$src2)))]>, PS;
+
+ defm PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
+ !strconcat(OpcodeStr, "pd"), f128mem,
+ [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
+ (bc_v2i64 (v2f64 VR128:$src2))))],
+ [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
+ (memopv2i64 addr:$src2)))]>, PD;
+ }
+}
+
+defm AND : sse12_fp_packed_logical<0x54, "and", and>;
+defm OR : sse12_fp_packed_logical<0x56, "or", or>;
+defm XOR : sse12_fp_packed_logical<0x57, "xor", xor>;
+let isCommutable = 0 in
+ defm ANDN : sse12_fp_packed_logical<0x55, "andn", X86andnp>;
+
+// AVX1 requires type coercions in order to fold loads directly into logical
+// operations.
+let Predicates = [HasAVX1Only] in {
+ def : Pat<(bc_v8f32 (and VR256:$src1, (loadv4i64 addr:$src2))),
+ (VANDPSYrm VR256:$src1, addr:$src2)>;
+ def : Pat<(bc_v8f32 (or VR256:$src1, (loadv4i64 addr:$src2))),
+ (VORPSYrm VR256:$src1, addr:$src2)>;
+ def : Pat<(bc_v8f32 (xor VR256:$src1, (loadv4i64 addr:$src2))),
+ (VXORPSYrm VR256:$src1, addr:$src2)>;
+ def : Pat<(bc_v8f32 (X86andnp VR256:$src1, (loadv4i64 addr:$src2))),
+ (VANDNPSYrm VR256:$src1, addr:$src2)>;
+}
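+
+// For example, a v4i64 "and" with a folded load has no AVX1 integer form
+// (vpand is 128-bit only until AVX2), so it is matched to vandps ymm via
+// the bitcast patterns above.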
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Arithmetic Instructions
+//===----------------------------------------------------------------------===//
+
+/// basic_sse12_fp_binop_xxx - SSE 1 & 2 binops come in both scalar and
+/// vector forms.
+///
+/// In addition, we have a special variant of the scalar form here to
+/// represent the associated intrinsic operation. This form is unlike the
+/// plain scalar form, in that it takes an entire vector (instead of a scalar)
+/// and leaves the top elements unmodified (therefore these cannot be commuted).
+///
+/// These three forms can each be reg+reg or reg+mem.
+///
+
+/// FIXME: once all 256-bit intrinsics are matched, clean up and refactor the
+/// classes below.
+multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, SizeItins itins> {
+ let Predicates = [HasAVX, NoVLX] in {
+ defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
+ VR128, v4f32, f128mem, loadv4f32,
+ SSEPackedSingle, itins.s, 0>, PS, VEX_4V;
+ defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
+ VR128, v2f64, f128mem, loadv2f64,
+ SSEPackedDouble, itins.d, 0>, PD, VEX_4V;
+
+ defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"),
+ OpNode, VR256, v8f32, f256mem, loadv8f32,
+ SSEPackedSingle, itins.s, 0>, PS, VEX_4V, VEX_L;
+ defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"),
+ OpNode, VR256, v4f64, f256mem, loadv4f64,
+ SSEPackedDouble, itins.d, 0>, PD, VEX_4V, VEX_L;
+ }
+
+ let Constraints = "$src1 = $dst" in {
+ defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128,
+ v4f32, f128mem, memopv4f32, SSEPackedSingle,
+ itins.s>, PS;
+ defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128,
+ v2f64, f128mem, memopv2f64, SSEPackedDouble,
+ itins.d>, PD;
+ }
+}
+
+multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SizeItins itins> {
+ defm V#NAME#SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
+ OpNode, FR32, f32mem, SSEPackedSingle, itins.s, 0>,
+ XS, VEX_4V, VEX_LIG;
+ defm V#NAME#SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
+ OpNode, FR64, f64mem, SSEPackedDouble, itins.d, 0>,
+ XD, VEX_4V, VEX_LIG;
+
+ let Constraints = "$src1 = $dst" in {
+ defm SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
+ OpNode, FR32, f32mem, SSEPackedSingle,
+ itins.s>, XS;
+ defm SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
+ OpNode, FR64, f64mem, SSEPackedDouble,
+ itins.d>, XD;
+ }
+}
+
+multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr,
+ SizeItins itins> {
+ defm V#NAME#SS : sse12_fp_scalar_int<opc, OpcodeStr, VR128,
+ !strconcat(OpcodeStr, "ss"), "", "_ss", ssmem, sse_load_f32,
+ SSEPackedSingle, itins.s, 0>, XS, VEX_4V, VEX_LIG;
+ defm V#NAME#SD : sse12_fp_scalar_int<opc, OpcodeStr, VR128,
+ !strconcat(OpcodeStr, "sd"), "2", "_sd", sdmem, sse_load_f64,
+ SSEPackedDouble, itins.d, 0>, XD, VEX_4V, VEX_LIG;
+
+ let Constraints = "$src1 = $dst" in {
+ defm SS : sse12_fp_scalar_int<opc, OpcodeStr, VR128,
+ !strconcat(OpcodeStr, "ss"), "", "_ss", ssmem, sse_load_f32,
+ SSEPackedSingle, itins.s>, XS;
+ defm SD : sse12_fp_scalar_int<opc, OpcodeStr, VR128,
+ !strconcat(OpcodeStr, "sd"), "2", "_sd", sdmem, sse_load_f64,
+ SSEPackedDouble, itins.d>, XD;
+ }
+}
+
+// Binary Arithmetic instructions
+defm ADD : basic_sse12_fp_binop_p<0x58, "add", fadd, SSE_ALU_ITINS_P>,
+ basic_sse12_fp_binop_s<0x58, "add", fadd, SSE_ALU_ITINS_S>,
+ basic_sse12_fp_binop_s_int<0x58, "add", SSE_ALU_ITINS_S>;
+defm MUL : basic_sse12_fp_binop_p<0x59, "mul", fmul, SSE_MUL_ITINS_P>,
+ basic_sse12_fp_binop_s<0x59, "mul", fmul, SSE_MUL_ITINS_S>,
+ basic_sse12_fp_binop_s_int<0x59, "mul", SSE_MUL_ITINS_S>;
+let isCommutable = 0 in {
+ defm SUB : basic_sse12_fp_binop_p<0x5C, "sub", fsub, SSE_ALU_ITINS_P>,
+ basic_sse12_fp_binop_s<0x5C, "sub", fsub, SSE_ALU_ITINS_S>,
+ basic_sse12_fp_binop_s_int<0x5C, "sub", SSE_ALU_ITINS_S>;
+ defm DIV : basic_sse12_fp_binop_p<0x5E, "div", fdiv, SSE_DIV_ITINS_P>,
+ basic_sse12_fp_binop_s<0x5E, "div", fdiv, SSE_DIV_ITINS_S>,
+ basic_sse12_fp_binop_s_int<0x5E, "div", SSE_DIV_ITINS_S>;
+ defm MAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SSE_ALU_ITINS_P>,
+ basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SSE_ALU_ITINS_S>,
+ basic_sse12_fp_binop_s_int<0x5F, "max", SSE_ALU_ITINS_S>;
+ defm MIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SSE_ALU_ITINS_P>,
+ basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SSE_ALU_ITINS_S>,
+ basic_sse12_fp_binop_s_int<0x5D, "min", SSE_ALU_ITINS_S>;
+}
+
+let isCodeGenOnly = 1 in {
+ defm MAXC: basic_sse12_fp_binop_p<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_P>,
+ basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_S>;
+ defm MINC: basic_sse12_fp_binop_p<0x5D, "min", X86fminc, SSE_ALU_ITINS_P>,
+ basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SSE_ALU_ITINS_S>;
+}
+
+// Patterns used to select SSE scalar fp arithmetic instructions from
+// either:
+//
+// (1) a scalar fp operation followed by a blend
+//
+// The effect is that the backend no longer emits unnecessary vector
+// insert instructions immediately after SSE scalar fp instructions
+// like addss or mulss.
+//
+// For example, given the following code:
+// __m128 foo(__m128 A, __m128 B) {
+// A[0] += B[0];
+// return A;
+// }
+//
+// Previously we generated:
+// addss %xmm0, %xmm1
+// movss %xmm1, %xmm0
+//
+// We now generate:
+// addss %xmm1, %xmm0
+//
+// (2) a vector packed single/double fp operation followed by a vector insert
+//
+// The effect is that the backend converts the packed fp instruction
+// followed by a vector insert into a single SSE scalar fp instruction.
+//
+// For example, given the following code:
+// __m128 foo(__m128 A, __m128 B) {
+// __m128 C = A + B;
+// return (__m128) {C[0], A[1], A[2], A[3]};
+// }
+//
+// Previously we generated:
+// addps %xmm0, %xmm1
+// movss %xmm1, %xmm0
+//
+// We now generate:
+// addss %xmm1, %xmm0
+
+// TODO: Some canonicalization in lowering would reduce the number of
+// patterns we have to try to match.
+multiclass scalar_math_f32_patterns<SDNode Op, string OpcPrefix> {
+ let Predicates = [UseSSE1] in {
+ // extracted scalar math op with insert via movss
+ def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
+ (Op (f32 (extractelt (v4f32 VR128:$dst), (iPTR 0))),
+ FR32:$src))))),
+ (!cast<I>(OpcPrefix#SSrr_Int) v4f32:$dst,
+ (COPY_TO_REGCLASS FR32:$src, VR128))>;
+
+ // vector math op with insert via movss
+ def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
+ (Op (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
+ (!cast<I>(OpcPrefix#SSrr_Int) v4f32:$dst, v4f32:$src)>;
+ }
+
+ // With SSE 4.1, blendi is preferred to movss, so match that too.
+ let Predicates = [UseSSE41] in {
+ // extracted scalar math op with insert via blend
+ def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
+ (Op (f32 (extractelt (v4f32 VR128:$dst), (iPTR 0))),
+ FR32:$src))), (i8 1))),
+ (!cast<I>(OpcPrefix#SSrr_Int) v4f32:$dst,
+ (COPY_TO_REGCLASS FR32:$src, VR128))>;
+
+ // vector math op with insert via blend
+ def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst),
+ (Op (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))),
+ (!cast<I>(OpcPrefix#SSrr_Int) v4f32:$dst, v4f32:$src)>;
+
+ }
+
+ // Repeat everything for AVX, except for the movss + scalar combo, which
+ // is not expected to occur with AVX codegen.
+ let Predicates = [HasAVX] in {
+ // extracted scalar math op with insert via blend
+ def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
+ (Op (f32 (extractelt (v4f32 VR128:$dst), (iPTR 0))),
+ FR32:$src))), (i8 1))),
+ (!cast<I>("V"#OpcPrefix#SSrr_Int) v4f32:$dst,
+ (COPY_TO_REGCLASS FR32:$src, VR128))>;
+
+ // vector math op with insert via movss
+ def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
+ (Op (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
+ (!cast<I>("V"#OpcPrefix#SSrr_Int) v4f32:$dst, v4f32:$src)>;
+
+ // vector math op with insert via blend
+ def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst),
+ (Op (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))),
+ (!cast<I>("V"#OpcPrefix#SSrr_Int) v4f32:$dst, v4f32:$src)>;
+ }
+}
+
+defm : scalar_math_f32_patterns<fadd, "ADD">;
+defm : scalar_math_f32_patterns<fsub, "SUB">;
+defm : scalar_math_f32_patterns<fmul, "MUL">;
+defm : scalar_math_f32_patterns<fdiv, "DIV">;
+
+multiclass scalar_math_f64_patterns<SDNode Op, string OpcPrefix> {
+ let Predicates = [UseSSE2] in {
+ // extracted scalar math op with insert via movsd
+ def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector
+ (Op (f64 (extractelt (v2f64 VR128:$dst), (iPTR 0))),
+ FR64:$src))))),
+ (!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst,
+ (COPY_TO_REGCLASS FR64:$src, VR128))>;
+
+ // vector math op with insert via movsd
+ def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
+ (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
+ (!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;
+ }
+
+ // With SSE 4.1, blendi is preferred to movsd, so match those too.
+ let Predicates = [UseSSE41] in {
+ // extracted scalar math op with insert via blend
+ def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector
+ (Op (f64 (extractelt (v2f64 VR128:$dst), (iPTR 0))),
+ FR64:$src))), (i8 1))),
+ (!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst,
+ (COPY_TO_REGCLASS FR64:$src, VR128))>;
+
+ // vector math op with insert via blend
+ def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
+ (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
+ (!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;
+ }
+
+ // Repeat everything for AVX.
+ let Predicates = [HasAVX] in {
+ // extracted scalar math op with insert via movsd
+ def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector
+ (Op (f64 (extractelt (v2f64 VR128:$dst), (iPTR 0))),
+ FR64:$src))))),
+ (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst,
+ (COPY_TO_REGCLASS FR64:$src, VR128))>;
+
+ // extracted scalar math op with insert via blend
+ def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector
+ (Op (f64 (extractelt (v2f64 VR128:$dst), (iPTR 0))),
+ FR64:$src))), (i8 1))),
+ (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst,
+ (COPY_TO_REGCLASS FR64:$src, VR128))>;
+
+ // vector math op with insert via movsd
+ def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
+ (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
+ (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;
+
+ // vector math op with insert via blend
+ def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
+ (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
+ (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;
+ }
+}
+
+defm : scalar_math_f64_patterns<fadd, "ADD">;
+defm : scalar_math_f64_patterns<fsub, "SUB">;
+defm : scalar_math_f64_patterns<fmul, "MUL">;
+defm : scalar_math_f64_patterns<fdiv, "DIV">;
+
+
+/// Unop Arithmetic
+/// We also have a special variant of the scalar form here to
+/// represent the associated intrinsic operation. This form is unlike the
+/// plain scalar form, in that it takes an entire vector (instead of a
+/// scalar) and leaves the top elements undefined.
+///
+/// In addition, there is a special variant for the full-vector intrinsic form.
+
+let Sched = WriteFSqrt in {
+def SSE_SQRTPS : OpndItins<
+ IIC_SSE_SQRTPS_RR, IIC_SSE_SQRTPS_RM
+>;
+
+def SSE_SQRTSS : OpndItins<
+ IIC_SSE_SQRTSS_RR, IIC_SSE_SQRTSS_RM
+>;
+
+def SSE_SQRTPD : OpndItins<
+ IIC_SSE_SQRTPD_RR, IIC_SSE_SQRTPD_RM
+>;
+
+def SSE_SQRTSD : OpndItins<
+ IIC_SSE_SQRTSD_RR, IIC_SSE_SQRTSD_RM
+>;
+}
+
+let Sched = WriteFRsqrt in {
+def SSE_RSQRTPS : OpndItins<
+ IIC_SSE_RSQRTPS_RR, IIC_SSE_RSQRTPS_RM
+>;
+
+def SSE_RSQRTSS : OpndItins<
+ IIC_SSE_RSQRTSS_RR, IIC_SSE_RSQRTSS_RM
+>;
+}
+
+let Sched = WriteFRcp in {
+def SSE_RCPP : OpndItins<
+ IIC_SSE_RCPP_RR, IIC_SSE_RCPP_RM
+>;
+
+def SSE_RCPS : OpndItins<
+ IIC_SSE_RCPS_RR, IIC_SSE_RCPS_RM
+>;
+}
+
+/// sse_fp_unop_s - SSE 1 & 2 unops in scalar form.
+/// For the non-AVX defs, we need $src1 to be tied to $dst because
+/// the HW instructions are 2 operand / destructive.
+multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
+ ValueType vt, ValueType ScalarVT,
+ X86MemOperand x86memop, Operand vec_memop,
+ ComplexPattern mem_cpat, Intrinsic Intr,
+ SDNode OpNode, Domain d, OpndItins itins,
+ Predicate target, string Suffix> {
+ let hasSideEffects = 0 in {
+ def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1),
+ !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"),
+ [(set RC:$dst, (OpNode RC:$src1))], itins.rr, d>, Sched<[itins.Sched]>,
+ Requires<[target]>;
+ let mayLoad = 1 in
+ def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src1),
+ !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"),
+ [(set RC:$dst, (OpNode (load addr:$src1)))], itins.rm, d>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>,
+ Requires<[target, OptForSize]>;
+
+ let isCodeGenOnly = 1, Constraints = "$src1 = $dst" in {
+ def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ []>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ let mayLoad = 1 in
+ def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, vec_memop:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ []>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ }
+ }
+
+ let Predicates = [target] in {
+ def : Pat<(vt (OpNode mem_cpat:$src)),
+ (vt (COPY_TO_REGCLASS (vt (!cast<Instruction>(NAME#Suffix##m_Int)
+ (vt (IMPLICIT_DEF)), mem_cpat:$src)), RC))>;
+ // These are unary operations, but they are modeled as having 2 source operands
+ // because the high elements of the destination are unchanged in SSE.
+ def : Pat<(Intr VR128:$src),
+ (!cast<Instruction>(NAME#Suffix##r_Int) VR128:$src, VR128:$src)>;
+ def : Pat<(Intr (load addr:$src)),
+ (vt (COPY_TO_REGCLASS(!cast<Instruction>(NAME#Suffix##m)
+ addr:$src), VR128))>;
+ }
+ // We don't want to fold scalar loads into these instructions unless
+ // optimizing for size. This is because the folded instruction will have a
+ // partial register update, while the unfolded sequence will not, e.g.
+ // movss mem, %xmm0
+ // rcpss %xmm0, %xmm0
+ // which has a clobber before the rcp, vs.
+ // rcpss mem, %xmm0
+ let Predicates = [target, OptForSize] in {
+ def : Pat<(Intr mem_cpat:$src),
+ (!cast<Instruction>(NAME#Suffix##m_Int)
+ (vt (IMPLICIT_DEF)), mem_cpat:$src)>;
+ }
+}
+
+multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
+ ValueType vt, ValueType ScalarVT,
+ X86MemOperand x86memop, Operand vec_memop,
+ ComplexPattern mem_cpat,
+ Intrinsic Intr, SDNode OpNode, Domain d,
+ OpndItins itins, string Suffix> {
+ let hasSideEffects = 0 in {
+ def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [], itins.rr, d>, Sched<[itins.Sched]>;
+ let mayLoad = 1 in
+ def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [], itins.rm, d>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ let isCodeGenOnly = 1 in {
+ def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, Sched<[itins.Sched.Folded]>;
+ let mayLoad = 1 in
+ def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, vec_memop:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ }
+ }
+
+ // We don't want to fold scalar loads into these instructions unless
+ // optimizing for size. This is because the folded instruction will have a
+ // partial register update, while the unfolded sequence will not, e.g.
+ // vmovss mem, %xmm0
+ // vrcpss %xmm0, %xmm0, %xmm0
+ // which has a clobber before the rcp, vs.
+ // vrcpss mem, %xmm0, %xmm0
+ // TODO: In theory, we could fold the load, and avoid the stall caused by
+ // the partial register store, either in ExeDepFix or with smarter RA.
+ let Predicates = [UseAVX] in {
+ def : Pat<(OpNode RC:$src), (!cast<Instruction>("V"#NAME#Suffix##r)
+ (ScalarVT (IMPLICIT_DEF)), RC:$src)>;
+ }
+ let Predicates = [HasAVX] in {
+ def : Pat<(Intr VR128:$src),
+ (!cast<Instruction>("V"#NAME#Suffix##r_Int) (vt (IMPLICIT_DEF)),
+ VR128:$src)>;
+ }
+ let Predicates = [HasAVX, OptForSize] in {
+ def : Pat<(Intr mem_cpat:$src),
+ (!cast<Instruction>("V"#NAME#Suffix##m_Int)
+ (vt (IMPLICIT_DEF)), mem_cpat:$src)>;
+ }
+ let Predicates = [UseAVX, OptForSize] in {
+ def : Pat<(ScalarVT (OpNode (load addr:$src))),
+ (!cast<Instruction>("V"#NAME#Suffix##m) (ScalarVT (IMPLICIT_DEF)),
+ addr:$src)>;
+ def : Pat<(vt (OpNode mem_cpat:$src)),
+ (!cast<Instruction>("V"#NAME#Suffix##m_Int) (vt (IMPLICIT_DEF)),
+ mem_cpat:$src)>;
+ }
+}
+
+/// sse1_fp_unop_p - SSE1 unops in packed form.
+multiclass sse1_fp_unop_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ OpndItins itins, list<Predicate> prds> {
+let Predicates = prds in {
+ def V#NAME#PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ !strconcat("v", OpcodeStr,
+ "ps\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))],
+ itins.rr>, VEX, Sched<[itins.Sched]>;
+ def V#NAME#PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ !strconcat("v", OpcodeStr,
+ "ps\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (OpNode (loadv4f32 addr:$src)))],
+ itins.rm>, VEX, Sched<[itins.Sched.Folded]>;
+ def V#NAME#PSYr : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
+ !strconcat("v", OpcodeStr,
+ "ps\t{$src, $dst|$dst, $src}"),
+ [(set VR256:$dst, (v8f32 (OpNode VR256:$src)))],
+ itins.rr>, VEX, VEX_L, Sched<[itins.Sched]>;
+ def V#NAME#PSYm : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
+ !strconcat("v", OpcodeStr,
+ "ps\t{$src, $dst|$dst, $src}"),
+ [(set VR256:$dst, (OpNode (loadv8f32 addr:$src)))],
+ itins.rm>, VEX, VEX_L, Sched<[itins.Sched.Folded]>;
+}
+
+ def PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))], itins.rr>,
+ Sched<[itins.Sched]>;
+ def PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))], itins.rm>,
+ Sched<[itins.Sched.Folded]>;
+}
+
+/// sse2_fp_unop_p - SSE2 unops in vector forms.
+multiclass sse2_fp_unop_p<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, OpndItins itins> {
+let Predicates = [HasAVX] in {
+ def V#NAME#PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ !strconcat("v", OpcodeStr,
+ "pd\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))],
+ itins.rr>, VEX, Sched<[itins.Sched]>;
+ def V#NAME#PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ !strconcat("v", OpcodeStr,
+ "pd\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (OpNode (loadv2f64 addr:$src)))],
+ itins.rm>, VEX, Sched<[itins.Sched.Folded]>;
+ def V#NAME#PDYr : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
+ !strconcat("v", OpcodeStr,
+ "pd\t{$src, $dst|$dst, $src}"),
+ [(set VR256:$dst, (v4f64 (OpNode VR256:$src)))],
+ itins.rr>, VEX, VEX_L, Sched<[itins.Sched]>;
+ def V#NAME#PDYm : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
+ !strconcat("v", OpcodeStr,
+ "pd\t{$src, $dst|$dst, $src}"),
+ [(set VR256:$dst, (OpNode (loadv4f64 addr:$src)))],
+ itins.rm>, VEX, VEX_L, Sched<[itins.Sched.Folded]>;
+}
+
+ def PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))], itins.rr>,
+ Sched<[itins.Sched]>;
+ def PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))], itins.rm>,
+ Sched<[itins.Sched.Folded]>;
+}
+
+multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ OpndItins itins> {
+ defm SS : sse_fp_unop_s<opc, OpcodeStr##ss, FR32, v4f32, f32, f32mem,
+ ssmem, sse_load_f32,
+ !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), OpNode,
+ SSEPackedSingle, itins, UseSSE1, "SS">, XS;
+ defm V#NAME#SS : avx_fp_unop_s<opc, "v"#OpcodeStr##ss, FR32, v4f32, f32,
+ f32mem, ssmem, sse_load_f32,
+ !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), OpNode,
+ SSEPackedSingle, itins, "SS">, XS, VEX_4V, VEX_LIG;
+}
+
+multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ OpndItins itins> {
+ defm SD : sse_fp_unop_s<opc, OpcodeStr##sd, FR64, v2f64, f64, f64mem,
+ sdmem, sse_load_f64,
+ !cast<Intrinsic>("int_x86_sse2_"##OpcodeStr##_sd),
+ OpNode, SSEPackedDouble, itins, UseSSE2, "SD">, XD;
+ defm V#NAME#SD : avx_fp_unop_s<opc, "v"#OpcodeStr##sd, FR64, v2f64, f64,
+ f64mem, sdmem, sse_load_f64,
+ !cast<Intrinsic>("int_x86_sse2_"##OpcodeStr##_sd),
+ OpNode, SSEPackedDouble, itins, "SD">,
+ XD, VEX_4V, VEX_LIG;
+}
+
+// Square root.
+defm SQRT : sse1_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSS>,
+ sse1_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPS, [HasAVX]>,
+ sse2_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSD>,
+ sse2_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPD>;
+
+// Reciprocal approximations. Note that these typically require refinement
+// in order to obtain suitable precision.
+defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, SSE_RSQRTSS>,
+ sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_RSQRTPS, [HasAVX, NoVLX] >;
+defm RCP : sse1_fp_unop_s<0x53, "rcp", X86frcp, SSE_RCPS>,
+ sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPP, [HasAVX, NoVLX]>;
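+
+// Illustrative refinement: rcpps/rsqrtps provide roughly 12 bits of
+// precision, and one Newton-Raphson step nearly doubles that:
+//   rcp:   x1 = x0 * (2.0 - a * x0)
+//   rsqrt: x1 = 0.5 * x0 * (3.0 - a * x0 * x0)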
+
+// There are no f64 versions of the reciprocal approximation instructions.
+
+// TODO: We should add *scalar* op patterns for these just like we have for
+// the binops above. Unifying the binop and unop patterns would be even
+// better.
+
+multiclass scalar_unary_math_patterns<Intrinsic Intr, string OpcPrefix,
+ SDNode Move, ValueType VT,
+ Predicate BasePredicate> {
+ let Predicates = [BasePredicate] in {
+ def : Pat<(VT (Move VT:$dst, (Intr VT:$src))),
+ (!cast<I>(OpcPrefix#r_Int) VT:$dst, VT:$src)>;
+ }
+
+ // With SSE 4.1, blendi is preferred to movs*, so match that too.
+ let Predicates = [UseSSE41] in {
+ def : Pat<(VT (X86Blendi VT:$dst, (Intr VT:$src), (i8 1))),
+ (!cast<I>(OpcPrefix#r_Int) VT:$dst, VT:$src)>;
+ }
+
+ // Repeat for AVX versions of the instructions.
+ let Predicates = [HasAVX] in {
+ def : Pat<(VT (Move VT:$dst, (Intr VT:$src))),
+ (!cast<I>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>;
+
+ def : Pat<(VT (X86Blendi VT:$dst, (Intr VT:$src), (i8 1))),
+ (!cast<I>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>;
+ }
+}
+
+defm : scalar_unary_math_patterns<int_x86_sse_rcp_ss, "RCPSS", X86Movss,
+ v4f32, UseSSE1>;
+defm : scalar_unary_math_patterns<int_x86_sse_rsqrt_ss, "RSQRTSS", X86Movss,
+ v4f32, UseSSE1>;
+defm : scalar_unary_math_patterns<int_x86_sse_sqrt_ss, "SQRTSS", X86Movss,
+ v4f32, UseSSE1>;
+defm : scalar_unary_math_patterns<int_x86_sse2_sqrt_sd, "SQRTSD", X86Movsd,
+ v2f64, UseSSE2>;
+
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Non-temporal stores
+//===----------------------------------------------------------------------===//
+
+let AddedComplexity = 400 in { // Prefer non-temporal versions
+let SchedRW = [WriteStore] in {
+let Predicates = [HasAVX, NoVLX] in {
+def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs),
+ (ins f128mem:$dst, VR128:$src),
+ "movntps\t{$src, $dst|$dst, $src}",
+ [(alignednontemporalstore (v4f32 VR128:$src),
+ addr:$dst)],
+ IIC_SSE_MOVNT>, VEX;
+def VMOVNTPDmr : VPDI<0x2B, MRMDestMem, (outs),
+ (ins f128mem:$dst, VR128:$src),
+ "movntpd\t{$src, $dst|$dst, $src}",
+ [(alignednontemporalstore (v2f64 VR128:$src),
+ addr:$dst)],
+ IIC_SSE_MOVNT>, VEX;
+
+let ExeDomain = SSEPackedInt in
+def VMOVNTDQmr : VPDI<0xE7, MRMDestMem, (outs),
+ (ins f128mem:$dst, VR128:$src),
+ "movntdq\t{$src, $dst|$dst, $src}",
+ [(alignednontemporalstore (v2i64 VR128:$src),
+ addr:$dst)],
+ IIC_SSE_MOVNT>, VEX;
+
+def VMOVNTPSYmr : VPSI<0x2B, MRMDestMem, (outs),
+ (ins f256mem:$dst, VR256:$src),
+ "movntps\t{$src, $dst|$dst, $src}",
+ [(alignednontemporalstore (v8f32 VR256:$src),
+ addr:$dst)],
+ IIC_SSE_MOVNT>, VEX, VEX_L;
+def VMOVNTPDYmr : VPDI<0x2B, MRMDestMem, (outs),
+ (ins f256mem:$dst, VR256:$src),
+ "movntpd\t{$src, $dst|$dst, $src}",
+ [(alignednontemporalstore (v4f64 VR256:$src),
+ addr:$dst)],
+ IIC_SSE_MOVNT>, VEX, VEX_L;
+let ExeDomain = SSEPackedInt in
+def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs),
+ (ins f256mem:$dst, VR256:$src),
+ "movntdq\t{$src, $dst|$dst, $src}",
+ [(alignednontemporalstore (v4i64 VR256:$src),
+ addr:$dst)],
+ IIC_SSE_MOVNT>, VEX, VEX_L;
+}
+
+def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+ "movntps\t{$src, $dst|$dst, $src}",
+ [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)],
+ IIC_SSE_MOVNT>;
+def MOVNTPDmr : PDI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+ "movntpd\t{$src, $dst|$dst, $src}",
+ [(alignednontemporalstore(v2f64 VR128:$src), addr:$dst)],
+ IIC_SSE_MOVNT>;
+
+let ExeDomain = SSEPackedInt in
+def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+ "movntdq\t{$src, $dst|$dst, $src}",
+ [(alignednontemporalstore (v2i64 VR128:$src), addr:$dst)],
+ IIC_SSE_MOVNT>;
+
+// There is no AVX form for the instructions below this point.
+def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
+ "movnti{l}\t{$src, $dst|$dst, $src}",
+ [(nontemporalstore (i32 GR32:$src), addr:$dst)],
+ IIC_SSE_MOVNT>,
+ PS, Requires<[HasSSE2]>;
+def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
+ "movnti{q}\t{$src, $dst|$dst, $src}",
+ [(nontemporalstore (i64 GR64:$src), addr:$dst)],
+ IIC_SSE_MOVNT>,
+ PS, Requires<[HasSSE2]>;
+} // SchedRW = [WriteStore]
+
+let Predicates = [HasAVX2, NoVLX] in {
+ def : Pat<(alignednontemporalstore (v8i32 VR256:$src), addr:$dst),
+ (VMOVNTDQYmr addr:$dst, VR256:$src)>;
+ def : Pat<(alignednontemporalstore (v16i16 VR256:$src), addr:$dst),
+ (VMOVNTDQYmr addr:$dst, VR256:$src)>;
+ def : Pat<(alignednontemporalstore (v32i8 VR256:$src), addr:$dst),
+ (VMOVNTDQYmr addr:$dst, VR256:$src)>;
+}
+
+let Predicates = [HasAVX, NoVLX] in {
+ def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst),
+ (VMOVNTDQmr addr:$dst, VR128:$src)>;
+ def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst),
+ (VMOVNTDQmr addr:$dst, VR128:$src)>;
+ def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst),
+ (VMOVNTDQmr addr:$dst, VR128:$src)>;
+}
+
+def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst),
+ (MOVNTDQmr addr:$dst, VR128:$src)>;
+def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst),
+ (MOVNTDQmr addr:$dst, VR128:$src)>;
+def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst),
+ (MOVNTDQmr addr:$dst, VR128:$src)>;
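+
+// Illustrative usage: with a 16-byte aligned pointer p,
+//   _mm_stream_ps(p, v);   // movntps
+// bypasses the cache; because non-temporal stores are weakly ordered, an
+// sfence is used to order them with respect to later stores.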
+
+} // AddedComplexity
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Prefetch and memory fence
+//===----------------------------------------------------------------------===//
+
+// Prefetch intrinsic.
+let Predicates = [HasSSE1], SchedRW = [WriteLoad] in {
+def PREFETCHT0 : I<0x18, MRM1m, (outs), (ins i8mem:$src),
+ "prefetcht0\t$src", [(prefetch addr:$src, imm, (i32 3), (i32 1))],
+ IIC_SSE_PREFETCH>, TB;
+def PREFETCHT1 : I<0x18, MRM2m, (outs), (ins i8mem:$src),
+ "prefetcht1\t$src", [(prefetch addr:$src, imm, (i32 2), (i32 1))],
+ IIC_SSE_PREFETCH>, TB;
+def PREFETCHT2 : I<0x18, MRM3m, (outs), (ins i8mem:$src),
+ "prefetcht2\t$src", [(prefetch addr:$src, imm, (i32 1), (i32 1))],
+ IIC_SSE_PREFETCH>, TB;
+def PREFETCHNTA : I<0x18, MRM0m, (outs), (ins i8mem:$src),
+ "prefetchnta\t$src", [(prefetch addr:$src, imm, (i32 0), (i32 1))],
+ IIC_SSE_PREFETCH>, TB;
+}
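+
+// Illustrative mapping: _mm_prefetch(p, _MM_HINT_T0) selects prefetcht0.
+// The third pattern operand is the locality hint: 3 = T0, 2 = T1, 1 = T2,
+// 0 = NTA.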
+
+// FIXME: How should flush instruction be modeled?
+let SchedRW = [WriteLoad] in {
+// Flush cache
+def CLFLUSH : I<0xAE, MRM7m, (outs), (ins i8mem:$src),
+ "clflush\t$src", [(int_x86_sse2_clflush addr:$src)],
+ IIC_SSE_PREFETCH>, PS, Requires<[HasSSE2]>;
+}
+
+let SchedRW = [WriteNop] in {
+// Pause. This "instruction" is encoded as "rep; nop", so even though it
+// was introduced with SSE2, it's backward compatible.
+def PAUSE : I<0x90, RawFrm, (outs), (ins),
+ "pause", [(int_x86_sse2_pause)], IIC_SSE_PAUSE>,
+ OBXS, Requires<[HasSSE2]>;
+}
+
+let SchedRW = [WriteFence] in {
+// Load, store, and memory fence
+def SFENCE : I<0xAE, MRM_F8, (outs), (ins),
+ "sfence", [(int_x86_sse_sfence)], IIC_SSE_SFENCE>,
+ PS, Requires<[HasSSE1]>;
+def LFENCE : I<0xAE, MRM_E8, (outs), (ins),
+ "lfence", [(int_x86_sse2_lfence)], IIC_SSE_LFENCE>,
+ TB, Requires<[HasSSE2]>;
+def MFENCE : I<0xAE, MRM_F0, (outs), (ins),
+ "mfence", [(int_x86_sse2_mfence)], IIC_SSE_MFENCE>,
+ TB, Requires<[HasSSE2]>;
+} // SchedRW
+
+def : Pat<(X86SFence), (SFENCE)>;
+def : Pat<(X86LFence), (LFENCE)>;
+def : Pat<(X86MFence), (MFENCE)>;
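+
+// Hardware ordering recap: sfence orders stores against stores, lfence
+// orders loads against loads, and mfence orders all loads and stores.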
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Load/Store MXCSR register
+//===----------------------------------------------------------------------===//
+
+def VLDMXCSR : VPSI<0xAE, MRM2m, (outs), (ins i32mem:$src),
+ "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)],
+ IIC_SSE_LDMXCSR>, VEX, Sched<[WriteLoad]>;
+def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst),
+ "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)],
+ IIC_SSE_STMXCSR>, VEX, Sched<[WriteStore]>;
+
+let Predicates = [UseSSE1] in {
+def LDMXCSR : I<0xAE, MRM2m, (outs), (ins i32mem:$src),
+ "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)],
+ IIC_SSE_LDMXCSR>, TB, Sched<[WriteLoad]>;
+def STMXCSR : I<0xAE, MRM3m, (outs), (ins i32mem:$dst),
+ "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)],
+ IIC_SSE_STMXCSR>, TB, Sched<[WriteStore]>;
+}
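+
+// Illustrative usage: _mm_setcsr(x) maps to ldmxcsr and _mm_getcsr() to
+// stmxcsr, e.g. to change the rounding mode or set the FTZ/DAZ bits.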
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Move Aligned/Unaligned Packed Integer Instructions
+//===---------------------------------------------------------------------===//
+
+let ExeDomain = SSEPackedInt in { // SSE integer instructions
+
+let hasSideEffects = 0, SchedRW = [WriteMove] in {
+def VMOVDQArr : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>,
+ VEX;
+def VMOVDQAYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
+ "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>,
+ VEX, VEX_L;
+def VMOVDQUrr : VSSI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "movdqu\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVU_P_RR>,
+ VEX;
+def VMOVDQUYrr : VSSI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
+ "movdqu\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVU_P_RR>,
+ VEX, VEX_L;
+}
+
+// For Disassembler
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
+ SchedRW = [WriteMove] in {
+def VMOVDQArr_REV : VPDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+ "movdqa\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_MOVA_P_RR>,
+ VEX;
+def VMOVDQAYrr_REV : VPDI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
+ "movdqa\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_MOVA_P_RR>, VEX, VEX_L;
+def VMOVDQUrr_REV : VSSI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+ "movdqu\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_MOVU_P_RR>,
+ VEX;
+def VMOVDQUYrr_REV : VSSI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
+ "movdqu\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_MOVU_P_RR>, VEX, VEX_L;
+}
+
+let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
+ hasSideEffects = 0, SchedRW = [WriteLoad] in {
+def VMOVDQArm : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+ "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RM>,
+ VEX;
+def VMOVDQAYrm : VPDI<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
+ "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RM>,
+ VEX, VEX_L;
+let Predicates = [HasAVX] in {
+ def VMOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+ "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_RM>,
+ XS, VEX;
+ def VMOVDQUYrm : I<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
+ "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_RM>,
+ XS, VEX, VEX_L;
+}
+}
+
+let mayStore = 1, hasSideEffects = 0, SchedRW = [WriteStore] in {
+def VMOVDQAmr : VPDI<0x7F, MRMDestMem, (outs),
+ (ins i128mem:$dst, VR128:$src),
+ "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_MR>,
+ VEX;
+def VMOVDQAYmr : VPDI<0x7F, MRMDestMem, (outs),
+ (ins i256mem:$dst, VR256:$src),
+ "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_MR>,
+ VEX, VEX_L;
+let Predicates = [HasAVX] in {
+def VMOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
+ "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_MR>,
+ XS, VEX;
+def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src),
+ "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_MR>,
+ XS, VEX, VEX_L;
+}
+}
+
+let SchedRW = [WriteMove] in {
+let hasSideEffects = 0 in
+def MOVDQArr : PDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>;
+
+def MOVDQUrr : I<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "movdqu\t{$src, $dst|$dst, $src}",
+ [], IIC_SSE_MOVU_P_RR>, XS, Requires<[UseSSE2]>;
+
+// For Disassembler
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
+def MOVDQArr_REV : PDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+ "movdqa\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_MOVA_P_RR>;
+
+def MOVDQUrr_REV : I<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+ "movdqu\t{$src, $dst|$dst, $src}",
+ [], IIC_SSE_MOVU_P_RR>, XS, Requires<[UseSSE2]>;
+}
+} // SchedRW
+
+let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
+ hasSideEffects = 0, SchedRW = [WriteLoad] in {
+def MOVDQArm : PDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+ "movdqa\t{$src, $dst|$dst, $src}",
+ [/*(set VR128:$dst, (alignedloadv2i64 addr:$src))*/],
+ IIC_SSE_MOVA_P_RM>;
+def MOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+ "movdqu\t{$src, $dst|$dst, $src}",
+ [/*(set VR128:$dst, (loadv2i64 addr:$src))*/],
+ IIC_SSE_MOVU_P_RM>,
+ XS, Requires<[UseSSE2]>;
+}
+
+let mayStore = 1, hasSideEffects = 0, SchedRW = [WriteStore] in {
+def MOVDQAmr : PDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
+ "movdqa\t{$src, $dst|$dst, $src}",
+ [/*(alignedstore (v2i64 VR128:$src), addr:$dst)*/],
+ IIC_SSE_MOVA_P_MR>;
+def MOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
+ "movdqu\t{$src, $dst|$dst, $src}",
+ [/*(store (v2i64 VR128:$src), addr:$dst)*/],
+ IIC_SSE_MOVU_P_MR>,
+ XS, Requires<[UseSSE2]>;
+}
+
+} // ExeDomain = SSEPackedInt
+
+let Predicates = [HasAVX] in {
+ def : Pat<(int_x86_sse2_storeu_dq addr:$dst, VR128:$src),
+ (VMOVDQUmr addr:$dst, VR128:$src)>;
+ def : Pat<(int_x86_avx_storeu_dq_256 addr:$dst, VR256:$src),
+ (VMOVDQUYmr addr:$dst, VR256:$src)>;
+}
+let Predicates = [UseSSE2] in
+def : Pat<(int_x86_sse2_storeu_dq addr:$dst, VR128:$src),
+ (MOVDQUmr addr:$dst, VR128:$src)>;
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Packed Integer Arithmetic Instructions
+//===---------------------------------------------------------------------===//
+
+let Sched = WriteVecIMul in
+def SSE_PMADD : OpndItins<
+ IIC_SSE_PMADD, IIC_SSE_PMADD
+>;
+
+let ExeDomain = SSEPackedInt in { // SSE integer instructions
+
+multiclass PDI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId,
+ RegisterClass RC, PatFrag memop_frag,
+ X86MemOperand x86memop,
+ OpndItins itins,
+ bit IsCommutable = 0,
+ bit Is2Addr = 1> {
+ let isCommutable = IsCommutable in
+ def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (IntId RC:$src1, RC:$src2))], itins.rr>,
+ Sched<[itins.Sched]>;
+ def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (IntId RC:$src1, (bitconvert (memop_frag addr:$src2))))],
+ itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+
+multiclass PDI_binop_all_int<bits<8> opc, string OpcodeStr, Intrinsic IntId128,
+ Intrinsic IntId256, OpndItins itins,
+ bit IsCommutable = 0> {
+let Predicates = [HasAVX] in
+ defm V#NAME : PDI_binop_rm_int<opc, !strconcat("v", OpcodeStr), IntId128,
+ VR128, loadv2i64, i128mem, itins,
+ IsCommutable, 0>, VEX_4V;
+
+let Constraints = "$src1 = $dst" in
+ defm NAME : PDI_binop_rm_int<opc, OpcodeStr, IntId128, VR128, memopv2i64,
+ i128mem, itins, IsCommutable, 1>;
+
+let Predicates = [HasAVX2] in
+ defm V#NAME#Y : PDI_binop_rm_int<opc, !strconcat("v", OpcodeStr), IntId256,
+ VR256, loadv4i64, i256mem, itins,
+ IsCommutable, 0>, VEX_4V, VEX_L;
+}
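+// Naming sketch (informal): a single "defm PSUBSB : PDI_binop_all_int<...>"
+// below expands roughly to PSUBSBrr/PSUBSBrm (SSE), VPSUBSBrr/VPSUBSBrm
+// (AVX, via V#NAME), and VPSUBSBYrr/VPSUBSBYrm (AVX2 256-bit, via V#NAME#Y).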
+
+multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm,
+ string OpcodeStr, SDNode OpNode,
+ SDNode OpNode2, RegisterClass RC,
+ ValueType DstVT, ValueType SrcVT, PatFrag bc_frag,
+ PatFrag ld_frag, ShiftOpndItins itins,
+ bit Is2Addr = 1> {
+ // src2 is always 128-bit
+ def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, VR128:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (DstVT (OpNode RC:$src1, (SrcVT VR128:$src2))))],
+ itins.rr>, Sched<[WriteVecShift]>;
+ def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, i128mem:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (DstVT (OpNode RC:$src1,
+ (bc_frag (ld_frag addr:$src2)))))], itins.rm>,
+ Sched<[WriteVecShiftLd, ReadAfterLd]>;
+ def ri : PDIi8<opc2, ImmForm, (outs RC:$dst),
+ (ins RC:$src1, u8imm:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i8 imm:$src2))))], itins.ri>,
+ Sched<[WriteVecShift]>;
+}
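+// Usage sketch (informal): each shift defined via this multiclass gets three
+// forms, e.g. for PSLLW roughly "psllw %xmm2, %xmm1" (rr, count in an XMM
+// register), "psllw (%rax), %xmm1" (rm, count loaded from memory), and
+// "psllw $3, %xmm1" (ri, immediate count through the opc2/ImmForm encoding).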
+
+/// PDI_binop_rm2 - Simple SSE2 binary operator with different src and dst types
+multiclass PDI_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ ValueType DstVT, ValueType SrcVT, RegisterClass RC,
+ PatFrag memop_frag, X86MemOperand x86memop,
+ OpndItins itins,
+ bit IsCommutable = 0, bit Is2Addr = 1> {
+ let isCommutable = IsCommutable in
+ def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))]>,
+ Sched<[itins.Sched]>;
+ def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1),
+ (bitconvert (memop_frag addr:$src2)))))]>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
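+// Semantics sketch: PDI_binop_rm2 is for ops whose source and result element
+// types differ; e.g. pmuludq below takes v4i32/v8i32 sources and multiplies
+// the even 32-bit lanes into full 64-bit products (v2i64/v4i64 results).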
+} // ExeDomain = SSEPackedInt
+
+defm PADDB : PDI_binop_all<0xFC, "paddb", add, v16i8, v32i8,
+ SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
+defm PADDW : PDI_binop_all<0xFD, "paddw", add, v8i16, v16i16,
+ SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
+defm PADDD : PDI_binop_all<0xFE, "paddd", add, v4i32, v8i32,
+ SSE_INTALU_ITINS_P, 1, NoVLX>;
+defm PADDQ : PDI_binop_all<0xD4, "paddq", add, v2i64, v4i64,
+ SSE_INTALUQ_ITINS_P, 1, NoVLX>;
+defm PMULLW : PDI_binop_all<0xD5, "pmullw", mul, v8i16, v16i16,
+ SSE_INTMUL_ITINS_P, 1, NoVLX_Or_NoBWI>;
+defm PMULHUW : PDI_binop_all<0xE4, "pmulhuw", mulhu, v8i16, v16i16,
+ SSE_INTMUL_ITINS_P, 1, NoVLX_Or_NoBWI>;
+defm PMULHW : PDI_binop_all<0xE5, "pmulhw", mulhs, v8i16, v16i16,
+ SSE_INTMUL_ITINS_P, 1, NoVLX_Or_NoBWI>;
+defm PSUBB : PDI_binop_all<0xF8, "psubb", sub, v16i8, v32i8,
+ SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>;
+defm PSUBW : PDI_binop_all<0xF9, "psubw", sub, v8i16, v16i16,
+ SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>;
+defm PSUBD : PDI_binop_all<0xFA, "psubd", sub, v4i32, v8i32,
+ SSE_INTALU_ITINS_P, 0, NoVLX>;
+defm PSUBQ : PDI_binop_all<0xFB, "psubq", sub, v2i64, v4i64,
+ SSE_INTALUQ_ITINS_P, 0, NoVLX>;
+defm PSUBUSB : PDI_binop_all<0xD8, "psubusb", X86subus, v16i8, v32i8,
+ SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>;
+defm PSUBUSW : PDI_binop_all<0xD9, "psubusw", X86subus, v8i16, v16i16,
+ SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>;
+defm PMINUB : PDI_binop_all<0xDA, "pminub", umin, v16i8, v32i8,
+ SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
+defm PMINSW : PDI_binop_all<0xEA, "pminsw", smin, v8i16, v16i16,
+ SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
+defm PMAXUB : PDI_binop_all<0xDE, "pmaxub", umax, v16i8, v32i8,
+ SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
+defm PMAXSW : PDI_binop_all<0xEE, "pmaxsw", smax, v8i16, v16i16,
+ SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
+defm PAVGB : PDI_binop_all<0xE0, "pavgb", X86avg, v16i8, v32i8,
+ SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
+defm PAVGW : PDI_binop_all<0xE3, "pavgw", X86avg, v8i16, v16i16,
+ SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
+
+// Intrinsic forms
+defm PSUBSB : PDI_binop_all_int<0xE8, "psubsb", int_x86_sse2_psubs_b,
+ int_x86_avx2_psubs_b, SSE_INTALU_ITINS_P, 0>;
+defm PSUBSW : PDI_binop_all_int<0xE9, "psubsw" , int_x86_sse2_psubs_w,
+ int_x86_avx2_psubs_w, SSE_INTALU_ITINS_P, 0>;
+defm PADDSB : PDI_binop_all_int<0xEC, "paddsb" , int_x86_sse2_padds_b,
+ int_x86_avx2_padds_b, SSE_INTALU_ITINS_P, 1>;
+defm PADDSW : PDI_binop_all_int<0xED, "paddsw" , int_x86_sse2_padds_w,
+ int_x86_avx2_padds_w, SSE_INTALU_ITINS_P, 1>;
+defm PADDUSB : PDI_binop_all_int<0xDC, "paddusb", int_x86_sse2_paddus_b,
+ int_x86_avx2_paddus_b, SSE_INTALU_ITINS_P, 1>;
+defm PADDUSW : PDI_binop_all_int<0xDD, "paddusw", int_x86_sse2_paddus_w,
+ int_x86_avx2_paddus_w, SSE_INTALU_ITINS_P, 1>;
+defm PMADDWD : PDI_binop_all_int<0xF5, "pmaddwd", int_x86_sse2_pmadd_wd,
+ int_x86_avx2_pmadd_wd, SSE_PMADD, 1>;
+
+let Predicates = [HasAVX] in
+defm VPSADBW : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v2i64, v16i8, VR128,
+ loadv2i64, i128mem, SSE_INTMUL_ITINS_P, 1, 0>,
+ VEX_4V;
+let Predicates = [HasAVX2] in
+defm VPSADBWY : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v4i64, v32i8, VR256,
+ loadv4i64, i256mem, SSE_INTMUL_ITINS_P, 1, 0>,
+ VEX_4V, VEX_L;
+let Constraints = "$src1 = $dst" in
+defm PSADBW : PDI_binop_rm2<0xF6, "psadbw", X86psadbw, v2i64, v16i8, VR128,
+ memopv2i64, i128mem, SSE_INTALU_ITINS_P, 1>;
+
+let Predicates = [HasAVX] in
+defm VPMULUDQ : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v2i64, v4i32, VR128,
+ loadv2i64, i128mem, SSE_INTMUL_ITINS_P, 1, 0>,
+ VEX_4V;
+let Predicates = [HasAVX2] in
+defm VPMULUDQY : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v4i64, v8i32,
+ VR256, loadv4i64, i256mem,
+ SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_L;
+let Constraints = "$src1 = $dst" in
+defm PMULUDQ : PDI_binop_rm2<0xF4, "pmuludq", X86pmuludq, v2i64, v4i32, VR128,
+ memopv2i64, i128mem, SSE_INTMUL_ITINS_P, 1>;
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Packed Integer Logical Instructions
+//===---------------------------------------------------------------------===//
+
+let Predicates = [HasAVX, NoVLX] in {
+defm VPSLLD : PDI_binop_rmi<0xF2, 0x72, MRM6r, "vpslld", X86vshl, X86vshli,
+ VR128, v4i32, v4i32, bc_v4i32, loadv2i64,
+ SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
+defm VPSLLQ : PDI_binop_rmi<0xF3, 0x73, MRM6r, "vpsllq", X86vshl, X86vshli,
+ VR128, v2i64, v2i64, bc_v2i64, loadv2i64,
+ SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
+
+defm VPSRLD : PDI_binop_rmi<0xD2, 0x72, MRM2r, "vpsrld", X86vsrl, X86vsrli,
+ VR128, v4i32, v4i32, bc_v4i32, loadv2i64,
+ SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
+defm VPSRLQ : PDI_binop_rmi<0xD3, 0x73, MRM2r, "vpsrlq", X86vsrl, X86vsrli,
+ VR128, v2i64, v2i64, bc_v2i64, loadv2i64,
+ SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
+
+defm VPSRAD : PDI_binop_rmi<0xE2, 0x72, MRM4r, "vpsrad", X86vsra, X86vsrai,
+ VR128, v4i32, v4i32, bc_v4i32, loadv2i64,
+ SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
+} // Predicates = [HasAVX, NoVLX]
+
+let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
+defm VPSLLW : PDI_binop_rmi<0xF1, 0x71, MRM6r, "vpsllw", X86vshl, X86vshli,
+ VR128, v8i16, v8i16, bc_v8i16, loadv2i64,
+ SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
+defm VPSRLW : PDI_binop_rmi<0xD1, 0x71, MRM2r, "vpsrlw", X86vsrl, X86vsrli,
+ VR128, v8i16, v8i16, bc_v8i16, loadv2i64,
+ SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
+defm VPSRAW : PDI_binop_rmi<0xE1, 0x71, MRM4r, "vpsraw", X86vsra, X86vsrai,
+ VR128, v8i16, v8i16, bc_v8i16, loadv2i64,
+ SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
+} // Predicates = [HasAVX, NoVLX_Or_NoBWI]
+
+let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift],
+                              Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
+ // 128-bit logical shifts.
+ def VPSLLDQri : PDIi8<0x73, MRM7r,
+ (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2),
+ "vpslldq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst,
+ (v2i64 (X86vshldq VR128:$src1, (i8 imm:$src2))))]>,
+ VEX_4V;
+ def VPSRLDQri : PDIi8<0x73, MRM3r,
+ (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2),
+ "vpsrldq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst,
+ (v2i64 (X86vshrdq VR128:$src1, (i8 imm:$src2))))]>,
+ VEX_4V;
+ // PSRADQri doesn't exist in SSE[1-3].
+} // Predicates = [HasAVX, NoVLX_Or_NoBWI]
+
+let Predicates = [HasAVX2, NoVLX] in {
+defm VPSLLDY : PDI_binop_rmi<0xF2, 0x72, MRM6r, "vpslld", X86vshl, X86vshli,
+ VR256, v8i32, v4i32, bc_v4i32, loadv2i64,
+ SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
+defm VPSLLQY : PDI_binop_rmi<0xF3, 0x73, MRM6r, "vpsllq", X86vshl, X86vshli,
+ VR256, v4i64, v2i64, bc_v2i64, loadv2i64,
+ SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
+
+defm VPSRLDY : PDI_binop_rmi<0xD2, 0x72, MRM2r, "vpsrld", X86vsrl, X86vsrli,
+ VR256, v8i32, v4i32, bc_v4i32, loadv2i64,
+ SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
+defm VPSRLQY : PDI_binop_rmi<0xD3, 0x73, MRM2r, "vpsrlq", X86vsrl, X86vsrli,
+ VR256, v4i64, v2i64, bc_v2i64, loadv2i64,
+ SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
+
+defm VPSRADY : PDI_binop_rmi<0xE2, 0x72, MRM4r, "vpsrad", X86vsra, X86vsrai,
+ VR256, v8i32, v4i32, bc_v4i32, loadv2i64,
+ SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
+} // Predicates = [HasAVX2, NoVLX]
+
+let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
+defm VPSLLWY : PDI_binop_rmi<0xF1, 0x71, MRM6r, "vpsllw", X86vshl, X86vshli,
+ VR256, v16i16, v8i16, bc_v8i16, loadv2i64,
+ SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
+defm VPSRLWY : PDI_binop_rmi<0xD1, 0x71, MRM2r, "vpsrlw", X86vsrl, X86vsrli,
+ VR256, v16i16, v8i16, bc_v8i16, loadv2i64,
+ SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
+defm VPSRAWY : PDI_binop_rmi<0xE1, 0x71, MRM4r, "vpsraw", X86vsra, X86vsrai,
+ VR256, v16i16, v8i16, bc_v8i16, loadv2i64,
+ SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
+} // Predicates = [HasAVX2, NoVLX_Or_NoBWI]
+
+let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift], hasSideEffects = 0,
+ Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
+ // 256-bit logical shifts.
+ def VPSLLDQYri : PDIi8<0x73, MRM7r,
+ (outs VR256:$dst), (ins VR256:$src1, u8imm:$src2),
+ "vpslldq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR256:$dst,
+ (v4i64 (X86vshldq VR256:$src1, (i8 imm:$src2))))]>,
+ VEX_4V, VEX_L;
+ def VPSRLDQYri : PDIi8<0x73, MRM3r,
+ (outs VR256:$dst), (ins VR256:$src1, u8imm:$src2),
+ "vpsrldq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR256:$dst,
+ (v4i64 (X86vshrdq VR256:$src1, (i8 imm:$src2))))]>,
+ VEX_4V, VEX_L;
+ // PSRADQYri doesn't exist in SSE[1-3].
+} // Predicates = [HasAVX2, NoVLX_Or_NoBWI]
+
+let Constraints = "$src1 = $dst" in {
+defm PSLLW : PDI_binop_rmi<0xF1, 0x71, MRM6r, "psllw", X86vshl, X86vshli,
+ VR128, v8i16, v8i16, bc_v8i16, memopv2i64,
+ SSE_INTSHIFT_ITINS_P>;
+defm PSLLD : PDI_binop_rmi<0xF2, 0x72, MRM6r, "pslld", X86vshl, X86vshli,
+ VR128, v4i32, v4i32, bc_v4i32, memopv2i64,
+ SSE_INTSHIFT_ITINS_P>;
+defm PSLLQ : PDI_binop_rmi<0xF3, 0x73, MRM6r, "psllq", X86vshl, X86vshli,
+ VR128, v2i64, v2i64, bc_v2i64, memopv2i64,
+ SSE_INTSHIFT_ITINS_P>;
+
+defm PSRLW : PDI_binop_rmi<0xD1, 0x71, MRM2r, "psrlw", X86vsrl, X86vsrli,
+ VR128, v8i16, v8i16, bc_v8i16, memopv2i64,
+ SSE_INTSHIFT_ITINS_P>;
+defm PSRLD : PDI_binop_rmi<0xD2, 0x72, MRM2r, "psrld", X86vsrl, X86vsrli,
+ VR128, v4i32, v4i32, bc_v4i32, memopv2i64,
+ SSE_INTSHIFT_ITINS_P>;
+defm PSRLQ : PDI_binop_rmi<0xD3, 0x73, MRM2r, "psrlq", X86vsrl, X86vsrli,
+ VR128, v2i64, v2i64, bc_v2i64, memopv2i64,
+ SSE_INTSHIFT_ITINS_P>;
+
+defm PSRAW : PDI_binop_rmi<0xE1, 0x71, MRM4r, "psraw", X86vsra, X86vsrai,
+ VR128, v8i16, v8i16, bc_v8i16, memopv2i64,
+ SSE_INTSHIFT_ITINS_P>;
+defm PSRAD : PDI_binop_rmi<0xE2, 0x72, MRM4r, "psrad", X86vsra, X86vsrai,
+ VR128, v4i32, v4i32, bc_v4i32, memopv2i64,
+ SSE_INTSHIFT_ITINS_P>;
+
+let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift], hasSideEffects = 0 in {
+ // 128-bit logical shifts.
+ def PSLLDQri : PDIi8<0x73, MRM7r,
+ (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2),
+ "pslldq\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v2i64 (X86vshldq VR128:$src1, (i8 imm:$src2))))],
+ IIC_SSE_INTSHDQ_P_RI>;
+ def PSRLDQri : PDIi8<0x73, MRM3r,
+ (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2),
+ "psrldq\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v2i64 (X86vshrdq VR128:$src1, (i8 imm:$src2))))],
+ IIC_SSE_INTSHDQ_P_RI>;
+ // PSRADQri doesn't exist in SSE[1-3].
+}
+} // Constraints = "$src1 = $dst"
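+// Note: unlike the element shifts above, pslldq/psrldq shift the full 128-bit
+// value by bytes, e.g. "pslldq $4, %xmm0" moves every byte up four positions;
+// that is why they use the separate X86vshldq/X86vshrdq nodes.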
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Packed Integer Comparison Instructions
+//===---------------------------------------------------------------------===//
+
+defm PCMPEQB : PDI_binop_all<0x74, "pcmpeqb", X86pcmpeq, v16i8, v32i8,
+ SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
+defm PCMPEQW : PDI_binop_all<0x75, "pcmpeqw", X86pcmpeq, v8i16, v16i16,
+ SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
+defm PCMPEQD : PDI_binop_all<0x76, "pcmpeqd", X86pcmpeq, v4i32, v8i32,
+ SSE_INTALU_ITINS_P, 1, NoVLX>;
+defm PCMPGTB : PDI_binop_all<0x64, "pcmpgtb", X86pcmpgt, v16i8, v32i8,
+ SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>;
+defm PCMPGTW : PDI_binop_all<0x65, "pcmpgtw", X86pcmpgt, v8i16, v16i16,
+ SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>;
+defm PCMPGTD : PDI_binop_all<0x66, "pcmpgtd", X86pcmpgt, v4i32, v8i32,
+ SSE_INTALU_ITINS_P, 0, NoVLX>;
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Packed Integer Shuffle Instructions
+//===---------------------------------------------------------------------===//
+
+let ExeDomain = SSEPackedInt in {
+multiclass sse2_pshuffle<string OpcodeStr, ValueType vt128, ValueType vt256,
+ SDNode OpNode> {
+let Predicates = [HasAVX] in {
+ def V#NAME#ri : Ii8<0x70, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, u8imm:$src2),
+ !strconcat("v", OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst,
+ (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))],
+ IIC_SSE_PSHUF_RI>, VEX, Sched<[WriteShuffle]>;
+ def V#NAME#mi : Ii8<0x70, MRMSrcMem, (outs VR128:$dst),
+ (ins i128mem:$src1, u8imm:$src2),
+ !strconcat("v", OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst,
+ (vt128 (OpNode (bitconvert (loadv2i64 addr:$src1)),
+ (i8 imm:$src2))))], IIC_SSE_PSHUF_MI>, VEX,
+ Sched<[WriteShuffleLd]>;
+}
+
+let Predicates = [HasAVX2] in {
+ def V#NAME#Yri : Ii8<0x70, MRMSrcReg, (outs VR256:$dst),
+ (ins VR256:$src1, u8imm:$src2),
+ !strconcat("v", OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst,
+ (vt256 (OpNode VR256:$src1, (i8 imm:$src2))))],
+ IIC_SSE_PSHUF_RI>, VEX, VEX_L, Sched<[WriteShuffle]>;
+ def V#NAME#Ymi : Ii8<0x70, MRMSrcMem, (outs VR256:$dst),
+ (ins i256mem:$src1, u8imm:$src2),
+ !strconcat("v", OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst,
+ (vt256 (OpNode (bitconvert (loadv4i64 addr:$src1)),
+ (i8 imm:$src2))))], IIC_SSE_PSHUF_MI>, VEX, VEX_L,
+ Sched<[WriteShuffleLd]>;
+}
+
+let Predicates = [UseSSE2] in {
+ def ri : Ii8<0x70, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst,
+ (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))],
+ IIC_SSE_PSHUF_RI>, Sched<[WriteShuffle]>;
+ def mi : Ii8<0x70, MRMSrcMem,
+ (outs VR128:$dst), (ins i128mem:$src1, u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst,
+ (vt128 (OpNode (bitconvert (memopv2i64 addr:$src1)),
+ (i8 imm:$src2))))], IIC_SSE_PSHUF_MI>,
+ Sched<[WriteShuffleLd, ReadAfterLd]>;
+}
+}
+} // ExeDomain = SSEPackedInt
+
+defm PSHUFD : sse2_pshuffle<"pshufd", v4i32, v8i32, X86PShufd>, PD;
+defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, v16i16, X86PShufhw>, XS;
+defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, v16i16, X86PShuflw>, XD;
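+// Immediate sketch: pshufd selects each destination dword with a 2-bit field
+// of the imm8, so e.g. "pshufd $0x1b, %xmm1, %xmm0" (0x1b = 0b00011011) picks
+// elements 3,2,1,0 and reverses the vector.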
+
+let Predicates = [HasAVX] in {
+ def : Pat<(v4f32 (X86PShufd (loadv4f32 addr:$src1), (i8 imm:$imm))),
+ (VPSHUFDmi addr:$src1, imm:$imm)>;
+ def : Pat<(v4f32 (X86PShufd VR128:$src1, (i8 imm:$imm))),
+ (VPSHUFDri VR128:$src1, imm:$imm)>;
+}
+
+let Predicates = [UseSSE2] in {
+ def : Pat<(v4f32 (X86PShufd (memopv4f32 addr:$src1), (i8 imm:$imm))),
+ (PSHUFDmi addr:$src1, imm:$imm)>;
+ def : Pat<(v4f32 (X86PShufd VR128:$src1, (i8 imm:$imm))),
+ (PSHUFDri VR128:$src1, imm:$imm)>;
+}
+
+//===---------------------------------------------------------------------===//
+// Packed Integer Pack Instructions (SSE & AVX)
+//===---------------------------------------------------------------------===//
+
+let ExeDomain = SSEPackedInt in {
+multiclass sse2_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
+ ValueType ArgVT, SDNode OpNode, PatFrag bc_frag,
+ PatFrag ld_frag, bit Is2Addr = 1> {
+ def rr : PDI<opc, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set VR128:$dst,
+ (OutVT (OpNode (ArgVT VR128:$src1), VR128:$src2)))]>,
+ Sched<[WriteShuffle]>;
+ def rm : PDI<opc, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set VR128:$dst,
+ (OutVT (OpNode VR128:$src1,
+ (bc_frag (ld_frag addr:$src2)))))]>,
+ Sched<[WriteShuffleLd, ReadAfterLd]>;
+}
+
+multiclass sse2_pack_y<bits<8> opc, string OpcodeStr, ValueType OutVT,
+ ValueType ArgVT, SDNode OpNode, PatFrag bc_frag> {
+ def Yrr : PDI<opc, MRMSrcReg,
+ (outs VR256:$dst), (ins VR256:$src1, VR256:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst,
+ (OutVT (OpNode (ArgVT VR256:$src1), VR256:$src2)))]>,
+ Sched<[WriteShuffle]>;
+ def Yrm : PDI<opc, MRMSrcMem,
+ (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst,
+ (OutVT (OpNode VR256:$src1,
+ (bc_frag (loadv4i64 addr:$src2)))))]>,
+ Sched<[WriteShuffleLd, ReadAfterLd]>;
+}
+
+multiclass sse4_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
+ ValueType ArgVT, SDNode OpNode, PatFrag bc_frag,
+ PatFrag ld_frag, bit Is2Addr = 1> {
+ def rr : SS48I<opc, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set VR128:$dst,
+ (OutVT (OpNode (ArgVT VR128:$src1), VR128:$src2)))]>,
+ Sched<[WriteShuffle]>;
+ def rm : SS48I<opc, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set VR128:$dst,
+ (OutVT (OpNode VR128:$src1,
+ (bc_frag (ld_frag addr:$src2)))))]>,
+ Sched<[WriteShuffleLd, ReadAfterLd]>;
+}
+
+multiclass sse4_pack_y<bits<8> opc, string OpcodeStr, ValueType OutVT,
+ ValueType ArgVT, SDNode OpNode, PatFrag bc_frag> {
+ def Yrr : SS48I<opc, MRMSrcReg,
+ (outs VR256:$dst), (ins VR256:$src1, VR256:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst,
+ (OutVT (OpNode (ArgVT VR256:$src1), VR256:$src2)))]>,
+ Sched<[WriteShuffle]>;
+ def Yrm : SS48I<opc, MRMSrcMem,
+ (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst,
+ (OutVT (OpNode VR256:$src1,
+ (bc_frag (loadv4i64 addr:$src2)))))]>,
+ Sched<[WriteShuffleLd, ReadAfterLd]>;
+}
+
+let Predicates = [HasAVX] in {
+ defm VPACKSSWB : sse2_pack<0x63, "vpacksswb", v16i8, v8i16, X86Packss,
+ bc_v8i16, loadv2i64, 0>, VEX_4V;
+ defm VPACKSSDW : sse2_pack<0x6B, "vpackssdw", v8i16, v4i32, X86Packss,
+ bc_v4i32, loadv2i64, 0>, VEX_4V;
+
+ defm VPACKUSWB : sse2_pack<0x67, "vpackuswb", v16i8, v8i16, X86Packus,
+ bc_v8i16, loadv2i64, 0>, VEX_4V;
+ defm VPACKUSDW : sse4_pack<0x2B, "vpackusdw", v8i16, v4i32, X86Packus,
+ bc_v4i32, loadv2i64, 0>, VEX_4V;
+}
+
+let Predicates = [HasAVX2] in {
+ defm VPACKSSWB : sse2_pack_y<0x63, "vpacksswb", v32i8, v16i16, X86Packss,
+ bc_v16i16>, VEX_4V, VEX_L;
+ defm VPACKSSDW : sse2_pack_y<0x6B, "vpackssdw", v16i16, v8i32, X86Packss,
+ bc_v8i32>, VEX_4V, VEX_L;
+
+ defm VPACKUSWB : sse2_pack_y<0x67, "vpackuswb", v32i8, v16i16, X86Packus,
+ bc_v16i16>, VEX_4V, VEX_L;
+ defm VPACKUSDW : sse4_pack_y<0x2B, "vpackusdw", v16i16, v8i32, X86Packus,
+ bc_v8i32>, VEX_4V, VEX_L;
+}
+
+let Constraints = "$src1 = $dst" in {
+ defm PACKSSWB : sse2_pack<0x63, "packsswb", v16i8, v8i16, X86Packss,
+ bc_v8i16, memopv2i64>;
+ defm PACKSSDW : sse2_pack<0x6B, "packssdw", v8i16, v4i32, X86Packss,
+ bc_v4i32, memopv2i64>;
+
+ defm PACKUSWB : sse2_pack<0x67, "packuswb", v16i8, v8i16, X86Packus,
+ bc_v8i16, memopv2i64>;
+
+ let Predicates = [HasSSE41] in
+ defm PACKUSDW : sse4_pack<0x2B, "packusdw", v8i16, v4i32, X86Packus,
+ bc_v4i32, memopv2i64>;
+}
+} // ExeDomain = SSEPackedInt
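+// Semantics sketch: the pack ops narrow with saturation, e.g. packsswb takes
+// eight signed words from each source and produces sixteen signed-saturated
+// bytes, while packuswb saturates the same signed words to unsigned bytes.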
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Packed Integer Unpack Instructions
+//===---------------------------------------------------------------------===//
+
+let ExeDomain = SSEPackedInt in {
+multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt,
+ SDNode OpNode, PatFrag bc_frag, PatFrag ld_frag,
+ bit Is2Addr = 1> {
+ def rr : PDI<opc, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set VR128:$dst, (vt (OpNode VR128:$src1, VR128:$src2)))],
+ IIC_SSE_UNPCK>, Sched<[WriteShuffle]>;
+ def rm : PDI<opc, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set VR128:$dst, (OpNode VR128:$src1,
+ (bc_frag (ld_frag addr:$src2))))],
+ IIC_SSE_UNPCK>,
+ Sched<[WriteShuffleLd, ReadAfterLd]>;
+}
+
+multiclass sse2_unpack_y<bits<8> opc, string OpcodeStr, ValueType vt,
+ SDNode OpNode, PatFrag bc_frag> {
+ def Yrr : PDI<opc, MRMSrcReg,
+ (outs VR256:$dst), (ins VR256:$src1, VR256:$src2),
+ !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst, (vt (OpNode VR256:$src1, VR256:$src2)))]>,
+ Sched<[WriteShuffle]>;
+ def Yrm : PDI<opc, MRMSrcMem,
+ (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2),
+ !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst, (OpNode VR256:$src1,
+ (bc_frag (loadv4i64 addr:$src2))))]>,
+ Sched<[WriteShuffleLd, ReadAfterLd]>;
+}
+
+let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
+ defm VPUNPCKLBW : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Unpckl,
+ bc_v16i8, loadv2i64, 0>, VEX_4V;
+ defm VPUNPCKLWD : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Unpckl,
+ bc_v8i16, loadv2i64, 0>, VEX_4V;
+ defm VPUNPCKHBW : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Unpckh,
+ bc_v16i8, loadv2i64, 0>, VEX_4V;
+ defm VPUNPCKHWD : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Unpckh,
+ bc_v8i16, loadv2i64, 0>, VEX_4V;
+}
+let Predicates = [HasAVX, NoVLX] in {
+ defm VPUNPCKLDQ : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl,
+ bc_v4i32, loadv2i64, 0>, VEX_4V;
+ defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl,
+ bc_v2i64, loadv2i64, 0>, VEX_4V;
+ defm VPUNPCKHDQ : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Unpckh,
+ bc_v4i32, loadv2i64, 0>, VEX_4V;
+ defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Unpckh,
+ bc_v2i64, loadv2i64, 0>, VEX_4V;
+}
+
+let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
+ defm VPUNPCKLBW : sse2_unpack_y<0x60, "vpunpcklbw", v32i8, X86Unpckl,
+ bc_v32i8>, VEX_4V, VEX_L;
+ defm VPUNPCKLWD : sse2_unpack_y<0x61, "vpunpcklwd", v16i16, X86Unpckl,
+ bc_v16i16>, VEX_4V, VEX_L;
+ defm VPUNPCKHBW : sse2_unpack_y<0x68, "vpunpckhbw", v32i8, X86Unpckh,
+ bc_v32i8>, VEX_4V, VEX_L;
+ defm VPUNPCKHWD : sse2_unpack_y<0x69, "vpunpckhwd", v16i16, X86Unpckh,
+ bc_v16i16>, VEX_4V, VEX_L;
+}
+let Predicates = [HasAVX2, NoVLX] in {
+ defm VPUNPCKLDQ : sse2_unpack_y<0x62, "vpunpckldq", v8i32, X86Unpckl,
+ bc_v8i32>, VEX_4V, VEX_L;
+ defm VPUNPCKLQDQ : sse2_unpack_y<0x6C, "vpunpcklqdq", v4i64, X86Unpckl,
+ bc_v4i64>, VEX_4V, VEX_L;
+ defm VPUNPCKHDQ : sse2_unpack_y<0x6A, "vpunpckhdq", v8i32, X86Unpckh,
+ bc_v8i32>, VEX_4V, VEX_L;
+ defm VPUNPCKHQDQ : sse2_unpack_y<0x6D, "vpunpckhqdq", v4i64, X86Unpckh,
+ bc_v4i64>, VEX_4V, VEX_L;
+}
+
+let Constraints = "$src1 = $dst" in {
+ defm PUNPCKLBW : sse2_unpack<0x60, "punpcklbw", v16i8, X86Unpckl,
+ bc_v16i8, memopv2i64>;
+ defm PUNPCKLWD : sse2_unpack<0x61, "punpcklwd", v8i16, X86Unpckl,
+ bc_v8i16, memopv2i64>;
+ defm PUNPCKLDQ : sse2_unpack<0x62, "punpckldq", v4i32, X86Unpckl,
+ bc_v4i32, memopv2i64>;
+ defm PUNPCKLQDQ : sse2_unpack<0x6C, "punpcklqdq", v2i64, X86Unpckl,
+ bc_v2i64, memopv2i64>;
+
+ defm PUNPCKHBW : sse2_unpack<0x68, "punpckhbw", v16i8, X86Unpckh,
+ bc_v16i8, memopv2i64>;
+ defm PUNPCKHWD : sse2_unpack<0x69, "punpckhwd", v8i16, X86Unpckh,
+ bc_v8i16, memopv2i64>;
+ defm PUNPCKHDQ : sse2_unpack<0x6A, "punpckhdq", v4i32, X86Unpckh,
+ bc_v4i32, memopv2i64>;
+ defm PUNPCKHQDQ : sse2_unpack<0x6D, "punpckhqdq", v2i64, X86Unpckh,
+ bc_v2i64, memopv2i64>;
+}
+} // ExeDomain = SSEPackedInt
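+// Semantics sketch: the "l" unpacks interleave the low halves of the two
+// sources and the "h" unpacks the high halves, e.g. punpcklbw produces
+// { a0,b0, a1,b1, ..., a7,b7 } from the low eight bytes of each input.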
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Packed Integer Extract and Insert
+//===---------------------------------------------------------------------===//
+
+let ExeDomain = SSEPackedInt in {
+multiclass sse2_pinsrw<bit Is2Addr = 1> {
+ def rri : Ii8<0xC4, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1,
+ GR32orGR64:$src2, u8imm:$src3),
+ !if(Is2Addr,
+ "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR128:$dst,
+ (X86pinsrw VR128:$src1, GR32orGR64:$src2, imm:$src3))],
+ IIC_SSE_PINSRW>, Sched<[WriteShuffle]>;
+ def rmi : Ii8<0xC4, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1,
+ i16mem:$src2, u8imm:$src3),
+ !if(Is2Addr,
+ "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR128:$dst,
+ (X86pinsrw VR128:$src1, (extloadi16 addr:$src2),
+ imm:$src3))], IIC_SSE_PINSRW>,
+ Sched<[WriteShuffleLd, ReadAfterLd]>;
+}
+
+// Extract
+let Predicates = [HasAVX, NoBWI] in
+def VPEXTRWri : Ii8<0xC5, MRMSrcReg,
+ (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2),
+ "vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
+ imm:$src2))]>, PD, VEX,
+ Sched<[WriteShuffle]>;
+def PEXTRWri : PDIi8<0xC5, MRMSrcReg,
+ (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2),
+ "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
+ imm:$src2))], IIC_SSE_PEXTRW>,
+ Sched<[WriteShuffleLd, ReadAfterLd]>;
+
+// Insert
+let Predicates = [HasAVX, NoBWI] in
+defm VPINSRW : sse2_pinsrw<0>, PD, VEX_4V;
+
+let Predicates = [UseSSE2], Constraints = "$src1 = $dst" in
+defm PINSRW : sse2_pinsrw, PD;
+
+} // ExeDomain = SSEPackedInt
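+// Usage sketch: roughly, "pextrw $3, %xmm1, %eax" zero-extends word element 3
+// into a GPR, and "pinsrw $3, %eax, %xmm1" writes the low word of the GPR
+// back into that element; the rmi form inserts a word loaded via extloadi16.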
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Packed Mask Creation
+//===---------------------------------------------------------------------===//
+
+let ExeDomain = SSEPackedInt, SchedRW = [WriteVecLogic] in {
+
+def VPMOVMSKBrr : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
+ (ins VR128:$src),
+ "pmovmskb\t{$src, $dst|$dst, $src}",
+ [(set GR32orGR64:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))],
+ IIC_SSE_MOVMSK>, VEX;
+
+let Predicates = [HasAVX2] in {
+def VPMOVMSKBYrr : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
+ (ins VR256:$src),
+ "pmovmskb\t{$src, $dst|$dst, $src}",
+ [(set GR32orGR64:$dst, (int_x86_avx2_pmovmskb VR256:$src))]>,
+ VEX, VEX_L;
+}
+
+def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR128:$src),
+ "pmovmskb\t{$src, $dst|$dst, $src}",
+ [(set GR32orGR64:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))],
+ IIC_SSE_MOVMSK>;
+
+} // ExeDomain = SSEPackedInt
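+// Semantics sketch: pmovmskb gathers the sign bit of each byte lane, so the
+// 128-bit form sets the low 16 bits of the destination GPR (32 bits for the
+// YMM form) and zeroes the rest, matching the GR32orGR64 destination class.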
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Conditional Store
+//===---------------------------------------------------------------------===//
+
+let ExeDomain = SSEPackedInt, SchedRW = [WriteStore] in {
+
+let Uses = [EDI], Predicates = [HasAVX, Not64BitMode] in
+def VMASKMOVDQU : VPDI<0xF7, MRMSrcReg, (outs),
+ (ins VR128:$src, VR128:$mask),
+ "maskmovdqu\t{$mask, $src|$src, $mask}",
+ [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)],
+ IIC_SSE_MASKMOV>, VEX;
+let Uses = [RDI], Predicates = [HasAVX, In64BitMode] in
+def VMASKMOVDQU64 : VPDI<0xF7, MRMSrcReg, (outs),
+ (ins VR128:$src, VR128:$mask),
+ "maskmovdqu\t{$mask, $src|$src, $mask}",
+ [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)],
+ IIC_SSE_MASKMOV>, VEX;
+
+let Uses = [EDI], Predicates = [UseSSE2, Not64BitMode] in
+def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
+ "maskmovdqu\t{$mask, $src|$src, $mask}",
+ [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)],
+ IIC_SSE_MASKMOV>;
+let Uses = [RDI], Predicates = [UseSSE2, In64BitMode] in
+def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
+ "maskmovdqu\t{$mask, $src|$src, $mask}",
+ [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)],
+ IIC_SSE_MASKMOV>;
+
+} // ExeDomain = SSEPackedInt
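+// Semantics sketch: maskmovdqu stores only the bytes of $src whose mask byte
+// has its top bit set, to the address implicitly held in EDI/RDI; hence the
+// explicit Uses lists and the separate 32-/64-bit mode definitions above.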
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Move Doubleword/Quadword
+//===---------------------------------------------------------------------===//
+
+//===---------------------------------------------------------------------===//
+// Move Int Doubleword to Packed Double Int
+//
+def VMOVDI2PDIrr : VS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>,
+ VEX, Sched<[WriteMove]>;
+def VMOVDI2PDIrm : VS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v4i32 (scalar_to_vector (loadi32 addr:$src))))],
+ IIC_SSE_MOVDQ>,
+ VEX, Sched<[WriteLoad]>;
+def VMOV64toPQIrr : VRS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v2i64 (scalar_to_vector GR64:$src)))],
+ IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
+def VMOV64toPQIrm : VRS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [], IIC_SSE_MOVDQ>, VEX, Sched<[WriteLoad]>;
+let isCodeGenOnly = 1 in
+def VMOV64toSDrr : VRS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (bitconvert GR64:$src))],
+ IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
+
+def MOVDI2PDIrr : S2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>,
+ Sched<[WriteMove]>;
+def MOVDI2PDIrm : S2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v4i32 (scalar_to_vector (loadi32 addr:$src))))],
+ IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
+def MOV64toPQIrr : RS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
+ "mov{d|q}\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v2i64 (scalar_to_vector GR64:$src)))],
+ IIC_SSE_MOVDQ>, Sched<[WriteMove]>;
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
+def MOV64toPQIrm : RS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+ "mov{d|q}\t{$src, $dst|$dst, $src}",
+ [], IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
+let isCodeGenOnly = 1 in
+def MOV64toSDrr : RS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
+ "mov{d|q}\t{$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (bitconvert GR64:$src))],
+ IIC_SSE_MOVDQ>, Sched<[WriteMove]>;
+
+//===---------------------------------------------------------------------===//
+// Move Int Doubleword to Single Scalar
+//
+let isCodeGenOnly = 1 in {
+ def VMOVDI2SSrr : VS2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (bitconvert GR32:$src))],
+ IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
+
+ def VMOVDI2SSrm : VS2I<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))],
+ IIC_SSE_MOVDQ>,
+ VEX, Sched<[WriteLoad]>;
+ def MOVDI2SSrr : S2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (bitconvert GR32:$src))],
+ IIC_SSE_MOVDQ>, Sched<[WriteMove]>;
+
+ def MOVDI2SSrm : S2I<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))],
+ IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
+}
+
+//===---------------------------------------------------------------------===//
+// Move Packed Doubleword Int to Packed Double Int
+//
+def VMOVPDI2DIrr : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (extractelt (v4i32 VR128:$src),
+ (iPTR 0)))], IIC_SSE_MOVD_ToGP>, VEX,
+ Sched<[WriteMove]>;
+def VMOVPDI2DImr : VS2I<0x7E, MRMDestMem, (outs),
+ (ins i32mem:$dst, VR128:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(store (i32 (extractelt (v4i32 VR128:$src),
+ (iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>,
+ VEX, Sched<[WriteStore]>;
+def MOVPDI2DIrr : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (extractelt (v4i32 VR128:$src),
+ (iPTR 0)))], IIC_SSE_MOVD_ToGP>,
+ Sched<[WriteMove]>;
+def MOVPDI2DImr : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(store (i32 (extractelt (v4i32 VR128:$src),
+ (iPTR 0))), addr:$dst)],
+ IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
+
+def : Pat<(v8i32 (X86Vinsert (v8i32 immAllZerosV), GR32:$src2, (iPTR 0))),
+ (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src2), sub_xmm)>;
+
+def : Pat<(v4i64 (X86Vinsert (bc_v4i64 (v8i32 immAllZerosV)), GR64:$src2, (iPTR 0))),
+ (SUBREG_TO_REG (i32 0), (VMOV64toPQIrr GR64:$src2), sub_xmm)>;
+
+def : Pat<(v8i32 (X86Vinsert undef, GR32:$src2, (iPTR 0))),
+ (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src2), sub_xmm)>;
+
+def : Pat<(v4i64 (X86Vinsert undef, GR64:$src2, (iPTR 0))),
+ (SUBREG_TO_REG (i32 0), (VMOV64toPQIrr GR64:$src2), sub_xmm)>;
+
+//===---------------------------------------------------------------------===//
+// Move Packed Doubleword Int first element to Doubleword Int
+//
+let SchedRW = [WriteMove] in {
+def VMOVPQIto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (extractelt (v2i64 VR128:$src),
+ (iPTR 0)))],
+ IIC_SSE_MOVD_ToGP>,
+ VEX;
+
+def MOVPQIto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
+ "mov{d|q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (extractelt (v2i64 VR128:$src),
+ (iPTR 0)))],
+ IIC_SSE_MOVD_ToGP>;
+} // SchedRW
+
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
+def VMOVPQIto64rm : VRS2I<0x7E, MRMDestMem, (outs),
+ (ins i64mem:$dst, VR128:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [], IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>;
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
+def MOVPQIto64rm : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
+ "mov{d|q}\t{$src, $dst|$dst, $src}",
+ [], IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
+
+//===---------------------------------------------------------------------===//
+// Bitcast FR64 <-> GR64
+//
+let isCodeGenOnly = 1 in {
+ let Predicates = [UseAVX] in
+ def VMOV64toSDrm : VS2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>,
+ VEX, Sched<[WriteLoad]>;
+ def VMOVSDto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (bitconvert FR64:$src))],
+ IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
+ def VMOVSDto64mr : VRS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(store (i64 (bitconvert FR64:$src)), addr:$dst)],
+ IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>;
+
+ def MOV64toSDrm : S2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))],
+ IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
+ def MOVSDto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
+ "mov{d|q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (bitconvert FR64:$src))],
+ IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>;
+ def MOVSDto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(store (i64 (bitconvert FR64:$src)), addr:$dst)],
+ IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
+}
+
+//===---------------------------------------------------------------------===//
+// Move Scalar Single to Double Int
+//
+let isCodeGenOnly = 1 in {
+ def VMOVSS2DIrr : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (bitconvert FR32:$src))],
+ IIC_SSE_MOVD_ToGP>, VEX, Sched<[WriteMove]>;
+ def VMOVSS2DImr : VS2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(store (i32 (bitconvert FR32:$src)), addr:$dst)],
+ IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>;
+ def MOVSS2DIrr : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (bitconvert FR32:$src))],
+ IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>;
+ def MOVSS2DImr : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(store (i32 (bitconvert FR32:$src)), addr:$dst)],
+ IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
+}
+
+let Predicates = [UseAVX] in {
+ let AddedComplexity = 15 in {
+ def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
+ (VMOVDI2PDIrr GR32:$src)>;
+
+ def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
+ (VMOV64toPQIrr GR64:$src)>;
+
+ def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
+ (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
+ (SUBREG_TO_REG (i64 0), (VMOV64toPQIrr GR64:$src), sub_xmm)>;
+ }
+ // AVX 128-bit movd/movq instructions write zeros in the high 128-bit part.
+ // These instructions also write zeros in the high part of a 256-bit register.
+ let AddedComplexity = 20 in {
+ def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
+ (VMOVDI2PDIrm addr:$src)>;
+ def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
+ (VMOVDI2PDIrm addr:$src)>;
+ def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
+ (VMOVDI2PDIrm addr:$src)>;
+ def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
+ (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))),
+ (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrm addr:$src), sub_xmm)>;
+ }
+ // Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext.
+ def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
+ (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))),
+ (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src), sub_xmm)>;
+}
+
+let Predicates = [UseSSE2] in {
+ let AddedComplexity = 15 in {
+ def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
+ (MOVDI2PDIrr GR32:$src)>;
+
+ def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
+ (MOV64toPQIrr GR64:$src)>;
+ }
+ let AddedComplexity = 20 in {
+ def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
+ (MOVDI2PDIrm addr:$src)>;
+ def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
+ (MOVDI2PDIrm addr:$src)>;
+ def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
+ (MOVDI2PDIrm addr:$src)>;
+ }
+}
+
+// These are the correct encodings of the instructions so that we know how to
+// read correct assembly, even though we continue to emit the wrong ones for
+// compatibility with Darwin's buggy assembler.
+def : InstAlias<"movq\t{$src, $dst|$dst, $src}",
+ (MOV64toPQIrr VR128:$dst, GR64:$src), 0>;
+def : InstAlias<"movq\t{$src, $dst|$dst, $src}",
+ (MOVPQIto64rr GR64:$dst, VR128:$src), 0>;
+// Allow "vmovd" but print "vmovq" since we don't need compatibility for AVX.
+def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
+ (VMOV64toPQIrr VR128:$dst, GR64:$src), 0>;
+def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
+ (VMOVPQIto64rr GR64:$dst, VR128:$src), 0>;
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Move Quadword
+//===---------------------------------------------------------------------===//
+
+//===---------------------------------------------------------------------===//
+// Move Quadword Int to Packed Quadword Int
+//
+
+let ExeDomain = SSEPackedInt, SchedRW = [WriteLoad] in {
+def VMOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+ "vmovq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, XS,
+ VEX, Requires<[UseAVX]>;
+def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v2i64 (scalar_to_vector (loadi64 addr:$src))))],
+ IIC_SSE_MOVDQ>, XS,
+ Requires<[UseSSE2]>; // SSE2 instruction with XS Prefix
+} // ExeDomain, SchedRW
+
+//===---------------------------------------------------------------------===//
+// Move Packed Quadword Int to Quadword Int
+//
+let ExeDomain = SSEPackedInt, SchedRW = [WriteStore] in {
+def VMOVPQI2QImr : VS2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(store (i64 (extractelt (v2i64 VR128:$src),
+ (iPTR 0))), addr:$dst)],
+ IIC_SSE_MOVDQ>, VEX;
+def MOVPQI2QImr : S2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(store (i64 (extractelt (v2i64 VR128:$src),
+ (iPTR 0))), addr:$dst)],
+ IIC_SSE_MOVDQ>;
+} // ExeDomain, SchedRW
+
+// For disassembler only
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
+ SchedRW = [WriteVecLogic] in {
+def VMOVPQI2QIrr : VS2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+ "movq\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVQ_RR>, VEX;
+def MOVPQI2QIrr : S2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+ "movq\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVQ_RR>;
+}
+
+//===---------------------------------------------------------------------===//
+// Store / copy lower 64-bits of a XMM register.
+//
+let Predicates = [HasAVX] in
+def : Pat<(int_x86_sse2_storel_dq addr:$dst, VR128:$src),
+ (VMOVPQI2QImr addr:$dst, VR128:$src)>;
+let Predicates = [UseSSE2] in
+def : Pat<(int_x86_sse2_storel_dq addr:$dst, VR128:$src),
+ (MOVPQI2QImr addr:$dst, VR128:$src)>;
+
+let ExeDomain = SSEPackedInt, isCodeGenOnly = 1, AddedComplexity = 20 in {
+def VMOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+ "vmovq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v2i64 (X86vzmovl (v2i64 (scalar_to_vector
+ (loadi64 addr:$src))))))],
+ IIC_SSE_MOVDQ>,
+ XS, VEX, Requires<[UseAVX]>, Sched<[WriteLoad]>;
+
+def MOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v2i64 (X86vzmovl (v2i64 (scalar_to_vector
+ (loadi64 addr:$src))))))],
+ IIC_SSE_MOVDQ>,
+ XS, Requires<[UseSSE2]>, Sched<[WriteLoad]>;
+} // ExeDomain, isCodeGenOnly, AddedComplexity
+
+let Predicates = [UseAVX], AddedComplexity = 20 in {
+ def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))),
+ (VMOVZQI2PQIrm addr:$src)>;
+ def : Pat<(v2i64 (X86vzload addr:$src)),
+ (VMOVZQI2PQIrm addr:$src)>;
+ def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
+ (v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))),
+ (SUBREG_TO_REG (i64 0), (VMOVZQI2PQIrm addr:$src), sub_xmm)>;
+}
+
+let Predicates = [UseSSE2], AddedComplexity = 20 in {
+ def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))),
+ (MOVZQI2PQIrm addr:$src)>;
+ def : Pat<(v2i64 (X86vzload addr:$src)), (MOVZQI2PQIrm addr:$src)>;
+}
+
+let Predicates = [HasAVX] in {
+def : Pat<(v4i64 (alignedX86vzload addr:$src)),
+ (SUBREG_TO_REG (i32 0), (VMOVAPSrm addr:$src), sub_xmm)>;
+def : Pat<(v4i64 (X86vzload addr:$src)),
+ (SUBREG_TO_REG (i32 0), (VMOVUPSrm addr:$src), sub_xmm)>;
+}
+
+//===---------------------------------------------------------------------===//
+// Move from XMM to XMM and clear the upper 64 bits. Note: the IA-32
+// documentation is wrong here; movq xmm1, xmm2 does clear the high bits.
+//
+let ExeDomain = SSEPackedInt, SchedRW = [WriteVecLogic] in {
+let AddedComplexity = 15 in
+def VMOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "vmovq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))],
+ IIC_SSE_MOVQ_RR>,
+ XS, VEX, Requires<[UseAVX]>;
+let AddedComplexity = 15 in
+def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))],
+ IIC_SSE_MOVQ_RR>,
+ XS, Requires<[UseSSE2]>;
+} // ExeDomain, SchedRW
+
+let ExeDomain = SSEPackedInt, isCodeGenOnly = 1, SchedRW = [WriteVecLogicLd] in {
+let AddedComplexity = 20 in
+def VMOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+ "vmovq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (v2i64 (X86vzmovl
+ (loadv2i64 addr:$src))))],
+ IIC_SSE_MOVDQ>,
+ XS, VEX, Requires<[UseAVX]>;
+let AddedComplexity = 20 in {
+def MOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (v2i64 (X86vzmovl
+ (loadv2i64 addr:$src))))],
+ IIC_SSE_MOVDQ>,
+ XS, Requires<[UseSSE2]>;
+}
+} // ExeDomain, isCodeGenOnly, SchedRW
+
+let AddedComplexity = 20 in {
+ let Predicates = [UseAVX] in {
+ def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
+ (VMOVZPQILo2PQIrr VR128:$src)>;
+ }
+ let Predicates = [UseSSE2] in {
+ def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
+ (MOVZPQILo2PQIrr VR128:$src)>;
+ }
+}
+
+//===---------------------------------------------------------------------===//
+// SSE3 - Replicate Single FP - MOVSHDUP and MOVSLDUP
+//===---------------------------------------------------------------------===//
+multiclass sse3_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr,
+ ValueType vt, RegisterClass RC, PatFrag mem_frag,
+ X86MemOperand x86memop> {
+def rr : S3SI<op, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set RC:$dst, (vt (OpNode RC:$src)))],
+ IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>;
+def rm : S3SI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set RC:$dst, (OpNode (mem_frag addr:$src)))],
+ IIC_SSE_MOV_LH>, Sched<[WriteLoad]>;
+}
+
+let Predicates = [HasAVX, NoVLX] in {
+ defm VMOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
+ v4f32, VR128, loadv4f32, f128mem>, VEX;
+ defm VMOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
+ v4f32, VR128, loadv4f32, f128mem>, VEX;
+ defm VMOVSHDUPY : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
+ v8f32, VR256, loadv8f32, f256mem>, VEX, VEX_L;
+ defm VMOVSLDUPY : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
+ v8f32, VR256, loadv8f32, f256mem>, VEX, VEX_L;
+}
+defm MOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "movshdup", v4f32, VR128,
+ memopv4f32, f128mem>;
+defm MOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "movsldup", v4f32, VR128,
+ memopv4f32, f128mem>;
+
+let Predicates = [HasAVX, NoVLX] in {
+ def : Pat<(v4i32 (X86Movshdup VR128:$src)),
+ (VMOVSHDUPrr VR128:$src)>;
+ def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (loadv2i64 addr:$src)))),
+ (VMOVSHDUPrm addr:$src)>;
+ def : Pat<(v4i32 (X86Movsldup VR128:$src)),
+ (VMOVSLDUPrr VR128:$src)>;
+ def : Pat<(v4i32 (X86Movsldup (bc_v4i32 (loadv2i64 addr:$src)))),
+ (VMOVSLDUPrm addr:$src)>;
+ def : Pat<(v8i32 (X86Movshdup VR256:$src)),
+ (VMOVSHDUPYrr VR256:$src)>;
+ def : Pat<(v8i32 (X86Movshdup (bc_v8i32 (loadv4i64 addr:$src)))),
+ (VMOVSHDUPYrm addr:$src)>;
+ def : Pat<(v8i32 (X86Movsldup VR256:$src)),
+ (VMOVSLDUPYrr VR256:$src)>;
+ def : Pat<(v8i32 (X86Movsldup (bc_v8i32 (loadv4i64 addr:$src)))),
+ (VMOVSLDUPYrm addr:$src)>;
+}
+
+let Predicates = [UseSSE3] in {
+ def : Pat<(v4i32 (X86Movshdup VR128:$src)),
+ (MOVSHDUPrr VR128:$src)>;
+ def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (memopv2i64 addr:$src)))),
+ (MOVSHDUPrm addr:$src)>;
+ def : Pat<(v4i32 (X86Movsldup VR128:$src)),
+ (MOVSLDUPrr VR128:$src)>;
+ def : Pat<(v4i32 (X86Movsldup (bc_v4i32 (memopv2i64 addr:$src)))),
+ (MOVSLDUPrm addr:$src)>;
+}
+
+//===---------------------------------------------------------------------===//
+// SSE3 - Replicate Double FP - MOVDDUP
+//===---------------------------------------------------------------------===//
+
+multiclass sse3_replicate_dfp<string OpcodeStr> {
+def rr : S3DI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (v2f64 (X86Movddup VR128:$src)))],
+ IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>;
+def rm : S3DI<0x12, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst,
+ (v2f64 (X86Movddup
+ (scalar_to_vector (loadf64 addr:$src)))))],
+ IIC_SSE_MOV_LH>, Sched<[WriteLoad]>;
+}
+
+// FIXME: Merge with the class above once there are patterns for the ymm version.
+multiclass sse3_replicate_dfp_y<string OpcodeStr> {
+def rr : S3DI<0x12, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR256:$dst, (v4f64 (X86Movddup VR256:$src)))]>,
+ Sched<[WriteFShuffle]>;
+def rm : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR256:$dst,
+ (v4f64 (X86Movddup (loadv4f64 addr:$src))))]>,
+ Sched<[WriteLoad]>;
+}
+
+let Predicates = [HasAVX, NoVLX] in {
+ defm VMOVDDUP : sse3_replicate_dfp<"vmovddup">, VEX;
+ defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup">, VEX, VEX_L;
+}
+
+defm MOVDDUP : sse3_replicate_dfp<"movddup">;
+
+let Predicates = [HasAVX, NoVLX] in {
+ def : Pat<(X86Movddup (loadv2f64 addr:$src)),
+ (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
+
+ // 256-bit version
+ def : Pat<(X86Movddup (loadv4i64 addr:$src)),
+ (VMOVDDUPYrm addr:$src)>;
+ def : Pat<(X86Movddup (v4i64 VR256:$src)),
+ (VMOVDDUPYrr VR256:$src)>;
+}
+
+let Predicates = [HasAVX] in {
+ def : Pat<(X86Movddup (bc_v2f64 (loadv4f32 addr:$src))),
+ (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
+ def : Pat<(X86Movddup (bc_v2f64 (loadv2i64 addr:$src))),
+ (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
+ def : Pat<(X86Movddup (bc_v2f64
+ (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
+ (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
+}
+
+let Predicates = [UseAVX, OptForSize] in {
+ def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))),
+ (VMOVDDUPrm addr:$src)>;
+ def : Pat<(v2i64 (X86VBroadcast (loadi64 addr:$src))),
+ (VMOVDDUPrm addr:$src)>;
+}
+
+let Predicates = [UseSSE3] in {
+ def : Pat<(X86Movddup (memopv2f64 addr:$src)),
+ (MOVDDUPrm addr:$src)>;
+ def : Pat<(X86Movddup (bc_v2f64 (memopv4f32 addr:$src))),
+ (MOVDDUPrm addr:$src)>;
+ def : Pat<(X86Movddup (bc_v2f64 (memopv2i64 addr:$src))),
+ (MOVDDUPrm addr:$src)>;
+ def : Pat<(X86Movddup (bc_v2f64
+ (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
+ (MOVDDUPrm addr:$src)>;
+}
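+// Semantics sketch: movddup broadcasts the low double, e.g. the rm form loads
+// a single f64 and replicates it into both elements of the register, which is
+// why the UseAVX+OptForSize patterns above can use it for v2f64 X86VBroadcast.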
+
+//===---------------------------------------------------------------------===//
+// SSE3 - Move Unaligned Integer
+//===---------------------------------------------------------------------===//
+
+let SchedRW = [WriteLoad] in {
+let Predicates = [HasAVX] in {
+ def VLDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+ "vlddqu\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>, VEX;
+ def VLDDQUYrm : S3DI<0xF0, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
+ "vlddqu\t{$src, $dst|$dst, $src}",
+ [(set VR256:$dst, (int_x86_avx_ldu_dq_256 addr:$src))]>,
+ VEX, VEX_L;
+}
+def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+ "lddqu\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))],
+ IIC_SSE_LDDQU>;
+}
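+// Note: lddqu is an unaligned 128-bit load that hardware may implement as
+// wider aligned reads; it is exposed only through the ldu_dq intrinsics here,
+// so plain unaligned loads still select movdqu instead.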
+
+//===---------------------------------------------------------------------===//
+// SSE3 - Arithmetic
+//===---------------------------------------------------------------------===//
+
+multiclass sse3_addsub<Intrinsic Int, string OpcodeStr, RegisterClass RC,
+ X86MemOperand x86memop, OpndItins itins,
+ PatFrag ld_frag, bit Is2Addr = 1> {
+ def rr : I<0xD0, MRMSrcReg,
+ (outs RC:$dst), (ins RC:$src1, RC:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (Int RC:$src1, RC:$src2))], itins.rr>,
+ Sched<[itins.Sched]>;
+ def rm : I<0xD0, MRMSrcMem,
+ (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (Int RC:$src1, (ld_frag addr:$src2)))], itins.rr>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+
+let Predicates = [HasAVX] in {
+ let ExeDomain = SSEPackedSingle in {
+ defm VADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "vaddsubps", VR128,
+ f128mem, SSE_ALU_F32P, loadv4f32, 0>, XD, VEX_4V;
+ defm VADDSUBPSY : sse3_addsub<int_x86_avx_addsub_ps_256, "vaddsubps", VR256,
+ f256mem, SSE_ALU_F32P, loadv8f32, 0>, XD, VEX_4V, VEX_L;
+ }
+ let ExeDomain = SSEPackedDouble in {
+ defm VADDSUBPD : sse3_addsub<int_x86_sse3_addsub_pd, "vaddsubpd", VR128,
+ f128mem, SSE_ALU_F64P, loadv2f64, 0>, PD, VEX_4V;
+ defm VADDSUBPDY : sse3_addsub<int_x86_avx_addsub_pd_256, "vaddsubpd", VR256,
+ f256mem, SSE_ALU_F64P, loadv4f64, 0>, PD, VEX_4V, VEX_L;
+ }
+}
+let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in {
+ let ExeDomain = SSEPackedSingle in
+ defm ADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "addsubps", VR128,
+ f128mem, SSE_ALU_F32P, memopv4f32>, XD;
+ let ExeDomain = SSEPackedDouble in
+ defm ADDSUBPD : sse3_addsub<int_x86_sse3_addsub_pd, "addsubpd", VR128,
+ f128mem, SSE_ALU_F64P, memopv2f64>, PD;
+}
+
+// Patterns used to select 'addsub' instructions.
+let Predicates = [HasAVX] in {
+ def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (v4f32 VR128:$rhs))),
+ (VADDSUBPSrr VR128:$lhs, VR128:$rhs)>;
+ def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (loadv4f32 addr:$rhs))),
+ (VADDSUBPSrm VR128:$lhs, f128mem:$rhs)>;
+ def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (v2f64 VR128:$rhs))),
+ (VADDSUBPDrr VR128:$lhs, VR128:$rhs)>;
+ def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (loadv2f64 addr:$rhs))),
+ (VADDSUBPDrm VR128:$lhs, f128mem:$rhs)>;
+
+ def : Pat<(v8f32 (X86Addsub (v8f32 VR256:$lhs), (v8f32 VR256:$rhs))),
+ (VADDSUBPSYrr VR256:$lhs, VR256:$rhs)>;
+ def : Pat<(v8f32 (X86Addsub (v8f32 VR256:$lhs), (loadv8f32 addr:$rhs))),
+ (VADDSUBPSYrm VR256:$lhs, f256mem:$rhs)>;
+ def : Pat<(v4f64 (X86Addsub (v4f64 VR256:$lhs), (v4f64 VR256:$rhs))),
+ (VADDSUBPDYrr VR256:$lhs, VR256:$rhs)>;
+ def : Pat<(v4f64 (X86Addsub (v4f64 VR256:$lhs), (loadv4f64 addr:$rhs))),
+ (VADDSUBPDYrm VR256:$lhs, f256mem:$rhs)>;
+}
+
+let Predicates = [UseSSE3] in {
+ def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (v4f32 VR128:$rhs))),
+ (ADDSUBPSrr VR128:$lhs, VR128:$rhs)>;
+ def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (memopv4f32 addr:$rhs))),
+ (ADDSUBPSrm VR128:$lhs, f128mem:$rhs)>;
+ def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (v2f64 VR128:$rhs))),
+ (ADDSUBPDrr VR128:$lhs, VR128:$rhs)>;
+ def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (memopv2f64 addr:$rhs))),
+ (ADDSUBPDrm VR128:$lhs, f128mem:$rhs)>;
+}
+
+//===---------------------------------------------------------------------===//
+// SSE3 Instructions
+//===---------------------------------------------------------------------===//
+
+// Horizontal ops
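+// haddps computes pairwise sums of adjacent elements, low half from $src1
+// and high half from $src2: dst = [a0+a1, a2+a3, b0+b1, b2+b3]; hsub is the
+// same with subtraction.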
+multiclass S3D_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
+ X86MemOperand x86memop, SDNode OpNode, PatFrag ld_frag,
+ bit Is2Addr = 1> {
+ def rr : S3DI<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], IIC_SSE_HADDSUB_RR>,
+ Sched<[WriteFAdd]>;
+
+ def rm : S3DI<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))],
+ IIC_SSE_HADDSUB_RM>, Sched<[WriteFAddLd, ReadAfterLd]>;
+}
+multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
+ X86MemOperand x86memop, SDNode OpNode, PatFrag ld_frag,
+ bit Is2Addr = 1> {
+ def rr : S3I<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], IIC_SSE_HADDSUB_RR>,
+ Sched<[WriteFAdd]>;
+
+ def rm : S3I<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))],
+ IIC_SSE_HADDSUB_RM>, Sched<[WriteFAddLd, ReadAfterLd]>;
+}
+
+let Predicates = [HasAVX] in {
+ let ExeDomain = SSEPackedSingle in {
+ defm VHADDPS : S3D_Int<0x7C, "vhaddps", v4f32, VR128, f128mem,
+ X86fhadd, loadv4f32, 0>, VEX_4V;
+ defm VHSUBPS : S3D_Int<0x7D, "vhsubps", v4f32, VR128, f128mem,
+ X86fhsub, loadv4f32, 0>, VEX_4V;
+ defm VHADDPSY : S3D_Int<0x7C, "vhaddps", v8f32, VR256, f256mem,
+ X86fhadd, loadv8f32, 0>, VEX_4V, VEX_L;
+ defm VHSUBPSY : S3D_Int<0x7D, "vhsubps", v8f32, VR256, f256mem,
+ X86fhsub, loadv8f32, 0>, VEX_4V, VEX_L;
+ }
+ let ExeDomain = SSEPackedDouble in {
+ defm VHADDPD : S3_Int <0x7C, "vhaddpd", v2f64, VR128, f128mem,
+ X86fhadd, loadv2f64, 0>, VEX_4V;
+ defm VHSUBPD : S3_Int <0x7D, "vhsubpd", v2f64, VR128, f128mem,
+ X86fhsub, loadv2f64, 0>, VEX_4V;
+ defm VHADDPDY : S3_Int <0x7C, "vhaddpd", v4f64, VR256, f256mem,
+ X86fhadd, loadv4f64, 0>, VEX_4V, VEX_L;
+ defm VHSUBPDY : S3_Int <0x7D, "vhsubpd", v4f64, VR256, f256mem,
+ X86fhsub, loadv4f64, 0>, VEX_4V, VEX_L;
+ }
+}
+
+let Constraints = "$src1 = $dst" in {
+ let ExeDomain = SSEPackedSingle in {
+ defm HADDPS : S3D_Int<0x7C, "haddps", v4f32, VR128, f128mem, X86fhadd,
+ memopv4f32>;
+ defm HSUBPS : S3D_Int<0x7D, "hsubps", v4f32, VR128, f128mem, X86fhsub,
+ memopv4f32>;
+ }
+ let ExeDomain = SSEPackedDouble in {
+ defm HADDPD : S3_Int<0x7C, "haddpd", v2f64, VR128, f128mem, X86fhadd,
+ memopv2f64>;
+ defm HSUBPD : S3_Int<0x7D, "hsubpd", v2f64, VR128, f128mem, X86fhsub,
+ memopv2f64>;
+ }
+}
+
+//===---------------------------------------------------------------------===//
+// SSSE3 - Packed Absolute Instructions
+//===---------------------------------------------------------------------===//
+
+
+/// SS3I_unop_rm_int - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
+multiclass SS3I_unop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId128,
+ PatFrag ld_frag> {
+ def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (IntId128 VR128:$src))], IIC_SSE_PABS_RR>,
+ Sched<[WriteVecALU]>;
+
+ def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins i128mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst,
+ (IntId128
+ (bitconvert (ld_frag addr:$src))))], IIC_SSE_PABS_RM>,
+ Sched<[WriteVecALULd]>;
+}
+
+/// SS3I_unop_rm_int_y - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
+multiclass SS3I_unop_rm_int_y<bits<8> opc, string OpcodeStr,
+ Intrinsic IntId256> {
+ def rr256 : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
+ (ins VR256:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR256:$dst, (IntId256 VR256:$src))]>,
+ Sched<[WriteVecALU]>;
+
+ def rm256 : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
+ (ins i256mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR256:$dst,
+ (IntId256
+ (bitconvert (loadv4i64 addr:$src))))]>,
+ Sched<[WriteVecALULd]>;
+}
+
+// Helper fragments to match sext vXi1 to vXiY.
+def v16i1sextv16i8 : PatLeaf<(v16i8 (X86pcmpgt (bc_v16i8 (v4i32 immAllZerosV)),
+ VR128:$src))>;
+def v8i1sextv8i16 : PatLeaf<(v8i16 (X86vsrai VR128:$src, (i8 15)))>;
+def v4i1sextv4i32 : PatLeaf<(v4i32 (X86vsrai VR128:$src, (i8 31)))>;
+def v32i1sextv32i8 : PatLeaf<(v32i8 (X86pcmpgt (bc_v32i8 (v8i32 immAllZerosV)),
+ VR256:$src))>;
+def v16i1sextv16i16: PatLeaf<(v16i16 (X86vsrai VR256:$src, (i8 15)))>;
+def v8i1sextv8i32 : PatLeaf<(v8i32 (X86vsrai VR256:$src, (i8 31)))>;
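+
+// The xor/add patterns below implement the usual branchless abs identity:
+// with s = x >>a (bits-1) (all-ones when x is negative, zero otherwise),
+// |x| = (x + s) ^ s, so (add x, s) ^ s is matched to a single pabs*.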
+
+let Predicates = [HasAVX] in {
+ defm VPABSB : SS3I_unop_rm_int<0x1C, "vpabsb", int_x86_ssse3_pabs_b_128,
+ loadv2i64>, VEX;
+ defm VPABSW : SS3I_unop_rm_int<0x1D, "vpabsw", int_x86_ssse3_pabs_w_128,
+ loadv2i64>, VEX;
+ defm VPABSD : SS3I_unop_rm_int<0x1E, "vpabsd", int_x86_ssse3_pabs_d_128,
+ loadv2i64>, VEX;
+
+ def : Pat<(xor
+ (bc_v2i64 (v16i1sextv16i8)),
+ (bc_v2i64 (add (v16i8 VR128:$src), (v16i1sextv16i8)))),
+ (VPABSBrr128 VR128:$src)>;
+ def : Pat<(xor
+ (bc_v2i64 (v8i1sextv8i16)),
+ (bc_v2i64 (add (v8i16 VR128:$src), (v8i1sextv8i16)))),
+ (VPABSWrr128 VR128:$src)>;
+ def : Pat<(xor
+ (bc_v2i64 (v4i1sextv4i32)),
+ (bc_v2i64 (add (v4i32 VR128:$src), (v4i1sextv4i32)))),
+ (VPABSDrr128 VR128:$src)>;
+}
+
+let Predicates = [HasAVX2] in {
+ defm VPABSB : SS3I_unop_rm_int_y<0x1C, "vpabsb",
+ int_x86_avx2_pabs_b>, VEX, VEX_L;
+ defm VPABSW : SS3I_unop_rm_int_y<0x1D, "vpabsw",
+ int_x86_avx2_pabs_w>, VEX, VEX_L;
+ defm VPABSD : SS3I_unop_rm_int_y<0x1E, "vpabsd",
+ int_x86_avx2_pabs_d>, VEX, VEX_L;
+
+ def : Pat<(xor
+ (bc_v4i64 (v32i1sextv32i8)),
+ (bc_v4i64 (add (v32i8 VR256:$src), (v32i1sextv32i8)))),
+ (VPABSBrr256 VR256:$src)>;
+ def : Pat<(xor
+ (bc_v4i64 (v16i1sextv16i16)),
+ (bc_v4i64 (add (v16i16 VR256:$src), (v16i1sextv16i16)))),
+ (VPABSWrr256 VR256:$src)>;
+ def : Pat<(xor
+ (bc_v4i64 (v8i1sextv8i32)),
+ (bc_v4i64 (add (v8i32 VR256:$src), (v8i1sextv8i32)))),
+ (VPABSDrr256 VR256:$src)>;
+}
+
+defm PABSB : SS3I_unop_rm_int<0x1C, "pabsb", int_x86_ssse3_pabs_b_128,
+ memopv2i64>;
+defm PABSW : SS3I_unop_rm_int<0x1D, "pabsw", int_x86_ssse3_pabs_w_128,
+ memopv2i64>;
+defm PABSD : SS3I_unop_rm_int<0x1E, "pabsd", int_x86_ssse3_pabs_d_128,
+ memopv2i64>;
+
+let Predicates = [HasSSSE3] in {
+ def : Pat<(xor
+ (bc_v2i64 (v16i1sextv16i8)),
+ (bc_v2i64 (add (v16i8 VR128:$src), (v16i1sextv16i8)))),
+ (PABSBrr128 VR128:$src)>;
+ def : Pat<(xor
+ (bc_v2i64 (v8i1sextv8i16)),
+ (bc_v2i64 (add (v8i16 VR128:$src), (v8i1sextv8i16)))),
+ (PABSWrr128 VR128:$src)>;
+ def : Pat<(xor
+ (bc_v2i64 (v4i1sextv4i32)),
+ (bc_v2i64 (add (v4i32 VR128:$src), (v4i1sextv4i32)))),
+ (PABSDrr128 VR128:$src)>;
+}
+
+//===---------------------------------------------------------------------===//
+// SSSE3 - Packed Binary Operator Instructions
+//===---------------------------------------------------------------------===//
+
+let Sched = WriteVecALU in {
+def SSE_PHADDSUBD : OpndItins<
+ IIC_SSE_PHADDSUBD_RR, IIC_SSE_PHADDSUBD_RM
+>;
+def SSE_PHADDSUBSW : OpndItins<
+ IIC_SSE_PHADDSUBSW_RR, IIC_SSE_PHADDSUBSW_RM
+>;
+def SSE_PHADDSUBW : OpndItins<
+ IIC_SSE_PHADDSUBW_RR, IIC_SSE_PHADDSUBW_RM
+>;
+}
+let Sched = WriteShuffle in
+def SSE_PSHUFB : OpndItins<
+ IIC_SSE_PSHUFB_RR, IIC_SSE_PSHUFB_RM
+>;
+let Sched = WriteVecALU in
+def SSE_PSIGN : OpndItins<
+ IIC_SSE_PSIGN_RR, IIC_SSE_PSIGN_RM
+>;
+let Sched = WriteVecIMul in
+def SSE_PMULHRSW : OpndItins<
+ IIC_SSE_PMULHRSW, IIC_SSE_PMULHRSW
+>;
+
+/// SS3I_binop_rm - Simple SSSE3 bin op
+multiclass SS3I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
+ X86MemOperand x86memop, OpndItins itins,
+ bit Is2Addr = 1> {
+ let isCommutable = 1 in
+ def rr : SS38I<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))], itins.rr>,
+ Sched<[itins.Sched]>;
+ def rm : SS38I<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst,
+ (OpVT (OpNode RC:$src1,
+ (bitconvert (memop_frag addr:$src2)))))], itins.rm>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+
+/// SS3I_binop_rm_int - Simple SSSE3 bin op whose type can be v*{i8,i16,i32}.
+multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr,
+ Intrinsic IntId128, OpndItins itins,
+ PatFrag ld_frag, bit Is2Addr = 1> {
+ let isCommutable = 1 in
+ def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
+ Sched<[itins.Sched]>;
+ def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set VR128:$dst,
+ (IntId128 VR128:$src1,
+ (bitconvert (ld_frag addr:$src2))))]>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+
+multiclass SS3I_binop_rm_int_y<bits<8> opc, string OpcodeStr,
+ Intrinsic IntId256,
+ X86FoldableSchedWrite Sched> {
+ let isCommutable = 1 in
+ def rr256 : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst, (IntId256 VR256:$src1, VR256:$src2))]>,
+ Sched<[Sched]>;
+ def rm256 : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
+ (ins VR256:$src1, i256mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst,
+ (IntId256 VR256:$src1, (bitconvert (loadv4i64 addr:$src2))))]>,
+ Sched<[Sched.Folded, ReadAfterLd]>;
+}
+
+let ImmT = NoImm, Predicates = [HasAVX] in {
+let isCommutable = 0 in {
+ defm VPHADDW : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v8i16, VR128,
+ loadv2i64, i128mem,
+ SSE_PHADDSUBW, 0>, VEX_4V;
+ defm VPHADDD : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v4i32, VR128,
+ loadv2i64, i128mem,
+ SSE_PHADDSUBD, 0>, VEX_4V;
+ defm VPHSUBW : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v8i16, VR128,
+ loadv2i64, i128mem,
+ SSE_PHADDSUBW, 0>, VEX_4V;
+ defm VPHSUBD : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v4i32, VR128,
+ loadv2i64, i128mem,
+ SSE_PHADDSUBD, 0>, VEX_4V;
+ defm VPSIGNB : SS3I_binop_rm<0x08, "vpsignb", X86psign, v16i8, VR128,
+ loadv2i64, i128mem,
+ SSE_PSIGN, 0>, VEX_4V;
+ defm VPSIGNW : SS3I_binop_rm<0x09, "vpsignw", X86psign, v8i16, VR128,
+ loadv2i64, i128mem,
+ SSE_PSIGN, 0>, VEX_4V;
+ defm VPSIGND : SS3I_binop_rm<0x0A, "vpsignd", X86psign, v4i32, VR128,
+ loadv2i64, i128mem,
+ SSE_PSIGN, 0>, VEX_4V;
+ defm VPSHUFB : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v16i8, VR128,
+ loadv2i64, i128mem,
+ SSE_PSHUFB, 0>, VEX_4V;
+ defm VPHADDSW : SS3I_binop_rm_int<0x03, "vphaddsw",
+ int_x86_ssse3_phadd_sw_128,
+ SSE_PHADDSUBSW, loadv2i64, 0>, VEX_4V;
+ defm VPHSUBSW : SS3I_binop_rm_int<0x07, "vphsubsw",
+ int_x86_ssse3_phsub_sw_128,
+ SSE_PHADDSUBSW, loadv2i64, 0>, VEX_4V;
+ defm VPMADDUBSW : SS3I_binop_rm_int<0x04, "vpmaddubsw",
+ int_x86_ssse3_pmadd_ub_sw_128,
+ SSE_PMADD, loadv2i64, 0>, VEX_4V;
+}
+defm VPMULHRSW : SS3I_binop_rm_int<0x0B, "vpmulhrsw",
+ int_x86_ssse3_pmul_hr_sw_128,
+ SSE_PMULHRSW, loadv2i64, 0>, VEX_4V;
+}
+
+let ImmT = NoImm, Predicates = [HasAVX2] in {
+let isCommutable = 0 in {
+ defm VPHADDWY : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v16i16, VR256,
+ loadv4i64, i256mem,
+ SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
+ defm VPHADDDY : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v8i32, VR256,
+ loadv4i64, i256mem,
+                                  SSE_PHADDSUBD, 0>, VEX_4V, VEX_L;
+ defm VPHSUBWY : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v16i16, VR256,
+ loadv4i64, i256mem,
+ SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
+ defm VPHSUBDY : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, VR256,
+ loadv4i64, i256mem,
+                                  SSE_PHADDSUBD, 0>, VEX_4V, VEX_L;
+ defm VPSIGNBY : SS3I_binop_rm<0x08, "vpsignb", X86psign, v32i8, VR256,
+ loadv4i64, i256mem,
+                                  SSE_PSIGN, 0>, VEX_4V, VEX_L;
+ defm VPSIGNWY : SS3I_binop_rm<0x09, "vpsignw", X86psign, v16i16, VR256,
+ loadv4i64, i256mem,
+                                  SSE_PSIGN, 0>, VEX_4V, VEX_L;
+ defm VPSIGNDY : SS3I_binop_rm<0x0A, "vpsignd", X86psign, v8i32, VR256,
+ loadv4i64, i256mem,
+                                  SSE_PSIGN, 0>, VEX_4V, VEX_L;
+ defm VPSHUFBY : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v32i8, VR256,
+ loadv4i64, i256mem,
+ SSE_PSHUFB, 0>, VEX_4V, VEX_L;
+ defm VPHADDSW : SS3I_binop_rm_int_y<0x03, "vphaddsw",
+ int_x86_avx2_phadd_sw,
+ WriteVecALU>, VEX_4V, VEX_L;
+ defm VPHSUBSW : SS3I_binop_rm_int_y<0x07, "vphsubsw",
+ int_x86_avx2_phsub_sw,
+ WriteVecALU>, VEX_4V, VEX_L;
+ defm VPMADDUBSW : SS3I_binop_rm_int_y<0x04, "vpmaddubsw",
+ int_x86_avx2_pmadd_ub_sw,
+ WriteVecIMul>, VEX_4V, VEX_L;
+}
+defm VPMULHRSW : SS3I_binop_rm_int_y<0x0B, "vpmulhrsw",
+ int_x86_avx2_pmul_hr_sw,
+ WriteVecIMul>, VEX_4V, VEX_L;
+}
+
+// None of these have i8 immediate fields.
+let ImmT = NoImm, Constraints = "$src1 = $dst" in {
+let isCommutable = 0 in {
+ defm PHADDW : SS3I_binop_rm<0x01, "phaddw", X86hadd, v8i16, VR128,
+ memopv2i64, i128mem, SSE_PHADDSUBW>;
+ defm PHADDD : SS3I_binop_rm<0x02, "phaddd", X86hadd, v4i32, VR128,
+ memopv2i64, i128mem, SSE_PHADDSUBD>;
+ defm PHSUBW : SS3I_binop_rm<0x05, "phsubw", X86hsub, v8i16, VR128,
+ memopv2i64, i128mem, SSE_PHADDSUBW>;
+ defm PHSUBD : SS3I_binop_rm<0x06, "phsubd", X86hsub, v4i32, VR128,
+ memopv2i64, i128mem, SSE_PHADDSUBD>;
+ defm PSIGNB : SS3I_binop_rm<0x08, "psignb", X86psign, v16i8, VR128,
+ memopv2i64, i128mem, SSE_PSIGN>;
+ defm PSIGNW : SS3I_binop_rm<0x09, "psignw", X86psign, v8i16, VR128,
+ memopv2i64, i128mem, SSE_PSIGN>;
+ defm PSIGND : SS3I_binop_rm<0x0A, "psignd", X86psign, v4i32, VR128,
+ memopv2i64, i128mem, SSE_PSIGN>;
+ defm PSHUFB : SS3I_binop_rm<0x00, "pshufb", X86pshufb, v16i8, VR128,
+ memopv2i64, i128mem, SSE_PSHUFB>;
+ defm PHADDSW : SS3I_binop_rm_int<0x03, "phaddsw",
+ int_x86_ssse3_phadd_sw_128,
+ SSE_PHADDSUBSW, memopv2i64>;
+ defm PHSUBSW : SS3I_binop_rm_int<0x07, "phsubsw",
+ int_x86_ssse3_phsub_sw_128,
+ SSE_PHADDSUBSW, memopv2i64>;
+ defm PMADDUBSW : SS3I_binop_rm_int<0x04, "pmaddubsw",
+ int_x86_ssse3_pmadd_ub_sw_128,
+ SSE_PMADD, memopv2i64>;
+}
+defm PMULHRSW : SS3I_binop_rm_int<0x0B, "pmulhrsw",
+ int_x86_ssse3_pmul_hr_sw_128,
+ SSE_PMULHRSW, memopv2i64>;
+}
+
+//===---------------------------------------------------------------------===//
+// SSSE3 - Packed Align Instruction Patterns
+//===---------------------------------------------------------------------===//
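+//
+// palignr concatenates $src1 (high) with $src2 (low) and extracts a 128-bit
+// window starting $src3 bytes up from the least-significant byte; the AVX2
+// 256-bit form does this independently within each 128-bit lane.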
+
+multiclass ssse3_palignr<string asm, bit Is2Addr = 1> {
+ let hasSideEffects = 0 in {
+ def R128rr : SS3AI<0x0F, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, u8imm:$src3),
+ !if(Is2Addr,
+ !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !strconcat(asm,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ [], IIC_SSE_PALIGNRR>, Sched<[WriteShuffle]>;
+ let mayLoad = 1 in
+ def R128rm : SS3AI<0x0F, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
+ !if(Is2Addr,
+ !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !strconcat(asm,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ [], IIC_SSE_PALIGNRM>, Sched<[WriteShuffleLd, ReadAfterLd]>;
+ }
+}
+
+multiclass ssse3_palignr_y<string asm, bit Is2Addr = 1> {
+ let hasSideEffects = 0 in {
+ def R256rr : SS3AI<0x0F, MRMSrcReg, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2, u8imm:$src3),
+ !strconcat(asm,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ []>, Sched<[WriteShuffle]>;
+ let mayLoad = 1 in
+ def R256rm : SS3AI<0x0F, MRMSrcMem, (outs VR256:$dst),
+ (ins VR256:$src1, i256mem:$src2, u8imm:$src3),
+ !strconcat(asm,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ []>, Sched<[WriteShuffleLd, ReadAfterLd]>;
+ }
+}
+
+let Predicates = [HasAVX] in
+ defm VPALIGN : ssse3_palignr<"vpalignr", 0>, VEX_4V;
+let Predicates = [HasAVX2] in
+ defm VPALIGN : ssse3_palignr_y<"vpalignr", 0>, VEX_4V, VEX_L;
+let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in
+ defm PALIGN : ssse3_palignr<"palignr">;
+
+let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
+def : Pat<(v8i32 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+ (VPALIGNR256rr VR256:$src1, VR256:$src2, imm:$imm)>;
+def : Pat<(v8f32 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+ (VPALIGNR256rr VR256:$src1, VR256:$src2, imm:$imm)>;
+def : Pat<(v16i16 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+ (VPALIGNR256rr VR256:$src1, VR256:$src2, imm:$imm)>;
+def : Pat<(v32i8 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+ (VPALIGNR256rr VR256:$src1, VR256:$src2, imm:$imm)>;
+}
+
+let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
+def : Pat<(v4i32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+ (VPALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>;
+def : Pat<(v4f32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+ (VPALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>;
+def : Pat<(v8i16 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+ (VPALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>;
+def : Pat<(v16i8 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+ (VPALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>;
+}
+
+let Predicates = [UseSSSE3] in {
+def : Pat<(v4i32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+ (PALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>;
+def : Pat<(v4f32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+ (PALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>;
+def : Pat<(v8i16 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+ (PALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>;
+def : Pat<(v16i8 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+ (PALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>;
+}
+
+//===---------------------------------------------------------------------===//
+// SSE3 - Thread synchronization
+//===---------------------------------------------------------------------===//
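+//
+// monitor arms address-range monitoring for the address in EAX/RAX, with
+// extensions in ECX and hints in EDX; mwait then waits until a store hits
+// the monitored range, taking extensions in ECX and hints in EAX.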
+
+let SchedRW = [WriteSystem] in {
+let usesCustomInserter = 1 in {
+def MONITOR : PseudoI<(outs), (ins i32mem:$src1, GR32:$src2, GR32:$src3),
+ [(int_x86_sse3_monitor addr:$src1, GR32:$src2, GR32:$src3)]>,
+ Requires<[HasSSE3]>;
+}
+
+let Uses = [EAX, ECX, EDX] in
+def MONITORrrr : I<0x01, MRM_C8, (outs), (ins), "monitor", [], IIC_SSE_MONITOR>,
+ TB, Requires<[HasSSE3]>;
+let Uses = [ECX, EAX] in
+def MWAITrr : I<0x01, MRM_C9, (outs), (ins), "mwait",
+ [(int_x86_sse3_mwait ECX, EAX)], IIC_SSE_MWAIT>,
+ TB, Requires<[HasSSE3]>;
+} // SchedRW
+
+def : InstAlias<"mwait\t{%eax, %ecx|ecx, eax}", (MWAITrr)>, Requires<[Not64BitMode]>;
+def : InstAlias<"mwait\t{%rax, %rcx|rcx, rax}", (MWAITrr)>, Requires<[In64BitMode]>;
+
+def : InstAlias<"monitor\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITORrrr)>,
+ Requires<[Not64BitMode]>;
+def : InstAlias<"monitor\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITORrrr)>,
+ Requires<[In64BitMode]>;
+
+//===----------------------------------------------------------------------===//
+// SSE4.1 - Packed Move with Sign/Zero Extend
+//===----------------------------------------------------------------------===//
+
+multiclass SS41I_pmovx_rrrm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp,
+ RegisterClass OutRC, RegisterClass InRC,
+ OpndItins itins> {
+ def rr : SS48I<opc, MRMSrcReg, (outs OutRC:$dst), (ins InRC:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [], itins.rr>,
+ Sched<[itins.Sched]>;
+
+ def rm : SS48I<opc, MRMSrcMem, (outs OutRC:$dst), (ins MemOp:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [],
+ itins.rm>, Sched<[itins.Sched.Folded]>;
+}
+
+multiclass SS41I_pmovx_rm_all<bits<8> opc, string OpcodeStr,
+ X86MemOperand MemOp, X86MemOperand MemYOp,
+ OpndItins SSEItins, OpndItins AVXItins,
+ OpndItins AVX2Itins> {
+ defm NAME : SS41I_pmovx_rrrm<opc, OpcodeStr, MemOp, VR128, VR128, SSEItins>;
+ let Predicates = [HasAVX, NoVLX] in
+ defm V#NAME : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemOp,
+ VR128, VR128, AVXItins>, VEX;
+ let Predicates = [HasAVX2, NoVLX] in
+ defm V#NAME#Y : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemYOp,
+ VR256, VR128, AVX2Itins>, VEX, VEX_L;
+}
+
+multiclass SS41I_pmovx_rm<bits<8> opc, string OpcodeStr,
+ X86MemOperand MemOp, X86MemOperand MemYOp> {
+ defm PMOVSX#NAME : SS41I_pmovx_rm_all<opc, !strconcat("pmovsx", OpcodeStr),
+ MemOp, MemYOp,
+ SSE_INTALU_ITINS_SHUFF_P,
+ DEFAULT_ITINS_SHUFFLESCHED,
+ DEFAULT_ITINS_SHUFFLESCHED>;
+ defm PMOVZX#NAME : SS41I_pmovx_rm_all<!add(opc, 0x10),
+ !strconcat("pmovzx", OpcodeStr),
+ MemOp, MemYOp,
+ SSE_INTALU_ITINS_SHUFF_P,
+ DEFAULT_ITINS_SHUFFLESCHED,
+ DEFAULT_ITINS_SHUFFLESCHED>;
+}
+
+defm BW : SS41I_pmovx_rm<0x20, "bw", i64mem, i128mem>;
+defm WD : SS41I_pmovx_rm<0x23, "wd", i64mem, i128mem>;
+defm DQ : SS41I_pmovx_rm<0x25, "dq", i64mem, i128mem>;
+
+defm BD : SS41I_pmovx_rm<0x21, "bd", i32mem, i64mem>;
+defm WQ : SS41I_pmovx_rm<0x24, "wq", i32mem, i64mem>;
+
+defm BQ : SS41I_pmovx_rm<0x22, "bq", i16mem, i32mem>;
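+
+// The memory operand covers only the bits actually widened: pmovsxbw turns
+// 8 x i8 into 8 x i16 and so reads 64 bits (i64mem), while pmovsxbq turns
+// 2 x i8 into 2 x i64 and reads just 16 bits (i16mem); the Y variants read
+// twice as much for their 256-bit results.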
+
+// AVX2 Patterns
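+// (Instruction names here are built by string concatenation: with
+// OpcPrefix = "VPMOVSX", !cast<I>(OpcPrefix#BWYrr) resolves to the
+// VPMOVSXBWYrr record created by the SS41I_pmovx_rm defms above.)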
+multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy, SDNode ExtOp> {
+ // Register-Register patterns
+ def : Pat<(v16i16 (ExtOp (v16i8 VR128:$src))),
+ (!cast<I>(OpcPrefix#BWYrr) VR128:$src)>;
+ def : Pat<(v8i32 (ExtOp (v16i8 VR128:$src))),
+ (!cast<I>(OpcPrefix#BDYrr) VR128:$src)>;
+ def : Pat<(v4i64 (ExtOp (v16i8 VR128:$src))),
+ (!cast<I>(OpcPrefix#BQYrr) VR128:$src)>;
+
+ def : Pat<(v8i32 (ExtOp (v8i16 VR128:$src))),
+ (!cast<I>(OpcPrefix#WDYrr) VR128:$src)>;
+ def : Pat<(v4i64 (ExtOp (v8i16 VR128:$src))),
+ (!cast<I>(OpcPrefix#WQYrr) VR128:$src)>;
+
+ def : Pat<(v4i64 (ExtOp (v4i32 VR128:$src))),
+ (!cast<I>(OpcPrefix#DQYrr) VR128:$src)>;
+
+  // On AVX2, we also support 256-bit inputs.
+ def : Pat<(v16i16 (ExtOp (v32i8 VR256:$src))),
+ (!cast<I>(OpcPrefix#BWYrr) (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
+ def : Pat<(v8i32 (ExtOp (v32i8 VR256:$src))),
+ (!cast<I>(OpcPrefix#BDYrr) (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
+ def : Pat<(v4i64 (ExtOp (v32i8 VR256:$src))),
+ (!cast<I>(OpcPrefix#BQYrr) (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
+
+ def : Pat<(v8i32 (ExtOp (v16i16 VR256:$src))),
+ (!cast<I>(OpcPrefix#WDYrr) (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
+ def : Pat<(v4i64 (ExtOp (v16i16 VR256:$src))),
+ (!cast<I>(OpcPrefix#WQYrr) (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
+
+ def : Pat<(v4i64 (ExtOp (v8i32 VR256:$src))),
+ (!cast<I>(OpcPrefix#DQYrr) (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
+
+ // Simple Register-Memory patterns
+ def : Pat<(v16i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
+ (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
+ def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
+ (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
+ def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
+ (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
+
+ def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
+ (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
+ def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
+ (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
+
+ def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)),
+ (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
+
+ // AVX2 Register-Memory patterns
+ def : Pat<(v16i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
+ def : Pat<(v16i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
+ def : Pat<(v16i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
+
+ def : Pat<(v8i32 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
+ (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
+ def : Pat<(v8i32 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
+ def : Pat<(v8i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
+ def : Pat<(v8i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
+
+ def : Pat<(v4i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
+ (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
+ def : Pat<(v4i64 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))),
+ (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
+ def : Pat<(v4i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
+ def : Pat<(v4i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
+
+ def : Pat<(v8i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
+ def : Pat<(v8i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
+ def : Pat<(v8i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
+ def : Pat<(v8i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
+
+ def : Pat<(v4i64 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
+ (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
+ def : Pat<(v4i64 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
+ def : Pat<(v4i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
+ def : Pat<(v4i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
+
+ def : Pat<(v4i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
+ def : Pat<(v4i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
+ def : Pat<(v4i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
+ def : Pat<(v4i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
+}
+
+let Predicates = [HasAVX2, NoVLX] in {
+ defm : SS41I_pmovx_avx2_patterns<"VPMOVSX", "s", X86vsext>;
+ defm : SS41I_pmovx_avx2_patterns<"VPMOVZX", "z", X86vzext>;
+}
+
+// SSE4.1/AVX patterns.
+multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
+ SDNode ExtOp, PatFrag ExtLoad16> {
+ def : Pat<(v8i16 (ExtOp (v16i8 VR128:$src))),
+ (!cast<I>(OpcPrefix#BWrr) VR128:$src)>;
+ def : Pat<(v4i32 (ExtOp (v16i8 VR128:$src))),
+ (!cast<I>(OpcPrefix#BDrr) VR128:$src)>;
+ def : Pat<(v2i64 (ExtOp (v16i8 VR128:$src))),
+ (!cast<I>(OpcPrefix#BQrr) VR128:$src)>;
+
+ def : Pat<(v4i32 (ExtOp (v8i16 VR128:$src))),
+ (!cast<I>(OpcPrefix#WDrr) VR128:$src)>;
+ def : Pat<(v2i64 (ExtOp (v8i16 VR128:$src))),
+ (!cast<I>(OpcPrefix#WQrr) VR128:$src)>;
+
+ def : Pat<(v2i64 (ExtOp (v4i32 VR128:$src))),
+ (!cast<I>(OpcPrefix#DQrr) VR128:$src)>;
+
+ def : Pat<(v8i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
+ (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
+ def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
+ (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
+ def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
+ (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
+
+ def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
+ (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
+ def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
+ (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
+
+ def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)),
+ (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
+
+ def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
+ (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
+ def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
+ (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
+ def : Pat<(v8i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
+ def : Pat<(v8i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
+ def : Pat<(v8i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
+
+ def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
+ (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
+ def : Pat<(v4i32 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))),
+ (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
+ def : Pat<(v4i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
+ def : Pat<(v4i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
+
+ def : Pat<(v2i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (ExtLoad16 addr:$src)))))),
+ (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
+ def : Pat<(v2i64 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))),
+ (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
+ def : Pat<(v2i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
+ def : Pat<(v2i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
+
+ def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
+ (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
+ def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
+ (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
+ def : Pat<(v4i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
+ def : Pat<(v4i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
+ def : Pat<(v4i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
+
+ def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
+ (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
+ def : Pat<(v2i64 (ExtOp (v8i16 (vzmovl_v4i32 addr:$src)))),
+ (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
+ def : Pat<(v2i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
+ def : Pat<(v2i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
+
+ def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
+ (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
+ def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
+ (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
+ def : Pat<(v2i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
+ def : Pat<(v2i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
+ def : Pat<(v2i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
+}
+
+let Predicates = [HasAVX, NoVLX] in {
+ defm : SS41I_pmovx_patterns<"VPMOVSX", "s", X86vsext, extloadi32i16>;
+ defm : SS41I_pmovx_patterns<"VPMOVZX", "z", X86vzext, loadi16_anyext>;
+}
+
+let Predicates = [UseSSE41] in {
+ defm : SS41I_pmovx_patterns<"PMOVSX", "s", X86vsext, extloadi32i16>;
+ defm : SS41I_pmovx_patterns<"PMOVZX", "z", X86vzext, loadi16_anyext>;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE4.1 - Extract Instructions
+//===----------------------------------------------------------------------===//
+
+/// SS41I_extract8 - SSE 4.1 extract 8 bits to a 32-bit reg or 8-bit mem
+multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> {
+ def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
+ (ins VR128:$src1, u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set GR32orGR64:$dst, (X86pextrb (v16i8 VR128:$src1),
+ imm:$src2))]>,
+ Sched<[WriteShuffle]>;
+ let hasSideEffects = 0, mayStore = 1,
+ SchedRW = [WriteShuffleLd, WriteRMW] in
+ def mr : SS4AIi8<opc, MRMDestMem, (outs),
+ (ins i8mem:$dst, VR128:$src1, u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(store (i8 (trunc (assertzext (X86pextrb (v16i8 VR128:$src1),
+ imm:$src2)))), addr:$dst)]>;
+}
+
+let Predicates = [HasAVX, NoBWI] in
+ defm VPEXTRB : SS41I_extract8<0x14, "vpextrb">, VEX;
+
+defm PEXTRB : SS41I_extract8<0x14, "pextrb">;
+
+
+/// SS41I_extract16 - SSE 4.1 extract 16 bits to memory destination
+multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> {
+ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
+ def rr_REV : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
+ (ins VR128:$src1, u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, Sched<[WriteShuffle]>;
+
+ let hasSideEffects = 0, mayStore = 1,
+ SchedRW = [WriteShuffleLd, WriteRMW] in
+ def mr : SS4AIi8<opc, MRMDestMem, (outs),
+ (ins i16mem:$dst, VR128:$src1, u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(store (i16 (trunc (assertzext (X86pextrw (v8i16 VR128:$src1),
+ imm:$src2)))), addr:$dst)]>;
+}
+
+let Predicates = [HasAVX, NoBWI] in
+ defm VPEXTRW : SS41I_extract16<0x15, "vpextrw">, VEX;
+
+defm PEXTRW : SS41I_extract16<0x15, "pextrw">;
+
+
+/// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination
+multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> {
+ def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
+ (ins VR128:$src1, u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set GR32:$dst,
+ (extractelt (v4i32 VR128:$src1), imm:$src2))]>,
+ Sched<[WriteShuffle]>;
+ let SchedRW = [WriteShuffleLd, WriteRMW] in
+ def mr : SS4AIi8<opc, MRMDestMem, (outs),
+ (ins i32mem:$dst, VR128:$src1, u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(store (extractelt (v4i32 VR128:$src1), imm:$src2),
+ addr:$dst)]>;
+}
+
+let Predicates = [HasAVX, NoDQI] in
+ defm VPEXTRD : SS41I_extract32<0x16, "vpextrd">, VEX;
+
+defm PEXTRD : SS41I_extract32<0x16, "pextrd">;
+
+/// SS41I_extract64 - SSE 4.1 extract 64 bits to int reg or memory destination
+multiclass SS41I_extract64<bits<8> opc, string OpcodeStr> {
+ def rr : SS4AIi8<opc, MRMDestReg, (outs GR64:$dst),
+ (ins VR128:$src1, u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set GR64:$dst,
+ (extractelt (v2i64 VR128:$src1), imm:$src2))]>,
+ Sched<[WriteShuffle]>, REX_W;
+ let SchedRW = [WriteShuffleLd, WriteRMW] in
+ def mr : SS4AIi8<opc, MRMDestMem, (outs),
+ (ins i64mem:$dst, VR128:$src1, u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(store (extractelt (v2i64 VR128:$src1), imm:$src2),
+ addr:$dst)]>, REX_W;
+}
+
+let Predicates = [HasAVX, NoDQI] in
+ defm VPEXTRQ : SS41I_extract64<0x16, "vpextrq">, VEX, VEX_W;
+
+defm PEXTRQ : SS41I_extract64<0x16, "pextrq">;
+
+/// SS41I_extractf32 - SSE 4.1 extract a 32-bit fp value to an int reg or
+/// memory destination
+multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr,
+ OpndItins itins = DEFAULT_ITINS> {
+ def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
+ (ins VR128:$src1, u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set GR32orGR64:$dst,
+ (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2))],
+ itins.rr>, Sched<[WriteFBlend]>;
+ let SchedRW = [WriteFBlendLd, WriteRMW] in
+ def mr : SS4AIi8<opc, MRMDestMem, (outs),
+ (ins f32mem:$dst, VR128:$src1, u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(store (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2),
+ addr:$dst)], itins.rm>;
+}
+
+let ExeDomain = SSEPackedSingle in {
+ let Predicates = [UseAVX] in
+ defm VEXTRACTPS : SS41I_extractf32<0x17, "vextractps">, VEX;
+ defm EXTRACTPS : SS41I_extractf32<0x17, "extractps", SSE_EXTRACT_ITINS>;
+}
+
+// Also match an EXTRACTPS store when the store is done as f32 instead of i32.
+def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)),
+ imm:$src2))),
+ addr:$dst),
+ (VEXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>,
+ Requires<[HasAVX]>;
+def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)),
+ imm:$src2))),
+ addr:$dst),
+ (EXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>,
+ Requires<[UseSSE41]>;
+
+//===----------------------------------------------------------------------===//
+// SSE4.1 - Insert Instructions
+//===----------------------------------------------------------------------===//
+
+multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> {
+ def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, GR32orGR64:$src2, u8imm:$src3),
+ !if(Is2Addr,
+ !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !strconcat(asm,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ [(set VR128:$dst,
+ (X86pinsrb VR128:$src1, GR32orGR64:$src2, imm:$src3))]>,
+ Sched<[WriteShuffle]>;
+ def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i8mem:$src2, u8imm:$src3),
+ !if(Is2Addr,
+ !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !strconcat(asm,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ [(set VR128:$dst,
+ (X86pinsrb VR128:$src1, (extloadi8 addr:$src2),
+ imm:$src3))]>, Sched<[WriteShuffleLd, ReadAfterLd]>;
+}
+
+let Predicates = [HasAVX, NoBWI] in
+ defm VPINSRB : SS41I_insert8<0x20, "vpinsrb", 0>, VEX_4V;
+let Constraints = "$src1 = $dst" in
+ defm PINSRB : SS41I_insert8<0x20, "pinsrb">;
+
+multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> {
+ def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, GR32:$src2, u8imm:$src3),
+ !if(Is2Addr,
+ !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !strconcat(asm,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ [(set VR128:$dst,
+ (v4i32 (insertelt VR128:$src1, GR32:$src2, imm:$src3)))]>,
+ Sched<[WriteShuffle]>;
+ def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i32mem:$src2, u8imm:$src3),
+ !if(Is2Addr,
+ !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !strconcat(asm,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ [(set VR128:$dst,
+ (v4i32 (insertelt VR128:$src1, (loadi32 addr:$src2),
+ imm:$src3)))]>, Sched<[WriteShuffleLd, ReadAfterLd]>;
+}
+
+let Predicates = [HasAVX, NoDQI] in
+ defm VPINSRD : SS41I_insert32<0x22, "vpinsrd", 0>, VEX_4V;
+let Constraints = "$src1 = $dst" in
+ defm PINSRD : SS41I_insert32<0x22, "pinsrd">;
+
+multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> {
+ def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, GR64:$src2, u8imm:$src3),
+ !if(Is2Addr,
+ !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !strconcat(asm,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ [(set VR128:$dst,
+ (v2i64 (insertelt VR128:$src1, GR64:$src2, imm:$src3)))]>,
+ Sched<[WriteShuffle]>;
+ def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i64mem:$src2, u8imm:$src3),
+ !if(Is2Addr,
+ !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !strconcat(asm,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ [(set VR128:$dst,
+ (v2i64 (insertelt VR128:$src1, (loadi64 addr:$src2),
+ imm:$src3)))]>, Sched<[WriteShuffleLd, ReadAfterLd]>;
+}
+
+let Predicates = [HasAVX, NoDQI] in
+ defm VPINSRQ : SS41I_insert64<0x22, "vpinsrq", 0>, VEX_4V, VEX_W;
+let Constraints = "$src1 = $dst" in
+ defm PINSRQ : SS41I_insert64<0x22, "pinsrq">, REX_W;
+
+// insertps has a few different modes; the first two patterns below are
+// optimized inserts that won't zero arbitrary elements in the destination
+// vector. The next one matches the intrinsic and can zero arbitrary elements
+// in the target vector.
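+//
+// For reference (Intel SDM): the insertps immediate encodes bits [7:6] =
+// COUNT_S, the source element (register form only; a memory source always
+// supplies element 0), bits [5:4] = COUNT_D, the destination element to
+// replace, and bits [3:0] = ZMASK, a mask of destination elements to zero.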
+multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1,
+ OpndItins itins = DEFAULT_ITINS> {
+ def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, u8imm:$src3),
+ !if(Is2Addr,
+ !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !strconcat(asm,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ [(set VR128:$dst,
+ (X86insertps VR128:$src1, VR128:$src2, imm:$src3))], itins.rr>,
+ Sched<[WriteFShuffle]>;
+ def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, f32mem:$src2, u8imm:$src3),
+ !if(Is2Addr,
+ !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !strconcat(asm,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ [(set VR128:$dst,
+ (X86insertps VR128:$src1,
+ (v4f32 (scalar_to_vector (loadf32 addr:$src2))),
+ imm:$src3))], itins.rm>,
+ Sched<[WriteFShuffleLd, ReadAfterLd]>;
+}
+
+let ExeDomain = SSEPackedSingle in {
+ let Predicates = [UseAVX] in
+ defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>, VEX_4V;
+ let Constraints = "$src1 = $dst" in
+ defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1, SSE_INSERT_ITINS>;
+}
+
+let Predicates = [UseSSE41] in {
+ // If we're inserting an element from a load or a null pshuf of a load,
+ // fold the load into the insertps instruction.
+ def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1), (X86PShufd (v4f32
+ (scalar_to_vector (loadf32 addr:$src2))), (i8 0)),
+ imm:$src3)),
+ (INSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
+ def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1), (X86PShufd
+ (loadv4f32 addr:$src2), (i8 0)), imm:$src3)),
+ (INSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
+}
+
+let Predicates = [UseAVX] in {
+ // If we're inserting an element from a vbroadcast of a load, fold the
+ // load into the X86insertps instruction.
+ def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1),
+ (X86VBroadcast (loadf32 addr:$src2)), imm:$src3)),
+ (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
+ def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1),
+ (X86VBroadcast (loadv4f32 addr:$src2)), imm:$src3)),
+ (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE4.1 - Round Instructions
+//===----------------------------------------------------------------------===//
+
+multiclass sse41_fp_unop_rm<bits<8> opcps, bits<8> opcpd, string OpcodeStr,
+ X86MemOperand x86memop, RegisterClass RC,
+ PatFrag mem_frag32, PatFrag mem_frag64,
+ Intrinsic V4F32Int, Intrinsic V2F64Int> {
+let ExeDomain = SSEPackedSingle in {
+  // Vector intrinsic operation, reg
+ def PSr : SS4AIi8<opcps, MRMSrcReg,
+ (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst, (V4F32Int RC:$src1, imm:$src2))],
+ IIC_SSE_ROUNDPS_REG>, Sched<[WriteFAdd]>;
+
+ // Vector intrinsic operation, mem
+ def PSm : SS4AIi8<opcps, MRMSrcMem,
+ (outs RC:$dst), (ins x86memop:$src1, i32u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst,
+ (V4F32Int (mem_frag32 addr:$src1),imm:$src2))],
+ IIC_SSE_ROUNDPS_MEM>, Sched<[WriteFAddLd]>;
+} // ExeDomain = SSEPackedSingle
+
+let ExeDomain = SSEPackedDouble in {
+ // Vector intrinsic operation, reg
+ def PDr : SS4AIi8<opcpd, MRMSrcReg,
+ (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst, (V2F64Int RC:$src1, imm:$src2))],
+ IIC_SSE_ROUNDPS_REG>, Sched<[WriteFAdd]>;
+
+ // Vector intrinsic operation, mem
+ def PDm : SS4AIi8<opcpd, MRMSrcMem,
+ (outs RC:$dst), (ins x86memop:$src1, i32u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst,
+ (V2F64Int (mem_frag64 addr:$src1),imm:$src2))],
+                    IIC_SSE_ROUNDPS_MEM>, Sched<[WriteFAddLd]>;
+} // ExeDomain = SSEPackedDouble
+}
+
+multiclass sse41_fp_binop_rm<bits<8> opcss, bits<8> opcsd,
+ string OpcodeStr,
+ Intrinsic F32Int,
+ Intrinsic F64Int, bit Is2Addr = 1> {
+let ExeDomain = GenericDomain in {
+ // Operation, reg.
+ let hasSideEffects = 0 in
+ def SSr : SS4AIi8<opcss, MRMSrcReg,
+ (outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32u8imm:$src3),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr,
+ "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !strconcat(OpcodeStr,
+ "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ []>, Sched<[WriteFAdd]>;
+
+ // Intrinsic operation, reg.
+ let isCodeGenOnly = 1 in
+ def SSr_Int : SS4AIi8<opcss, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr,
+ "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !strconcat(OpcodeStr,
+ "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ [(set VR128:$dst, (F32Int VR128:$src1, VR128:$src2, imm:$src3))]>,
+ Sched<[WriteFAdd]>;
+
+ // Intrinsic operation, mem.
+ def SSm : SS4AIi8<opcss, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2, i32u8imm:$src3),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr,
+ "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !strconcat(OpcodeStr,
+ "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ [(set VR128:$dst,
+ (F32Int VR128:$src1, sse_load_f32:$src2, imm:$src3))]>,
+ Sched<[WriteFAddLd, ReadAfterLd]>;
+
+ // Operation, reg.
+ let hasSideEffects = 0 in
+ def SDr : SS4AIi8<opcsd, MRMSrcReg,
+ (outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32u8imm:$src3),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr,
+ "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !strconcat(OpcodeStr,
+ "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ []>, Sched<[WriteFAdd]>;
+
+ // Intrinsic operation, reg.
+ let isCodeGenOnly = 1 in
+ def SDr_Int : SS4AIi8<opcsd, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr,
+ "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !strconcat(OpcodeStr,
+ "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ [(set VR128:$dst, (F64Int VR128:$src1, VR128:$src2, imm:$src3))]>,
+ Sched<[WriteFAdd]>;
+
+ // Intrinsic operation, mem.
+ def SDm : SS4AIi8<opcsd, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2, i32u8imm:$src3),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr,
+ "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !strconcat(OpcodeStr,
+ "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ [(set VR128:$dst,
+ (F64Int VR128:$src1, sse_load_f64:$src2, imm:$src3))]>,
+ Sched<[WriteFAddLd, ReadAfterLd]>;
+} // ExeDomain = GenericDomain
+}
+
+// FP round - roundss, roundps, roundsd, roundpd
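+// Round-control immediates used in the patterns below: bits [1:0] pick the
+// rounding mode (00 nearest-even, 01 toward -inf, 10 toward +inf, 11 toward
+// zero), bit 2 selects MXCSR.RC instead, and bit 3 suppresses the precision
+// (inexact) exception. So 0x9 = floor, 0xA = ceil, 0xB = trunc, 0xC =
+// nearbyint (MXCSR, inexact suppressed), 0x4 = rint (MXCSR, inexact raised).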
+let Predicates = [HasAVX] in {
+ // Intrinsic form
+ defm VROUND : sse41_fp_unop_rm<0x08, 0x09, "vround", f128mem, VR128,
+ loadv4f32, loadv2f64,
+ int_x86_sse41_round_ps,
+ int_x86_sse41_round_pd>, VEX;
+ defm VROUNDY : sse41_fp_unop_rm<0x08, 0x09, "vround", f256mem, VR256,
+ loadv8f32, loadv4f64,
+ int_x86_avx_round_ps_256,
+ int_x86_avx_round_pd_256>, VEX, VEX_L;
+ defm VROUND : sse41_fp_binop_rm<0x0A, 0x0B, "vround",
+ int_x86_sse41_round_ss,
+ int_x86_sse41_round_sd, 0>, VEX_4V, VEX_LIG;
+}
+
+let Predicates = [UseAVX] in {
+ def : Pat<(ffloor FR32:$src),
+ (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x9))>;
+ def : Pat<(f64 (ffloor FR64:$src)),
+ (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x9))>;
+ def : Pat<(f32 (fnearbyint FR32:$src)),
+ (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>;
+ def : Pat<(f64 (fnearbyint FR64:$src)),
+ (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>;
+ def : Pat<(f32 (fceil FR32:$src)),
+ (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xA))>;
+ def : Pat<(f64 (fceil FR64:$src)),
+ (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xA))>;
+ def : Pat<(f32 (frint FR32:$src)),
+ (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>;
+ def : Pat<(f64 (frint FR64:$src)),
+ (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>;
+ def : Pat<(f32 (ftrunc FR32:$src)),
+ (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xB))>;
+ def : Pat<(f64 (ftrunc FR64:$src)),
+ (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xB))>;
+}
+
+let Predicates = [HasAVX] in {
+ def : Pat<(v4f32 (ffloor VR128:$src)),
+ (VROUNDPSr VR128:$src, (i32 0x9))>;
+ def : Pat<(v4f32 (fnearbyint VR128:$src)),
+ (VROUNDPSr VR128:$src, (i32 0xC))>;
+ def : Pat<(v4f32 (fceil VR128:$src)),
+ (VROUNDPSr VR128:$src, (i32 0xA))>;
+ def : Pat<(v4f32 (frint VR128:$src)),
+ (VROUNDPSr VR128:$src, (i32 0x4))>;
+ def : Pat<(v4f32 (ftrunc VR128:$src)),
+ (VROUNDPSr VR128:$src, (i32 0xB))>;
+
+ def : Pat<(v2f64 (ffloor VR128:$src)),
+ (VROUNDPDr VR128:$src, (i32 0x9))>;
+ def : Pat<(v2f64 (fnearbyint VR128:$src)),
+ (VROUNDPDr VR128:$src, (i32 0xC))>;
+ def : Pat<(v2f64 (fceil VR128:$src)),
+ (VROUNDPDr VR128:$src, (i32 0xA))>;
+ def : Pat<(v2f64 (frint VR128:$src)),
+ (VROUNDPDr VR128:$src, (i32 0x4))>;
+ def : Pat<(v2f64 (ftrunc VR128:$src)),
+ (VROUNDPDr VR128:$src, (i32 0xB))>;
+
+ def : Pat<(v8f32 (ffloor VR256:$src)),
+ (VROUNDYPSr VR256:$src, (i32 0x9))>;
+ def : Pat<(v8f32 (fnearbyint VR256:$src)),
+ (VROUNDYPSr VR256:$src, (i32 0xC))>;
+ def : Pat<(v8f32 (fceil VR256:$src)),
+ (VROUNDYPSr VR256:$src, (i32 0xA))>;
+ def : Pat<(v8f32 (frint VR256:$src)),
+ (VROUNDYPSr VR256:$src, (i32 0x4))>;
+ def : Pat<(v8f32 (ftrunc VR256:$src)),
+ (VROUNDYPSr VR256:$src, (i32 0xB))>;
+
+ def : Pat<(v4f64 (ffloor VR256:$src)),
+ (VROUNDYPDr VR256:$src, (i32 0x9))>;
+ def : Pat<(v4f64 (fnearbyint VR256:$src)),
+ (VROUNDYPDr VR256:$src, (i32 0xC))>;
+ def : Pat<(v4f64 (fceil VR256:$src)),
+ (VROUNDYPDr VR256:$src, (i32 0xA))>;
+ def : Pat<(v4f64 (frint VR256:$src)),
+ (VROUNDYPDr VR256:$src, (i32 0x4))>;
+ def : Pat<(v4f64 (ftrunc VR256:$src)),
+ (VROUNDYPDr VR256:$src, (i32 0xB))>;
+}
+
+defm ROUND : sse41_fp_unop_rm<0x08, 0x09, "round", f128mem, VR128,
+ memopv4f32, memopv2f64,
+ int_x86_sse41_round_ps, int_x86_sse41_round_pd>;
+let Constraints = "$src1 = $dst" in
+defm ROUND : sse41_fp_binop_rm<0x0A, 0x0B, "round",
+ int_x86_sse41_round_ss, int_x86_sse41_round_sd>;
+
+let Predicates = [UseSSE41] in {
+ def : Pat<(ffloor FR32:$src),
+ (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x9))>;
+ def : Pat<(f64 (ffloor FR64:$src)),
+ (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x9))>;
+ def : Pat<(f32 (fnearbyint FR32:$src)),
+ (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>;
+ def : Pat<(f64 (fnearbyint FR64:$src)),
+ (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>;
+ def : Pat<(f32 (fceil FR32:$src)),
+ (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xA))>;
+ def : Pat<(f64 (fceil FR64:$src)),
+ (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xA))>;
+ def : Pat<(f32 (frint FR32:$src)),
+ (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>;
+ def : Pat<(f64 (frint FR64:$src)),
+ (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>;
+ def : Pat<(f32 (ftrunc FR32:$src)),
+ (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xB))>;
+ def : Pat<(f64 (ftrunc FR64:$src)),
+ (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xB))>;
+
+ def : Pat<(v4f32 (ffloor VR128:$src)),
+ (ROUNDPSr VR128:$src, (i32 0x9))>;
+ def : Pat<(v4f32 (fnearbyint VR128:$src)),
+ (ROUNDPSr VR128:$src, (i32 0xC))>;
+ def : Pat<(v4f32 (fceil VR128:$src)),
+ (ROUNDPSr VR128:$src, (i32 0xA))>;
+ def : Pat<(v4f32 (frint VR128:$src)),
+ (ROUNDPSr VR128:$src, (i32 0x4))>;
+ def : Pat<(v4f32 (ftrunc VR128:$src)),
+ (ROUNDPSr VR128:$src, (i32 0xB))>;
+
+ def : Pat<(v2f64 (ffloor VR128:$src)),
+ (ROUNDPDr VR128:$src, (i32 0x9))>;
+ def : Pat<(v2f64 (fnearbyint VR128:$src)),
+ (ROUNDPDr VR128:$src, (i32 0xC))>;
+ def : Pat<(v2f64 (fceil VR128:$src)),
+ (ROUNDPDr VR128:$src, (i32 0xA))>;
+ def : Pat<(v2f64 (frint VR128:$src)),
+ (ROUNDPDr VR128:$src, (i32 0x4))>;
+ def : Pat<(v2f64 (ftrunc VR128:$src)),
+ (ROUNDPDr VR128:$src, (i32 0xB))>;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE4.1 - Packed Bit Test
+//===----------------------------------------------------------------------===//
+
+// ptest instruction: we lower to this in X86ISelLowering, primarily from
+// the Intel intrinsic that corresponds to it.
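+// ptest sets ZF when ($src2 & $src1) is all zeroes and CF when
+// ($src2 & ~$src1) is all zeroes; SF/OF/AF/PF are cleared.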
+let Defs = [EFLAGS], Predicates = [HasAVX] in {
+def VPTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
+ "vptest\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
+ Sched<[WriteVecLogic]>, VEX;
+def VPTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
+ "vptest\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS,(X86ptest VR128:$src1, (loadv2i64 addr:$src2)))]>,
+ Sched<[WriteVecLogicLd, ReadAfterLd]>, VEX;
+
+def VPTESTYrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR256:$src1, VR256:$src2),
+ "vptest\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86ptest VR256:$src1, (v4i64 VR256:$src2)))]>,
+ Sched<[WriteVecLogic]>, VEX, VEX_L;
+def VPTESTYrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR256:$src1, i256mem:$src2),
+ "vptest\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS,(X86ptest VR256:$src1, (loadv4i64 addr:$src2)))]>,
+ Sched<[WriteVecLogicLd, ReadAfterLd]>, VEX, VEX_L;
+}
+
+let Defs = [EFLAGS] in {
+def PTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
+ "ptest\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
+ Sched<[WriteVecLogic]>;
+def PTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
+ "ptest\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86ptest VR128:$src1, (memopv2i64 addr:$src2)))]>,
+ Sched<[WriteVecLogicLd, ReadAfterLd]>;
+}
+
+// The bit test instructions below are AVX-only.
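+//
+// vtestps/vtestpd apply the same idea to just the sign bit of each packed
+// element: ZF is set when ($src2 & $src1) has no sign bits set, CF when
+// ($src2 & ~$src1) has none.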
+multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC,
+ X86MemOperand x86memop, PatFrag mem_frag, ValueType vt> {
+ def rr : SS48I<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
+ [(set EFLAGS, (X86testp RC:$src1, (vt RC:$src2)))]>,
+ Sched<[WriteVecLogic]>, VEX;
+ def rm : SS48I<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
+ [(set EFLAGS, (X86testp RC:$src1, (mem_frag addr:$src2)))]>,
+ Sched<[WriteVecLogicLd, ReadAfterLd]>, VEX;
+}
+
+let Defs = [EFLAGS], Predicates = [HasAVX] in {
+let ExeDomain = SSEPackedSingle in {
+defm VTESTPS : avx_bittest<0x0E, "vtestps", VR128, f128mem, loadv4f32, v4f32>;
+defm VTESTPSY : avx_bittest<0x0E, "vtestps", VR256, f256mem, loadv8f32, v8f32>,
+ VEX_L;
+}
+let ExeDomain = SSEPackedDouble in {
+defm VTESTPD : avx_bittest<0x0F, "vtestpd", VR128, f128mem, loadv2f64, v2f64>;
+defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, loadv4f64, v4f64>,
+ VEX_L;
+}
+}
+
+//===----------------------------------------------------------------------===//
+// SSE4.1 - Misc Instructions
+//===----------------------------------------------------------------------===//
+
+let Defs = [EFLAGS], Predicates = [HasPOPCNT] in {
+ def POPCNT16rr : I<0xB8, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+ "popcnt{w}\t{$src, $dst|$dst, $src}",
+ [(set GR16:$dst, (ctpop GR16:$src)), (implicit EFLAGS)],
+ IIC_SSE_POPCNT_RR>, Sched<[WriteFAdd]>,
+ OpSize16, XS;
+ def POPCNT16rm : I<0xB8, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+ "popcnt{w}\t{$src, $dst|$dst, $src}",
+ [(set GR16:$dst, (ctpop (loadi16 addr:$src))),
+ (implicit EFLAGS)], IIC_SSE_POPCNT_RM>,
+ Sched<[WriteFAddLd]>, OpSize16, XS;
+
+ def POPCNT32rr : I<0xB8, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+ "popcnt{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (ctpop GR32:$src)), (implicit EFLAGS)],
+ IIC_SSE_POPCNT_RR>, Sched<[WriteFAdd]>,
+ OpSize32, XS;
+
+ def POPCNT32rm : I<0xB8, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+ "popcnt{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (ctpop (loadi32 addr:$src))),
+ (implicit EFLAGS)], IIC_SSE_POPCNT_RM>,
+ Sched<[WriteFAddLd]>, OpSize32, XS;
+
+ def POPCNT64rr : RI<0xB8, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+ "popcnt{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (ctpop GR64:$src)), (implicit EFLAGS)],
+ IIC_SSE_POPCNT_RR>, Sched<[WriteFAdd]>, XS;
+ def POPCNT64rm : RI<0xB8, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+ "popcnt{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (ctpop (loadi64 addr:$src))),
+ (implicit EFLAGS)], IIC_SSE_POPCNT_RM>,
+ Sched<[WriteFAddLd]>, XS;
+}
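+
+// A minimal usage sketch (C, via <popcntintrin.h>, illustrative variables);
+// each form counts set bits and defines EFLAGS as a side effect:
+//   unsigned bits32 = _mm_popcnt_u32(x);            // POPCNT32rr/rm
+//   unsigned long long bits64 = _mm_popcnt_u64(y);  // POPCNT64rr/rm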
+
+// SS41I_unop_rm_int_v16 - SSE 4.1 unary operator whose operand type is v8i16.
+multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr,
+ Intrinsic IntId128, PatFrag ld_frag,
+ X86FoldableSchedWrite Sched> {
+ def rr128 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (IntId128 VR128:$src))]>,
+ Sched<[Sched]>;
+ def rm128 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins i128mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst,
+ (IntId128 (bitconvert (ld_frag addr:$src))))]>,
+ Sched<[Sched.Folded]>;
+}
+
+// PHMIN has the same profile as PSAD, thus we use the same scheduling
+// model, although the naming is misleading.
+let Predicates = [HasAVX] in
+defm VPHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "vphminposuw",
+ int_x86_sse41_phminposuw, loadv2i64,
+ WriteVecIMul>, VEX;
+defm PHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "phminposuw",
+ int_x86_sse41_phminposuw, memopv2i64,
+ WriteVecIMul>;
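+
+// A minimal usage sketch (C, via <smmintrin.h>); PHMINPOSUW returns the
+// minimum of the eight unsigned words in bits [15:0], its index in bits
+// [18:16], and zeroes the rest of the destination:
+//   __m128i r   = _mm_minpos_epu16(v);
+//   unsigned m  = _mm_extract_epi16(r, 0);   // minimum value
+//   unsigned ix = _mm_extract_epi16(r, 1);   // index of the minimum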
+
+/// SS48I_binop_rm - Simple SSE41 binary operator.
+multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
+ X86MemOperand x86memop, bit Is2Addr = 1,
+ OpndItins itins = SSE_INTALU_ITINS_P> {
+ let isCommutable = 1 in
+ def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
+ Sched<[itins.Sched]>;
+ def rm : SS48I<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst,
+ (OpVT (OpNode RC:$src1, (bitconvert (memop_frag addr:$src2)))))]>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
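+
+// For reference, each defm of this multiclass expands to an rr and an rm
+// instruction (e.g. PMINSBrr/PMINSBrm below); Is2Addr selects the tied
+// two-address SSE asm string "op {$src2, $dst|$dst, $src2}" versus the
+// three-operand AVX form.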
+
+/// SS48I_binop_rm2 - Simple SSE41 binary operator with different src and dst
+/// types.
+multiclass SS48I_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ ValueType DstVT, ValueType SrcVT, RegisterClass RC,
+ PatFrag memop_frag, X86MemOperand x86memop,
+ OpndItins itins,
+ bit IsCommutable = 0, bit Is2Addr = 1> {
+ let isCommutable = IsCommutable in
+ def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))]>,
+ Sched<[itins.Sched]>;
+ def rm : SS48I<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1),
+ (bitconvert (memop_frag addr:$src2)))))]>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+
+let Predicates = [HasAVX, NoVLX] in {
+ defm VPMINSB : SS48I_binop_rm<0x38, "vpminsb", smin, v16i8, VR128,
+ loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
+ VEX_4V;
+ defm VPMINSD : SS48I_binop_rm<0x39, "vpminsd", smin, v4i32, VR128,
+ loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
+ VEX_4V;
+ defm VPMINUD : SS48I_binop_rm<0x3B, "vpminud", umin, v4i32, VR128,
+ loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
+ VEX_4V;
+ defm VPMINUW : SS48I_binop_rm<0x3A, "vpminuw", umin, v8i16, VR128,
+ loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
+ VEX_4V;
+ defm VPMAXSB : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v16i8, VR128,
+ loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
+ VEX_4V;
+ defm VPMAXSD : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v4i32, VR128,
+ loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
+ VEX_4V;
+ defm VPMAXUD : SS48I_binop_rm<0x3F, "vpmaxud", umax, v4i32, VR128,
+ loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
+ VEX_4V;
+ defm VPMAXUW : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v8i16, VR128,
+ loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
+ VEX_4V;
+ defm VPMULDQ : SS48I_binop_rm2<0x28, "vpmuldq", X86pmuldq, v2i64, v4i32,
+ VR128, loadv2i64, i128mem,
+ SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V;
+}
+
+let Predicates = [HasAVX2, NoVLX] in {
+ defm VPMINSBY : SS48I_binop_rm<0x38, "vpminsb", smin, v32i8, VR256,
+ loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
+ VEX_4V, VEX_L;
+ defm VPMINSDY : SS48I_binop_rm<0x39, "vpminsd", smin, v8i32, VR256,
+ loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
+ VEX_4V, VEX_L;
+ defm VPMINUDY : SS48I_binop_rm<0x3B, "vpminud", umin, v8i32, VR256,
+ loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
+ VEX_4V, VEX_L;
+ defm VPMINUWY : SS48I_binop_rm<0x3A, "vpminuw", umin, v16i16, VR256,
+ loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
+ VEX_4V, VEX_L;
+ defm VPMAXSBY : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v32i8, VR256,
+ loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
+ VEX_4V, VEX_L;
+ defm VPMAXSDY : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v8i32, VR256,
+ loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
+ VEX_4V, VEX_L;
+ defm VPMAXUDY : SS48I_binop_rm<0x3F, "vpmaxud", umax, v8i32, VR256,
+ loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
+ VEX_4V, VEX_L;
+ defm VPMAXUWY : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v16i16, VR256,
+ loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
+ VEX_4V, VEX_L;
+ defm VPMULDQY : SS48I_binop_rm2<0x28, "vpmuldq", X86pmuldq, v4i64, v8i32,
+ VR256, loadv4i64, i256mem,
+ SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_L;
+}
+
+let Constraints = "$src1 = $dst" in {
+ defm PMINSB : SS48I_binop_rm<0x38, "pminsb", smin, v16i8, VR128,
+ memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
+ defm PMINSD : SS48I_binop_rm<0x39, "pminsd", smin, v4i32, VR128,
+ memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
+ defm PMINUD : SS48I_binop_rm<0x3B, "pminud", umin, v4i32, VR128,
+ memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
+ defm PMINUW : SS48I_binop_rm<0x3A, "pminuw", umin, v8i16, VR128,
+ memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
+ defm PMAXSB : SS48I_binop_rm<0x3C, "pmaxsb", smax, v16i8, VR128,
+ memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
+ defm PMAXSD : SS48I_binop_rm<0x3D, "pmaxsd", smax, v4i32, VR128,
+ memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
+ defm PMAXUD : SS48I_binop_rm<0x3F, "pmaxud", umax, v4i32, VR128,
+ memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
+ defm PMAXUW : SS48I_binop_rm<0x3E, "pmaxuw", umax, v8i16, VR128,
+ memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
+ defm PMULDQ : SS48I_binop_rm2<0x28, "pmuldq", X86pmuldq, v2i64, v4i32,
+ VR128, memopv2i64, i128mem,
+ SSE_INTMUL_ITINS_P, 1>;
+}
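+
+// A minimal usage sketch (C, via <smmintrin.h>); PMULDQ is the binop_rm2
+// case above with distinct source/destination types: it multiplies the
+// signed i32 elements 0 and 2 of each source into two i64 products:
+//   __m128i p = _mm_mul_epi32(a, b);   // { (i64)a[0]*b[0], (i64)a[2]*b[2] }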
+
+let Predicates = [HasAVX, NoVLX] in {
+ defm VPMULLD : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, VR128,
+ memopv2i64, i128mem, 0, SSE_PMULLD_ITINS>,
+ VEX_4V;
+ defm VPCMPEQQ : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v2i64, VR128,
+ memopv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
+ VEX_4V;
+}
+let Predicates = [HasAVX2] in {
+ defm VPMULLDY : SS48I_binop_rm<0x40, "vpmulld", mul, v8i32, VR256,
+ loadv4i64, i256mem, 0, SSE_PMULLD_ITINS>,
+ VEX_4V, VEX_L;
+ defm VPCMPEQQY : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v4i64, VR256,
+ loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
+ VEX_4V, VEX_L;
+}
+
+let Constraints = "$src1 = $dst" in {
+ defm PMULLD : SS48I_binop_rm<0x40, "pmulld", mul, v4i32, VR128,
+ memopv2i64, i128mem, 1, SSE_PMULLD_ITINS>;
+ defm PCMPEQQ : SS48I_binop_rm<0x29, "pcmpeqq", X86pcmpeq, v2i64, VR128,
+ memopv2i64, i128mem, 1, SSE_INTALUQ_ITINS_P>;
+}
+
+/// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate
+multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
+ Intrinsic IntId, RegisterClass RC, PatFrag memop_frag,
+ X86MemOperand x86memop, bit Is2Addr = 1,
+ OpndItins itins = DEFAULT_ITINS> {
+ let isCommutable = 1 in
+ def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, u8imm:$src3),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ [(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))], itins.rr>,
+ Sched<[itins.Sched]>;
+ def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop:$src2, u8imm:$src3),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ [(set RC:$dst,
+ (IntId RC:$src1,
+ (bitconvert (memop_frag addr:$src2)), imm:$src3))], itins.rm>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+
+/// SS41I_binop_rmi - SSE 4.1 binary operator with 8-bit immediate
+multiclass SS41I_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
+ X86MemOperand x86memop, bit Is2Addr = 1,
+ OpndItins itins = DEFAULT_ITINS> {
+ let isCommutable = 1 in
+ def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, u8imm:$src3),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))],
+ itins.rr>, Sched<[itins.Sched]>;
+ def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop:$src2, u8imm:$src3),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ [(set RC:$dst,
+ (OpVT (OpNode RC:$src1,
+ (bitconvert (memop_frag addr:$src2)), imm:$src3)))], itins.rm>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+
+let Predicates = [HasAVX] in {
+ let isCommutable = 0 in {
+ defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw,
+ VR128, loadv2i64, i128mem, 0,
+ DEFAULT_ITINS_MPSADSCHED>, VEX_4V;
+ }
+
+ let ExeDomain = SSEPackedSingle in {
+ defm VBLENDPS : SS41I_binop_rmi<0x0C, "vblendps", X86Blendi, v4f32,
+ VR128, loadv4f32, f128mem, 0,
+ DEFAULT_ITINS_FBLENDSCHED>, VEX_4V;
+ defm VBLENDPSY : SS41I_binop_rmi<0x0C, "vblendps", X86Blendi, v8f32,
+ VR256, loadv8f32, f256mem, 0,
+ DEFAULT_ITINS_FBLENDSCHED>, VEX_4V, VEX_L;
+ }
+ let ExeDomain = SSEPackedDouble in {
+ defm VBLENDPD : SS41I_binop_rmi<0x0D, "vblendpd", X86Blendi, v2f64,
+ VR128, loadv2f64, f128mem, 0,
+ DEFAULT_ITINS_FBLENDSCHED>, VEX_4V;
+ defm VBLENDPDY : SS41I_binop_rmi<0x0D, "vblendpd", X86Blendi, v4f64,
+ VR256, loadv4f64, f256mem, 0,
+ DEFAULT_ITINS_FBLENDSCHED>, VEX_4V, VEX_L;
+ }
+ defm VPBLENDW : SS41I_binop_rmi<0x0E, "vpblendw", X86Blendi, v8i16,
+ VR128, loadv2i64, i128mem, 0,
+ DEFAULT_ITINS_BLENDSCHED>, VEX_4V;
+
+ let ExeDomain = SSEPackedSingle in
+ defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps,
+ VR128, loadv4f32, f128mem, 0,
+ SSE_DPPS_ITINS>, VEX_4V;
+ let ExeDomain = SSEPackedDouble in
+ defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd,
+ VR128, loadv2f64, f128mem, 0,
+ SSE_DPPS_ITINS>, VEX_4V;
+ let ExeDomain = SSEPackedSingle in
+ defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256,
+ VR256, loadv8f32, i256mem, 0,
+ SSE_DPPS_ITINS>, VEX_4V, VEX_L;
+}
+
+let Predicates = [HasAVX2] in {
+ let isCommutable = 0 in {
+ defm VMPSADBWY : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_avx2_mpsadbw,
+ VR256, loadv4i64, i256mem, 0,
+ DEFAULT_ITINS_MPSADSCHED>, VEX_4V, VEX_L;
+ }
+ defm VPBLENDWY : SS41I_binop_rmi<0x0E, "vpblendw", X86Blendi, v16i16,
+ VR256, loadv4i64, i256mem, 0,
+ DEFAULT_ITINS_BLENDSCHED>, VEX_4V, VEX_L;
+}
+
+let Constraints = "$src1 = $dst" in {
+ let isCommutable = 0 in {
+ defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw,
+ VR128, memopv2i64, i128mem,
+ 1, SSE_MPSADBW_ITINS>;
+ }
+ let ExeDomain = SSEPackedSingle in
+ defm BLENDPS : SS41I_binop_rmi<0x0C, "blendps", X86Blendi, v4f32,
+ VR128, memopv4f32, f128mem,
+ 1, SSE_INTALU_ITINS_FBLEND_P>;
+ let ExeDomain = SSEPackedDouble in
+ defm BLENDPD : SS41I_binop_rmi<0x0D, "blendpd", X86Blendi, v2f64,
+ VR128, memopv2f64, f128mem,
+ 1, SSE_INTALU_ITINS_FBLEND_P>;
+ defm PBLENDW : SS41I_binop_rmi<0x0E, "pblendw", X86Blendi, v8i16,
+ VR128, memopv2i64, i128mem,
+ 1, SSE_INTALU_ITINS_BLEND_P>;
+ let ExeDomain = SSEPackedSingle in
+ defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps,
+ VR128, memopv4f32, f128mem, 1,
+ SSE_DPPS_ITINS>;
+ let ExeDomain = SSEPackedDouble in
+ defm DPPD : SS41I_binop_rmi_int<0x41, "dppd", int_x86_sse41_dppd,
+ VR128, memopv2f64, f128mem, 1,
+ SSE_DPPD_ITINS>;
+}
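+
+// A minimal usage sketch (C, via <smmintrin.h>) of the immediate forms above.
+// Each blend-immediate bit i selects element i of the second source; the DPPS
+// immediate holds an input mask in its high nibble and an output mask in its
+// low nibble:
+//   __m128 r = _mm_blend_ps(a, b, 0x5);   // { b[0], a[1], b[2], a[3] }
+//   __m128 d = _mm_dp_ps(a, b, 0xF1);     // dot product in lane 0, rest zeroed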
+
+/// SS41I_quaternary_int_avx - AVX SSE 4.1 with 4 operands
+multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr,
+ RegisterClass RC, X86MemOperand x86memop,
+ PatFrag mem_frag, Intrinsic IntId,
+ X86FoldableSchedWrite Sched> {
+ def rr : Ii8<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, RC:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set RC:$dst, (IntId RC:$src1, RC:$src2, RC:$src3))],
+ NoItinerary, SSEPackedInt>, TAPD, VEX_4V, VEX_I8IMM,
+ Sched<[Sched]>;
+
+ def rm : Ii8<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop:$src2, RC:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set RC:$dst,
+ (IntId RC:$src1, (bitconvert (mem_frag addr:$src2)),
+ RC:$src3))],
+ NoItinerary, SSEPackedInt>, TAPD, VEX_4V, VEX_I8IMM,
+ Sched<[Sched.Folded, ReadAfterLd]>;
+}
+
+let Predicates = [HasAVX] in {
+let ExeDomain = SSEPackedDouble in {
+defm VBLENDVPD : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR128, f128mem,
+ loadv2f64, int_x86_sse41_blendvpd,
+ WriteFVarBlend>;
+defm VBLENDVPDY : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR256, f256mem,
+ loadv4f64, int_x86_avx_blendv_pd_256,
+ WriteFVarBlend>, VEX_L;
+} // ExeDomain = SSEPackedDouble
+let ExeDomain = SSEPackedSingle in {
+defm VBLENDVPS : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR128, f128mem,
+ loadv4f32, int_x86_sse41_blendvps,
+ WriteFVarBlend>;
+defm VBLENDVPSY : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR256, f256mem,
+ loadv8f32, int_x86_avx_blendv_ps_256,
+ WriteFVarBlend>, VEX_L;
+} // ExeDomain = SSEPackedSingle
+defm VPBLENDVB : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR128, i128mem,
+ loadv2i64, int_x86_sse41_pblendvb,
+ WriteVarBlend>;
+}
+
+let Predicates = [HasAVX2] in {
+defm VPBLENDVBY : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR256, i256mem,
+ loadv4i64, int_x86_avx2_pblendvb,
+ WriteVarBlend>, VEX_L;
+}
+
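+// Note the operand swap in the patterns below: vselect takes the value used
+// when a mask bit is set as its first operand, while BLENDV takes it as the
+// second source, so (vselect M, A, B) maps to BLENDV(B, A, M).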
+let Predicates = [HasAVX] in {
+ def : Pat<(v16i8 (vselect (v16i8 VR128:$mask), (v16i8 VR128:$src1),
+ (v16i8 VR128:$src2))),
+ (VPBLENDVBrr VR128:$src2, VR128:$src1, VR128:$mask)>;
+ def : Pat<(v4i32 (vselect (v4i32 VR128:$mask), (v4i32 VR128:$src1),
+ (v4i32 VR128:$src2))),
+ (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>;
+ def : Pat<(v4f32 (vselect (v4i32 VR128:$mask), (v4f32 VR128:$src1),
+ (v4f32 VR128:$src2))),
+ (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>;
+ def : Pat<(v2i64 (vselect (v2i64 VR128:$mask), (v2i64 VR128:$src1),
+ (v2i64 VR128:$src2))),
+ (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>;
+ def : Pat<(v2f64 (vselect (v2i64 VR128:$mask), (v2f64 VR128:$src1),
+ (v2f64 VR128:$src2))),
+ (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>;
+ def : Pat<(v8i32 (vselect (v8i32 VR256:$mask), (v8i32 VR256:$src1),
+ (v8i32 VR256:$src2))),
+ (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
+ def : Pat<(v8f32 (vselect (v8i32 VR256:$mask), (v8f32 VR256:$src1),
+ (v8f32 VR256:$src2))),
+ (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
+ def : Pat<(v4i64 (vselect (v4i64 VR256:$mask), (v4i64 VR256:$src1),
+ (v4i64 VR256:$src2))),
+ (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
+ def : Pat<(v4f64 (vselect (v4i64 VR256:$mask), (v4f64 VR256:$src1),
+ (v4f64 VR256:$src2))),
+ (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
+}
+
+let Predicates = [HasAVX2] in {
+ def : Pat<(v32i8 (vselect (v32i8 VR256:$mask), (v32i8 VR256:$src1),
+ (v32i8 VR256:$src2))),
+ (VPBLENDVBYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
+}
+
+// Patterns
+// FIXME: Prefer a movss or movsd over a blendps when optimizing for size or
+// on targets where they have equal performance. These were changed to use
+// blends because blends have better throughput on SandyBridge and Haswell, but
+// movs[s/d] are 1-2 byte shorter instructions.
+let Predicates = [UseAVX] in {
+ let AddedComplexity = 15 in {
+    // Move a scalar to XMM with zero extension: zero a VR128, then do a
+    // MOVS{S,D} into the lower bits.
+ def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))),
+ (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)>;
+ def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
+ (VBLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
+ def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
+ (VPBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;
+ def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))),
+ (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)>;
+
+ // Move low f32 and clear high bits.
+ def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
+ (VBLENDPSYrri (v8f32 (AVX_SET0)), VR256:$src, (i8 1))>;
+
+ // Move low f64 and clear high bits.
+ def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
+ (VBLENDPDYrri (v4f64 (AVX_SET0)), VR256:$src, (i8 1))>;
+ }
+
+ def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
+ (v4f32 (scalar_to_vector FR32:$src)), (iPTR 0)))),
+ (SUBREG_TO_REG (i32 0),
+ (v4f32 (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)),
+ sub_xmm)>;
+ def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
+ (v2f64 (scalar_to_vector FR64:$src)), (iPTR 0)))),
+ (SUBREG_TO_REG (i64 0),
+ (v2f64 (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)),
+ sub_xmm)>;
+
+ // These will incur an FP/int domain crossing penalty, but it may be the only
+ // way without AVX2. Do not add any complexity because we may be able to match
+ // more optimal patterns defined earlier in this file.
+ def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
+ (VBLENDPSYrri (v8i32 (AVX_SET0)), VR256:$src, (i8 1))>;
+ def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
+ (VBLENDPDYrri (v4i64 (AVX_SET0)), VR256:$src, (i8 1))>;
+}
+
+// FIXME: Prefer a movss or movsd over a blendps when optimizing for size or
+// on targets where they have equal performance. These were changed to use
+// blends because blends have better throughput on SandyBridge and Haswell, but
+// movs[s/d] are 1-2 byte shorter instructions.
+let Predicates = [UseSSE41] in {
+ // With SSE41 we can use blends for these patterns.
+ def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
+ (BLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
+ def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
+ (PBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;
+ def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
+ (BLENDPDrri (v2f64 (V_SET0)), VR128:$src, (i8 1))>;
+}
+
+/// SS41I_ternary_int - SSE 4.1 ternary operator
+let Uses = [XMM0], Constraints = "$src1 = $dst" in {
+ multiclass SS41I_ternary_int<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
+ X86MemOperand x86memop, Intrinsic IntId,
+ OpndItins itins = DEFAULT_ITINS> {
+ def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0))],
+ itins.rr>, Sched<[itins.Sched]>;
+
+ def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, x86memop:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst,
+ (IntId VR128:$src1,
+ (bitconvert (mem_frag addr:$src2)), XMM0))],
+ itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ }
+}
+
+let ExeDomain = SSEPackedDouble in
+defm BLENDVPD : SS41I_ternary_int<0x15, "blendvpd", memopv2f64, f128mem,
+ int_x86_sse41_blendvpd,
+ DEFAULT_ITINS_FBLENDSCHED>;
+let ExeDomain = SSEPackedSingle in
+defm BLENDVPS : SS41I_ternary_int<0x14, "blendvps", memopv4f32, f128mem,
+ int_x86_sse41_blendvps,
+ DEFAULT_ITINS_FBLENDSCHED>;
+defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", memopv2i64, i128mem,
+ int_x86_sse41_pblendvb,
+ DEFAULT_ITINS_VARBLENDSCHED>;
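+
+// A minimal usage sketch (C, via <smmintrin.h>); in this non-VEX encoding the
+// mask is implicitly XMM0 (the Uses = [XMM0] above), hidden by the intrinsic:
+//   __m128 r = _mm_blendv_ps(a, b, mask);   // per lane: mask MSB ? b : a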
+
+// Aliases with the implicit xmm0 argument
+def : InstAlias<"blendvpd\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
+ (BLENDVPDrr0 VR128:$dst, VR128:$src2)>;
+def : InstAlias<"blendvpd\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
+ (BLENDVPDrm0 VR128:$dst, f128mem:$src2)>;
+def : InstAlias<"blendvps\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
+ (BLENDVPSrr0 VR128:$dst, VR128:$src2)>;
+def : InstAlias<"blendvps\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
+ (BLENDVPSrm0 VR128:$dst, f128mem:$src2)>;
+def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
+ (PBLENDVBrr0 VR128:$dst, VR128:$src2)>;
+def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
+ (PBLENDVBrm0 VR128:$dst, i128mem:$src2)>;
+
+let Predicates = [UseSSE41] in {
+ def : Pat<(v16i8 (vselect (v16i8 XMM0), (v16i8 VR128:$src1),
+ (v16i8 VR128:$src2))),
+ (PBLENDVBrr0 VR128:$src2, VR128:$src1)>;
+ def : Pat<(v4i32 (vselect (v4i32 XMM0), (v4i32 VR128:$src1),
+ (v4i32 VR128:$src2))),
+ (BLENDVPSrr0 VR128:$src2, VR128:$src1)>;
+ def : Pat<(v4f32 (vselect (v4i32 XMM0), (v4f32 VR128:$src1),
+ (v4f32 VR128:$src2))),
+ (BLENDVPSrr0 VR128:$src2, VR128:$src1)>;
+ def : Pat<(v2i64 (vselect (v2i64 XMM0), (v2i64 VR128:$src1),
+ (v2i64 VR128:$src2))),
+ (BLENDVPDrr0 VR128:$src2, VR128:$src1)>;
+ def : Pat<(v2f64 (vselect (v2i64 XMM0), (v2f64 VR128:$src1),
+ (v2f64 VR128:$src2))),
+ (BLENDVPDrr0 VR128:$src2, VR128:$src1)>;
+}
+
+let SchedRW = [WriteLoad] in {
+let Predicates = [HasAVX] in
+def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+ "vmovntdqa\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>,
+ VEX;
+let Predicates = [HasAVX2] in
+def VMOVNTDQAYrm : SS48I<0x2A, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
+ "vmovntdqa\t{$src, $dst|$dst, $src}",
+ [(set VR256:$dst, (int_x86_avx2_movntdqa addr:$src))]>,
+ VEX, VEX_L;
+def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+ "movntdqa\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>;
+} // SchedRW
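+
+// A minimal usage sketch (C, via <smmintrin.h>); MOVNTDQA is an aligned,
+// non-temporal (streaming) load, intended for write-combining memory:
+//   __m128i v = _mm_stream_load_si128((__m128i *)p);   // p 16-byte aligned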
+
+//===----------------------------------------------------------------------===//
+// SSE4.2 - Compare Instructions
+//===----------------------------------------------------------------------===//
+
+/// SS42I_binop_rm - Simple SSE 4.2 binary operator
+multiclass SS42I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
+ X86MemOperand x86memop, bit Is2Addr = 1> {
+ def rr : SS428I<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>;
+ def rm : SS428I<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst,
+ (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>;
+}
+
+let Predicates = [HasAVX] in
+ defm VPCMPGTQ : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v2i64, VR128,
+ loadv2i64, i128mem, 0>, VEX_4V;
+
+let Predicates = [HasAVX2] in
+ defm VPCMPGTQY : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v4i64, VR256,
+ loadv4i64, i256mem, 0>, VEX_4V, VEX_L;
+
+let Constraints = "$src1 = $dst" in
+ defm PCMPGTQ : SS42I_binop_rm<0x37, "pcmpgtq", X86pcmpgt, v2i64, VR128,
+ memopv2i64, i128mem>;
+
+//===----------------------------------------------------------------------===//
+// SSE4.2 - String/Text Processing Instructions
+//===----------------------------------------------------------------------===//
+
+// Packed Compare Implicit Length Strings, Return Mask
+multiclass pseudo_pcmpistrm<string asm, PatFrag ld_frag> {
+ def REG : PseudoI<(outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, u8imm:$src3),
+ [(set VR128:$dst, (int_x86_sse42_pcmpistrm128 VR128:$src1, VR128:$src2,
+ imm:$src3))]>;
+ def MEM : PseudoI<(outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
+ [(set VR128:$dst, (int_x86_sse42_pcmpistrm128 VR128:$src1,
+ (bc_v16i8 (ld_frag addr:$src2)), imm:$src3))]>;
+}
+
+let Defs = [EFLAGS], usesCustomInserter = 1 in {
+ defm VPCMPISTRM128 : pseudo_pcmpistrm<"#VPCMPISTRM128", loadv2i64>,
+ Requires<[HasAVX]>;
+ defm PCMPISTRM128 : pseudo_pcmpistrm<"#PCMPISTRM128", memopv2i64>,
+ Requires<[UseSSE42]>;
+}
+
+multiclass pcmpistrm_SS42AI<string asm> {
+ def rr : SS42AI<0x62, MRMSrcReg, (outs),
+ (ins VR128:$src1, VR128:$src2, u8imm:$src3),
+ !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
+ []>, Sched<[WritePCmpIStrM]>;
+ let mayLoad = 1 in
+ def rm :SS42AI<0x62, MRMSrcMem, (outs),
+ (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
+ !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
+ []>, Sched<[WritePCmpIStrMLd, ReadAfterLd]>;
+}
+
+let Defs = [XMM0, EFLAGS], hasSideEffects = 0 in {
+ let Predicates = [HasAVX] in
+ defm VPCMPISTRM128 : pcmpistrm_SS42AI<"vpcmpistrm">, VEX;
+  defm PCMPISTRM128 : pcmpistrm_SS42AI<"pcmpistrm">;
+}
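+
+// A minimal usage sketch (C, via <nmmintrin.h>); the pseudos above exist
+// because the mask result is pinned to XMM0, which the custom inserter
+// materializes. The intrinsic hides both the pin and the EFLAGS defs:
+//   __m128i m = _mm_cmpistrm(needle, hay,
+//                            _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY);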
+
+// Packed Compare Explicit Length Strings, Return Mask
+multiclass pseudo_pcmpestrm<string asm, PatFrag ld_frag> {
+ def REG : PseudoI<(outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src3, u8imm:$src5),
+ [(set VR128:$dst, (int_x86_sse42_pcmpestrm128
+ VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5))]>;
+ def MEM : PseudoI<(outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
+ [(set VR128:$dst, (int_x86_sse42_pcmpestrm128 VR128:$src1, EAX,
+ (bc_v16i8 (ld_frag addr:$src3)), EDX, imm:$src5))]>;
+}
+
+let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in {
+ defm VPCMPESTRM128 : pseudo_pcmpestrm<"#VPCMPESTRM128", loadv2i64>,
+ Requires<[HasAVX]>;
+ defm PCMPESTRM128 : pseudo_pcmpestrm<"#PCMPESTRM128", memopv2i64>,
+ Requires<[UseSSE42]>;
+}
+
+multiclass SS42AI_pcmpestrm<string asm> {
+ def rr : SS42AI<0x60, MRMSrcReg, (outs),
+ (ins VR128:$src1, VR128:$src3, u8imm:$src5),
+ !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
+ []>, Sched<[WritePCmpEStrM]>;
+ let mayLoad = 1 in
+ def rm : SS42AI<0x60, MRMSrcMem, (outs),
+ (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
+ !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
+ []>, Sched<[WritePCmpEStrMLd, ReadAfterLd]>;
+}
+
+let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
+ let Predicates = [HasAVX] in
+ defm VPCMPESTRM128 : SS42AI_pcmpestrm<"vpcmpestrm">, VEX;
+ defm PCMPESTRM128 : SS42AI_pcmpestrm<"pcmpestrm">;
+}
+
+// Packed Compare Implicit Length Strings, Return Index
+multiclass pseudo_pcmpistri<string asm, PatFrag ld_frag> {
+ def REG : PseudoI<(outs GR32:$dst),
+ (ins VR128:$src1, VR128:$src2, u8imm:$src3),
+ [(set GR32:$dst, EFLAGS,
+ (X86pcmpistri VR128:$src1, VR128:$src2, imm:$src3))]>;
+ def MEM : PseudoI<(outs GR32:$dst),
+ (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
+ [(set GR32:$dst, EFLAGS, (X86pcmpistri VR128:$src1,
+ (bc_v16i8 (ld_frag addr:$src2)), imm:$src3))]>;
+}
+
+let Defs = [EFLAGS], usesCustomInserter = 1 in {
+ defm VPCMPISTRI : pseudo_pcmpistri<"#VPCMPISTRI", loadv2i64>,
+ Requires<[HasAVX]>;
+ defm PCMPISTRI : pseudo_pcmpistri<"#PCMPISTRI", memopv2i64>,
+ Requires<[UseSSE42]>;
+}
+
+multiclass SS42AI_pcmpistri<string asm> {
+ def rr : SS42AI<0x63, MRMSrcReg, (outs),
+ (ins VR128:$src1, VR128:$src2, u8imm:$src3),
+ !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
+ []>, Sched<[WritePCmpIStrI]>;
+ let mayLoad = 1 in
+ def rm : SS42AI<0x63, MRMSrcMem, (outs),
+ (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
+ !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
+ []>, Sched<[WritePCmpIStrILd, ReadAfterLd]>;
+}
+
+let Defs = [ECX, EFLAGS], hasSideEffects = 0 in {
+ let Predicates = [HasAVX] in
+ defm VPCMPISTRI : SS42AI_pcmpistri<"vpcmpistri">, VEX;
+ defm PCMPISTRI : SS42AI_pcmpistri<"pcmpistri">;
+}
+
+// Packed Compare Explicit Length Strings, Return Index
+multiclass pseudo_pcmpestri<string asm, PatFrag ld_frag> {
+ def REG : PseudoI<(outs GR32:$dst),
+ (ins VR128:$src1, VR128:$src3, u8imm:$src5),
+ [(set GR32:$dst, EFLAGS,
+ (X86pcmpestri VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5))]>;
+ def MEM : PseudoI<(outs GR32:$dst),
+ (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
+ [(set GR32:$dst, EFLAGS,
+ (X86pcmpestri VR128:$src1, EAX, (bc_v16i8 (ld_frag addr:$src3)), EDX,
+ imm:$src5))]>;
+}
+
+let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in {
+ defm VPCMPESTRI : pseudo_pcmpestri<"#VPCMPESTRI", loadv2i64>,
+ Requires<[HasAVX]>;
+ defm PCMPESTRI : pseudo_pcmpestri<"#PCMPESTRI", memopv2i64>,
+ Requires<[UseSSE42]>;
+}
+
+multiclass SS42AI_pcmpestri<string asm> {
+ def rr : SS42AI<0x61, MRMSrcReg, (outs),
+ (ins VR128:$src1, VR128:$src3, u8imm:$src5),
+ !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
+ []>, Sched<[WritePCmpEStrI]>;
+ let mayLoad = 1 in
+ def rm : SS42AI<0x61, MRMSrcMem, (outs),
+ (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
+ !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
+ []>, Sched<[WritePCmpEStrILd, ReadAfterLd]>;
+}
+
+let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
+ let Predicates = [HasAVX] in
+ defm VPCMPESTRI : SS42AI_pcmpestri<"vpcmpestri">, VEX;
+ defm PCMPESTRI : SS42AI_pcmpestri<"pcmpestri">;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE4.2 - CRC Instructions
+//===----------------------------------------------------------------------===//
+
+// No CRC instructions have AVX equivalents
+
+// CRC intrinsic instruction.
+// This set of instructions comes only in r and m forms; the only difference
+// is the size of r and m.
+class SS42I_crc32r<bits<8> opc, string asm, RegisterClass RCOut,
+ RegisterClass RCIn, SDPatternOperator Int> :
+ SS42FI<opc, MRMSrcReg, (outs RCOut:$dst), (ins RCOut:$src1, RCIn:$src2),
+ !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
+ [(set RCOut:$dst, (Int RCOut:$src1, RCIn:$src2))], IIC_CRC32_REG>,
+ Sched<[WriteFAdd]>;
+
+class SS42I_crc32m<bits<8> opc, string asm, RegisterClass RCOut,
+ X86MemOperand x86memop, SDPatternOperator Int> :
+ SS42FI<opc, MRMSrcMem, (outs RCOut:$dst), (ins RCOut:$src1, x86memop:$src2),
+ !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
+ [(set RCOut:$dst, (Int RCOut:$src1, (load addr:$src2)))],
+ IIC_CRC32_MEM>, Sched<[WriteFAddLd, ReadAfterLd]>;
+
+let Constraints = "$src1 = $dst" in {
+ def CRC32r32m8 : SS42I_crc32m<0xF0, "crc32{b}", GR32, i8mem,
+ int_x86_sse42_crc32_32_8>;
+ def CRC32r32r8 : SS42I_crc32r<0xF0, "crc32{b}", GR32, GR8,
+ int_x86_sse42_crc32_32_8>;
+ def CRC32r32m16 : SS42I_crc32m<0xF1, "crc32{w}", GR32, i16mem,
+ int_x86_sse42_crc32_32_16>, OpSize16;
+ def CRC32r32r16 : SS42I_crc32r<0xF1, "crc32{w}", GR32, GR16,
+ int_x86_sse42_crc32_32_16>, OpSize16;
+ def CRC32r32m32 : SS42I_crc32m<0xF1, "crc32{l}", GR32, i32mem,
+ int_x86_sse42_crc32_32_32>, OpSize32;
+ def CRC32r32r32 : SS42I_crc32r<0xF1, "crc32{l}", GR32, GR32,
+ int_x86_sse42_crc32_32_32>, OpSize32;
+ def CRC32r64m64 : SS42I_crc32m<0xF1, "crc32{q}", GR64, i64mem,
+ int_x86_sse42_crc32_64_64>, REX_W;
+ def CRC32r64r64 : SS42I_crc32r<0xF1, "crc32{q}", GR64, GR64,
+ int_x86_sse42_crc32_64_64>, REX_W;
+ let hasSideEffects = 0 in {
+ let mayLoad = 1 in
+ def CRC32r64m8 : SS42I_crc32m<0xF0, "crc32{b}", GR64, i8mem,
+ null_frag>, REX_W;
+ def CRC32r64r8 : SS42I_crc32r<0xF0, "crc32{b}", GR64, GR8,
+ null_frag>, REX_W;
+ }
+}
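+
+// A minimal usage sketch (C, via <nmmintrin.h>); note this is CRC-32C (the
+// Castagnoli polynomial 0x11EDC6F41), not the zlib/Ethernet CRC-32:
+//   unsigned crc = 0xFFFFFFFFu;
+//   for (size_t i = 0; i < n; ++i)
+//     crc = _mm_crc32_u8(crc, buf[i]);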
+
+//===----------------------------------------------------------------------===//
+// SHA-NI Instructions
+//===----------------------------------------------------------------------===//
+
+multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId,
+ bit UsesXMM0 = 0> {
+ def rr : I<Opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [!if(UsesXMM0,
+ (set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0)),
+ (set VR128:$dst, (IntId VR128:$src1, VR128:$src2)))]>, T8;
+
+ def rm : I<Opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [!if(UsesXMM0,
+ (set VR128:$dst, (IntId VR128:$src1,
+ (bc_v4i32 (memopv2i64 addr:$src2)), XMM0)),
+ (set VR128:$dst, (IntId VR128:$src1,
+ (bc_v4i32 (memopv2i64 addr:$src2)))))]>, T8;
+}
+
+let Constraints = "$src1 = $dst", Predicates = [HasSHA] in {
+ def SHA1RNDS4rri : Ii8<0xCC, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, u8imm:$src3),
+ "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set VR128:$dst,
+ (int_x86_sha1rnds4 VR128:$src1, VR128:$src2,
+ (i8 imm:$src3)))]>, TA;
+ def SHA1RNDS4rmi : Ii8<0xCC, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
+ "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set VR128:$dst,
+ (int_x86_sha1rnds4 VR128:$src1,
+ (bc_v4i32 (memopv2i64 addr:$src2)),
+ (i8 imm:$src3)))]>, TA;
+
+ defm SHA1NEXTE : SHAI_binop<0xC8, "sha1nexte", int_x86_sha1nexte>;
+ defm SHA1MSG1 : SHAI_binop<0xC9, "sha1msg1", int_x86_sha1msg1>;
+ defm SHA1MSG2 : SHAI_binop<0xCA, "sha1msg2", int_x86_sha1msg2>;
+
+  let Uses = [XMM0] in
+ defm SHA256RNDS2 : SHAI_binop<0xCB, "sha256rnds2", int_x86_sha256rnds2, 1>;
+
+ defm SHA256MSG1 : SHAI_binop<0xCC, "sha256msg1", int_x86_sha256msg1>;
+ defm SHA256MSG2 : SHAI_binop<0xCD, "sha256msg2", int_x86_sha256msg2>;
+}
+
+// Aliases with explicit %xmm0
+def : InstAlias<"sha256rnds2\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
+ (SHA256RNDS2rr VR128:$dst, VR128:$src2)>;
+def : InstAlias<"sha256rnds2\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
+ (SHA256RNDS2rm VR128:$dst, i128mem:$src2)>;
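+
+// A minimal usage sketch (C, via <immintrin.h> with -msha); SHA256RNDS2 takes
+// the round-constant-added message words implicitly in XMM0, which is why the
+// multiclass threads XMM0 through when UsesXMM0 is set:
+//   state0 = _mm_sha256rnds2_epu32(state0, state1, wk);   // wk lands in xmm0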
+
+//===----------------------------------------------------------------------===//
+// AES-NI Instructions
+//===----------------------------------------------------------------------===//
+
+multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId128,
+ PatFrag ld_frag, bit Is2Addr = 1> {
+ def rr : AES8I<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
+ Sched<[WriteAESDecEnc]>;
+ def rm : AES8I<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set VR128:$dst,
+ (IntId128 VR128:$src1, (ld_frag addr:$src2)))]>,
+ Sched<[WriteAESDecEncLd, ReadAfterLd]>;
+}
+
+// Perform One Round of an AES Encryption/Decryption Flow
+let Predicates = [HasAVX, HasAES] in {
+ defm VAESENC : AESI_binop_rm_int<0xDC, "vaesenc",
+ int_x86_aesni_aesenc, loadv2i64, 0>, VEX_4V;
+ defm VAESENCLAST : AESI_binop_rm_int<0xDD, "vaesenclast",
+ int_x86_aesni_aesenclast, loadv2i64, 0>, VEX_4V;
+ defm VAESDEC : AESI_binop_rm_int<0xDE, "vaesdec",
+ int_x86_aesni_aesdec, loadv2i64, 0>, VEX_4V;
+ defm VAESDECLAST : AESI_binop_rm_int<0xDF, "vaesdeclast",
+ int_x86_aesni_aesdeclast, loadv2i64, 0>, VEX_4V;
+}
+
+let Constraints = "$src1 = $dst" in {
+ defm AESENC : AESI_binop_rm_int<0xDC, "aesenc",
+ int_x86_aesni_aesenc, memopv2i64>;
+ defm AESENCLAST : AESI_binop_rm_int<0xDD, "aesenclast",
+ int_x86_aesni_aesenclast, memopv2i64>;
+ defm AESDEC : AESI_binop_rm_int<0xDE, "aesdec",
+ int_x86_aesni_aesdec, memopv2i64>;
+ defm AESDECLAST : AESI_binop_rm_int<0xDF, "aesdeclast",
+ int_x86_aesni_aesdeclast, memopv2i64>;
+}
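+
+// A minimal usage sketch (C, via <wmmintrin.h>, illustrative round keys);
+// one AESENC per middle round, AESENCLAST for the final round:
+//   __m128i s = _mm_xor_si128(block, rk[0]);
+//   for (int i = 1; i < 10; ++i)               // AES-128: 10 rounds
+//     s = _mm_aesenc_si128(s, rk[i]);
+//   s = _mm_aesenclast_si128(s, rk[10]);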
+
+// Perform the AES InvMixColumn Transformation
+let Predicates = [HasAVX, HasAES] in {
+ def VAESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1),
+ "vaesimc\t{$src1, $dst|$dst, $src1}",
+ [(set VR128:$dst,
+ (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>,
+ VEX;
+ def VAESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
+ (ins i128mem:$src1),
+ "vaesimc\t{$src1, $dst|$dst, $src1}",
+ [(set VR128:$dst, (int_x86_aesni_aesimc (loadv2i64 addr:$src1)))]>,
+ Sched<[WriteAESIMCLd]>, VEX;
+}
+def AESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1),
+ "aesimc\t{$src1, $dst|$dst, $src1}",
+ [(set VR128:$dst,
+ (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>;
+def AESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
+ (ins i128mem:$src1),
+ "aesimc\t{$src1, $dst|$dst, $src1}",
+ [(set VR128:$dst, (int_x86_aesni_aesimc (memopv2i64 addr:$src1)))]>,
+ Sched<[WriteAESIMCLd]>;
+
+// AES Round Key Generation Assist
+let Predicates = [HasAVX, HasAES] in {
+ def VAESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, u8imm:$src2),
+ "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst,
+ (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>,
+ Sched<[WriteAESKeyGen]>, VEX;
+ def VAESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
+ (ins i128mem:$src1, u8imm:$src2),
+ "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst,
+ (int_x86_aesni_aeskeygenassist (loadv2i64 addr:$src1), imm:$src2))]>,
+ Sched<[WriteAESKeyGenLd]>, VEX;
+}
+def AESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, u8imm:$src2),
+ "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst,
+ (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>,
+ Sched<[WriteAESKeyGen]>;
+def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
+ (ins i128mem:$src1, u8imm:$src2),
+ "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst,
+ (int_x86_aesni_aeskeygenassist (memopv2i64 addr:$src1), imm:$src2))]>,
+ Sched<[WriteAESKeyGenLd]>;
+
+//===----------------------------------------------------------------------===//
+// PCLMUL Instructions
+//===----------------------------------------------------------------------===//
+
+// AVX carry-less multiplication instructions
+let isCommutable = 1 in
+def VPCLMULQDQrr : AVXPCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, u8imm:$src3),
+ "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ [(set VR128:$dst,
+ (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))]>,
+ Sched<[WriteCLMul]>;
+
+def VPCLMULQDQrm : AVXPCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
+ "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ [(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1,
+ (loadv2i64 addr:$src2), imm:$src3))]>,
+ Sched<[WriteCLMulLd, ReadAfterLd]>;
+
+// Carry-less multiplication instructions
+let Constraints = "$src1 = $dst" in {
+let isCommutable = 1 in
+def PCLMULQDQrr : PCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, u8imm:$src3),
+ "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set VR128:$dst,
+ (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))],
+ IIC_SSE_PCLMULQDQ_RR>, Sched<[WriteCLMul]>;
+
+def PCLMULQDQrm : PCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
+ "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1,
+ (memopv2i64 addr:$src2), imm:$src3))],
+ IIC_SSE_PCLMULQDQ_RM>,
+ Sched<[WriteCLMulLd, ReadAfterLd]>;
+} // Constraints = "$src1 = $dst"
+
+
+multiclass pclmul_alias<string asm, int immop> {
+ def : InstAlias<!strconcat("pclmul", asm, "dq {$src, $dst|$dst, $src}"),
+ (PCLMULQDQrr VR128:$dst, VR128:$src, immop), 0>;
+
+ def : InstAlias<!strconcat("pclmul", asm, "dq {$src, $dst|$dst, $src}"),
+ (PCLMULQDQrm VR128:$dst, i128mem:$src, immop), 0>;
+
+ def : InstAlias<!strconcat("vpclmul", asm,
+ "dq {$src2, $src1, $dst|$dst, $src1, $src2}"),
+ (VPCLMULQDQrr VR128:$dst, VR128:$src1, VR128:$src2, immop),
+ 0>;
+
+ def : InstAlias<!strconcat("vpclmul", asm,
+ "dq {$src2, $src1, $dst|$dst, $src1, $src2}"),
+ (VPCLMULQDQrm VR128:$dst, VR128:$src1, i128mem:$src2, immop),
+ 0>;
+}
+defm : pclmul_alias<"hqhq", 0x11>;
+defm : pclmul_alias<"hqlq", 0x01>;
+defm : pclmul_alias<"lqhq", 0x10>;
+defm : pclmul_alias<"lqlq", 0x00>;
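+
+// A minimal usage sketch (C, via <wmmintrin.h>); immediate bit 0 selects the
+// quadword of the first source and bit 4 that of the second, which is exactly
+// what the lqlq/lqhq/hqlq/hqhq aliases above spell out:
+//   __m128i p = _mm_clmulepi64_si128(a, b, 0x10);  // a.lo x b.hi (pclmullqhqdq)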
+
+//===----------------------------------------------------------------------===//
+// SSE4A Instructions
+//===----------------------------------------------------------------------===//
+
+let Predicates = [HasSSE4A] in {
+
+let Constraints = "$src = $dst" in {
+def EXTRQI : Ii8<0x78, MRMXr, (outs VR128:$dst),
+ (ins VR128:$src, u8imm:$len, u8imm:$idx),
+ "extrq\t{$idx, $len, $src|$src, $len, $idx}",
+ [(set VR128:$dst, (X86extrqi VR128:$src, imm:$len,
+ imm:$idx))]>, PD;
+def EXTRQ : I<0x79, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src, VR128:$mask),
+ "extrq\t{$mask, $src|$src, $mask}",
+ [(set VR128:$dst, (int_x86_sse4a_extrq VR128:$src,
+ VR128:$mask))]>, PD;
+
+def INSERTQI : Ii8<0x78, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src, VR128:$src2, u8imm:$len, u8imm:$idx),
+ "insertq\t{$idx, $len, $src2, $src|$src, $src2, $len, $idx}",
+ [(set VR128:$dst, (X86insertqi VR128:$src, VR128:$src2,
+ imm:$len, imm:$idx))]>, XD;
+def INSERTQ : I<0x79, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src, VR128:$mask),
+ "insertq\t{$mask, $src|$src, $mask}",
+ [(set VR128:$dst, (int_x86_sse4a_insertq VR128:$src,
+ VR128:$mask))]>, XD;
+}
+
+def MOVNTSS : I<0x2B, MRMDestMem, (outs), (ins f32mem:$dst, VR128:$src),
+ "movntss\t{$src, $dst|$dst, $src}",
+ [(int_x86_sse4a_movnt_ss addr:$dst, VR128:$src)]>, XS;
+
+def MOVNTSD : I<0x2B, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+ "movntsd\t{$src, $dst|$dst, $src}",
+ [(int_x86_sse4a_movnt_sd addr:$dst, VR128:$src)]>, XD;
+}
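+
+// A minimal usage sketch (C, via AMD's <ammintrin.h>, illustrative operands);
+// EXTRQ/INSERTQ are bit-field extract/insert within the low quadword, and
+// MOVNTSS/MOVNTSD are non-temporal scalar stores:
+//   __m128i f = _mm_extracti_si64(v, 8, 16);   // 8 bits starting at bit 16
+//   _mm_stream_ss(dst, x);                     // MOVNTSS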
+
+//===----------------------------------------------------------------------===//
+// AVX Instructions
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// VBROADCAST - Load from memory and broadcast to all elements of the
+// destination operand
+//
+class avx_broadcast_rm<bits<8> opc, string OpcodeStr, RegisterClass RC,
+ X86MemOperand x86memop, ValueType VT,
+ PatFrag ld_frag, SchedWrite Sched> :
+ AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set RC:$dst, (VT (X86VBroadcast (ld_frag addr:$src))))]>,
+ Sched<[Sched]>, VEX {
+ let mayLoad = 1;
+}
+
+// AVX2 adds register forms
+class avx2_broadcast_rr<bits<8> opc, string OpcodeStr, RegisterClass RC,
+ ValueType ResVT, ValueType OpVT, SchedWrite Sched> :
+ AVX28I<opc, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set RC:$dst, (ResVT (X86VBroadcast (OpVT VR128:$src))))]>,
+ Sched<[Sched]>, VEX;
+
+let ExeDomain = SSEPackedSingle in {
+ def VBROADCASTSSrm : avx_broadcast_rm<0x18, "vbroadcastss", VR128,
+ f32mem, v4f32, loadf32, WriteLoad>;
+ def VBROADCASTSSYrm : avx_broadcast_rm<0x18, "vbroadcastss", VR256,
+ f32mem, v8f32, loadf32,
+ WriteFShuffleLd>, VEX_L;
+}
+let ExeDomain = SSEPackedDouble in
+def VBROADCASTSDYrm : avx_broadcast_rm<0x19, "vbroadcastsd", VR256, f64mem,
+ v4f64, loadf64, WriteFShuffleLd>, VEX_L;
+
+let ExeDomain = SSEPackedSingle in {
+ def VBROADCASTSSrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR128,
+ v4f32, v4f32, WriteFShuffle>;
+ def VBROADCASTSSYrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR256,
+ v8f32, v4f32, WriteFShuffle256>, VEX_L;
+}
+let ExeDomain = SSEPackedDouble in
+def VBROADCASTSDYrr : avx2_broadcast_rr<0x19, "vbroadcastsd", VR256,
+ v4f64, v2f64, WriteFShuffle256>, VEX_L;
+
+let mayLoad = 1, Predicates = [HasAVX2] in
+def VBROADCASTI128 : AVX8I<0x5A, MRMSrcMem, (outs VR256:$dst),
+ (ins i128mem:$src),
+ "vbroadcasti128\t{$src, $dst|$dst, $src}", []>,
+ Sched<[WriteLoad]>, VEX, VEX_L;
+
+def VBROADCASTF128 : AVX8I<0x1A, MRMSrcMem, (outs VR256:$dst),
+ (ins f128mem:$src),
+ "vbroadcastf128\t{$src, $dst|$dst, $src}",
+ [(set VR256:$dst,
+ (int_x86_avx_vbroadcastf128_pd_256 addr:$src))]>,
+ Sched<[WriteFShuffleLd]>, VEX, VEX_L;
+
+let Predicates = [HasAVX] in
+def : Pat<(int_x86_avx_vbroadcastf128_ps_256 addr:$src),
+ (VBROADCASTF128 addr:$src)>;
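+
+// A minimal usage sketch (C, via <immintrin.h>); plain AVX broadcasts only
+// from memory, matching the mayLoad-only class above (AVX2 adds the register
+// forms):
+//   __m256 v = _mm256_broadcast_ss(&scale);   // VBROADCASTSSYrm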
+
+
+//===----------------------------------------------------------------------===//
+// VINSERTF128 - Insert packed floating-point values
+//
+let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
+def VINSERTF128rr : AVXAIi8<0x18, MRMSrcReg, (outs VR256:$dst),
+ (ins VR256:$src1, VR128:$src2, u8imm:$src3),
+ "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ []>, Sched<[WriteFShuffle]>, VEX_4V, VEX_L;
+let mayLoad = 1 in
+def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst),
+ (ins VR256:$src1, f128mem:$src2, u8imm:$src3),
+ "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ []>, Sched<[WriteFShuffleLd, ReadAfterLd]>, VEX_4V, VEX_L;
+}
+
+let Predicates = [HasAVX, NoVLX] in {
+def : Pat<(vinsert128_insert:$ins (v8f32 VR256:$src1), (v4f32 VR128:$src2),
+ (iPTR imm)),
+ (VINSERTF128rr VR256:$src1, VR128:$src2,
+ (INSERT_get_vinsert128_imm VR256:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v4f64 VR256:$src1), (v2f64 VR128:$src2),
+ (iPTR imm)),
+ (VINSERTF128rr VR256:$src1, VR128:$src2,
+ (INSERT_get_vinsert128_imm VR256:$ins))>;
+
+def : Pat<(vinsert128_insert:$ins (v8f32 VR256:$src1), (loadv4f32 addr:$src2),
+ (iPTR imm)),
+ (VINSERTF128rm VR256:$src1, addr:$src2,
+ (INSERT_get_vinsert128_imm VR256:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v4f64 VR256:$src1), (loadv2f64 addr:$src2),
+ (iPTR imm)),
+ (VINSERTF128rm VR256:$src1, addr:$src2,
+ (INSERT_get_vinsert128_imm VR256:$ins))>;
+}
+
+let Predicates = [HasAVX1Only] in {
+def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2),
+ (iPTR imm)),
+ (VINSERTF128rr VR256:$src1, VR128:$src2,
+ (INSERT_get_vinsert128_imm VR256:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2),
+ (iPTR imm)),
+ (VINSERTF128rr VR256:$src1, VR128:$src2,
+ (INSERT_get_vinsert128_imm VR256:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2),
+ (iPTR imm)),
+ (VINSERTF128rr VR256:$src1, VR128:$src2,
+ (INSERT_get_vinsert128_imm VR256:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2),
+ (iPTR imm)),
+ (VINSERTF128rr VR256:$src1, VR128:$src2,
+ (INSERT_get_vinsert128_imm VR256:$ins))>;
+
+def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (loadv2i64 addr:$src2),
+ (iPTR imm)),
+ (VINSERTF128rm VR256:$src1, addr:$src2,
+ (INSERT_get_vinsert128_imm VR256:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1),
+ (bc_v4i32 (loadv2i64 addr:$src2)),
+ (iPTR imm)),
+ (VINSERTF128rm VR256:$src1, addr:$src2,
+ (INSERT_get_vinsert128_imm VR256:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1),
+ (bc_v16i8 (loadv2i64 addr:$src2)),
+ (iPTR imm)),
+ (VINSERTF128rm VR256:$src1, addr:$src2,
+ (INSERT_get_vinsert128_imm VR256:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1),
+ (bc_v8i16 (loadv2i64 addr:$src2)),
+ (iPTR imm)),
+ (VINSERTF128rm VR256:$src1, addr:$src2,
+ (INSERT_get_vinsert128_imm VR256:$ins))>;
+}
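+
+// A minimal usage sketch (C, via <immintrin.h>); the immediate selects which
+// 128-bit lane of the destination receives the XMM source:
+//   __m256 lo = _mm256_insertf128_ps(acc, x, 0);   // replace the low lane
+//   __m256 hi = _mm256_insertf128_ps(acc, y, 1);   // replace the high lane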
+
+//===----------------------------------------------------------------------===//
+// VEXTRACTF128 - Extract packed floating-point values
+//
+let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
+def VEXTRACTF128rr : AVXAIi8<0x19, MRMDestReg, (outs VR128:$dst),
+ (ins VR256:$src1, u8imm:$src2),
+ "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ []>, Sched<[WriteFShuffle]>, VEX, VEX_L;
+let mayStore = 1 in
+def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs),
+ (ins f128mem:$dst, VR256:$src1, u8imm:$src2),
+ "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ []>, Sched<[WriteStore]>, VEX, VEX_L;
+}
+
+// AVX1 patterns
+let Predicates = [HasAVX] in {
+def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
+ (v4f32 (VEXTRACTF128rr
+ (v8f32 VR256:$src1),
+ (EXTRACT_get_vextract128_imm VR128:$ext)))>;
+def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
+ (v2f64 (VEXTRACTF128rr
+ (v4f64 VR256:$src1),
+ (EXTRACT_get_vextract128_imm VR128:$ext)))>;
+
+def : Pat<(store (v4f32 (vextract128_extract:$ext (v8f32 VR256:$src1),
+ (iPTR imm))), addr:$dst),
+ (VEXTRACTF128mr addr:$dst, VR256:$src1,
+ (EXTRACT_get_vextract128_imm VR128:$ext))>;
+def : Pat<(store (v2f64 (vextract128_extract:$ext (v4f64 VR256:$src1),
+ (iPTR imm))), addr:$dst),
+ (VEXTRACTF128mr addr:$dst, VR256:$src1,
+ (EXTRACT_get_vextract128_imm VR128:$ext))>;
+}
+
+let Predicates = [HasAVX1Only] in {
+def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
+ (v2i64 (VEXTRACTF128rr
+ (v4i64 VR256:$src1),
+ (EXTRACT_get_vextract128_imm VR128:$ext)))>;
+def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
+ (v4i32 (VEXTRACTF128rr
+ (v8i32 VR256:$src1),
+ (EXTRACT_get_vextract128_imm VR128:$ext)))>;
+def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
+ (v8i16 (VEXTRACTF128rr
+ (v16i16 VR256:$src1),
+ (EXTRACT_get_vextract128_imm VR128:$ext)))>;
+def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
+ (v16i8 (VEXTRACTF128rr
+ (v32i8 VR256:$src1),
+ (EXTRACT_get_vextract128_imm VR128:$ext)))>;
+
+def : Pat<(alignedstore (v2i64 (vextract128_extract:$ext (v4i64 VR256:$src1),
+ (iPTR imm))), addr:$dst),
+ (VEXTRACTF128mr addr:$dst, VR256:$src1,
+ (EXTRACT_get_vextract128_imm VR128:$ext))>;
+def : Pat<(alignedstore (v4i32 (vextract128_extract:$ext (v8i32 VR256:$src1),
+ (iPTR imm))), addr:$dst),
+ (VEXTRACTF128mr addr:$dst, VR256:$src1,
+ (EXTRACT_get_vextract128_imm VR128:$ext))>;
+def : Pat<(alignedstore (v8i16 (vextract128_extract:$ext (v16i16 VR256:$src1),
+ (iPTR imm))), addr:$dst),
+ (VEXTRACTF128mr addr:$dst, VR256:$src1,
+ (EXTRACT_get_vextract128_imm VR128:$ext))>;
+def : Pat<(alignedstore (v16i8 (vextract128_extract:$ext (v32i8 VR256:$src1),
+ (iPTR imm))), addr:$dst),
+ (VEXTRACTF128mr addr:$dst, VR256:$src1,
+ (EXTRACT_get_vextract128_imm VR128:$ext))>;
+}
+
+//===----------------------------------------------------------------------===//
+// VMASKMOV - Conditional SIMD Packed Loads and Stores
+//
+multiclass avx_movmask_rm<bits<8> opc_rm, bits<8> opc_mr, string OpcodeStr,
+ Intrinsic IntLd, Intrinsic IntLd256,
+ Intrinsic IntSt, Intrinsic IntSt256> {
+ def rm : AVX8I<opc_rm, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, f128mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst, (IntLd addr:$src2, VR128:$src1))]>,
+ VEX_4V;
+ def Yrm : AVX8I<opc_rm, MRMSrcMem, (outs VR256:$dst),
+ (ins VR256:$src1, f256mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
+ VEX_4V, VEX_L;
+ def mr : AVX8I<opc_mr, MRMDestMem, (outs),
+ (ins f128mem:$dst, VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(IntSt addr:$dst, VR128:$src1, VR128:$src2)]>, VEX_4V;
+ def Ymr : AVX8I<opc_mr, MRMDestMem, (outs),
+ (ins f256mem:$dst, VR256:$src1, VR256:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>, VEX_4V, VEX_L;
+}
+
+let ExeDomain = SSEPackedSingle in
+defm VMASKMOVPS : avx_movmask_rm<0x2C, 0x2E, "vmaskmovps",
+ int_x86_avx_maskload_ps,
+ int_x86_avx_maskload_ps_256,
+ int_x86_avx_maskstore_ps,
+ int_x86_avx_maskstore_ps_256>;
+let ExeDomain = SSEPackedDouble in
+defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd",
+ int_x86_avx_maskload_pd,
+ int_x86_avx_maskload_pd_256,
+ int_x86_avx_maskstore_pd,
+ int_x86_avx_maskstore_pd_256>;
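+
+// A minimal usage sketch (C, via <immintrin.h>); elements whose mask MSB is
+// clear are left untouched in memory on stores and read back as zero on loads:
+//   __m256 v = _mm256_maskload_ps(p, mask);
+//   _mm256_maskstore_ps(p, mask, v);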
+
+//===----------------------------------------------------------------------===//
+// VPERMIL - Permute Single and Double Floating-Point Values
+//
+multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr,
+ RegisterClass RC, X86MemOperand x86memop_f,
+ X86MemOperand x86memop_i, PatFrag i_frag,
+ Intrinsic IntVar, ValueType vt> {
+ def rr : AVX8I<opc_rm, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst, (IntVar RC:$src1, RC:$src2))]>, VEX_4V,
+ Sched<[WriteFShuffle]>;
+ def rm : AVX8I<opc_rm, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop_i:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst, (IntVar RC:$src1,
+ (bitconvert (i_frag addr:$src2))))]>, VEX_4V,
+ Sched<[WriteFShuffleLd, ReadAfterLd]>;
+
+ let Predicates = [HasAVX, NoVLX] in {
+ def ri : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, u8imm:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst, (vt (X86VPermilpi RC:$src1, (i8 imm:$src2))))]>, VEX,
+ Sched<[WriteFShuffle]>;
+ def mi : AVXAIi8<opc_rmi, MRMSrcMem, (outs RC:$dst),
+ (ins x86memop_f:$src1, u8imm:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst,
+ (vt (X86VPermilpi (load addr:$src1), (i8 imm:$src2))))]>, VEX,
+ Sched<[WriteFShuffleLd]>;
+ }// Predicates = [HasAVX, NoVLX]
+}
+
+let ExeDomain = SSEPackedSingle in {
+ defm VPERMILPS : avx_permil<0x0C, 0x04, "vpermilps", VR128, f128mem, i128mem,
+ loadv2i64, int_x86_avx_vpermilvar_ps, v4f32>;
+ defm VPERMILPSY : avx_permil<0x0C, 0x04, "vpermilps", VR256, f256mem, i256mem,
+ loadv4i64, int_x86_avx_vpermilvar_ps_256, v8f32>, VEX_L;
+}
+let ExeDomain = SSEPackedDouble in {
+ defm VPERMILPD : avx_permil<0x0D, 0x05, "vpermilpd", VR128, f128mem, i128mem,
+ loadv2i64, int_x86_avx_vpermilvar_pd, v2f64>;
+ defm VPERMILPDY : avx_permil<0x0D, 0x05, "vpermilpd", VR256, f256mem, i256mem,
+ loadv4i64, int_x86_avx_vpermilvar_pd_256, v4f64>, VEX_L;
+}
+
+let Predicates = [HasAVX, NoVLX] in {
+def : Pat<(v8f32 (X86VPermilpv VR256:$src1, (v8i32 VR256:$src2))),
+ (VPERMILPSYrr VR256:$src1, VR256:$src2)>;
+def : Pat<(v8f32 (X86VPermilpv VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))),
+ (VPERMILPSYrm VR256:$src1, addr:$src2)>;
+def : Pat<(v4f64 (X86VPermilpv VR256:$src1, (v4i64 VR256:$src2))),
+ (VPERMILPDYrr VR256:$src1, VR256:$src2)>;
+def : Pat<(v4f64 (X86VPermilpv VR256:$src1, (loadv4i64 addr:$src2))),
+ (VPERMILPDYrm VR256:$src1, addr:$src2)>;
+
+def : Pat<(v8i32 (X86VPermilpi VR256:$src1, (i8 imm:$imm))),
+ (VPERMILPSYri VR256:$src1, imm:$imm)>;
+def : Pat<(v4i64 (X86VPermilpi VR256:$src1, (i8 imm:$imm))),
+ (VPERMILPDYri VR256:$src1, imm:$imm)>;
+def : Pat<(v8i32 (X86VPermilpi (bc_v8i32 (loadv4i64 addr:$src1)),
+ (i8 imm:$imm))),
+ (VPERMILPSYmi addr:$src1, imm:$imm)>;
+def : Pat<(v4i64 (X86VPermilpi (loadv4i64 addr:$src1), (i8 imm:$imm))),
+ (VPERMILPDYmi addr:$src1, imm:$imm)>;
+
+def : Pat<(v4f32 (X86VPermilpv VR128:$src1, (v4i32 VR128:$src2))),
+ (VPERMILPSrr VR128:$src1, VR128:$src2)>;
+def : Pat<(v4f32 (X86VPermilpv VR128:$src1, (bc_v4i32 (loadv2i64 addr:$src2)))),
+ (VPERMILPSrm VR128:$src1, addr:$src2)>;
+def : Pat<(v2f64 (X86VPermilpv VR128:$src1, (v2i64 VR128:$src2))),
+ (VPERMILPDrr VR128:$src1, VR128:$src2)>;
+def : Pat<(v2f64 (X86VPermilpv VR128:$src1, (loadv2i64 addr:$src2))),
+ (VPERMILPDrm VR128:$src1, addr:$src2)>;
+
+def : Pat<(v2i64 (X86VPermilpi VR128:$src1, (i8 imm:$imm))),
+ (VPERMILPDri VR128:$src1, imm:$imm)>;
+def : Pat<(v2i64 (X86VPermilpi (loadv2i64 addr:$src1), (i8 imm:$imm))),
+ (VPERMILPDmi addr:$src1, imm:$imm)>;
+}
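+
+// A minimal usage sketch (C, via <immintrin.h>); VPERMILPS shuffles within
+// each 128-bit lane, driven by an immediate or a per-element selector:
+//   __m256 a = _mm256_permute_ps(v, _MM_SHUFFLE(3, 2, 1, 0));   // identity
+//   __m256 b = _mm256_permutevar_ps(v, idx);                    // VPERMILPSYrr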
+
+//===----------------------------------------------------------------------===//
+// VPERM2F128 - Permute Floating-Point Values in 128-bit chunks
+//
+let ExeDomain = SSEPackedSingle in {
+def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2, u8imm:$src3),
+ "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ [(set VR256:$dst, (v8f32 (X86VPerm2x128 VR256:$src1, VR256:$src2,
+ (i8 imm:$src3))))]>, VEX_4V, VEX_L,
+ Sched<[WriteFShuffle]>;
+def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst),
+ (ins VR256:$src1, f256mem:$src2, u8imm:$src3),
+ "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv8f32 addr:$src2),
+ (i8 imm:$src3)))]>, VEX_4V, VEX_L,
+ Sched<[WriteFShuffleLd, ReadAfterLd]>;
+}
+
+let Predicates = [HasAVX] in {
+def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+ (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
+def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1,
+ (loadv4f64 addr:$src2), (i8 imm:$imm))),
+ (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
+}
+
+let Predicates = [HasAVX1Only] in {
+def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+ (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
+def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+ (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
+def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+ (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
+def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+ (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
+
+def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1,
+ (bc_v8i32 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
+ (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
+def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1,
+ (loadv4i64 addr:$src2), (i8 imm:$imm))),
+ (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
+def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1,
+ (bc_v32i8 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
+ (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
+def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1,
+ (bc_v16i16 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
+ (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
+}
+
+//===----------------------------------------------------------------------===//
+// VZERO - Zero YMM registers
+//
+let Defs = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7,
+ YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15] in {
+ // Zero All YMM registers
+ def VZEROALL : I<0x77, RawFrm, (outs), (ins), "vzeroall",
+ [(int_x86_avx_vzeroall)]>, PS, VEX, VEX_L, Requires<[HasAVX]>;
+
+ // Zero Upper bits of YMM registers
+ def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper",
+ [(int_x86_avx_vzeroupper)]>, PS, VEX, Requires<[HasAVX]>;
+}
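+
+// Note: vzeroupper is typically emitted before transitions into code that may
+// use legacy (non-VEX) SSE instructions, since a dirty upper YMM state incurs
+// an AVX/SSE transition penalty on many implementations.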
+
+//===----------------------------------------------------------------------===//
+// Half precision conversion instructions
+//===----------------------------------------------------------------------===//
+multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> {
+ def rr : I<0x13, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
+ "vcvtph2ps\t{$src, $dst|$dst, $src}",
+ [(set RC:$dst, (Int VR128:$src))]>,
+ T8PD, VEX, Sched<[WriteCvtF2F]>;
+ let hasSideEffects = 0, mayLoad = 1 in
+ def rm : I<0x13, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+ "vcvtph2ps\t{$src, $dst|$dst, $src}", []>, T8PD, VEX,
+ Sched<[WriteCvtF2FLd]>;
+}
+
+multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> {
+ def rr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst),
+ (ins RC:$src1, i32u8imm:$src2),
+ "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst, (Int RC:$src1, imm:$src2))]>,
+ TAPD, VEX, Sched<[WriteCvtF2F]>;
+ let hasSideEffects = 0, mayStore = 1,
+ SchedRW = [WriteCvtF2FLd, WriteRMW] in
+ def mr : Ii8<0x1D, MRMDestMem, (outs),
+ (ins x86memop:$dst, RC:$src1, i32u8imm:$src2),
+ "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
+ TAPD, VEX;
+}
+
+let Predicates = [HasF16C] in {
+ defm VCVTPH2PS : f16c_ph2ps<VR128, f64mem, int_x86_vcvtph2ps_128>;
+ defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, int_x86_vcvtph2ps_256>, VEX_L;
+ defm VCVTPS2PH : f16c_ps2ph<VR128, f64mem, int_x86_vcvtps2ph_128>;
+ defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, int_x86_vcvtps2ph_256>, VEX_L;
+
+ // Pattern match vcvtph2ps of a scalar i64 load.
+ def : Pat<(int_x86_vcvtph2ps_128 (vzmovl_v2i64 addr:$src)),
+ (VCVTPH2PSrm addr:$src)>;
+ def : Pat<(int_x86_vcvtph2ps_128 (vzload_v2i64 addr:$src)),
+ (VCVTPH2PSrm addr:$src)>;
+
+ def : Pat<(store (f64 (extractelt (bc_v2f64 (v8i16
+ (int_x86_vcvtps2ph_128 VR128:$src1, i32:$src2))), (iPTR 0))),
+ addr:$dst),
+ (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>;
+ def : Pat<(store (i64 (extractelt (bc_v2i64 (v8i16
+ (int_x86_vcvtps2ph_128 VR128:$src1, i32:$src2))), (iPTR 0))),
+ addr:$dst),
+ (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>;
+ def : Pat<(store (v8i16 (int_x86_vcvtps2ph_256 VR256:$src1, i32:$src2)),
+ addr:$dst),
+ (VCVTPS2PHYmr addr:$dst, VR256:$src1, imm:$src2)>;
+}
+
+// Patterns for matching conversions from float to half-float and vice versa.
+let Predicates = [HasF16C] in {
+ def : Pat<(fp_to_f16 FR32:$src),
+ (i16 (EXTRACT_SUBREG (VMOVPDI2DIrr (VCVTPS2PHrr
+ (COPY_TO_REGCLASS FR32:$src, VR128), 0)), sub_16bit))>;
+
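+  // Note that only the low 16 bits feed the element-0 conversion below; the
+  // 16->32 extension merely satisfies the register classes, and the extended
+  // bits land in a lane whose result is discarded.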
+ def : Pat<(f16_to_fp GR16:$src),
+ (f32 (COPY_TO_REGCLASS (VCVTPH2PSrr
+              (COPY_TO_REGCLASS (MOVSX32rr16 GR16:$src), VR128)), FR32))>;
+
+ def : Pat<(f16_to_fp (i16 (fp_to_f16 FR32:$src))),
+ (f32 (COPY_TO_REGCLASS (VCVTPH2PSrr
+              (VCVTPS2PHrr (COPY_TO_REGCLASS FR32:$src, VR128), 0)), FR32))>;
+}
+
+//===----------------------------------------------------------------------===//
+// AVX2 Instructions
+//===----------------------------------------------------------------------===//
+
+/// AVX2_binop_rmi - AVX2 binary operator with 8-bit immediate
+multiclass AVX2_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
+ X86MemOperand x86memop> {
+ let isCommutable = 1 in
+ def rri : AVX2AIi8<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, u8imm:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))]>,
+ Sched<[WriteBlend]>, VEX_4V;
+ def rmi : AVX2AIi8<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop:$src2, u8imm:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set RC:$dst,
+ (OpVT (OpNode RC:$src1,
+ (bitconvert (memop_frag addr:$src2)), imm:$src3)))]>,
+ Sched<[WriteBlendLd, ReadAfterLd]>, VEX_4V;
+}
+
+defm VPBLENDD : AVX2_binop_rmi<0x02, "vpblendd", X86Blendi, v4i32,
+ VR128, loadv2i64, i128mem>;
+defm VPBLENDDY : AVX2_binop_rmi<0x02, "vpblendd", X86Blendi, v8i32,
+ VR256, loadv4i64, i256mem>, VEX_L;
+
+//===----------------------------------------------------------------------===//
+// VPBROADCAST - Load from memory and broadcast to all elements of the
+// destination operand
+//
+multiclass avx2_broadcast<bits<8> opc, string OpcodeStr,
+ X86MemOperand x86memop, PatFrag ld_frag,
+ ValueType OpVT128, ValueType OpVT256, Predicate prd> {
+ let Predicates = [HasAVX2, prd] in {
+ def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst,
+ (OpVT128 (X86VBroadcast (OpVT128 VR128:$src))))]>,
+ Sched<[WriteShuffle]>, VEX;
+ def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst,
+ (OpVT128 (X86VBroadcast (ld_frag addr:$src))))]>,
+ Sched<[WriteLoad]>, VEX;
+ def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR256:$dst,
+ (OpVT256 (X86VBroadcast (OpVT128 VR128:$src))))]>,
+ Sched<[WriteShuffle256]>, VEX, VEX_L;
+ def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), (ins x86memop:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR256:$dst,
+ (OpVT256 (X86VBroadcast (ld_frag addr:$src))))]>,
+ Sched<[WriteLoad]>, VEX, VEX_L;
+
+  // Provide aliases for broadcast from the same register class that
+  // automatically do the extract.
+ def : Pat<(OpVT256 (X86VBroadcast (OpVT256 VR256:$src))),
+ (!cast<Instruction>(NAME#"Yrr")
+ (OpVT128 (EXTRACT_SUBREG (OpVT256 VR256:$src),sub_xmm)))>;
+ }
+}
+
+defm VPBROADCASTB : avx2_broadcast<0x78, "vpbroadcastb", i8mem, loadi8,
+ v16i8, v32i8, NoVLX_Or_NoBWI>;
+defm VPBROADCASTW : avx2_broadcast<0x79, "vpbroadcastw", i16mem, loadi16,
+ v8i16, v16i16, NoVLX_Or_NoBWI>;
+defm VPBROADCASTD : avx2_broadcast<0x58, "vpbroadcastd", i32mem, loadi32,
+ v4i32, v8i32, NoVLX>;
+defm VPBROADCASTQ : avx2_broadcast<0x59, "vpbroadcastq", i64mem, loadi64,
+ v2i64, v4i64, NoVLX>;
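+
+// For example, "vpbroadcastd (%rdi), %ymm0" loads one dword and replicates it
+// into all eight lanes of %ymm0.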
+
+let Predicates = [HasAVX2] in {
+  // loadi16 is tricky to fold because !isTypeDesirableForOp rejects i16,
+  // justifiably. This means we'll encounter truncated i32 loads; match that
+  // here.
+ def : Pat<(v8i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
+ (VPBROADCASTWrm addr:$src)>;
+ def : Pat<(v16i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
+ (VPBROADCASTWYrm addr:$src)>;
+ def : Pat<(v8i16 (X86VBroadcast
+ (i16 (trunc (i32 (zextloadi16 addr:$src)))))),
+ (VPBROADCASTWrm addr:$src)>;
+ def : Pat<(v16i16 (X86VBroadcast
+ (i16 (trunc (i32 (zextloadi16 addr:$src)))))),
+ (VPBROADCASTWYrm addr:$src)>;
+
+  // Provide aliases for broadcast from the same register class that
+  // automatically do the extract.
+ def : Pat<(v8f32 (X86VBroadcast (v8f32 VR256:$src))),
+ (VBROADCASTSSYrr (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src),
+ sub_xmm)))>;
+ def : Pat<(v4f64 (X86VBroadcast (v4f64 VR256:$src))),
+ (VBROADCASTSDYrr (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src),
+ sub_xmm)))>;
+
+  // Provide a fallback in case the load node used in the patterns above has
+  // additional users, which prevents the patterns from being selected.
+ let AddedComplexity = 20 in {
+ def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
+ (VBROADCASTSSrr (COPY_TO_REGCLASS FR32:$src, VR128))>;
+ def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
+ (VBROADCASTSSYrr (COPY_TO_REGCLASS FR32:$src, VR128))>;
+ def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
+ (VBROADCASTSDYrr (COPY_TO_REGCLASS FR64:$src, VR128))>;
+
+ def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
+ (VBROADCASTSSrr (COPY_TO_REGCLASS GR32:$src, VR128))>;
+ def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
+ (VBROADCASTSSYrr (COPY_TO_REGCLASS GR32:$src, VR128))>;
+ def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
+ (VBROADCASTSDYrr (COPY_TO_REGCLASS GR64:$src, VR128))>;
+
+ def : Pat<(v16i8 (X86VBroadcast GR8:$src)),
+ (VPBROADCASTBrr (COPY_TO_REGCLASS
+ (i32 (SUBREG_TO_REG (i32 0), GR8:$src, sub_8bit)),
+ VR128))>;
+ def : Pat<(v32i8 (X86VBroadcast GR8:$src)),
+ (VPBROADCASTBYrr (COPY_TO_REGCLASS
+ (i32 (SUBREG_TO_REG (i32 0), GR8:$src, sub_8bit)),
+ VR128))>;
+
+ def : Pat<(v8i16 (X86VBroadcast GR16:$src)),
+ (VPBROADCASTWrr (COPY_TO_REGCLASS
+ (i32 (SUBREG_TO_REG (i32 0), GR16:$src, sub_16bit)),
+ VR128))>;
+ def : Pat<(v16i16 (X86VBroadcast GR16:$src)),
+ (VPBROADCASTWYrr (COPY_TO_REGCLASS
+ (i32 (SUBREG_TO_REG (i32 0), GR16:$src, sub_16bit)),
+ VR128))>;
+
+ // The patterns for VPBROADCASTD are not needed because they would match
+ // the exact same thing as VBROADCASTSS patterns.
+
+ def : Pat<(v2i64 (X86VBroadcast GR64:$src)),
+ (VPBROADCASTQrr (COPY_TO_REGCLASS GR64:$src, VR128))>;
+  // The v4i64 pattern is not needed because VBROADCASTSDYrr already matches.
+ }
+}
+
+// AVX1 broadcast patterns
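+// AVX1 has no integer broadcast instructions, but since only the bit pattern
+// matters here, integer broadcast loads can reuse the floating-point
+// VBROADCASTSS/SD forms of the same element width.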
+let Predicates = [HasAVX1Only] in {
+def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))),
+ (VBROADCASTSSYrm addr:$src)>;
+def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))),
+ (VBROADCASTSDYrm addr:$src)>;
+def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))),
+ (VBROADCASTSSrm addr:$src)>;
+}
+
+let Predicates = [HasAVX] in {
+  // Provide a fallback in case the load node used in the patterns above has
+  // additional users, which prevents the patterns from being selected.
+ let AddedComplexity = 20 in {
+ // 128bit broadcasts:
+ def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
+ (VPSHUFDri (COPY_TO_REGCLASS FR32:$src, VR128), 0)>;
+ def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
+ (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
+ (VPSHUFDri (COPY_TO_REGCLASS FR32:$src, VR128), 0), sub_xmm),
+ (VPSHUFDri (COPY_TO_REGCLASS FR32:$src, VR128), 0), 1)>;
+ def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
+ (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
+ (VPSHUFDri (COPY_TO_REGCLASS FR64:$src, VR128), 0x44), sub_xmm),
+ (VPSHUFDri (COPY_TO_REGCLASS FR64:$src, VR128), 0x44), 1)>;
+
+ def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
+ (VPSHUFDri (COPY_TO_REGCLASS GR32:$src, VR128), 0)>;
+ def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
+ (VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
+ (VPSHUFDri (COPY_TO_REGCLASS GR32:$src, VR128), 0), sub_xmm),
+ (VPSHUFDri (COPY_TO_REGCLASS GR32:$src, VR128), 0), 1)>;
+ def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
+ (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)),
+ (VPSHUFDri (COPY_TO_REGCLASS GR64:$src, VR128), 0x44), sub_xmm),
+ (VPSHUFDri (COPY_TO_REGCLASS GR64:$src, VR128), 0x44), 1)>;
+ }
+
+ def : Pat<(v2f64 (X86VBroadcast f64:$src)),
+ (VMOVDDUPrr (COPY_TO_REGCLASS FR64:$src, VR128))>;
+ def : Pat<(v2i64 (X86VBroadcast i64:$src)),
+ (VMOVDDUPrr (COPY_TO_REGCLASS GR64:$src, VR128))>;
+}
+
+//===----------------------------------------------------------------------===//
+// VPERM - Permute instructions
+//
+
+multiclass avx2_perm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
+ ValueType OpVT, X86FoldableSchedWrite Sched> {
+ def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst,
+ (OpVT (X86VPermv VR256:$src1, VR256:$src2)))]>,
+ Sched<[Sched]>, VEX_4V, VEX_L;
+ def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
+ (ins VR256:$src1, i256mem:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst,
+ (OpVT (X86VPermv VR256:$src1,
+ (bitconvert (mem_frag addr:$src2)))))]>,
+ Sched<[Sched.Folded, ReadAfterLd]>, VEX_4V, VEX_L;
+}
+
+defm VPERMD : avx2_perm<0x36, "vpermd", loadv4i64, v8i32, WriteShuffle256>;
+let ExeDomain = SSEPackedSingle in
+defm VPERMPS : avx2_perm<0x16, "vpermps", loadv8f32, v8f32, WriteFShuffle256>;
+
+multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
+ ValueType OpVT, X86FoldableSchedWrite Sched> {
+ def Yri : AVX2AIi8<opc, MRMSrcReg, (outs VR256:$dst),
+ (ins VR256:$src1, u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst,
+ (OpVT (X86VPermi VR256:$src1, (i8 imm:$src2))))]>,
+ Sched<[Sched]>, VEX, VEX_L;
+ def Ymi : AVX2AIi8<opc, MRMSrcMem, (outs VR256:$dst),
+ (ins i256mem:$src1, u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst,
+ (OpVT (X86VPermi (mem_frag addr:$src1),
+ (i8 imm:$src2))))]>,
+ Sched<[Sched.Folded, ReadAfterLd]>, VEX, VEX_L;
+}
+
+defm VPERMQ : avx2_perm_imm<0x00, "vpermq", loadv4i64, v4i64,
+ WriteShuffle256>, VEX_W;
+let ExeDomain = SSEPackedDouble in
+defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", loadv4f64, v4f64,
+ WriteFShuffle256>, VEX_W;
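+
+// For example, "vpermq $0x4e, %ymm0, %ymm1" (0x4e == 0b01001110) selects
+// qwords 2,3,0,1, i.e. it swaps the two 128-bit halves of %ymm0.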
+
+//===----------------------------------------------------------------------===//
+// VPERM2I128 - Permute Integer Values in 128-bit chunks
+//
+def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2, u8imm:$src3),
+ "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ [(set VR256:$dst, (v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2,
+ (i8 imm:$src3))))]>, Sched<[WriteShuffle256]>,
+ VEX_4V, VEX_L;
+def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst),
+ (ins VR256:$src1, f256mem:$src2, u8imm:$src3),
+ "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv4i64 addr:$src2),
+ (i8 imm:$src3)))]>,
+ Sched<[WriteShuffle256Ld, ReadAfterLd]>, VEX_4V, VEX_L;
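+
+// The immediate selects one of the four source 128-bit lanes per destination
+// half: bits [1:0] (low half) and [5:4] (high half) pick src1.lo, src1.hi,
+// src2.lo or src2.hi, while bits 3 and 7 zero the corresponding half instead.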
+
+let Predicates = [HasAVX2] in {
+def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+ (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>;
+def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+ (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>;
+def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+ (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>;
+
+def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, (bc_v32i8 (loadv4i64 addr:$src2)),
+ (i8 imm:$imm))),
+ (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>;
+def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1,
+ (bc_v16i16 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
+ (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>;
+def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)),
+ (i8 imm:$imm))),
+ (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>;
+}
+
+//===----------------------------------------------------------------------===//
+// VINSERTI128 - Insert packed integer values
+//
+let hasSideEffects = 0 in {
+def VINSERTI128rr : AVX2AIi8<0x38, MRMSrcReg, (outs VR256:$dst),
+ (ins VR256:$src1, VR128:$src2, u8imm:$src3),
+ "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ []>, Sched<[WriteShuffle256]>, VEX_4V, VEX_L;
+let mayLoad = 1 in
+def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst),
+ (ins VR256:$src1, i128mem:$src2, u8imm:$src3),
+ "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ []>, Sched<[WriteShuffle256Ld, ReadAfterLd]>, VEX_4V, VEX_L;
+}
+
+let Predicates = [HasAVX2, NoVLX] in {
+def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2),
+ (iPTR imm)),
+ (VINSERTI128rr VR256:$src1, VR128:$src2,
+ (INSERT_get_vinsert128_imm VR256:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2),
+ (iPTR imm)),
+ (VINSERTI128rr VR256:$src1, VR128:$src2,
+ (INSERT_get_vinsert128_imm VR256:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2),
+ (iPTR imm)),
+ (VINSERTI128rr VR256:$src1, VR128:$src2,
+ (INSERT_get_vinsert128_imm VR256:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2),
+ (iPTR imm)),
+ (VINSERTI128rr VR256:$src1, VR128:$src2,
+ (INSERT_get_vinsert128_imm VR256:$ins))>;
+
+def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (loadv2i64 addr:$src2),
+ (iPTR imm)),
+ (VINSERTI128rm VR256:$src1, addr:$src2,
+ (INSERT_get_vinsert128_imm VR256:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1),
+ (bc_v4i32 (loadv2i64 addr:$src2)),
+ (iPTR imm)),
+ (VINSERTI128rm VR256:$src1, addr:$src2,
+ (INSERT_get_vinsert128_imm VR256:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1),
+ (bc_v16i8 (loadv2i64 addr:$src2)),
+ (iPTR imm)),
+ (VINSERTI128rm VR256:$src1, addr:$src2,
+ (INSERT_get_vinsert128_imm VR256:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1),
+ (bc_v8i16 (loadv2i64 addr:$src2)),
+ (iPTR imm)),
+ (VINSERTI128rm VR256:$src1, addr:$src2,
+ (INSERT_get_vinsert128_imm VR256:$ins))>;
+}
+
+//===----------------------------------------------------------------------===//
+// VEXTRACTI128 - Extract packed integer values
+//
+def VEXTRACTI128rr : AVX2AIi8<0x39, MRMDestReg, (outs VR128:$dst),
+ (ins VR256:$src1, u8imm:$src2),
+ "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
+ Sched<[WriteShuffle256]>, VEX, VEX_L;
+let hasSideEffects = 0, mayStore = 1 in
+def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem, (outs),
+ (ins i128mem:$dst, VR256:$src1, u8imm:$src2),
+ "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
+ Sched<[WriteStore]>, VEX, VEX_L;
+
+let Predicates = [HasAVX2] in {
+def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
+ (v2i64 (VEXTRACTI128rr
+ (v4i64 VR256:$src1),
+ (EXTRACT_get_vextract128_imm VR128:$ext)))>;
+def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
+ (v4i32 (VEXTRACTI128rr
+ (v8i32 VR256:$src1),
+ (EXTRACT_get_vextract128_imm VR128:$ext)))>;
+def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
+ (v8i16 (VEXTRACTI128rr
+ (v16i16 VR256:$src1),
+ (EXTRACT_get_vextract128_imm VR128:$ext)))>;
+def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
+ (v16i8 (VEXTRACTI128rr
+ (v32i8 VR256:$src1),
+ (EXTRACT_get_vextract128_imm VR128:$ext)))>;
+
+def : Pat<(store (v2i64 (vextract128_extract:$ext (v4i64 VR256:$src1),
+ (iPTR imm))), addr:$dst),
+ (VEXTRACTI128mr addr:$dst, VR256:$src1,
+ (EXTRACT_get_vextract128_imm VR128:$ext))>;
+def : Pat<(store (v4i32 (vextract128_extract:$ext (v8i32 VR256:$src1),
+ (iPTR imm))), addr:$dst),
+ (VEXTRACTI128mr addr:$dst, VR256:$src1,
+ (EXTRACT_get_vextract128_imm VR128:$ext))>;
+def : Pat<(store (v8i16 (vextract128_extract:$ext (v16i16 VR256:$src1),
+ (iPTR imm))), addr:$dst),
+ (VEXTRACTI128mr addr:$dst, VR256:$src1,
+ (EXTRACT_get_vextract128_imm VR128:$ext))>;
+def : Pat<(store (v16i8 (vextract128_extract:$ext (v32i8 VR256:$src1),
+ (iPTR imm))), addr:$dst),
+ (VEXTRACTI128mr addr:$dst, VR256:$src1,
+ (EXTRACT_get_vextract128_imm VR128:$ext))>;
+}
+
+//===----------------------------------------------------------------------===//
+// VPMASKMOV - Conditional SIMD Integer Packed Loads and Stores
+//
+multiclass avx2_pmovmask<string OpcodeStr,
+ Intrinsic IntLd128, Intrinsic IntLd256,
+ Intrinsic IntSt128, Intrinsic IntSt256> {
+ def rm : AVX28I<0x8c, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst, (IntLd128 addr:$src2, VR128:$src1))]>, VEX_4V;
+ def Yrm : AVX28I<0x8c, MRMSrcMem, (outs VR256:$dst),
+ (ins VR256:$src1, i256mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
+ VEX_4V, VEX_L;
+ def mr : AVX28I<0x8e, MRMDestMem, (outs),
+ (ins i128mem:$dst, VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(IntSt128 addr:$dst, VR128:$src1, VR128:$src2)]>, VEX_4V;
+ def Ymr : AVX28I<0x8e, MRMDestMem, (outs),
+ (ins i256mem:$dst, VR256:$src1, VR256:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>, VEX_4V, VEX_L;
+}
+
+defm VPMASKMOVD : avx2_pmovmask<"vpmaskmovd",
+ int_x86_avx2_maskload_d,
+ int_x86_avx2_maskload_d_256,
+ int_x86_avx2_maskstore_d,
+ int_x86_avx2_maskstore_d_256>;
+defm VPMASKMOVQ : avx2_pmovmask<"vpmaskmovq",
+ int_x86_avx2_maskload_q,
+ int_x86_avx2_maskload_q_256,
+ int_x86_avx2_maskstore_q,
+ int_x86_avx2_maskstore_q_256>, VEX_W;
+
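+// vmaskmov/vpmaskmov zero the masked-off lanes, so a masked load whose
+// pass-through is undef or all-zeroes maps directly onto the load; any other
+// pass-through value is merged back in with a variable blend on the same mask.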
+def: Pat<(X86mstore addr:$ptr, (v8i32 VR256:$mask), (v8f32 VR256:$src)),
+ (VMASKMOVPSYmr addr:$ptr, VR256:$mask, VR256:$src)>;
+
+def: Pat<(X86mstore addr:$ptr, (v8i32 VR256:$mask), (v8i32 VR256:$src)),
+ (VPMASKMOVDYmr addr:$ptr, VR256:$mask, VR256:$src)>;
+
+def: Pat<(X86mstore addr:$ptr, (v4i32 VR128:$mask), (v4f32 VR128:$src)),
+ (VMASKMOVPSmr addr:$ptr, VR128:$mask, VR128:$src)>;
+
+def: Pat<(X86mstore addr:$ptr, (v4i32 VR128:$mask), (v4i32 VR128:$src)),
+ (VPMASKMOVDmr addr:$ptr, VR128:$mask, VR128:$src)>;
+
+def: Pat<(v8f32 (masked_load addr:$ptr, (v8i32 VR256:$mask), undef)),
+ (VMASKMOVPSYrm VR256:$mask, addr:$ptr)>;
+
+def: Pat<(v8f32 (masked_load addr:$ptr, (v8i32 VR256:$mask),
+ (bc_v8f32 (v8i32 immAllZerosV)))),
+ (VMASKMOVPSYrm VR256:$mask, addr:$ptr)>;
+
+def: Pat<(v8f32 (masked_load addr:$ptr, (v8i32 VR256:$mask), (v8f32 VR256:$src0))),
+ (VBLENDVPSYrr VR256:$src0, (VMASKMOVPSYrm VR256:$mask, addr:$ptr),
+ VR256:$mask)>;
+
+def: Pat<(v8i32 (masked_load addr:$ptr, (v8i32 VR256:$mask), undef)),
+ (VPMASKMOVDYrm VR256:$mask, addr:$ptr)>;
+
+def: Pat<(v8i32 (masked_load addr:$ptr, (v8i32 VR256:$mask), (v8i32 immAllZerosV))),
+ (VPMASKMOVDYrm VR256:$mask, addr:$ptr)>;
+
+def: Pat<(v8i32 (masked_load addr:$ptr, (v8i32 VR256:$mask), (v8i32 VR256:$src0))),
+ (VBLENDVPSYrr VR256:$src0, (VPMASKMOVDYrm VR256:$mask, addr:$ptr),
+ VR256:$mask)>;
+
+def: Pat<(v4f32 (masked_load addr:$ptr, (v4i32 VR128:$mask), undef)),
+ (VMASKMOVPSrm VR128:$mask, addr:$ptr)>;
+
+def: Pat<(v4f32 (masked_load addr:$ptr, (v4i32 VR128:$mask),
+ (bc_v4f32 (v4i32 immAllZerosV)))),
+ (VMASKMOVPSrm VR128:$mask, addr:$ptr)>;
+
+def: Pat<(v4f32 (masked_load addr:$ptr, (v4i32 VR128:$mask), (v4f32 VR128:$src0))),
+ (VBLENDVPSrr VR128:$src0, (VMASKMOVPSrm VR128:$mask, addr:$ptr),
+ VR128:$mask)>;
+
+def: Pat<(v4i32 (masked_load addr:$ptr, (v4i32 VR128:$mask), undef)),
+ (VPMASKMOVDrm VR128:$mask, addr:$ptr)>;
+
+def: Pat<(v4i32 (masked_load addr:$ptr, (v4i32 VR128:$mask), (v4i32 immAllZerosV))),
+ (VPMASKMOVDrm VR128:$mask, addr:$ptr)>;
+
+def: Pat<(v4i32 (masked_load addr:$ptr, (v4i32 VR128:$mask), (v4i32 VR128:$src0))),
+ (VBLENDVPSrr VR128:$src0, (VPMASKMOVDrm VR128:$mask, addr:$ptr),
+ VR128:$mask)>;
+
+def: Pat<(X86mstore addr:$ptr, (v4i64 VR256:$mask), (v4f64 VR256:$src)),
+ (VMASKMOVPDYmr addr:$ptr, VR256:$mask, VR256:$src)>;
+
+def: Pat<(X86mstore addr:$ptr, (v4i64 VR256:$mask), (v4i64 VR256:$src)),
+ (VPMASKMOVQYmr addr:$ptr, VR256:$mask, VR256:$src)>;
+
+def: Pat<(v4f64 (masked_load addr:$ptr, (v4i64 VR256:$mask), undef)),
+ (VMASKMOVPDYrm VR256:$mask, addr:$ptr)>;
+
+def: Pat<(v4f64 (masked_load addr:$ptr, (v4i64 VR256:$mask),
+ (v4f64 immAllZerosV))),
+ (VMASKMOVPDYrm VR256:$mask, addr:$ptr)>;
+
+def: Pat<(v4f64 (masked_load addr:$ptr, (v4i64 VR256:$mask), (v4f64 VR256:$src0))),
+ (VBLENDVPDYrr VR256:$src0, (VMASKMOVPDYrm VR256:$mask, addr:$ptr),
+ VR256:$mask)>;
+
+def: Pat<(v4i64 (masked_load addr:$ptr, (v4i64 VR256:$mask), undef)),
+ (VPMASKMOVQYrm VR256:$mask, addr:$ptr)>;
+
+def: Pat<(v4i64 (masked_load addr:$ptr, (v4i64 VR256:$mask),
+ (bc_v4i64 (v8i32 immAllZerosV)))),
+ (VPMASKMOVQYrm VR256:$mask, addr:$ptr)>;
+
+def: Pat<(v4i64 (masked_load addr:$ptr, (v4i64 VR256:$mask), (v4i64 VR256:$src0))),
+ (VBLENDVPDYrr VR256:$src0, (VPMASKMOVQYrm VR256:$mask, addr:$ptr),
+ VR256:$mask)>;
+
+def: Pat<(X86mstore addr:$ptr, (v2i64 VR128:$mask), (v2f64 VR128:$src)),
+ (VMASKMOVPDmr addr:$ptr, VR128:$mask, VR128:$src)>;
+
+def: Pat<(X86mstore addr:$ptr, (v2i64 VR128:$mask), (v2i64 VR128:$src)),
+ (VPMASKMOVQmr addr:$ptr, VR128:$mask, VR128:$src)>;
+
+def: Pat<(v2f64 (masked_load addr:$ptr, (v2i64 VR128:$mask), undef)),
+ (VMASKMOVPDrm VR128:$mask, addr:$ptr)>;
+
+def: Pat<(v2f64 (masked_load addr:$ptr, (v2i64 VR128:$mask),
+ (v2f64 immAllZerosV))),
+ (VMASKMOVPDrm VR128:$mask, addr:$ptr)>;
+
+def: Pat<(v2f64 (masked_load addr:$ptr, (v2i64 VR128:$mask), (v2f64 VR128:$src0))),
+ (VBLENDVPDrr VR128:$src0, (VMASKMOVPDrm VR128:$mask, addr:$ptr),
+ VR128:$mask)>;
+
+def: Pat<(v2i64 (masked_load addr:$ptr, (v2i64 VR128:$mask), undef)),
+ (VPMASKMOVQrm VR128:$mask, addr:$ptr)>;
+
+def: Pat<(v2i64 (masked_load addr:$ptr, (v2i64 VR128:$mask),
+ (bc_v2i64 (v4i32 immAllZerosV)))),
+ (VPMASKMOVQrm VR128:$mask, addr:$ptr)>;
+
+def: Pat<(v2i64 (masked_load addr:$ptr, (v2i64 VR128:$mask), (v2i64 VR128:$src0))),
+ (VBLENDVPDrr VR128:$src0, (VPMASKMOVQrm VR128:$mask, addr:$ptr),
+ VR128:$mask)>;
+
+//===----------------------------------------------------------------------===//
+// Variable Bit Shifts
+//
+multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ ValueType vt128, ValueType vt256> {
+ def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst,
+ (vt128 (OpNode VR128:$src1, (vt128 VR128:$src2))))]>,
+ VEX_4V, Sched<[WriteVarVecShift]>;
+ def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst,
+ (vt128 (OpNode VR128:$src1,
+ (vt128 (bitconvert (loadv2i64 addr:$src2))))))]>,
+ VEX_4V, Sched<[WriteVarVecShiftLd, ReadAfterLd]>;
+ def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst,
+ (vt256 (OpNode VR256:$src1, (vt256 VR256:$src2))))]>,
+ VEX_4V, VEX_L, Sched<[WriteVarVecShift]>;
+ def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
+ (ins VR256:$src1, i256mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst,
+ (vt256 (OpNode VR256:$src1,
+ (vt256 (bitconvert (loadv4i64 addr:$src2))))))]>,
+ VEX_4V, VEX_L, Sched<[WriteVarVecShiftLd, ReadAfterLd]>;
+}
+
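+// Each element is shifted by its own per-element count. Counts of at least
+// the element width produce zero for the logical forms, while vpsravd fills
+// with the sign bit.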
+let Predicates = [HasAVX2, NoVLX] in {
+ defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", shl, v4i32, v8i32>;
+ defm VPSLLVQ : avx2_var_shift<0x47, "vpsllvq", shl, v2i64, v4i64>, VEX_W;
+ defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", srl, v4i32, v8i32>;
+ defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", srl, v2i64, v4i64>, VEX_W;
+ defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", sra, v4i32, v8i32>;
+}
+
+//===----------------------------------------------------------------------===//
+// VGATHER - GATHER Operations
+//
+multiclass avx2_gather<bits<8> opc, string OpcodeStr, RegisterClass RC256,
+ X86MemOperand memop128, X86MemOperand memop256> {
+ def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst, VR128:$mask_wb),
+ (ins VR128:$src1, memop128:$src2, VR128:$mask),
+ !strconcat(OpcodeStr,
+ "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
+ []>, VEX_4VOp3;
+ def Yrm : AVX28I<opc, MRMSrcMem, (outs RC256:$dst, RC256:$mask_wb),
+ (ins RC256:$src1, memop256:$src2, RC256:$mask),
+ !strconcat(OpcodeStr,
+ "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
+ []>, VEX_4VOp3, VEX_L;
+}
+
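+// Gathers write the mask back as elements complete so a faulting gather can
+// be resumed; hence $mask is tied to the $mask_wb output, and both outputs
+// are marked @earlyclobber so that neither can be allocated over the vector
+// index operand.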
+let mayLoad = 1, Constraints
+  = "@earlyclobber $dst, @earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb"
+  in {
+ defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq", VR256, vx64mem, vx64mem>, VEX_W;
+ defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq", VR256, vx64mem, vy64mem>, VEX_W;
+ defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd", VR256, vx32mem, vy32mem>;
+ defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd", VR128, vx32mem, vy32mem>;
+
+ let ExeDomain = SSEPackedDouble in {
+ defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", VR256, vx64mem, vx64mem>, VEX_W;
+ defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", VR256, vx64mem, vy64mem>, VEX_W;
+ }
+
+ let ExeDomain = SSEPackedSingle in {
+ defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", VR256, vx32mem, vy32mem>;
+ defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", VR128, vx32mem, vy32mem>;
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// Extra selection patterns for FR128, f128, f128mem
+
+// movaps is shorter than movdqa. movaps is in SSE and movdqa is in SSE2.
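+// (movaps xmm, xmm/m128 encodes as 0F 28 /r, while movdqa needs a 66 prefix,
+// 66 0F 6F /r, and is one byte longer.)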
+def : Pat<(store (f128 FR128:$src), addr:$dst),
+ (MOVAPSmr addr:$dst, (COPY_TO_REGCLASS (f128 FR128:$src), VR128))>;
+
+def : Pat<(loadf128 addr:$src),
+ (COPY_TO_REGCLASS (MOVAPSrm addr:$src), FR128)>;
+
+// andps is shorter than andpd or pand. andps is in SSE and andpd/pand are
+// in SSE2.
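+// (andps is 0F 54 /r; andpd and pand carry a 66 prefix and are a byte longer.)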
+def : Pat<(X86fand FR128:$src1, (loadf128 addr:$src2)),
+ (COPY_TO_REGCLASS
+ (ANDPSrm (COPY_TO_REGCLASS FR128:$src1, VR128), f128mem:$src2),
+ FR128)>;
+
+def : Pat<(X86fand FR128:$src1, FR128:$src2),
+ (COPY_TO_REGCLASS
+ (ANDPSrr (COPY_TO_REGCLASS FR128:$src1, VR128),
+ (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>;
+
+def : Pat<(and FR128:$src1, FR128:$src2),
+ (COPY_TO_REGCLASS
+ (ANDPSrr (COPY_TO_REGCLASS FR128:$src1, VR128),
+ (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>;
+
+def : Pat<(X86for FR128:$src1, (loadf128 addr:$src2)),
+ (COPY_TO_REGCLASS
+ (ORPSrm (COPY_TO_REGCLASS FR128:$src1, VR128), f128mem:$src2),
+ FR128)>;
+
+def : Pat<(X86for FR128:$src1, FR128:$src2),
+ (COPY_TO_REGCLASS
+ (ORPSrr (COPY_TO_REGCLASS FR128:$src1, VR128),
+ (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>;
+
+def : Pat<(or FR128:$src1, FR128:$src2),
+ (COPY_TO_REGCLASS
+ (ORPSrr (COPY_TO_REGCLASS FR128:$src1, VR128),
+ (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>;
+
+def : Pat<(X86fxor FR128:$src1, (loadf128 addr:$src2)),
+ (COPY_TO_REGCLASS
+ (XORPSrm (COPY_TO_REGCLASS FR128:$src1, VR128), f128mem:$src2),
+ FR128)>;
+
+def : Pat<(X86fxor FR128:$src1, FR128:$src2),
+ (COPY_TO_REGCLASS
+ (XORPSrr (COPY_TO_REGCLASS FR128:$src1, VR128),
+ (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>;
+
+def : Pat<(xor FR128:$src1, FR128:$src2),
+ (COPY_TO_REGCLASS
+ (XORPSrr (COPY_TO_REGCLASS FR128:$src1, VR128),
+ (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>;
diff --git a/contrib/llvm/lib/Target/X86/X86InstrSVM.td b/contrib/llvm/lib/Target/X86/X86InstrSVM.td
new file mode 100644
index 0000000..c847be7e
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrSVM.td
@@ -0,0 +1,62 @@
+//===-- X86InstrSVM.td - SVM Instruction Set Extension -----*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the instructions that make up the AMD SVM instruction
+// set.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// SVM instructions
+
+// 0F 01 D9
+def VMMCALL : I<0x01, MRM_D9, (outs), (ins), "vmmcall", []>, TB;
+
+// 0F 01 DC
+def STGI : I<0x01, MRM_DC, (outs), (ins), "stgi", []>, TB;
+
+// 0F 01 DD
+def CLGI : I<0x01, MRM_DD, (outs), (ins), "clgi", []>, TB;
+
+// 0F 01 DE
+let Uses = [EAX] in
+def SKINIT : I<0x01, MRM_DE, (outs), (ins), "skinit\t{%eax|eax}", []>, TB;
+
+// 0F 01 D8
+let Uses = [EAX] in
+def VMRUN32 : I<0x01, MRM_D8, (outs), (ins),
+ "vmrun\t{%eax|eax}", []>, TB, Requires<[Not64BitMode]>;
+let Uses = [RAX] in
+def VMRUN64 : I<0x01, MRM_D8, (outs), (ins),
+ "vmrun\t{%rax|rax}", []>, TB, Requires<[In64BitMode]>;
+
+// 0F 01 DA
+let Uses = [EAX] in
+def VMLOAD32 : I<0x01, MRM_DA, (outs), (ins),
+ "vmload\t{%eax|eax}", []>, TB, Requires<[Not64BitMode]>;
+let Uses = [RAX] in
+def VMLOAD64 : I<0x01, MRM_DA, (outs), (ins),
+ "vmload\t{%rax|rax}", []>, TB, Requires<[In64BitMode]>;
+
+// 0F 01 DB
+let Uses = [EAX] in
+def VMSAVE32 : I<0x01, MRM_DB, (outs), (ins),
+ "vmsave\t{%eax|eax}", []>, TB, Requires<[Not64BitMode]>;
+let Uses = [RAX] in
+def VMSAVE64 : I<0x01, MRM_DB, (outs), (ins),
+ "vmsave\t{%rax|rax}", []>, TB, Requires<[In64BitMode]>;
+
+// 0F 01 DF
+let Uses = [EAX, ECX] in
+def INVLPGA32 : I<0x01, MRM_DF, (outs), (ins),
+ "invlpga\t{%ecx, %eax|eax, ecx}", []>, TB, Requires<[Not64BitMode]>;
+let Uses = [RAX, ECX] in
+def INVLPGA64 : I<0x01, MRM_DF, (outs), (ins),
+ "invlpga\t{%ecx, %rax|rax, ecx}", []>, TB, Requires<[In64BitMode]>;
+
diff --git a/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td b/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td
new file mode 100644
index 0000000..c1df978
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td
@@ -0,0 +1,969 @@
+//===-- X86InstrShiftRotate.td - Shift and Rotate Instrs ---*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the shift and rotate instructions.
+//
+//===----------------------------------------------------------------------===//
+
+// FIXME: Someone needs to smear multipattern goodness all over this file.
+
+let Defs = [EFLAGS] in {
+
+let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in {
+let Uses = [CL] in {
+def SHL8rCL : I<0xD2, MRM4r, (outs GR8 :$dst), (ins GR8 :$src1),
+ "shl{b}\t{%cl, $dst|$dst, cl}",
+ [(set GR8:$dst, (shl GR8:$src1, CL))], IIC_SR>;
+def SHL16rCL : I<0xD3, MRM4r, (outs GR16:$dst), (ins GR16:$src1),
+ "shl{w}\t{%cl, $dst|$dst, cl}",
+ [(set GR16:$dst, (shl GR16:$src1, CL))], IIC_SR>, OpSize16;
+def SHL32rCL : I<0xD3, MRM4r, (outs GR32:$dst), (ins GR32:$src1),
+ "shl{l}\t{%cl, $dst|$dst, cl}",
+ [(set GR32:$dst, (shl GR32:$src1, CL))], IIC_SR>, OpSize32;
+def SHL64rCL : RI<0xD3, MRM4r, (outs GR64:$dst), (ins GR64:$src1),
+ "shl{q}\t{%cl, $dst|$dst, cl}",
+ [(set GR64:$dst, (shl GR64:$src1, CL))], IIC_SR>;
+} // Uses = [CL]
+
+def SHL8ri : Ii8<0xC0, MRM4r, (outs GR8 :$dst), (ins GR8 :$src1, u8imm:$src2),
+ "shl{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (shl GR8:$src1, (i8 imm:$src2)))], IIC_SR>;
+
+let isConvertibleToThreeAddress = 1 in { // Can transform into LEA.
+def SHL16ri : Ii8<0xC1, MRM4r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2),
+ "shl{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (shl GR16:$src1, (i8 imm:$src2)))], IIC_SR>,
+ OpSize16;
+def SHL32ri : Ii8<0xC1, MRM4r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$src2),
+ "shl{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (shl GR32:$src1, (i8 imm:$src2)))], IIC_SR>,
+ OpSize32;
+def SHL64ri : RIi8<0xC1, MRM4r, (outs GR64:$dst),
+ (ins GR64:$src1, u8imm:$src2),
+ "shl{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (shl GR64:$src1, (i8 imm:$src2)))],
+ IIC_SR>;
+} // isConvertibleToThreeAddress = 1
+
+// NOTE: We don't include patterns for shifts of a register by one, because
+// 'add reg,reg' is cheaper (and we have a Pat pattern for shift-by-one).
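+// (e.g. "shl $1, %eax" is selected as "add %eax, %eax" instead.)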
+let hasSideEffects = 0 in {
+def SHL8r1 : I<0xD0, MRM4r, (outs GR8:$dst), (ins GR8:$src1),
+ "shl{b}\t$dst", [], IIC_SR>;
+def SHL16r1 : I<0xD1, MRM4r, (outs GR16:$dst), (ins GR16:$src1),
+ "shl{w}\t$dst", [], IIC_SR>, OpSize16;
+def SHL32r1 : I<0xD1, MRM4r, (outs GR32:$dst), (ins GR32:$src1),
+ "shl{l}\t$dst", [], IIC_SR>, OpSize32;
+def SHL64r1 : RI<0xD1, MRM4r, (outs GR64:$dst), (ins GR64:$src1),
+ "shl{q}\t$dst", [], IIC_SR>;
+} // hasSideEffects = 0
+} // Constraints = "$src = $dst", SchedRW
+
+
+let SchedRW = [WriteShiftLd, WriteRMW] in {
+// FIXME: Why do we need an explicit "Uses = [CL]" when the instr has a pattern
+// using CL?
+let Uses = [CL] in {
+def SHL8mCL : I<0xD2, MRM4m, (outs), (ins i8mem :$dst),
+ "shl{b}\t{%cl, $dst|$dst, cl}",
+ [(store (shl (loadi8 addr:$dst), CL), addr:$dst)], IIC_SR>;
+def SHL16mCL : I<0xD3, MRM4m, (outs), (ins i16mem:$dst),
+ "shl{w}\t{%cl, $dst|$dst, cl}",
+ [(store (shl (loadi16 addr:$dst), CL), addr:$dst)], IIC_SR>,
+ OpSize16;
+def SHL32mCL : I<0xD3, MRM4m, (outs), (ins i32mem:$dst),
+ "shl{l}\t{%cl, $dst|$dst, cl}",
+ [(store (shl (loadi32 addr:$dst), CL), addr:$dst)], IIC_SR>,
+ OpSize32;
+def SHL64mCL : RI<0xD3, MRM4m, (outs), (ins i64mem:$dst),
+ "shl{q}\t{%cl, $dst|$dst, cl}",
+ [(store (shl (loadi64 addr:$dst), CL), addr:$dst)], IIC_SR>;
+}
+def SHL8mi : Ii8<0xC0, MRM4m, (outs), (ins i8mem :$dst, u8imm:$src),
+ "shl{b}\t{$src, $dst|$dst, $src}",
+ [(store (shl (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)],
+ IIC_SR>;
+def SHL16mi : Ii8<0xC1, MRM4m, (outs), (ins i16mem:$dst, u8imm:$src),
+ "shl{w}\t{$src, $dst|$dst, $src}",
+ [(store (shl (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)],
+ IIC_SR>, OpSize16;
+def SHL32mi : Ii8<0xC1, MRM4m, (outs), (ins i32mem:$dst, u8imm:$src),
+ "shl{l}\t{$src, $dst|$dst, $src}",
+ [(store (shl (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)],
+ IIC_SR>, OpSize32;
+def SHL64mi : RIi8<0xC1, MRM4m, (outs), (ins i64mem:$dst, u8imm:$src),
+ "shl{q}\t{$src, $dst|$dst, $src}",
+ [(store (shl (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)],
+ IIC_SR>;
+
+// Shift by 1
+def SHL8m1 : I<0xD0, MRM4m, (outs), (ins i8mem :$dst),
+ "shl{b}\t$dst",
+ [(store (shl (loadi8 addr:$dst), (i8 1)), addr:$dst)],
+ IIC_SR>;
+def SHL16m1 : I<0xD1, MRM4m, (outs), (ins i16mem:$dst),
+ "shl{w}\t$dst",
+ [(store (shl (loadi16 addr:$dst), (i8 1)), addr:$dst)],
+ IIC_SR>, OpSize16;
+def SHL32m1 : I<0xD1, MRM4m, (outs), (ins i32mem:$dst),
+ "shl{l}\t$dst",
+ [(store (shl (loadi32 addr:$dst), (i8 1)), addr:$dst)],
+ IIC_SR>, OpSize32;
+def SHL64m1 : RI<0xD1, MRM4m, (outs), (ins i64mem:$dst),
+ "shl{q}\t$dst",
+ [(store (shl (loadi64 addr:$dst), (i8 1)), addr:$dst)],
+ IIC_SR>;
+} // SchedRW
+
+let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in {
+let Uses = [CL] in {
+def SHR8rCL : I<0xD2, MRM5r, (outs GR8 :$dst), (ins GR8 :$src1),
+ "shr{b}\t{%cl, $dst|$dst, cl}",
+ [(set GR8:$dst, (srl GR8:$src1, CL))], IIC_SR>;
+def SHR16rCL : I<0xD3, MRM5r, (outs GR16:$dst), (ins GR16:$src1),
+ "shr{w}\t{%cl, $dst|$dst, cl}",
+ [(set GR16:$dst, (srl GR16:$src1, CL))], IIC_SR>, OpSize16;
+def SHR32rCL : I<0xD3, MRM5r, (outs GR32:$dst), (ins GR32:$src1),
+ "shr{l}\t{%cl, $dst|$dst, cl}",
+ [(set GR32:$dst, (srl GR32:$src1, CL))], IIC_SR>, OpSize32;
+def SHR64rCL : RI<0xD3, MRM5r, (outs GR64:$dst), (ins GR64:$src1),
+ "shr{q}\t{%cl, $dst|$dst, cl}",
+ [(set GR64:$dst, (srl GR64:$src1, CL))], IIC_SR>;
+}
+
+def SHR8ri : Ii8<0xC0, MRM5r, (outs GR8:$dst), (ins GR8:$src1, u8imm:$src2),
+ "shr{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (srl GR8:$src1, (i8 imm:$src2)))], IIC_SR>;
+def SHR16ri : Ii8<0xC1, MRM5r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2),
+ "shr{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (srl GR16:$src1, (i8 imm:$src2)))],
+ IIC_SR>, OpSize16;
+def SHR32ri : Ii8<0xC1, MRM5r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$src2),
+ "shr{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (srl GR32:$src1, (i8 imm:$src2)))],
+ IIC_SR>, OpSize32;
+def SHR64ri : RIi8<0xC1, MRM5r, (outs GR64:$dst), (ins GR64:$src1, u8imm:$src2),
+ "shr{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (srl GR64:$src1, (i8 imm:$src2)))], IIC_SR>;
+
+// Shift right by 1
+def SHR8r1 : I<0xD0, MRM5r, (outs GR8:$dst), (ins GR8:$src1),
+ "shr{b}\t$dst",
+ [(set GR8:$dst, (srl GR8:$src1, (i8 1)))], IIC_SR>;
+def SHR16r1 : I<0xD1, MRM5r, (outs GR16:$dst), (ins GR16:$src1),
+ "shr{w}\t$dst",
+ [(set GR16:$dst, (srl GR16:$src1, (i8 1)))], IIC_SR>, OpSize16;
+def SHR32r1 : I<0xD1, MRM5r, (outs GR32:$dst), (ins GR32:$src1),
+ "shr{l}\t$dst",
+ [(set GR32:$dst, (srl GR32:$src1, (i8 1)))], IIC_SR>, OpSize32;
+def SHR64r1 : RI<0xD1, MRM5r, (outs GR64:$dst), (ins GR64:$src1),
+ "shr{q}\t$dst",
+ [(set GR64:$dst, (srl GR64:$src1, (i8 1)))], IIC_SR>;
+} // Constraints = "$src = $dst", SchedRW
+
+
+let SchedRW = [WriteShiftLd, WriteRMW] in {
+let Uses = [CL] in {
+def SHR8mCL : I<0xD2, MRM5m, (outs), (ins i8mem :$dst),
+ "shr{b}\t{%cl, $dst|$dst, cl}",
+ [(store (srl (loadi8 addr:$dst), CL), addr:$dst)], IIC_SR>;
+def SHR16mCL : I<0xD3, MRM5m, (outs), (ins i16mem:$dst),
+ "shr{w}\t{%cl, $dst|$dst, cl}",
+ [(store (srl (loadi16 addr:$dst), CL), addr:$dst)], IIC_SR>,
+ OpSize16;
+def SHR32mCL : I<0xD3, MRM5m, (outs), (ins i32mem:$dst),
+ "shr{l}\t{%cl, $dst|$dst, cl}",
+ [(store (srl (loadi32 addr:$dst), CL), addr:$dst)], IIC_SR>,
+ OpSize32;
+def SHR64mCL : RI<0xD3, MRM5m, (outs), (ins i64mem:$dst),
+ "shr{q}\t{%cl, $dst|$dst, cl}",
+ [(store (srl (loadi64 addr:$dst), CL), addr:$dst)], IIC_SR>;
+}
+def SHR8mi : Ii8<0xC0, MRM5m, (outs), (ins i8mem :$dst, u8imm:$src),
+ "shr{b}\t{$src, $dst|$dst, $src}",
+ [(store (srl (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)],
+ IIC_SR>;
+def SHR16mi : Ii8<0xC1, MRM5m, (outs), (ins i16mem:$dst, u8imm:$src),
+ "shr{w}\t{$src, $dst|$dst, $src}",
+ [(store (srl (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)],
+ IIC_SR>, OpSize16;
+def SHR32mi : Ii8<0xC1, MRM5m, (outs), (ins i32mem:$dst, u8imm:$src),
+ "shr{l}\t{$src, $dst|$dst, $src}",
+ [(store (srl (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)],
+ IIC_SR>, OpSize32;
+def SHR64mi : RIi8<0xC1, MRM5m, (outs), (ins i64mem:$dst, u8imm:$src),
+ "shr{q}\t{$src, $dst|$dst, $src}",
+ [(store (srl (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)],
+ IIC_SR>;
+
+// Shift by 1
+def SHR8m1 : I<0xD0, MRM5m, (outs), (ins i8mem :$dst),
+ "shr{b}\t$dst",
+ [(store (srl (loadi8 addr:$dst), (i8 1)), addr:$dst)],
+ IIC_SR>;
+def SHR16m1 : I<0xD1, MRM5m, (outs), (ins i16mem:$dst),
+ "shr{w}\t$dst",
+ [(store (srl (loadi16 addr:$dst), (i8 1)), addr:$dst)],
+ IIC_SR>, OpSize16;
+def SHR32m1 : I<0xD1, MRM5m, (outs), (ins i32mem:$dst),
+ "shr{l}\t$dst",
+ [(store (srl (loadi32 addr:$dst), (i8 1)), addr:$dst)],
+ IIC_SR>, OpSize32;
+def SHR64m1 : RI<0xD1, MRM5m, (outs), (ins i64mem:$dst),
+ "shr{q}\t$dst",
+ [(store (srl (loadi64 addr:$dst), (i8 1)), addr:$dst)],
+ IIC_SR>;
+} // SchedRW
+
+let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in {
+let Uses = [CL] in {
+def SAR8rCL : I<0xD2, MRM7r, (outs GR8 :$dst), (ins GR8 :$src1),
+ "sar{b}\t{%cl, $dst|$dst, cl}",
+ [(set GR8:$dst, (sra GR8:$src1, CL))],
+ IIC_SR>;
+def SAR16rCL : I<0xD3, MRM7r, (outs GR16:$dst), (ins GR16:$src1),
+ "sar{w}\t{%cl, $dst|$dst, cl}",
+ [(set GR16:$dst, (sra GR16:$src1, CL))],
+ IIC_SR>, OpSize16;
+def SAR32rCL : I<0xD3, MRM7r, (outs GR32:$dst), (ins GR32:$src1),
+ "sar{l}\t{%cl, $dst|$dst, cl}",
+ [(set GR32:$dst, (sra GR32:$src1, CL))],
+ IIC_SR>, OpSize32;
+def SAR64rCL : RI<0xD3, MRM7r, (outs GR64:$dst), (ins GR64:$src1),
+ "sar{q}\t{%cl, $dst|$dst, cl}",
+ [(set GR64:$dst, (sra GR64:$src1, CL))],
+ IIC_SR>;
+}
+
+def SAR8ri : Ii8<0xC0, MRM7r, (outs GR8 :$dst), (ins GR8 :$src1, u8imm:$src2),
+ "sar{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (sra GR8:$src1, (i8 imm:$src2)))],
+ IIC_SR>;
+def SAR16ri : Ii8<0xC1, MRM7r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2),
+ "sar{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (sra GR16:$src1, (i8 imm:$src2)))],
+ IIC_SR>, OpSize16;
+def SAR32ri : Ii8<0xC1, MRM7r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$src2),
+ "sar{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (sra GR32:$src1, (i8 imm:$src2)))],
+ IIC_SR>, OpSize32;
+def SAR64ri : RIi8<0xC1, MRM7r, (outs GR64:$dst),
+ (ins GR64:$src1, u8imm:$src2),
+ "sar{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (sra GR64:$src1, (i8 imm:$src2)))],
+ IIC_SR>;
+
+// Shift by 1
+def SAR8r1 : I<0xD0, MRM7r, (outs GR8 :$dst), (ins GR8 :$src1),
+ "sar{b}\t$dst",
+ [(set GR8:$dst, (sra GR8:$src1, (i8 1)))],
+ IIC_SR>;
+def SAR16r1 : I<0xD1, MRM7r, (outs GR16:$dst), (ins GR16:$src1),
+ "sar{w}\t$dst",
+ [(set GR16:$dst, (sra GR16:$src1, (i8 1)))],
+ IIC_SR>, OpSize16;
+def SAR32r1 : I<0xD1, MRM7r, (outs GR32:$dst), (ins GR32:$src1),
+ "sar{l}\t$dst",
+ [(set GR32:$dst, (sra GR32:$src1, (i8 1)))],
+ IIC_SR>, OpSize32;
+def SAR64r1 : RI<0xD1, MRM7r, (outs GR64:$dst), (ins GR64:$src1),
+ "sar{q}\t$dst",
+ [(set GR64:$dst, (sra GR64:$src1, (i8 1)))],
+ IIC_SR>;
+} // Constraints = "$src = $dst", SchedRW
+
+
+let SchedRW = [WriteShiftLd, WriteRMW] in {
+let Uses = [CL] in {
+def SAR8mCL : I<0xD2, MRM7m, (outs), (ins i8mem :$dst),
+ "sar{b}\t{%cl, $dst|$dst, cl}",
+ [(store (sra (loadi8 addr:$dst), CL), addr:$dst)],
+ IIC_SR>;
+def SAR16mCL : I<0xD3, MRM7m, (outs), (ins i16mem:$dst),
+ "sar{w}\t{%cl, $dst|$dst, cl}",
+ [(store (sra (loadi16 addr:$dst), CL), addr:$dst)],
+ IIC_SR>, OpSize16;
+def SAR32mCL : I<0xD3, MRM7m, (outs), (ins i32mem:$dst),
+ "sar{l}\t{%cl, $dst|$dst, cl}",
+ [(store (sra (loadi32 addr:$dst), CL), addr:$dst)],
+ IIC_SR>, OpSize32;
+def SAR64mCL : RI<0xD3, MRM7m, (outs), (ins i64mem:$dst),
+ "sar{q}\t{%cl, $dst|$dst, cl}",
+ [(store (sra (loadi64 addr:$dst), CL), addr:$dst)],
+ IIC_SR>;
+}
+def SAR8mi : Ii8<0xC0, MRM7m, (outs), (ins i8mem :$dst, u8imm:$src),
+ "sar{b}\t{$src, $dst|$dst, $src}",
+ [(store (sra (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)],
+ IIC_SR>;
+def SAR16mi : Ii8<0xC1, MRM7m, (outs), (ins i16mem:$dst, u8imm:$src),
+ "sar{w}\t{$src, $dst|$dst, $src}",
+ [(store (sra (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)],
+ IIC_SR>, OpSize16;
+def SAR32mi : Ii8<0xC1, MRM7m, (outs), (ins i32mem:$dst, u8imm:$src),
+ "sar{l}\t{$src, $dst|$dst, $src}",
+ [(store (sra (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)],
+ IIC_SR>, OpSize32;
+def SAR64mi : RIi8<0xC1, MRM7m, (outs), (ins i64mem:$dst, u8imm:$src),
+ "sar{q}\t{$src, $dst|$dst, $src}",
+ [(store (sra (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)],
+ IIC_SR>;
+
+// Shift by 1
+def SAR8m1 : I<0xD0, MRM7m, (outs), (ins i8mem :$dst),
+ "sar{b}\t$dst",
+ [(store (sra (loadi8 addr:$dst), (i8 1)), addr:$dst)],
+ IIC_SR>;
+def SAR16m1 : I<0xD1, MRM7m, (outs), (ins i16mem:$dst),
+ "sar{w}\t$dst",
+ [(store (sra (loadi16 addr:$dst), (i8 1)), addr:$dst)],
+ IIC_SR>, OpSize16;
+def SAR32m1 : I<0xD1, MRM7m, (outs), (ins i32mem:$dst),
+ "sar{l}\t$dst",
+ [(store (sra (loadi32 addr:$dst), (i8 1)), addr:$dst)],
+ IIC_SR>, OpSize32;
+def SAR64m1 : RI<0xD1, MRM7m, (outs), (ins i64mem:$dst),
+ "sar{q}\t$dst",
+ [(store (sra (loadi64 addr:$dst), (i8 1)), addr:$dst)],
+ IIC_SR>;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Rotate instructions
+//===----------------------------------------------------------------------===//
+
+let hasSideEffects = 0 in {
+let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in {
+def RCL8r1 : I<0xD0, MRM2r, (outs GR8:$dst), (ins GR8:$src1),
+ "rcl{b}\t$dst", [], IIC_SR>;
+def RCL8ri : Ii8<0xC0, MRM2r, (outs GR8:$dst), (ins GR8:$src1, u8imm:$cnt),
+ "rcl{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
+let Uses = [CL] in
+def RCL8rCL : I<0xD2, MRM2r, (outs GR8:$dst), (ins GR8:$src1),
+ "rcl{b}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
+
+def RCL16r1 : I<0xD1, MRM2r, (outs GR16:$dst), (ins GR16:$src1),
+ "rcl{w}\t$dst", [], IIC_SR>, OpSize16;
+def RCL16ri : Ii8<0xC1, MRM2r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$cnt),
+ "rcl{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize16;
+let Uses = [CL] in
+def RCL16rCL : I<0xD3, MRM2r, (outs GR16:$dst), (ins GR16:$src1),
+ "rcl{w}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize16;
+
+def RCL32r1 : I<0xD1, MRM2r, (outs GR32:$dst), (ins GR32:$src1),
+ "rcl{l}\t$dst", [], IIC_SR>, OpSize32;
+def RCL32ri : Ii8<0xC1, MRM2r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$cnt),
+ "rcl{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize32;
+let Uses = [CL] in
+def RCL32rCL : I<0xD3, MRM2r, (outs GR32:$dst), (ins GR32:$src1),
+ "rcl{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize32;
+
+
+def RCL64r1 : RI<0xD1, MRM2r, (outs GR64:$dst), (ins GR64:$src1),
+ "rcl{q}\t$dst", [], IIC_SR>;
+def RCL64ri : RIi8<0xC1, MRM2r, (outs GR64:$dst), (ins GR64:$src1, u8imm:$cnt),
+ "rcl{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
+let Uses = [CL] in
+def RCL64rCL : RI<0xD3, MRM2r, (outs GR64:$dst), (ins GR64:$src1),
+ "rcl{q}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
+
+
+def RCR8r1 : I<0xD0, MRM3r, (outs GR8:$dst), (ins GR8:$src1),
+ "rcr{b}\t$dst", [], IIC_SR>;
+def RCR8ri : Ii8<0xC0, MRM3r, (outs GR8:$dst), (ins GR8:$src1, u8imm:$cnt),
+ "rcr{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
+let Uses = [CL] in
+def RCR8rCL : I<0xD2, MRM3r, (outs GR8:$dst), (ins GR8:$src1),
+ "rcr{b}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
+
+def RCR16r1 : I<0xD1, MRM3r, (outs GR16:$dst), (ins GR16:$src1),
+ "rcr{w}\t$dst", [], IIC_SR>, OpSize16;
+def RCR16ri : Ii8<0xC1, MRM3r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$cnt),
+ "rcr{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize16;
+let Uses = [CL] in
+def RCR16rCL : I<0xD3, MRM3r, (outs GR16:$dst), (ins GR16:$src1),
+ "rcr{w}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize16;
+
+def RCR32r1 : I<0xD1, MRM3r, (outs GR32:$dst), (ins GR32:$src1),
+ "rcr{l}\t$dst", [], IIC_SR>, OpSize32;
+def RCR32ri : Ii8<0xC1, MRM3r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$cnt),
+ "rcr{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize32;
+let Uses = [CL] in
+def RCR32rCL : I<0xD3, MRM3r, (outs GR32:$dst), (ins GR32:$src1),
+ "rcr{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize32;
+
+def RCR64r1 : RI<0xD1, MRM3r, (outs GR64:$dst), (ins GR64:$src1),
+ "rcr{q}\t$dst", [], IIC_SR>;
+def RCR64ri : RIi8<0xC1, MRM3r, (outs GR64:$dst), (ins GR64:$src1, u8imm:$cnt),
+ "rcr{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
+let Uses = [CL] in
+def RCR64rCL : RI<0xD3, MRM3r, (outs GR64:$dst), (ins GR64:$src1),
+ "rcr{q}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
+
+} // Constraints = "$src = $dst"
+
+let SchedRW = [WriteShiftLd, WriteRMW] in {
+def RCL8m1 : I<0xD0, MRM2m, (outs), (ins i8mem:$dst),
+ "rcl{b}\t$dst", [], IIC_SR>;
+def RCL8mi : Ii8<0xC0, MRM2m, (outs), (ins i8mem:$dst, u8imm:$cnt),
+ "rcl{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
+def RCL16m1 : I<0xD1, MRM2m, (outs), (ins i16mem:$dst),
+ "rcl{w}\t$dst", [], IIC_SR>, OpSize16;
+def RCL16mi : Ii8<0xC1, MRM2m, (outs), (ins i16mem:$dst, u8imm:$cnt),
+ "rcl{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize16;
+def RCL32m1 : I<0xD1, MRM2m, (outs), (ins i32mem:$dst),
+ "rcl{l}\t$dst", [], IIC_SR>, OpSize32;
+def RCL32mi : Ii8<0xC1, MRM2m, (outs), (ins i32mem:$dst, u8imm:$cnt),
+ "rcl{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize32;
+def RCL64m1 : RI<0xD1, MRM2m, (outs), (ins i64mem:$dst),
+ "rcl{q}\t$dst", [], IIC_SR>;
+def RCL64mi : RIi8<0xC1, MRM2m, (outs), (ins i64mem:$dst, u8imm:$cnt),
+ "rcl{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
+
+def RCR8m1 : I<0xD0, MRM3m, (outs), (ins i8mem:$dst),
+ "rcr{b}\t$dst", [], IIC_SR>;
+def RCR8mi : Ii8<0xC0, MRM3m, (outs), (ins i8mem:$dst, u8imm:$cnt),
+ "rcr{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
+def RCR16m1 : I<0xD1, MRM3m, (outs), (ins i16mem:$dst),
+ "rcr{w}\t$dst", [], IIC_SR>, OpSize16;
+def RCR16mi : Ii8<0xC1, MRM3m, (outs), (ins i16mem:$dst, u8imm:$cnt),
+ "rcr{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize16;
+def RCR32m1 : I<0xD1, MRM3m, (outs), (ins i32mem:$dst),
+ "rcr{l}\t$dst", [], IIC_SR>, OpSize32;
+def RCR32mi : Ii8<0xC1, MRM3m, (outs), (ins i32mem:$dst, u8imm:$cnt),
+ "rcr{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize32;
+def RCR64m1 : RI<0xD1, MRM3m, (outs), (ins i64mem:$dst),
+ "rcr{q}\t$dst", [], IIC_SR>;
+def RCR64mi : RIi8<0xC1, MRM3m, (outs), (ins i64mem:$dst, u8imm:$cnt),
+ "rcr{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
+
+let Uses = [CL] in {
+def RCL8mCL : I<0xD2, MRM2m, (outs), (ins i8mem:$dst),
+ "rcl{b}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
+def RCL16mCL : I<0xD3, MRM2m, (outs), (ins i16mem:$dst),
+ "rcl{w}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize16;
+def RCL32mCL : I<0xD3, MRM2m, (outs), (ins i32mem:$dst),
+ "rcl{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize32;
+def RCL64mCL : RI<0xD3, MRM2m, (outs), (ins i64mem:$dst),
+ "rcl{q}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
+
+def RCR8mCL : I<0xD2, MRM3m, (outs), (ins i8mem:$dst),
+ "rcr{b}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
+def RCR16mCL : I<0xD3, MRM3m, (outs), (ins i16mem:$dst),
+ "rcr{w}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize16;
+def RCR32mCL : I<0xD3, MRM3m, (outs), (ins i32mem:$dst),
+ "rcr{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize32;
+def RCR64mCL : RI<0xD3, MRM3m, (outs), (ins i64mem:$dst),
+ "rcr{q}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
+}
+} // SchedRW
+} // hasSideEffects = 0
+
+let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in {
+// FIXME: provide shorter instructions when imm8 == 1
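+// (The 0xD0/0xD1 forms rotate by an implicit 1 and omit the immediate byte.)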
+let Uses = [CL] in {
+def ROL8rCL : I<0xD2, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1),
+ "rol{b}\t{%cl, $dst|$dst, cl}",
+ [(set GR8:$dst, (rotl GR8:$src1, CL))], IIC_SR>;
+def ROL16rCL : I<0xD3, MRM0r, (outs GR16:$dst), (ins GR16:$src1),
+ "rol{w}\t{%cl, $dst|$dst, cl}",
+ [(set GR16:$dst, (rotl GR16:$src1, CL))], IIC_SR>, OpSize16;
+def ROL32rCL : I<0xD3, MRM0r, (outs GR32:$dst), (ins GR32:$src1),
+ "rol{l}\t{%cl, $dst|$dst, cl}",
+ [(set GR32:$dst, (rotl GR32:$src1, CL))], IIC_SR>, OpSize32;
+def ROL64rCL : RI<0xD3, MRM0r, (outs GR64:$dst), (ins GR64:$src1),
+ "rol{q}\t{%cl, $dst|$dst, cl}",
+ [(set GR64:$dst, (rotl GR64:$src1, CL))], IIC_SR>;
+}
+
+def ROL8ri : Ii8<0xC0, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1, u8imm:$src2),
+ "rol{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (rotl GR8:$src1, (i8 imm:$src2)))], IIC_SR>;
+def ROL16ri : Ii8<0xC1, MRM0r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2),
+ "rol{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (rotl GR16:$src1, (i8 imm:$src2)))],
+ IIC_SR>, OpSize16;
+def ROL32ri : Ii8<0xC1, MRM0r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$src2),
+ "rol{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (rotl GR32:$src1, (i8 imm:$src2)))],
+ IIC_SR>, OpSize32;
+def ROL64ri : RIi8<0xC1, MRM0r, (outs GR64:$dst),
+ (ins GR64:$src1, u8imm:$src2),
+ "rol{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (rotl GR64:$src1, (i8 imm:$src2)))],
+ IIC_SR>;
+
+// Rotate by 1
+def ROL8r1 : I<0xD0, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1),
+ "rol{b}\t$dst",
+ [(set GR8:$dst, (rotl GR8:$src1, (i8 1)))],
+ IIC_SR>;
+def ROL16r1 : I<0xD1, MRM0r, (outs GR16:$dst), (ins GR16:$src1),
+ "rol{w}\t$dst",
+ [(set GR16:$dst, (rotl GR16:$src1, (i8 1)))],
+ IIC_SR>, OpSize16;
+def ROL32r1 : I<0xD1, MRM0r, (outs GR32:$dst), (ins GR32:$src1),
+ "rol{l}\t$dst",
+ [(set GR32:$dst, (rotl GR32:$src1, (i8 1)))],
+ IIC_SR>, OpSize32;
+def ROL64r1 : RI<0xD1, MRM0r, (outs GR64:$dst), (ins GR64:$src1),
+ "rol{q}\t$dst",
+ [(set GR64:$dst, (rotl GR64:$src1, (i8 1)))],
+ IIC_SR>;
+} // Constraints = "$src = $dst", SchedRW
+
+let SchedRW = [WriteShiftLd, WriteRMW] in {
+let Uses = [CL] in {
+def ROL8mCL : I<0xD2, MRM0m, (outs), (ins i8mem :$dst),
+ "rol{b}\t{%cl, $dst|$dst, cl}",
+ [(store (rotl (loadi8 addr:$dst), CL), addr:$dst)],
+ IIC_SR>;
+def ROL16mCL : I<0xD3, MRM0m, (outs), (ins i16mem:$dst),
+ "rol{w}\t{%cl, $dst|$dst, cl}",
+ [(store (rotl (loadi16 addr:$dst), CL), addr:$dst)],
+ IIC_SR>, OpSize16;
+def ROL32mCL : I<0xD3, MRM0m, (outs), (ins i32mem:$dst),
+ "rol{l}\t{%cl, $dst|$dst, cl}",
+ [(store (rotl (loadi32 addr:$dst), CL), addr:$dst)],
+ IIC_SR>, OpSize32;
+def ROL64mCL : RI<0xD3, MRM0m, (outs), (ins i64mem:$dst),
+ "rol{q}\t{%cl, $dst|$dst, cl}",
+ [(store (rotl (loadi64 addr:$dst), CL), addr:$dst)],
+ IIC_SR>;
+}
+def ROL8mi : Ii8<0xC0, MRM0m, (outs), (ins i8mem :$dst, u8imm:$src1),
+ "rol{b}\t{$src1, $dst|$dst, $src1}",
+ [(store (rotl (loadi8 addr:$dst), (i8 imm:$src1)), addr:$dst)],
+ IIC_SR>;
+def ROL16mi : Ii8<0xC1, MRM0m, (outs), (ins i16mem:$dst, u8imm:$src1),
+ "rol{w}\t{$src1, $dst|$dst, $src1}",
+ [(store (rotl (loadi16 addr:$dst), (i8 imm:$src1)), addr:$dst)],
+ IIC_SR>, OpSize16;
+def ROL32mi : Ii8<0xC1, MRM0m, (outs), (ins i32mem:$dst, u8imm:$src1),
+ "rol{l}\t{$src1, $dst|$dst, $src1}",
+ [(store (rotl (loadi32 addr:$dst), (i8 imm:$src1)), addr:$dst)],
+ IIC_SR>, OpSize32;
+def ROL64mi : RIi8<0xC1, MRM0m, (outs), (ins i64mem:$dst, u8imm:$src1),
+ "rol{q}\t{$src1, $dst|$dst, $src1}",
+ [(store (rotl (loadi64 addr:$dst), (i8 imm:$src1)), addr:$dst)],
+ IIC_SR>;
+
+// Rotate by 1
+def ROL8m1 : I<0xD0, MRM0m, (outs), (ins i8mem :$dst),
+ "rol{b}\t$dst",
+ [(store (rotl (loadi8 addr:$dst), (i8 1)), addr:$dst)],
+ IIC_SR>;
+def ROL16m1 : I<0xD1, MRM0m, (outs), (ins i16mem:$dst),
+ "rol{w}\t$dst",
+ [(store (rotl (loadi16 addr:$dst), (i8 1)), addr:$dst)],
+ IIC_SR>, OpSize16;
+def ROL32m1 : I<0xD1, MRM0m, (outs), (ins i32mem:$dst),
+ "rol{l}\t$dst",
+ [(store (rotl (loadi32 addr:$dst), (i8 1)), addr:$dst)],
+ IIC_SR>, OpSize32;
+def ROL64m1 : RI<0xD1, MRM0m, (outs), (ins i64mem:$dst),
+ "rol{q}\t$dst",
+ [(store (rotl (loadi64 addr:$dst), (i8 1)), addr:$dst)],
+ IIC_SR>;
+} // SchedRW
+
+let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in {
+let Uses = [CL] in {
+def ROR8rCL : I<0xD2, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1),
+ "ror{b}\t{%cl, $dst|$dst, cl}",
+ [(set GR8:$dst, (rotr GR8:$src1, CL))], IIC_SR>;
+def ROR16rCL : I<0xD3, MRM1r, (outs GR16:$dst), (ins GR16:$src1),
+ "ror{w}\t{%cl, $dst|$dst, cl}",
+ [(set GR16:$dst, (rotr GR16:$src1, CL))], IIC_SR>, OpSize16;
+def ROR32rCL : I<0xD3, MRM1r, (outs GR32:$dst), (ins GR32:$src1),
+ "ror{l}\t{%cl, $dst|$dst, cl}",
+ [(set GR32:$dst, (rotr GR32:$src1, CL))], IIC_SR>, OpSize32;
+def ROR64rCL : RI<0xD3, MRM1r, (outs GR64:$dst), (ins GR64:$src1),
+ "ror{q}\t{%cl, $dst|$dst, cl}",
+ [(set GR64:$dst, (rotr GR64:$src1, CL))], IIC_SR>;
+}
+
+def ROR8ri : Ii8<0xC0, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1, u8imm:$src2),
+ "ror{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (rotr GR8:$src1, (i8 imm:$src2)))], IIC_SR>;
+def ROR16ri : Ii8<0xC1, MRM1r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2),
+ "ror{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (rotr GR16:$src1, (i8 imm:$src2)))],
+ IIC_SR>, OpSize16;
+def ROR32ri : Ii8<0xC1, MRM1r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$src2),
+ "ror{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (rotr GR32:$src1, (i8 imm:$src2)))],
+ IIC_SR>, OpSize32;
+def ROR64ri : RIi8<0xC1, MRM1r, (outs GR64:$dst),
+ (ins GR64:$src1, u8imm:$src2),
+ "ror{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (rotr GR64:$src1, (i8 imm:$src2)))],
+ IIC_SR>;
+
+// Rotate by 1
+def ROR8r1 : I<0xD0, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1),
+ "ror{b}\t$dst",
+ [(set GR8:$dst, (rotr GR8:$src1, (i8 1)))],
+ IIC_SR>;
+def ROR16r1 : I<0xD1, MRM1r, (outs GR16:$dst), (ins GR16:$src1),
+ "ror{w}\t$dst",
+ [(set GR16:$dst, (rotr GR16:$src1, (i8 1)))],
+ IIC_SR>, OpSize16;
+def ROR32r1 : I<0xD1, MRM1r, (outs GR32:$dst), (ins GR32:$src1),
+ "ror{l}\t$dst",
+ [(set GR32:$dst, (rotr GR32:$src1, (i8 1)))],
+ IIC_SR>, OpSize32;
+def ROR64r1 : RI<0xD1, MRM1r, (outs GR64:$dst), (ins GR64:$src1),
+ "ror{q}\t$dst",
+ [(set GR64:$dst, (rotr GR64:$src1, (i8 1)))],
+ IIC_SR>;
+} // Constraints = "$src1 = $dst", SchedRW
+
+let SchedRW = [WriteShiftLd, WriteRMW] in {
+let Uses = [CL] in {
+def ROR8mCL : I<0xD2, MRM1m, (outs), (ins i8mem :$dst),
+ "ror{b}\t{%cl, $dst|$dst, cl}",
+ [(store (rotr (loadi8 addr:$dst), CL), addr:$dst)],
+ IIC_SR>;
+def ROR16mCL : I<0xD3, MRM1m, (outs), (ins i16mem:$dst),
+ "ror{w}\t{%cl, $dst|$dst, cl}",
+ [(store (rotr (loadi16 addr:$dst), CL), addr:$dst)],
+ IIC_SR>, OpSize16;
+def ROR32mCL : I<0xD3, MRM1m, (outs), (ins i32mem:$dst),
+ "ror{l}\t{%cl, $dst|$dst, cl}",
+ [(store (rotr (loadi32 addr:$dst), CL), addr:$dst)],
+ IIC_SR>, OpSize32;
+def ROR64mCL : RI<0xD3, MRM1m, (outs), (ins i64mem:$dst),
+ "ror{q}\t{%cl, $dst|$dst, cl}",
+ [(store (rotr (loadi64 addr:$dst), CL), addr:$dst)],
+ IIC_SR>;
+}
+def ROR8mi : Ii8<0xC0, MRM1m, (outs), (ins i8mem :$dst, u8imm:$src),
+ "ror{b}\t{$src, $dst|$dst, $src}",
+ [(store (rotr (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)],
+ IIC_SR>;
+def ROR16mi : Ii8<0xC1, MRM1m, (outs), (ins i16mem:$dst, u8imm:$src),
+ "ror{w}\t{$src, $dst|$dst, $src}",
+ [(store (rotr (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)],
+ IIC_SR>, OpSize16;
+def ROR32mi : Ii8<0xC1, MRM1m, (outs), (ins i32mem:$dst, u8imm:$src),
+ "ror{l}\t{$src, $dst|$dst, $src}",
+ [(store (rotr (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)],
+ IIC_SR>, OpSize32;
+def ROR64mi : RIi8<0xC1, MRM1m, (outs), (ins i64mem:$dst, u8imm:$src),
+ "ror{q}\t{$src, $dst|$dst, $src}",
+ [(store (rotr (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)],
+ IIC_SR>;
+
+// Rotate by 1
+def ROR8m1 : I<0xD0, MRM1m, (outs), (ins i8mem :$dst),
+ "ror{b}\t$dst",
+ [(store (rotr (loadi8 addr:$dst), (i8 1)), addr:$dst)],
+ IIC_SR>;
+def ROR16m1 : I<0xD1, MRM1m, (outs), (ins i16mem:$dst),
+ "ror{w}\t$dst",
+ [(store (rotr (loadi16 addr:$dst), (i8 1)), addr:$dst)],
+ IIC_SR>, OpSize16;
+def ROR32m1 : I<0xD1, MRM1m, (outs), (ins i32mem:$dst),
+ "ror{l}\t$dst",
+ [(store (rotr (loadi32 addr:$dst), (i8 1)), addr:$dst)],
+ IIC_SR>, OpSize32;
+def ROR64m1 : RI<0xD1, MRM1m, (outs), (ins i64mem:$dst),
+ "ror{q}\t$dst",
+ [(store (rotr (loadi64 addr:$dst), (i8 1)), addr:$dst)],
+ IIC_SR>;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Double shift instructions (generalizations of rotate)
+//===----------------------------------------------------------------------===//
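+// A double shift whose two value operands are the same register degenerates
+// to a rotate, which is the sense in which SHLD/SHRD generalize ROL/ROR;
+// e.g. (AT&T syntax; a hand-written illustration, not code produced by the
+// patterns below):
+//
+//  shld $5, %eax, %eax   // same result as: rol $5, %eax
+//  shrd $5, %eax, %eax   // same result as: ror $5, %eax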
+
+let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in {
+
+let Uses = [CL] in {
+def SHLD16rrCL : I<0xA5, MRMDestReg, (outs GR16:$dst),
+ (ins GR16:$src1, GR16:$src2),
+ "shld{w}\t{%cl, $src2, $dst|$dst, $src2, cl}",
+ [(set GR16:$dst, (X86shld GR16:$src1, GR16:$src2, CL))],
+ IIC_SHD16_REG_CL>,
+ TB, OpSize16;
+def SHRD16rrCL : I<0xAD, MRMDestReg, (outs GR16:$dst),
+ (ins GR16:$src1, GR16:$src2),
+ "shrd{w}\t{%cl, $src2, $dst|$dst, $src2, cl}",
+ [(set GR16:$dst, (X86shrd GR16:$src1, GR16:$src2, CL))],
+ IIC_SHD16_REG_CL>,
+ TB, OpSize16;
+def SHLD32rrCL : I<0xA5, MRMDestReg, (outs GR32:$dst),
+ (ins GR32:$src1, GR32:$src2),
+ "shld{l}\t{%cl, $src2, $dst|$dst, $src2, cl}",
+ [(set GR32:$dst, (X86shld GR32:$src1, GR32:$src2, CL))],
+ IIC_SHD32_REG_CL>, TB, OpSize32;
+def SHRD32rrCL : I<0xAD, MRMDestReg, (outs GR32:$dst),
+ (ins GR32:$src1, GR32:$src2),
+ "shrd{l}\t{%cl, $src2, $dst|$dst, $src2, cl}",
+ [(set GR32:$dst, (X86shrd GR32:$src1, GR32:$src2, CL))],
+ IIC_SHD32_REG_CL>, TB, OpSize32;
+def SHLD64rrCL : RI<0xA5, MRMDestReg, (outs GR64:$dst),
+ (ins GR64:$src1, GR64:$src2),
+ "shld{q}\t{%cl, $src2, $dst|$dst, $src2, cl}",
+ [(set GR64:$dst, (X86shld GR64:$src1, GR64:$src2, CL))],
+ IIC_SHD64_REG_CL>,
+ TB;
+def SHRD64rrCL : RI<0xAD, MRMDestReg, (outs GR64:$dst),
+ (ins GR64:$src1, GR64:$src2),
+ "shrd{q}\t{%cl, $src2, $dst|$dst, $src2, cl}",
+ [(set GR64:$dst, (X86shrd GR64:$src1, GR64:$src2, CL))],
+ IIC_SHD64_REG_CL>,
+ TB;
+}
+
+let isCommutable = 1 in { // These instructions commute to each other.
+def SHLD16rri8 : Ii8<0xA4, MRMDestReg,
+ (outs GR16:$dst),
+ (ins GR16:$src1, GR16:$src2, u8imm:$src3),
+ "shld{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set GR16:$dst, (X86shld GR16:$src1, GR16:$src2,
+ (i8 imm:$src3)))], IIC_SHD16_REG_IM>,
+ TB, OpSize16;
+def SHRD16rri8 : Ii8<0xAC, MRMDestReg,
+ (outs GR16:$dst),
+ (ins GR16:$src1, GR16:$src2, u8imm:$src3),
+ "shrd{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set GR16:$dst, (X86shrd GR16:$src1, GR16:$src2,
+ (i8 imm:$src3)))], IIC_SHD16_REG_IM>,
+ TB, OpSize16;
+def SHLD32rri8 : Ii8<0xA4, MRMDestReg,
+ (outs GR32:$dst),
+ (ins GR32:$src1, GR32:$src2, u8imm:$src3),
+ "shld{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set GR32:$dst, (X86shld GR32:$src1, GR32:$src2,
+ (i8 imm:$src3)))], IIC_SHD32_REG_IM>,
+ TB, OpSize32;
+def SHRD32rri8 : Ii8<0xAC, MRMDestReg,
+ (outs GR32:$dst),
+ (ins GR32:$src1, GR32:$src2, u8imm:$src3),
+ "shrd{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set GR32:$dst, (X86shrd GR32:$src1, GR32:$src2,
+ (i8 imm:$src3)))], IIC_SHD32_REG_IM>,
+ TB, OpSize32;
+def SHLD64rri8 : RIi8<0xA4, MRMDestReg,
+ (outs GR64:$dst),
+ (ins GR64:$src1, GR64:$src2, u8imm:$src3),
+ "shld{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set GR64:$dst, (X86shld GR64:$src1, GR64:$src2,
+ (i8 imm:$src3)))], IIC_SHD64_REG_IM>,
+ TB;
+def SHRD64rri8 : RIi8<0xAC, MRMDestReg,
+ (outs GR64:$dst),
+ (ins GR64:$src1, GR64:$src2, u8imm:$src3),
+ "shrd{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set GR64:$dst, (X86shrd GR64:$src1, GR64:$src2,
+ (i8 imm:$src3)))], IIC_SHD64_REG_IM>,
+ TB;
+}
+} // Constraints = "$src1 = $dst", SchedRW
+
+let SchedRW = [WriteShiftLd, WriteRMW] in {
+let Uses = [CL] in {
+def SHLD16mrCL : I<0xA5, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
+ "shld{w}\t{%cl, $src2, $dst|$dst, $src2, cl}",
+ [(store (X86shld (loadi16 addr:$dst), GR16:$src2, CL),
+ addr:$dst)], IIC_SHD16_MEM_CL>, TB, OpSize16;
+def SHRD16mrCL : I<0xAD, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
+ "shrd{w}\t{%cl, $src2, $dst|$dst, $src2, cl}",
+ [(store (X86shrd (loadi16 addr:$dst), GR16:$src2, CL),
+ addr:$dst)], IIC_SHD16_MEM_CL>, TB, OpSize16;
+
+def SHLD32mrCL : I<0xA5, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
+ "shld{l}\t{%cl, $src2, $dst|$dst, $src2, cl}",
+ [(store (X86shld (loadi32 addr:$dst), GR32:$src2, CL),
+ addr:$dst)], IIC_SHD32_MEM_CL>, TB, OpSize32;
+def SHRD32mrCL : I<0xAD, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
+ "shrd{l}\t{%cl, $src2, $dst|$dst, $src2, cl}",
+ [(store (X86shrd (loadi32 addr:$dst), GR32:$src2, CL),
+ addr:$dst)], IIC_SHD32_MEM_CL>, TB, OpSize32;
+
+def SHLD64mrCL : RI<0xA5, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
+ "shld{q}\t{%cl, $src2, $dst|$dst, $src2, cl}",
+ [(store (X86shld (loadi64 addr:$dst), GR64:$src2, CL),
+ addr:$dst)], IIC_SHD64_MEM_CL>, TB;
+def SHRD64mrCL : RI<0xAD, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
+ "shrd{q}\t{%cl, $src2, $dst|$dst, $src2, cl}",
+ [(store (X86shrd (loadi64 addr:$dst), GR64:$src2, CL),
+ addr:$dst)], IIC_SHD64_MEM_CL>, TB;
+}
+
+def SHLD16mri8 : Ii8<0xA4, MRMDestMem,
+ (outs), (ins i16mem:$dst, GR16:$src2, u8imm:$src3),
+ "shld{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(store (X86shld (loadi16 addr:$dst), GR16:$src2,
+ (i8 imm:$src3)), addr:$dst)],
+ IIC_SHD16_MEM_IM>,
+ TB, OpSize16;
+def SHRD16mri8 : Ii8<0xAC, MRMDestMem,
+ (outs), (ins i16mem:$dst, GR16:$src2, u8imm:$src3),
+ "shrd{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(store (X86shrd (loadi16 addr:$dst), GR16:$src2,
+ (i8 imm:$src3)), addr:$dst)],
+ IIC_SHD16_MEM_IM>,
+ TB, OpSize16;
+
+def SHLD32mri8 : Ii8<0xA4, MRMDestMem,
+ (outs), (ins i32mem:$dst, GR32:$src2, u8imm:$src3),
+ "shld{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(store (X86shld (loadi32 addr:$dst), GR32:$src2,
+ (i8 imm:$src3)), addr:$dst)],
+ IIC_SHD32_MEM_IM>,
+ TB, OpSize32;
+def SHRD32mri8 : Ii8<0xAC, MRMDestMem,
+ (outs), (ins i32mem:$dst, GR32:$src2, u8imm:$src3),
+ "shrd{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(store (X86shrd (loadi32 addr:$dst), GR32:$src2,
+ (i8 imm:$src3)), addr:$dst)],
+ IIC_SHD32_MEM_IM>,
+ TB, OpSize32;
+
+def SHLD64mri8 : RIi8<0xA4, MRMDestMem,
+ (outs), (ins i64mem:$dst, GR64:$src2, u8imm:$src3),
+ "shld{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(store (X86shld (loadi64 addr:$dst), GR64:$src2,
+ (i8 imm:$src3)), addr:$dst)],
+ IIC_SHD64_MEM_IM>,
+ TB;
+def SHRD64mri8 : RIi8<0xAC, MRMDestMem,
+ (outs), (ins i64mem:$dst, GR64:$src2, u8imm:$src3),
+ "shrd{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(store (X86shrd (loadi64 addr:$dst), GR64:$src2,
+ (i8 imm:$src3)), addr:$dst)],
+ IIC_SHD64_MEM_IM>,
+ TB;
+} // SchedRW
+
+} // Defs = [EFLAGS]
+
+def ROT32L2R_imm8 : SDNodeXForm<imm, [{
+ // Convert a ROTL shamt to a ROTR shamt on 32-bit integer.
+ return getI8Imm(32 - N->getZExtValue(), SDLoc(N));
+}]>;
+
+def ROT64L2R_imm8 : SDNodeXForm<imm, [{
+ // Convert a ROTL shamt to a ROTR shamt on 64-bit integer.
+ return getI8Imm(64 - N->getZExtValue(), SDLoc(N));
+}]>;
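+
+// Both transforms rest on the identity rotl(x, n) == rotr(x, W - n) for a
+// W-bit integer. A standalone C++ sketch of the identity (illustrative only,
+// not part of this file; assumes 0 < n < 32):
+//
+//   #include <cassert>
+//   #include <cstdint>
+//   static uint32_t rotl32(uint32_t x, unsigned n) {
+//     return (x << n) | (x >> (32 - n));
+//   }
+//   static uint32_t rotr32(uint32_t x, unsigned n) {
+//     return (x >> n) | (x << (32 - n));
+//   }
+//   static void check(uint32_t x, unsigned n) {
+//     assert(rotl32(x, n) == rotr32(x, 32 - n));
+//   }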
+
+multiclass bmi_rotate<string asm, RegisterClass RC, X86MemOperand x86memop> {
+let hasSideEffects = 0 in {
+ def ri : Ii8<0xF0, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, u8imm:$src2),
+ !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, TAXD, VEX, Sched<[WriteShift]>;
+ let mayLoad = 1 in
+ def mi : Ii8<0xF0, MRMSrcMem, (outs RC:$dst),
+ (ins x86memop:$src1, u8imm:$src2),
+ !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, TAXD, VEX, Sched<[WriteShiftLd]>;
+}
+}
+
+multiclass bmi_shift<string asm, RegisterClass RC, X86MemOperand x86memop> {
+let hasSideEffects = 0 in {
+ def rr : I<0xF7, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+ !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>,
+ VEX_4VOp3, Sched<[WriteShift]>;
+ let mayLoad = 1 in
+  def rm : I<0xF7, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src1, RC:$src2),
+             !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>,
+             VEX_4VOp3,
+             Sched<[WriteShiftLd,
+                    // x86memop:$src1
+                    ReadDefault, ReadDefault, ReadDefault, ReadDefault,
+                    ReadDefault,
+                    // RC:$src2
+                    ReadAfterLd]>;
+}
+}
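+
+// Note on the rm scheduling above: an x86 memory reference expands to five
+// MachineOperands (base, scale, index, displacement, segment), which is what
+// the five ReadDefault entries cover; only the trailing register source gets
+// ReadAfterLd latency.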
+
+let Predicates = [HasBMI2] in {
+ defm RORX32 : bmi_rotate<"rorx{l}", GR32, i32mem>;
+ defm RORX64 : bmi_rotate<"rorx{q}", GR64, i64mem>, VEX_W;
+ defm SARX32 : bmi_shift<"sarx{l}", GR32, i32mem>, T8XS;
+ defm SARX64 : bmi_shift<"sarx{q}", GR64, i64mem>, T8XS, VEX_W;
+ defm SHRX32 : bmi_shift<"shrx{l}", GR32, i32mem>, T8XD;
+ defm SHRX64 : bmi_shift<"shrx{q}", GR64, i64mem>, T8XD, VEX_W;
+ defm SHLX32 : bmi_shift<"shlx{l}", GR32, i32mem>, T8PD;
+ defm SHLX64 : bmi_shift<"shlx{q}", GR64, i64mem>, T8PD, VEX_W;
+
+ // Prefer RORX which is non-destructive and doesn't update EFLAGS.
+ let AddedComplexity = 10 in {
+ def : Pat<(rotl GR32:$src, (i8 imm:$shamt)),
+ (RORX32ri GR32:$src, (ROT32L2R_imm8 imm:$shamt))>;
+ def : Pat<(rotl GR64:$src, (i8 imm:$shamt)),
+ (RORX64ri GR64:$src, (ROT64L2R_imm8 imm:$shamt))>;
+ }
+
+ def : Pat<(rotl (loadi32 addr:$src), (i8 imm:$shamt)),
+ (RORX32mi addr:$src, (ROT32L2R_imm8 imm:$shamt))>;
+ def : Pat<(rotl (loadi64 addr:$src), (i8 imm:$shamt)),
+ (RORX64mi addr:$src, (ROT64L2R_imm8 imm:$shamt))>;
+
+  // Prefer SARX/SHRX/SHLX over SAR/SHR/SHL with variable shift amounts, BUT
+  // not with immediate shifts, i.e. the following code is considered better
+ //
+ // mov %edi, %esi
+ // shl $imm, %esi
+ // ... %edi, ...
+ //
+ // than
+ //
+ // movb $imm, %sil
+ // shlx %sil, %edi, %esi
+ // ... %edi, ...
+ //
+ let AddedComplexity = 1 in {
+ def : Pat<(sra GR32:$src1, GR8:$src2),
+ (SARX32rr GR32:$src1,
+ (INSERT_SUBREG
+ (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+ def : Pat<(sra GR64:$src1, GR8:$src2),
+ (SARX64rr GR64:$src1,
+ (INSERT_SUBREG
+ (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+
+ def : Pat<(srl GR32:$src1, GR8:$src2),
+ (SHRX32rr GR32:$src1,
+ (INSERT_SUBREG
+ (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+ def : Pat<(srl GR64:$src1, GR8:$src2),
+ (SHRX64rr GR64:$src1,
+ (INSERT_SUBREG
+ (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+
+ def : Pat<(shl GR32:$src1, GR8:$src2),
+ (SHLX32rr GR32:$src1,
+ (INSERT_SUBREG
+ (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+ def : Pat<(shl GR64:$src1, GR8:$src2),
+ (SHLX64rr GR64:$src1,
+ (INSERT_SUBREG
+ (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+ }
+
+ // Patterns on SARXrm/SHRXrm/SHLXrm are explicitly omitted to favor
+ //
+ // mov (%ecx), %esi
+  //  shl $imm, %esi
+ //
+ // over
+ //
+  //  movb $imm, %al
+ // shlx %al, (%ecx), %esi
+ //
+  // As SARXrr/SHRXrr/SHLXrr are favored for variable shifts, the peephole
+  // optimizer will fold them into SARXrm/SHRXrm/SHLXrm where possible.
+}
diff --git a/contrib/llvm/lib/Target/X86/X86InstrSystem.td b/contrib/llvm/lib/Target/X86/X86InstrSystem.td
new file mode 100644
index 0000000..a97d1e5
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrSystem.td
@@ -0,0 +1,615 @@
+//===-- X86InstrSystem.td - System Instructions ------------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 instructions that are generally used in
+// privileged modes. These are not typically used by the compiler, but are
+// supported for the assembler and disassembler.
+//
+//===----------------------------------------------------------------------===//
+
+let SchedRW = [WriteSystem] in {
+let Defs = [RAX, RDX] in
+ def RDTSC : I<0x31, RawFrm, (outs), (ins), "rdtsc", [(X86rdtsc)], IIC_RDTSC>,
+ TB;
+
+let Defs = [RAX, RCX, RDX] in
+ def RDTSCP : I<0x01, MRM_F9, (outs), (ins), "rdtscp", [(X86rdtscp)]>, TB;
+
+// CPU flow control instructions
+
+let isTerminator = 1, isBarrier = 1, hasCtrlDep = 1 in {
+ def TRAP : I<0x0B, RawFrm, (outs), (ins), "ud2", [(trap)]>, TB;
+ def UD2B : I<0xB9, RawFrm, (outs), (ins), "ud2b", []>, TB;
+}
+
+def HLT : I<0xF4, RawFrm, (outs), (ins), "hlt", [], IIC_HLT>;
+def RSM : I<0xAA, RawFrm, (outs), (ins), "rsm", [], IIC_RSM>, TB;
+
+// Interrupt and SysCall Instructions.
+let Uses = [EFLAGS] in
+ def INTO : I<0xce, RawFrm, (outs), (ins), "into", []>;
+def INT3 : I<0xcc, RawFrm, (outs), (ins), "int3",
+ [(int_x86_int (i8 3))], IIC_INT3>;
+} // SchedRW
+
+// The long form of "int $3" turns into int3 as a size optimization.
+// FIXME: This doesn't work because InstAlias can't match immediate constants.
+//def : InstAlias<"int\t$3", (INT3)>;
+
+let SchedRW = [WriteSystem] in {
+
+def INT : Ii8<0xcd, RawFrm, (outs), (ins u8imm:$trap), "int\t$trap",
+ [(int_x86_int imm:$trap)], IIC_INT>;
+
+def SYSCALL : I<0x05, RawFrm, (outs), (ins), "syscall", [], IIC_SYSCALL>, TB;
+def SYSRET : I<0x07, RawFrm, (outs), (ins), "sysret{l}", [], IIC_SYSCALL>, TB;
+def SYSRET64 : RI<0x07, RawFrm, (outs), (ins), "sysret{q}", [], IIC_SYSCALL>,
+               TB, Requires<[In64BitMode]>;
+
+def SYSENTER : I<0x34, RawFrm, (outs), (ins), "sysenter", [],
+ IIC_SYS_ENTER_EXIT>, TB;
+
+def SYSEXIT : I<0x35, RawFrm, (outs), (ins), "sysexit{l}", [],
+ IIC_SYS_ENTER_EXIT>, TB;
+def SYSEXIT64 : RI<0x35, RawFrm, (outs), (ins), "sysexit{q}", [],
+                   IIC_SYS_ENTER_EXIT>, TB, Requires<[In64BitMode]>;
+} // SchedRW
+
+def : Pat<(debugtrap),
+ (INT3)>, Requires<[NotPS4]>;
+def : Pat<(debugtrap),
+ (INT (i8 0x41))>, Requires<[IsPS4]>;
+
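+// For orientation, a builtin that selects through the two patterns above (a
+// hedged sketch; __builtin_debugtrap is the Clang spelling and is not defined
+// in this file):
+//
+//   void stop_for_debugger(void) {
+//     __builtin_debugtrap();   // lowers to int3, or to "int $0x41" on PS4
+//   }
+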
+//===----------------------------------------------------------------------===//
+// Input/Output Instructions.
+//
+let SchedRW = [WriteSystem] in {
+let Defs = [AL], Uses = [DX] in
+def IN8rr : I<0xEC, RawFrm, (outs), (ins),
+ "in{b}\t{%dx, %al|al, dx}", [], IIC_IN_RR>;
+let Defs = [AX], Uses = [DX] in
+def IN16rr : I<0xED, RawFrm, (outs), (ins),
+ "in{w}\t{%dx, %ax|ax, dx}", [], IIC_IN_RR>, OpSize16;
+let Defs = [EAX], Uses = [DX] in
+def IN32rr : I<0xED, RawFrm, (outs), (ins),
+ "in{l}\t{%dx, %eax|eax, dx}", [], IIC_IN_RR>, OpSize32;
+
+let Defs = [AL] in
+def IN8ri : Ii8<0xE4, RawFrm, (outs), (ins u8imm:$port),
+ "in{b}\t{$port, %al|al, $port}", [], IIC_IN_RI>;
+let Defs = [AX] in
+def IN16ri : Ii8<0xE5, RawFrm, (outs), (ins u8imm:$port),
+ "in{w}\t{$port, %ax|ax, $port}", [], IIC_IN_RI>, OpSize16;
+let Defs = [EAX] in
+def IN32ri : Ii8<0xE5, RawFrm, (outs), (ins u8imm:$port),
+ "in{l}\t{$port, %eax|eax, $port}", [], IIC_IN_RI>, OpSize32;
+
+let Uses = [DX, AL] in
+def OUT8rr : I<0xEE, RawFrm, (outs), (ins),
+ "out{b}\t{%al, %dx|dx, al}", [], IIC_OUT_RR>;
+let Uses = [DX, AX] in
+def OUT16rr : I<0xEF, RawFrm, (outs), (ins),
+ "out{w}\t{%ax, %dx|dx, ax}", [], IIC_OUT_RR>, OpSize16;
+let Uses = [DX, EAX] in
+def OUT32rr : I<0xEF, RawFrm, (outs), (ins),
+ "out{l}\t{%eax, %dx|dx, eax}", [], IIC_OUT_RR>, OpSize32;
+
+let Uses = [AL] in
+def OUT8ir : Ii8<0xE6, RawFrm, (outs), (ins u8imm:$port),
+ "out{b}\t{%al, $port|$port, al}", [], IIC_OUT_IR>;
+let Uses = [AX] in
+def OUT16ir : Ii8<0xE7, RawFrm, (outs), (ins u8imm:$port),
+ "out{w}\t{%ax, $port|$port, ax}", [], IIC_OUT_IR>, OpSize16;
+let Uses = [EAX] in
+def OUT32ir : Ii8<0xE7, RawFrm, (outs), (ins u8imm:$port),
+ "out{l}\t{%eax, $port|$port, eax}", [], IIC_OUT_IR>, OpSize32;
+
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Moves to and from debug registers
+
+let SchedRW = [WriteSystem] in {
+def MOV32rd : I<0x21, MRMDestReg, (outs GR32:$dst), (ins DEBUG_REG:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_DR>, TB,
+ Requires<[Not64BitMode]>;
+def MOV64rd : I<0x21, MRMDestReg, (outs GR64:$dst), (ins DEBUG_REG:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_DR>, TB,
+ Requires<[In64BitMode]>;
+
+def MOV32dr : I<0x23, MRMSrcReg, (outs DEBUG_REG:$dst), (ins GR32:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_DR_REG>, TB,
+ Requires<[Not64BitMode]>;
+def MOV64dr : I<0x23, MRMSrcReg, (outs DEBUG_REG:$dst), (ins GR64:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_DR_REG>, TB,
+ Requires<[In64BitMode]>;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Moves to and from control registers
+
+let SchedRW = [WriteSystem] in {
+def MOV32rc : I<0x20, MRMDestReg, (outs GR32:$dst), (ins CONTROL_REG:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_CR>, TB,
+ Requires<[Not64BitMode]>;
+def MOV64rc : I<0x20, MRMDestReg, (outs GR64:$dst), (ins CONTROL_REG:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_CR>, TB,
+ Requires<[In64BitMode]>;
+
+def MOV32cr : I<0x22, MRMSrcReg, (outs CONTROL_REG:$dst), (ins GR32:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_CR_REG>, TB,
+ Requires<[Not64BitMode]>;
+def MOV64cr : I<0x22, MRMSrcReg, (outs CONTROL_REG:$dst), (ins GR64:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_CR_REG>, TB,
+ Requires<[In64BitMode]>;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Segment override instruction prefixes
+
+def CS_PREFIX : I<0x2E, RawFrm, (outs), (ins), "cs", []>;
+def SS_PREFIX : I<0x36, RawFrm, (outs), (ins), "ss", []>;
+def DS_PREFIX : I<0x3E, RawFrm, (outs), (ins), "ds", []>;
+def ES_PREFIX : I<0x26, RawFrm, (outs), (ins), "es", []>;
+def FS_PREFIX : I<0x64, RawFrm, (outs), (ins), "fs", []>;
+def GS_PREFIX : I<0x65, RawFrm, (outs), (ins), "gs", []>;
+
+//===----------------------------------------------------------------------===//
+// Moves to and from segment registers.
+//
+
+let SchedRW = [WriteMove] in {
+def MOV16rs : I<0x8C, MRMDestReg, (outs GR16:$dst), (ins SEGMENT_REG:$src),
+ "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_SR>, OpSize16;
+def MOV32rs : I<0x8C, MRMDestReg, (outs GR32:$dst), (ins SEGMENT_REG:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_SR>, OpSize32;
+def MOV64rs : RI<0x8C, MRMDestReg, (outs GR64:$dst), (ins SEGMENT_REG:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_SR>;
+
+def MOV16ms : I<0x8C, MRMDestMem, (outs i16mem:$dst), (ins SEGMENT_REG:$src),
+ "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_MEM_SR>, OpSize16;
+def MOV32ms : I<0x8C, MRMDestMem, (outs i32mem:$dst), (ins SEGMENT_REG:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_MEM_SR>, OpSize32;
+def MOV64ms : RI<0x8C, MRMDestMem, (outs i64mem:$dst), (ins SEGMENT_REG:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_MEM_SR>;
+
+def MOV16sr : I<0x8E, MRMSrcReg, (outs SEGMENT_REG:$dst), (ins GR16:$src),
+ "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_REG>, OpSize16;
+def MOV32sr : I<0x8E, MRMSrcReg, (outs SEGMENT_REG:$dst), (ins GR32:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_REG>, OpSize32;
+def MOV64sr : RI<0x8E, MRMSrcReg, (outs SEGMENT_REG:$dst), (ins GR64:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_REG>;
+
+def MOV16sm : I<0x8E, MRMSrcMem, (outs SEGMENT_REG:$dst), (ins i16mem:$src),
+ "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_MEM>, OpSize16;
+def MOV32sm : I<0x8E, MRMSrcMem, (outs SEGMENT_REG:$dst), (ins i32mem:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_MEM>, OpSize32;
+def MOV64sm : RI<0x8E, MRMSrcMem, (outs SEGMENT_REG:$dst), (ins i64mem:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_MEM>;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Segmentation support instructions.
+
+let SchedRW = [WriteSystem] in {
+def SWAPGS : I<0x01, MRM_F8, (outs), (ins), "swapgs", [], IIC_SWAPGS>, TB;
+
+def LAR16rm : I<0x02, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+ "lar{w}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RM>, TB,
+ OpSize16;
+def LAR16rr : I<0x02, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+ "lar{w}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RR>, TB,
+ OpSize16;
+
+// The i16mem operand in LAR32rm and the GR32 operand in LAR32rr are not
+// typos.
+def LAR32rm : I<0x02, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src),
+ "lar{l}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RM>, TB,
+ OpSize32;
+def LAR32rr : I<0x02, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+ "lar{l}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RR>, TB,
+ OpSize32;
+// The i16mem operand in LAR64rm and the GR32 operand in LAR64rr are not
+// typos.
+def LAR64rm : RI<0x02, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src),
+ "lar{q}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RM>, TB;
+def LAR64rr : RI<0x02, MRMSrcReg, (outs GR64:$dst), (ins GR32:$src),
+ "lar{q}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RR>, TB;
+
+def LSL16rm : I<0x03, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+ "lsl{w}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RM>, TB,
+ OpSize16;
+def LSL16rr : I<0x03, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+ "lsl{w}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RR>, TB,
+ OpSize16;
+def LSL32rm : I<0x03, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+ "lsl{l}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RM>, TB,
+ OpSize32;
+def LSL32rr : I<0x03, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+ "lsl{l}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RR>, TB,
+ OpSize32;
+def LSL64rm : RI<0x03, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+ "lsl{q}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RM>, TB;
+def LSL64rr : RI<0x03, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+ "lsl{q}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RR>, TB;
+
+def INVLPG : I<0x01, MRM7m, (outs), (ins i8mem:$addr), "invlpg\t$addr",
+ [], IIC_INVLPG>, TB;
+
+def STR16r : I<0x00, MRM1r, (outs GR16:$dst), (ins),
+ "str{w}\t$dst", [], IIC_STR>, TB, OpSize16;
+def STR32r : I<0x00, MRM1r, (outs GR32:$dst), (ins),
+ "str{l}\t$dst", [], IIC_STR>, TB, OpSize32;
+def STR64r : RI<0x00, MRM1r, (outs GR64:$dst), (ins),
+ "str{q}\t$dst", [], IIC_STR>, TB;
+def STRm : I<0x00, MRM1m, (outs i16mem:$dst), (ins),
+ "str{w}\t$dst", [], IIC_STR>, TB;
+
+def LTRr : I<0x00, MRM3r, (outs), (ins GR16:$src),
+ "ltr{w}\t$src", [], IIC_LTR>, TB;
+def LTRm : I<0x00, MRM3m, (outs), (ins i16mem:$src),
+ "ltr{w}\t$src", [], IIC_LTR>, TB;
+
+def PUSHCS16 : I<0x0E, RawFrm, (outs), (ins),
+                 "push{w}\t{%cs|cs}", [], IIC_PUSH_CS>,
+                 OpSize16, Requires<[Not64BitMode]>;
+def PUSHCS32 : I<0x0E, RawFrm, (outs), (ins),
+ "push{l}\t{%cs|cs}", [], IIC_PUSH_CS>,
+ OpSize32, Requires<[Not64BitMode]>;
+def PUSHSS16 : I<0x16, RawFrm, (outs), (ins),
+ "push{w}\t{%ss|ss}", [], IIC_PUSH_SR>,
+ OpSize16, Requires<[Not64BitMode]>;
+def PUSHSS32 : I<0x16, RawFrm, (outs), (ins),
+ "push{l}\t{%ss|ss}", [], IIC_PUSH_SR>,
+ OpSize32, Requires<[Not64BitMode]>;
+def PUSHDS16 : I<0x1E, RawFrm, (outs), (ins),
+ "push{w}\t{%ds|ds}", [], IIC_PUSH_SR>,
+ OpSize16, Requires<[Not64BitMode]>;
+def PUSHDS32 : I<0x1E, RawFrm, (outs), (ins),
+ "push{l}\t{%ds|ds}", [], IIC_PUSH_SR>,
+ OpSize32, Requires<[Not64BitMode]>;
+def PUSHES16 : I<0x06, RawFrm, (outs), (ins),
+ "push{w}\t{%es|es}", [], IIC_PUSH_SR>,
+ OpSize16, Requires<[Not64BitMode]>;
+def PUSHES32 : I<0x06, RawFrm, (outs), (ins),
+ "push{l}\t{%es|es}", [], IIC_PUSH_SR>,
+ OpSize32, Requires<[Not64BitMode]>;
+def PUSHFS16 : I<0xa0, RawFrm, (outs), (ins),
+ "push{w}\t{%fs|fs}", [], IIC_PUSH_SR>, OpSize16, TB;
+def PUSHFS32 : I<0xa0, RawFrm, (outs), (ins),
+ "push{l}\t{%fs|fs}", [], IIC_PUSH_SR>, TB,
+ OpSize32, Requires<[Not64BitMode]>;
+def PUSHGS16 : I<0xa8, RawFrm, (outs), (ins),
+ "push{w}\t{%gs|gs}", [], IIC_PUSH_SR>, OpSize16, TB;
+def PUSHGS32 : I<0xa8, RawFrm, (outs), (ins),
+ "push{l}\t{%gs|gs}", [], IIC_PUSH_SR>, TB,
+ OpSize32, Requires<[Not64BitMode]>;
+def PUSHFS64 : I<0xa0, RawFrm, (outs), (ins),
+ "push{q}\t{%fs|fs}", [], IIC_PUSH_SR>, TB,
+ OpSize32, Requires<[In64BitMode]>;
+def PUSHGS64 : I<0xa8, RawFrm, (outs), (ins),
+ "push{q}\t{%gs|gs}", [], IIC_PUSH_SR>, TB,
+ OpSize32, Requires<[In64BitMode]>;
+
+// No "pop cs" instruction.
+def POPSS16 : I<0x17, RawFrm, (outs), (ins),
+ "pop{w}\t{%ss|ss}", [], IIC_POP_SR_SS>,
+ OpSize16, Requires<[Not64BitMode]>;
+def POPSS32 : I<0x17, RawFrm, (outs), (ins),
+ "pop{l}\t{%ss|ss}", [], IIC_POP_SR_SS>,
+ OpSize32, Requires<[Not64BitMode]>;
+
+def POPDS16 : I<0x1F, RawFrm, (outs), (ins),
+ "pop{w}\t{%ds|ds}", [], IIC_POP_SR>,
+ OpSize16, Requires<[Not64BitMode]>;
+def POPDS32 : I<0x1F, RawFrm, (outs), (ins),
+ "pop{l}\t{%ds|ds}", [], IIC_POP_SR>,
+ OpSize32, Requires<[Not64BitMode]>;
+
+def POPES16 : I<0x07, RawFrm, (outs), (ins),
+ "pop{w}\t{%es|es}", [], IIC_POP_SR>,
+ OpSize16, Requires<[Not64BitMode]>;
+def POPES32 : I<0x07, RawFrm, (outs), (ins),
+ "pop{l}\t{%es|es}", [], IIC_POP_SR>,
+ OpSize32, Requires<[Not64BitMode]>;
+
+def POPFS16 : I<0xa1, RawFrm, (outs), (ins),
+ "pop{w}\t{%fs|fs}", [], IIC_POP_SR>, OpSize16, TB;
+def POPFS32 : I<0xa1, RawFrm, (outs), (ins),
+ "pop{l}\t{%fs|fs}", [], IIC_POP_SR>, TB,
+ OpSize32, Requires<[Not64BitMode]>;
+def POPFS64 : I<0xa1, RawFrm, (outs), (ins),
+ "pop{q}\t{%fs|fs}", [], IIC_POP_SR>, TB,
+ OpSize32, Requires<[In64BitMode]>;
+
+def POPGS16 : I<0xa9, RawFrm, (outs), (ins),
+ "pop{w}\t{%gs|gs}", [], IIC_POP_SR>, OpSize16, TB;
+def POPGS32 : I<0xa9, RawFrm, (outs), (ins),
+ "pop{l}\t{%gs|gs}", [], IIC_POP_SR>, TB,
+ OpSize32, Requires<[Not64BitMode]>;
+def POPGS64 : I<0xa9, RawFrm, (outs), (ins),
+ "pop{q}\t{%gs|gs}", [], IIC_POP_SR>, TB,
+ OpSize32, Requires<[In64BitMode]>;
+
+def LDS16rm : I<0xc5, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src),
+ "lds{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, OpSize16;
+def LDS32rm : I<0xc5, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src),
+ "lds{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, OpSize32;
+
+def LSS16rm : I<0xb2, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src),
+ "lss{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize16;
+def LSS32rm : I<0xb2, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src),
+ "lss{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize32;
+def LSS64rm : RI<0xb2, MRMSrcMem, (outs GR64:$dst), (ins opaque80mem:$src),
+ "lss{q}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB;
+
+def LES16rm : I<0xc4, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src),
+ "les{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, OpSize16;
+def LES32rm : I<0xc4, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src),
+ "les{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, OpSize32;
+
+def LFS16rm : I<0xb4, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src),
+ "lfs{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize16;
+def LFS32rm : I<0xb4, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src),
+ "lfs{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize32;
+def LFS64rm : RI<0xb4, MRMSrcMem, (outs GR64:$dst), (ins opaque80mem:$src),
+ "lfs{q}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB;
+
+def LGS16rm : I<0xb5, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src),
+ "lgs{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize16;
+def LGS32rm : I<0xb5, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src),
+ "lgs{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize32;
+def LGS64rm : RI<0xb5, MRMSrcMem, (outs GR64:$dst), (ins opaque80mem:$src),
+ "lgs{q}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB;
+
+def VERRr : I<0x00, MRM4r, (outs), (ins GR16:$seg),
+ "verr\t$seg", [], IIC_VERR>, TB;
+def VERRm : I<0x00, MRM4m, (outs), (ins i16mem:$seg),
+ "verr\t$seg", [], IIC_VERR>, TB;
+def VERWr : I<0x00, MRM5r, (outs), (ins GR16:$seg),
+              "verw\t$seg", [], IIC_VERW_REG>, TB;
+def VERWm : I<0x00, MRM5m, (outs), (ins i16mem:$seg),
+              "verw\t$seg", [], IIC_VERW_MEM>, TB;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Descriptor-table support instructions
+
+let SchedRW = [WriteSystem] in {
+def SGDT16m : I<0x01, MRM0m, (outs opaque48mem:$dst), (ins),
+                "sgdt{w}\t$dst", [], IIC_SGDT>, TB, OpSize16, Requires<[Not64BitMode]>;
+def SGDT32m : I<0x01, MRM0m, (outs opaque48mem:$dst), (ins),
+                "sgdt{l}\t$dst", [], IIC_SGDT>, OpSize32, TB, Requires<[Not64BitMode]>;
+def SGDT64m : I<0x01, MRM0m, (outs opaque80mem:$dst), (ins),
+                "sgdt{q}\t$dst", [], IIC_SGDT>, TB, Requires<[In64BitMode]>;
+def SIDT16m : I<0x01, MRM1m, (outs opaque48mem:$dst), (ins),
+                "sidt{w}\t$dst", [], IIC_SIDT>, TB, OpSize16, Requires<[Not64BitMode]>;
+def SIDT32m : I<0x01, MRM1m, (outs opaque48mem:$dst), (ins),
+                "sidt{l}\t$dst", [], IIC_SIDT>, OpSize32, TB, Requires<[Not64BitMode]>;
+def SIDT64m : I<0x01, MRM1m, (outs opaque80mem:$dst), (ins),
+                "sidt{q}\t$dst", [], IIC_SIDT>, TB, Requires<[In64BitMode]>;
+def SLDT16r : I<0x00, MRM0r, (outs GR16:$dst), (ins),
+ "sldt{w}\t$dst", [], IIC_SLDT>, TB, OpSize16;
+def SLDT16m : I<0x00, MRM0m, (outs i16mem:$dst), (ins),
+ "sldt{w}\t$dst", [], IIC_SLDT>, TB;
+def SLDT32r : I<0x00, MRM0r, (outs GR32:$dst), (ins),
+ "sldt{l}\t$dst", [], IIC_SLDT>, OpSize32, TB;
+
+// SLDT is not interpreted specially in 64-bit mode because there is no sign
+//   extension.
+def SLDT64r : RI<0x00, MRM0r, (outs GR64:$dst), (ins),
+ "sldt{q}\t$dst", [], IIC_SLDT>, TB;
+def SLDT64m : RI<0x00, MRM0m, (outs i16mem:$dst), (ins),
+ "sldt{q}\t$dst", [], IIC_SLDT>, TB;
+
+def LGDT16m : I<0x01, MRM2m, (outs), (ins opaque48mem:$src),
+ "lgdt{w}\t$src", [], IIC_LGDT>, TB, OpSize16, Requires<[Not64BitMode]>;
+def LGDT32m : I<0x01, MRM2m, (outs), (ins opaque48mem:$src),
+ "lgdt{l}\t$src", [], IIC_LGDT>, OpSize32, TB, Requires<[Not64BitMode]>;
+def LGDT64m : I<0x01, MRM2m, (outs), (ins opaque80mem:$src),
+ "lgdt{q}\t$src", [], IIC_LGDT>, TB, Requires<[In64BitMode]>;
+def LIDT16m : I<0x01, MRM3m, (outs), (ins opaque48mem:$src),
+ "lidt{w}\t$src", [], IIC_LIDT>, TB, OpSize16, Requires<[Not64BitMode]>;
+def LIDT32m : I<0x01, MRM3m, (outs), (ins opaque48mem:$src),
+ "lidt{l}\t$src", [], IIC_LIDT>, OpSize32, TB, Requires<[Not64BitMode]>;
+def LIDT64m : I<0x01, MRM3m, (outs), (ins opaque80mem:$src),
+ "lidt{q}\t$src", [], IIC_LIDT>, TB, Requires<[In64BitMode]>;
+def LLDT16r : I<0x00, MRM2r, (outs), (ins GR16:$src),
+ "lldt{w}\t$src", [], IIC_LLDT_REG>, TB;
+def LLDT16m : I<0x00, MRM2m, (outs), (ins i16mem:$src),
+ "lldt{w}\t$src", [], IIC_LLDT_MEM>, TB;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Specialized register support
+let SchedRW = [WriteSystem] in {
+let Uses = [EAX, ECX, EDX] in
+def WRMSR : I<0x30, RawFrm, (outs), (ins), "wrmsr", [], IIC_WRMSR>, TB;
+let Defs = [EAX, EDX], Uses = [ECX] in
+def RDMSR : I<0x32, RawFrm, (outs), (ins), "rdmsr", [], IIC_RDMSR>, TB;
+
+let Defs = [RAX, RDX], Uses = [ECX] in
+ def RDPMC : I<0x33, RawFrm, (outs), (ins), "rdpmc", [(X86rdpmc)], IIC_RDPMC>,
+ TB;
+
+def SMSW16r : I<0x01, MRM4r, (outs GR16:$dst), (ins),
+ "smsw{w}\t$dst", [], IIC_SMSW>, OpSize16, TB;
+def SMSW32r : I<0x01, MRM4r, (outs GR32:$dst), (ins),
+ "smsw{l}\t$dst", [], IIC_SMSW>, OpSize32, TB;
+// no 32-bit or 64-bit m form is encodable; use SMSW16m
+def SMSW64r : RI<0x01, MRM4r, (outs GR64:$dst), (ins),
+ "smsw{q}\t$dst", [], IIC_SMSW>, TB;
+
+// For memory operands, there is only a 16-bit form
+def SMSW16m : I<0x01, MRM4m, (outs i16mem:$dst), (ins),
+ "smsw{w}\t$dst", [], IIC_SMSW>, TB;
+
+def LMSW16r : I<0x01, MRM6r, (outs), (ins GR16:$src),
+                "lmsw{w}\t$src", [], IIC_LMSW_REG>, TB;
+def LMSW16m : I<0x01, MRM6m, (outs), (ins i16mem:$src),
+                "lmsw{w}\t$src", [], IIC_LMSW_MEM>, TB;
+
+let Defs = [EAX, EBX, ECX, EDX], Uses = [EAX, ECX] in
+ def CPUID : I<0xA2, RawFrm, (outs), (ins), "cpuid", [], IIC_CPUID>, TB;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Cache instructions
+let SchedRW = [WriteSystem] in {
+def INVD : I<0x08, RawFrm, (outs), (ins), "invd", [], IIC_INVD>, TB;
+def WBINVD : I<0x09, RawFrm, (outs), (ins), "wbinvd", [], IIC_INVD>, TB;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// XSAVE instructions
+let SchedRW = [WriteSystem] in {
+let Predicates = [HasXSAVE] in {
+let Defs = [EDX, EAX], Uses = [ECX] in
+ def XGETBV : I<0x01, MRM_D0, (outs), (ins), "xgetbv", []>, TB;
+
+let Uses = [EDX, EAX, ECX] in
+ def XSETBV : I<0x01, MRM_D1, (outs), (ins), "xsetbv", []>, TB;
+}
+
+let Uses = [EDX, EAX] in {
+let Predicates = [HasXSAVE] in {
+ def XSAVE : I<0xAE, MRM4m, (outs), (ins opaque512mem:$dst),
+ "xsave\t$dst",
+ [(int_x86_xsave addr:$dst, EDX, EAX)]>, TB;
+ def XSAVE64 : RI<0xAE, MRM4m, (outs), (ins opaque512mem:$dst),
+ "xsave64\t$dst",
+ [(int_x86_xsave64 addr:$dst, EDX, EAX)]>, TB, Requires<[In64BitMode]>;
+ def XRSTOR : I<0xAE, MRM5m, (outs), (ins opaque512mem:$dst),
+ "xrstor\t$dst",
+ [(int_x86_xrstor addr:$dst, EDX, EAX)]>, TB;
+ def XRSTOR64 : RI<0xAE, MRM5m, (outs), (ins opaque512mem:$dst),
+ "xrstor64\t$dst",
+ [(int_x86_xrstor64 addr:$dst, EDX, EAX)]>, TB, Requires<[In64BitMode]>;
+}
+let Predicates = [HasXSAVEOPT] in {
+ def XSAVEOPT : I<0xAE, MRM6m, (outs), (ins opaque512mem:$dst),
+ "xsaveopt\t$dst",
+ [(int_x86_xsaveopt addr:$dst, EDX, EAX)]>, PS;
+ def XSAVEOPT64 : RI<0xAE, MRM6m, (outs), (ins opaque512mem:$dst),
+ "xsaveopt64\t$dst",
+ [(int_x86_xsaveopt64 addr:$dst, EDX, EAX)]>, PS, Requires<[In64BitMode]>;
+}
+let Predicates = [HasXSAVEC] in {
+ def XSAVEC : I<0xC7, MRM4m, (outs), (ins opaque512mem:$dst),
+ "xsavec\t$dst",
+ [(int_x86_xsavec addr:$dst, EDX, EAX)]>, TB;
+ def XSAVEC64 : RI<0xC7, MRM4m, (outs), (ins opaque512mem:$dst),
+ "xsavec64\t$dst",
+ [(int_x86_xsavec64 addr:$dst, EDX, EAX)]>, TB, Requires<[In64BitMode]>;
+}
+let Predicates = [HasXSAVES] in {
+ def XSAVES : I<0xC7, MRM5m, (outs), (ins opaque512mem:$dst),
+ "xsaves\t$dst",
+ [(int_x86_xsaves addr:$dst, EDX, EAX)]>, TB;
+ def XSAVES64 : RI<0xC7, MRM5m, (outs), (ins opaque512mem:$dst),
+ "xsaves64\t$dst",
+ [(int_x86_xsaves64 addr:$dst, EDX, EAX)]>, TB, Requires<[In64BitMode]>;
+ def XRSTORS : I<0xC7, MRM3m, (outs), (ins opaque512mem:$dst),
+ "xrstors\t$dst",
+ [(int_x86_xrstors addr:$dst, EDX, EAX)]>, TB;
+ def XRSTORS64 : RI<0xC7, MRM3m, (outs), (ins opaque512mem:$dst),
+ "xrstors64\t$dst",
+ [(int_x86_xrstors64 addr:$dst, EDX, EAX)]>, TB, Requires<[In64BitMode]>;
+}
+} // Uses
+} // SchedRW
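+
+// All XSAVE-family definitions above take the requested-feature bitmap
+// implicitly in EDX:EAX, matching the hardware encoding. A hedged C-level
+// sketch using the usual <immintrin.h> wrapper (a name assumed here, not
+// defined in this file):
+//
+//   #include <immintrin.h>
+//   void save_state(void *buf, unsigned long long mask) {
+//     _xsave(buf, mask);   // the mask is split into EDX:EAX around XSAVE
+//   }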
+
+//===----------------------------------------------------------------------===//
+// VIA PadLock crypto instructions
+let Defs = [RAX, RDI], Uses = [RDX, RDI] in
+ def XSTORE : I<0xa7, MRM_C0, (outs), (ins), "xstore", []>, TB;
+
+def : InstAlias<"xstorerng", (XSTORE)>;
+
+let Defs = [RSI, RDI], Uses = [RBX, RDX, RSI, RDI] in {
+ def XCRYPTECB : I<0xa7, MRM_C8, (outs), (ins), "xcryptecb", []>, TB;
+ def XCRYPTCBC : I<0xa7, MRM_D0, (outs), (ins), "xcryptcbc", []>, TB;
+ def XCRYPTCTR : I<0xa7, MRM_D8, (outs), (ins), "xcryptctr", []>, TB;
+ def XCRYPTCFB : I<0xa7, MRM_E0, (outs), (ins), "xcryptcfb", []>, TB;
+ def XCRYPTOFB : I<0xa7, MRM_E8, (outs), (ins), "xcryptofb", []>, TB;
+}
+
+let Defs = [RAX, RSI, RDI], Uses = [RAX, RSI, RDI] in {
+ def XSHA1 : I<0xa6, MRM_C8, (outs), (ins), "xsha1", []>, TB;
+ def XSHA256 : I<0xa6, MRM_D0, (outs), (ins), "xsha256", []>, TB;
+}
+let Defs = [RAX, RDX, RSI], Uses = [RAX, RSI] in
+ def MONTMUL : I<0xa6, MRM_C0, (outs), (ins), "montmul", []>, TB;
+//===----------------------------------------------------------------------===//
+// PKU - Protection Key instructions
+let usesCustomInserter = 1 in {
+ def WRPKRU : PseudoI<(outs), (ins GR32:$src),
+ [(int_x86_wrpkru GR32:$src)]>;
+ def RDPKRU : PseudoI<(outs GR32:$dst), (ins),
+ [(set GR32:$dst, (int_x86_rdpkru))]>;
+}
+
+let Defs = [EAX, EDX], Uses = [ECX] in
+ def RDPKRUr : I<0x01, MRM_EE, (outs), (ins), "rdpkru", []>, TB;
+let Uses = [EAX, ECX, EDX] in
+ def WRPKRUr : I<0x01, MRM_EF, (outs), (ins), "wrpkru", []>, TB;
+
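+// A minimal usage sketch with the usual <immintrin.h> wrappers (assumed
+// names, not defined here). RDPKRU requires ECX == 0 and WRPKRU requires
+// ECX == EDX == 0, which the custom inserter for the pseudos arranges:
+//
+//   #include <immintrin.h>
+//   unsigned int deny_all_but_key0(void) {
+//     unsigned int old = _rdpkru_u32();
+//     _wrpkru(0xFFFFFFFCu);   // AD/WD set for keys 1-15, key 0 left open
+//     return old;
+//   }
+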
+//===----------------------------------------------------------------------===//
+// FS/GS Base Instructions
+let Predicates = [HasFSGSBase, In64BitMode] in {
+ def RDFSBASE : I<0xAE, MRM0r, (outs GR32:$dst), (ins),
+ "rdfsbase{l}\t$dst",
+ [(set GR32:$dst, (int_x86_rdfsbase_32))]>, XS;
+ def RDFSBASE64 : RI<0xAE, MRM0r, (outs GR64:$dst), (ins),
+ "rdfsbase{q}\t$dst",
+ [(set GR64:$dst, (int_x86_rdfsbase_64))]>, XS;
+ def RDGSBASE : I<0xAE, MRM1r, (outs GR32:$dst), (ins),
+ "rdgsbase{l}\t$dst",
+ [(set GR32:$dst, (int_x86_rdgsbase_32))]>, XS;
+ def RDGSBASE64 : RI<0xAE, MRM1r, (outs GR64:$dst), (ins),
+ "rdgsbase{q}\t$dst",
+ [(set GR64:$dst, (int_x86_rdgsbase_64))]>, XS;
+ def WRFSBASE : I<0xAE, MRM2r, (outs), (ins GR32:$src),
+ "wrfsbase{l}\t$src",
+ [(int_x86_wrfsbase_32 GR32:$src)]>, XS;
+ def WRFSBASE64 : RI<0xAE, MRM2r, (outs), (ins GR64:$src),
+ "wrfsbase{q}\t$src",
+ [(int_x86_wrfsbase_64 GR64:$src)]>, XS;
+ def WRGSBASE : I<0xAE, MRM3r, (outs), (ins GR32:$src),
+ "wrgsbase{l}\t$src",
+ [(int_x86_wrgsbase_32 GR32:$src)]>, XS;
+ def WRGSBASE64 : RI<0xAE, MRM3r, (outs), (ins GR64:$src),
+ "wrgsbase{q}\t$src",
+ [(int_x86_wrgsbase_64 GR64:$src)]>, XS;
+}
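+
+// Sketch of the matching <immintrin.h> entry points (assumed names, shown for
+// orientation; the OS must have set CR4.FSGSBASE for these to be usable):
+//
+//   #include <immintrin.h>
+//   unsigned long long get_fs_base(void) { return _readfsbase_u64(); }
+//   void set_gs_base(unsigned long long b) { _writegsbase_u64(b); }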
+
+//===----------------------------------------------------------------------===//
+// INVPCID Instruction
+def INVPCID32 : I<0x82, MRMSrcMem, (outs), (ins GR32:$src1, i128mem:$src2),
+ "invpcid\t{$src2, $src1|$src1, $src2}", []>, T8PD,
+ Requires<[Not64BitMode]>;
+def INVPCID64 : I<0x82, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2),
+ "invpcid\t{$src2, $src1|$src1, $src2}", []>, T8PD,
+ Requires<[In64BitMode]>;
+
+//===----------------------------------------------------------------------===//
+// SMAP Instruction
+let Defs = [EFLAGS] in {
+ def CLAC : I<0x01, MRM_CA, (outs), (ins), "clac", []>, TB;
+ def STAC : I<0x01, MRM_CB, (outs), (ins), "stac", []>, TB;
+}
+
+//===----------------------------------------------------------------------===//
+// SMX Instruction
+let Uses = [RAX, RBX, RCX, RDX], Defs = [RAX, RBX, RCX] in {
+ def GETSEC : I<0x37, RawFrm, (outs), (ins), "getsec", []>, TB;
+}
diff --git a/contrib/llvm/lib/Target/X86/X86InstrTSX.td b/contrib/llvm/lib/Target/X86/X86InstrTSX.td
new file mode 100644
index 0000000..7267d75
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrTSX.td
@@ -0,0 +1,50 @@
+//===-- X86InstrTSX.td - TSX Instruction Set Extension -----*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the instructions that make up the Intel TSX instruction
+// set.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// TSX instructions
+
+def X86xtest : SDNode<"X86ISD::XTEST", SDTypeProfile<1, 0, [SDTCisVT<0, i32>]>,
+ [SDNPHasChain, SDNPSideEffect]>;
+
+let usesCustomInserter = 1 in
+def XBEGIN : I<0, Pseudo, (outs GR32:$dst), (ins),
+ "# XBEGIN", [(set GR32:$dst, (int_x86_xbegin))]>,
+ Requires<[HasRTM]>;
+
+let isBranch = 1, isTerminator = 1, Defs = [EAX] in {
+def XBEGIN_2 : Ii16PCRel<0xc7, MRM_F8, (outs), (ins brtarget16:$dst),
+ "xbegin\t$dst", []>, OpSize16, Requires<[HasRTM]>;
+def XBEGIN_4 : Ii32PCRel<0xc7, MRM_F8, (outs), (ins brtarget32:$dst),
+ "xbegin\t$dst", []>, OpSize32, Requires<[HasRTM]>;
+}
+
+def XEND : I<0x01, MRM_D5, (outs), (ins),
+ "xend", [(int_x86_xend)]>, TB, Requires<[HasRTM]>;
+
+let Defs = [EFLAGS] in
+def XTEST : I<0x01, MRM_D6, (outs), (ins),
+ "xtest", [(set EFLAGS, (X86xtest))]>, TB, Requires<[HasTSX]>;
+
+def XABORT : Ii8<0xc6, MRM_F8, (outs), (ins i8imm:$imm),
+ "xabort\t$imm",
+ [(int_x86_xabort imm:$imm)]>, Requires<[HasRTM]>;
+
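+// Rough shape of an RTM transaction built on the intrinsics these
+// definitions back (<immintrin.h> names, assumed rather than defined here):
+//
+//   #include <immintrin.h>
+//   int tx_increment(int *p) {
+//     if (_xbegin() == _XBEGIN_STARTED) {   // XBEGIN
+//       ++*p;
+//       _xend();                            // XEND commits the transaction
+//       return 1;
+//     }
+//     return 0;                             // aborted; caller takes a lock
+//   }
+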
+// HLE prefixes
+
+let isAsmParserOnly = 1 in {
+def XACQUIRE_PREFIX : I<0xF2, RawFrm, (outs), (ins), "xacquire", []>,
+                      Requires<[HasHLE]>;
+def XRELEASE_PREFIX : I<0xF3, RawFrm, (outs), (ins), "xrelease", []>,
+                      Requires<[HasHLE]>;
+}
+
diff --git a/contrib/llvm/lib/Target/X86/X86InstrVMX.td b/contrib/llvm/lib/Target/X86/X86InstrVMX.td
new file mode 100644
index 0000000..79afe9a
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrVMX.td
@@ -0,0 +1,66 @@
+//===-- X86InstrVMX.td - VMX Instruction Set Extension -----*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the instructions that make up the Intel VMX instruction
+// set.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// VMX instructions
+
+// 66 0F 38 80
+def INVEPT32 : I<0x80, MRMSrcMem, (outs), (ins GR32:$src1, i128mem:$src2),
+ "invept\t{$src2, $src1|$src1, $src2}", []>, T8PD,
+ Requires<[Not64BitMode]>;
+def INVEPT64 : I<0x80, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2),
+ "invept\t{$src2, $src1|$src1, $src2}", []>, T8PD,
+ Requires<[In64BitMode]>;
+// 66 0F 38 81
+def INVVPID32 : I<0x81, MRMSrcMem, (outs), (ins GR32:$src1, i128mem:$src2),
+ "invvpid\t{$src2, $src1|$src1, $src2}", []>, T8PD,
+ Requires<[Not64BitMode]>;
+def INVVPID64 : I<0x81, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2),
+ "invvpid\t{$src2, $src1|$src1, $src2}", []>, T8PD,
+ Requires<[In64BitMode]>;
+// 0F 01 C1
+def VMCALL : I<0x01, MRM_C1, (outs), (ins), "vmcall", []>, TB;
+def VMCLEARm : I<0xC7, MRM6m, (outs), (ins i64mem:$vmcs),
+ "vmclear\t$vmcs", []>, PD;
+// 0F 01 D4
+def VMFUNC : I<0x01, MRM_D4, (outs), (ins), "vmfunc", []>, TB;
+// 0F 01 C2
+def VMLAUNCH : I<0x01, MRM_C2, (outs), (ins), "vmlaunch", []>, TB;
+// 0F 01 C3
+def VMRESUME : I<0x01, MRM_C3, (outs), (ins), "vmresume", []>, TB;
+def VMPTRLDm : I<0xC7, MRM6m, (outs), (ins i64mem:$vmcs),
+ "vmptrld\t$vmcs", []>, PS;
+def VMPTRSTm : I<0xC7, MRM7m, (outs i64mem:$vmcs), (ins),
+ "vmptrst\t$vmcs", []>, TB;
+def VMREAD64rm : I<0x78, MRMDestMem, (outs i64mem:$dst), (ins GR64:$src),
+ "vmread{q}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[In64BitMode]>;
+def VMREAD64rr : I<0x78, MRMDestReg, (outs GR64:$dst), (ins GR64:$src),
+ "vmread{q}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[In64BitMode]>;
+def VMREAD32rm : I<0x78, MRMDestMem, (outs i32mem:$dst), (ins GR32:$src),
+ "vmread{l}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[Not64BitMode]>;
+def VMREAD32rr : I<0x78, MRMDestReg, (outs GR32:$dst), (ins GR32:$src),
+ "vmread{l}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[Not64BitMode]>;
+def VMWRITE64rm : I<0x79, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+ "vmwrite{q}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[In64BitMode]>;
+def VMWRITE64rr : I<0x79, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+ "vmwrite{q}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[In64BitMode]>;
+def VMWRITE32rm : I<0x79, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+ "vmwrite{l}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[Not64BitMode]>;
+def VMWRITE32rr : I<0x79, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+ "vmwrite{l}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[Not64BitMode]>;
+// 0F 01 C4
+def VMXOFF : I<0x01, MRM_C4, (outs), (ins), "vmxoff", []>, TB;
+def VMXON : I<0xC7, MRM6m, (outs), (ins i64mem:$vmxon),
+ "vmxon\t$vmxon", []>, XS;
+
diff --git a/contrib/llvm/lib/Target/X86/X86InstrXOP.td b/contrib/llvm/lib/Target/X86/X86InstrXOP.td
new file mode 100644
index 0000000..4cb2304
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrXOP.td
@@ -0,0 +1,344 @@
+//===-- X86InstrXOP.td - XOP Instruction Set ---------------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes XOP (eXtended OPerations)
+//
+//===----------------------------------------------------------------------===//
+
+multiclass xop2op<bits<8> opc, string OpcodeStr, Intrinsic Int, PatFrag memop> {
+ def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (Int VR128:$src))]>, XOP;
+ def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (Int (bitconvert (memop addr:$src))))]>, XOP;
+}
+
+let ExeDomain = SSEPackedInt in {
+ defm VPHSUBWD : xop2op<0xE2, "vphsubwd", int_x86_xop_vphsubwd, loadv2i64>;
+ defm VPHSUBDQ : xop2op<0xE3, "vphsubdq", int_x86_xop_vphsubdq, loadv2i64>;
+ defm VPHSUBBW : xop2op<0xE1, "vphsubbw", int_x86_xop_vphsubbw, loadv2i64>;
+ defm VPHADDWQ : xop2op<0xC7, "vphaddwq", int_x86_xop_vphaddwq, loadv2i64>;
+ defm VPHADDWD : xop2op<0xC6, "vphaddwd", int_x86_xop_vphaddwd, loadv2i64>;
+ defm VPHADDUWQ : xop2op<0xD7, "vphadduwq", int_x86_xop_vphadduwq, loadv2i64>;
+ defm VPHADDUWD : xop2op<0xD6, "vphadduwd", int_x86_xop_vphadduwd, loadv2i64>;
+ defm VPHADDUDQ : xop2op<0xDB, "vphaddudq", int_x86_xop_vphaddudq, loadv2i64>;
+ defm VPHADDUBW : xop2op<0xD1, "vphaddubw", int_x86_xop_vphaddubw, loadv2i64>;
+ defm VPHADDUBQ : xop2op<0xD3, "vphaddubq", int_x86_xop_vphaddubq, loadv2i64>;
+ defm VPHADDUBD : xop2op<0xD2, "vphaddubd", int_x86_xop_vphaddubd, loadv2i64>;
+ defm VPHADDDQ : xop2op<0xCB, "vphadddq", int_x86_xop_vphadddq, loadv2i64>;
+ defm VPHADDBW : xop2op<0xC1, "vphaddbw", int_x86_xop_vphaddbw, loadv2i64>;
+ defm VPHADDBQ : xop2op<0xC3, "vphaddbq", int_x86_xop_vphaddbq, loadv2i64>;
+ defm VPHADDBD : xop2op<0xC2, "vphaddbd", int_x86_xop_vphaddbd, loadv2i64>;
+}
+
+// Scalar load 2 addr operand instructions
+multiclass xop2opsld<bits<8> opc, string OpcodeStr, Intrinsic Int,
+ Operand memop, ComplexPattern mem_cpat> {
+ def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (Int VR128:$src))]>, XOP;
+ def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins memop:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (Int (bitconvert mem_cpat:$src)))]>, XOP;
+}
+
+multiclass xop2op128<bits<8> opc, string OpcodeStr, Intrinsic Int,
+ PatFrag memop> {
+ def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (Int VR128:$src))]>, XOP;
+ def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (Int (bitconvert (memop addr:$src))))]>, XOP;
+}
+
+multiclass xop2op256<bits<8> opc, string OpcodeStr, Intrinsic Int,
+ PatFrag memop> {
+ def rrY : IXOP<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR256:$dst, (Int VR256:$src))]>, XOP, VEX_L;
+ def rmY : IXOP<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR256:$dst, (Int (bitconvert (memop addr:$src))))]>, XOP, VEX_L;
+}
+
+let ExeDomain = SSEPackedSingle in {
+ defm VFRCZSS : xop2opsld<0x82, "vfrczss", int_x86_xop_vfrcz_ss,
+ ssmem, sse_load_f32>;
+ defm VFRCZPS : xop2op128<0x80, "vfrczps", int_x86_xop_vfrcz_ps, loadv4f32>;
+ defm VFRCZPS : xop2op256<0x80, "vfrczps", int_x86_xop_vfrcz_ps_256, loadv8f32>;
+}
+
+let ExeDomain = SSEPackedDouble in {
+ defm VFRCZSD : xop2opsld<0x83, "vfrczsd", int_x86_xop_vfrcz_sd,
+ sdmem, sse_load_f64>;
+ defm VFRCZPD : xop2op128<0x81, "vfrczpd", int_x86_xop_vfrcz_pd, loadv2f64>;
+ defm VFRCZPD : xop2op256<0x81, "vfrczpd", int_x86_xop_vfrcz_pd_256, loadv4f64>;
+}
+
+multiclass xop3op<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ ValueType vt128> {
+ def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst,
+ (vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2))))]>,
+ XOP_4VOp3, Sched<[WriteVarVecShift]>;
+ def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst,
+ (vt128 (OpNode (vt128 VR128:$src1),
+ (vt128 (bitconvert (loadv2i64 addr:$src2))))))]>,
+ XOP_4V, VEX_W, Sched<[WriteVarVecShift, ReadAfterLd]>;
+ def mr : IXOP<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins i128mem:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst,
+ (vt128 (OpNode (vt128 (bitconvert (loadv2i64 addr:$src1))),
+ (vt128 VR128:$src2))))]>,
+ XOP_4VOp3, Sched<[WriteVarVecShift, ReadAfterLd]>;
+}
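+
+// In the multiclass above, rm and mr share one opcode; the VEX.W bit (set on
+// the rm form) tells the decoder which of the two sources is the memory
+// operand, so a single opcode serves both operand orders.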
+
+let ExeDomain = SSEPackedInt in {
+ defm VPROTB : xop3op<0x90, "vprotb", X86vprot, v16i8>;
+ defm VPROTD : xop3op<0x92, "vprotd", X86vprot, v4i32>;
+ defm VPROTQ : xop3op<0x93, "vprotq", X86vprot, v2i64>;
+ defm VPROTW : xop3op<0x91, "vprotw", X86vprot, v8i16>;
+ defm VPSHAB : xop3op<0x98, "vpshab", X86vpsha, v16i8>;
+ defm VPSHAD : xop3op<0x9A, "vpshad", X86vpsha, v4i32>;
+ defm VPSHAQ : xop3op<0x9B, "vpshaq", X86vpsha, v2i64>;
+ defm VPSHAW : xop3op<0x99, "vpshaw", X86vpsha, v8i16>;
+ defm VPSHLB : xop3op<0x94, "vpshlb", X86vpshl, v16i8>;
+ defm VPSHLD : xop3op<0x96, "vpshld", X86vpshl, v4i32>;
+ defm VPSHLQ : xop3op<0x97, "vpshlq", X86vpshl, v2i64>;
+ defm VPSHLW : xop3op<0x95, "vpshlw", X86vpshl, v8i16>;
+}
+
+multiclass xop3opimm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ ValueType vt128> {
+ def ri : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, u8imm:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst,
+ (vt128 (OpNode (vt128 VR128:$src1), imm:$src2)))]>, XOP;
+ def mi : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins i128mem:$src1, u8imm:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst,
+ (vt128 (OpNode (vt128 (bitconvert (loadv2i64 addr:$src1))), imm:$src2)))]>, XOP;
+}
+
+let ExeDomain = SSEPackedInt in {
+ defm VPROTB : xop3opimm<0xC0, "vprotb", X86vproti, v16i8>;
+ defm VPROTD : xop3opimm<0xC2, "vprotd", X86vproti, v4i32>;
+ defm VPROTQ : xop3opimm<0xC3, "vprotq", X86vproti, v2i64>;
+ defm VPROTW : xop3opimm<0xC1, "vprotw", X86vproti, v8i16>;
+}
+
+// Instruction where second source can be memory, but third must be register
+multiclass xop4opm2<bits<8> opc, string OpcodeStr, Intrinsic Int> {
+ let isCommutable = 1 in
+ def rr : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, VR128:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR128:$dst,
+ (Int VR128:$src1, VR128:$src2, VR128:$src3))]>, XOP_4V, VEX_I8IMM;
+ def rm : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2, VR128:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR128:$dst,
+ (Int VR128:$src1, (bitconvert (loadv2i64 addr:$src2)),
+ VR128:$src3))]>, XOP_4V, VEX_I8IMM;
+}
+
+let ExeDomain = SSEPackedInt in {
+ defm VPMADCSWD : xop4opm2<0xB6, "vpmadcswd", int_x86_xop_vpmadcswd>;
+ defm VPMADCSSWD : xop4opm2<0xA6, "vpmadcsswd", int_x86_xop_vpmadcsswd>;
+ defm VPMACSWW : xop4opm2<0x95, "vpmacsww", int_x86_xop_vpmacsww>;
+ defm VPMACSWD : xop4opm2<0x96, "vpmacswd", int_x86_xop_vpmacswd>;
+ defm VPMACSSWW : xop4opm2<0x85, "vpmacssww", int_x86_xop_vpmacssww>;
+ defm VPMACSSWD : xop4opm2<0x86, "vpmacsswd", int_x86_xop_vpmacsswd>;
+ defm VPMACSSDQL : xop4opm2<0x87, "vpmacssdql", int_x86_xop_vpmacssdql>;
+ defm VPMACSSDQH : xop4opm2<0x8F, "vpmacssdqh", int_x86_xop_vpmacssdqh>;
+ defm VPMACSSDD : xop4opm2<0x8E, "vpmacssdd", int_x86_xop_vpmacssdd>;
+ defm VPMACSDQL : xop4opm2<0x97, "vpmacsdql", int_x86_xop_vpmacsdql>;
+ defm VPMACSDQH : xop4opm2<0x9F, "vpmacsdqh", int_x86_xop_vpmacsdqh>;
+ defm VPMACSDD : xop4opm2<0x9E, "vpmacsdd", int_x86_xop_vpmacsdd>;
+}
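+
+// Illustrative only: these are 4-operand multiply-accumulate instructions,
+// e.g. in AT&T syntax
+//   vpmacsww %xmm3, %xmm2, %xmm1, %xmm0
+// computes xmm0 = (xmm1 * xmm2) + xmm3 per word element (the VPMACSS*
+// variants saturate). In the rm form, $src2 may come from memory while the
+// accumulator $src3 must stay in a register.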
+
+// Instructions where the second source can be memory, but the third must be
+// an imm8.
+multiclass xopvpcom<bits<8> opc, string Suffix, SDNode OpNode, ValueType vt128> {
+ let isCommutable = 1 in
+ def ri : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, XOPCC:$cc),
+ !strconcat("vpcom${cc}", Suffix,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst,
+ (vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2),
+ i8immZExt3:$cc)))]>,
+ XOP_4V;
+ def mi : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2, XOPCC:$cc),
+ !strconcat("vpcom${cc}", Suffix,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst,
+ (vt128 (OpNode (vt128 VR128:$src1),
+ (vt128 (bitconvert (loadv2i64 addr:$src2))),
+ i8immZExt3:$cc)))]>,
+ XOP_4V;
+ let isAsmParserOnly = 1, hasSideEffects = 0 in {
+ def ri_alt : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, u8imm:$src3),
+ !strconcat("vpcom", Suffix,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ []>, XOP_4V;
+ let mayLoad = 1 in
+ def mi_alt : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
+ !strconcat("vpcom", Suffix,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ []>, XOP_4V;
+ }
+}
+
+let ExeDomain = SSEPackedInt in { // SSE integer instructions
+ defm VPCOMB : xopvpcom<0xCC, "b", X86vpcom, v16i8>;
+ defm VPCOMW : xopvpcom<0xCD, "w", X86vpcom, v8i16>;
+ defm VPCOMD : xopvpcom<0xCE, "d", X86vpcom, v4i32>;
+ defm VPCOMQ : xopvpcom<0xCF, "q", X86vpcom, v2i64>;
+ defm VPCOMUB : xopvpcom<0xEC, "ub", X86vpcomu, v16i8>;
+ defm VPCOMUW : xopvpcom<0xED, "uw", X86vpcomu, v8i16>;
+ defm VPCOMUD : xopvpcom<0xEE, "ud", X86vpcomu, v4i32>;
+ defm VPCOMUQ : xopvpcom<0xEF, "uq", X86vpcomu, v2i64>;
+}
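+
+// Illustrative only: the XOPCC operand lets the parser accept condition-code
+// mnemonics, so "vpcomltb %xmm2, %xmm1, %xmm0" is the $cc = 0 form (assuming
+// the usual XOP encoding where lt is 0), while the _alt patterns accept the
+// raw-immediate spelling "vpcomb $0, %xmm2, %xmm1, %xmm0".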
+
+// Instructions where either the second or the third source can be memory.
+multiclass xop4op<bits<8> opc, string OpcodeStr, Intrinsic Int> {
+ def rr : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, VR128:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR128:$dst, (Int VR128:$src1, VR128:$src2, VR128:$src3))]>,
+ XOP_4V, VEX_I8IMM;
+ def rm : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, i128mem:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR128:$dst,
+ (Int VR128:$src1, VR128:$src2,
+ (bitconvert (loadv2i64 addr:$src3))))]>,
+ XOP_4V, VEX_I8IMM, VEX_W, MemOp4;
+ def mr : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2, VR128:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR128:$dst,
+ (Int VR128:$src1, (bitconvert (loadv2i64 addr:$src2)),
+ VR128:$src3))]>,
+ XOP_4V, VEX_I8IMM;
+}
+
+let ExeDomain = SSEPackedInt in {
+ defm VPPERM : xop4op<0xA3, "vpperm", int_x86_xop_vpperm>;
+ defm VPCMOV : xop4op<0xA2, "vpcmov", int_x86_xop_vpcmov>;
+}
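+
+// Illustrative only: the fourth register is encoded in the immediate byte
+// (VEX_I8IMM), and VEX_W together with MemOp4 selects whether the memory
+// operand is $src3 (rm) or $src2 (mr), so all three forms share one opcode.
+// For example, "vpperm %xmm3, %xmm2, %xmm1, %xmm0" (AT&T) selects each result
+// byte from the concatenation of $src1:$src2 under control of $src3.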
+
+multiclass xop4op256<bits<8> opc, string OpcodeStr, Intrinsic Int> {
+ def rrY : IXOPi8<opc, MRMSrcReg, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2, VR256:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR256:$dst, (Int VR256:$src1, VR256:$src2, VR256:$src3))]>,
+ XOP_4V, VEX_I8IMM, VEX_L;
+ def rmY : IXOPi8<opc, MRMSrcMem, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2, i256mem:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR256:$dst,
+ (Int VR256:$src1, VR256:$src2,
+ (bitconvert (loadv4i64 addr:$src3))))]>,
+ XOP_4V, VEX_I8IMM, VEX_W, MemOp4, VEX_L;
+  def mrY : IXOPi8<opc, MRMSrcMem, (outs VR256:$dst),
+                 (ins VR256:$src1, i256mem:$src2, VR256:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR256:$dst,
+ (Int VR256:$src1, (bitconvert (loadv4i64 addr:$src2)),
+ VR256:$src3))]>,
+ XOP_4V, VEX_I8IMM, VEX_L;
+}
+
+let ExeDomain = SSEPackedInt in
+ defm VPCMOV : xop4op256<0xA2, "vpcmov", int_x86_xop_vpcmov_256>;
+
+let Predicates = [HasXOP] in {
+ def : Pat<(v2i64 (or (and VR128:$src3, VR128:$src1),
+ (X86andnp VR128:$src3, VR128:$src2))),
+ (VPCMOVrr VR128:$src1, VR128:$src2, VR128:$src3)>;
+
+ def : Pat<(v4i64 (or (and VR256:$src3, VR256:$src1),
+ (X86andnp VR256:$src3, VR256:$src2))),
+ (VPCMOVrrY VR256:$src1, VR256:$src2, VR256:$src3)>;
+}
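+
+// Illustrative only: the patterns above match a bitwise select,
+//   dst = (src1 & src3) | (src2 & ~src3)
+// i.e. VPCMOV takes each bit from $src1 where the corresponding bit of $src3
+// is set, and from $src2 where it is clear.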
+
+multiclass xop5op<bits<8> opc, string OpcodeStr, Intrinsic Int128,
+ Intrinsic Int256, PatFrag ld_128, PatFrag ld_256> {
+ def rr : IXOP5<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, VR128:$src3, u8imm:$src4),
+ !strconcat(OpcodeStr,
+ "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
+ [(set VR128:$dst,
+ (Int128 VR128:$src1, VR128:$src2, VR128:$src3, imm:$src4))]>;
+ def rm : IXOP5<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, f128mem:$src3, u8imm:$src4),
+ !strconcat(OpcodeStr,
+ "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
+ [(set VR128:$dst,
+ (Int128 VR128:$src1, VR128:$src2, (ld_128 addr:$src3), imm:$src4))]>,
+ VEX_W, MemOp4;
+ def mr : IXOP5<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, f128mem:$src2, VR128:$src3, u8imm:$src4),
+ !strconcat(OpcodeStr,
+ "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
+ [(set VR128:$dst,
+ (Int128 VR128:$src1, (ld_128 addr:$src2), VR128:$src3, imm:$src4))]>;
+ def rrY : IXOP5<opc, MRMSrcReg, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2, VR256:$src3, u8imm:$src4),
+ !strconcat(OpcodeStr,
+ "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
+ [(set VR256:$dst,
+ (Int256 VR256:$src1, VR256:$src2, VR256:$src3, imm:$src4))]>, VEX_L;
+ def rmY : IXOP5<opc, MRMSrcMem, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2, f256mem:$src3, u8imm:$src4),
+ !strconcat(OpcodeStr,
+ "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
+ [(set VR256:$dst,
+ (Int256 VR256:$src1, VR256:$src2, (ld_256 addr:$src3), imm:$src4))]>,
+ VEX_W, MemOp4, VEX_L;
+ def mrY : IXOP5<opc, MRMSrcMem, (outs VR256:$dst),
+ (ins VR256:$src1, f256mem:$src2, VR256:$src3, u8imm:$src4),
+ !strconcat(OpcodeStr,
+ "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
+ [(set VR256:$dst,
+ (Int256 VR256:$src1, (ld_256 addr:$src2), VR256:$src3, imm:$src4))]>,
+ VEX_L;
+}
+
+let ExeDomain = SSEPackedDouble in
+ defm VPERMIL2PD : xop5op<0x49, "vpermil2pd", int_x86_xop_vpermil2pd,
+ int_x86_xop_vpermil2pd_256, loadv2f64, loadv4f64>;
+
+let ExeDomain = SSEPackedSingle in
+ defm VPERMIL2PS : xop5op<0x48, "vpermil2ps", int_x86_xop_vpermil2ps,
+ int_x86_xop_vpermil2ps_256, loadv4f32, loadv8f32>;
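+
+// Illustrative only: the 5-operand form carries a trailing immediate, e.g.
+//   vpermil2ps $1, %xmm3, %xmm2, %xmm1, %xmm0   (AT&T)
+// where, assuming the documented VPERMIL2 encoding, the immediate's low two
+// bits (m2z) control the zeroing behaviour of the selectors.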
+
diff --git a/contrib/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/contrib/llvm/lib/Target/X86/X86IntrinsicsInfo.h
new file mode 100644
index 0000000..646b556
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86IntrinsicsInfo.h
@@ -0,0 +1,2042 @@
+//===-- X86IntrinsicsInfo.h - X86 Intrinsics ------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the details for lowering X86 intrinsics.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86INTRINSICSINFO_H
+#define LLVM_LIB_TARGET_X86_X86INTRINSICSINFO_H
+
+namespace llvm {
+
+enum IntrinsicType {
+ INTR_NO_TYPE,
+ GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, ADX, FPCLASS, FPCLASSS,
+ INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_2OP_IMM8, INTR_TYPE_3OP, INTR_TYPE_4OP,
+  CMP_MASK, CMP_MASK_CC, CMP_MASK_SCALAR_CC, VSHIFT, VSHIFT_MASK, COMI, COMI_RM,
+ INTR_TYPE_1OP_MASK, INTR_TYPE_1OP_MASK_RM,
+ INTR_TYPE_2OP_MASK, INTR_TYPE_2OP_MASK_RM, INTR_TYPE_2OP_IMM8_MASK,
+ INTR_TYPE_3OP_MASK, INTR_TYPE_3OP_MASK_RM, INTR_TYPE_3OP_IMM8_MASK,
+ FMA_OP_MASK, FMA_OP_MASKZ, FMA_OP_MASK3, VPERM_3OP_MASK,
+ VPERM_3OP_MASKZ, INTR_TYPE_SCALAR_MASK,
+ INTR_TYPE_SCALAR_MASK_RM, INTR_TYPE_3OP_SCALAR_MASK_RM,
+ COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM, BRCST_SUBVEC_TO_VEC,
+ TRUNCATE_TO_MEM_VI8, TRUNCATE_TO_MEM_VI16, TRUNCATE_TO_MEM_VI32,
+ EXPAND_FROM_MEM, BLEND, INSERT_SUBVEC,
+ TERLOG_OP_MASK, TERLOG_OP_MASKZ, BROADCASTM, KUNPCK, CONVERT_MASK_TO_VEC, CONVERT_TO_MASK
+};
+
+struct IntrinsicData {
+
+ unsigned Id;
+ IntrinsicType Type;
+ unsigned Opc0;
+ unsigned Opc1;
+
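+  // Strict weak ordering on Id: the tables below are kept sorted by Id so
+  // they can be binary-searched with std::lower_bound.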
+ bool operator<(const IntrinsicData &RHS) const {
+ return Id < RHS.Id;
+ }
+ bool operator==(const IntrinsicData &RHS) const {
+ return RHS.Id == Id;
+ }
+};
+
+#define X86_INTRINSIC_DATA(id, type, op0, op1) \
+ { Intrinsic::x86_##id, type, op0, op1 }
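+
+// Illustrative only: the macro pastes the x86_ prefix onto the first
+// argument, so
+//   X86_INTRINSIC_DATA(rdtsc, RDTSC, X86ISD::RDTSC_DAG, 0)
+// expands to
+//   { Intrinsic::x86_rdtsc, RDTSC, X86ISD::RDTSC_DAG, 0 }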
+
+/*
+ * IntrinsicsWithChain - intrinsics with side effects (a chain). The table
+ * must be kept sorted by Intrinsic ID, which corresponds to the alphabetical
+ * order of the intrinsic names, so that it can be binary-searched.
+ */
+static const IntrinsicData IntrinsicsWithChain[] = {
+ X86_INTRINSIC_DATA(addcarry_u32, ADX, X86ISD::ADC, 0),
+ X86_INTRINSIC_DATA(addcarry_u64, ADX, X86ISD::ADC, 0),
+ X86_INTRINSIC_DATA(addcarryx_u32, ADX, X86ISD::ADC, 0),
+ X86_INTRINSIC_DATA(addcarryx_u64, ADX, X86ISD::ADC, 0),
+
+ X86_INTRINSIC_DATA(avx512_gather3div2_df, GATHER, X86::VGATHERQPDZ128rm, 0),
+ X86_INTRINSIC_DATA(avx512_gather3div2_di, GATHER, X86::VPGATHERQQZ128rm, 0),
+ X86_INTRINSIC_DATA(avx512_gather3div4_df, GATHER, X86::VGATHERQPDZ256rm, 0),
+ X86_INTRINSIC_DATA(avx512_gather3div4_di, GATHER, X86::VPGATHERQQZ256rm, 0),
+ X86_INTRINSIC_DATA(avx512_gather3div4_sf, GATHER, X86::VGATHERQPSZ128rm, 0),
+ X86_INTRINSIC_DATA(avx512_gather3div4_si, GATHER, X86::VPGATHERQDZ128rm, 0),
+ X86_INTRINSIC_DATA(avx512_gather3div8_sf, GATHER, X86::VGATHERQPSZ256rm, 0),
+ X86_INTRINSIC_DATA(avx512_gather3div8_si, GATHER, X86::VPGATHERQDZ256rm, 0),
+ X86_INTRINSIC_DATA(avx512_gather3siv2_df, GATHER, X86::VGATHERDPDZ128rm, 0),
+ X86_INTRINSIC_DATA(avx512_gather3siv2_di, GATHER, X86::VPGATHERDQZ128rm, 0),
+ X86_INTRINSIC_DATA(avx512_gather3siv4_df, GATHER, X86::VGATHERDPDZ256rm, 0),
+ X86_INTRINSIC_DATA(avx512_gather3siv4_di, GATHER, X86::VPGATHERDQZ256rm, 0),
+ X86_INTRINSIC_DATA(avx512_gather3siv4_sf, GATHER, X86::VGATHERDPSZ128rm, 0),
+ X86_INTRINSIC_DATA(avx512_gather3siv4_si, GATHER, X86::VPGATHERDDZ128rm, 0),
+ X86_INTRINSIC_DATA(avx512_gather3siv8_sf, GATHER, X86::VGATHERDPSZ256rm, 0),
+ X86_INTRINSIC_DATA(avx512_gather3siv8_si, GATHER, X86::VPGATHERDDZ256rm, 0),
+ X86_INTRINSIC_DATA(avx512_gather_dpd_512, GATHER, X86::VGATHERDPDZrm, 0),
+ X86_INTRINSIC_DATA(avx512_gather_dpi_512, GATHER, X86::VPGATHERDDZrm, 0),
+ X86_INTRINSIC_DATA(avx512_gather_dpq_512, GATHER, X86::VPGATHERDQZrm, 0),
+ X86_INTRINSIC_DATA(avx512_gather_dps_512, GATHER, X86::VGATHERDPSZrm, 0),
+ X86_INTRINSIC_DATA(avx512_gather_qpd_512, GATHER, X86::VGATHERQPDZrm, 0),
+ X86_INTRINSIC_DATA(avx512_gather_qpi_512, GATHER, X86::VPGATHERQDZrm, 0),
+ X86_INTRINSIC_DATA(avx512_gather_qpq_512, GATHER, X86::VPGATHERQQZrm, 0),
+ X86_INTRINSIC_DATA(avx512_gather_qps_512, GATHER, X86::VGATHERQPSZrm, 0),
+
+ X86_INTRINSIC_DATA(avx512_gatherpf_dpd_512, PREFETCH,
+ X86::VGATHERPF0DPDm, X86::VGATHERPF1DPDm),
+ X86_INTRINSIC_DATA(avx512_gatherpf_dps_512, PREFETCH,
+ X86::VGATHERPF0DPSm, X86::VGATHERPF1DPSm),
+ X86_INTRINSIC_DATA(avx512_gatherpf_qpd_512, PREFETCH,
+ X86::VGATHERPF0QPDm, X86::VGATHERPF1QPDm),
+ X86_INTRINSIC_DATA(avx512_gatherpf_qps_512, PREFETCH,
+ X86::VGATHERPF0QPSm, X86::VGATHERPF1QPSm),
+
+ X86_INTRINSIC_DATA(avx512_mask_compress_store_d_128,
+ COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_compress_store_d_256,
+ COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_compress_store_d_512,
+ COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_compress_store_pd_128,
+ COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_compress_store_pd_256,
+ COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_compress_store_pd_512,
+ COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_compress_store_ps_128,
+ COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_compress_store_ps_256,
+ COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_compress_store_ps_512,
+ COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_compress_store_q_128,
+ COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_compress_store_q_256,
+ COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_compress_store_q_512,
+ COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_expand_load_d_128,
+ EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_expand_load_d_256,
+ EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_expand_load_d_512,
+ EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_expand_load_pd_128,
+ EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_expand_load_pd_256,
+ EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_expand_load_pd_512,
+ EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_expand_load_ps_128,
+ EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_expand_load_ps_256,
+ EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_expand_load_ps_512,
+ EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_expand_load_q_128,
+ EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_expand_load_q_256,
+ EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_expand_load_q_512,
+ EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_128, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_256, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_512, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_dw_mem_128, TRUNCATE_TO_MEM_VI16,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_dw_mem_256, TRUNCATE_TO_MEM_VI16,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_dw_mem_512, TRUNCATE_TO_MEM_VI16,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_qb_mem_128, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_qb_mem_256, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_qb_mem_512, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_qd_mem_128, TRUNCATE_TO_MEM_VI32,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_qd_mem_256, TRUNCATE_TO_MEM_VI32,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_qd_mem_512, TRUNCATE_TO_MEM_VI32,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_qw_mem_128, TRUNCATE_TO_MEM_VI16,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_qw_mem_256, TRUNCATE_TO_MEM_VI16,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_qw_mem_512, TRUNCATE_TO_MEM_VI16,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_wb_mem_128, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_wb_mem_256, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_wb_mem_512, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_scatter_dpd_512, SCATTER, X86::VSCATTERDPDZmr, 0),
+ X86_INTRINSIC_DATA(avx512_scatter_dpi_512, SCATTER, X86::VPSCATTERDDZmr, 0),
+ X86_INTRINSIC_DATA(avx512_scatter_dpq_512, SCATTER, X86::VPSCATTERDQZmr, 0),
+ X86_INTRINSIC_DATA(avx512_scatter_dps_512, SCATTER, X86::VSCATTERDPSZmr, 0),
+ X86_INTRINSIC_DATA(avx512_scatter_qpd_512, SCATTER, X86::VSCATTERQPDZmr, 0),
+ X86_INTRINSIC_DATA(avx512_scatter_qpi_512, SCATTER, X86::VPSCATTERQDZmr, 0),
+ X86_INTRINSIC_DATA(avx512_scatter_qpq_512, SCATTER, X86::VPSCATTERQQZmr, 0),
+ X86_INTRINSIC_DATA(avx512_scatter_qps_512, SCATTER, X86::VSCATTERQPSZmr, 0),
+ X86_INTRINSIC_DATA(avx512_scatterdiv2_df, SCATTER, X86::VSCATTERQPDZ128mr, 0),
+ X86_INTRINSIC_DATA(avx512_scatterdiv2_di, SCATTER, X86::VPSCATTERQQZ128mr, 0),
+ X86_INTRINSIC_DATA(avx512_scatterdiv4_df, SCATTER, X86::VSCATTERQPDZ256mr, 0),
+ X86_INTRINSIC_DATA(avx512_scatterdiv4_di, SCATTER, X86::VPSCATTERQQZ256mr, 0),
+ X86_INTRINSIC_DATA(avx512_scatterdiv4_sf, SCATTER, X86::VSCATTERQPSZ128mr, 0),
+ X86_INTRINSIC_DATA(avx512_scatterdiv4_si, SCATTER, X86::VPSCATTERQDZ128mr, 0),
+ X86_INTRINSIC_DATA(avx512_scatterdiv8_sf, SCATTER, X86::VSCATTERQPSZ256mr, 0),
+ X86_INTRINSIC_DATA(avx512_scatterdiv8_si, SCATTER, X86::VPSCATTERQDZ256mr, 0),
+ X86_INTRINSIC_DATA(avx512_scatterpf_dpd_512, PREFETCH, X86::VSCATTERPF0DPDm,
+ X86::VSCATTERPF1DPDm),
+ X86_INTRINSIC_DATA(avx512_scatterpf_dps_512, PREFETCH, X86::VSCATTERPF0DPSm,
+ X86::VSCATTERPF1DPSm),
+ X86_INTRINSIC_DATA(avx512_scatterpf_qpd_512, PREFETCH, X86::VSCATTERPF0QPDm,
+ X86::VSCATTERPF1QPDm),
+ X86_INTRINSIC_DATA(avx512_scatterpf_qps_512, PREFETCH, X86::VSCATTERPF0QPSm,
+ X86::VSCATTERPF1QPSm),
+ X86_INTRINSIC_DATA(avx512_scattersiv2_df, SCATTER, X86::VSCATTERDPDZ128mr, 0),
+ X86_INTRINSIC_DATA(avx512_scattersiv2_di, SCATTER, X86::VPSCATTERDQZ128mr, 0),
+ X86_INTRINSIC_DATA(avx512_scattersiv4_df, SCATTER, X86::VSCATTERDPDZ256mr, 0),
+ X86_INTRINSIC_DATA(avx512_scattersiv4_di, SCATTER, X86::VPSCATTERDQZ256mr, 0),
+ X86_INTRINSIC_DATA(avx512_scattersiv4_sf, SCATTER, X86::VSCATTERDPSZ128mr, 0),
+ X86_INTRINSIC_DATA(avx512_scattersiv4_si, SCATTER, X86::VPSCATTERDDZ128mr, 0),
+ X86_INTRINSIC_DATA(avx512_scattersiv8_sf, SCATTER, X86::VSCATTERDPSZ256mr, 0),
+ X86_INTRINSIC_DATA(avx512_scattersiv8_si, SCATTER, X86::VPSCATTERDDZ256mr, 0),
+
+ X86_INTRINSIC_DATA(rdpmc, RDPMC, X86ISD::RDPMC_DAG, 0),
+ X86_INTRINSIC_DATA(rdrand_16, RDRAND, X86ISD::RDRAND, 0),
+ X86_INTRINSIC_DATA(rdrand_32, RDRAND, X86ISD::RDRAND, 0),
+ X86_INTRINSIC_DATA(rdrand_64, RDRAND, X86ISD::RDRAND, 0),
+ X86_INTRINSIC_DATA(rdseed_16, RDSEED, X86ISD::RDSEED, 0),
+ X86_INTRINSIC_DATA(rdseed_32, RDSEED, X86ISD::RDSEED, 0),
+ X86_INTRINSIC_DATA(rdseed_64, RDSEED, X86ISD::RDSEED, 0),
+ X86_INTRINSIC_DATA(rdtsc, RDTSC, X86ISD::RDTSC_DAG, 0),
+ X86_INTRINSIC_DATA(rdtscp, RDTSC, X86ISD::RDTSCP_DAG, 0),
+
+ X86_INTRINSIC_DATA(subborrow_u32, ADX, X86ISD::SBB, 0),
+ X86_INTRINSIC_DATA(subborrow_u64, ADX, X86ISD::SBB, 0),
+ X86_INTRINSIC_DATA(xtest, XTEST, X86ISD::XTEST, 0),
+};
+
+/*
+ * Find the intrinsic data for the given intrinsic ID; returns nullptr if the
+ * ID is not present in the table.
+ */
+static const IntrinsicData* getIntrinsicWithChain(unsigned IntNo) {
+  IntrinsicData IntrinsicToFind = { IntNo, INTR_NO_TYPE, 0, 0 };
+ const IntrinsicData *Data = std::lower_bound(std::begin(IntrinsicsWithChain),
+ std::end(IntrinsicsWithChain),
+ IntrinsicToFind);
+ if (Data != std::end(IntrinsicsWithChain) && *Data == IntrinsicToFind)
+ return Data;
+ return nullptr;
+}
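+
+// A minimal usage sketch (hypothetical caller, not part of this header):
+//   if (const IntrinsicData *Data = getIntrinsicWithChain(IntNo)) {
+//     switch (Data->Type) {
+//     case GATHER: /* lower the node using Data->Opc0 */ break;
+//     // ...
+//     }
+//   }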
+
+/*
+ * IntrinsicsWithoutChain - intrinsics without side effects (no chain). The
+ * table must be kept sorted by Intrinsic ID, which corresponds to the
+ * alphabetical order of the intrinsic names, so that it can be binary-searched.
+ */
+static const IntrinsicData IntrinsicsWithoutChain[] = {
+ X86_INTRINSIC_DATA(avx2_packssdw, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
+ X86_INTRINSIC_DATA(avx2_packsswb, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
+ X86_INTRINSIC_DATA(avx2_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
+ X86_INTRINSIC_DATA(avx2_packuswb, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
+ X86_INTRINSIC_DATA(avx2_pavg_b, INTR_TYPE_2OP, X86ISD::AVG, 0),
+ X86_INTRINSIC_DATA(avx2_pavg_w, INTR_TYPE_2OP, X86ISD::AVG, 0),
+ X86_INTRINSIC_DATA(avx2_phadd_d, INTR_TYPE_2OP, X86ISD::HADD, 0),
+ X86_INTRINSIC_DATA(avx2_phadd_w, INTR_TYPE_2OP, X86ISD::HADD, 0),
+ X86_INTRINSIC_DATA(avx2_phsub_d, INTR_TYPE_2OP, X86ISD::HSUB, 0),
+ X86_INTRINSIC_DATA(avx2_phsub_w, INTR_TYPE_2OP, X86ISD::HSUB, 0),
+ X86_INTRINSIC_DATA(avx2_pmaxs_b, INTR_TYPE_2OP, ISD::SMAX, 0),
+ X86_INTRINSIC_DATA(avx2_pmaxs_d, INTR_TYPE_2OP, ISD::SMAX, 0),
+ X86_INTRINSIC_DATA(avx2_pmaxs_w, INTR_TYPE_2OP, ISD::SMAX, 0),
+ X86_INTRINSIC_DATA(avx2_pmaxu_b, INTR_TYPE_2OP, ISD::UMAX, 0),
+ X86_INTRINSIC_DATA(avx2_pmaxu_d, INTR_TYPE_2OP, ISD::UMAX, 0),
+ X86_INTRINSIC_DATA(avx2_pmaxu_w, INTR_TYPE_2OP, ISD::UMAX, 0),
+ X86_INTRINSIC_DATA(avx2_pmins_b, INTR_TYPE_2OP, ISD::SMIN, 0),
+ X86_INTRINSIC_DATA(avx2_pmins_d, INTR_TYPE_2OP, ISD::SMIN, 0),
+ X86_INTRINSIC_DATA(avx2_pmins_w, INTR_TYPE_2OP, ISD::SMIN, 0),
+ X86_INTRINSIC_DATA(avx2_pminu_b, INTR_TYPE_2OP, ISD::UMIN, 0),
+ X86_INTRINSIC_DATA(avx2_pminu_d, INTR_TYPE_2OP, ISD::UMIN, 0),
+ X86_INTRINSIC_DATA(avx2_pminu_w, INTR_TYPE_2OP, ISD::UMIN, 0),
+ X86_INTRINSIC_DATA(avx2_pmovsxbd, INTR_TYPE_1OP, X86ISD::VSEXT, 0),
+ X86_INTRINSIC_DATA(avx2_pmovsxbq, INTR_TYPE_1OP, X86ISD::VSEXT, 0),
+ X86_INTRINSIC_DATA(avx2_pmovsxbw, INTR_TYPE_1OP, X86ISD::VSEXT, 0),
+ X86_INTRINSIC_DATA(avx2_pmovsxdq, INTR_TYPE_1OP, X86ISD::VSEXT, 0),
+ X86_INTRINSIC_DATA(avx2_pmovsxwd, INTR_TYPE_1OP, X86ISD::VSEXT, 0),
+ X86_INTRINSIC_DATA(avx2_pmovsxwq, INTR_TYPE_1OP, X86ISD::VSEXT, 0),
+ X86_INTRINSIC_DATA(avx2_pmovzxbd, INTR_TYPE_1OP, X86ISD::VZEXT, 0),
+ X86_INTRINSIC_DATA(avx2_pmovzxbq, INTR_TYPE_1OP, X86ISD::VZEXT, 0),
+ X86_INTRINSIC_DATA(avx2_pmovzxbw, INTR_TYPE_1OP, X86ISD::VZEXT, 0),
+ X86_INTRINSIC_DATA(avx2_pmovzxdq, INTR_TYPE_1OP, X86ISD::VZEXT, 0),
+ X86_INTRINSIC_DATA(avx2_pmovzxwd, INTR_TYPE_1OP, X86ISD::VZEXT, 0),
+ X86_INTRINSIC_DATA(avx2_pmovzxwq, INTR_TYPE_1OP, X86ISD::VZEXT, 0),
+ X86_INTRINSIC_DATA(avx2_pmul_dq, INTR_TYPE_2OP, X86ISD::PMULDQ, 0),
+ X86_INTRINSIC_DATA(avx2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0),
+ X86_INTRINSIC_DATA(avx2_pmulhu_w, INTR_TYPE_2OP, ISD::MULHU, 0),
+ X86_INTRINSIC_DATA(avx2_pmulu_dq, INTR_TYPE_2OP, X86ISD::PMULUDQ, 0),
+ X86_INTRINSIC_DATA(avx2_psad_bw, INTR_TYPE_2OP, X86ISD::PSADBW, 0),
+ X86_INTRINSIC_DATA(avx2_pshuf_b, INTR_TYPE_2OP, X86ISD::PSHUFB, 0),
+ X86_INTRINSIC_DATA(avx2_psign_b, INTR_TYPE_2OP, X86ISD::PSIGN, 0),
+ X86_INTRINSIC_DATA(avx2_psign_d, INTR_TYPE_2OP, X86ISD::PSIGN, 0),
+ X86_INTRINSIC_DATA(avx2_psign_w, INTR_TYPE_2OP, X86ISD::PSIGN, 0),
+ X86_INTRINSIC_DATA(avx2_psll_d, INTR_TYPE_2OP, X86ISD::VSHL, 0),
+ X86_INTRINSIC_DATA(avx2_psll_q, INTR_TYPE_2OP, X86ISD::VSHL, 0),
+ X86_INTRINSIC_DATA(avx2_psll_w, INTR_TYPE_2OP, X86ISD::VSHL, 0),
+ X86_INTRINSIC_DATA(avx2_pslli_d, VSHIFT, X86ISD::VSHLI, 0),
+ X86_INTRINSIC_DATA(avx2_pslli_q, VSHIFT, X86ISD::VSHLI, 0),
+ X86_INTRINSIC_DATA(avx2_pslli_w, VSHIFT, X86ISD::VSHLI, 0),
+ X86_INTRINSIC_DATA(avx2_psllv_d, INTR_TYPE_2OP, ISD::SHL, 0),
+ X86_INTRINSIC_DATA(avx2_psllv_d_256, INTR_TYPE_2OP, ISD::SHL, 0),
+ X86_INTRINSIC_DATA(avx2_psllv_q, INTR_TYPE_2OP, ISD::SHL, 0),
+ X86_INTRINSIC_DATA(avx2_psllv_q_256, INTR_TYPE_2OP, ISD::SHL, 0),
+ X86_INTRINSIC_DATA(avx2_psra_d, INTR_TYPE_2OP, X86ISD::VSRA, 0),
+ X86_INTRINSIC_DATA(avx2_psra_w, INTR_TYPE_2OP, X86ISD::VSRA, 0),
+ X86_INTRINSIC_DATA(avx2_psrai_d, VSHIFT, X86ISD::VSRAI, 0),
+ X86_INTRINSIC_DATA(avx2_psrai_w, VSHIFT, X86ISD::VSRAI, 0),
+ X86_INTRINSIC_DATA(avx2_psrav_d, INTR_TYPE_2OP, ISD::SRA, 0),
+ X86_INTRINSIC_DATA(avx2_psrav_d_256, INTR_TYPE_2OP, ISD::SRA, 0),
+ X86_INTRINSIC_DATA(avx2_psrl_d, INTR_TYPE_2OP, X86ISD::VSRL, 0),
+ X86_INTRINSIC_DATA(avx2_psrl_q, INTR_TYPE_2OP, X86ISD::VSRL, 0),
+ X86_INTRINSIC_DATA(avx2_psrl_w, INTR_TYPE_2OP, X86ISD::VSRL, 0),
+ X86_INTRINSIC_DATA(avx2_psrli_d, VSHIFT, X86ISD::VSRLI, 0),
+ X86_INTRINSIC_DATA(avx2_psrli_q, VSHIFT, X86ISD::VSRLI, 0),
+ X86_INTRINSIC_DATA(avx2_psrli_w, VSHIFT, X86ISD::VSRLI, 0),
+ X86_INTRINSIC_DATA(avx2_psrlv_d, INTR_TYPE_2OP, ISD::SRL, 0),
+ X86_INTRINSIC_DATA(avx2_psrlv_d_256, INTR_TYPE_2OP, ISD::SRL, 0),
+ X86_INTRINSIC_DATA(avx2_psrlv_q, INTR_TYPE_2OP, ISD::SRL, 0),
+ X86_INTRINSIC_DATA(avx2_psrlv_q_256, INTR_TYPE_2OP, ISD::SRL, 0),
+ X86_INTRINSIC_DATA(avx2_psubus_b, INTR_TYPE_2OP, X86ISD::SUBUS, 0),
+ X86_INTRINSIC_DATA(avx2_psubus_w, INTR_TYPE_2OP, X86ISD::SUBUS, 0),
+ X86_INTRINSIC_DATA(avx2_vperm2i128, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0),
+ X86_INTRINSIC_DATA(avx512_broadcastmb_128, BROADCASTM, X86ISD::VBROADCASTM, 0),
+ X86_INTRINSIC_DATA(avx512_broadcastmb_256, BROADCASTM, X86ISD::VBROADCASTM, 0),
+ X86_INTRINSIC_DATA(avx512_broadcastmb_512, BROADCASTM, X86ISD::VBROADCASTM, 0),
+ X86_INTRINSIC_DATA(avx512_broadcastmw_128, BROADCASTM, X86ISD::VBROADCASTM, 0),
+ X86_INTRINSIC_DATA(avx512_broadcastmw_256, BROADCASTM, X86ISD::VBROADCASTM, 0),
+ X86_INTRINSIC_DATA(avx512_broadcastmw_512, BROADCASTM, X86ISD::VBROADCASTM, 0),
+ X86_INTRINSIC_DATA(avx512_cvtb2mask_128, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
+ X86_INTRINSIC_DATA(avx512_cvtb2mask_256, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
+ X86_INTRINSIC_DATA(avx512_cvtb2mask_512, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
+ X86_INTRINSIC_DATA(avx512_cvtd2mask_128, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
+ X86_INTRINSIC_DATA(avx512_cvtd2mask_256, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
+ X86_INTRINSIC_DATA(avx512_cvtd2mask_512, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
+ X86_INTRINSIC_DATA(avx512_cvtmask2b_128, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
+ X86_INTRINSIC_DATA(avx512_cvtmask2b_256, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
+ X86_INTRINSIC_DATA(avx512_cvtmask2b_512, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
+ X86_INTRINSIC_DATA(avx512_cvtmask2d_128, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
+ X86_INTRINSIC_DATA(avx512_cvtmask2d_256, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
+ X86_INTRINSIC_DATA(avx512_cvtmask2d_512, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
+ X86_INTRINSIC_DATA(avx512_cvtmask2q_128, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
+ X86_INTRINSIC_DATA(avx512_cvtmask2q_256, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
+ X86_INTRINSIC_DATA(avx512_cvtmask2q_512, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
+ X86_INTRINSIC_DATA(avx512_cvtmask2w_128, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
+ X86_INTRINSIC_DATA(avx512_cvtmask2w_256, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
+ X86_INTRINSIC_DATA(avx512_cvtmask2w_512, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
+ X86_INTRINSIC_DATA(avx512_cvtq2mask_128, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
+ X86_INTRINSIC_DATA(avx512_cvtq2mask_256, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
+ X86_INTRINSIC_DATA(avx512_cvtq2mask_512, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
+ X86_INTRINSIC_DATA(avx512_cvtsi2sd32, INTR_TYPE_3OP, X86ISD::SINT_TO_FP_RND, 0),
+ X86_INTRINSIC_DATA(avx512_cvtsi2sd64, INTR_TYPE_3OP, X86ISD::SINT_TO_FP_RND, 0),
+ X86_INTRINSIC_DATA(avx512_cvtsi2ss32, INTR_TYPE_3OP, X86ISD::SINT_TO_FP_RND, 0),
+ X86_INTRINSIC_DATA(avx512_cvtsi2ss64, INTR_TYPE_3OP, X86ISD::SINT_TO_FP_RND, 0),
+ X86_INTRINSIC_DATA(avx512_cvttsd2si, INTR_TYPE_2OP, X86ISD::FP_TO_SINT_RND, 0),
+ X86_INTRINSIC_DATA(avx512_cvttsd2si64, INTR_TYPE_2OP, X86ISD::FP_TO_SINT_RND, 0),
+ X86_INTRINSIC_DATA(avx512_cvttsd2usi, INTR_TYPE_2OP, X86ISD::FP_TO_UINT_RND, 0),
+ X86_INTRINSIC_DATA(avx512_cvttsd2usi64, INTR_TYPE_2OP, X86ISD::FP_TO_UINT_RND, 0),
+ X86_INTRINSIC_DATA(avx512_cvttss2si, INTR_TYPE_2OP, X86ISD::FP_TO_SINT_RND, 0),
+ X86_INTRINSIC_DATA(avx512_cvttss2si64, INTR_TYPE_2OP, X86ISD::FP_TO_SINT_RND, 0),
+ X86_INTRINSIC_DATA(avx512_cvttss2usi, INTR_TYPE_2OP, X86ISD::FP_TO_UINT_RND, 0),
+ X86_INTRINSIC_DATA(avx512_cvttss2usi64, INTR_TYPE_2OP, X86ISD::FP_TO_UINT_RND, 0),
+ X86_INTRINSIC_DATA(avx512_cvtusi2ss, INTR_TYPE_3OP, X86ISD::UINT_TO_FP_RND, 0),
+ X86_INTRINSIC_DATA(avx512_cvtusi642sd, INTR_TYPE_3OP, X86ISD::UINT_TO_FP_RND, 0),
+ X86_INTRINSIC_DATA(avx512_cvtusi642ss, INTR_TYPE_3OP, X86ISD::UINT_TO_FP_RND, 0),
+ X86_INTRINSIC_DATA(avx512_cvtw2mask_128, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
+ X86_INTRINSIC_DATA(avx512_cvtw2mask_256, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
+ X86_INTRINSIC_DATA(avx512_cvtw2mask_512, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
+ X86_INTRINSIC_DATA(avx512_exp2_pd, INTR_TYPE_1OP_MASK_RM, X86ISD::EXP2, 0),
+ X86_INTRINSIC_DATA(avx512_exp2_ps, INTR_TYPE_1OP_MASK_RM, X86ISD::EXP2, 0),
+ X86_INTRINSIC_DATA(avx512_kunpck_bw, KUNPCK, ISD::CONCAT_VECTORS, 0),
+ X86_INTRINSIC_DATA(avx512_kunpck_dq, KUNPCK, ISD::CONCAT_VECTORS, 0),
+ X86_INTRINSIC_DATA(avx512_kunpck_wd, KUNPCK, ISD::CONCAT_VECTORS, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmadd_pd_128, FMA_OP_MASK3, X86ISD::FMADD, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmadd_pd_256, FMA_OP_MASK3, X86ISD::FMADD, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmadd_pd_512, FMA_OP_MASK3, X86ISD::FMADD,
+ X86ISD::FMADD_RND),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ps_128, FMA_OP_MASK3, X86ISD::FMADD, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ps_256, FMA_OP_MASK3, X86ISD::FMADD, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ps_512, FMA_OP_MASK3, X86ISD::FMADD,
+ X86ISD::FMADD_RND),
+
+ X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_pd_128, FMA_OP_MASK3, X86ISD::FMADDSUB, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_pd_256, FMA_OP_MASK3, X86ISD::FMADDSUB, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_pd_512, FMA_OP_MASK3, X86ISD::FMADDSUB,
+ X86ISD::FMADDSUB_RND),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_ps_128, FMA_OP_MASK3, X86ISD::FMADDSUB, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_ps_256, FMA_OP_MASK3, X86ISD::FMADDSUB, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_ps_512, FMA_OP_MASK3, X86ISD::FMADDSUB,
+ X86ISD::FMADDSUB_RND),
+
+ X86_INTRINSIC_DATA(avx512_mask3_vfmsub_pd_128, FMA_OP_MASK3, X86ISD::FMSUB, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmsub_pd_256, FMA_OP_MASK3, X86ISD::FMSUB, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmsub_pd_512, FMA_OP_MASK3, X86ISD::FMSUB,
+ X86ISD::FMSUB_RND),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ps_128, FMA_OP_MASK3, X86ISD::FMSUB, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ps_256, FMA_OP_MASK3, X86ISD::FMSUB, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ps_512, FMA_OP_MASK3, X86ISD::FMSUB,
+ X86ISD::FMSUB_RND),
+
+ X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_pd_128, FMA_OP_MASK3, X86ISD::FMSUBADD, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_pd_256, FMA_OP_MASK3, X86ISD::FMSUBADD, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_pd_512, FMA_OP_MASK3, X86ISD::FMSUBADD,
+ X86ISD::FMSUBADD_RND),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_ps_128, FMA_OP_MASK3, X86ISD::FMSUBADD, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_ps_256, FMA_OP_MASK3, X86ISD::FMSUBADD, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_ps_512, FMA_OP_MASK3, X86ISD::FMSUBADD,
+ X86ISD::FMSUBADD_RND),
+
+ X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_pd_128, FMA_OP_MASK3, X86ISD::FNMSUB, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_pd_256, FMA_OP_MASK3, X86ISD::FNMSUB, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_pd_512, FMA_OP_MASK3, X86ISD::FNMSUB,
+ X86ISD::FNMSUB_RND),
+ X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ps_128, FMA_OP_MASK3, X86ISD::FNMSUB, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ps_256, FMA_OP_MASK3, X86ISD::FNMSUB, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ps_512, FMA_OP_MASK3, X86ISD::FNMSUB,
+ X86ISD::FNMSUB_RND),
+
+ X86_INTRINSIC_DATA(avx512_mask_add_pd_128, INTR_TYPE_2OP_MASK, ISD::FADD, 0),
+ X86_INTRINSIC_DATA(avx512_mask_add_pd_256, INTR_TYPE_2OP_MASK, ISD::FADD, 0),
+ X86_INTRINSIC_DATA(avx512_mask_add_pd_512, INTR_TYPE_2OP_MASK, ISD::FADD,
+ X86ISD::FADD_RND),
+ X86_INTRINSIC_DATA(avx512_mask_add_ps_128, INTR_TYPE_2OP_MASK, ISD::FADD, 0),
+ X86_INTRINSIC_DATA(avx512_mask_add_ps_256, INTR_TYPE_2OP_MASK, ISD::FADD, 0),
+ X86_INTRINSIC_DATA(avx512_mask_add_ps_512, INTR_TYPE_2OP_MASK, ISD::FADD,
+ X86ISD::FADD_RND),
+ X86_INTRINSIC_DATA(avx512_mask_add_sd_round, INTR_TYPE_SCALAR_MASK_RM, ISD::FADD,
+ X86ISD::FADD_RND),
+ X86_INTRINSIC_DATA(avx512_mask_add_ss_round, INTR_TYPE_SCALAR_MASK_RM, ISD::FADD,
+ X86ISD::FADD_RND),
+ X86_INTRINSIC_DATA(avx512_mask_and_pd_128, INTR_TYPE_2OP_MASK, X86ISD::FAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_and_pd_256, INTR_TYPE_2OP_MASK, X86ISD::FAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_and_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_and_ps_128, INTR_TYPE_2OP_MASK, X86ISD::FAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_and_ps_256, INTR_TYPE_2OP_MASK, X86ISD::FAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_and_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_andn_pd_128, INTR_TYPE_2OP_MASK, X86ISD::FANDN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_andn_pd_256, INTR_TYPE_2OP_MASK, X86ISD::FANDN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_andn_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FANDN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_andn_ps_128, INTR_TYPE_2OP_MASK, X86ISD::FANDN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_andn_ps_256, INTR_TYPE_2OP_MASK, X86ISD::FANDN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_andn_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FANDN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_blend_b_128, BLEND, X86ISD::SELECT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_blend_b_256, BLEND, X86ISD::SELECT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_blend_b_512, BLEND, X86ISD::SELECT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_blend_d_128, BLEND, X86ISD::SELECT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_blend_d_256, BLEND, X86ISD::SELECT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_blend_d_512, BLEND, X86ISD::SELECT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_blend_pd_128, BLEND, X86ISD::SELECT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_blend_pd_256, BLEND, X86ISD::SELECT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_blend_pd_512, BLEND, X86ISD::SELECT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_blend_ps_128, BLEND, X86ISD::SELECT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_blend_ps_256, BLEND, X86ISD::SELECT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_blend_ps_512, BLEND, X86ISD::SELECT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_blend_q_128, BLEND, X86ISD::SELECT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_blend_q_256, BLEND, X86ISD::SELECT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_blend_q_512, BLEND, X86ISD::SELECT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_blend_w_128, BLEND, X86ISD::SELECT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_blend_w_256, BLEND, X86ISD::SELECT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_blend_w_512, BLEND, X86ISD::SELECT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_broadcast_sd_pd_256, INTR_TYPE_1OP_MASK,
+ X86ISD::VBROADCAST, 0),
+ X86_INTRINSIC_DATA(avx512_mask_broadcast_sd_pd_512, INTR_TYPE_1OP_MASK,
+ X86ISD::VBROADCAST, 0),
+ X86_INTRINSIC_DATA(avx512_mask_broadcast_ss_ps_128, INTR_TYPE_1OP_MASK,
+ X86ISD::VBROADCAST, 0),
+ X86_INTRINSIC_DATA(avx512_mask_broadcast_ss_ps_256, INTR_TYPE_1OP_MASK,
+ X86ISD::VBROADCAST, 0),
+ X86_INTRINSIC_DATA(avx512_mask_broadcast_ss_ps_512, INTR_TYPE_1OP_MASK,
+ X86ISD::VBROADCAST, 0),
+ X86_INTRINSIC_DATA(avx512_mask_broadcastf32x2_256, INTR_TYPE_1OP_MASK,
+ X86ISD::SUBV_BROADCAST, 0),
+ X86_INTRINSIC_DATA(avx512_mask_broadcastf32x2_512, INTR_TYPE_1OP_MASK,
+ X86ISD::SUBV_BROADCAST, 0),
+ X86_INTRINSIC_DATA(avx512_mask_broadcastf32x4_256, BRCST_SUBVEC_TO_VEC,
+ X86ISD::SHUF128, 0),
+ X86_INTRINSIC_DATA(avx512_mask_broadcastf32x4_512, BRCST_SUBVEC_TO_VEC,
+ X86ISD::SHUF128, 0),
+ X86_INTRINSIC_DATA(avx512_mask_broadcastf32x8_512, BRCST_SUBVEC_TO_VEC,
+ X86ISD::SHUF128, 0),
+ X86_INTRINSIC_DATA(avx512_mask_broadcastf64x2_256, BRCST_SUBVEC_TO_VEC,
+ X86ISD::SHUF128, 0),
+ X86_INTRINSIC_DATA(avx512_mask_broadcastf64x2_512, BRCST_SUBVEC_TO_VEC,
+ X86ISD::SHUF128, 0),
+ X86_INTRINSIC_DATA(avx512_mask_broadcastf64x4_512, BRCST_SUBVEC_TO_VEC,
+ X86ISD::SHUF128, 0),
+ X86_INTRINSIC_DATA(avx512_mask_broadcasti32x2_128, INTR_TYPE_1OP_MASK,
+ X86ISD::SUBV_BROADCAST, 0),
+ X86_INTRINSIC_DATA(avx512_mask_broadcasti32x2_256, INTR_TYPE_1OP_MASK,
+ X86ISD::SUBV_BROADCAST, 0),
+ X86_INTRINSIC_DATA(avx512_mask_broadcasti32x2_512, INTR_TYPE_1OP_MASK,
+ X86ISD::SUBV_BROADCAST, 0),
+ X86_INTRINSIC_DATA(avx512_mask_broadcasti32x4_256, BRCST_SUBVEC_TO_VEC,
+ X86ISD::SHUF128, 0),
+ X86_INTRINSIC_DATA(avx512_mask_broadcasti32x4_512, BRCST_SUBVEC_TO_VEC,
+ X86ISD::SHUF128, 0),
+ X86_INTRINSIC_DATA(avx512_mask_broadcasti32x8_512, BRCST_SUBVEC_TO_VEC,
+ X86ISD::SHUF128, 0),
+ X86_INTRINSIC_DATA(avx512_mask_broadcasti64x2_256, BRCST_SUBVEC_TO_VEC,
+ X86ISD::SHUF128, 0),
+ X86_INTRINSIC_DATA(avx512_mask_broadcasti64x2_512, BRCST_SUBVEC_TO_VEC,
+ X86ISD::SHUF128, 0),
+ X86_INTRINSIC_DATA(avx512_mask_broadcasti64x4_512, BRCST_SUBVEC_TO_VEC,
+ X86ISD::SHUF128, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cmp_b_128, CMP_MASK_CC, X86ISD::CMPM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cmp_b_256, CMP_MASK_CC, X86ISD::CMPM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cmp_b_512, CMP_MASK_CC, X86ISD::CMPM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cmp_d_128, CMP_MASK_CC, X86ISD::CMPM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cmp_d_256, CMP_MASK_CC, X86ISD::CMPM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cmp_d_512, CMP_MASK_CC, X86ISD::CMPM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cmp_pd_128, CMP_MASK_CC, X86ISD::CMPM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cmp_pd_256, CMP_MASK_CC, X86ISD::CMPM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cmp_pd_512, CMP_MASK_CC, X86ISD::CMPM,
+ X86ISD::CMPM_RND),
+ X86_INTRINSIC_DATA(avx512_mask_cmp_ps_128, CMP_MASK_CC, X86ISD::CMPM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cmp_ps_256, CMP_MASK_CC, X86ISD::CMPM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cmp_ps_512, CMP_MASK_CC, X86ISD::CMPM,
+ X86ISD::CMPM_RND),
+ X86_INTRINSIC_DATA(avx512_mask_cmp_q_128, CMP_MASK_CC, X86ISD::CMPM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cmp_q_256, CMP_MASK_CC, X86ISD::CMPM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cmp_q_512, CMP_MASK_CC, X86ISD::CMPM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cmp_sd, CMP_MASK_SCALAR_CC, X86ISD::FSETCC,
+ X86ISD::FSETCC),
+ X86_INTRINSIC_DATA(avx512_mask_cmp_ss, CMP_MASK_SCALAR_CC, X86ISD::FSETCC,
+ X86ISD::FSETCC),
+ X86_INTRINSIC_DATA(avx512_mask_cmp_w_128, CMP_MASK_CC, X86ISD::CMPM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cmp_w_256, CMP_MASK_CC, X86ISD::CMPM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cmp_w_512, CMP_MASK_CC, X86ISD::CMPM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_compress_d_128, COMPRESS_EXPAND_IN_REG,
+ X86ISD::COMPRESS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_compress_d_256, COMPRESS_EXPAND_IN_REG,
+ X86ISD::COMPRESS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_compress_d_512, COMPRESS_EXPAND_IN_REG,
+ X86ISD::COMPRESS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_compress_pd_128, COMPRESS_EXPAND_IN_REG,
+ X86ISD::COMPRESS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_compress_pd_256, COMPRESS_EXPAND_IN_REG,
+ X86ISD::COMPRESS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_compress_pd_512, COMPRESS_EXPAND_IN_REG,
+ X86ISD::COMPRESS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_compress_ps_128, COMPRESS_EXPAND_IN_REG,
+ X86ISD::COMPRESS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_compress_ps_256, COMPRESS_EXPAND_IN_REG,
+ X86ISD::COMPRESS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_compress_ps_512, COMPRESS_EXPAND_IN_REG,
+ X86ISD::COMPRESS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_compress_q_128, COMPRESS_EXPAND_IN_REG,
+ X86ISD::COMPRESS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_compress_q_256, COMPRESS_EXPAND_IN_REG,
+ X86ISD::COMPRESS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_compress_q_512, COMPRESS_EXPAND_IN_REG,
+ X86ISD::COMPRESS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_conflict_d_128, INTR_TYPE_1OP_MASK,
+ X86ISD::CONFLICT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_conflict_d_256, INTR_TYPE_1OP_MASK,
+ X86ISD::CONFLICT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_conflict_d_512, INTR_TYPE_1OP_MASK,
+ X86ISD::CONFLICT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_conflict_q_128, INTR_TYPE_1OP_MASK,
+ X86ISD::CONFLICT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_conflict_q_256, INTR_TYPE_1OP_MASK,
+ X86ISD::CONFLICT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_conflict_q_512, INTR_TYPE_1OP_MASK,
+ X86ISD::CONFLICT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtdq2pd_128, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTDQ2PD, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtdq2pd_256, INTR_TYPE_1OP_MASK,
+ ISD::SINT_TO_FP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtdq2pd_512, INTR_TYPE_1OP_MASK,
+ ISD::SINT_TO_FP, 0), // no rm
+ X86_INTRINSIC_DATA(avx512_mask_cvtdq2ps_128, INTR_TYPE_1OP_MASK,
+ ISD::SINT_TO_FP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtdq2ps_256, INTR_TYPE_1OP_MASK,
+ ISD::SINT_TO_FP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtdq2ps_512, INTR_TYPE_1OP_MASK,
+ ISD::SINT_TO_FP, ISD::SINT_TO_FP), //er
+ X86_INTRINSIC_DATA(avx512_mask_cvtpd2dq_128, INTR_TYPE_1OP_MASK,
+ X86ISD::FP_TO_SINT_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtpd2dq_256, INTR_TYPE_1OP_MASK,
+ X86ISD::FP_TO_SINT_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtpd2dq_512, INTR_TYPE_1OP_MASK,
+ X86ISD::FP_TO_SINT_RND, X86ISD::FP_TO_SINT_RND),
+ X86_INTRINSIC_DATA(avx512_mask_cvtpd2ps, INTR_TYPE_1OP_MASK,
+ X86ISD::VFPROUND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtpd2ps_256, INTR_TYPE_1OP_MASK_RM,
+ ISD::FP_ROUND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtpd2ps_512, INTR_TYPE_1OP_MASK_RM,
+ ISD::FP_ROUND, X86ISD::VFPROUND),
+ X86_INTRINSIC_DATA(avx512_mask_cvtpd2qq_128, INTR_TYPE_1OP_MASK,
+ X86ISD::FP_TO_SINT_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtpd2qq_256, INTR_TYPE_1OP_MASK,
+ X86ISD::FP_TO_SINT_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtpd2qq_512, INTR_TYPE_1OP_MASK,
+ X86ISD::FP_TO_SINT_RND, X86ISD::FP_TO_SINT_RND),
+ X86_INTRINSIC_DATA(avx512_mask_cvtpd2udq_128, INTR_TYPE_1OP_MASK,
+ X86ISD::FP_TO_UINT_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtpd2udq_256, INTR_TYPE_1OP_MASK,
+ X86ISD::FP_TO_UINT_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtpd2udq_512, INTR_TYPE_1OP_MASK,
+ X86ISD::FP_TO_UINT_RND, X86ISD::FP_TO_UINT_RND),
+ X86_INTRINSIC_DATA(avx512_mask_cvtpd2uqq_128, INTR_TYPE_1OP_MASK,
+ X86ISD::FP_TO_UINT_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtpd2uqq_256, INTR_TYPE_1OP_MASK,
+ X86ISD::FP_TO_UINT_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtpd2uqq_512, INTR_TYPE_1OP_MASK,
+ X86ISD::FP_TO_UINT_RND, X86ISD::FP_TO_UINT_RND),
+ X86_INTRINSIC_DATA(avx512_mask_cvtps2dq_128, INTR_TYPE_1OP_MASK,
+ X86ISD::FP_TO_SINT_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtps2dq_256, INTR_TYPE_1OP_MASK,
+ X86ISD::FP_TO_SINT_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtps2dq_512, INTR_TYPE_1OP_MASK,
+ X86ISD::FP_TO_SINT_RND, X86ISD::FP_TO_SINT_RND),
+ X86_INTRINSIC_DATA(avx512_mask_cvtps2pd_128, INTR_TYPE_1OP_MASK,
+ X86ISD::VFPEXT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtps2pd_256, INTR_TYPE_1OP_MASK,
+ ISD::FP_EXTEND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtps2pd_512, INTR_TYPE_1OP_MASK,
+ ISD::FP_EXTEND, X86ISD::VFPEXT),
+ X86_INTRINSIC_DATA(avx512_mask_cvtps2qq_128, INTR_TYPE_1OP_MASK,
+ X86ISD::FP_TO_SINT_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtps2qq_256, INTR_TYPE_1OP_MASK,
+ X86ISD::FP_TO_SINT_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtps2qq_512, INTR_TYPE_1OP_MASK,
+ X86ISD::FP_TO_SINT_RND, X86ISD::FP_TO_SINT_RND),
+ X86_INTRINSIC_DATA(avx512_mask_cvtps2udq_128, INTR_TYPE_1OP_MASK,
+ X86ISD::FP_TO_UINT_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtps2udq_256, INTR_TYPE_1OP_MASK,
+ X86ISD::FP_TO_UINT_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtps2udq_512, INTR_TYPE_1OP_MASK,
+ X86ISD::FP_TO_UINT_RND, X86ISD::FP_TO_UINT_RND),
+ X86_INTRINSIC_DATA(avx512_mask_cvtps2uqq_128, INTR_TYPE_1OP_MASK,
+ X86ISD::FP_TO_UINT_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtps2uqq_256, INTR_TYPE_1OP_MASK,
+ X86ISD::FP_TO_UINT_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtps2uqq_512, INTR_TYPE_1OP_MASK,
+ X86ISD::FP_TO_UINT_RND, X86ISD::FP_TO_UINT_RND),
+ X86_INTRINSIC_DATA(avx512_mask_cvtqq2pd_128, INTR_TYPE_1OP_MASK,
+ ISD::SINT_TO_FP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtqq2pd_256, INTR_TYPE_1OP_MASK,
+ ISD::SINT_TO_FP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtqq2pd_512, INTR_TYPE_1OP_MASK,
+ ISD::SINT_TO_FP, ISD::SINT_TO_FP),
+ X86_INTRINSIC_DATA(avx512_mask_cvtqq2ps_128, INTR_TYPE_1OP_MASK,
+ ISD::SINT_TO_FP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtqq2ps_256, INTR_TYPE_1OP_MASK,
+ ISD::SINT_TO_FP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtqq2ps_512, INTR_TYPE_1OP_MASK,
+ ISD::SINT_TO_FP, ISD::SINT_TO_FP),
+ X86_INTRINSIC_DATA(avx512_mask_cvtsd2ss_round, INTR_TYPE_SCALAR_MASK_RM,
+ X86ISD::VFPROUND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtss2sd_round, INTR_TYPE_SCALAR_MASK_RM,
+ X86ISD::VFPEXT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvttpd2dq_128, INTR_TYPE_1OP_MASK,
+ ISD::FP_TO_SINT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvttpd2dq_256, INTR_TYPE_1OP_MASK,
+ ISD::FP_TO_SINT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvttpd2dq_512, INTR_TYPE_1OP_MASK,
+ ISD::FP_TO_SINT, ISD::FP_TO_SINT),
+ X86_INTRINSIC_DATA(avx512_mask_cvttpd2qq_128, INTR_TYPE_1OP_MASK,
+ ISD::FP_TO_SINT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvttpd2qq_256, INTR_TYPE_1OP_MASK,
+ ISD::FP_TO_SINT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvttpd2qq_512, INTR_TYPE_1OP_MASK,
+ ISD::FP_TO_SINT, ISD::FP_TO_SINT),
+ X86_INTRINSIC_DATA(avx512_mask_cvttpd2udq_128, INTR_TYPE_1OP_MASK,
+ ISD::FP_TO_UINT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvttpd2udq_256, INTR_TYPE_1OP_MASK,
+ ISD::FP_TO_UINT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvttpd2udq_512, INTR_TYPE_1OP_MASK,
+ ISD::FP_TO_UINT, ISD::FP_TO_UINT),
+ X86_INTRINSIC_DATA(avx512_mask_cvttpd2uqq_128, INTR_TYPE_1OP_MASK,
+ ISD::FP_TO_UINT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvttpd2uqq_256, INTR_TYPE_1OP_MASK,
+ ISD::FP_TO_UINT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvttpd2uqq_512, INTR_TYPE_1OP_MASK,
+ ISD::FP_TO_UINT, ISD::FP_TO_UINT),
+ X86_INTRINSIC_DATA(avx512_mask_cvttps2dq_128, INTR_TYPE_1OP_MASK,
+ ISD::FP_TO_SINT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvttps2dq_256, INTR_TYPE_1OP_MASK,
+ ISD::FP_TO_SINT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvttps2dq_512, INTR_TYPE_1OP_MASK,
+ ISD::FP_TO_SINT, ISD::FP_TO_SINT),
+ X86_INTRINSIC_DATA(avx512_mask_cvttps2qq_128, INTR_TYPE_1OP_MASK,
+ ISD::FP_TO_SINT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvttps2qq_256, INTR_TYPE_1OP_MASK,
+ ISD::FP_TO_SINT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvttps2qq_512, INTR_TYPE_1OP_MASK,
+ ISD::FP_TO_SINT, ISD::FP_TO_SINT),
+ X86_INTRINSIC_DATA(avx512_mask_cvttps2udq_128, INTR_TYPE_1OP_MASK,
+ ISD::FP_TO_UINT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvttps2udq_256, INTR_TYPE_1OP_MASK,
+ ISD::FP_TO_UINT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvttps2udq_512, INTR_TYPE_1OP_MASK,
+ ISD::FP_TO_UINT, ISD::FP_TO_UINT),
+ X86_INTRINSIC_DATA(avx512_mask_cvttps2uqq_128, INTR_TYPE_1OP_MASK,
+ ISD::FP_TO_UINT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvttps2uqq_256, INTR_TYPE_1OP_MASK,
+ ISD::FP_TO_UINT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvttps2uqq_512, INTR_TYPE_1OP_MASK,
+ ISD::FP_TO_UINT, ISD::FP_TO_UINT),
+ X86_INTRINSIC_DATA(avx512_mask_cvtudq2pd_128, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTUDQ2PD, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtudq2pd_256, INTR_TYPE_1OP_MASK,
+ ISD::UINT_TO_FP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtudq2pd_512, INTR_TYPE_1OP_MASK,
+ ISD::UINT_TO_FP, 0), // no rm
+ X86_INTRINSIC_DATA(avx512_mask_cvtudq2ps_128, INTR_TYPE_1OP_MASK,
+ ISD::UINT_TO_FP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtudq2ps_256, INTR_TYPE_1OP_MASK,
+ ISD::UINT_TO_FP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtudq2ps_512, INTR_TYPE_1OP_MASK,
+ ISD::UINT_TO_FP, ISD::UINT_TO_FP),
+ X86_INTRINSIC_DATA(avx512_mask_cvtuqq2pd_128, INTR_TYPE_1OP_MASK,
+ ISD::UINT_TO_FP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtuqq2pd_256, INTR_TYPE_1OP_MASK,
+ ISD::UINT_TO_FP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtuqq2pd_512, INTR_TYPE_1OP_MASK,
+ ISD::UINT_TO_FP, ISD::UINT_TO_FP),
+ X86_INTRINSIC_DATA(avx512_mask_cvtuqq2ps_128, INTR_TYPE_1OP_MASK,
+ ISD::UINT_TO_FP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtuqq2ps_256, INTR_TYPE_1OP_MASK,
+ ISD::UINT_TO_FP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtuqq2ps_512, INTR_TYPE_1OP_MASK,
+ ISD::UINT_TO_FP, ISD::UINT_TO_FP),
+ X86_INTRINSIC_DATA(avx512_mask_dbpsadbw_128, INTR_TYPE_3OP_IMM8_MASK,
+ X86ISD::DBPSADBW, 0),
+ X86_INTRINSIC_DATA(avx512_mask_dbpsadbw_256, INTR_TYPE_3OP_IMM8_MASK,
+ X86ISD::DBPSADBW, 0),
+ X86_INTRINSIC_DATA(avx512_mask_dbpsadbw_512, INTR_TYPE_3OP_IMM8_MASK,
+ X86ISD::DBPSADBW, 0),
+ X86_INTRINSIC_DATA(avx512_mask_div_pd_128, INTR_TYPE_2OP_MASK, ISD::FDIV, 0),
+ X86_INTRINSIC_DATA(avx512_mask_div_pd_256, INTR_TYPE_2OP_MASK, ISD::FDIV, 0),
+ X86_INTRINSIC_DATA(avx512_mask_div_pd_512, INTR_TYPE_2OP_MASK, ISD::FDIV,
+ X86ISD::FDIV_RND),
+ X86_INTRINSIC_DATA(avx512_mask_div_ps_128, INTR_TYPE_2OP_MASK, ISD::FDIV, 0),
+ X86_INTRINSIC_DATA(avx512_mask_div_ps_256, INTR_TYPE_2OP_MASK, ISD::FDIV, 0),
+ X86_INTRINSIC_DATA(avx512_mask_div_ps_512, INTR_TYPE_2OP_MASK, ISD::FDIV,
+ X86ISD::FDIV_RND),
+ X86_INTRINSIC_DATA(avx512_mask_div_sd_round, INTR_TYPE_SCALAR_MASK_RM, ISD::FDIV,
+ X86ISD::FDIV_RND),
+ X86_INTRINSIC_DATA(avx512_mask_div_ss_round, INTR_TYPE_SCALAR_MASK_RM, ISD::FDIV,
+ X86ISD::FDIV_RND),
+ X86_INTRINSIC_DATA(avx512_mask_expand_d_128, COMPRESS_EXPAND_IN_REG,
+ X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_expand_d_256, COMPRESS_EXPAND_IN_REG,
+ X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_expand_d_512, COMPRESS_EXPAND_IN_REG,
+ X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_expand_pd_128, COMPRESS_EXPAND_IN_REG,
+ X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_expand_pd_256, COMPRESS_EXPAND_IN_REG,
+ X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_expand_pd_512, COMPRESS_EXPAND_IN_REG,
+ X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_expand_ps_128, COMPRESS_EXPAND_IN_REG,
+ X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_expand_ps_256, COMPRESS_EXPAND_IN_REG,
+ X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_expand_ps_512, COMPRESS_EXPAND_IN_REG,
+ X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_expand_q_128, COMPRESS_EXPAND_IN_REG,
+ X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_expand_q_256, COMPRESS_EXPAND_IN_REG,
+ X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_expand_q_512, COMPRESS_EXPAND_IN_REG,
+ X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_fpclass_pd_128, FPCLASS, X86ISD::VFPCLASS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_fpclass_pd_256, FPCLASS, X86ISD::VFPCLASS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_fpclass_pd_512, FPCLASS, X86ISD::VFPCLASS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_fpclass_ps_128, FPCLASS, X86ISD::VFPCLASS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_fpclass_ps_256, FPCLASS, X86ISD::VFPCLASS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_fpclass_ps_512, FPCLASS, X86ISD::VFPCLASS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_fpclass_sd, FPCLASSS, X86ISD::VFPCLASSS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_fpclass_ss, FPCLASSS, X86ISD::VFPCLASSS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_getexp_pd_128, INTR_TYPE_1OP_MASK_RM,
+ X86ISD::FGETEXP_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_getexp_pd_256, INTR_TYPE_1OP_MASK_RM,
+ X86ISD::FGETEXP_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_getexp_pd_512, INTR_TYPE_1OP_MASK_RM,
+ X86ISD::FGETEXP_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_getexp_ps_128, INTR_TYPE_1OP_MASK_RM,
+ X86ISD::FGETEXP_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_getexp_ps_256, INTR_TYPE_1OP_MASK_RM,
+ X86ISD::FGETEXP_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_getexp_ps_512, INTR_TYPE_1OP_MASK_RM,
+ X86ISD::FGETEXP_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_getexp_sd, INTR_TYPE_SCALAR_MASK_RM,
+ X86ISD::FGETEXP_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_getexp_ss, INTR_TYPE_SCALAR_MASK_RM,
+ X86ISD::FGETEXP_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_getmant_pd_128, INTR_TYPE_2OP_MASK_RM,
+ X86ISD::VGETMANT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_getmant_pd_256, INTR_TYPE_2OP_MASK_RM,
+ X86ISD::VGETMANT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_getmant_pd_512, INTR_TYPE_2OP_MASK_RM,
+ X86ISD::VGETMANT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_getmant_ps_128, INTR_TYPE_2OP_MASK_RM,
+ X86ISD::VGETMANT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_getmant_ps_256, INTR_TYPE_2OP_MASK_RM,
+ X86ISD::VGETMANT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_getmant_ps_512, INTR_TYPE_2OP_MASK_RM,
+ X86ISD::VGETMANT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_getmant_sd, INTR_TYPE_3OP_SCALAR_MASK_RM,
+ X86ISD::VGETMANT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_getmant_ss, INTR_TYPE_3OP_SCALAR_MASK_RM,
+ X86ISD::VGETMANT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_insertf32x4_256, INSERT_SUBVEC,
+ ISD::INSERT_SUBVECTOR, 0),
+ X86_INTRINSIC_DATA(avx512_mask_insertf32x4_512, INSERT_SUBVEC,
+ ISD::INSERT_SUBVECTOR, 0),
+ X86_INTRINSIC_DATA(avx512_mask_insertf32x8_512, INSERT_SUBVEC,
+ ISD::INSERT_SUBVECTOR, 0),
+ X86_INTRINSIC_DATA(avx512_mask_insertf64x2_256, INSERT_SUBVEC,
+ ISD::INSERT_SUBVECTOR, 0),
+ X86_INTRINSIC_DATA(avx512_mask_insertf64x2_512, INSERT_SUBVEC,
+ ISD::INSERT_SUBVECTOR, 0),
+ X86_INTRINSIC_DATA(avx512_mask_insertf64x4_512, INSERT_SUBVEC,
+ ISD::INSERT_SUBVECTOR, 0),
+ X86_INTRINSIC_DATA(avx512_mask_inserti32x4_256, INSERT_SUBVEC,
+ ISD::INSERT_SUBVECTOR, 0),
+ X86_INTRINSIC_DATA(avx512_mask_inserti32x4_512, INSERT_SUBVEC,
+ ISD::INSERT_SUBVECTOR, 0),
+ X86_INTRINSIC_DATA(avx512_mask_inserti32x8_512, INSERT_SUBVEC,
+ ISD::INSERT_SUBVECTOR, 0),
+ X86_INTRINSIC_DATA(avx512_mask_inserti64x2_256, INSERT_SUBVEC,
+ ISD::INSERT_SUBVECTOR, 0),
+ X86_INTRINSIC_DATA(avx512_mask_inserti64x2_512, INSERT_SUBVEC,
+ ISD::INSERT_SUBVECTOR, 0),
+ X86_INTRINSIC_DATA(avx512_mask_inserti64x4_512, INSERT_SUBVEC,
+ ISD::INSERT_SUBVECTOR, 0),
+ X86_INTRINSIC_DATA(avx512_mask_lzcnt_d_128, INTR_TYPE_1OP_MASK,
+ ISD::CTLZ, 0),
+ X86_INTRINSIC_DATA(avx512_mask_lzcnt_d_256, INTR_TYPE_1OP_MASK,
+ ISD::CTLZ, 0),
+ X86_INTRINSIC_DATA(avx512_mask_lzcnt_d_512, INTR_TYPE_1OP_MASK,
+ ISD::CTLZ, 0),
+ X86_INTRINSIC_DATA(avx512_mask_lzcnt_q_128, INTR_TYPE_1OP_MASK,
+ ISD::CTLZ, 0),
+ X86_INTRINSIC_DATA(avx512_mask_lzcnt_q_256, INTR_TYPE_1OP_MASK,
+ ISD::CTLZ, 0),
+ X86_INTRINSIC_DATA(avx512_mask_lzcnt_q_512, INTR_TYPE_1OP_MASK,
+ ISD::CTLZ, 0),
+ X86_INTRINSIC_DATA(avx512_mask_max_pd_128, INTR_TYPE_2OP_MASK, X86ISD::FMAX, 0),
+ X86_INTRINSIC_DATA(avx512_mask_max_pd_256, INTR_TYPE_2OP_MASK, X86ISD::FMAX, 0),
+ X86_INTRINSIC_DATA(avx512_mask_max_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FMAX,
+ X86ISD::FMAX_RND),
+ X86_INTRINSIC_DATA(avx512_mask_max_ps_128, INTR_TYPE_2OP_MASK, X86ISD::FMAX, 0),
+ X86_INTRINSIC_DATA(avx512_mask_max_ps_256, INTR_TYPE_2OP_MASK, X86ISD::FMAX, 0),
+ X86_INTRINSIC_DATA(avx512_mask_max_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FMAX,
+ X86ISD::FMAX_RND),
+ X86_INTRINSIC_DATA(avx512_mask_max_sd_round, INTR_TYPE_SCALAR_MASK_RM,
+ X86ISD::FMAX, X86ISD::FMAX_RND),
+ X86_INTRINSIC_DATA(avx512_mask_max_ss_round, INTR_TYPE_SCALAR_MASK_RM,
+ X86ISD::FMAX, X86ISD::FMAX_RND),
+ X86_INTRINSIC_DATA(avx512_mask_min_pd_128, INTR_TYPE_2OP_MASK, X86ISD::FMIN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_min_pd_256, INTR_TYPE_2OP_MASK, X86ISD::FMIN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_min_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FMIN,
+ X86ISD::FMIN_RND),
+ X86_INTRINSIC_DATA(avx512_mask_min_ps_128, INTR_TYPE_2OP_MASK, X86ISD::FMIN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_min_ps_256, INTR_TYPE_2OP_MASK, X86ISD::FMIN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_min_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FMIN,
+ X86ISD::FMIN_RND),
+ X86_INTRINSIC_DATA(avx512_mask_min_sd_round, INTR_TYPE_SCALAR_MASK_RM,
+ X86ISD::FMIN, X86ISD::FMIN_RND),
+ X86_INTRINSIC_DATA(avx512_mask_min_ss_round, INTR_TYPE_SCALAR_MASK_RM,
+ X86ISD::FMIN, X86ISD::FMIN_RND),
+ X86_INTRINSIC_DATA(avx512_mask_movddup_128, INTR_TYPE_1OP_MASK,
+ X86ISD::MOVDDUP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_movddup_256, INTR_TYPE_1OP_MASK,
+ X86ISD::MOVDDUP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_movddup_512, INTR_TYPE_1OP_MASK,
+ X86ISD::MOVDDUP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_move_sd, INTR_TYPE_SCALAR_MASK,
+ X86ISD::MOVSD, 0),
+ X86_INTRINSIC_DATA(avx512_mask_move_ss, INTR_TYPE_SCALAR_MASK,
+ X86ISD::MOVSS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_movshdup_128, INTR_TYPE_1OP_MASK,
+ X86ISD::MOVSHDUP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_movshdup_256, INTR_TYPE_1OP_MASK,
+ X86ISD::MOVSHDUP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_movshdup_512, INTR_TYPE_1OP_MASK,
+ X86ISD::MOVSHDUP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_movsldup_128, INTR_TYPE_1OP_MASK,
+ X86ISD::MOVSLDUP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_movsldup_256, INTR_TYPE_1OP_MASK,
+ X86ISD::MOVSLDUP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_movsldup_512, INTR_TYPE_1OP_MASK,
+ X86ISD::MOVSLDUP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_mul_pd_128, INTR_TYPE_2OP_MASK, ISD::FMUL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_mul_pd_256, INTR_TYPE_2OP_MASK, ISD::FMUL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_mul_pd_512, INTR_TYPE_2OP_MASK, ISD::FMUL,
+ X86ISD::FMUL_RND),
+ X86_INTRINSIC_DATA(avx512_mask_mul_ps_128, INTR_TYPE_2OP_MASK, ISD::FMUL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_mul_ps_256, INTR_TYPE_2OP_MASK, ISD::FMUL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_mul_ps_512, INTR_TYPE_2OP_MASK, ISD::FMUL,
+ X86ISD::FMUL_RND),
+ X86_INTRINSIC_DATA(avx512_mask_mul_sd_round, INTR_TYPE_SCALAR_MASK_RM, ISD::FMUL,
+ X86ISD::FMUL_RND),
+ X86_INTRINSIC_DATA(avx512_mask_mul_ss_round, INTR_TYPE_SCALAR_MASK_RM, ISD::FMUL,
+ X86ISD::FMUL_RND),
+ X86_INTRINSIC_DATA(avx512_mask_or_pd_128, INTR_TYPE_2OP_MASK, X86ISD::FOR, 0),
+ X86_INTRINSIC_DATA(avx512_mask_or_pd_256, INTR_TYPE_2OP_MASK, X86ISD::FOR, 0),
+ X86_INTRINSIC_DATA(avx512_mask_or_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FOR, 0),
+ X86_INTRINSIC_DATA(avx512_mask_or_ps_128, INTR_TYPE_2OP_MASK, X86ISD::FOR, 0),
+ X86_INTRINSIC_DATA(avx512_mask_or_ps_256, INTR_TYPE_2OP_MASK, X86ISD::FOR, 0),
+ X86_INTRINSIC_DATA(avx512_mask_or_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FOR, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pabs_b_128, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pabs_b_256, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pabs_b_512, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pabs_d_128, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pabs_d_256, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pabs_d_512, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pabs_q_128, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pabs_q_256, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pabs_q_512, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pabs_w_128, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pabs_w_256, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pabs_w_512, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_packssdw_128, INTR_TYPE_2OP_MASK, X86ISD::PACKSS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_packssdw_256, INTR_TYPE_2OP_MASK, X86ISD::PACKSS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_packssdw_512, INTR_TYPE_2OP_MASK, X86ISD::PACKSS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_packsswb_128, INTR_TYPE_2OP_MASK, X86ISD::PACKSS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_packsswb_256, INTR_TYPE_2OP_MASK, X86ISD::PACKSS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_packsswb_512, INTR_TYPE_2OP_MASK, X86ISD::PACKSS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_packusdw_128, INTR_TYPE_2OP_MASK, X86ISD::PACKUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_packusdw_256, INTR_TYPE_2OP_MASK, X86ISD::PACKUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_packusdw_512, INTR_TYPE_2OP_MASK, X86ISD::PACKUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_packuswb_128, INTR_TYPE_2OP_MASK, X86ISD::PACKUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_packuswb_256, INTR_TYPE_2OP_MASK, X86ISD::PACKUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_packuswb_512, INTR_TYPE_2OP_MASK, X86ISD::PACKUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_padd_b_128, INTR_TYPE_2OP_MASK, ISD::ADD, 0),
+ X86_INTRINSIC_DATA(avx512_mask_padd_b_256, INTR_TYPE_2OP_MASK, ISD::ADD, 0),
+ X86_INTRINSIC_DATA(avx512_mask_padd_b_512, INTR_TYPE_2OP_MASK, ISD::ADD, 0),
+ X86_INTRINSIC_DATA(avx512_mask_padd_d_128, INTR_TYPE_2OP_MASK, ISD::ADD, 0),
+ X86_INTRINSIC_DATA(avx512_mask_padd_d_256, INTR_TYPE_2OP_MASK, ISD::ADD, 0),
+ X86_INTRINSIC_DATA(avx512_mask_padd_d_512, INTR_TYPE_2OP_MASK, ISD::ADD, 0),
+ X86_INTRINSIC_DATA(avx512_mask_padd_q_128, INTR_TYPE_2OP_MASK, ISD::ADD, 0),
+ X86_INTRINSIC_DATA(avx512_mask_padd_q_256, INTR_TYPE_2OP_MASK, ISD::ADD, 0),
+ X86_INTRINSIC_DATA(avx512_mask_padd_q_512, INTR_TYPE_2OP_MASK, ISD::ADD, 0),
+ X86_INTRINSIC_DATA(avx512_mask_padd_w_128, INTR_TYPE_2OP_MASK, ISD::ADD, 0),
+ X86_INTRINSIC_DATA(avx512_mask_padd_w_256, INTR_TYPE_2OP_MASK, ISD::ADD, 0),
+ X86_INTRINSIC_DATA(avx512_mask_padd_w_512, INTR_TYPE_2OP_MASK, ISD::ADD, 0),
+ X86_INTRINSIC_DATA(avx512_mask_padds_b_128, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_padds_b_256, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_padds_b_512, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_padds_w_128, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_padds_w_256, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_padds_w_512, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_paddus_b_128, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_paddus_b_256, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_paddus_b_512, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_paddus_w_128, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_paddus_w_256, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_paddus_w_512, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_palignr_128, INTR_TYPE_3OP_IMM8_MASK,
+ X86ISD::PALIGNR, 0),
+ X86_INTRINSIC_DATA(avx512_mask_palignr_256, INTR_TYPE_3OP_IMM8_MASK,
+ X86ISD::PALIGNR, 0),
+ X86_INTRINSIC_DATA(avx512_mask_palignr_512, INTR_TYPE_3OP_IMM8_MASK,
+ X86ISD::PALIGNR, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pand_d_128, INTR_TYPE_2OP_MASK, ISD::AND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pand_d_256, INTR_TYPE_2OP_MASK, ISD::AND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pand_d_512, INTR_TYPE_2OP_MASK, ISD::AND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pand_q_128, INTR_TYPE_2OP_MASK, ISD::AND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pand_q_256, INTR_TYPE_2OP_MASK, ISD::AND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pand_q_512, INTR_TYPE_2OP_MASK, ISD::AND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pandn_d_128, INTR_TYPE_2OP_MASK, X86ISD::ANDNP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pandn_d_256, INTR_TYPE_2OP_MASK, X86ISD::ANDNP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pandn_d_512, INTR_TYPE_2OP_MASK, X86ISD::ANDNP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pandn_q_128, INTR_TYPE_2OP_MASK, X86ISD::ANDNP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pandn_q_256, INTR_TYPE_2OP_MASK, X86ISD::ANDNP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pandn_q_512, INTR_TYPE_2OP_MASK, X86ISD::ANDNP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pavg_b_128, INTR_TYPE_2OP_MASK, X86ISD::AVG, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pavg_b_256, INTR_TYPE_2OP_MASK, X86ISD::AVG, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pavg_b_512, INTR_TYPE_2OP_MASK, X86ISD::AVG, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pavg_w_128, INTR_TYPE_2OP_MASK, X86ISD::AVG, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pavg_w_256, INTR_TYPE_2OP_MASK, X86ISD::AVG, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pavg_w_512, INTR_TYPE_2OP_MASK, X86ISD::AVG, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pcmpeq_b_128, CMP_MASK, X86ISD::PCMPEQM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pcmpeq_b_256, CMP_MASK, X86ISD::PCMPEQM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pcmpeq_b_512, CMP_MASK, X86ISD::PCMPEQM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pcmpeq_d_128, CMP_MASK, X86ISD::PCMPEQM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pcmpeq_d_256, CMP_MASK, X86ISD::PCMPEQM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pcmpeq_d_512, CMP_MASK, X86ISD::PCMPEQM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pcmpeq_q_128, CMP_MASK, X86ISD::PCMPEQM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pcmpeq_q_256, CMP_MASK, X86ISD::PCMPEQM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pcmpeq_q_512, CMP_MASK, X86ISD::PCMPEQM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pcmpeq_w_128, CMP_MASK, X86ISD::PCMPEQM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pcmpeq_w_256, CMP_MASK, X86ISD::PCMPEQM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pcmpeq_w_512, CMP_MASK, X86ISD::PCMPEQM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pcmpgt_b_128, CMP_MASK, X86ISD::PCMPGTM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pcmpgt_b_256, CMP_MASK, X86ISD::PCMPGTM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pcmpgt_b_512, CMP_MASK, X86ISD::PCMPGTM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pcmpgt_d_128, CMP_MASK, X86ISD::PCMPGTM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pcmpgt_d_256, CMP_MASK, X86ISD::PCMPGTM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pcmpgt_d_512, CMP_MASK, X86ISD::PCMPGTM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pcmpgt_q_128, CMP_MASK, X86ISD::PCMPGTM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pcmpgt_q_256, CMP_MASK, X86ISD::PCMPGTM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pcmpgt_q_512, CMP_MASK, X86ISD::PCMPGTM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pcmpgt_w_128, CMP_MASK, X86ISD::PCMPGTM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pcmpgt_w_256, CMP_MASK, X86ISD::PCMPGTM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pcmpgt_w_512, CMP_MASK, X86ISD::PCMPGTM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmaddubs_w_128, INTR_TYPE_2OP_MASK,
+ X86ISD::VPMADDUBSW, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmaddubs_w_256, INTR_TYPE_2OP_MASK,
+ X86ISD::VPMADDUBSW, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmaddubs_w_512, INTR_TYPE_2OP_MASK,
+ X86ISD::VPMADDUBSW, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmaddw_d_128, INTR_TYPE_2OP_MASK,
+ X86ISD::VPMADDWD, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmaddw_d_256, INTR_TYPE_2OP_MASK,
+ X86ISD::VPMADDWD, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmaddw_d_512, INTR_TYPE_2OP_MASK,
+ X86ISD::VPMADDWD, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmaxs_b_128, INTR_TYPE_2OP_MASK, ISD::SMAX, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmaxs_b_256, INTR_TYPE_2OP_MASK, ISD::SMAX, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmaxs_b_512, INTR_TYPE_2OP_MASK, ISD::SMAX, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmaxs_d_128, INTR_TYPE_2OP_MASK, ISD::SMAX, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmaxs_d_256, INTR_TYPE_2OP_MASK, ISD::SMAX, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmaxs_d_512, INTR_TYPE_2OP_MASK, ISD::SMAX, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmaxs_q_128, INTR_TYPE_2OP_MASK, ISD::SMAX, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmaxs_q_256, INTR_TYPE_2OP_MASK, ISD::SMAX, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmaxs_q_512, INTR_TYPE_2OP_MASK, ISD::SMAX, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmaxs_w_128, INTR_TYPE_2OP_MASK, ISD::SMAX, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmaxs_w_256, INTR_TYPE_2OP_MASK, ISD::SMAX, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmaxs_w_512, INTR_TYPE_2OP_MASK, ISD::SMAX, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmaxu_b_128, INTR_TYPE_2OP_MASK, ISD::UMAX, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmaxu_b_256, INTR_TYPE_2OP_MASK, ISD::UMAX, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmaxu_b_512, INTR_TYPE_2OP_MASK, ISD::UMAX, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmaxu_d_128, INTR_TYPE_2OP_MASK, ISD::UMAX, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmaxu_d_256, INTR_TYPE_2OP_MASK, ISD::UMAX, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmaxu_d_512, INTR_TYPE_2OP_MASK, ISD::UMAX, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmaxu_q_128, INTR_TYPE_2OP_MASK, ISD::UMAX, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmaxu_q_256, INTR_TYPE_2OP_MASK, ISD::UMAX, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmaxu_q_512, INTR_TYPE_2OP_MASK, ISD::UMAX, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmaxu_w_128, INTR_TYPE_2OP_MASK, ISD::UMAX, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmaxu_w_256, INTR_TYPE_2OP_MASK, ISD::UMAX, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmaxu_w_512, INTR_TYPE_2OP_MASK, ISD::UMAX, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmins_b_128, INTR_TYPE_2OP_MASK, ISD::SMIN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmins_b_256, INTR_TYPE_2OP_MASK, ISD::SMIN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmins_b_512, INTR_TYPE_2OP_MASK, ISD::SMIN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmins_d_128, INTR_TYPE_2OP_MASK, ISD::SMIN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmins_d_256, INTR_TYPE_2OP_MASK, ISD::SMIN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmins_d_512, INTR_TYPE_2OP_MASK, ISD::SMIN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmins_q_128, INTR_TYPE_2OP_MASK, ISD::SMIN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmins_q_256, INTR_TYPE_2OP_MASK, ISD::SMIN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmins_q_512, INTR_TYPE_2OP_MASK, ISD::SMIN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmins_w_128, INTR_TYPE_2OP_MASK, ISD::SMIN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmins_w_256, INTR_TYPE_2OP_MASK, ISD::SMIN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmins_w_512, INTR_TYPE_2OP_MASK, ISD::SMIN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pminu_b_128, INTR_TYPE_2OP_MASK, ISD::UMIN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pminu_b_256, INTR_TYPE_2OP_MASK, ISD::UMIN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pminu_b_512, INTR_TYPE_2OP_MASK, ISD::UMIN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pminu_d_128, INTR_TYPE_2OP_MASK, ISD::UMIN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pminu_d_256, INTR_TYPE_2OP_MASK, ISD::UMIN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pminu_d_512, INTR_TYPE_2OP_MASK, ISD::UMIN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pminu_q_128, INTR_TYPE_2OP_MASK, ISD::UMIN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pminu_q_256, INTR_TYPE_2OP_MASK, ISD::UMIN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pminu_q_512, INTR_TYPE_2OP_MASK, ISD::UMIN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pminu_w_128, INTR_TYPE_2OP_MASK, ISD::UMIN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pminu_w_256, INTR_TYPE_2OP_MASK, ISD::UMIN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pminu_w_512, INTR_TYPE_2OP_MASK, ISD::UMIN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_db_128, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_db_256, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_db_512, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_dw_128, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_dw_256, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_dw_512, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_qb_128, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_qb_256, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_qb_512, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_qd_128, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_qd_256, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_qd_512, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_qw_128, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_qw_256, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_qw_512, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_wb_128, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_wb_256, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_wb_512, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_db_128, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_db_256, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_db_512, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_dw_128, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_dw_256, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_dw_512, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_qb_128, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_qb_256, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_qb_512, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_qd_128, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_qd_256, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_qd_512, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_qw_128, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_qw_256, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_qw_512, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_wb_128, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_wb_256, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_wb_512, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_db_128, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_db_256, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_db_512, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_dw_128, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_dw_256, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_dw_512, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_qb_128, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_qb_256, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_qb_512, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_qd_128, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_qd_256, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_qd_512, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_qw_128, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_qw_256, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_qw_512, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_wb_128, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_wb_256, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_wb_512, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmul_dq_128, INTR_TYPE_2OP_MASK,
+ X86ISD::PMULDQ, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmul_dq_256, INTR_TYPE_2OP_MASK,
+ X86ISD::PMULDQ, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmul_dq_512, INTR_TYPE_2OP_MASK,
+ X86ISD::PMULDQ, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmul_hr_sw_128, INTR_TYPE_2OP_MASK, X86ISD::MULHRS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmul_hr_sw_256, INTR_TYPE_2OP_MASK, X86ISD::MULHRS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmul_hr_sw_512, INTR_TYPE_2OP_MASK, X86ISD::MULHRS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmulh_w_128, INTR_TYPE_2OP_MASK, ISD::MULHS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmulh_w_256, INTR_TYPE_2OP_MASK, ISD::MULHS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmulh_w_512, INTR_TYPE_2OP_MASK, ISD::MULHS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmulhu_w_128, INTR_TYPE_2OP_MASK, ISD::MULHU, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmulhu_w_256, INTR_TYPE_2OP_MASK, ISD::MULHU, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmulhu_w_512, INTR_TYPE_2OP_MASK, ISD::MULHU, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmull_d_128, INTR_TYPE_2OP_MASK, ISD::MUL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmull_d_256, INTR_TYPE_2OP_MASK, ISD::MUL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmull_d_512, INTR_TYPE_2OP_MASK, ISD::MUL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmull_q_128, INTR_TYPE_2OP_MASK, ISD::MUL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmull_q_256, INTR_TYPE_2OP_MASK, ISD::MUL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmull_q_512, INTR_TYPE_2OP_MASK, ISD::MUL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmull_w_128, INTR_TYPE_2OP_MASK, ISD::MUL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmull_w_256, INTR_TYPE_2OP_MASK, ISD::MUL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmull_w_512, INTR_TYPE_2OP_MASK, ISD::MUL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmulu_dq_128, INTR_TYPE_2OP_MASK,
+ X86ISD::PMULUDQ, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmulu_dq_256, INTR_TYPE_2OP_MASK,
+ X86ISD::PMULUDQ, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmulu_dq_512, INTR_TYPE_2OP_MASK,
+ X86ISD::PMULUDQ, 0),
+ X86_INTRINSIC_DATA(avx512_mask_por_d_128, INTR_TYPE_2OP_MASK, ISD::OR, 0),
+ X86_INTRINSIC_DATA(avx512_mask_por_d_256, INTR_TYPE_2OP_MASK, ISD::OR, 0),
+ X86_INTRINSIC_DATA(avx512_mask_por_d_512, INTR_TYPE_2OP_MASK, ISD::OR, 0),
+ X86_INTRINSIC_DATA(avx512_mask_por_q_128, INTR_TYPE_2OP_MASK, ISD::OR, 0),
+ X86_INTRINSIC_DATA(avx512_mask_por_q_256, INTR_TYPE_2OP_MASK, ISD::OR, 0),
+ X86_INTRINSIC_DATA(avx512_mask_por_q_512, INTR_TYPE_2OP_MASK, ISD::OR, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pshuf_b_128, INTR_TYPE_2OP_MASK,
+ X86ISD::PSHUFB, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pshuf_b_256, INTR_TYPE_2OP_MASK,
+ X86ISD::PSHUFB, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pshuf_b_512, INTR_TYPE_2OP_MASK,
+ X86ISD::PSHUFB, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psll_d, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psll_d_128, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psll_d_256, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psll_di_128, INTR_TYPE_2OP_MASK, X86ISD::VSHLI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psll_di_256, INTR_TYPE_2OP_MASK, X86ISD::VSHLI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psll_di_512, INTR_TYPE_2OP_MASK, X86ISD::VSHLI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psll_q, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psll_q_128, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psll_q_256, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psll_qi_128, INTR_TYPE_2OP_MASK, X86ISD::VSHLI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psll_qi_256, INTR_TYPE_2OP_MASK, X86ISD::VSHLI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psll_qi_512, INTR_TYPE_2OP_MASK, X86ISD::VSHLI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pslli_d, VSHIFT_MASK, X86ISD::VSHLI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pslli_q, VSHIFT_MASK, X86ISD::VSHLI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psllv_d, INTR_TYPE_2OP_MASK, ISD::SHL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psllv_q, INTR_TYPE_2OP_MASK, ISD::SHL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psra_d, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psra_d_128, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psra_d_256, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psra_di_128, INTR_TYPE_2OP_MASK, X86ISD::VSRAI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psra_di_256, INTR_TYPE_2OP_MASK, X86ISD::VSRAI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psra_di_512, INTR_TYPE_2OP_MASK, X86ISD::VSRAI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psra_q, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psra_q_128, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psra_q_256, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psra_qi_128, INTR_TYPE_2OP_MASK, X86ISD::VSRAI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psra_qi_256, INTR_TYPE_2OP_MASK, X86ISD::VSRAI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psra_qi_512, INTR_TYPE_2OP_MASK, X86ISD::VSRAI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psra_w_128, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psra_w_256, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psra_w_512, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psra_wi_128, INTR_TYPE_2OP_MASK, X86ISD::VSRAI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psra_wi_256, INTR_TYPE_2OP_MASK, X86ISD::VSRAI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psra_wi_512, INTR_TYPE_2OP_MASK, X86ISD::VSRAI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psrai_d, VSHIFT_MASK, X86ISD::VSRAI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psrai_q, VSHIFT_MASK, X86ISD::VSRAI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psrav_d, INTR_TYPE_2OP_MASK, ISD::SRA, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psrav_q, INTR_TYPE_2OP_MASK, ISD::SRA, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psrl_d, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psrl_d_128, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psrl_d_256, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psrl_di_128, INTR_TYPE_2OP_MASK, X86ISD::VSRLI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psrl_di_256, INTR_TYPE_2OP_MASK, X86ISD::VSRLI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psrl_di_512, INTR_TYPE_2OP_MASK, X86ISD::VSRLI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psrl_q, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psrl_q_128, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psrl_q_256, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psrl_qi_128, INTR_TYPE_2OP_MASK, X86ISD::VSRLI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psrl_qi_256, INTR_TYPE_2OP_MASK, X86ISD::VSRLI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psrl_qi_512, INTR_TYPE_2OP_MASK, X86ISD::VSRLI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psrl_w_128, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psrl_w_256, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psrl_w_512, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psrl_wi_128, INTR_TYPE_2OP_MASK, X86ISD::VSRLI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psrl_wi_256, INTR_TYPE_2OP_MASK, X86ISD::VSRLI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psrl_wi_512, INTR_TYPE_2OP_MASK, X86ISD::VSRLI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psrli_d, VSHIFT_MASK, X86ISD::VSRLI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psrli_q, VSHIFT_MASK, X86ISD::VSRLI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psrlv16_hi, INTR_TYPE_2OP_MASK, ISD::SRL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psrlv2_di, INTR_TYPE_2OP_MASK, ISD::SRL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psrlv32hi, INTR_TYPE_2OP_MASK, ISD::SRL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psrlv4_di, INTR_TYPE_2OP_MASK, ISD::SRL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psrlv4_si, INTR_TYPE_2OP_MASK, ISD::SRL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psrlv8_hi, INTR_TYPE_2OP_MASK, ISD::SRL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psrlv8_si, INTR_TYPE_2OP_MASK, ISD::SRL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psrlv_d, INTR_TYPE_2OP_MASK, ISD::SRL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psrlv_q, INTR_TYPE_2OP_MASK, ISD::SRL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psub_b_128, INTR_TYPE_2OP_MASK, ISD::SUB, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psub_b_256, INTR_TYPE_2OP_MASK, ISD::SUB, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psub_b_512, INTR_TYPE_2OP_MASK, ISD::SUB, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psub_d_128, INTR_TYPE_2OP_MASK, ISD::SUB, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psub_d_256, INTR_TYPE_2OP_MASK, ISD::SUB, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psub_d_512, INTR_TYPE_2OP_MASK, ISD::SUB, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psub_q_128, INTR_TYPE_2OP_MASK, ISD::SUB, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psub_q_256, INTR_TYPE_2OP_MASK, ISD::SUB, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psub_q_512, INTR_TYPE_2OP_MASK, ISD::SUB, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psub_w_128, INTR_TYPE_2OP_MASK, ISD::SUB, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psub_w_256, INTR_TYPE_2OP_MASK, ISD::SUB, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psub_w_512, INTR_TYPE_2OP_MASK, ISD::SUB, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psubs_b_128, INTR_TYPE_2OP_MASK, X86ISD::SUBS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psubs_b_256, INTR_TYPE_2OP_MASK, X86ISD::SUBS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psubs_b_512, INTR_TYPE_2OP_MASK, X86ISD::SUBS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psubs_w_128, INTR_TYPE_2OP_MASK, X86ISD::SUBS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psubs_w_256, INTR_TYPE_2OP_MASK, X86ISD::SUBS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psubs_w_512, INTR_TYPE_2OP_MASK, X86ISD::SUBS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psubus_b_128, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psubus_b_256, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psubus_b_512, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psubus_w_128, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psubus_w_256, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psubus_w_512, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pternlog_d_128, TERLOG_OP_MASK,
+ X86ISD::VPTERNLOG, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pternlog_d_256, TERLOG_OP_MASK,
+ X86ISD::VPTERNLOG, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pternlog_d_512, TERLOG_OP_MASK,
+ X86ISD::VPTERNLOG, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pternlog_q_128, TERLOG_OP_MASK,
+ X86ISD::VPTERNLOG, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pternlog_q_256, TERLOG_OP_MASK,
+ X86ISD::VPTERNLOG, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pternlog_q_512, TERLOG_OP_MASK,
+ X86ISD::VPTERNLOG, 0),
+ X86_INTRINSIC_DATA(avx512_mask_punpckhb_w_128, INTR_TYPE_2OP_MASK,
+ X86ISD::UNPCKH, 0),
+ X86_INTRINSIC_DATA(avx512_mask_punpckhb_w_256, INTR_TYPE_2OP_MASK,
+ X86ISD::UNPCKH, 0),
+ X86_INTRINSIC_DATA(avx512_mask_punpckhb_w_512, INTR_TYPE_2OP_MASK,
+ X86ISD::UNPCKH, 0),
+ X86_INTRINSIC_DATA(avx512_mask_punpckhd_q_128, INTR_TYPE_2OP_MASK,
+ X86ISD::UNPCKH, 0),
+ X86_INTRINSIC_DATA(avx512_mask_punpckhd_q_256, INTR_TYPE_2OP_MASK,
+ X86ISD::UNPCKH, 0),
+ X86_INTRINSIC_DATA(avx512_mask_punpckhd_q_512, INTR_TYPE_2OP_MASK,
+ X86ISD::UNPCKH, 0),
+ X86_INTRINSIC_DATA(avx512_mask_punpckhqd_q_128, INTR_TYPE_2OP_MASK,
+ X86ISD::UNPCKH, 0),
+ X86_INTRINSIC_DATA(avx512_mask_punpckhqd_q_256, INTR_TYPE_2OP_MASK,
+ X86ISD::UNPCKH, 0),
+ X86_INTRINSIC_DATA(avx512_mask_punpckhqd_q_512, INTR_TYPE_2OP_MASK,
+ X86ISD::UNPCKH, 0),
+ X86_INTRINSIC_DATA(avx512_mask_punpckhw_d_128, INTR_TYPE_2OP_MASK,
+ X86ISD::UNPCKH, 0),
+ X86_INTRINSIC_DATA(avx512_mask_punpckhw_d_256, INTR_TYPE_2OP_MASK,
+ X86ISD::UNPCKH, 0),
+ X86_INTRINSIC_DATA(avx512_mask_punpckhw_d_512, INTR_TYPE_2OP_MASK,
+ X86ISD::UNPCKH, 0),
+ X86_INTRINSIC_DATA(avx512_mask_punpcklb_w_128, INTR_TYPE_2OP_MASK,
+ X86ISD::UNPCKL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_punpcklb_w_256, INTR_TYPE_2OP_MASK,
+ X86ISD::UNPCKL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_punpcklb_w_512, INTR_TYPE_2OP_MASK,
+ X86ISD::UNPCKL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_punpckld_q_128, INTR_TYPE_2OP_MASK,
+ X86ISD::UNPCKL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_punpckld_q_256, INTR_TYPE_2OP_MASK,
+ X86ISD::UNPCKL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_punpckld_q_512, INTR_TYPE_2OP_MASK,
+ X86ISD::UNPCKL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_punpcklqd_q_128, INTR_TYPE_2OP_MASK,
+ X86ISD::UNPCKL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_punpcklqd_q_256, INTR_TYPE_2OP_MASK,
+ X86ISD::UNPCKL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_punpcklqd_q_512, INTR_TYPE_2OP_MASK,
+ X86ISD::UNPCKL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_punpcklw_d_128, INTR_TYPE_2OP_MASK,
+ X86ISD::UNPCKL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_punpcklw_d_256, INTR_TYPE_2OP_MASK,
+ X86ISD::UNPCKL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_punpcklw_d_512, INTR_TYPE_2OP_MASK,
+ X86ISD::UNPCKL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pxor_d_128, INTR_TYPE_2OP_MASK, ISD::XOR, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pxor_d_256, INTR_TYPE_2OP_MASK, ISD::XOR, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pxor_d_512, INTR_TYPE_2OP_MASK, ISD::XOR, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pxor_q_128, INTR_TYPE_2OP_MASK, ISD::XOR, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pxor_q_256, INTR_TYPE_2OP_MASK, ISD::XOR, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pxor_q_512, INTR_TYPE_2OP_MASK, ISD::XOR, 0),
+ X86_INTRINSIC_DATA(avx512_mask_range_pd_128, INTR_TYPE_3OP_MASK_RM, X86ISD::VRANGE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_range_pd_256, INTR_TYPE_3OP_MASK_RM, X86ISD::VRANGE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_range_pd_512, INTR_TYPE_3OP_MASK_RM, X86ISD::VRANGE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_range_ps_128, INTR_TYPE_3OP_MASK_RM, X86ISD::VRANGE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_range_ps_256, INTR_TYPE_3OP_MASK_RM, X86ISD::VRANGE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_range_ps_512, INTR_TYPE_3OP_MASK_RM, X86ISD::VRANGE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_range_sd, INTR_TYPE_SCALAR_MASK_RM, X86ISD::VRANGE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_range_ss, INTR_TYPE_SCALAR_MASK_RM, X86ISD::VRANGE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_reduce_pd_128, INTR_TYPE_2OP_MASK_RM, X86ISD::VREDUCE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_reduce_pd_256, INTR_TYPE_2OP_MASK_RM, X86ISD::VREDUCE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_reduce_pd_512, INTR_TYPE_2OP_MASK_RM, X86ISD::VREDUCE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_reduce_ps_128, INTR_TYPE_2OP_MASK_RM, X86ISD::VREDUCE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_reduce_ps_256, INTR_TYPE_2OP_MASK_RM, X86ISD::VREDUCE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_reduce_ps_512, INTR_TYPE_2OP_MASK_RM, X86ISD::VREDUCE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_reduce_sd, INTR_TYPE_SCALAR_MASK_RM, X86ISD::VREDUCE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_reduce_ss, INTR_TYPE_SCALAR_MASK_RM, X86ISD::VREDUCE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_rndscale_pd_128, INTR_TYPE_2OP_MASK_RM, X86ISD::VRNDSCALE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_rndscale_pd_256, INTR_TYPE_2OP_MASK_RM, X86ISD::VRNDSCALE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_rndscale_pd_512, INTR_TYPE_2OP_MASK_RM, X86ISD::VRNDSCALE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_rndscale_ps_128, INTR_TYPE_2OP_MASK_RM, X86ISD::VRNDSCALE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_rndscale_ps_256, INTR_TYPE_2OP_MASK_RM, X86ISD::VRNDSCALE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_rndscale_ps_512, INTR_TYPE_2OP_MASK_RM, X86ISD::VRNDSCALE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_rndscale_sd, INTR_TYPE_SCALAR_MASK_RM,
+ X86ISD::VRNDSCALE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_rndscale_ss, INTR_TYPE_SCALAR_MASK_RM,
+ X86ISD::VRNDSCALE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scalef_pd_128, INTR_TYPE_2OP_MASK_RM,
+ X86ISD::SCALEF, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scalef_pd_256, INTR_TYPE_2OP_MASK_RM,
+ X86ISD::SCALEF, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scalef_pd_512, INTR_TYPE_2OP_MASK_RM,
+ X86ISD::SCALEF, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scalef_ps_128, INTR_TYPE_2OP_MASK_RM,
+ X86ISD::SCALEF, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scalef_ps_256, INTR_TYPE_2OP_MASK_RM,
+ X86ISD::SCALEF, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scalef_ps_512, INTR_TYPE_2OP_MASK_RM,
+ X86ISD::SCALEF, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scalef_sd, INTR_TYPE_SCALAR_MASK_RM,
+ X86ISD::SCALEF, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scalef_ss, INTR_TYPE_SCALAR_MASK_RM,
+ X86ISD::SCALEF, 0),
+ X86_INTRINSIC_DATA(avx512_mask_shuf_f32x4, INTR_TYPE_3OP_IMM8_MASK,
+ X86ISD::SHUF128, 0),
+ X86_INTRINSIC_DATA(avx512_mask_shuf_f32x4_256, INTR_TYPE_3OP_IMM8_MASK,
+ X86ISD::SHUF128, 0),
+ X86_INTRINSIC_DATA(avx512_mask_shuf_f64x2, INTR_TYPE_3OP_IMM8_MASK,
+ X86ISD::SHUF128, 0),
+ X86_INTRINSIC_DATA(avx512_mask_shuf_f64x2_256, INTR_TYPE_3OP_IMM8_MASK,
+ X86ISD::SHUF128, 0),
+ X86_INTRINSIC_DATA(avx512_mask_shuf_i32x4, INTR_TYPE_3OP_IMM8_MASK,
+ X86ISD::SHUF128, 0),
+ X86_INTRINSIC_DATA(avx512_mask_shuf_i32x4_256, INTR_TYPE_3OP_IMM8_MASK,
+ X86ISD::SHUF128, 0),
+ X86_INTRINSIC_DATA(avx512_mask_shuf_i64x2, INTR_TYPE_3OP_IMM8_MASK,
+ X86ISD::SHUF128, 0),
+ X86_INTRINSIC_DATA(avx512_mask_shuf_i64x2_256, INTR_TYPE_3OP_IMM8_MASK,
+ X86ISD::SHUF128, 0),
+ X86_INTRINSIC_DATA(avx512_mask_shuf_pd_128, INTR_TYPE_3OP_IMM8_MASK,
+ X86ISD::SHUFP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_shuf_pd_256, INTR_TYPE_3OP_IMM8_MASK,
+ X86ISD::SHUFP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_shuf_pd_512, INTR_TYPE_3OP_IMM8_MASK,
+ X86ISD::SHUFP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_shuf_ps_128, INTR_TYPE_3OP_IMM8_MASK,
+ X86ISD::SHUFP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_shuf_ps_256, INTR_TYPE_3OP_IMM8_MASK,
+ X86ISD::SHUFP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_shuf_ps_512, INTR_TYPE_3OP_IMM8_MASK,
+ X86ISD::SHUFP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_sqrt_pd_128, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_sqrt_pd_256, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_sqrt_pd_512, INTR_TYPE_1OP_MASK_RM, ISD::FSQRT,
+ X86ISD::FSQRT_RND),
+ X86_INTRINSIC_DATA(avx512_mask_sqrt_ps_128, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_sqrt_ps_256, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_sqrt_ps_512, INTR_TYPE_1OP_MASK_RM, ISD::FSQRT,
+ X86ISD::FSQRT_RND),
+ X86_INTRINSIC_DATA(avx512_mask_sqrt_sd, INTR_TYPE_SCALAR_MASK_RM,
+ X86ISD::FSQRT_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_sqrt_ss, INTR_TYPE_SCALAR_MASK_RM,
+ X86ISD::FSQRT_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_sub_pd_128, INTR_TYPE_2OP_MASK, ISD::FSUB, 0),
+ X86_INTRINSIC_DATA(avx512_mask_sub_pd_256, INTR_TYPE_2OP_MASK, ISD::FSUB, 0),
+ X86_INTRINSIC_DATA(avx512_mask_sub_pd_512, INTR_TYPE_2OP_MASK, ISD::FSUB,
+ X86ISD::FSUB_RND),
+ X86_INTRINSIC_DATA(avx512_mask_sub_ps_128, INTR_TYPE_2OP_MASK, ISD::FSUB, 0),
+ X86_INTRINSIC_DATA(avx512_mask_sub_ps_256, INTR_TYPE_2OP_MASK, ISD::FSUB, 0),
+ X86_INTRINSIC_DATA(avx512_mask_sub_ps_512, INTR_TYPE_2OP_MASK, ISD::FSUB,
+ X86ISD::FSUB_RND),
+ X86_INTRINSIC_DATA(avx512_mask_sub_sd_round, INTR_TYPE_SCALAR_MASK_RM, ISD::FSUB,
+ X86ISD::FSUB_RND),
+ X86_INTRINSIC_DATA(avx512_mask_sub_ss_round, INTR_TYPE_SCALAR_MASK_RM, ISD::FSUB,
+ X86ISD::FSUB_RND),
+ X86_INTRINSIC_DATA(avx512_mask_ucmp_b_128, CMP_MASK_CC, X86ISD::CMPMU, 0),
+ X86_INTRINSIC_DATA(avx512_mask_ucmp_b_256, CMP_MASK_CC, X86ISD::CMPMU, 0),
+ X86_INTRINSIC_DATA(avx512_mask_ucmp_b_512, CMP_MASK_CC, X86ISD::CMPMU, 0),
+ X86_INTRINSIC_DATA(avx512_mask_ucmp_d_128, CMP_MASK_CC, X86ISD::CMPMU, 0),
+ X86_INTRINSIC_DATA(avx512_mask_ucmp_d_256, CMP_MASK_CC, X86ISD::CMPMU, 0),
+ X86_INTRINSIC_DATA(avx512_mask_ucmp_d_512, CMP_MASK_CC, X86ISD::CMPMU, 0),
+ X86_INTRINSIC_DATA(avx512_mask_ucmp_q_128, CMP_MASK_CC, X86ISD::CMPMU, 0),
+ X86_INTRINSIC_DATA(avx512_mask_ucmp_q_256, CMP_MASK_CC, X86ISD::CMPMU, 0),
+ X86_INTRINSIC_DATA(avx512_mask_ucmp_q_512, CMP_MASK_CC, X86ISD::CMPMU, 0),
+ X86_INTRINSIC_DATA(avx512_mask_ucmp_w_128, CMP_MASK_CC, X86ISD::CMPMU, 0),
+ X86_INTRINSIC_DATA(avx512_mask_ucmp_w_256, CMP_MASK_CC, X86ISD::CMPMU, 0),
+ X86_INTRINSIC_DATA(avx512_mask_ucmp_w_512, CMP_MASK_CC, X86ISD::CMPMU, 0),
+ X86_INTRINSIC_DATA(avx512_mask_unpckh_pd_128, INTR_TYPE_2OP_MASK,
+ X86ISD::UNPCKH, 0),
+ X86_INTRINSIC_DATA(avx512_mask_unpckh_pd_256, INTR_TYPE_2OP_MASK,
+ X86ISD::UNPCKH, 0),
+ X86_INTRINSIC_DATA(avx512_mask_unpckh_pd_512, INTR_TYPE_2OP_MASK,
+ X86ISD::UNPCKH, 0),
+ X86_INTRINSIC_DATA(avx512_mask_unpckh_ps_128, INTR_TYPE_2OP_MASK,
+ X86ISD::UNPCKH, 0),
+ X86_INTRINSIC_DATA(avx512_mask_unpckh_ps_256, INTR_TYPE_2OP_MASK,
+ X86ISD::UNPCKH, 0),
+ X86_INTRINSIC_DATA(avx512_mask_unpckh_ps_512, INTR_TYPE_2OP_MASK,
+ X86ISD::UNPCKH, 0),
+ X86_INTRINSIC_DATA(avx512_mask_unpckl_pd_128, INTR_TYPE_2OP_MASK,
+ X86ISD::UNPCKL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_unpckl_pd_256, INTR_TYPE_2OP_MASK,
+ X86ISD::UNPCKL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_unpckl_pd_512, INTR_TYPE_2OP_MASK,
+ X86ISD::UNPCKL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_unpckl_ps_128, INTR_TYPE_2OP_MASK,
+ X86ISD::UNPCKL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_unpckl_ps_256, INTR_TYPE_2OP_MASK,
+ X86ISD::UNPCKL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_unpckl_ps_512, INTR_TYPE_2OP_MASK,
+ X86ISD::UNPCKL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_valign_d_128, INTR_TYPE_3OP_IMM8_MASK,
+ X86ISD::VALIGN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_valign_d_256, INTR_TYPE_3OP_IMM8_MASK,
+ X86ISD::VALIGN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_valign_d_512, INTR_TYPE_3OP_IMM8_MASK,
+ X86ISD::VALIGN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_valign_q_128, INTR_TYPE_3OP_IMM8_MASK,
+ X86ISD::VALIGN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_valign_q_256, INTR_TYPE_3OP_IMM8_MASK,
+ X86ISD::VALIGN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_valign_q_512, INTR_TYPE_3OP_IMM8_MASK,
+ X86ISD::VALIGN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_128, INTR_TYPE_1OP_MASK_RM,
+ ISD::FP16_TO_FP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_256, INTR_TYPE_1OP_MASK_RM,
+ ISD::FP16_TO_FP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_512, INTR_TYPE_1OP_MASK_RM,
+ ISD::FP16_TO_FP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_128, INTR_TYPE_2OP_MASK_RM,
+ ISD::FP_TO_FP16, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_256, INTR_TYPE_2OP_MASK_RM,
+ ISD::FP_TO_FP16, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_512, INTR_TYPE_2OP_MASK_RM,
+ ISD::FP_TO_FP16, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vfmadd_pd_128, FMA_OP_MASK, X86ISD::FMADD, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vfmadd_pd_256, FMA_OP_MASK, X86ISD::FMADD, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vfmadd_pd_512, FMA_OP_MASK, X86ISD::FMADD,
+ X86ISD::FMADD_RND),
+ X86_INTRINSIC_DATA(avx512_mask_vfmadd_ps_128, FMA_OP_MASK, X86ISD::FMADD, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vfmadd_ps_256, FMA_OP_MASK, X86ISD::FMADD, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vfmadd_ps_512, FMA_OP_MASK, X86ISD::FMADD,
+ X86ISD::FMADD_RND),
+
+ X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_pd_128, FMA_OP_MASK, X86ISD::FMADDSUB, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_pd_256, FMA_OP_MASK, X86ISD::FMADDSUB, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_pd_512, FMA_OP_MASK, X86ISD::FMADDSUB,
+ X86ISD::FMADDSUB_RND),
+ X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_ps_128, FMA_OP_MASK, X86ISD::FMADDSUB, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_ps_256, FMA_OP_MASK, X86ISD::FMADDSUB, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_ps_512, FMA_OP_MASK, X86ISD::FMADDSUB,
+ X86ISD::FMADDSUB_RND),
+
+ X86_INTRINSIC_DATA(avx512_mask_vfnmadd_pd_128, FMA_OP_MASK, X86ISD::FNMADD, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vfnmadd_pd_256, FMA_OP_MASK, X86ISD::FNMADD, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vfnmadd_pd_512, FMA_OP_MASK, X86ISD::FNMADD,
+ X86ISD::FNMADD_RND),
+ X86_INTRINSIC_DATA(avx512_mask_vfnmadd_ps_128, FMA_OP_MASK, X86ISD::FNMADD, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vfnmadd_ps_256, FMA_OP_MASK, X86ISD::FNMADD, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vfnmadd_ps_512, FMA_OP_MASK, X86ISD::FNMADD,
+ X86ISD::FNMADD_RND),
+
+ X86_INTRINSIC_DATA(avx512_mask_vfnmsub_pd_128, FMA_OP_MASK, X86ISD::FNMSUB, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vfnmsub_pd_256, FMA_OP_MASK, X86ISD::FNMSUB, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vfnmsub_pd_512, FMA_OP_MASK, X86ISD::FNMSUB,
+ X86ISD::FNMSUB_RND),
+ X86_INTRINSIC_DATA(avx512_mask_vfnmsub_ps_128, FMA_OP_MASK, X86ISD::FNMSUB, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vfnmsub_ps_256, FMA_OP_MASK, X86ISD::FNMSUB, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vfnmsub_ps_512, FMA_OP_MASK, X86ISD::FNMSUB,
+ X86ISD::FNMSUB_RND),
+
+ X86_INTRINSIC_DATA(avx512_mask_vpermi2var_d_128, VPERM_3OP_MASK,
+ X86ISD::VPERMIV3, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermi2var_d_256, VPERM_3OP_MASK,
+ X86ISD::VPERMIV3, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermi2var_d_512, VPERM_3OP_MASK,
+ X86ISD::VPERMIV3, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermi2var_hi_128, VPERM_3OP_MASK,
+ X86ISD::VPERMIV3, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermi2var_hi_256, VPERM_3OP_MASK,
+ X86ISD::VPERMIV3, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermi2var_hi_512, VPERM_3OP_MASK,
+ X86ISD::VPERMIV3, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermi2var_pd_128, VPERM_3OP_MASK,
+ X86ISD::VPERMIV3, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermi2var_pd_256, VPERM_3OP_MASK,
+ X86ISD::VPERMIV3, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermi2var_pd_512, VPERM_3OP_MASK,
+ X86ISD::VPERMIV3, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermi2var_ps_128, VPERM_3OP_MASK,
+ X86ISD::VPERMIV3, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermi2var_ps_256, VPERM_3OP_MASK,
+ X86ISD::VPERMIV3, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermi2var_ps_512, VPERM_3OP_MASK,
+ X86ISD::VPERMIV3, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermi2var_q_128, VPERM_3OP_MASK,
+ X86ISD::VPERMIV3, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermi2var_q_256, VPERM_3OP_MASK,
+ X86ISD::VPERMIV3, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermi2var_q_512, VPERM_3OP_MASK,
+ X86ISD::VPERMIV3, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermil_pd_128, INTR_TYPE_2OP_IMM8_MASK,
+ X86ISD::VPERMILPI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermil_pd_256, INTR_TYPE_2OP_IMM8_MASK,
+ X86ISD::VPERMILPI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermil_pd_512, INTR_TYPE_2OP_IMM8_MASK,
+ X86ISD::VPERMILPI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermil_ps_128, INTR_TYPE_2OP_IMM8_MASK,
+ X86ISD::VPERMILPI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermil_ps_256, INTR_TYPE_2OP_IMM8_MASK,
+ X86ISD::VPERMILPI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermil_ps_512, INTR_TYPE_2OP_IMM8_MASK,
+ X86ISD::VPERMILPI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermilvar_pd_128, INTR_TYPE_2OP_MASK,
+ X86ISD::VPERMILPV, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermilvar_pd_256, INTR_TYPE_2OP_MASK,
+ X86ISD::VPERMILPV, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermilvar_pd_512, INTR_TYPE_2OP_MASK,
+ X86ISD::VPERMILPV, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermilvar_ps_128, INTR_TYPE_2OP_MASK,
+ X86ISD::VPERMILPV, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermilvar_ps_256, INTR_TYPE_2OP_MASK,
+ X86ISD::VPERMILPV, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermilvar_ps_512, INTR_TYPE_2OP_MASK,
+ X86ISD::VPERMILPV, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermt2var_d_128, VPERM_3OP_MASK,
+ X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermt2var_d_256, VPERM_3OP_MASK,
+ X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermt2var_d_512, VPERM_3OP_MASK,
+ X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermt2var_hi_128, VPERM_3OP_MASK,
+ X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermt2var_hi_256, VPERM_3OP_MASK,
+ X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermt2var_hi_512, VPERM_3OP_MASK,
+ X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermt2var_pd_128, VPERM_3OP_MASK,
+ X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermt2var_pd_256, VPERM_3OP_MASK,
+ X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermt2var_pd_512, VPERM_3OP_MASK,
+ X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermt2var_ps_128, VPERM_3OP_MASK,
+ X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermt2var_ps_256, VPERM_3OP_MASK,
+ X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermt2var_ps_512, VPERM_3OP_MASK,
+ X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermt2var_q_128, VPERM_3OP_MASK,
+ X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermt2var_q_256, VPERM_3OP_MASK,
+ X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermt2var_q_512, VPERM_3OP_MASK,
+ X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_mask_xor_pd_128, INTR_TYPE_2OP_MASK, X86ISD::FXOR, 0),
+ X86_INTRINSIC_DATA(avx512_mask_xor_pd_256, INTR_TYPE_2OP_MASK, X86ISD::FXOR, 0),
+ X86_INTRINSIC_DATA(avx512_mask_xor_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FXOR, 0),
+ X86_INTRINSIC_DATA(avx512_mask_xor_ps_128, INTR_TYPE_2OP_MASK, X86ISD::FXOR, 0),
+ X86_INTRINSIC_DATA(avx512_mask_xor_ps_256, INTR_TYPE_2OP_MASK, X86ISD::FXOR, 0),
+ X86_INTRINSIC_DATA(avx512_mask_xor_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FXOR, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_pternlog_d_128, TERLOG_OP_MASKZ,
+ X86ISD::VPTERNLOG, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_pternlog_d_256, TERLOG_OP_MASKZ,
+ X86ISD::VPTERNLOG, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_pternlog_d_512, TERLOG_OP_MASKZ,
+ X86ISD::VPTERNLOG, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_pternlog_q_128, TERLOG_OP_MASKZ,
+ X86ISD::VPTERNLOG, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_pternlog_q_256, TERLOG_OP_MASKZ,
+ X86ISD::VPTERNLOG, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_pternlog_q_512, TERLOG_OP_MASKZ,
+ X86ISD::VPTERNLOG, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vfmadd_pd_128, FMA_OP_MASKZ, X86ISD::FMADD, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vfmadd_pd_256, FMA_OP_MASKZ, X86ISD::FMADD, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vfmadd_pd_512, FMA_OP_MASKZ, X86ISD::FMADD,
+ X86ISD::FMADD_RND),
+ X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ps_128, FMA_OP_MASKZ, X86ISD::FMADD, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ps_256, FMA_OP_MASKZ, X86ISD::FMADD, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ps_512, FMA_OP_MASKZ, X86ISD::FMADD,
+ X86ISD::FMADD_RND),
+
+ X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_pd_128, FMA_OP_MASKZ, X86ISD::FMADDSUB, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_pd_256, FMA_OP_MASKZ, X86ISD::FMADDSUB, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_pd_512, FMA_OP_MASKZ, X86ISD::FMADDSUB,
+ X86ISD::FMADDSUB_RND),
+ X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_ps_128, FMA_OP_MASKZ, X86ISD::FMADDSUB, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_ps_256, FMA_OP_MASKZ, X86ISD::FMADDSUB, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_ps_512, FMA_OP_MASKZ, X86ISD::FMADDSUB,
+ X86ISD::FMADDSUB_RND),
+
+ X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_d_128, VPERM_3OP_MASKZ,
+ X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_d_256, VPERM_3OP_MASKZ,
+ X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_d_512, VPERM_3OP_MASKZ,
+ X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_hi_128, VPERM_3OP_MASKZ,
+ X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_hi_256, VPERM_3OP_MASKZ,
+ X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_hi_512, VPERM_3OP_MASKZ,
+ X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_pd_128, VPERM_3OP_MASKZ,
+ X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_pd_256, VPERM_3OP_MASKZ,
+ X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_pd_512, VPERM_3OP_MASKZ,
+ X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_ps_128, VPERM_3OP_MASKZ,
+ X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_ps_256, VPERM_3OP_MASKZ,
+ X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_ps_512, VPERM_3OP_MASKZ,
+ X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_q_128, VPERM_3OP_MASKZ,
+ X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_q_256, VPERM_3OP_MASKZ,
+ X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_q_512, VPERM_3OP_MASKZ,
+ X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_pbroadcastb_128, INTR_TYPE_1OP_MASK,
+ X86ISD::VBROADCAST, 0),
+ X86_INTRINSIC_DATA(avx512_pbroadcastb_256, INTR_TYPE_1OP_MASK,
+ X86ISD::VBROADCAST, 0),
+ X86_INTRINSIC_DATA(avx512_pbroadcastb_512, INTR_TYPE_1OP_MASK,
+ X86ISD::VBROADCAST, 0),
+ X86_INTRINSIC_DATA(avx512_pbroadcastd_128, INTR_TYPE_1OP_MASK,
+ X86ISD::VBROADCAST, 0),
+ X86_INTRINSIC_DATA(avx512_pbroadcastd_256, INTR_TYPE_1OP_MASK,
+ X86ISD::VBROADCAST, 0),
+ X86_INTRINSIC_DATA(avx512_pbroadcastd_512, INTR_TYPE_1OP_MASK,
+ X86ISD::VBROADCAST, 0),
+ X86_INTRINSIC_DATA(avx512_pbroadcastq_128, INTR_TYPE_1OP_MASK,
+ X86ISD::VBROADCAST, 0),
+ X86_INTRINSIC_DATA(avx512_pbroadcastq_256, INTR_TYPE_1OP_MASK,
+ X86ISD::VBROADCAST, 0),
+ X86_INTRINSIC_DATA(avx512_pbroadcastq_512, INTR_TYPE_1OP_MASK,
+ X86ISD::VBROADCAST, 0),
+ X86_INTRINSIC_DATA(avx512_pbroadcastw_128, INTR_TYPE_1OP_MASK,
+ X86ISD::VBROADCAST, 0),
+ X86_INTRINSIC_DATA(avx512_pbroadcastw_256, INTR_TYPE_1OP_MASK,
+ X86ISD::VBROADCAST, 0),
+ X86_INTRINSIC_DATA(avx512_pbroadcastw_512, INTR_TYPE_1OP_MASK,
+ X86ISD::VBROADCAST, 0),
+ X86_INTRINSIC_DATA(avx512_psad_bw_512, INTR_TYPE_2OP, X86ISD::PSADBW, 0),
+ X86_INTRINSIC_DATA(avx512_psll_dq_512, INTR_TYPE_2OP_IMM8, X86ISD::VSHLDQ, 0),
+ X86_INTRINSIC_DATA(avx512_psrl_dq_512, INTR_TYPE_2OP_IMM8, X86ISD::VSRLDQ, 0),
+ X86_INTRINSIC_DATA(avx512_rcp14_pd_128, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0),
+ X86_INTRINSIC_DATA(avx512_rcp14_pd_256, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0),
+ X86_INTRINSIC_DATA(avx512_rcp14_pd_512, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0),
+ X86_INTRINSIC_DATA(avx512_rcp14_ps_128, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0),
+ X86_INTRINSIC_DATA(avx512_rcp14_ps_256, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0),
+ X86_INTRINSIC_DATA(avx512_rcp14_ps_512, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0),
+ X86_INTRINSIC_DATA(avx512_rcp14_sd, INTR_TYPE_SCALAR_MASK, X86ISD::FRCP, 0),
+ X86_INTRINSIC_DATA(avx512_rcp14_ss, INTR_TYPE_SCALAR_MASK, X86ISD::FRCP, 0),
+ X86_INTRINSIC_DATA(avx512_rcp28_pd, INTR_TYPE_1OP_MASK_RM, X86ISD::RCP28, 0),
+ X86_INTRINSIC_DATA(avx512_rcp28_ps, INTR_TYPE_1OP_MASK_RM, X86ISD::RCP28, 0),
+ X86_INTRINSIC_DATA(avx512_rcp28_sd, INTR_TYPE_SCALAR_MASK_RM, X86ISD::RCP28, 0),
+ X86_INTRINSIC_DATA(avx512_rcp28_ss, INTR_TYPE_SCALAR_MASK_RM, X86ISD::RCP28, 0),
+ X86_INTRINSIC_DATA(avx512_rsqrt14_pd_128, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0),
+ X86_INTRINSIC_DATA(avx512_rsqrt14_pd_256, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0),
+ X86_INTRINSIC_DATA(avx512_rsqrt14_pd_512, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0),
+ X86_INTRINSIC_DATA(avx512_rsqrt14_ps_128, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0),
+ X86_INTRINSIC_DATA(avx512_rsqrt14_ps_256, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0),
+ X86_INTRINSIC_DATA(avx512_rsqrt14_ps_512, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0),
+ X86_INTRINSIC_DATA(avx512_rsqrt14_sd, INTR_TYPE_SCALAR_MASK, X86ISD::FRSQRT, 0),
+ X86_INTRINSIC_DATA(avx512_rsqrt14_ss, INTR_TYPE_SCALAR_MASK, X86ISD::FRSQRT, 0),
+ X86_INTRINSIC_DATA(avx512_rsqrt28_pd, INTR_TYPE_1OP_MASK_RM, X86ISD::RSQRT28, 0),
+ X86_INTRINSIC_DATA(avx512_rsqrt28_ps, INTR_TYPE_1OP_MASK_RM, X86ISD::RSQRT28, 0),
+ X86_INTRINSIC_DATA(avx512_rsqrt28_sd, INTR_TYPE_SCALAR_MASK_RM, X86ISD::RSQRT28, 0),
+ X86_INTRINSIC_DATA(avx512_rsqrt28_ss, INTR_TYPE_SCALAR_MASK_RM, X86ISD::RSQRT28, 0),
+ X86_INTRINSIC_DATA(avx512_vcomi_sd, COMI_RM, X86ISD::COMI, X86ISD::UCOMI),
+ X86_INTRINSIC_DATA(avx512_vcomi_ss, COMI_RM, X86ISD::COMI, X86ISD::UCOMI),
+ X86_INTRINSIC_DATA(avx_hadd_pd_256, INTR_TYPE_2OP, X86ISD::FHADD, 0),
+ X86_INTRINSIC_DATA(avx_hadd_ps_256, INTR_TYPE_2OP, X86ISD::FHADD, 0),
+ X86_INTRINSIC_DATA(avx_hsub_pd_256, INTR_TYPE_2OP, X86ISD::FHSUB, 0),
+ X86_INTRINSIC_DATA(avx_hsub_ps_256, INTR_TYPE_2OP, X86ISD::FHSUB, 0),
+ X86_INTRINSIC_DATA(avx_max_pd_256, INTR_TYPE_2OP, X86ISD::FMAX, 0),
+ X86_INTRINSIC_DATA(avx_max_ps_256, INTR_TYPE_2OP, X86ISD::FMAX, 0),
+ X86_INTRINSIC_DATA(avx_min_pd_256, INTR_TYPE_2OP, X86ISD::FMIN, 0),
+ X86_INTRINSIC_DATA(avx_min_ps_256, INTR_TYPE_2OP, X86ISD::FMIN, 0),
+ X86_INTRINSIC_DATA(avx_rcp_ps_256, INTR_TYPE_1OP, X86ISD::FRCP, 0),
+ X86_INTRINSIC_DATA(avx_rsqrt_ps_256, INTR_TYPE_1OP, X86ISD::FRSQRT, 0),
+ X86_INTRINSIC_DATA(avx_sqrt_pd_256, INTR_TYPE_1OP, ISD::FSQRT, 0),
+ X86_INTRINSIC_DATA(avx_sqrt_ps_256, INTR_TYPE_1OP, ISD::FSQRT, 0),
+ X86_INTRINSIC_DATA(avx_vperm2f128_pd_256, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0),
+ X86_INTRINSIC_DATA(avx_vperm2f128_ps_256, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0),
+ X86_INTRINSIC_DATA(avx_vperm2f128_si_256, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0),
+ X86_INTRINSIC_DATA(fma_vfmadd_pd, INTR_TYPE_3OP, X86ISD::FMADD, 0),
+ X86_INTRINSIC_DATA(fma_vfmadd_pd_256, INTR_TYPE_3OP, X86ISD::FMADD, 0),
+ X86_INTRINSIC_DATA(fma_vfmadd_ps, INTR_TYPE_3OP, X86ISD::FMADD, 0),
+ X86_INTRINSIC_DATA(fma_vfmadd_ps_256, INTR_TYPE_3OP, X86ISD::FMADD, 0),
+ X86_INTRINSIC_DATA(fma_vfmaddsub_pd, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0),
+ X86_INTRINSIC_DATA(fma_vfmaddsub_pd_256, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0),
+ X86_INTRINSIC_DATA(fma_vfmaddsub_ps, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0),
+ X86_INTRINSIC_DATA(fma_vfmaddsub_ps_256, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0),
+ X86_INTRINSIC_DATA(fma_vfmsub_pd, INTR_TYPE_3OP, X86ISD::FMSUB, 0),
+ X86_INTRINSIC_DATA(fma_vfmsub_pd_256, INTR_TYPE_3OP, X86ISD::FMSUB, 0),
+ X86_INTRINSIC_DATA(fma_vfmsub_ps, INTR_TYPE_3OP, X86ISD::FMSUB, 0),
+ X86_INTRINSIC_DATA(fma_vfmsub_ps_256, INTR_TYPE_3OP, X86ISD::FMSUB, 0),
+ X86_INTRINSIC_DATA(fma_vfmsubadd_pd, INTR_TYPE_3OP, X86ISD::FMSUBADD, 0),
+ X86_INTRINSIC_DATA(fma_vfmsubadd_pd_256, INTR_TYPE_3OP, X86ISD::FMSUBADD, 0),
+ X86_INTRINSIC_DATA(fma_vfmsubadd_ps, INTR_TYPE_3OP, X86ISD::FMSUBADD, 0),
+ X86_INTRINSIC_DATA(fma_vfmsubadd_ps_256, INTR_TYPE_3OP, X86ISD::FMSUBADD, 0),
+ X86_INTRINSIC_DATA(fma_vfnmadd_pd, INTR_TYPE_3OP, X86ISD::FNMADD, 0),
+ X86_INTRINSIC_DATA(fma_vfnmadd_pd_256, INTR_TYPE_3OP, X86ISD::FNMADD, 0),
+ X86_INTRINSIC_DATA(fma_vfnmadd_ps, INTR_TYPE_3OP, X86ISD::FNMADD, 0),
+ X86_INTRINSIC_DATA(fma_vfnmadd_ps_256, INTR_TYPE_3OP, X86ISD::FNMADD, 0),
+ X86_INTRINSIC_DATA(fma_vfnmsub_pd, INTR_TYPE_3OP, X86ISD::FNMSUB, 0),
+ X86_INTRINSIC_DATA(fma_vfnmsub_pd_256, INTR_TYPE_3OP, X86ISD::FNMSUB, 0),
+ X86_INTRINSIC_DATA(fma_vfnmsub_ps, INTR_TYPE_3OP, X86ISD::FNMSUB, 0),
+ X86_INTRINSIC_DATA(fma_vfnmsub_ps_256, INTR_TYPE_3OP, X86ISD::FNMSUB, 0),
+ X86_INTRINSIC_DATA(sse2_comieq_sd, COMI, X86ISD::COMI, ISD::SETEQ),
+ X86_INTRINSIC_DATA(sse2_comige_sd, COMI, X86ISD::COMI, ISD::SETGE),
+ X86_INTRINSIC_DATA(sse2_comigt_sd, COMI, X86ISD::COMI, ISD::SETGT),
+ X86_INTRINSIC_DATA(sse2_comile_sd, COMI, X86ISD::COMI, ISD::SETLE),
+ X86_INTRINSIC_DATA(sse2_comilt_sd, COMI, X86ISD::COMI, ISD::SETLT),
+ X86_INTRINSIC_DATA(sse2_comineq_sd, COMI, X86ISD::COMI, ISD::SETNE),
+ X86_INTRINSIC_DATA(sse2_max_pd, INTR_TYPE_2OP, X86ISD::FMAX, 0),
+ X86_INTRINSIC_DATA(sse2_min_pd, INTR_TYPE_2OP, X86ISD::FMIN, 0),
+ X86_INTRINSIC_DATA(sse2_packssdw_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
+ X86_INTRINSIC_DATA(sse2_packsswb_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
+ X86_INTRINSIC_DATA(sse2_packuswb_128, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
+ X86_INTRINSIC_DATA(sse2_pavg_b, INTR_TYPE_2OP, X86ISD::AVG, 0),
+ X86_INTRINSIC_DATA(sse2_pavg_w, INTR_TYPE_2OP, X86ISD::AVG, 0),
+ X86_INTRINSIC_DATA(sse2_pmaxs_w, INTR_TYPE_2OP, ISD::SMAX, 0),
+ X86_INTRINSIC_DATA(sse2_pmaxu_b, INTR_TYPE_2OP, ISD::UMAX, 0),
+ X86_INTRINSIC_DATA(sse2_pmins_w, INTR_TYPE_2OP, ISD::SMIN, 0),
+ X86_INTRINSIC_DATA(sse2_pminu_b, INTR_TYPE_2OP, ISD::UMIN, 0),
+ X86_INTRINSIC_DATA(sse2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0),
+ X86_INTRINSIC_DATA(sse2_pmulhu_w, INTR_TYPE_2OP, ISD::MULHU, 0),
+ X86_INTRINSIC_DATA(sse2_pmulu_dq, INTR_TYPE_2OP, X86ISD::PMULUDQ, 0),
+ X86_INTRINSIC_DATA(sse2_psad_bw, INTR_TYPE_2OP, X86ISD::PSADBW, 0),
+ X86_INTRINSIC_DATA(sse2_pshuf_d, INTR_TYPE_2OP, X86ISD::PSHUFD, 0),
+ X86_INTRINSIC_DATA(sse2_pshufh_w, INTR_TYPE_2OP, X86ISD::PSHUFHW, 0),
+ X86_INTRINSIC_DATA(sse2_pshufl_w, INTR_TYPE_2OP, X86ISD::PSHUFLW, 0),
+ X86_INTRINSIC_DATA(sse2_psll_d, INTR_TYPE_2OP, X86ISD::VSHL, 0),
+ X86_INTRINSIC_DATA(sse2_psll_q, INTR_TYPE_2OP, X86ISD::VSHL, 0),
+ X86_INTRINSIC_DATA(sse2_psll_w, INTR_TYPE_2OP, X86ISD::VSHL, 0),
+ X86_INTRINSIC_DATA(sse2_pslli_d, VSHIFT, X86ISD::VSHLI, 0),
+ X86_INTRINSIC_DATA(sse2_pslli_q, VSHIFT, X86ISD::VSHLI, 0),
+ X86_INTRINSIC_DATA(sse2_pslli_w, VSHIFT, X86ISD::VSHLI, 0),
+ X86_INTRINSIC_DATA(sse2_psra_d, INTR_TYPE_2OP, X86ISD::VSRA, 0),
+ X86_INTRINSIC_DATA(sse2_psra_w, INTR_TYPE_2OP, X86ISD::VSRA, 0),
+ X86_INTRINSIC_DATA(sse2_psrai_d, VSHIFT, X86ISD::VSRAI, 0),
+ X86_INTRINSIC_DATA(sse2_psrai_w, VSHIFT, X86ISD::VSRAI, 0),
+ X86_INTRINSIC_DATA(sse2_psrl_d, INTR_TYPE_2OP, X86ISD::VSRL, 0),
+ X86_INTRINSIC_DATA(sse2_psrl_q, INTR_TYPE_2OP, X86ISD::VSRL, 0),
+ X86_INTRINSIC_DATA(sse2_psrl_w, INTR_TYPE_2OP, X86ISD::VSRL, 0),
+ X86_INTRINSIC_DATA(sse2_psrli_d, VSHIFT, X86ISD::VSRLI, 0),
+ X86_INTRINSIC_DATA(sse2_psrli_q, VSHIFT, X86ISD::VSRLI, 0),
+ X86_INTRINSIC_DATA(sse2_psrli_w, VSHIFT, X86ISD::VSRLI, 0),
+ X86_INTRINSIC_DATA(sse2_psubus_b, INTR_TYPE_2OP, X86ISD::SUBUS, 0),
+ X86_INTRINSIC_DATA(sse2_psubus_w, INTR_TYPE_2OP, X86ISD::SUBUS, 0),
+ X86_INTRINSIC_DATA(sse2_sqrt_pd, INTR_TYPE_1OP, ISD::FSQRT, 0),
+ X86_INTRINSIC_DATA(sse2_ucomieq_sd, COMI, X86ISD::UCOMI, ISD::SETEQ),
+ X86_INTRINSIC_DATA(sse2_ucomige_sd, COMI, X86ISD::UCOMI, ISD::SETGE),
+ X86_INTRINSIC_DATA(sse2_ucomigt_sd, COMI, X86ISD::UCOMI, ISD::SETGT),
+ X86_INTRINSIC_DATA(sse2_ucomile_sd, COMI, X86ISD::UCOMI, ISD::SETLE),
+ X86_INTRINSIC_DATA(sse2_ucomilt_sd, COMI, X86ISD::UCOMI, ISD::SETLT),
+ X86_INTRINSIC_DATA(sse2_ucomineq_sd, COMI, X86ISD::UCOMI, ISD::SETNE),
+ X86_INTRINSIC_DATA(sse3_hadd_pd, INTR_TYPE_2OP, X86ISD::FHADD, 0),
+ X86_INTRINSIC_DATA(sse3_hadd_ps, INTR_TYPE_2OP, X86ISD::FHADD, 0),
+ X86_INTRINSIC_DATA(sse3_hsub_pd, INTR_TYPE_2OP, X86ISD::FHSUB, 0),
+ X86_INTRINSIC_DATA(sse3_hsub_ps, INTR_TYPE_2OP, X86ISD::FHSUB, 0),
+ X86_INTRINSIC_DATA(sse41_insertps, INTR_TYPE_3OP, X86ISD::INSERTPS, 0),
+ X86_INTRINSIC_DATA(sse41_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
+ X86_INTRINSIC_DATA(sse41_pmaxsb, INTR_TYPE_2OP, ISD::SMAX, 0),
+ X86_INTRINSIC_DATA(sse41_pmaxsd, INTR_TYPE_2OP, ISD::SMAX, 0),
+ X86_INTRINSIC_DATA(sse41_pmaxud, INTR_TYPE_2OP, ISD::UMAX, 0),
+ X86_INTRINSIC_DATA(sse41_pmaxuw, INTR_TYPE_2OP, ISD::UMAX, 0),
+ X86_INTRINSIC_DATA(sse41_pminsb, INTR_TYPE_2OP, ISD::SMIN, 0),
+ X86_INTRINSIC_DATA(sse41_pminsd, INTR_TYPE_2OP, ISD::SMIN, 0),
+ X86_INTRINSIC_DATA(sse41_pminud, INTR_TYPE_2OP, ISD::UMIN, 0),
+ X86_INTRINSIC_DATA(sse41_pminuw, INTR_TYPE_2OP, ISD::UMIN, 0),
+ X86_INTRINSIC_DATA(sse41_pmovzxbd, INTR_TYPE_1OP, X86ISD::VZEXT, 0),
+ X86_INTRINSIC_DATA(sse41_pmovzxbq, INTR_TYPE_1OP, X86ISD::VZEXT, 0),
+ X86_INTRINSIC_DATA(sse41_pmovzxbw, INTR_TYPE_1OP, X86ISD::VZEXT, 0),
+ X86_INTRINSIC_DATA(sse41_pmovzxdq, INTR_TYPE_1OP, X86ISD::VZEXT, 0),
+ X86_INTRINSIC_DATA(sse41_pmovzxwd, INTR_TYPE_1OP, X86ISD::VZEXT, 0),
+ X86_INTRINSIC_DATA(sse41_pmovzxwq, INTR_TYPE_1OP, X86ISD::VZEXT, 0),
+ X86_INTRINSIC_DATA(sse41_pmuldq, INTR_TYPE_2OP, X86ISD::PMULDQ, 0),
+ X86_INTRINSIC_DATA(sse4a_extrqi, INTR_TYPE_3OP, X86ISD::EXTRQI, 0),
+ X86_INTRINSIC_DATA(sse4a_insertqi, INTR_TYPE_4OP, X86ISD::INSERTQI, 0),
+ X86_INTRINSIC_DATA(sse_comieq_ss, COMI, X86ISD::COMI, ISD::SETEQ),
+ X86_INTRINSIC_DATA(sse_comige_ss, COMI, X86ISD::COMI, ISD::SETGE),
+ X86_INTRINSIC_DATA(sse_comigt_ss, COMI, X86ISD::COMI, ISD::SETGT),
+ X86_INTRINSIC_DATA(sse_comile_ss, COMI, X86ISD::COMI, ISD::SETLE),
+ X86_INTRINSIC_DATA(sse_comilt_ss, COMI, X86ISD::COMI, ISD::SETLT),
+ X86_INTRINSIC_DATA(sse_comineq_ss, COMI, X86ISD::COMI, ISD::SETNE),
+ X86_INTRINSIC_DATA(sse_max_ps, INTR_TYPE_2OP, X86ISD::FMAX, 0),
+ X86_INTRINSIC_DATA(sse_min_ps, INTR_TYPE_2OP, X86ISD::FMIN, 0),
+ X86_INTRINSIC_DATA(sse_rcp_ps, INTR_TYPE_1OP, X86ISD::FRCP, 0),
+ X86_INTRINSIC_DATA(sse_rsqrt_ps, INTR_TYPE_1OP, X86ISD::FRSQRT, 0),
+ X86_INTRINSIC_DATA(sse_sqrt_ps, INTR_TYPE_1OP, ISD::FSQRT, 0),
+ X86_INTRINSIC_DATA(sse_ucomieq_ss, COMI, X86ISD::UCOMI, ISD::SETEQ),
+ X86_INTRINSIC_DATA(sse_ucomige_ss, COMI, X86ISD::UCOMI, ISD::SETGE),
+ X86_INTRINSIC_DATA(sse_ucomigt_ss, COMI, X86ISD::UCOMI, ISD::SETGT),
+ X86_INTRINSIC_DATA(sse_ucomile_ss, COMI, X86ISD::UCOMI, ISD::SETLE),
+ X86_INTRINSIC_DATA(sse_ucomilt_ss, COMI, X86ISD::UCOMI, ISD::SETLT),
+ X86_INTRINSIC_DATA(sse_ucomineq_ss, COMI, X86ISD::UCOMI, ISD::SETNE),
+ X86_INTRINSIC_DATA(ssse3_phadd_d_128, INTR_TYPE_2OP, X86ISD::HADD, 0),
+ X86_INTRINSIC_DATA(ssse3_phadd_w_128, INTR_TYPE_2OP, X86ISD::HADD, 0),
+ X86_INTRINSIC_DATA(ssse3_phsub_d_128, INTR_TYPE_2OP, X86ISD::HSUB, 0),
+ X86_INTRINSIC_DATA(ssse3_phsub_w_128, INTR_TYPE_2OP, X86ISD::HSUB, 0),
+ X86_INTRINSIC_DATA(ssse3_pshuf_b_128, INTR_TYPE_2OP, X86ISD::PSHUFB, 0),
+ X86_INTRINSIC_DATA(ssse3_psign_b_128, INTR_TYPE_2OP, X86ISD::PSIGN, 0),
+ X86_INTRINSIC_DATA(ssse3_psign_d_128, INTR_TYPE_2OP, X86ISD::PSIGN, 0),
+ X86_INTRINSIC_DATA(ssse3_psign_w_128, INTR_TYPE_2OP, X86ISD::PSIGN, 0),
+ X86_INTRINSIC_DATA(xop_vpcomb, INTR_TYPE_3OP, X86ISD::VPCOM, 0),
+ X86_INTRINSIC_DATA(xop_vpcomd, INTR_TYPE_3OP, X86ISD::VPCOM, 0),
+ X86_INTRINSIC_DATA(xop_vpcomq, INTR_TYPE_3OP, X86ISD::VPCOM, 0),
+ X86_INTRINSIC_DATA(xop_vpcomub, INTR_TYPE_3OP, X86ISD::VPCOMU, 0),
+ X86_INTRINSIC_DATA(xop_vpcomud, INTR_TYPE_3OP, X86ISD::VPCOMU, 0),
+ X86_INTRINSIC_DATA(xop_vpcomuq, INTR_TYPE_3OP, X86ISD::VPCOMU, 0),
+ X86_INTRINSIC_DATA(xop_vpcomuw, INTR_TYPE_3OP, X86ISD::VPCOMU, 0),
+ X86_INTRINSIC_DATA(xop_vpcomw, INTR_TYPE_3OP, X86ISD::VPCOM, 0),
+ X86_INTRINSIC_DATA(xop_vprotb, INTR_TYPE_2OP, X86ISD::VPROT, 0),
+ X86_INTRINSIC_DATA(xop_vprotbi, INTR_TYPE_2OP, X86ISD::VPROTI, 0),
+ X86_INTRINSIC_DATA(xop_vprotd, INTR_TYPE_2OP, X86ISD::VPROT, 0),
+ X86_INTRINSIC_DATA(xop_vprotdi, INTR_TYPE_2OP, X86ISD::VPROTI, 0),
+ X86_INTRINSIC_DATA(xop_vprotq, INTR_TYPE_2OP, X86ISD::VPROT, 0),
+ X86_INTRINSIC_DATA(xop_vprotqi, INTR_TYPE_2OP, X86ISD::VPROTI, 0),
+ X86_INTRINSIC_DATA(xop_vprotw, INTR_TYPE_2OP, X86ISD::VPROT, 0),
+ X86_INTRINSIC_DATA(xop_vprotwi, INTR_TYPE_2OP, X86ISD::VPROTI, 0),
+ X86_INTRINSIC_DATA(xop_vpshab, INTR_TYPE_2OP, X86ISD::VPSHA, 0),
+ X86_INTRINSIC_DATA(xop_vpshad, INTR_TYPE_2OP, X86ISD::VPSHA, 0),
+ X86_INTRINSIC_DATA(xop_vpshaq, INTR_TYPE_2OP, X86ISD::VPSHA, 0),
+ X86_INTRINSIC_DATA(xop_vpshaw, INTR_TYPE_2OP, X86ISD::VPSHA, 0),
+ X86_INTRINSIC_DATA(xop_vpshlb, INTR_TYPE_2OP, X86ISD::VPSHL, 0),
+ X86_INTRINSIC_DATA(xop_vpshld, INTR_TYPE_2OP, X86ISD::VPSHL, 0),
+ X86_INTRINSIC_DATA(xop_vpshlq, INTR_TYPE_2OP, X86ISD::VPSHL, 0),
+ X86_INTRINSIC_DATA(xop_vpshlw, INTR_TYPE_2OP, X86ISD::VPSHL, 0)
+};
+
+/*
+ * Retrieve data for an intrinsic without chain.
+ * Returns nullptr if the intrinsic is not defined in the table.
+ */
+static const IntrinsicData* getIntrinsicWithoutChain(unsigned IntNo) {
+ IntrinsicData IntrinsicToFind = { IntNo, INTR_NO_TYPE, 0, 0 };
+ const IntrinsicData *Data = std::lower_bound(std::begin(IntrinsicsWithoutChain),
+ std::end(IntrinsicsWithoutChain),
+ IntrinsicToFind);
+ if (Data != std::end(IntrinsicsWithoutChain) && *Data == IntrinsicToFind)
+ return Data;
+ return nullptr;
+}
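+
+// A minimal sketch of how the table is meant to be consumed (the real
+// caller is LowerINTRINSIC_WO_CHAIN in X86ISelLowering.cpp; the snippet
+// below is illustrative only):
+//   if (const IntrinsicData *IntrData = getIntrinsicWithoutChain(IntNo))
+//     switch (IntrData->Type) {
+//     case INTR_TYPE_1OP:
+//       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
+//                          Op.getOperand(1));
+//     // ... remaining IntrinsicType cases ...
+//     }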
+
+static void verifyIntrinsicTables() {
+ assert(std::is_sorted(std::begin(IntrinsicsWithoutChain),
+ std::end(IntrinsicsWithoutChain)) &&
+ std::is_sorted(std::begin(IntrinsicsWithChain),
+ std::end(IntrinsicsWithChain)) &&
+ "Intrinsic data tables should be sorted by Intrinsic ID");
+ assert((std::adjacent_find(std::begin(IntrinsicsWithoutChain),
+ std::end(IntrinsicsWithoutChain)) ==
+ std::end(IntrinsicsWithoutChain)) &&
+ (std::adjacent_find(std::begin(IntrinsicsWithChain),
+ std::end(IntrinsicsWithChain)) ==
+ std::end(IntrinsicsWithChain)) &&
+ "Intrinsic data tables should have unique entries");
+}
+
+// X86-specific compare constants.
+// They must be kept in sync with avxintrin.h.
+#define _X86_CMP_EQ_OQ 0x00 /* Equal (ordered, non-signaling) */
+#define _X86_CMP_LT_OS 0x01 /* Less-than (ordered, signaling) */
+#define _X86_CMP_LE_OS 0x02 /* Less-than-or-equal (ordered, signaling) */
+#define _X86_CMP_UNORD_Q 0x03 /* Unordered (non-signaling) */
+#define _X86_CMP_NEQ_UQ 0x04 /* Not-equal (unordered, non-signaling) */
+#define _X86_CMP_NLT_US 0x05 /* Not-less-than (unordered, signaling) */
+#define _X86_CMP_NLE_US 0x06 /* Not-less-than-or-equal (unordered, signaling) */
+#define _X86_CMP_ORD_Q 0x07 /* Ordered (non-signaling) */
+#define _X86_CMP_EQ_UQ 0x08 /* Equal (unordered, non-signaling) */
+#define _X86_CMP_NGE_US 0x09 /* Not-greater-than-or-equal (unord, signaling) */
+#define _X86_CMP_NGT_US 0x0a /* Not-greater-than (unordered, signaling) */
+#define _X86_CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling) */
+#define _X86_CMP_NEQ_OQ 0x0c /* Not-equal (ordered, non-signaling) */
+#define _X86_CMP_GE_OS 0x0d /* Greater-than-or-equal (ordered, signaling) */
+#define _X86_CMP_GT_OS 0x0e /* Greater-than (ordered, signaling) */
+#define _X86_CMP_TRUE_UQ 0x0f /* True (unordered, non-signaling) */
+#define _X86_CMP_EQ_OS 0x10 /* Equal (ordered, signaling) */
+#define _X86_CMP_LT_OQ 0x11 /* Less-than (ordered, non-signaling) */
+#define _X86_CMP_LE_OQ 0x12 /* Less-than-or-equal (ordered, non-signaling) */
+#define _X86_CMP_UNORD_S 0x13 /* Unordered (signaling) */
+#define _X86_CMP_NEQ_US 0x14 /* Not-equal (unordered, signaling) */
+#define _X86_CMP_NLT_UQ 0x15 /* Not-less-than (unordered, non-signaling) */
+#define _X86_CMP_NLE_UQ 0x16 /* Not-less-than-or-equal (unord, non-signaling) */
+#define _X86_CMP_ORD_S 0x17 /* Ordered (signaling) */
+#define _X86_CMP_EQ_US 0x18 /* Equal (unordered, signaling) */
+#define _X86_CMP_NGE_UQ 0x19 /* Not-greater-than-or-equal (unord, non-sign) */
+#define _X86_CMP_NGT_UQ 0x1a /* Not-greater-than (unordered, non-signaling) */
+#define _X86_CMP_FALSE_OS 0x1b /* False (ordered, signaling) */
+#define _X86_CMP_NEQ_OS 0x1c /* Not-equal (ordered, signaling) */
+#define _X86_CMP_GE_OQ 0x1d /* Greater-than-or-equal (ordered, non-signaling) */
+#define _X86_CMP_GT_OQ 0x1e /* Greater-than (ordered, non-signaling) */
+#define _X86_CMP_TRUE_US 0x1f /* True (unordered, signaling) */
+
+/*
+ * Get the comparison modifier from an _mm_comi_round_sd/ss intrinsic.
+ * Returns the tuple <isOrdered, X86 condcode>.
+ */
+static std::tuple<bool, unsigned> TranslateX86ConstCondToX86CC(SDValue &imm) {
+  // The immediate operand must be a constant; cast<> asserts that for us.
+  ConstantSDNode *CImm = cast<ConstantSDNode>(imm);
+  unsigned IntImm = CImm->getZExtValue();
+ // On a floating point condition, the flags are set as follows:
+ // ZF PF CF op
+ // 0 | 0 | 0 | X > Y
+ // 0 | 0 | 1 | X < Y
+ // 1 | 0 | 0 | X == Y
+ // 1 | 1 | 1 | unordered
+ switch (IntImm) {
+ default: llvm_unreachable("Invalid floating point compare value for Comi!");
+ case _X86_CMP_EQ_OQ: // 0x00 - Equal (ordered, nonsignaling)
+ case _X86_CMP_EQ_OS: // 0x10 - Equal (ordered, signaling)
+ return std::make_tuple(true, X86::COND_E);
+ case _X86_CMP_EQ_UQ: // 0x08 - Equal (unordered, non-signaling)
+ case _X86_CMP_EQ_US: // 0x18 - Equal (unordered, signaling)
+    return std::make_tuple(false, X86::COND_E);
+ case _X86_CMP_LT_OS: // 0x01 - Less-than (ordered, signaling)
+ case _X86_CMP_LT_OQ: // 0x11 - Less-than (ordered, nonsignaling)
+ return std::make_tuple(true, X86::COND_B);
+ case _X86_CMP_NGE_US: // 0x09 - Not-greater-than-or-equal (unordered, signaling)
+ case _X86_CMP_NGE_UQ: // 0x19 - Not-greater-than-or-equal (unordered, nonsignaling)
+    return std::make_tuple(false, X86::COND_B);
+ case _X86_CMP_LE_OS: // 0x02 - Less-than-or-equal (ordered, signaling)
+ case _X86_CMP_LE_OQ: // 0x12 - Less-than-or-equal (ordered, nonsignaling)
+ return std::make_tuple(true, X86::COND_BE);
+ case _X86_CMP_NGT_US: // 0x0A - Not-greater-than (unordered, signaling)
+ case _X86_CMP_NGT_UQ: // 0x1A - Not-greater-than (unordered, nonsignaling)
+ return std::make_tuple(false, X86::COND_BE);
+ case _X86_CMP_GT_OS: // 0x0E - Greater-than (ordered, signaling)
+ case _X86_CMP_GT_OQ: // 0x1E - Greater-than (ordered, nonsignaling)
+ return std::make_tuple(true, X86::COND_A);
+ case _X86_CMP_NLE_US: // 0x06 - Not-less-than-or-equal (unordered,signaling)
+ case _X86_CMP_NLE_UQ: // 0x16 - Not-less-than-or-equal (unordered, nonsignaling)
+ return std::make_tuple(false, X86::COND_A);
+ case _X86_CMP_GE_OS: // 0x0D - Greater-than-or-equal (ordered, signaling)
+ case _X86_CMP_GE_OQ: // 0x1D - Greater-than-or-equal (ordered, nonsignaling)
+ return std::make_tuple(true, X86::COND_AE);
+ case _X86_CMP_NLT_US: // 0x05 - Not-less-than (unordered, signaling)
+ case _X86_CMP_NLT_UQ: // 0x15 - Not-less-than (unordered, nonsignaling)
+ return std::make_tuple(false, X86::COND_AE);
+ case _X86_CMP_NEQ_OQ: // 0x0C - Not-equal (ordered, non-signaling)
+ case _X86_CMP_NEQ_OS: // 0x1C - Not-equal (ordered, signaling)
+ return std::make_tuple(true, X86::COND_NE);
+ case _X86_CMP_NEQ_UQ: // 0x04 - Not-equal (unordered, nonsignaling)
+ case _X86_CMP_NEQ_US: // 0x14 - Not-equal (unordered, signaling)
+ return std::make_tuple(false, X86::COND_NE);
+ }
+}
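+
+// For example, imm == _X86_CMP_LT_OS (0x01) yields <true, X86::COND_B>,
+// while the unordered imm == _X86_CMP_NLT_US (0x05) yields
+// <false, X86::COND_AE>; the bool tells the caller whether to compare with
+// COMI (ordered) or UCOMI (unordered) before testing the condition code.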
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp b/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp
new file mode 100644
index 0000000..e1ca558
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp
@@ -0,0 +1,1459 @@
+//===-- X86MCInstLower.cpp - Convert X86 MachineInstr to an MCInst --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains code to lower X86 MachineInstrs to their corresponding
+// MCInst records.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86AsmPrinter.h"
+#include "X86RegisterInfo.h"
+#include "X86ShuffleDecodeConstantPool.h"
+#include "InstPrinter/X86ATTInstPrinter.h"
+#include "MCTargetDesc/X86BaseInfo.h"
+#include "Utils/X86ShuffleDecode.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/CodeGen/StackMaps.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstBuilder.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+namespace {
+
+/// X86MCInstLower - This class is used to lower a MachineInstr into an MCInst.
+class X86MCInstLower {
+ MCContext &Ctx;
+ const MachineFunction &MF;
+ const TargetMachine &TM;
+ const MCAsmInfo &MAI;
+ X86AsmPrinter &AsmPrinter;
+public:
+ X86MCInstLower(const MachineFunction &MF, X86AsmPrinter &asmprinter);
+
+ Optional<MCOperand> LowerMachineOperand(const MachineInstr *MI,
+ const MachineOperand &MO) const;
+ void Lower(const MachineInstr *MI, MCInst &OutMI) const;
+
+ MCSymbol *GetSymbolFromOperand(const MachineOperand &MO) const;
+ MCOperand LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const;
+
+private:
+ MachineModuleInfoMachO &getMachOMMI() const;
+ Mangler *getMang() const {
+ return AsmPrinter.Mang;
+ }
+};
+
+} // end anonymous namespace
+
+// Emit a minimal sequence of nops spanning NumBytes bytes.
+static void EmitNops(MCStreamer &OS, unsigned NumBytes, bool Is64Bit,
+ const MCSubtargetInfo &STI);
+
+namespace llvm {
+ X86AsmPrinter::StackMapShadowTracker::StackMapShadowTracker(TargetMachine &TM)
+ : TM(TM), InShadow(false), RequiredShadowSize(0), CurrentShadowSize(0) {}
+
+ X86AsmPrinter::StackMapShadowTracker::~StackMapShadowTracker() {}
+
+ void
+ X86AsmPrinter::StackMapShadowTracker::startFunction(MachineFunction &F) {
+ MF = &F;
+ CodeEmitter.reset(TM.getTarget().createMCCodeEmitter(
+ *MF->getSubtarget().getInstrInfo(),
+ *MF->getSubtarget().getRegisterInfo(), MF->getContext()));
+ }
+
+ void X86AsmPrinter::StackMapShadowTracker::count(MCInst &Inst,
+ const MCSubtargetInfo &STI) {
+ if (InShadow) {
+ SmallString<256> Code;
+ SmallVector<MCFixup, 4> Fixups;
+ raw_svector_ostream VecOS(Code);
+ CodeEmitter->encodeInstruction(Inst, VecOS, Fixups, STI);
+ CurrentShadowSize += Code.size();
+ if (CurrentShadowSize >= RequiredShadowSize)
+ InShadow = false; // The shadow is big enough. Stop counting.
+ }
+ }
+
+ void X86AsmPrinter::StackMapShadowTracker::emitShadowPadding(
+ MCStreamer &OutStreamer, const MCSubtargetInfo &STI) {
+ if (InShadow && CurrentShadowSize < RequiredShadowSize) {
+ InShadow = false;
+ EmitNops(OutStreamer, RequiredShadowSize - CurrentShadowSize,
+ MF->getSubtarget<X86Subtarget>().is64Bit(), STI);
+ }
+ }
+
+ void X86AsmPrinter::EmitAndCountInstruction(MCInst &Inst) {
+ OutStreamer->EmitInstruction(Inst, getSubtargetInfo());
+ SMShadowTracker.count(Inst, getSubtargetInfo());
+ }
+} // end llvm namespace
+
+X86MCInstLower::X86MCInstLower(const MachineFunction &mf,
+ X86AsmPrinter &asmprinter)
+ : Ctx(mf.getContext()), MF(mf), TM(mf.getTarget()), MAI(*TM.getMCAsmInfo()),
+ AsmPrinter(asmprinter) {}
+
+MachineModuleInfoMachO &X86MCInstLower::getMachOMMI() const {
+ return MF.getMMI().getObjFileInfo<MachineModuleInfoMachO>();
+}
+
+
+/// GetSymbolFromOperand - Lower an MO_GlobalAddress or MO_ExternalSymbol
+/// operand to an MCSymbol.
+MCSymbol *X86MCInstLower::
+GetSymbolFromOperand(const MachineOperand &MO) const {
+ const DataLayout &DL = MF.getDataLayout();
+ assert((MO.isGlobal() || MO.isSymbol() || MO.isMBB()) && "Isn't a symbol reference");
+
+ MCSymbol *Sym = nullptr;
+ SmallString<128> Name;
+ StringRef Suffix;
+
+ switch (MO.getTargetFlags()) {
+ case X86II::MO_DLLIMPORT:
+ // Handle dllimport linkage.
+ Name += "__imp_";
+ break;
+ case X86II::MO_DARWIN_STUB:
+ Suffix = "$stub";
+ break;
+ case X86II::MO_DARWIN_NONLAZY:
+ case X86II::MO_DARWIN_NONLAZY_PIC_BASE:
+ case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE:
+ Suffix = "$non_lazy_ptr";
+ break;
+ }
+
+ if (!Suffix.empty())
+ Name += DL.getPrivateGlobalPrefix();
+
+ unsigned PrefixLen = Name.size();
+
+ if (MO.isGlobal()) {
+ const GlobalValue *GV = MO.getGlobal();
+ AsmPrinter.getNameWithPrefix(Name, GV);
+ } else if (MO.isSymbol()) {
+ Mangler::getNameWithPrefix(Name, MO.getSymbolName(), DL);
+ } else if (MO.isMBB()) {
+ assert(Suffix.empty());
+ Sym = MO.getMBB()->getSymbol();
+ }
+ unsigned OrigLen = Name.size() - PrefixLen;
+
+ Name += Suffix;
+ if (!Sym)
+ Sym = Ctx.getOrCreateSymbol(Name);
+
+ StringRef OrigName = StringRef(Name).substr(PrefixLen, OrigLen);
+
+ // If the target flags on the operand changes the name of the symbol, do that
+ // before we return the symbol.
+ switch (MO.getTargetFlags()) {
+ default: break;
+ case X86II::MO_DARWIN_NONLAZY:
+ case X86II::MO_DARWIN_NONLAZY_PIC_BASE: {
+ MachineModuleInfoImpl::StubValueTy &StubSym =
+ getMachOMMI().getGVStubEntry(Sym);
+ if (!StubSym.getPointer()) {
+ assert(MO.isGlobal() && "Extern symbol not handled yet");
+ StubSym =
+ MachineModuleInfoImpl::
+ StubValueTy(AsmPrinter.getSymbol(MO.getGlobal()),
+ !MO.getGlobal()->hasInternalLinkage());
+ }
+ break;
+ }
+ case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE: {
+ MachineModuleInfoImpl::StubValueTy &StubSym =
+ getMachOMMI().getHiddenGVStubEntry(Sym);
+ if (!StubSym.getPointer()) {
+ assert(MO.isGlobal() && "Extern symbol not handled yet");
+ StubSym =
+ MachineModuleInfoImpl::
+ StubValueTy(AsmPrinter.getSymbol(MO.getGlobal()),
+ !MO.getGlobal()->hasInternalLinkage());
+ }
+ break;
+ }
+ case X86II::MO_DARWIN_STUB: {
+ MachineModuleInfoImpl::StubValueTy &StubSym =
+ getMachOMMI().getFnStubEntry(Sym);
+ if (StubSym.getPointer())
+ return Sym;
+
+ if (MO.isGlobal()) {
+ StubSym =
+ MachineModuleInfoImpl::
+ StubValueTy(AsmPrinter.getSymbol(MO.getGlobal()),
+ !MO.getGlobal()->hasInternalLinkage());
+ } else {
+ StubSym =
+ MachineModuleInfoImpl::
+ StubValueTy(Ctx.getOrCreateSymbol(OrigName), false);
+ }
+ break;
+ }
+ }
+
+ return Sym;
+}
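+
+// For example, on Darwin a global @foo referenced with MO_DARWIN_NONLAZY
+// lowers to the symbol "L_foo$non_lazy_ptr" (private-label prefix, mangled
+// name, suffix) and records a non-lazy-pointer stub for it, while an
+// MO_DLLIMPORT reference lowers to "__imp_" followed by the mangled name.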
+
+MCOperand X86MCInstLower::LowerSymbolOperand(const MachineOperand &MO,
+ MCSymbol *Sym) const {
+ // FIXME: We would like an efficient form for this, so we don't have to do a
+ // lot of extra uniquing.
+ const MCExpr *Expr = nullptr;
+ MCSymbolRefExpr::VariantKind RefKind = MCSymbolRefExpr::VK_None;
+
+ switch (MO.getTargetFlags()) {
+ default: llvm_unreachable("Unknown target flag on GV operand");
+ case X86II::MO_NO_FLAG: // No flag.
+ // These affect the name of the symbol, not any suffix.
+ case X86II::MO_DARWIN_NONLAZY:
+ case X86II::MO_DLLIMPORT:
+ case X86II::MO_DARWIN_STUB:
+ break;
+
+ case X86II::MO_TLVP: RefKind = MCSymbolRefExpr::VK_TLVP; break;
+ case X86II::MO_TLVP_PIC_BASE:
+ Expr = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_TLVP, Ctx);
+ // Subtract the pic base.
+ Expr = MCBinaryExpr::createSub(Expr,
+ MCSymbolRefExpr::create(MF.getPICBaseSymbol(),
+ Ctx),
+ Ctx);
+ break;
+ case X86II::MO_SECREL: RefKind = MCSymbolRefExpr::VK_SECREL; break;
+ case X86II::MO_TLSGD: RefKind = MCSymbolRefExpr::VK_TLSGD; break;
+ case X86II::MO_TLSLD: RefKind = MCSymbolRefExpr::VK_TLSLD; break;
+ case X86II::MO_TLSLDM: RefKind = MCSymbolRefExpr::VK_TLSLDM; break;
+ case X86II::MO_GOTTPOFF: RefKind = MCSymbolRefExpr::VK_GOTTPOFF; break;
+ case X86II::MO_INDNTPOFF: RefKind = MCSymbolRefExpr::VK_INDNTPOFF; break;
+ case X86II::MO_TPOFF: RefKind = MCSymbolRefExpr::VK_TPOFF; break;
+ case X86II::MO_DTPOFF: RefKind = MCSymbolRefExpr::VK_DTPOFF; break;
+ case X86II::MO_NTPOFF: RefKind = MCSymbolRefExpr::VK_NTPOFF; break;
+ case X86II::MO_GOTNTPOFF: RefKind = MCSymbolRefExpr::VK_GOTNTPOFF; break;
+ case X86II::MO_GOTPCREL: RefKind = MCSymbolRefExpr::VK_GOTPCREL; break;
+ case X86II::MO_GOT: RefKind = MCSymbolRefExpr::VK_GOT; break;
+ case X86II::MO_GOTOFF: RefKind = MCSymbolRefExpr::VK_GOTOFF; break;
+ case X86II::MO_PLT: RefKind = MCSymbolRefExpr::VK_PLT; break;
+ case X86II::MO_PIC_BASE_OFFSET:
+ case X86II::MO_DARWIN_NONLAZY_PIC_BASE:
+ case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE:
+ Expr = MCSymbolRefExpr::create(Sym, Ctx);
+ // Subtract the pic base.
+ Expr = MCBinaryExpr::createSub(Expr,
+ MCSymbolRefExpr::create(MF.getPICBaseSymbol(), Ctx),
+ Ctx);
+ if (MO.isJTI()) {
+ assert(MAI.doesSetDirectiveSuppressesReloc());
+ // If .set directive is supported, use it to reduce the number of
+ // relocations the assembler will generate for differences between
+ // local labels. This is only safe when the symbols are in the same
+ // section so we are restricting it to jumptable references.
+ MCSymbol *Label = Ctx.createTempSymbol();
+ AsmPrinter.OutStreamer->EmitAssignment(Label, Expr);
+ Expr = MCSymbolRefExpr::create(Label, Ctx);
+ }
+ break;
+ }
+
+ if (!Expr)
+ Expr = MCSymbolRefExpr::create(Sym, RefKind, Ctx);
+
+ if (!MO.isJTI() && !MO.isMBB() && MO.getOffset())
+ Expr = MCBinaryExpr::createAdd(Expr,
+ MCConstantExpr::create(MO.getOffset(), Ctx),
+ Ctx);
+ return MCOperand::createExpr(Expr);
+}
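+
+// For example, an MO_GOTPCREL reference to foo lowers to the expression
+// "foo@GOTPCREL", and an MO_PIC_BASE_OFFSET reference lowers to
+// "foo - <picbase>", where <picbase> is the function's PIC base label;
+// jump-table references are additionally funneled through a .set'd temp
+// label so the assembler needs no relocation for the difference.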
+
+
+/// \brief Simplify FOO $imm, %{al,ax,eax,rax} to FOO $imm, for instructions
+/// with a short fixed-register form.
+static void SimplifyShortImmForm(MCInst &Inst, unsigned Opcode) {
+ unsigned ImmOp = Inst.getNumOperands() - 1;
+ assert(Inst.getOperand(0).isReg() &&
+ (Inst.getOperand(ImmOp).isImm() || Inst.getOperand(ImmOp).isExpr()) &&
+ ((Inst.getNumOperands() == 3 && Inst.getOperand(1).isReg() &&
+ Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg()) ||
+ Inst.getNumOperands() == 2) && "Unexpected instruction!");
+
+ // Check whether the destination register can be fixed.
+ unsigned Reg = Inst.getOperand(0).getReg();
+ if (Reg != X86::AL && Reg != X86::AX && Reg != X86::EAX && Reg != X86::RAX)
+ return;
+
+ // If so, rewrite the instruction.
+ MCOperand Saved = Inst.getOperand(ImmOp);
+ Inst = MCInst();
+ Inst.setOpcode(Opcode);
+ Inst.addOperand(Saved);
+}
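+
+// For example, the caller rewrites "andl $15, %eax" (AND32ri) into
+// AND32i32, the one-operand accumulator form (opcode 0x25 + imm32), which
+// drops the ModRM byte of the generic register/immediate encoding.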
+
+/// \brief If a movsx instruction has a shorter encoding for the registers
+/// used, simplify the instruction to use it instead.
+static void SimplifyMOVSX(MCInst &Inst) {
+ unsigned NewOpcode = 0;
+ unsigned Op0 = Inst.getOperand(0).getReg(), Op1 = Inst.getOperand(1).getReg();
+ switch (Inst.getOpcode()) {
+ default:
+ llvm_unreachable("Unexpected instruction!");
+ case X86::MOVSX16rr8: // movsbw %al, %ax --> cbtw
+ if (Op0 == X86::AX && Op1 == X86::AL)
+ NewOpcode = X86::CBW;
+ break;
+ case X86::MOVSX32rr16: // movswl %ax, %eax --> cwtl
+ if (Op0 == X86::EAX && Op1 == X86::AX)
+ NewOpcode = X86::CWDE;
+ break;
+ case X86::MOVSX64rr32: // movslq %eax, %rax --> cltq
+ if (Op0 == X86::RAX && Op1 == X86::EAX)
+ NewOpcode = X86::CDQE;
+ break;
+ }
+
+ if (NewOpcode != 0) {
+ Inst = MCInst();
+ Inst.setOpcode(NewOpcode);
+ }
+}
+
+/// \brief Simplify things like MOV32rm to MOV32o32a.
+static void SimplifyShortMoveForm(X86AsmPrinter &Printer, MCInst &Inst,
+ unsigned Opcode) {
+ // Don't make these simplifications in 64-bit mode; other assemblers don't
+ // perform them because they make the code larger.
+ if (Printer.getSubtarget().is64Bit())
+ return;
+
+ bool IsStore = Inst.getOperand(0).isReg() && Inst.getOperand(1).isReg();
+ unsigned AddrBase = IsStore;
+ unsigned RegOp = IsStore ? 0 : 5;
+ unsigned AddrOp = AddrBase + 3;
+ assert(Inst.getNumOperands() == 6 && Inst.getOperand(RegOp).isReg() &&
+ Inst.getOperand(AddrBase + X86::AddrBaseReg).isReg() &&
+ Inst.getOperand(AddrBase + X86::AddrScaleAmt).isImm() &&
+ Inst.getOperand(AddrBase + X86::AddrIndexReg).isReg() &&
+ Inst.getOperand(AddrBase + X86::AddrSegmentReg).isReg() &&
+ (Inst.getOperand(AddrOp).isExpr() ||
+ Inst.getOperand(AddrOp).isImm()) &&
+ "Unexpected instruction!");
+
+ // Check whether the destination register can be fixed.
+ unsigned Reg = Inst.getOperand(RegOp).getReg();
+ if (Reg != X86::AL && Reg != X86::AX && Reg != X86::EAX && Reg != X86::RAX)
+ return;
+
+ // Check whether this is an absolute address.
+ // FIXME: We know TLVP symbol refs aren't, but there should be a better way
+ // to do this here.
+ bool Absolute = true;
+ if (Inst.getOperand(AddrOp).isExpr()) {
+ const MCExpr *MCE = Inst.getOperand(AddrOp).getExpr();
+ if (const MCSymbolRefExpr *SRE = dyn_cast<MCSymbolRefExpr>(MCE))
+ if (SRE->getKind() == MCSymbolRefExpr::VK_TLVP)
+ Absolute = false;
+ }
+
+ if (Absolute &&
+ (Inst.getOperand(AddrBase + X86::AddrBaseReg).getReg() != 0 ||
+ Inst.getOperand(AddrBase + X86::AddrScaleAmt).getImm() != 1 ||
+ Inst.getOperand(AddrBase + X86::AddrIndexReg).getReg() != 0))
+ return;
+
+ // If so, rewrite the instruction.
+ MCOperand Saved = Inst.getOperand(AddrOp);
+ MCOperand Seg = Inst.getOperand(AddrBase + X86::AddrSegmentReg);
+ Inst = MCInst();
+ Inst.setOpcode(Opcode);
+ Inst.addOperand(Saved);
+ Inst.addOperand(Seg);
+}
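+
+// For example, in 32-bit mode a load "movl _gvar, %eax" (MOV32rm with no
+// base, index, or scale) is rewritten into MOV32ao32, the moffs form
+// (opcode 0xA1) that encodes the absolute address inline with no
+// ModRM/SIB bytes.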
+
+static unsigned getRetOpcode(const X86Subtarget &Subtarget) {
+ return Subtarget.is64Bit() ? X86::RETQ : X86::RETL;
+}
+
+Optional<MCOperand>
+X86MCInstLower::LowerMachineOperand(const MachineInstr *MI,
+ const MachineOperand &MO) const {
+ switch (MO.getType()) {
+ default:
+ MI->dump();
+ llvm_unreachable("unknown operand type");
+ case MachineOperand::MO_Register:
+ // Ignore all implicit register operands.
+ if (MO.isImplicit())
+ return None;
+ return MCOperand::createReg(MO.getReg());
+ case MachineOperand::MO_Immediate:
+ return MCOperand::createImm(MO.getImm());
+ case MachineOperand::MO_MachineBasicBlock:
+ case MachineOperand::MO_GlobalAddress:
+ case MachineOperand::MO_ExternalSymbol:
+ return LowerSymbolOperand(MO, GetSymbolFromOperand(MO));
+ case MachineOperand::MO_MCSymbol:
+ return LowerSymbolOperand(MO, MO.getMCSymbol());
+ case MachineOperand::MO_JumpTableIndex:
+ return LowerSymbolOperand(MO, AsmPrinter.GetJTISymbol(MO.getIndex()));
+ case MachineOperand::MO_ConstantPoolIndex:
+ return LowerSymbolOperand(MO, AsmPrinter.GetCPISymbol(MO.getIndex()));
+ case MachineOperand::MO_BlockAddress:
+ return LowerSymbolOperand(
+ MO, AsmPrinter.GetBlockAddressSymbol(MO.getBlockAddress()));
+ case MachineOperand::MO_RegisterMask:
+ // Ignore call clobbers.
+ return None;
+ }
+}
+
+void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
+ OutMI.setOpcode(MI->getOpcode());
+
+ for (const MachineOperand &MO : MI->operands())
+ if (auto MaybeMCOp = LowerMachineOperand(MI, MO))
+ OutMI.addOperand(MaybeMCOp.getValue());
+
+ // Handle a few special cases to eliminate operand modifiers.
+ReSimplify:
+ switch (OutMI.getOpcode()) {
+ case X86::LEA64_32r:
+ case X86::LEA64r:
+ case X86::LEA16r:
+ case X86::LEA32r:
+ // LEA should have a segment register, but it must be empty.
+ assert(OutMI.getNumOperands() == 1+X86::AddrNumOperands &&
+ "Unexpected # of LEA operands");
+ assert(OutMI.getOperand(1+X86::AddrSegmentReg).getReg() == 0 &&
+ "LEA has segment specified!");
+ break;
+
+  // Commute operands to get a smaller encoding by using VEX.R instead of VEX.B
+  // if one of the registers is extended, but the other isn't.
+ case X86::VMOVZPQILo2PQIrr:
+ case X86::VMOVAPDrr:
+ case X86::VMOVAPDYrr:
+ case X86::VMOVAPSrr:
+ case X86::VMOVAPSYrr:
+ case X86::VMOVDQArr:
+ case X86::VMOVDQAYrr:
+ case X86::VMOVDQUrr:
+ case X86::VMOVDQUYrr:
+ case X86::VMOVUPDrr:
+ case X86::VMOVUPDYrr:
+ case X86::VMOVUPSrr:
+ case X86::VMOVUPSYrr: {
+ if (!X86II::isX86_64ExtendedReg(OutMI.getOperand(0).getReg()) &&
+ X86II::isX86_64ExtendedReg(OutMI.getOperand(1).getReg())) {
+ unsigned NewOpc;
+ switch (OutMI.getOpcode()) {
+ default: llvm_unreachable("Invalid opcode");
+ case X86::VMOVZPQILo2PQIrr: NewOpc = X86::VMOVPQI2QIrr; break;
+ case X86::VMOVAPDrr: NewOpc = X86::VMOVAPDrr_REV; break;
+ case X86::VMOVAPDYrr: NewOpc = X86::VMOVAPDYrr_REV; break;
+ case X86::VMOVAPSrr: NewOpc = X86::VMOVAPSrr_REV; break;
+ case X86::VMOVAPSYrr: NewOpc = X86::VMOVAPSYrr_REV; break;
+ case X86::VMOVDQArr: NewOpc = X86::VMOVDQArr_REV; break;
+ case X86::VMOVDQAYrr: NewOpc = X86::VMOVDQAYrr_REV; break;
+ case X86::VMOVDQUrr: NewOpc = X86::VMOVDQUrr_REV; break;
+ case X86::VMOVDQUYrr: NewOpc = X86::VMOVDQUYrr_REV; break;
+ case X86::VMOVUPDrr: NewOpc = X86::VMOVUPDrr_REV; break;
+ case X86::VMOVUPDYrr: NewOpc = X86::VMOVUPDYrr_REV; break;
+ case X86::VMOVUPSrr: NewOpc = X86::VMOVUPSrr_REV; break;
+ case X86::VMOVUPSYrr: NewOpc = X86::VMOVUPSYrr_REV; break;
+ }
+ OutMI.setOpcode(NewOpc);
+ }
+ break;
+ }
+ case X86::VMOVSDrr:
+ case X86::VMOVSSrr: {
+ if (!X86II::isX86_64ExtendedReg(OutMI.getOperand(0).getReg()) &&
+ X86II::isX86_64ExtendedReg(OutMI.getOperand(2).getReg())) {
+ unsigned NewOpc;
+ switch (OutMI.getOpcode()) {
+ default: llvm_unreachable("Invalid opcode");
+ case X86::VMOVSDrr: NewOpc = X86::VMOVSDrr_REV; break;
+ case X86::VMOVSSrr: NewOpc = X86::VMOVSSrr_REV; break;
+ }
+ OutMI.setOpcode(NewOpc);
+ }
+ break;
+ }
+
+  // TAILJMPr64, TAILJMPr64_REX, CALL64r, CALL64pcrel32 - These instructions
+  // have register inputs modeled as normal uses instead of implicit uses.
+  // As such, truncate off all but the first operand (the callee).
+  // FIXME: Change isel.
+ case X86::TAILJMPr64:
+ case X86::TAILJMPr64_REX:
+ case X86::CALL64r:
+ case X86::CALL64pcrel32: {
+ unsigned Opcode = OutMI.getOpcode();
+ MCOperand Saved = OutMI.getOperand(0);
+ OutMI = MCInst();
+ OutMI.setOpcode(Opcode);
+ OutMI.addOperand(Saved);
+ break;
+ }
+
+ case X86::EH_RETURN:
+ case X86::EH_RETURN64: {
+ OutMI = MCInst();
+ OutMI.setOpcode(getRetOpcode(AsmPrinter.getSubtarget()));
+ break;
+ }
+
+ case X86::CLEANUPRET: {
+    // Replace CLEANUPRET with the appropriate RET.
+ OutMI = MCInst();
+ OutMI.setOpcode(getRetOpcode(AsmPrinter.getSubtarget()));
+ break;
+ }
+
+ case X86::CATCHRET: {
+ // Replace CATCHRET with the appropriate RET.
+ const X86Subtarget &Subtarget = AsmPrinter.getSubtarget();
+ unsigned ReturnReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
+ OutMI = MCInst();
+ OutMI.setOpcode(getRetOpcode(Subtarget));
+ OutMI.addOperand(MCOperand::createReg(ReturnReg));
+ break;
+ }
+
+  // TAILJMPr, TAILJMPd, TAILJMPd64 - Lower to the correct jump instruction.
+ case X86::TAILJMPr:
+ case X86::TAILJMPd:
+ case X86::TAILJMPd64: {
+ unsigned Opcode;
+ switch (OutMI.getOpcode()) {
+ default: llvm_unreachable("Invalid opcode");
+ case X86::TAILJMPr: Opcode = X86::JMP32r; break;
+ case X86::TAILJMPd:
+ case X86::TAILJMPd64: Opcode = X86::JMP_1; break;
+ }
+
+ MCOperand Saved = OutMI.getOperand(0);
+ OutMI = MCInst();
+ OutMI.setOpcode(Opcode);
+ OutMI.addOperand(Saved);
+ break;
+ }
+
+ case X86::DEC16r:
+ case X86::DEC32r:
+ case X86::INC16r:
+ case X86::INC32r:
+ // If we aren't in 64-bit mode we can use the 1-byte inc/dec instructions.
+ if (!AsmPrinter.getSubtarget().is64Bit()) {
+ unsigned Opcode;
+ switch (OutMI.getOpcode()) {
+ default: llvm_unreachable("Invalid opcode");
+ case X86::DEC16r: Opcode = X86::DEC16r_alt; break;
+ case X86::DEC32r: Opcode = X86::DEC32r_alt; break;
+ case X86::INC16r: Opcode = X86::INC16r_alt; break;
+ case X86::INC32r: Opcode = X86::INC32r_alt; break;
+ }
+ OutMI.setOpcode(Opcode);
+ }
+ break;
+
+ // These are pseudo-ops for OR to help with the OR->ADD transformation. We do
+ // this with an ugly goto in case the resultant OR uses EAX and needs the
+ // short form.
+ case X86::ADD16rr_DB: OutMI.setOpcode(X86::OR16rr); goto ReSimplify;
+ case X86::ADD32rr_DB: OutMI.setOpcode(X86::OR32rr); goto ReSimplify;
+ case X86::ADD64rr_DB: OutMI.setOpcode(X86::OR64rr); goto ReSimplify;
+ case X86::ADD16ri_DB: OutMI.setOpcode(X86::OR16ri); goto ReSimplify;
+ case X86::ADD32ri_DB: OutMI.setOpcode(X86::OR32ri); goto ReSimplify;
+ case X86::ADD64ri32_DB: OutMI.setOpcode(X86::OR64ri32); goto ReSimplify;
+ case X86::ADD16ri8_DB: OutMI.setOpcode(X86::OR16ri8); goto ReSimplify;
+ case X86::ADD32ri8_DB: OutMI.setOpcode(X86::OR32ri8); goto ReSimplify;
+ case X86::ADD64ri8_DB: OutMI.setOpcode(X86::OR64ri8); goto ReSimplify;
+
+  // Atomic load and store require a separate pseudo-inst because Acquire
+  // implies mayStore and Release implies mayLoad; rewrite these to regular
+  // MOV instructions here.
+ case X86::ACQUIRE_MOV8rm: OutMI.setOpcode(X86::MOV8rm); goto ReSimplify;
+ case X86::ACQUIRE_MOV16rm: OutMI.setOpcode(X86::MOV16rm); goto ReSimplify;
+ case X86::ACQUIRE_MOV32rm: OutMI.setOpcode(X86::MOV32rm); goto ReSimplify;
+ case X86::ACQUIRE_MOV64rm: OutMI.setOpcode(X86::MOV64rm); goto ReSimplify;
+ case X86::RELEASE_MOV8mr: OutMI.setOpcode(X86::MOV8mr); goto ReSimplify;
+ case X86::RELEASE_MOV16mr: OutMI.setOpcode(X86::MOV16mr); goto ReSimplify;
+ case X86::RELEASE_MOV32mr: OutMI.setOpcode(X86::MOV32mr); goto ReSimplify;
+ case X86::RELEASE_MOV64mr: OutMI.setOpcode(X86::MOV64mr); goto ReSimplify;
+ case X86::RELEASE_MOV8mi: OutMI.setOpcode(X86::MOV8mi); goto ReSimplify;
+ case X86::RELEASE_MOV16mi: OutMI.setOpcode(X86::MOV16mi); goto ReSimplify;
+ case X86::RELEASE_MOV32mi: OutMI.setOpcode(X86::MOV32mi); goto ReSimplify;
+ case X86::RELEASE_MOV64mi32: OutMI.setOpcode(X86::MOV64mi32); goto ReSimplify;
+ case X86::RELEASE_ADD8mi: OutMI.setOpcode(X86::ADD8mi); goto ReSimplify;
+ case X86::RELEASE_ADD8mr: OutMI.setOpcode(X86::ADD8mr); goto ReSimplify;
+ case X86::RELEASE_ADD32mi: OutMI.setOpcode(X86::ADD32mi); goto ReSimplify;
+ case X86::RELEASE_ADD32mr: OutMI.setOpcode(X86::ADD32mr); goto ReSimplify;
+ case X86::RELEASE_ADD64mi32: OutMI.setOpcode(X86::ADD64mi32); goto ReSimplify;
+ case X86::RELEASE_ADD64mr: OutMI.setOpcode(X86::ADD64mr); goto ReSimplify;
+ case X86::RELEASE_AND8mi: OutMI.setOpcode(X86::AND8mi); goto ReSimplify;
+ case X86::RELEASE_AND8mr: OutMI.setOpcode(X86::AND8mr); goto ReSimplify;
+ case X86::RELEASE_AND32mi: OutMI.setOpcode(X86::AND32mi); goto ReSimplify;
+ case X86::RELEASE_AND32mr: OutMI.setOpcode(X86::AND32mr); goto ReSimplify;
+ case X86::RELEASE_AND64mi32: OutMI.setOpcode(X86::AND64mi32); goto ReSimplify;
+ case X86::RELEASE_AND64mr: OutMI.setOpcode(X86::AND64mr); goto ReSimplify;
+ case X86::RELEASE_OR8mi: OutMI.setOpcode(X86::OR8mi); goto ReSimplify;
+ case X86::RELEASE_OR8mr: OutMI.setOpcode(X86::OR8mr); goto ReSimplify;
+ case X86::RELEASE_OR32mi: OutMI.setOpcode(X86::OR32mi); goto ReSimplify;
+ case X86::RELEASE_OR32mr: OutMI.setOpcode(X86::OR32mr); goto ReSimplify;
+ case X86::RELEASE_OR64mi32: OutMI.setOpcode(X86::OR64mi32); goto ReSimplify;
+ case X86::RELEASE_OR64mr: OutMI.setOpcode(X86::OR64mr); goto ReSimplify;
+ case X86::RELEASE_XOR8mi: OutMI.setOpcode(X86::XOR8mi); goto ReSimplify;
+ case X86::RELEASE_XOR8mr: OutMI.setOpcode(X86::XOR8mr); goto ReSimplify;
+ case X86::RELEASE_XOR32mi: OutMI.setOpcode(X86::XOR32mi); goto ReSimplify;
+ case X86::RELEASE_XOR32mr: OutMI.setOpcode(X86::XOR32mr); goto ReSimplify;
+ case X86::RELEASE_XOR64mi32: OutMI.setOpcode(X86::XOR64mi32); goto ReSimplify;
+ case X86::RELEASE_XOR64mr: OutMI.setOpcode(X86::XOR64mr); goto ReSimplify;
+ case X86::RELEASE_INC8m: OutMI.setOpcode(X86::INC8m); goto ReSimplify;
+ case X86::RELEASE_INC16m: OutMI.setOpcode(X86::INC16m); goto ReSimplify;
+ case X86::RELEASE_INC32m: OutMI.setOpcode(X86::INC32m); goto ReSimplify;
+ case X86::RELEASE_INC64m: OutMI.setOpcode(X86::INC64m); goto ReSimplify;
+ case X86::RELEASE_DEC8m: OutMI.setOpcode(X86::DEC8m); goto ReSimplify;
+ case X86::RELEASE_DEC16m: OutMI.setOpcode(X86::DEC16m); goto ReSimplify;
+ case X86::RELEASE_DEC32m: OutMI.setOpcode(X86::DEC32m); goto ReSimplify;
+ case X86::RELEASE_DEC64m: OutMI.setOpcode(X86::DEC64m); goto ReSimplify;
+
+ // We don't currently select the correct instruction form for instructions
+ // which have a short %eax, etc. form. Handle this by custom lowering, for
+ // now.
+ //
+ // Note, we are currently not handling the following instructions:
+ // MOV64ao8, MOV64o8a
+ // XCHG16ar, XCHG32ar, XCHG64ar
+ case X86::MOV8mr_NOREX:
+ case X86::MOV8mr: SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV8o32a); break;
+ case X86::MOV8rm_NOREX:
+ case X86::MOV8rm: SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV8ao32); break;
+ case X86::MOV16mr: SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV16o32a); break;
+ case X86::MOV16rm: SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV16ao32); break;
+ case X86::MOV32mr: SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV32o32a); break;
+ case X86::MOV32rm: SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV32ao32); break;
+
+ case X86::ADC8ri: SimplifyShortImmForm(OutMI, X86::ADC8i8); break;
+ case X86::ADC16ri: SimplifyShortImmForm(OutMI, X86::ADC16i16); break;
+ case X86::ADC32ri: SimplifyShortImmForm(OutMI, X86::ADC32i32); break;
+ case X86::ADC64ri32: SimplifyShortImmForm(OutMI, X86::ADC64i32); break;
+ case X86::ADD8ri: SimplifyShortImmForm(OutMI, X86::ADD8i8); break;
+ case X86::ADD16ri: SimplifyShortImmForm(OutMI, X86::ADD16i16); break;
+ case X86::ADD32ri: SimplifyShortImmForm(OutMI, X86::ADD32i32); break;
+ case X86::ADD64ri32: SimplifyShortImmForm(OutMI, X86::ADD64i32); break;
+ case X86::AND8ri: SimplifyShortImmForm(OutMI, X86::AND8i8); break;
+ case X86::AND16ri: SimplifyShortImmForm(OutMI, X86::AND16i16); break;
+ case X86::AND32ri: SimplifyShortImmForm(OutMI, X86::AND32i32); break;
+ case X86::AND64ri32: SimplifyShortImmForm(OutMI, X86::AND64i32); break;
+ case X86::CMP8ri: SimplifyShortImmForm(OutMI, X86::CMP8i8); break;
+ case X86::CMP16ri: SimplifyShortImmForm(OutMI, X86::CMP16i16); break;
+ case X86::CMP32ri: SimplifyShortImmForm(OutMI, X86::CMP32i32); break;
+ case X86::CMP64ri32: SimplifyShortImmForm(OutMI, X86::CMP64i32); break;
+ case X86::OR8ri: SimplifyShortImmForm(OutMI, X86::OR8i8); break;
+ case X86::OR16ri: SimplifyShortImmForm(OutMI, X86::OR16i16); break;
+ case X86::OR32ri: SimplifyShortImmForm(OutMI, X86::OR32i32); break;
+ case X86::OR64ri32: SimplifyShortImmForm(OutMI, X86::OR64i32); break;
+ case X86::SBB8ri: SimplifyShortImmForm(OutMI, X86::SBB8i8); break;
+ case X86::SBB16ri: SimplifyShortImmForm(OutMI, X86::SBB16i16); break;
+ case X86::SBB32ri: SimplifyShortImmForm(OutMI, X86::SBB32i32); break;
+ case X86::SBB64ri32: SimplifyShortImmForm(OutMI, X86::SBB64i32); break;
+ case X86::SUB8ri: SimplifyShortImmForm(OutMI, X86::SUB8i8); break;
+ case X86::SUB16ri: SimplifyShortImmForm(OutMI, X86::SUB16i16); break;
+ case X86::SUB32ri: SimplifyShortImmForm(OutMI, X86::SUB32i32); break;
+ case X86::SUB64ri32: SimplifyShortImmForm(OutMI, X86::SUB64i32); break;
+ case X86::TEST8ri: SimplifyShortImmForm(OutMI, X86::TEST8i8); break;
+ case X86::TEST16ri: SimplifyShortImmForm(OutMI, X86::TEST16i16); break;
+ case X86::TEST32ri: SimplifyShortImmForm(OutMI, X86::TEST32i32); break;
+ case X86::TEST64ri32: SimplifyShortImmForm(OutMI, X86::TEST64i32); break;
+ case X86::XOR8ri: SimplifyShortImmForm(OutMI, X86::XOR8i8); break;
+ case X86::XOR16ri: SimplifyShortImmForm(OutMI, X86::XOR16i16); break;
+ case X86::XOR32ri: SimplifyShortImmForm(OutMI, X86::XOR32i32); break;
+ case X86::XOR64ri32: SimplifyShortImmForm(OutMI, X86::XOR64i32); break;
+
+ // Try to shrink some forms of movsx.
+ case X86::MOVSX16rr8:
+ case X86::MOVSX32rr16:
+ case X86::MOVSX64rr32:
+ SimplifyMOVSX(OutMI);
+ break;
+ }
+}
+
+void X86AsmPrinter::LowerTlsAddr(X86MCInstLower &MCInstLowering,
+ const MachineInstr &MI) {
+
+ bool is64Bits = MI.getOpcode() == X86::TLS_addr64 ||
+ MI.getOpcode() == X86::TLS_base_addr64;
+
+ bool needsPadding = MI.getOpcode() == X86::TLS_addr64;
+
+ MCContext &context = OutStreamer->getContext();
+
+ if (needsPadding)
+ EmitAndCountInstruction(MCInstBuilder(X86::DATA16_PREFIX));
+
+ MCSymbolRefExpr::VariantKind SRVK;
+ switch (MI.getOpcode()) {
+ case X86::TLS_addr32:
+ case X86::TLS_addr64:
+ SRVK = MCSymbolRefExpr::VK_TLSGD;
+ break;
+ case X86::TLS_base_addr32:
+ SRVK = MCSymbolRefExpr::VK_TLSLDM;
+ break;
+ case X86::TLS_base_addr64:
+ SRVK = MCSymbolRefExpr::VK_TLSLD;
+ break;
+ default:
+ llvm_unreachable("unexpected opcode");
+ }
+
+ MCSymbol *sym = MCInstLowering.GetSymbolFromOperand(MI.getOperand(3));
+ const MCSymbolRefExpr *symRef = MCSymbolRefExpr::create(sym, SRVK, context);
+
+ MCInst LEA;
+ if (is64Bits) {
+ LEA.setOpcode(X86::LEA64r);
+ LEA.addOperand(MCOperand::createReg(X86::RDI)); // dest
+ LEA.addOperand(MCOperand::createReg(X86::RIP)); // base
+ LEA.addOperand(MCOperand::createImm(1)); // scale
+ LEA.addOperand(MCOperand::createReg(0)); // index
+ LEA.addOperand(MCOperand::createExpr(symRef)); // disp
+ LEA.addOperand(MCOperand::createReg(0)); // seg
+ } else if (SRVK == MCSymbolRefExpr::VK_TLSLDM) {
+ LEA.setOpcode(X86::LEA32r);
+ LEA.addOperand(MCOperand::createReg(X86::EAX)); // dest
+ LEA.addOperand(MCOperand::createReg(X86::EBX)); // base
+ LEA.addOperand(MCOperand::createImm(1)); // scale
+ LEA.addOperand(MCOperand::createReg(0)); // index
+ LEA.addOperand(MCOperand::createExpr(symRef)); // disp
+ LEA.addOperand(MCOperand::createReg(0)); // seg
+ } else {
+ LEA.setOpcode(X86::LEA32r);
+ LEA.addOperand(MCOperand::createReg(X86::EAX)); // dest
+ LEA.addOperand(MCOperand::createReg(0)); // base
+ LEA.addOperand(MCOperand::createImm(1)); // scale
+ LEA.addOperand(MCOperand::createReg(X86::EBX)); // index
+ LEA.addOperand(MCOperand::createExpr(symRef)); // disp
+ LEA.addOperand(MCOperand::createReg(0)); // seg
+ }
+ EmitAndCountInstruction(LEA);
+
+ if (needsPadding) {
+ EmitAndCountInstruction(MCInstBuilder(X86::DATA16_PREFIX));
+ EmitAndCountInstruction(MCInstBuilder(X86::DATA16_PREFIX));
+ EmitAndCountInstruction(MCInstBuilder(X86::REX64_PREFIX));
+ }
+
+ StringRef name = is64Bits ? "__tls_get_addr" : "___tls_get_addr";
+ MCSymbol *tlsGetAddr = context.getOrCreateSymbol(name);
+ const MCSymbolRefExpr *tlsRef =
+ MCSymbolRefExpr::create(tlsGetAddr,
+ MCSymbolRefExpr::VK_PLT,
+ context);
+
+ EmitAndCountInstruction(MCInstBuilder(is64Bits ? X86::CALL64pcrel32
+ : X86::CALLpcrel32)
+ .addExpr(tlsRef));
+}
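+
+// For the 64-bit general-dynamic case this emits the canonical padded
+// sequence the linker expects for TLS relaxation, e.g. for @tlsvar:
+//   data16 leaq tlsvar@TLSGD(%rip), %rdi
+//   data16 data16 rex64 callq __tls_get_addr@PLT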
+
+/// \brief Emit the optimal amount of multi-byte nops on X86.
+static void EmitNops(MCStreamer &OS, unsigned NumBytes, bool Is64Bit,
+                     const MCSubtargetInfo &STI) {
+  // This works only for 64-bit. For 32-bit we would have to check whether
+  // the CPU supports multi-byte nops.
+ assert(Is64Bit && "EmitNops only supports X86-64");
+ while (NumBytes) {
+ unsigned Opc, BaseReg, ScaleVal, IndexReg, Displacement, SegmentReg;
+ Opc = IndexReg = Displacement = SegmentReg = 0;
+ BaseReg = X86::RAX; ScaleVal = 1;
+ switch (NumBytes) {
+ case 0: llvm_unreachable("Zero nops?"); break;
+ case 1: NumBytes -= 1; Opc = X86::NOOP; break;
+ case 2: NumBytes -= 2; Opc = X86::XCHG16ar; break;
+ case 3: NumBytes -= 3; Opc = X86::NOOPL; break;
+ case 4: NumBytes -= 4; Opc = X86::NOOPL; Displacement = 8; break;
+ case 5: NumBytes -= 5; Opc = X86::NOOPL; Displacement = 8;
+ IndexReg = X86::RAX; break;
+ case 6: NumBytes -= 6; Opc = X86::NOOPW; Displacement = 8;
+ IndexReg = X86::RAX; break;
+ case 7: NumBytes -= 7; Opc = X86::NOOPL; Displacement = 512; break;
+ case 8: NumBytes -= 8; Opc = X86::NOOPL; Displacement = 512;
+ IndexReg = X86::RAX; break;
+ case 9: NumBytes -= 9; Opc = X86::NOOPW; Displacement = 512;
+ IndexReg = X86::RAX; break;
+ default: NumBytes -= 10; Opc = X86::NOOPW; Displacement = 512;
+ IndexReg = X86::RAX; SegmentReg = X86::CS; break;
+ }
+
+ unsigned NumPrefixes = std::min(NumBytes, 5U);
+ NumBytes -= NumPrefixes;
+ for (unsigned i = 0; i != NumPrefixes; ++i)
+ OS.EmitBytes("\x66");
+
+ switch (Opc) {
+ default: llvm_unreachable("Unexpected opcode"); break;
+ case X86::NOOP:
+ OS.EmitInstruction(MCInstBuilder(Opc), STI);
+ break;
+ case X86::XCHG16ar:
+ OS.EmitInstruction(MCInstBuilder(Opc).addReg(X86::AX), STI);
+ break;
+ case X86::NOOPL:
+ case X86::NOOPW:
+ OS.EmitInstruction(MCInstBuilder(Opc).addReg(BaseReg)
+ .addImm(ScaleVal).addReg(IndexReg)
+ .addImm(Displacement).addReg(SegmentReg), STI);
+ break;
+ }
+ } // while (NumBytes)
+}
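+
+// For example, a 15-byte request is satisfied by a single instruction: the
+// default case above selects the 10-byte "nopw %cs:512(%rax,%rax,1)" and
+// the remaining 5 bytes are spent as 0x66 operand-size prefixes on it.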
+
+void X86AsmPrinter::LowerSTATEPOINT(const MachineInstr &MI,
+ X86MCInstLower &MCIL) {
+ assert(Subtarget->is64Bit() && "Statepoint currently only supports X86-64");
+
+ StatepointOpers SOpers(&MI);
+ if (unsigned PatchBytes = SOpers.getNumPatchBytes()) {
+ EmitNops(*OutStreamer, PatchBytes, Subtarget->is64Bit(),
+ getSubtargetInfo());
+ } else {
+ // Lower call target and choose correct opcode
+ const MachineOperand &CallTarget = SOpers.getCallTarget();
+ MCOperand CallTargetMCOp;
+ unsigned CallOpcode;
+ switch (CallTarget.getType()) {
+ case MachineOperand::MO_GlobalAddress:
+ case MachineOperand::MO_ExternalSymbol:
+ CallTargetMCOp = MCIL.LowerSymbolOperand(
+ CallTarget, MCIL.GetSymbolFromOperand(CallTarget));
+ CallOpcode = X86::CALL64pcrel32;
+ // Currently, we only support relative addressing with statepoints.
+ // Otherwise, we'll need a scratch register to hold the target
+ // address. You'll fail asserts during load & relocation if this
+      // symbol is too far away. (TODO: support non-relative addressing)
+ break;
+ case MachineOperand::MO_Immediate:
+ CallTargetMCOp = MCOperand::createImm(CallTarget.getImm());
+ CallOpcode = X86::CALL64pcrel32;
+ // Currently, we only support relative addressing with statepoints.
+ // Otherwise, we'll need a scratch register to hold the target
+ // immediate. You'll fail asserts during load & relocation if this
+      // address is too far away. (TODO: support non-relative addressing)
+ break;
+ case MachineOperand::MO_Register:
+ CallTargetMCOp = MCOperand::createReg(CallTarget.getReg());
+ CallOpcode = X86::CALL64r;
+ break;
+ default:
+ llvm_unreachable("Unsupported operand type in statepoint call target");
+ break;
+ }
+
+ // Emit call
+ MCInst CallInst;
+ CallInst.setOpcode(CallOpcode);
+ CallInst.addOperand(CallTargetMCOp);
+ OutStreamer->EmitInstruction(CallInst, getSubtargetInfo());
+ }
+
+ // Record our statepoint node in the same section used by STACKMAP
+ // and PATCHPOINT
+ SM.recordStatepoint(MI);
+}
+
+void X86AsmPrinter::LowerFAULTING_LOAD_OP(const MachineInstr &MI,
+ X86MCInstLower &MCIL) {
+ // FAULTING_LOAD_OP <def>, <handler label>, <load opcode>, <load operands>
+
+ unsigned LoadDefRegister = MI.getOperand(0).getReg();
+ MCSymbol *HandlerLabel = MI.getOperand(1).getMCSymbol();
+ unsigned LoadOpcode = MI.getOperand(2).getImm();
+ unsigned LoadOperandsBeginIdx = 3;
+
+ FM.recordFaultingOp(FaultMaps::FaultingLoad, HandlerLabel);
+
+ MCInst LoadMI;
+ LoadMI.setOpcode(LoadOpcode);
+
+ if (LoadDefRegister != X86::NoRegister)
+ LoadMI.addOperand(MCOperand::createReg(LoadDefRegister));
+
+ for (auto I = MI.operands_begin() + LoadOperandsBeginIdx,
+ E = MI.operands_end();
+ I != E; ++I)
+ if (auto MaybeOperand = MCIL.LowerMachineOperand(&MI, *I))
+ LoadMI.addOperand(MaybeOperand.getValue());
+
+ OutStreamer->EmitInstruction(LoadMI, getSubtargetInfo());
+}
+
+// Lower a stackmap of the form:
+// <id>, <shadowBytes>, ...
+void X86AsmPrinter::LowerSTACKMAP(const MachineInstr &MI) {
+ SMShadowTracker.emitShadowPadding(*OutStreamer, getSubtargetInfo());
+ SM.recordStackMap(MI);
+ unsigned NumShadowBytes = MI.getOperand(1).getImm();
+ SMShadowTracker.reset(NumShadowBytes);
+}
+
+// Lower a patchpoint of the form:
+// [<def>], <id>, <numBytes>, <target>, <numArgs>, <cc>, ...
+void X86AsmPrinter::LowerPATCHPOINT(const MachineInstr &MI,
+ X86MCInstLower &MCIL) {
+ assert(Subtarget->is64Bit() && "Patchpoint currently only supports X86-64");
+
+ SMShadowTracker.emitShadowPadding(*OutStreamer, getSubtargetInfo());
+
+ SM.recordPatchPoint(MI);
+
+ PatchPointOpers opers(&MI);
+ unsigned ScratchIdx = opers.getNextScratchIdx();
+ unsigned EncodedBytes = 0;
+ const MachineOperand &CalleeMO =
+ opers.getMetaOper(PatchPointOpers::TargetPos);
+
+ // Check for null target. If target is non-null (i.e. is non-zero or is
+ // symbolic) then emit a call.
+ if (!(CalleeMO.isImm() && !CalleeMO.getImm())) {
+ MCOperand CalleeMCOp;
+ switch (CalleeMO.getType()) {
+ default:
+ /// FIXME: Add a verifier check for bad callee types.
+ llvm_unreachable("Unrecognized callee operand type.");
+ case MachineOperand::MO_Immediate:
+ if (CalleeMO.getImm())
+ CalleeMCOp = MCOperand::createImm(CalleeMO.getImm());
+ break;
+ case MachineOperand::MO_ExternalSymbol:
+ case MachineOperand::MO_GlobalAddress:
+ CalleeMCOp =
+ MCIL.LowerSymbolOperand(CalleeMO,
+ MCIL.GetSymbolFromOperand(CalleeMO));
+ break;
+ }
+
+ // Emit MOV to materialize the target address and the CALL to target.
+ // This is encoded with 12-13 bytes, depending on which register is used.
+ unsigned ScratchReg = MI.getOperand(ScratchIdx).getReg();
+ if (X86II::isX86_64ExtendedReg(ScratchReg))
+ EncodedBytes = 13;
+ else
+ EncodedBytes = 12;
+
+ EmitAndCountInstruction(
+ MCInstBuilder(X86::MOV64ri).addReg(ScratchReg).addOperand(CalleeMCOp));
+ EmitAndCountInstruction(MCInstBuilder(X86::CALL64r).addReg(ScratchReg));
+ }
+
+ // Emit padding.
+ unsigned NumBytes = opers.getMetaOper(PatchPointOpers::NBytesPos).getImm();
+ assert(NumBytes >= EncodedBytes &&
+ "Patchpoint can't request size less than the length of a call.");
+
+ EmitNops(*OutStreamer, NumBytes - EncodedBytes, Subtarget->is64Bit(),
+ getSubtargetInfo());
+}
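+
+// For example, a patchpoint with <numBytes> = 16 and a symbolic target
+// whose scratch register is, say, %r11 emits
+//   movabsq $_target, %r11   # 10 bytes
+//   callq   *%r11            #  3 bytes
+// followed by 3 bytes of nops, so the patchable region totals 16 bytes.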
+
+// Returns the instruction preceding MBBI in its MachineFunction.
+// If MBBI is the first instruction of the first basic block, returns null.
+static MachineBasicBlock::const_iterator
+PrevCrossBBInst(MachineBasicBlock::const_iterator MBBI) {
+ const MachineBasicBlock *MBB = MBBI->getParent();
+ while (MBBI == MBB->begin()) {
+ if (MBB == MBB->getParent()->begin())
+ return nullptr;
+ MBB = MBB->getPrevNode();
+ MBBI = MBB->end();
+ }
+ return --MBBI;
+}
+
+static const Constant *getConstantFromPool(const MachineInstr &MI,
+ const MachineOperand &Op) {
+ if (!Op.isCPI())
+ return nullptr;
+
+ ArrayRef<MachineConstantPoolEntry> Constants =
+ MI.getParent()->getParent()->getConstantPool()->getConstants();
+ const MachineConstantPoolEntry &ConstantEntry =
+ Constants[Op.getIndex()];
+
+  // Bail if this is a machine constant pool entry; we won't be able to dig
+  // out anything useful.
+ if (ConstantEntry.isMachineConstantPoolEntry())
+ return nullptr;
+
+ auto *C = dyn_cast<Constant>(ConstantEntry.Val.ConstVal);
+ assert((!C || ConstantEntry.getType() == C->getType()) &&
+ "Expected a constant of the same type!");
+ return C;
+}
+
+static std::string getShuffleComment(const MachineOperand &DstOp,
+ const MachineOperand &SrcOp,
+ ArrayRef<int> Mask) {
+ std::string Comment;
+
+  // Compute the name for a register. This is really goofy because we have
+  // multiple instruction printers that could (in theory) use different
+  // names. Fortunately most people use the ATT style (outside of Windows)
+  // and they actually agree on register naming here. Ultimately, this is
+  // a comment, and so it's OK if it isn't perfect.
+ auto GetRegisterName = [](unsigned RegNum) -> StringRef {
+ return X86ATTInstPrinter::getRegisterName(RegNum);
+ };
+
+ StringRef DstName = DstOp.isReg() ? GetRegisterName(DstOp.getReg()) : "mem";
+ StringRef SrcName = SrcOp.isReg() ? GetRegisterName(SrcOp.getReg()) : "mem";
+
+ raw_string_ostream CS(Comment);
+ CS << DstName << " = ";
+ bool NeedComma = false;
+ bool InSrc = false;
+ for (int M : Mask) {
+ // Wrap up any prior entry...
+ if (M == SM_SentinelZero && InSrc) {
+ InSrc = false;
+ CS << "]";
+ }
+ if (NeedComma)
+ CS << ",";
+ else
+ NeedComma = true;
+
+ // Print this shuffle...
+ if (M == SM_SentinelZero) {
+ CS << "zero";
+ } else {
+ if (!InSrc) {
+ InSrc = true;
+ CS << SrcName << "[";
+ }
+ if (M == SM_SentinelUndef)
+ CS << "u";
+ else
+ CS << M;
+ }
+ }
+ if (InSrc)
+ CS << "]";
+ CS.flush();
+
+ return Comment;
+}
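+
+// For example, Mask = {0, 1, SM_SentinelZero, SM_SentinelUndef} with the
+// destination in %xmm0 and the source in %xmm1 produces the comment
+// "xmm0 = xmm1[0,1],zero,xmm1[u]".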
+
+void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
+ X86MCInstLower MCInstLowering(*MF, *this);
+ const X86RegisterInfo *RI = MF->getSubtarget<X86Subtarget>().getRegisterInfo();
+
+ switch (MI->getOpcode()) {
+ case TargetOpcode::DBG_VALUE:
+ llvm_unreachable("Should be handled target independently");
+
+ // Emit nothing here but a comment if we can.
+ case X86::Int_MemBarrier:
+ OutStreamer->emitRawComment("MEMBARRIER");
+ return;
+
+
+ case X86::EH_RETURN:
+ case X86::EH_RETURN64: {
+ // Lower these as normal, but add some comments.
+ unsigned Reg = MI->getOperand(0).getReg();
+ OutStreamer->AddComment(StringRef("eh_return, addr: %") +
+ X86ATTInstPrinter::getRegisterName(Reg));
+ break;
+ }
+ case X86::CLEANUPRET: {
+ // Lower these as normal, but add some comments.
+ OutStreamer->AddComment("CLEANUPRET");
+ break;
+ }
+
+ case X86::CATCHRET: {
+ // Lower these as normal, but add some comments.
+ OutStreamer->AddComment("CATCHRET");
+ break;
+ }
+
+ case X86::TAILJMPr:
+ case X86::TAILJMPm:
+ case X86::TAILJMPd:
+ case X86::TAILJMPr64:
+ case X86::TAILJMPm64:
+ case X86::TAILJMPd64:
+ case X86::TAILJMPr64_REX:
+ case X86::TAILJMPm64_REX:
+ case X86::TAILJMPd64_REX:
+ // Lower these as normal, but add some comments.
+ OutStreamer->AddComment("TAILCALL");
+ break;
+
+ case X86::TLS_addr32:
+ case X86::TLS_addr64:
+ case X86::TLS_base_addr32:
+ case X86::TLS_base_addr64:
+ return LowerTlsAddr(MCInstLowering, *MI);
+
+ case X86::MOVPC32r: {
+    // This is a pseudo op for a two-instruction sequence with a label, which
+ // looks like:
+ // call "L1$pb"
+ // "L1$pb":
+ // popl %esi
+
+ // Emit the call.
+ MCSymbol *PICBase = MF->getPICBaseSymbol();
+ // FIXME: We would like an efficient form for this, so we don't have to do a
+ // lot of extra uniquing.
+ EmitAndCountInstruction(MCInstBuilder(X86::CALLpcrel32)
+ .addExpr(MCSymbolRefExpr::create(PICBase, OutContext)));
+
+ const X86FrameLowering* FrameLowering =
+ MF->getSubtarget<X86Subtarget>().getFrameLowering();
+ bool hasFP = FrameLowering->hasFP(*MF);
+
+ // TODO: This is needed only if we require precise CFA.
+ bool HasActiveDwarfFrame = OutStreamer->getNumFrameInfos() &&
+ !OutStreamer->getDwarfFrameInfos().back().End;
+
+ int stackGrowth = -RI->getSlotSize();
+
+ if (HasActiveDwarfFrame && !hasFP) {
+ OutStreamer->EmitCFIAdjustCfaOffset(-stackGrowth);
+ }
+
+ // Emit the label.
+ OutStreamer->EmitLabel(PICBase);
+
+ // popl $reg
+ EmitAndCountInstruction(MCInstBuilder(X86::POP32r)
+ .addReg(MI->getOperand(0).getReg()));
+
+ if (HasActiveDwarfFrame && !hasFP) {
+ OutStreamer->EmitCFIAdjustCfaOffset(stackGrowth);
+ }
+ return;
+ }
+
+ case X86::ADD32ri: {
+ // Lower the MO_GOT_ABSOLUTE_ADDRESS form of ADD32ri.
+ if (MI->getOperand(2).getTargetFlags() != X86II::MO_GOT_ABSOLUTE_ADDRESS)
+ break;
+
+ // Okay, we have something like:
+ // EAX = ADD32ri EAX, MO_GOT_ABSOLUTE_ADDRESS(@MYGLOBAL)
+
+ // For this, we want to print something like:
+ // MYGLOBAL + (. - PICBASE)
+ // However, we can't generate a ".", so just emit a new label here and refer
+ // to it.
+ MCSymbol *DotSym = OutContext.createTempSymbol();
+ OutStreamer->EmitLabel(DotSym);
+
+ // Now that we have emitted the label, lower the complex operand expression.
+ MCSymbol *OpSym = MCInstLowering.GetSymbolFromOperand(MI->getOperand(2));
+
+ const MCExpr *DotExpr = MCSymbolRefExpr::create(DotSym, OutContext);
+ const MCExpr *PICBase =
+ MCSymbolRefExpr::create(MF->getPICBaseSymbol(), OutContext);
+ DotExpr = MCBinaryExpr::createSub(DotExpr, PICBase, OutContext);
+
+ DotExpr = MCBinaryExpr::createAdd(MCSymbolRefExpr::create(OpSym,OutContext),
+ DotExpr, OutContext);
+
+ EmitAndCountInstruction(MCInstBuilder(X86::ADD32ri)
+ .addReg(MI->getOperand(0).getReg())
+ .addReg(MI->getOperand(1).getReg())
+ .addExpr(DotExpr));
+ return;
+ }
+ case TargetOpcode::STATEPOINT:
+ return LowerSTATEPOINT(*MI, MCInstLowering);
+
+ case TargetOpcode::FAULTING_LOAD_OP:
+ return LowerFAULTING_LOAD_OP(*MI, MCInstLowering);
+
+ case TargetOpcode::STACKMAP:
+ return LowerSTACKMAP(*MI);
+
+ case TargetOpcode::PATCHPOINT:
+ return LowerPATCHPOINT(*MI, MCInstLowering);
+
+ case X86::MORESTACK_RET:
+ EmitAndCountInstruction(MCInstBuilder(getRetOpcode(*Subtarget)));
+ return;
+
+ case X86::MORESTACK_RET_RESTORE_R10:
+ // Return, then restore R10.
+ EmitAndCountInstruction(MCInstBuilder(getRetOpcode(*Subtarget)));
+ EmitAndCountInstruction(MCInstBuilder(X86::MOV64rr)
+ .addReg(X86::R10)
+ .addReg(X86::RAX));
+ return;
+
+ case X86::SEH_PushReg:
+ OutStreamer->EmitWinCFIPushReg(RI->getSEHRegNum(MI->getOperand(0).getImm()));
+ return;
+
+ case X86::SEH_SaveReg:
+ OutStreamer->EmitWinCFISaveReg(RI->getSEHRegNum(MI->getOperand(0).getImm()),
+ MI->getOperand(1).getImm());
+ return;
+
+ case X86::SEH_SaveXMM:
+ OutStreamer->EmitWinCFISaveXMM(RI->getSEHRegNum(MI->getOperand(0).getImm()),
+ MI->getOperand(1).getImm());
+ return;
+
+ case X86::SEH_StackAlloc:
+ OutStreamer->EmitWinCFIAllocStack(MI->getOperand(0).getImm());
+ return;
+
+ case X86::SEH_SetFrame:
+ OutStreamer->EmitWinCFISetFrame(RI->getSEHRegNum(MI->getOperand(0).getImm()),
+ MI->getOperand(1).getImm());
+ return;
+
+ case X86::SEH_PushFrame:
+ OutStreamer->EmitWinCFIPushFrame(MI->getOperand(0).getImm());
+ return;
+
+ case X86::SEH_EndPrologue:
+ OutStreamer->EmitWinCFIEndProlog();
+ return;
+
+ case X86::SEH_Epilogue: {
+ MachineBasicBlock::const_iterator MBBI(MI);
+ // Check if preceded by a call and emit nop if so.
+ for (MBBI = PrevCrossBBInst(MBBI); MBBI; MBBI = PrevCrossBBInst(MBBI)) {
+ // Conservatively assume that pseudo instructions don't emit code and keep
+ // looking for a call. We may emit an unnecessary nop in some cases.
+ if (!MBBI->isPseudo()) {
+ if (MBBI->isCall())
+ EmitAndCountInstruction(MCInstBuilder(X86::NOOP));
+ break;
+ }
+ }
+ return;
+ }
+
+ // Lower PSHUFB and VPERMILP normally but add a comment if we can find
+ // a constant shuffle mask. We won't be able to do this at the MC layer
+ // because the mask isn't an immediate.
+ case X86::PSHUFBrm:
+ case X86::VPSHUFBrm:
+ case X86::VPSHUFBYrm:
+ case X86::VPSHUFBZ128rm:
+ case X86::VPSHUFBZ128rmk:
+ case X86::VPSHUFBZ128rmkz:
+ case X86::VPSHUFBZ256rm:
+ case X86::VPSHUFBZ256rmk:
+ case X86::VPSHUFBZ256rmkz:
+ case X86::VPSHUFBZrm:
+ case X86::VPSHUFBZrmk:
+ case X86::VPSHUFBZrmkz: {
+ if (!OutStreamer->isVerboseAsm())
+ break;
+ unsigned SrcIdx, MaskIdx;
+ switch (MI->getOpcode()) {
+ default: llvm_unreachable("Invalid opcode");
+ case X86::PSHUFBrm:
+ case X86::VPSHUFBrm:
+ case X86::VPSHUFBYrm:
+ case X86::VPSHUFBZ128rm:
+ case X86::VPSHUFBZ256rm:
+ case X86::VPSHUFBZrm:
+ SrcIdx = 1; MaskIdx = 5; break;
+ case X86::VPSHUFBZ128rmkz:
+ case X86::VPSHUFBZ256rmkz:
+ case X86::VPSHUFBZrmkz:
+ SrcIdx = 2; MaskIdx = 6; break;
+ case X86::VPSHUFBZ128rmk:
+ case X86::VPSHUFBZ256rmk:
+ case X86::VPSHUFBZrmk:
+ SrcIdx = 3; MaskIdx = 7; break;
+ }
+
+ assert(MI->getNumOperands() >= 6 &&
+ "We should always have at least 6 operands!");
+ const MachineOperand &DstOp = MI->getOperand(0);
+ const MachineOperand &SrcOp = MI->getOperand(SrcIdx);
+ const MachineOperand &MaskOp = MI->getOperand(MaskIdx);
+
+ if (auto *C = getConstantFromPool(*MI, MaskOp)) {
+ SmallVector<int, 16> Mask;
+ DecodePSHUFBMask(C, Mask);
+ if (!Mask.empty())
+ OutStreamer->AddComment(getShuffleComment(DstOp, SrcOp, Mask));
+ }
+ break;
+ }
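+
+ // Illustrative sketch, not part of this patch: the per-byte PSHUFB semantics
+ // that the decoded mask comment describes. If bit 7 of a mask byte is set,
+ // the result byte is zero; otherwise its low nibble indexes the 16-byte
+ // source lane (applied per 128-bit lane for the wider variants).
+#if 0
+ static void pshufb16(const uint8_t Src[16], const uint8_t M[16],
+ uint8_t Dst[16]) {
+ for (int i = 0; i != 16; ++i)
+ Dst[i] = (M[i] & 0x80) ? 0 : Src[M[i] & 0x0F];
+ }
+#endif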
+ case X86::VPERMILPSrm:
+ case X86::VPERMILPDrm:
+ case X86::VPERMILPSYrm:
+ case X86::VPERMILPDYrm: {
+ if (!OutStreamer->isVerboseAsm())
+ break;
+ assert(MI->getNumOperands() > 5 &&
+ "We should always have at least 6 operands!");
+ const MachineOperand &DstOp = MI->getOperand(0);
+ const MachineOperand &SrcOp = MI->getOperand(1);
+ const MachineOperand &MaskOp = MI->getOperand(5);
+
+ unsigned ElSize;
+ switch (MI->getOpcode()) {
+ default: llvm_unreachable("Invalid opcode");
+ case X86::VPERMILPSrm: case X86::VPERMILPSYrm: ElSize = 32; break;
+ case X86::VPERMILPDrm: case X86::VPERMILPDYrm: ElSize = 64; break;
+ }
+
+ if (auto *C = getConstantFromPool(*MI, MaskOp)) {
+ SmallVector<int, 16> Mask;
+ DecodeVPERMILPMask(C, ElSize, Mask);
+ if (!Mask.empty())
+ OutStreamer->AddComment(getShuffleComment(DstOp, SrcOp, Mask));
+ }
+ break;
+ }
+
+#define MOV_CASE(Prefix, Suffix) \
+ case X86::Prefix##MOVAPD##Suffix##rm: \
+ case X86::Prefix##MOVAPS##Suffix##rm: \
+ case X86::Prefix##MOVUPD##Suffix##rm: \
+ case X86::Prefix##MOVUPS##Suffix##rm: \
+ case X86::Prefix##MOVDQA##Suffix##rm: \
+ case X86::Prefix##MOVDQU##Suffix##rm:
+
+#define MOV_AVX512_CASE(Suffix) \
+ case X86::VMOVDQA64##Suffix##rm: \
+ case X86::VMOVDQA32##Suffix##rm: \
+ case X86::VMOVDQU64##Suffix##rm: \
+ case X86::VMOVDQU32##Suffix##rm: \
+ case X86::VMOVDQU16##Suffix##rm: \
+ case X86::VMOVDQU8##Suffix##rm: \
+ case X86::VMOVAPS##Suffix##rm: \
+ case X86::VMOVAPD##Suffix##rm: \
+ case X86::VMOVUPS##Suffix##rm: \
+ case X86::VMOVUPD##Suffix##rm:
+
+#define CASE_ALL_MOV_RM() \
+ MOV_CASE(, ) /* SSE */ \
+ MOV_CASE(V, ) /* AVX-128 */ \
+ MOV_CASE(V, Y) /* AVX-256 */ \
+ MOV_AVX512_CASE(Z) \
+ MOV_AVX512_CASE(Z256) \
+ MOV_AVX512_CASE(Z128)
+
+ // For loads from a constant pool to a vector register, print the constant
+ // loaded.
+ CASE_ALL_MOV_RM()
+ if (!OutStreamer->isVerboseAsm())
+ break;
+ if (MI->getNumOperands() > 4)
+ if (auto *C = getConstantFromPool(*MI, MI->getOperand(4))) {
+ std::string Comment;
+ raw_string_ostream CS(Comment);
+ const MachineOperand &DstOp = MI->getOperand(0);
+ CS << X86ATTInstPrinter::getRegisterName(DstOp.getReg()) << " = ";
+ if (auto *CDS = dyn_cast<ConstantDataSequential>(C)) {
+ CS << "[";
+ for (int i = 0, NumElements = CDS->getNumElements(); i < NumElements; ++i) {
+ if (i != 0)
+ CS << ",";
+ if (CDS->getElementType()->isIntegerTy())
+ CS << CDS->getElementAsInteger(i);
+ else if (CDS->getElementType()->isFloatTy())
+ CS << CDS->getElementAsFloat(i);
+ else if (CDS->getElementType()->isDoubleTy())
+ CS << CDS->getElementAsDouble(i);
+ else
+ CS << "?";
+ }
+ CS << "]";
+ OutStreamer->AddComment(CS.str());
+ } else if (auto *CV = dyn_cast<ConstantVector>(C)) {
+ CS << "<";
+ for (int i = 0, NumOperands = CV->getNumOperands(); i < NumOperands; ++i) {
+ if (i != 0)
+ CS << ",";
+ Constant *COp = CV->getOperand(i);
+ if (isa<UndefValue>(COp)) {
+ CS << "u";
+ } else if (auto *CI = dyn_cast<ConstantInt>(COp)) {
+ if (CI->getBitWidth() <= 64) {
+ CS << CI->getZExtValue();
+ } else {
+ // Print a multi-word constant as (w0,w1,...).
+ auto Val = CI->getValue();
+ CS << "(";
+ for (int i = 0, N = Val.getNumWords(); i < N; ++i) {
+ if (i > 0)
+ CS << ",";
+ CS << Val.getRawData()[i];
+ }
+ CS << ")";
+ }
+ } else if (auto *CF = dyn_cast<ConstantFP>(COp)) {
+ SmallString<32> Str;
+ CF->getValueAPF().toString(Str);
+ CS << Str;
+ } else {
+ CS << "?";
+ }
+ }
+ CS << ">";
+ OutStreamer->AddComment(CS.str());
+ }
+ }
+ break;
+ }
+
+ MCInst TmpInst;
+ MCInstLowering.Lower(MI, TmpInst);
+
+ // Stackmap shadows cannot include branch targets, so we can count the bytes
+ // in a call towards the shadow, but we must ensure that no thread returns
+ // into the stackmap shadow. The only way to achieve this is if the call
+ // is at the end of the shadow.
+ if (MI->isCall()) {
+ // Count the size of the call towards the shadow.
+ SMShadowTracker.count(TmpInst, getSubtargetInfo());
+ // Then flush the shadow so that we fill with nops before the call, not
+ // after it.
+ SMShadowTracker.emitShadowPadding(*OutStreamer, getSubtargetInfo());
+ // Then emit the call
+ OutStreamer->EmitInstruction(TmpInst, getSubtargetInfo());
+ return;
+ }
+
+ EmitAndCountInstruction(TmpInst);
+}
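+
+// Illustrative sketch, not part of this patch: the shadow-padding contract the
+// block above implements. A tracker counts bytes emitted since a STACKMAP; if
+// a call arrives before the shadow is full, NOPs are emitted *before* the
+// call so that no return address can land inside the shadow.
+#if 0
+struct ShadowSketch {
+  unsigned Required = 0; // Shadow size in bytes; 0 means no active shadow.
+  unsigned Emitted = 0;  // Bytes emitted since the STACKMAP.
+  void count(unsigned Bytes) { Emitted += Bytes; }
+  unsigned padBeforeCall() { // NOP bytes to emit ahead of a call.
+    unsigned Pad = Required > Emitted ? Required - Emitted : 0;
+    Emitted = Required; // The shadow is satisfied once the padding is out.
+    return Pad;
+  }
+};
+#endif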
diff --git a/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp b/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp
new file mode 100644
index 0000000..c9e636f
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp
@@ -0,0 +1,33 @@
+//===-- X86MachineFunctionInfo.cpp - X86 machine function info ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86MachineFunctionInfo.h"
+#include "X86RegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+void X86MachineFunctionInfo::anchor() { }
+
+void X86MachineFunctionInfo::setRestoreBasePointer(const MachineFunction *MF) {
+ if (!RestoreBasePointerOffset) {
+ const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
+ MF->getSubtarget().getRegisterInfo());
+ unsigned SlotSize = RegInfo->getSlotSize();
+ for (const MCPhysReg *CSR =
+ RegInfo->X86RegisterInfo::getCalleeSavedRegs(MF);
+ unsigned Reg = *CSR;
+ ++CSR)
+ {
+ if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg))
+ RestoreBasePointerOffset -= SlotSize;
+ }
+ }
+}
+
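+// Illustrative sketch, not part of this patch: the offset computed above is
+// -(number of GPR callee-saved registers) * SlotSize, i.e. one slot below the
+// GPR callee-saved area. E.g. assuming the 64-bit SysV list (RBX, R12-R15,
+// RBP: six GPRs, SlotSize 8), the offset would be -48.
+#if 0
+static int restoreBasePointerOffset(unsigned NumGPRCSRs, unsigned SlotSize) {
+  return -static_cast<int>(NumGPRCSRs * SlotSize); // e.g. -(6 * 8) == -48
+}
+#endif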
diff --git a/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h b/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h
new file mode 100644
index 0000000..3a7a98d
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h
@@ -0,0 +1,167 @@
+//===-- X86MachineFunctionInfo.h - X86 machine function info ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares X86-specific per-machine-function information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86MACHINEFUNCTIONINFO_H
+#define LLVM_LIB_TARGET_X86_X86MACHINEFUNCTIONINFO_H
+
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineValueType.h"
+#include <vector>
+
+namespace llvm {
+
+/// X86MachineFunctionInfo - This class is derived from MachineFunctionInfo
+/// and contains private X86 target-specific information for each
+/// MachineFunction.
+class X86MachineFunctionInfo : public MachineFunctionInfo {
+ virtual void anchor();
+
+ /// ForceFramePointer - True if the function is required to use a frame
+ /// pointer for reasons other than it containing dynamic allocation or
+ /// having FP elimination turned off. For example, the Cygwin main function
+ /// contains stack pointer re-alignment code which requires FP.
+ bool ForceFramePointer = false;
+
+ /// RestoreBasePointerOffset - Non-zero if the function has a base pointer
+ /// and makes a call to llvm.eh.sjlj.setjmp. When non-zero, the value is a
+ /// displacement from the frame pointer to a slot where the base pointer
+ /// is stashed.
+ signed char RestoreBasePointerOffset = 0;
+
+ /// CalleeSavedFrameSize - Size of the callee-saved register portion of the
+ /// stack frame in bytes.
+ unsigned CalleeSavedFrameSize = 0;
+
+ /// BytesToPopOnReturn - Number of bytes function pops on return (in addition
+ /// to the space used by the return address).
+ /// Used on Windows platforms for stdcall & fastcall name decoration.
+ unsigned BytesToPopOnReturn = 0;
+
+ /// ReturnAddrIndex - FrameIndex for return slot.
+ int ReturnAddrIndex = 0;
+
+ /// \brief FrameIndex for the frame address slot.
+ int FrameAddrIndex = 0;
+
+ /// TailCallReturnAddrDelta - The number of bytes by which return address
+ /// stack slot is moved as the result of tail call optimization.
+ int TailCallReturnAddrDelta = 0;
+
+ /// SRetReturnReg - Some subtargets require that sret lowering includes
+ /// returning the value of the returned struct in a register. This field
+ /// holds the virtual register into which the sret argument is passed.
+ unsigned SRetReturnReg = 0;
+
+ /// GlobalBaseReg - keeps track of the virtual register initialized for
+ /// use as the global base register. This is used in some PIC relocation
+ /// models.
+ unsigned GlobalBaseReg = 0;
+
+ /// VarArgsFrameIndex - FrameIndex for start of varargs area.
+ int VarArgsFrameIndex = 0;
+ /// RegSaveFrameIndex - X86-64 vararg func register save area.
+ int RegSaveFrameIndex = 0;
+ /// VarArgsGPOffset - X86-64 vararg func int reg offset.
+ unsigned VarArgsGPOffset = 0;
+ /// VarArgsFPOffset - X86-64 vararg func fp reg offset.
+ unsigned VarArgsFPOffset = 0;
+ /// ArgumentStackSize - The number of bytes on stack consumed by the arguments
+ /// being passed on the stack.
+ unsigned ArgumentStackSize = 0;
+ /// NumLocalDynamics - Number of local-dynamic TLS accesses.
+ unsigned NumLocalDynamics = 0;
+ /// HasPushSequences - Keeps track of whether this function uses sequences
+ /// of pushes to pass function parameters.
+ bool HasPushSequences = false;
+
+ /// True if the function recovers from an SEH exception, and therefore needs
+ /// to spill and restore the frame pointer.
+ bool HasSEHFramePtrSave = false;
+
+ /// The frame index of a stack object containing the original frame pointer
+ /// used to address arguments in a function using a base pointer.
+ int SEHFramePtrSaveIndex = 0;
+
+private:
+ /// ForwardedMustTailRegParms - A list of virtual and physical registers
+ /// that must be forwarded to every musttail call.
+ SmallVector<ForwardedRegister, 1> ForwardedMustTailRegParms;
+
+public:
+ X86MachineFunctionInfo() = default;
+
+ explicit X86MachineFunctionInfo(MachineFunction &MF) {}
+
+ bool getForceFramePointer() const { return ForceFramePointer;}
+ void setForceFramePointer(bool forceFP) { ForceFramePointer = forceFP; }
+
+ bool getHasPushSequences() const { return HasPushSequences; }
+ void setHasPushSequences(bool HasPush) { HasPushSequences = HasPush; }
+
+ bool getRestoreBasePointer() const { return RestoreBasePointerOffset != 0; }
+ void setRestoreBasePointer(const MachineFunction *MF);
+ int getRestoreBasePointerOffset() const { return RestoreBasePointerOffset; }
+
+ unsigned getCalleeSavedFrameSize() const { return CalleeSavedFrameSize; }
+ void setCalleeSavedFrameSize(unsigned bytes) { CalleeSavedFrameSize = bytes; }
+
+ unsigned getBytesToPopOnReturn() const { return BytesToPopOnReturn; }
+ void setBytesToPopOnReturn(unsigned bytes) { BytesToPopOnReturn = bytes; }
+
+ int getRAIndex() const { return ReturnAddrIndex; }
+ void setRAIndex(int Index) { ReturnAddrIndex = Index; }
+
+ int getFAIndex() const { return FrameAddrIndex; }
+ void setFAIndex(int Index) { FrameAddrIndex = Index; }
+
+ int getTCReturnAddrDelta() const { return TailCallReturnAddrDelta; }
+ void setTCReturnAddrDelta(int delta) { TailCallReturnAddrDelta = delta; }
+
+ unsigned getSRetReturnReg() const { return SRetReturnReg; }
+ void setSRetReturnReg(unsigned Reg) { SRetReturnReg = Reg; }
+
+ unsigned getGlobalBaseReg() const { return GlobalBaseReg; }
+ void setGlobalBaseReg(unsigned Reg) { GlobalBaseReg = Reg; }
+
+ int getVarArgsFrameIndex() const { return VarArgsFrameIndex; }
+ void setVarArgsFrameIndex(int Idx) { VarArgsFrameIndex = Idx; }
+
+ int getRegSaveFrameIndex() const { return RegSaveFrameIndex; }
+ void setRegSaveFrameIndex(int Idx) { RegSaveFrameIndex = Idx; }
+
+ unsigned getVarArgsGPOffset() const { return VarArgsGPOffset; }
+ void setVarArgsGPOffset(unsigned Offset) { VarArgsGPOffset = Offset; }
+
+ unsigned getVarArgsFPOffset() const { return VarArgsFPOffset; }
+ void setVarArgsFPOffset(unsigned Offset) { VarArgsFPOffset = Offset; }
+
+ unsigned getArgumentStackSize() const { return ArgumentStackSize; }
+ void setArgumentStackSize(unsigned size) { ArgumentStackSize = size; }
+
+ unsigned getNumLocalDynamicTLSAccesses() const { return NumLocalDynamics; }
+ void incNumLocalDynamicTLSAccesses() { ++NumLocalDynamics; }
+
+ bool getHasSEHFramePtrSave() const { return HasSEHFramePtrSave; }
+ void setHasSEHFramePtrSave(bool V) { HasSEHFramePtrSave = V; }
+
+ int getSEHFramePtrSaveIndex() const { return SEHFramePtrSaveIndex; }
+ void setSEHFramePtrSaveIndex(int Index) { SEHFramePtrSaveIndex = Index; }
+
+ SmallVectorImpl<ForwardedRegister> &getForwardedMustTailRegParms() {
+ return ForwardedMustTailRegParms;
+ }
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86OptimizeLEAs.cpp b/contrib/llvm/lib/Target/X86/X86OptimizeLEAs.cpp
new file mode 100644
index 0000000..58020d9
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86OptimizeLEAs.cpp
@@ -0,0 +1,326 @@
+//===-- X86OptimizeLEAs.cpp - optimize usage of LEA instructions ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the pass that performs some optimizations with LEA
+// instructions in order to improve code size.
+// Currently, it does one thing:
+// 1) Address calculations in load and store instructions are replaced by
+// existing LEA def registers where possible.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-optimize-LEAs"
+
+static cl::opt<bool> EnableX86LEAOpt("enable-x86-lea-opt", cl::Hidden,
+ cl::desc("X86: Enable LEA optimizations."),
+ cl::init(false));
+
+STATISTIC(NumSubstLEAs, "Number of LEA instruction substitutions");
+
+namespace {
+class OptimizeLEAPass : public MachineFunctionPass {
+public:
+ OptimizeLEAPass() : MachineFunctionPass(ID) {}
+
+ const char *getPassName() const override { return "X86 LEA Optimize"; }
+
+ /// \brief Loop over all of the basic blocks, replacing address calculations
+ /// in load and store instructions with the def register of an LEA that has
+ /// already computed the same address.
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+private:
+ /// \brief Returns the distance between two instructions inside one basic
+ /// block. A negative result means that the instructions occur in reverse
+ /// order.
+ int calcInstrDist(const MachineInstr &First, const MachineInstr &Last);
+
+ /// \brief Choose the best \p LEA instruction from the \p List to replace
+ /// address calculation in \p MI instruction. Return the address displacement
+ /// and the distance between \p MI and the chosen \p LEA in \p AddrDispShift
+ /// and \p Dist.
+ bool chooseBestLEA(const SmallVectorImpl<MachineInstr *> &List,
+ const MachineInstr &MI, MachineInstr *&LEA,
+ int64_t &AddrDispShift, int &Dist);
+
+ /// \brief Returns true if two machine operands are identical and neither is
+ /// a physical register.
+ bool isIdenticalOp(const MachineOperand &MO1, const MachineOperand &MO2);
+
+ /// \brief Returns true if the instruction is LEA.
+ bool isLEA(const MachineInstr &MI);
+
+ /// \brief Returns true if two instructions have memory operands that only
+ /// differ by displacement. The numbers of the first memory operands for both
+ /// instructions are specified through \p N1 and \p N2. The address
+ /// displacement is returned through AddrDispShift.
+ bool isSimilarMemOp(const MachineInstr &MI1, unsigned N1,
+ const MachineInstr &MI2, unsigned N2,
+ int64_t &AddrDispShift);
+
+ /// \brief Find all LEA instructions in the basic block.
+ void findLEAs(const MachineBasicBlock &MBB,
+ SmallVectorImpl<MachineInstr *> &List);
+
+ /// \brief Removes redundant address calculations.
+ bool removeRedundantAddrCalc(const SmallVectorImpl<MachineInstr *> &List);
+
+ MachineRegisterInfo *MRI;
+ const X86InstrInfo *TII;
+ const X86RegisterInfo *TRI;
+
+ static char ID;
+};
+char OptimizeLEAPass::ID = 0;
+}
+
+FunctionPass *llvm::createX86OptimizeLEAs() { return new OptimizeLEAPass(); }
+
+int OptimizeLEAPass::calcInstrDist(const MachineInstr &First,
+ const MachineInstr &Last) {
+ const MachineBasicBlock *MBB = First.getParent();
+
+ // Both instructions must be in the same basic block.
+ assert(Last.getParent() == MBB &&
+ "Instructions are in different basic blocks");
+
+ return std::distance(MBB->begin(), MachineBasicBlock::const_iterator(&Last)) -
+ std::distance(MBB->begin(), MachineBasicBlock::const_iterator(&First));
+}
+
+// Find the best LEA instruction in the List to replace address recalculation in
+// MI. Such LEA must meet these requirements:
+// 1) The address calculated by the LEA differs only by the displacement from
+// the address used in MI.
+// 2) The register class of the definition of the LEA is compatible with the
+// register class of the address base register of MI.
+// 3) Displacement of the new memory operand should fit in 1 byte if possible.
+// 4) The LEA should be as close to MI as possible, and prior to it if
+// possible.
+bool OptimizeLEAPass::chooseBestLEA(const SmallVectorImpl<MachineInstr *> &List,
+ const MachineInstr &MI, MachineInstr *&LEA,
+ int64_t &AddrDispShift, int &Dist) {
+ const MachineFunction *MF = MI.getParent()->getParent();
+ const MCInstrDesc &Desc = MI.getDesc();
+ int MemOpNo = X86II::getMemoryOperandNo(Desc.TSFlags, MI.getOpcode()) +
+ X86II::getOperandBias(Desc);
+
+ LEA = nullptr;
+
+ // Loop over all LEA instructions.
+ for (auto DefMI : List) {
+ int64_t AddrDispShiftTemp = 0;
+
+ // Compare the instructions' memory operands.
+ if (!isSimilarMemOp(MI, MemOpNo, *DefMI, 1, AddrDispShiftTemp))
+ continue;
+
+ // Make sure the address displacement fits in 4 bytes.
+ if (!isInt<32>(AddrDispShiftTemp))
+ continue;
+
+ // Check that LEA def register can be used as MI address base. Some
+ // instructions can use a limited set of registers as address base, for
+ // example MOV8mr_NOREX. We could constrain the register class of the LEA
+ // def to suit MI, however since this case is very rare and hard to
+ // reproduce in a test it's just more reliable to skip the LEA.
+ if (TII->getRegClass(Desc, MemOpNo + X86::AddrBaseReg, TRI, *MF) !=
+ MRI->getRegClass(DefMI->getOperand(0).getReg()))
+ continue;
+
+ // Choose the closest LEA instruction from the list, prior to MI if
+ // possible. Note that we take the resulting address displacement into
+ // account as well. Also note that the list is sorted by the order in which
+ // the LEAs occur, so the break condition is pretty simple.
+ int DistTemp = calcInstrDist(*DefMI, MI);
+ assert(DistTemp != 0 &&
+ "The distance between two different instructions cannot be zero");
+ if (DistTemp > 0 || LEA == nullptr) {
+ // Do not update return LEA, if the current one provides a displacement
+ // which fits in 1 byte, while the new candidate does not.
+ if (LEA != nullptr && !isInt<8>(AddrDispShiftTemp) &&
+ isInt<8>(AddrDispShift))
+ continue;
+
+ LEA = DefMI;
+ AddrDispShift = AddrDispShiftTemp;
+ Dist = DistTemp;
+ }
+
+ // FIXME: Maybe we should not always stop at the first LEA after MI.
+ if (DistTemp < 0)
+ break;
+ }
+
+ return LEA != nullptr;
+}
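+
+// Illustrative sketch, not part of this patch: the candidate-preference rule
+// the loop above implements, distilled. A candidate prior to MI (positive
+// distance), or the first candidate seen, wins -- except that a current choice
+// whose displacement fits in one byte is kept over a new candidate whose
+// displacement does not.
+#if 0
+static bool preferNewLEA(bool HaveOld, int64_t OldDisp, int64_t NewDisp,
+                         int NewDist) {
+  if (!HaveOld)
+    return true;
+  if (NewDist <= 0) // A candidate after MI never displaces a prior choice.
+    return false;
+  return isInt<8>(NewDisp) || !isInt<8>(OldDisp);
+}
+#endif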
+
+bool OptimizeLEAPass::isIdenticalOp(const MachineOperand &MO1,
+ const MachineOperand &MO2) {
+ return MO1.isIdenticalTo(MO2) &&
+ (!MO1.isReg() ||
+ !TargetRegisterInfo::isPhysicalRegister(MO1.getReg()));
+}
+
+bool OptimizeLEAPass::isLEA(const MachineInstr &MI) {
+ unsigned Opcode = MI.getOpcode();
+ return Opcode == X86::LEA16r || Opcode == X86::LEA32r ||
+ Opcode == X86::LEA64r || Opcode == X86::LEA64_32r;
+}
+
+// Check if MI1 and MI2 have memory operands which represent addresses that
+// differ only by displacement.
+bool OptimizeLEAPass::isSimilarMemOp(const MachineInstr &MI1, unsigned N1,
+ const MachineInstr &MI2, unsigned N2,
+ int64_t &AddrDispShift) {
+ // Address base, scale, index and segment operands must be identical.
+ static const int IdenticalOpNums[] = {X86::AddrBaseReg, X86::AddrScaleAmt,
+ X86::AddrIndexReg, X86::AddrSegmentReg};
+ for (auto &N : IdenticalOpNums)
+ if (!isIdenticalOp(MI1.getOperand(N1 + N), MI2.getOperand(N2 + N)))
+ return false;
+
+ // Address displacement operands may differ by a constant.
+ const MachineOperand *Op1 = &MI1.getOperand(N1 + X86::AddrDisp);
+ const MachineOperand *Op2 = &MI2.getOperand(N2 + X86::AddrDisp);
+ if (!isIdenticalOp(*Op1, *Op2)) {
+ if (Op1->isImm() && Op2->isImm())
+ AddrDispShift = Op1->getImm() - Op2->getImm();
+ else if (Op1->isGlobal() && Op2->isGlobal() &&
+ Op1->getGlobal() == Op2->getGlobal())
+ AddrDispShift = Op1->getOffset() - Op2->getOffset();
+ else
+ return false;
+ }
+
+ return true;
+}
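+
+// Illustrative sketch, not part of this patch: what "similar" means here, on a
+// flattened address tuple. For MI using [rbx + 4*rcx + 16] and an LEA
+// computing [rbx + 4*rcx + 8], the first four components match and
+// AddrDispShift = 16 - 8 = 8.
+#if 0
+struct AddrSketch { unsigned Base, Scale, Index, Seg; int64_t Disp; };
+static bool similarAddr(const AddrSketch &A, const AddrSketch &B,
+                        int64_t &Shift) {
+  if (A.Base != B.Base || A.Scale != B.Scale || A.Index != B.Index ||
+      A.Seg != B.Seg)
+    return false;
+  Shift = A.Disp - B.Disp; // e.g. 16 - 8 == 8
+  return true;
+}
+#endif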
+
+void OptimizeLEAPass::findLEAs(const MachineBasicBlock &MBB,
+ SmallVectorImpl<MachineInstr *> &List) {
+ for (auto &MI : MBB) {
+ if (isLEA(MI))
+ List.push_back(const_cast<MachineInstr *>(&MI));
+ }
+}
+
+// Try to find load and store instructions which recalculate addresses already
+// calculated by some LEA and replace their memory operands with that LEA's
+// def register.
+bool OptimizeLEAPass::removeRedundantAddrCalc(
+ const SmallVectorImpl<MachineInstr *> &List) {
+ bool Changed = false;
+
+ assert(List.size() > 0);
+ MachineBasicBlock *MBB = List[0]->getParent();
+
+ // Process all instructions in basic block.
+ for (auto I = MBB->begin(), E = MBB->end(); I != E;) {
+ MachineInstr &MI = *I++;
+ unsigned Opcode = MI.getOpcode();
+
+ // Instruction must be load or store.
+ if (!MI.mayLoadOrStore())
+ continue;
+
+ // Get the number of the first memory operand.
+ const MCInstrDesc &Desc = MI.getDesc();
+ int MemOpNo = X86II::getMemoryOperandNo(Desc.TSFlags, Opcode);
+
+ // If the instruction has no memory operand, skip it.
+ if (MemOpNo < 0)
+ continue;
+
+ MemOpNo += X86II::getOperandBias(Desc);
+
+ // Get the best LEA instruction to replace address calculation.
+ MachineInstr *DefMI;
+ int64_t AddrDispShift;
+ int Dist;
+ if (!chooseBestLEA(List, MI, DefMI, AddrDispShift, Dist))
+ continue;
+
+ // If the LEA occurs before the current instruction, we can freely replace
+ // the instruction. If the LEA occurs after, we can hoist the LEA above the
+ // instruction and then replace it. Since the LEA and the instruction have
+ // similar memory operands (and thus the same def instructions for those
+ // operands), we can always do this without worrying about using registers
+ // before their defs.
+ if (Dist < 0) {
+ DefMI->removeFromParent();
+ MBB->insert(MachineBasicBlock::iterator(&MI), DefMI);
+ }
+
+ // Since we can possibly extend register lifetime, clear kill flags.
+ MRI->clearKillFlags(DefMI->getOperand(0).getReg());
+
+ ++NumSubstLEAs;
+ DEBUG(dbgs() << "OptimizeLEAs: Candidate to replace: "; MI.dump(););
+
+ // Change instruction operands.
+ MI.getOperand(MemOpNo + X86::AddrBaseReg)
+ .ChangeToRegister(DefMI->getOperand(0).getReg(), false);
+ MI.getOperand(MemOpNo + X86::AddrScaleAmt).ChangeToImmediate(1);
+ MI.getOperand(MemOpNo + X86::AddrIndexReg)
+ .ChangeToRegister(X86::NoRegister, false);
+ MI.getOperand(MemOpNo + X86::AddrDisp).ChangeToImmediate(AddrDispShift);
+ MI.getOperand(MemOpNo + X86::AddrSegmentReg)
+ .ChangeToRegister(X86::NoRegister, false);
+
+ DEBUG(dbgs() << "OptimizeLEAs: Replaced by: "; MI.dump(););
+
+ Changed = true;
+ }
+
+ return Changed;
+}
+
+bool OptimizeLEAPass::runOnMachineFunction(MachineFunction &MF) {
+ bool Changed = false;
+
+ // Perform this optimization only if we care about code size.
+ if (!EnableX86LEAOpt || !MF.getFunction()->optForSize())
+ return false;
+
+ MRI = &MF.getRegInfo();
+ TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
+ TRI = MF.getSubtarget<X86Subtarget>().getRegisterInfo();
+
+ // Process all basic blocks.
+ for (auto &MBB : MF) {
+ SmallVector<MachineInstr *, 16> LEAs;
+
+ // Find all LEA instructions in basic block.
+ findLEAs(MBB, LEAs);
+
+ // If current basic block has no LEAs, move on to the next one.
+ if (LEAs.empty())
+ continue;
+
+ // Remove redundant address calculations.
+ Changed |= removeRedundantAddrCalc(LEAs);
+ }
+
+ return Changed;
+}
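+
+// Illustrative sketch, not part of this patch: the net effect of the rewrite
+// in removeRedundantAddrCalc, reusing the AddrSketch tuple from the sketch
+// above. A load of [rbx + 4*rcx + 16] following an LEA of [rbx + 4*rcx + 8]
+// into %rdx becomes a load of [rdx + 8].
+#if 0
+static void rewriteToLEADef(AddrSketch &Mem, unsigned LEADefReg,
+                            int64_t AddrDispShift) {
+  Mem.Base = LEADefReg;     // Base register <- the LEA's def.
+  Mem.Scale = 1;            // Scale collapses to 1.
+  Mem.Index = 0;            // No index register.
+  Mem.Disp = AddrDispShift; // Displacement <- the computed shift.
+  Mem.Seg = 0;              // No segment override.
+}
+#endif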
diff --git a/contrib/llvm/lib/Target/X86/X86PadShortFunction.cpp b/contrib/llvm/lib/Target/X86/X86PadShortFunction.cpp
new file mode 100644
index 0000000..0f425e2
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86PadShortFunction.cpp
@@ -0,0 +1,213 @@
+//===-------- X86PadShortFunction.cpp - pad short functions -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the pass which will pad short functions to prevent
+// a stall if a function returns before the return address is ready. This
+// is needed for some Intel Atom processors.
+//
+//===----------------------------------------------------------------------===//
+
+#include <algorithm>
+
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-pad-short-functions"
+
+STATISTIC(NumBBsPadded, "Number of basic blocks padded");
+
+namespace {
+ struct VisitedBBInfo {
+ // HasReturn - Whether the BB contains a return instruction
+ bool HasReturn;
+
+ // Cycles - Number of cycles until return if HasReturn is true, otherwise
+ // number of cycles until end of the BB
+ unsigned int Cycles;
+
+ VisitedBBInfo() : HasReturn(false), Cycles(0) {}
+ VisitedBBInfo(bool HasReturn, unsigned int Cycles)
+ : HasReturn(HasReturn), Cycles(Cycles) {}
+ };
+
+ struct PadShortFunc : public MachineFunctionPass {
+ static char ID;
+ PadShortFunc() : MachineFunctionPass(ID)
+ , Threshold(4), STI(nullptr), TII(nullptr) {}
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ const char *getPassName() const override {
+ return "X86 Atom pad short functions";
+ }
+
+ private:
+ void findReturns(MachineBasicBlock *MBB,
+ unsigned int Cycles = 0);
+
+ bool cyclesUntilReturn(MachineBasicBlock *MBB,
+ unsigned int &Cycles);
+
+ void addPadding(MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator &MBBI,
+ unsigned int NOOPsToAdd);
+
+ const unsigned int Threshold;
+
+ // ReturnBBs - Maps basic blocks that return to the minimum number of
+ // cycles until the return, starting from the entry block.
+ DenseMap<MachineBasicBlock*, unsigned int> ReturnBBs;
+
+ // VisitedBBs - Cache of previously visited BBs.
+ DenseMap<MachineBasicBlock*, VisitedBBInfo> VisitedBBs;
+
+ const X86Subtarget *STI;
+ const TargetInstrInfo *TII;
+ };
+
+ char PadShortFunc::ID = 0;
+}
+
+FunctionPass *llvm::createX86PadShortFunctions() {
+ return new PadShortFunc();
+}
+
+/// runOnMachineFunction - Loop over all of the basic blocks, inserting
+/// NOOP instructions before early exits.
+bool PadShortFunc::runOnMachineFunction(MachineFunction &MF) {
+ if (MF.getFunction()->optForSize()) {
+ return false;
+ }
+
+ STI = &MF.getSubtarget<X86Subtarget>();
+ if (!STI->padShortFunctions())
+ return false;
+
+ TII = STI->getInstrInfo();
+
+ // Search through basic blocks and mark the ones that have early returns
+ ReturnBBs.clear();
+ VisitedBBs.clear();
+ findReturns(&MF.front());
+
+ bool MadeChange = false;
+
+ MachineBasicBlock *MBB;
+ unsigned int Cycles = 0;
+
+ // Pad the identified basic blocks with NOOPs
+ for (DenseMap<MachineBasicBlock*, unsigned int>::iterator I = ReturnBBs.begin();
+ I != ReturnBBs.end(); ++I) {
+ MBB = I->first;
+ Cycles = I->second;
+
+ if (Cycles < Threshold) {
+ // BB ends in a return. Skip over any DBG_VALUE instructions
+ // trailing the terminator.
+ assert(MBB->size() > 0 &&
+ "Basic block should contain at least a RET but is empty");
+ MachineBasicBlock::iterator ReturnLoc = --MBB->end();
+
+ while (ReturnLoc->isDebugValue())
+ --ReturnLoc;
+ assert(ReturnLoc->isReturn() && !ReturnLoc->isCall() &&
+ "Basic block does not end with RET");
+
+ addPadding(MBB, ReturnLoc, Threshold - Cycles);
+ NumBBsPadded++;
+ MadeChange = true;
+ }
+ }
+
+ return MadeChange;
+}
+
+/// findReturns - Starting at MBB, follow control flow and add all
+/// basic blocks that contain a return to ReturnBBs.
+void PadShortFunc::findReturns(MachineBasicBlock *MBB, unsigned int Cycles) {
+ // If this BB has a return, note how many cycles it takes to get there.
+ bool hasReturn = cyclesUntilReturn(MBB, Cycles);
+ if (Cycles >= Threshold)
+ return;
+
+ if (hasReturn) {
+ ReturnBBs[MBB] = std::max(ReturnBBs[MBB], Cycles);
+ return;
+ }
+
+ // Follow branches in BB and look for returns
+ for (MachineBasicBlock::succ_iterator I = MBB->succ_begin();
+ I != MBB->succ_end(); ++I) {
+ if (*I == MBB)
+ continue;
+ findReturns(*I, Cycles);
+ }
+}
+
+/// cyclesUntilReturn - return true if the MBB has a return instruction,
+/// and return false otherwise.
+/// Cycles will be incremented by the number of cycles taken to reach the
+/// return or the end of the BB, whichever occurs first.
+bool PadShortFunc::cyclesUntilReturn(MachineBasicBlock *MBB,
+ unsigned int &Cycles) {
+ // Return cached result if BB was previously visited
+ DenseMap<MachineBasicBlock*, VisitedBBInfo>::iterator it
+ = VisitedBBs.find(MBB);
+ if (it != VisitedBBs.end()) {
+ VisitedBBInfo BBInfo = it->second;
+ Cycles += BBInfo.Cycles;
+ return BBInfo.HasReturn;
+ }
+
+ unsigned int CyclesToEnd = 0;
+
+ for (MachineBasicBlock::iterator MBBI = MBB->begin();
+ MBBI != MBB->end(); ++MBBI) {
+ MachineInstr *MI = MBBI;
+ // Mark basic blocks with a return instruction. Calls to other
+ // functions do not count because the called function will be padded,
+ // if necessary.
+ if (MI->isReturn() && !MI->isCall()) {
+ VisitedBBs[MBB] = VisitedBBInfo(true, CyclesToEnd);
+ Cycles += CyclesToEnd;
+ return true;
+ }
+
+ CyclesToEnd += TII->getInstrLatency(STI->getInstrItineraryData(), MI);
+ }
+
+ VisitedBBs[MBB] = VisitedBBInfo(false, CyclesToEnd);
+ Cycles += CyclesToEnd;
+ return false;
+}
+
+/// addPadding - Add the given number of NOOP instructions to the function
+/// just prior to the return at MBBI
+void PadShortFunc::addPadding(MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator &MBBI,
+ unsigned int NOOPsToAdd) {
+ DebugLoc DL = MBBI->getDebugLoc();
+
+ while (NOOPsToAdd-- > 0) {
+ BuildMI(*MBB, MBBI, DL, TII->get(X86::NOOP));
+ BuildMI(*MBB, MBBI, DL, TII->get(X86::NOOP));
+ }
+}
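+
+// Illustrative sketch, not part of this patch: the amount of padding emitted.
+// Each unit of (Threshold - Cycles) produces two NOOPs in the loop above,
+// plausibly because Atom can issue two instructions per cycle, so filling one
+// cycle takes a NOOP in each issue slot.
+#if 0
+static unsigned noopsToEmit(unsigned Threshold, unsigned Cycles) {
+  unsigned Units = Cycles < Threshold ? Threshold - Cycles : 0;
+  return 2 * Units; // Two NOOPs per padded cycle.
+}
+#endif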
diff --git a/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp b/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp
new file mode 100644
index 0000000..5840443
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp
@@ -0,0 +1,639 @@
+//===-- X86RegisterInfo.cpp - X86 Register Information --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 implementation of the TargetRegisterInfo class.
+// This file is responsible for the frame pointer elimination optimization
+// on X86.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86RegisterInfo.h"
+#include "X86FrameLowering.h"
+#include "X86InstrBuilder.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Type.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace llvm;
+
+#define GET_REGINFO_TARGET_DESC
+#include "X86GenRegisterInfo.inc"
+
+static cl::opt<bool>
+EnableBasePointer("x86-use-base-pointer", cl::Hidden, cl::init(true),
+ cl::desc("Enable use of a base pointer for complex stack frames"));
+
+X86RegisterInfo::X86RegisterInfo(const Triple &TT)
+ : X86GenRegisterInfo((TT.isArch64Bit() ? X86::RIP : X86::EIP),
+ X86_MC::getDwarfRegFlavour(TT, false),
+ X86_MC::getDwarfRegFlavour(TT, true),
+ (TT.isArch64Bit() ? X86::RIP : X86::EIP)) {
+ X86_MC::InitLLVM2SEHRegisterMapping(this);
+
+ // Cache some information.
+ Is64Bit = TT.isArch64Bit();
+ IsWin64 = Is64Bit && TT.isOSWindows();
+
+ // Use a callee-saved register as the base pointer. These registers must
+ // not conflict with any ABI requirements. For example, in 32-bit PIC mode,
+ // calls through the PLT require the GOT pointer to be in EBX.
+ if (Is64Bit) {
+ SlotSize = 8;
+ // This matches the simplified 32-bit pointer code in the data layout
+ // computation.
+ // FIXME: Should use the data layout?
+ bool Use64BitReg = TT.getEnvironment() != Triple::GNUX32;
+ StackPtr = Use64BitReg ? X86::RSP : X86::ESP;
+ FramePtr = Use64BitReg ? X86::RBP : X86::EBP;
+ BasePtr = Use64BitReg ? X86::RBX : X86::EBX;
+ } else {
+ SlotSize = 4;
+ StackPtr = X86::ESP;
+ FramePtr = X86::EBP;
+ BasePtr = X86::ESI;
+ }
+}
+
+bool
+X86RegisterInfo::trackLivenessAfterRegAlloc(const MachineFunction &MF) const {
+ // ExeDepsFixer and PostRAScheduler require liveness.
+ return true;
+}
+
+int
+X86RegisterInfo::getSEHRegNum(unsigned i) const {
+ return getEncodingValue(i);
+}
+
+const TargetRegisterClass *
+X86RegisterInfo::getSubClassWithSubReg(const TargetRegisterClass *RC,
+ unsigned Idx) const {
+ // The sub_8bit sub-register index is more constrained in 32-bit mode.
+ // It behaves just like the sub_8bit_hi index.
+ if (!Is64Bit && Idx == X86::sub_8bit)
+ Idx = X86::sub_8bit_hi;
+
+ // Forward to TableGen's default version.
+ return X86GenRegisterInfo::getSubClassWithSubReg(RC, Idx);
+}
+
+const TargetRegisterClass *
+X86RegisterInfo::getMatchingSuperRegClass(const TargetRegisterClass *A,
+ const TargetRegisterClass *B,
+ unsigned SubIdx) const {
+ // The sub_8bit sub-register index is more constrained in 32-bit mode.
+ if (!Is64Bit && SubIdx == X86::sub_8bit) {
+ A = X86GenRegisterInfo::getSubClassWithSubReg(A, X86::sub_8bit_hi);
+ if (!A)
+ return nullptr;
+ }
+ return X86GenRegisterInfo::getMatchingSuperRegClass(A, B, SubIdx);
+}
+
+const TargetRegisterClass *
+X86RegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC,
+ const MachineFunction &MF) const {
+ // Don't allow super-classes of GR8_NOREX. This class is only used after
+ // extracting sub_8bit_hi sub-registers. The H sub-registers cannot be copied
+ // to the full GR8 register class in 64-bit mode, so we cannot allow
+ // register class inflation.
+ //
+ // The GR8_NOREX class is always used in a way that won't be constrained to a
+ // sub-class, so sub-classes like GR8_ABCD_L are allowed to expand to the
+ // full GR8 class.
+ if (RC == &X86::GR8_NOREXRegClass)
+ return RC;
+
+ const TargetRegisterClass *Super = RC;
+ TargetRegisterClass::sc_iterator I = RC->getSuperClasses();
+ do {
+ switch (Super->getID()) {
+ case X86::GR8RegClassID:
+ case X86::GR16RegClassID:
+ case X86::GR32RegClassID:
+ case X86::GR64RegClassID:
+ case X86::FR32RegClassID:
+ case X86::FR64RegClassID:
+ case X86::RFP32RegClassID:
+ case X86::RFP64RegClassID:
+ case X86::RFP80RegClassID:
+ case X86::VR128RegClassID:
+ case X86::VR256RegClassID:
+ // Don't return a super-class that would shrink the spill size.
+ // That can happen with the vector and float classes.
+ if (Super->getSize() == RC->getSize())
+ return Super;
+ }
+ Super = *I++;
+ } while (Super);
+ return RC;
+}
+
+const TargetRegisterClass *
+X86RegisterInfo::getPointerRegClass(const MachineFunction &MF,
+ unsigned Kind) const {
+ const X86Subtarget &Subtarget = MF.getSubtarget<X86Subtarget>();
+ switch (Kind) {
+ default: llvm_unreachable("Unexpected Kind in getPointerRegClass!");
+ case 0: // Normal GPRs.
+ if (Subtarget.isTarget64BitLP64())
+ return &X86::GR64RegClass;
+ return &X86::GR32RegClass;
+ case 1: // Normal GPRs except the stack pointer (for encoding reasons).
+ if (Subtarget.isTarget64BitLP64())
+ return &X86::GR64_NOSPRegClass;
+ return &X86::GR32_NOSPRegClass;
+ case 2: // NOREX GPRs.
+ if (Subtarget.isTarget64BitLP64())
+ return &X86::GR64_NOREXRegClass;
+ return &X86::GR32_NOREXRegClass;
+ case 3: // NOREX GPRs except the stack pointer (for encoding reasons).
+ if (Subtarget.isTarget64BitLP64())
+ return &X86::GR64_NOREX_NOSPRegClass;
+ return &X86::GR32_NOREX_NOSPRegClass;
+ case 4: // Available for tailcall (not callee-saved GPRs).
+ return getGPRsForTailCall(MF);
+ }
+}
+
+const TargetRegisterClass *
+X86RegisterInfo::getGPRsForTailCall(const MachineFunction &MF) const {
+ const Function *F = MF.getFunction();
+ if (IsWin64 || (F && F->getCallingConv() == CallingConv::X86_64_Win64))
+ return &X86::GR64_TCW64RegClass;
+ else if (Is64Bit)
+ return &X86::GR64_TCRegClass;
+
+ bool hasHipeCC = (F ? F->getCallingConv() == CallingConv::HiPE : false);
+ if (hasHipeCC)
+ return &X86::GR32RegClass;
+ return &X86::GR32_TCRegClass;
+}
+
+const TargetRegisterClass *
+X86RegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const {
+ if (RC == &X86::CCRRegClass) {
+ if (Is64Bit)
+ return &X86::GR64RegClass;
+ else
+ return &X86::GR32RegClass;
+ }
+ return RC;
+}
+
+unsigned
+X86RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
+ MachineFunction &MF) const {
+ const X86FrameLowering *TFI = getFrameLowering(MF);
+
+ unsigned FPDiff = TFI->hasFP(MF) ? 1 : 0;
+ switch (RC->getID()) {
+ default:
+ return 0;
+ case X86::GR32RegClassID:
+ return 4 - FPDiff;
+ case X86::GR64RegClassID:
+ return 12 - FPDiff;
+ case X86::VR128RegClassID:
+ return Is64Bit ? 10 : 4;
+ case X86::VR64RegClassID:
+ return 4;
+ }
+}
+
+const MCPhysReg *
+X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+ assert(MF && "MachineFunction required");
+ const X86Subtarget &Subtarget = MF->getSubtarget<X86Subtarget>();
+ bool HasSSE = Subtarget.hasSSE1();
+ bool HasAVX = Subtarget.hasAVX();
+ bool HasAVX512 = Subtarget.hasAVX512();
+ bool CallsEHReturn = MF->getMMI().callsEHReturn();
+
+ switch (MF->getFunction()->getCallingConv()) {
+ case CallingConv::GHC:
+ case CallingConv::HiPE:
+ return CSR_NoRegs_SaveList;
+ case CallingConv::AnyReg:
+ if (HasAVX)
+ return CSR_64_AllRegs_AVX_SaveList;
+ return CSR_64_AllRegs_SaveList;
+ case CallingConv::PreserveMost:
+ return CSR_64_RT_MostRegs_SaveList;
+ case CallingConv::PreserveAll:
+ if (HasAVX)
+ return CSR_64_RT_AllRegs_AVX_SaveList;
+ return CSR_64_RT_AllRegs_SaveList;
+ case CallingConv::CXX_FAST_TLS:
+ if (Is64Bit)
+ return CSR_64_TLS_Darwin_SaveList;
+ break;
+ case CallingConv::Intel_OCL_BI: {
+ if (HasAVX512 && IsWin64)
+ return CSR_Win64_Intel_OCL_BI_AVX512_SaveList;
+ if (HasAVX512 && Is64Bit)
+ return CSR_64_Intel_OCL_BI_AVX512_SaveList;
+ if (HasAVX && IsWin64)
+ return CSR_Win64_Intel_OCL_BI_AVX_SaveList;
+ if (HasAVX && Is64Bit)
+ return CSR_64_Intel_OCL_BI_AVX_SaveList;
+ if (!HasAVX && !IsWin64 && Is64Bit)
+ return CSR_64_Intel_OCL_BI_SaveList;
+ break;
+ }
+ case CallingConv::HHVM:
+ return CSR_64_HHVM_SaveList;
+ case CallingConv::Cold:
+ if (Is64Bit)
+ return CSR_64_MostRegs_SaveList;
+ break;
+ case CallingConv::X86_64_Win64:
+ return CSR_Win64_SaveList;
+ case CallingConv::X86_64_SysV:
+ if (CallsEHReturn)
+ return CSR_64EHRet_SaveList;
+ return CSR_64_SaveList;
+ case CallingConv::X86_INTR:
+ if (Is64Bit) {
+ if (HasAVX)
+ return CSR_64_AllRegs_AVX_SaveList;
+ else
+ return CSR_64_AllRegs_SaveList;
+ } else {
+ if (HasSSE)
+ return CSR_32_AllRegs_SSE_SaveList;
+ else
+ return CSR_32_AllRegs_SaveList;
+ }
+ default:
+ break;
+ }
+
+ if (Is64Bit) {
+ if (IsWin64)
+ return CSR_Win64_SaveList;
+ if (CallsEHReturn)
+ return CSR_64EHRet_SaveList;
+ return CSR_64_SaveList;
+ }
+ if (CallsEHReturn)
+ return CSR_32EHRet_SaveList;
+ return CSR_32_SaveList;
+}
+
+const uint32_t *
+X86RegisterInfo::getCallPreservedMask(const MachineFunction &MF,
+ CallingConv::ID CC) const {
+ const X86Subtarget &Subtarget = MF.getSubtarget<X86Subtarget>();
+ bool HasSSE = Subtarget.hasSSE1();
+ bool HasAVX = Subtarget.hasAVX();
+ bool HasAVX512 = Subtarget.hasAVX512();
+
+ switch (CC) {
+ case CallingConv::GHC:
+ case CallingConv::HiPE:
+ return CSR_NoRegs_RegMask;
+ case CallingConv::AnyReg:
+ if (HasAVX)
+ return CSR_64_AllRegs_AVX_RegMask;
+ return CSR_64_AllRegs_RegMask;
+ case CallingConv::PreserveMost:
+ return CSR_64_RT_MostRegs_RegMask;
+ case CallingConv::PreserveAll:
+ if (HasAVX)
+ return CSR_64_RT_AllRegs_AVX_RegMask;
+ return CSR_64_RT_AllRegs_RegMask;
+ case CallingConv::CXX_FAST_TLS:
+ if (Is64Bit)
+ return CSR_64_TLS_Darwin_RegMask;
+ break;
+ case CallingConv::Intel_OCL_BI: {
+ if (HasAVX512 && IsWin64)
+ return CSR_Win64_Intel_OCL_BI_AVX512_RegMask;
+ if (HasAVX512 && Is64Bit)
+ return CSR_64_Intel_OCL_BI_AVX512_RegMask;
+ if (HasAVX && IsWin64)
+ return CSR_Win64_Intel_OCL_BI_AVX_RegMask;
+ if (HasAVX && Is64Bit)
+ return CSR_64_Intel_OCL_BI_AVX_RegMask;
+ if (!HasAVX && !IsWin64 && Is64Bit)
+ return CSR_64_Intel_OCL_BI_RegMask;
+ break;
+ }
+ case CallingConv::HHVM:
+ return CSR_64_HHVM_RegMask;
+ case CallingConv::Cold:
+ if (Is64Bit)
+ return CSR_64_MostRegs_RegMask;
+ break;
+ case CallingConv::X86_64_Win64:
+ return CSR_Win64_RegMask;
+ case CallingConv::X86_64_SysV:
+ return CSR_64_RegMask;
+ case CallingConv::X86_INTR:
+ if (Is64Bit) {
+ if (HasAVX)
+ return CSR_64_AllRegs_AVX_RegMask;
+ else
+ return CSR_64_AllRegs_RegMask;
+ } else {
+ if (HasSSE)
+ return CSR_32_AllRegs_SSE_RegMask;
+ else
+ return CSR_32_AllRegs_RegMask;
+ }
+ default:
+ break;
+ }
+
+ // Unlike getCalleeSavedRegs(), we don't have MMI so we can't check
+ // callsEHReturn().
+ if (Is64Bit) {
+ if (IsWin64)
+ return CSR_Win64_RegMask;
+ return CSR_64_RegMask;
+ }
+ return CSR_32_RegMask;
+}
+
+const uint32_t*
+X86RegisterInfo::getNoPreservedMask() const {
+ return CSR_NoRegs_RegMask;
+}
+
+const uint32_t *X86RegisterInfo::getDarwinTLSCallPreservedMask() const {
+ return CSR_64_TLS_Darwin_RegMask;
+}
+
+BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+ BitVector Reserved(getNumRegs());
+ const X86FrameLowering *TFI = getFrameLowering(MF);
+
+ // Set the stack-pointer register and its aliases as reserved.
+ for (MCSubRegIterator I(X86::RSP, this, /*IncludeSelf=*/true); I.isValid();
+ ++I)
+ Reserved.set(*I);
+
+ // Set the instruction pointer register and its aliases as reserved.
+ for (MCSubRegIterator I(X86::RIP, this, /*IncludeSelf=*/true); I.isValid();
+ ++I)
+ Reserved.set(*I);
+
+ // Set the frame-pointer register and its aliases as reserved if needed.
+ if (TFI->hasFP(MF)) {
+ for (MCSubRegIterator I(X86::RBP, this, /*IncludeSelf=*/true); I.isValid();
+ ++I)
+ Reserved.set(*I);
+ }
+
+ // Set the base-pointer register and its aliases as reserved if needed.
+ if (hasBasePointer(MF)) {
+ CallingConv::ID CC = MF.getFunction()->getCallingConv();
+ const uint32_t *RegMask = getCallPreservedMask(MF, CC);
+ if (MachineOperand::clobbersPhysReg(RegMask, getBaseRegister()))
+ report_fatal_error(
+ "Stack realignment in presence of dynamic allocas is not supported "
+ "with this calling convention.");
+
+ unsigned BasePtr = getX86SubSuperRegister(getBaseRegister(), 64);
+ for (MCSubRegIterator I(BasePtr, this, /*IncludeSelf=*/true);
+ I.isValid(); ++I)
+ Reserved.set(*I);
+ }
+
+ // Mark the segment registers as reserved.
+ Reserved.set(X86::CS);
+ Reserved.set(X86::SS);
+ Reserved.set(X86::DS);
+ Reserved.set(X86::ES);
+ Reserved.set(X86::FS);
+ Reserved.set(X86::GS);
+
+ // Mark the floating point stack registers as reserved.
+ for (unsigned n = 0; n != 8; ++n)
+ Reserved.set(X86::ST0 + n);
+
+ // Reserve the registers that only exist in 64-bit mode.
+ if (!Is64Bit) {
+ // These 8-bit registers are part of the x86-64 extension even though their
+ // super-registers are old 32-bits.
+ Reserved.set(X86::SIL);
+ Reserved.set(X86::DIL);
+ Reserved.set(X86::BPL);
+ Reserved.set(X86::SPL);
+
+ for (unsigned n = 0; n != 8; ++n) {
+ // R8, R9, ...
+ for (MCRegAliasIterator AI(X86::R8 + n, this, true); AI.isValid(); ++AI)
+ Reserved.set(*AI);
+
+ // XMM8, XMM9, ...
+ for (MCRegAliasIterator AI(X86::XMM8 + n, this, true); AI.isValid(); ++AI)
+ Reserved.set(*AI);
+ }
+ }
+ if (!Is64Bit || !MF.getSubtarget<X86Subtarget>().hasAVX512()) {
+ for (unsigned n = 16; n != 32; ++n) {
+ for (MCRegAliasIterator AI(X86::XMM0 + n, this, true); AI.isValid(); ++AI)
+ Reserved.set(*AI);
+ }
+ }
+
+ return Reserved;
+}
+
+void X86RegisterInfo::adjustStackMapLiveOutMask(uint32_t *Mask) const {
+ // Check if the EFLAGS register is marked as live-out. This shouldn't happen,
+ // because the calling convention defines the EFLAGS register as NOT
+ // preserved.
+ //
+ // Unfortunately, EFLAGS can show up as live-out after branch folding. Add
+ // an assert to track this and clear the register afterwards to avoid
+ // unnecessary crashes during release builds.
+ assert(!(Mask[X86::EFLAGS / 32] & (1U << (X86::EFLAGS % 32))) &&
+ "EFLAGS are not live-out from a patchpoint.");
+
+ // Also clean other registers that don't need preserving (IP).
+ for (auto Reg : {X86::EFLAGS, X86::RIP, X86::EIP, X86::IP})
+ Mask[Reg / 32] &= ~(1U << (Reg % 32));
+}
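+
+// Illustrative sketch, not part of this patch: the regmask layout assumed
+// above. Liveness is packed 32 registers per uint32_t, so register R lives at
+// bit (R % 32) of word (R / 32) and clearing it is a single and-not.
+#if 0
+static void clearRegFromMask(uint32_t *Mask, unsigned Reg) {
+  Mask[Reg / 32] &= ~(1U << (Reg % 32));
+}
+static bool regInMask(const uint32_t *Mask, unsigned Reg) {
+  return (Mask[Reg / 32] >> (Reg % 32)) & 1;
+}
+#endif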
+
+//===----------------------------------------------------------------------===//
+// Stack Frame Processing methods
+//===----------------------------------------------------------------------===//
+
+static bool CantUseSP(const MachineFrameInfo *MFI) {
+ return MFI->hasVarSizedObjects() || MFI->hasOpaqueSPAdjustment();
+}
+
+bool X86RegisterInfo::hasBasePointer(const MachineFunction &MF) const {
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+
+ if (!EnableBasePointer)
+ return false;
+
+ // When we need stack realignment, we can't address the stack from the frame
+ // pointer. When we have dynamic allocas or stack-adjusting inline asm, we
+ // can't address variables from the stack pointer. MS inline asm can
+ // reference locals while also adjusting the stack pointer. When we can't
+ // use both the SP and the FP, we need a separate base pointer register.
+ bool CantUseFP = needsStackRealignment(MF);
+ return CantUseFP && CantUseSP(MFI);
+}
+
+bool X86RegisterInfo::canRealignStack(const MachineFunction &MF) const {
+ if (!TargetRegisterInfo::canRealignStack(MF))
+ return false;
+
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ const MachineRegisterInfo *MRI = &MF.getRegInfo();
+
+ // Stack realignment requires a frame pointer. If we already started
+ // register allocation with frame pointer elimination, it is too late now.
+ if (!MRI->canReserveReg(FramePtr))
+ return false;
+
+ // If a base pointer is necessary, check that it isn't too late to reserve
+ // it.
+ if (CantUseSP(MFI))
+ return MRI->canReserveReg(BasePtr);
+ return true;
+}
+
+bool X86RegisterInfo::hasReservedSpillSlot(const MachineFunction &MF,
+ unsigned Reg, int &FrameIdx) const {
+ // Since X86 defines assignCalleeSavedSpillSlots, which always returns true,
+ // this function is neither used nor tested.
+ llvm_unreachable("Unused function on X86. Otherwise need a test case.");
+}
+
+void
+X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+ int SPAdj, unsigned FIOperandNum,
+ RegScavenger *RS) const {
+ MachineInstr &MI = *II;
+ MachineFunction &MF = *MI.getParent()->getParent();
+ const X86FrameLowering *TFI = getFrameLowering(MF);
+ int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
+ unsigned BasePtr;
+
+ unsigned Opc = MI.getOpcode();
+ bool AfterFPPop = Opc == X86::TAILJMPm64 || Opc == X86::TAILJMPm ||
+ Opc == X86::TCRETURNmi || Opc == X86::TCRETURNmi64;
+
+ if (hasBasePointer(MF))
+ BasePtr = (FrameIndex < 0 ? FramePtr : getBaseRegister());
+ else if (needsStackRealignment(MF))
+ BasePtr = (FrameIndex < 0 ? FramePtr : StackPtr);
+ else if (AfterFPPop)
+ BasePtr = StackPtr;
+ else
+ BasePtr = (TFI->hasFP(MF) ? FramePtr : StackPtr);
+
+ // LOCAL_ESCAPE uses a single offset, with no register. It only works in the
+ // simple FP case, and doesn't work with stack realignment. On 32-bit, the
+ // offset is from the traditional base pointer location. On 64-bit, the
+ // offset is from the SP at the end of the prologue, not the FP location. This
+ // matches the behavior of llvm.frameaddress.
+ unsigned IgnoredFrameReg;
+ if (Opc == TargetOpcode::LOCAL_ESCAPE) {
+ MachineOperand &FI = MI.getOperand(FIOperandNum);
+ int Offset;
+ Offset = TFI->getFrameIndexReference(MF, FrameIndex, IgnoredFrameReg);
+ FI.ChangeToImmediate(Offset);
+ return;
+ }
+
+ // For LEA64_32r, when BasePtr is 32 bits (X32) we can use the full-size
+ // 64-bit register as the source operand; the semantics are the same and the
+ // destination is still 32 bits. This saves one byte per LEA since the 0x67
+ // address-size prefix is avoided.
+ if (Opc == X86::LEA64_32r && X86::GR32RegClass.contains(BasePtr))
+ BasePtr = getX86SubSuperRegister(BasePtr, 64);
+
+ // This must be part of a five-operand memory reference. Replace the
+ // FrameIndex with the base register chosen above.
+ MI.getOperand(FIOperandNum).ChangeToRegister(BasePtr, false);
+
+ // Now add the frame object offset to the offset from the base register.
+ int FIOffset;
+ if (AfterFPPop) {
+ // Tail call jmp happens after FP is popped.
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ FIOffset = MFI->getObjectOffset(FrameIndex) - TFI->getOffsetOfLocalArea();
+ } else
+ FIOffset = TFI->getFrameIndexReference(MF, FrameIndex, IgnoredFrameReg);
+
+ if (BasePtr == StackPtr)
+ FIOffset += SPAdj;
+
+ // The frame index format for stackmaps and patchpoints is different from the
+ // X86 format. It only has a FI and an offset.
+ if (Opc == TargetOpcode::STACKMAP || Opc == TargetOpcode::PATCHPOINT) {
+ assert(BasePtr == FramePtr && "Expected the FP as base register");
+ int64_t Offset = MI.getOperand(FIOperandNum + 1).getImm() + FIOffset;
+ MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset);
+ return;
+ }
+
+ if (MI.getOperand(FIOperandNum+3).isImm()) {
+ // Offset is a 32-bit integer.
+ int Imm = (int)(MI.getOperand(FIOperandNum + 3).getImm());
+ int Offset = FIOffset + Imm;
+ assert((!Is64Bit || isInt<32>((long long)FIOffset + Imm)) &&
+ "Requesting 64-bit offset in 32-bit immediate!");
+ MI.getOperand(FIOperandNum + 3).ChangeToImmediate(Offset);
+ } else {
+ // Offset is symbolic. This is extremely rare.
+ uint64_t Offset = FIOffset +
+ (uint64_t)MI.getOperand(FIOperandNum+3).getOffset();
+ MI.getOperand(FIOperandNum + 3).setOffset(Offset);
+ }
+}
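+
+// Illustrative sketch, not part of this patch: the displacement folding in
+// eliminateFrameIndex. The final immediate is the frame-object offset plus
+// the original displacement, plus the pending SP adjustment when the base
+// register is the stack pointer.
+#if 0
+static int64_t foldFrameIndexOffset(int64_t FIOffset, int64_t OrigImm,
+                                    int SPAdj, bool BaseIsSP) {
+  return FIOffset + OrigImm + (BaseIsSP ? SPAdj : 0);
+}
+#endif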
+
+unsigned X86RegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+ const X86FrameLowering *TFI = getFrameLowering(MF);
+ return TFI->hasFP(MF) ? FramePtr : StackPtr;
+}
+
+unsigned
+X86RegisterInfo::getPtrSizedFrameRegister(const MachineFunction &MF) const {
+ const X86Subtarget &Subtarget = MF.getSubtarget<X86Subtarget>();
+ unsigned FrameReg = getFrameRegister(MF);
+ if (Subtarget.isTarget64BitILP32())
+ FrameReg = getX86SubSuperRegister(FrameReg, 32);
+ return FrameReg;
+}
+
+unsigned llvm::get512BitSuperRegister(unsigned Reg) {
+ if (Reg >= X86::XMM0 && Reg <= X86::XMM31)
+ return X86::ZMM0 + (Reg - X86::XMM0);
+ if (Reg >= X86::YMM0 && Reg <= X86::YMM31)
+ return X86::ZMM0 + (Reg - X86::YMM0);
+ if (Reg >= X86::ZMM0 && Reg <= X86::ZMM31)
+ return Reg;
+ llvm_unreachable("Unexpected SIMD register");
+}
diff --git a/contrib/llvm/lib/Target/X86/X86RegisterInfo.h b/contrib/llvm/lib/Target/X86/X86RegisterInfo.h
new file mode 100644
index 0000000..f014c8f
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86RegisterInfo.h
@@ -0,0 +1,143 @@
+//===-- X86RegisterInfo.h - X86 Register Information Impl -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86REGISTERINFO_H
+#define LLVM_LIB_TARGET_X86_X86REGISTERINFO_H
+
+#include "llvm/Target/TargetRegisterInfo.h"
+
+#define GET_REGINFO_HEADER
+#include "X86GenRegisterInfo.inc"
+
+namespace llvm {
+ class Triple;
+
+class X86RegisterInfo final : public X86GenRegisterInfo {
+private:
+  /// Is64Bit - Is the target 64-bit.
+  ///
+  bool Is64Bit;
+
+  /// IsWin64 - Is the target one of the win64 flavours.
+  ///
+  bool IsWin64;
+
+ /// SlotSize - Stack slot size in bytes.
+ ///
+ unsigned SlotSize;
+
+ /// StackPtr - X86 physical register used as stack ptr.
+ ///
+ unsigned StackPtr;
+
+ /// FramePtr - X86 physical register used as frame ptr.
+ ///
+ unsigned FramePtr;
+
+ /// BasePtr - X86 physical register used as a base ptr in complex stack
+ /// frames. I.e., when we need a 3rd base, not just SP and FP, due to
+ /// variable size stack objects.
+ unsigned BasePtr;
+
+public:
+ X86RegisterInfo(const Triple &TT);
+
+ // FIXME: This should be tablegen'd like getDwarfRegNum is
+ int getSEHRegNum(unsigned i) const;
+
+ /// Code Generation virtual methods...
+ ///
+ bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override;
+
+ /// getMatchingSuperRegClass - Return a subclass of the specified register
+ /// class A so that each register in it has a sub-register of the
+ /// specified sub-register index which is in the specified register class B.
+ const TargetRegisterClass *
+ getMatchingSuperRegClass(const TargetRegisterClass *A,
+ const TargetRegisterClass *B,
+ unsigned Idx) const override;
+
+ const TargetRegisterClass *
+ getSubClassWithSubReg(const TargetRegisterClass *RC,
+ unsigned Idx) const override;
+
+ const TargetRegisterClass *
+ getLargestLegalSuperClass(const TargetRegisterClass *RC,
+ const MachineFunction &MF) const override;
+
+ /// getPointerRegClass - Returns a TargetRegisterClass used for pointer
+ /// values.
+ const TargetRegisterClass *
+ getPointerRegClass(const MachineFunction &MF,
+ unsigned Kind = 0) const override;
+
+  /// getCrossCopyRegClass - Returns a legal register class to copy a register
+  /// in the specified class to or from. Returns NULL if it is possible to copy
+  /// between two registers of the specified class.
+ const TargetRegisterClass *
+ getCrossCopyRegClass(const TargetRegisterClass *RC) const override;
+
+ /// getGPRsForTailCall - Returns a register class with registers that can be
+ /// used in forming tail calls.
+ const TargetRegisterClass *
+ getGPRsForTailCall(const MachineFunction &MF) const;
+
+ unsigned getRegPressureLimit(const TargetRegisterClass *RC,
+ MachineFunction &MF) const override;
+
+ /// getCalleeSavedRegs - Return a null-terminated list of all of the
+ /// callee-save registers on this target.
+ const MCPhysReg *
+ getCalleeSavedRegs(const MachineFunction* MF) const override;
+ const uint32_t *getCallPreservedMask(const MachineFunction &MF,
+ CallingConv::ID) const override;
+ const uint32_t *getNoPreservedMask() const override;
+
+ // Calls involved in thread-local variable lookup save more registers than
+ // normal calls, so they need a different mask to represent this.
+ const uint32_t *getDarwinTLSCallPreservedMask() const;
+
+  /// getReservedRegs - Returns a bitset indexed by physical register number
+  /// indicating if a register is a special register that has particular uses
+  /// and should be considered unavailable at all times, e.g. SP, RA. This is
+  /// used by the register scavenger to determine what registers are free.
+ BitVector getReservedRegs(const MachineFunction &MF) const override;
+
+ void adjustStackMapLiveOutMask(uint32_t *Mask) const override;
+
+ bool hasBasePointer(const MachineFunction &MF) const;
+
+ bool canRealignStack(const MachineFunction &MF) const override;
+
+ bool hasReservedSpillSlot(const MachineFunction &MF, unsigned Reg,
+ int &FrameIdx) const override;
+
+ void eliminateFrameIndex(MachineBasicBlock::iterator MI,
+ int SPAdj, unsigned FIOperandNum,
+ RegScavenger *RS = nullptr) const override;
+
+ // Debug information queries.
+ unsigned getFrameRegister(const MachineFunction &MF) const override;
+ unsigned getPtrSizedFrameRegister(const MachineFunction &MF) const;
+ unsigned getStackRegister() const { return StackPtr; }
+ unsigned getBaseRegister() const { return BasePtr; }
+  // FIXME: Move to FrameInfo
+ unsigned getSlotSize() const { return SlotSize; }
+};
+
+// get512BitSuperRegister - X86 utility - returns the 512-bit super register
+unsigned get512BitSuperRegister(unsigned Reg);
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86RegisterInfo.td b/contrib/llvm/lib/Target/X86/X86RegisterInfo.td
new file mode 100644
index 0000000..56f0d93
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86RegisterInfo.td
@@ -0,0 +1,497 @@
+//===- X86RegisterInfo.td - Describe the X86 Register File --*- tablegen -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 Register file, defining the registers themselves,
+// aliases between the registers, and the register classes built out of the
+// registers.
+//
+//===----------------------------------------------------------------------===//
+
+class X86Reg<string n, bits<16> Enc, list<Register> subregs = []> : Register<n> {
+ let Namespace = "X86";
+ let HWEncoding = Enc;
+ let SubRegs = subregs;
+}
+
+// Subregister indices.
+let Namespace = "X86" in {
+ def sub_8bit : SubRegIndex<8>;
+ def sub_8bit_hi : SubRegIndex<8, 8>;
+ def sub_16bit : SubRegIndex<16>;
+ def sub_32bit : SubRegIndex<32>;
+ def sub_xmm : SubRegIndex<128>;
+ def sub_ymm : SubRegIndex<256>;
+}
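+
+// As a rough illustration of the offsets above: within AX, AL is sub_8bit
+// (bits 7..0) and AH is sub_8bit_hi (bits 15..8); sub_xmm and sub_ymm name
+// the low 128 and 256 bits of a wider vector register.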
+
+//===----------------------------------------------------------------------===//
+// Register definitions...
+//
+
+// In the register alias definitions below, we define which registers alias
+// which others. We only specify which registers the small registers alias,
+// because the register file generator is smart enough to figure out that
+// AL aliases AX if we tell it that AX aliases AL (for example).
+
+// Dwarf numbering is different for 32-bit and 64-bit, and there are
+// variations by target as well. Currently the first entry is for X86-64,
+// the second for EH on X86-32/Darwin, and the third is the 'generic' one
+// (X86-32/Linux and debug information on X86-32/Darwin).
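+// (In these lists, -2 marks the number as invalid for that mode/flavour;
+// e.g. EAX below carries DwarfRegNum<[-2, 0, 0]>: no number of its own in
+// 64-bit mode, where RAX holds it, and 0 in both 32-bit numbering schemes.)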
+
+// 8-bit registers
+// Low registers
+def AL : X86Reg<"al", 0>;
+def DL : X86Reg<"dl", 2>;
+def CL : X86Reg<"cl", 1>;
+def BL : X86Reg<"bl", 3>;
+
+// High registers. On x86-64, these cannot be used in any instruction
+// with a REX prefix.
+def AH : X86Reg<"ah", 4>;
+def DH : X86Reg<"dh", 6>;
+def CH : X86Reg<"ch", 5>;
+def BH : X86Reg<"bh", 7>;
+
+// X86-64 only, requires REX.
+let CostPerUse = 1 in {
+def SIL : X86Reg<"sil", 6>;
+def DIL : X86Reg<"dil", 7>;
+def BPL : X86Reg<"bpl", 5>;
+def SPL : X86Reg<"spl", 4>;
+def R8B : X86Reg<"r8b", 8>;
+def R9B : X86Reg<"r9b", 9>;
+def R10B : X86Reg<"r10b", 10>;
+def R11B : X86Reg<"r11b", 11>;
+def R12B : X86Reg<"r12b", 12>;
+def R13B : X86Reg<"r13b", 13>;
+def R14B : X86Reg<"r14b", 14>;
+def R15B : X86Reg<"r15b", 15>;
+}
+
+// 16-bit registers
+let SubRegIndices = [sub_8bit, sub_8bit_hi], CoveredBySubRegs = 1 in {
+def AX : X86Reg<"ax", 0, [AL,AH]>;
+def DX : X86Reg<"dx", 2, [DL,DH]>;
+def CX : X86Reg<"cx", 1, [CL,CH]>;
+def BX : X86Reg<"bx", 3, [BL,BH]>;
+}
+let SubRegIndices = [sub_8bit] in {
+def SI : X86Reg<"si", 6, [SIL]>;
+def DI : X86Reg<"di", 7, [DIL]>;
+def BP : X86Reg<"bp", 5, [BPL]>;
+def SP : X86Reg<"sp", 4, [SPL]>;
+}
+def IP : X86Reg<"ip", 0>;
+
+// X86-64 only, requires REX.
+let SubRegIndices = [sub_8bit], CostPerUse = 1 in {
+def R8W : X86Reg<"r8w", 8, [R8B]>;
+def R9W : X86Reg<"r9w", 9, [R9B]>;
+def R10W : X86Reg<"r10w", 10, [R10B]>;
+def R11W : X86Reg<"r11w", 11, [R11B]>;
+def R12W : X86Reg<"r12w", 12, [R12B]>;
+def R13W : X86Reg<"r13w", 13, [R13B]>;
+def R14W : X86Reg<"r14w", 14, [R14B]>;
+def R15W : X86Reg<"r15w", 15, [R15B]>;
+}
+
+// 32-bit registers
+let SubRegIndices = [sub_16bit] in {
+def EAX : X86Reg<"eax", 0, [AX]>, DwarfRegNum<[-2, 0, 0]>;
+def EDX : X86Reg<"edx", 2, [DX]>, DwarfRegNum<[-2, 2, 2]>;
+def ECX : X86Reg<"ecx", 1, [CX]>, DwarfRegNum<[-2, 1, 1]>;
+def EBX : X86Reg<"ebx", 3, [BX]>, DwarfRegNum<[-2, 3, 3]>;
+def ESI : X86Reg<"esi", 6, [SI]>, DwarfRegNum<[-2, 6, 6]>;
+def EDI : X86Reg<"edi", 7, [DI]>, DwarfRegNum<[-2, 7, 7]>;
+def EBP : X86Reg<"ebp", 5, [BP]>, DwarfRegNum<[-2, 4, 5]>;
+def ESP : X86Reg<"esp", 4, [SP]>, DwarfRegNum<[-2, 5, 4]>;
+def EIP : X86Reg<"eip", 0, [IP]>, DwarfRegNum<[-2, 8, 8]>;
+
+// X86-64 only, requires REX
+let CostPerUse = 1 in {
+def R8D : X86Reg<"r8d", 8, [R8W]>;
+def R9D : X86Reg<"r9d", 9, [R9W]>;
+def R10D : X86Reg<"r10d", 10, [R10W]>;
+def R11D : X86Reg<"r11d", 11, [R11W]>;
+def R12D : X86Reg<"r12d", 12, [R12W]>;
+def R13D : X86Reg<"r13d", 13, [R13W]>;
+def R14D : X86Reg<"r14d", 14, [R14W]>;
+def R15D : X86Reg<"r15d", 15, [R15W]>;
+}}
+
+// 64-bit registers, X86-64 only
+let SubRegIndices = [sub_32bit] in {
+def RAX : X86Reg<"rax", 0, [EAX]>, DwarfRegNum<[0, -2, -2]>;
+def RDX : X86Reg<"rdx", 2, [EDX]>, DwarfRegNum<[1, -2, -2]>;
+def RCX : X86Reg<"rcx", 1, [ECX]>, DwarfRegNum<[2, -2, -2]>;
+def RBX : X86Reg<"rbx", 3, [EBX]>, DwarfRegNum<[3, -2, -2]>;
+def RSI : X86Reg<"rsi", 6, [ESI]>, DwarfRegNum<[4, -2, -2]>;
+def RDI : X86Reg<"rdi", 7, [EDI]>, DwarfRegNum<[5, -2, -2]>;
+def RBP : X86Reg<"rbp", 5, [EBP]>, DwarfRegNum<[6, -2, -2]>;
+def RSP : X86Reg<"rsp", 4, [ESP]>, DwarfRegNum<[7, -2, -2]>;
+
+// These also require REX.
+let CostPerUse = 1 in {
+def R8 : X86Reg<"r8", 8, [R8D]>, DwarfRegNum<[ 8, -2, -2]>;
+def R9 : X86Reg<"r9", 9, [R9D]>, DwarfRegNum<[ 9, -2, -2]>;
+def R10 : X86Reg<"r10", 10, [R10D]>, DwarfRegNum<[10, -2, -2]>;
+def R11 : X86Reg<"r11", 11, [R11D]>, DwarfRegNum<[11, -2, -2]>;
+def R12 : X86Reg<"r12", 12, [R12D]>, DwarfRegNum<[12, -2, -2]>;
+def R13 : X86Reg<"r13", 13, [R13D]>, DwarfRegNum<[13, -2, -2]>;
+def R14 : X86Reg<"r14", 14, [R14D]>, DwarfRegNum<[14, -2, -2]>;
+def R15 : X86Reg<"r15", 15, [R15D]>, DwarfRegNum<[15, -2, -2]>;
+def RIP : X86Reg<"rip", 0, [EIP]>, DwarfRegNum<[16, -2, -2]>;
+}}
+
+// MMX Registers. These are actually aliased to ST0 .. ST7
+def MM0 : X86Reg<"mm0", 0>, DwarfRegNum<[41, 29, 29]>;
+def MM1 : X86Reg<"mm1", 1>, DwarfRegNum<[42, 30, 30]>;
+def MM2 : X86Reg<"mm2", 2>, DwarfRegNum<[43, 31, 31]>;
+def MM3 : X86Reg<"mm3", 3>, DwarfRegNum<[44, 32, 32]>;
+def MM4 : X86Reg<"mm4", 4>, DwarfRegNum<[45, 33, 33]>;
+def MM5 : X86Reg<"mm5", 5>, DwarfRegNum<[46, 34, 34]>;
+def MM6 : X86Reg<"mm6", 6>, DwarfRegNum<[47, 35, 35]>;
+def MM7 : X86Reg<"mm7", 7>, DwarfRegNum<[48, 36, 36]>;
+
+// Pseudo Floating Point registers
+def FP0 : X86Reg<"fp0", 0>;
+def FP1 : X86Reg<"fp1", 0>;
+def FP2 : X86Reg<"fp2", 0>;
+def FP3 : X86Reg<"fp3", 0>;
+def FP4 : X86Reg<"fp4", 0>;
+def FP5 : X86Reg<"fp5", 0>;
+def FP6 : X86Reg<"fp6", 0>;
+def FP7 : X86Reg<"fp7", 0>;
+
+// XMM Registers, used by the various SSE instruction set extensions.
+def XMM0: X86Reg<"xmm0", 0>, DwarfRegNum<[17, 21, 21]>;
+def XMM1: X86Reg<"xmm1", 1>, DwarfRegNum<[18, 22, 22]>;
+def XMM2: X86Reg<"xmm2", 2>, DwarfRegNum<[19, 23, 23]>;
+def XMM3: X86Reg<"xmm3", 3>, DwarfRegNum<[20, 24, 24]>;
+def XMM4: X86Reg<"xmm4", 4>, DwarfRegNum<[21, 25, 25]>;
+def XMM5: X86Reg<"xmm5", 5>, DwarfRegNum<[22, 26, 26]>;
+def XMM6: X86Reg<"xmm6", 6>, DwarfRegNum<[23, 27, 27]>;
+def XMM7: X86Reg<"xmm7", 7>, DwarfRegNum<[24, 28, 28]>;
+
+// X86-64 only
+let CostPerUse = 1 in {
+def XMM8: X86Reg<"xmm8", 8>, DwarfRegNum<[25, -2, -2]>;
+def XMM9: X86Reg<"xmm9", 9>, DwarfRegNum<[26, -2, -2]>;
+def XMM10: X86Reg<"xmm10", 10>, DwarfRegNum<[27, -2, -2]>;
+def XMM11: X86Reg<"xmm11", 11>, DwarfRegNum<[28, -2, -2]>;
+def XMM12: X86Reg<"xmm12", 12>, DwarfRegNum<[29, -2, -2]>;
+def XMM13: X86Reg<"xmm13", 13>, DwarfRegNum<[30, -2, -2]>;
+def XMM14: X86Reg<"xmm14", 14>, DwarfRegNum<[31, -2, -2]>;
+def XMM15: X86Reg<"xmm15", 15>, DwarfRegNum<[32, -2, -2]>;
+
+def XMM16: X86Reg<"xmm16", 16>, DwarfRegNum<[60, -2, -2]>;
+def XMM17: X86Reg<"xmm17", 17>, DwarfRegNum<[61, -2, -2]>;
+def XMM18: X86Reg<"xmm18", 18>, DwarfRegNum<[62, -2, -2]>;
+def XMM19: X86Reg<"xmm19", 19>, DwarfRegNum<[63, -2, -2]>;
+def XMM20: X86Reg<"xmm20", 20>, DwarfRegNum<[64, -2, -2]>;
+def XMM21: X86Reg<"xmm21", 21>, DwarfRegNum<[65, -2, -2]>;
+def XMM22: X86Reg<"xmm22", 22>, DwarfRegNum<[66, -2, -2]>;
+def XMM23: X86Reg<"xmm23", 23>, DwarfRegNum<[67, -2, -2]>;
+def XMM24: X86Reg<"xmm24", 24>, DwarfRegNum<[68, -2, -2]>;
+def XMM25: X86Reg<"xmm25", 25>, DwarfRegNum<[69, -2, -2]>;
+def XMM26: X86Reg<"xmm26", 26>, DwarfRegNum<[70, -2, -2]>;
+def XMM27: X86Reg<"xmm27", 27>, DwarfRegNum<[71, -2, -2]>;
+def XMM28: X86Reg<"xmm28", 28>, DwarfRegNum<[72, -2, -2]>;
+def XMM29: X86Reg<"xmm29", 29>, DwarfRegNum<[73, -2, -2]>;
+def XMM30: X86Reg<"xmm30", 30>, DwarfRegNum<[74, -2, -2]>;
+def XMM31: X86Reg<"xmm31", 31>, DwarfRegNum<[75, -2, -2]>;
+
+} // CostPerUse
+
+// YMM0-15 registers, used by AVX instructions and
+// YMM16-31 registers, used by AVX-512 instructions.
+let SubRegIndices = [sub_xmm] in {
+ foreach Index = 0-31 in {
+ def YMM#Index : X86Reg<"ymm"#Index, Index, [!cast<X86Reg>("XMM"#Index)]>,
+ DwarfRegAlias<!cast<X86Reg>("XMM"#Index)>;
+ }
+}
+
+// ZMM Registers, used by AVX-512 instructions.
+let SubRegIndices = [sub_ymm] in {
+ foreach Index = 0-31 in {
+ def ZMM#Index : X86Reg<"zmm"#Index, Index, [!cast<X86Reg>("YMM"#Index)]>,
+ DwarfRegAlias<!cast<X86Reg>("XMM"#Index)>;
+ }
+}
+
+// Mask Registers, used by AVX-512 instructions.
+def K0 : X86Reg<"k0", 0>, DwarfRegNum<[118, -2, -2]>;
+def K1 : X86Reg<"k1", 1>, DwarfRegNum<[119, -2, -2]>;
+def K2 : X86Reg<"k2", 2>, DwarfRegNum<[120, -2, -2]>;
+def K3 : X86Reg<"k3", 3>, DwarfRegNum<[121, -2, -2]>;
+def K4 : X86Reg<"k4", 4>, DwarfRegNum<[122, -2, -2]>;
+def K5 : X86Reg<"k5", 5>, DwarfRegNum<[123, -2, -2]>;
+def K6 : X86Reg<"k6", 6>, DwarfRegNum<[124, -2, -2]>;
+def K7 : X86Reg<"k7", 7>, DwarfRegNum<[125, -2, -2]>;
+
+// Floating point stack registers. These don't map one-to-one to the FP
+// pseudo registers, but we still mark them as aliasing FP registers. That
+// way both kinds can be live without exceeding the stack depth. ST registers
+// are only live around inline assembly.
+def ST0 : X86Reg<"st(0)", 0>, DwarfRegNum<[33, 12, 11]>;
+def ST1 : X86Reg<"st(1)", 1>, DwarfRegNum<[34, 13, 12]>;
+def ST2 : X86Reg<"st(2)", 2>, DwarfRegNum<[35, 14, 13]>;
+def ST3 : X86Reg<"st(3)", 3>, DwarfRegNum<[36, 15, 14]>;
+def ST4 : X86Reg<"st(4)", 4>, DwarfRegNum<[37, 16, 15]>;
+def ST5 : X86Reg<"st(5)", 5>, DwarfRegNum<[38, 17, 16]>;
+def ST6 : X86Reg<"st(6)", 6>, DwarfRegNum<[39, 18, 17]>;
+def ST7 : X86Reg<"st(7)", 7>, DwarfRegNum<[40, 19, 18]>;
+
+// Floating-point status word
+def FPSW : X86Reg<"fpsw", 0>;
+
+// Status flags register
+def EFLAGS : X86Reg<"flags", 0>;
+
+// Segment registers
+def CS : X86Reg<"cs", 1>;
+def DS : X86Reg<"ds", 3>;
+def SS : X86Reg<"ss", 2>;
+def ES : X86Reg<"es", 0>;
+def FS : X86Reg<"fs", 4>;
+def GS : X86Reg<"gs", 5>;
+
+// Debug registers
+def DR0 : X86Reg<"dr0", 0>;
+def DR1 : X86Reg<"dr1", 1>;
+def DR2 : X86Reg<"dr2", 2>;
+def DR3 : X86Reg<"dr3", 3>;
+def DR4 : X86Reg<"dr4", 4>;
+def DR5 : X86Reg<"dr5", 5>;
+def DR6 : X86Reg<"dr6", 6>;
+def DR7 : X86Reg<"dr7", 7>;
+def DR8 : X86Reg<"dr8", 8>;
+def DR9 : X86Reg<"dr9", 9>;
+def DR10 : X86Reg<"dr10", 10>;
+def DR11 : X86Reg<"dr11", 11>;
+def DR12 : X86Reg<"dr12", 12>;
+def DR13 : X86Reg<"dr13", 13>;
+def DR14 : X86Reg<"dr14", 14>;
+def DR15 : X86Reg<"dr15", 15>;
+
+// Control registers
+def CR0 : X86Reg<"cr0", 0>;
+def CR1 : X86Reg<"cr1", 1>;
+def CR2 : X86Reg<"cr2", 2>;
+def CR3 : X86Reg<"cr3", 3>;
+def CR4 : X86Reg<"cr4", 4>;
+def CR5 : X86Reg<"cr5", 5>;
+def CR6 : X86Reg<"cr6", 6>;
+def CR7 : X86Reg<"cr7", 7>;
+def CR8 : X86Reg<"cr8", 8>;
+def CR9 : X86Reg<"cr9", 9>;
+def CR10 : X86Reg<"cr10", 10>;
+def CR11 : X86Reg<"cr11", 11>;
+def CR12 : X86Reg<"cr12", 12>;
+def CR13 : X86Reg<"cr13", 13>;
+def CR14 : X86Reg<"cr14", 14>;
+def CR15 : X86Reg<"cr15", 15>;
+
+// Pseudo index registers
+def EIZ : X86Reg<"eiz", 4>;
+def RIZ : X86Reg<"riz", 4>;
+
+// Bound registers, used in MPX instructions
+def BND0 : X86Reg<"bnd0", 0>;
+def BND1 : X86Reg<"bnd1", 1>;
+def BND2 : X86Reg<"bnd2", 2>;
+def BND3 : X86Reg<"bnd3", 3>;
+
+//===----------------------------------------------------------------------===//
+// Register Class Definitions... now that we have all of the pieces, define the
+// top-level register classes. The order specified in the register list is
+// implicitly defined to be the register allocation order.
+//
+
+// List call-clobbered registers before callee-save registers. RBX, RBP, (and
+// R12, R13, R14, and R15 for X86-64) are callee-save registers.
+// In 64-bit mode, there are 12 additional i8 registers: SIL, DIL, BPL, SPL,
+// and R8B, ... R15B.
+// Allocate R12 and R13 last, as these require an extra byte when
+// encoded in x86_64 instructions.
+// FIXME: Allow AH, CH, DH, BH to be used as general-purpose registers in
+// 64-bit mode. The main complication is that they cannot be encoded in an
+// instruction requiring a REX prefix, while SIL, DIL, BPL, R8D, etc.
+// require a REX prefix. For example, "addb %ah, %dil" and "movzbl %ah, %r8d"
+// cannot be encoded.
+def GR8 : RegisterClass<"X86", [i8], 8,
+ (add AL, CL, DL, AH, CH, DH, BL, BH, SIL, DIL, BPL, SPL,
+ R8B, R9B, R10B, R11B, R14B, R15B, R12B, R13B)> {
+ let AltOrders = [(sub GR8, AH, BH, CH, DH)];
+ let AltOrderSelect = [{
+ return MF.getSubtarget<X86Subtarget>().is64Bit();
+ }];
+}
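+// Note: AltOrderSelect returns an index into the order list (0 is the
+// default order above), so in 64-bit mode the allocator uses the AltOrder
+// without AH/BH/CH/DH, since h-registers cannot be encoded in any
+// instruction that also needs a REX prefix.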
+
+def GR16 : RegisterClass<"X86", [i16], 16,
+ (add AX, CX, DX, SI, DI, BX, BP, SP,
+ R8W, R9W, R10W, R11W, R14W, R15W, R12W, R13W)>;
+
+def GR32 : RegisterClass<"X86", [i32], 32,
+ (add EAX, ECX, EDX, ESI, EDI, EBX, EBP, ESP,
+ R8D, R9D, R10D, R11D, R14D, R15D, R12D, R13D)>;
+
+// GR64 - 64-bit GPRs. This oddly includes RIP, which isn't accurate, since
+// RIP isn't really a register and it can't be used anywhere except in an
+// address, but it doesn't cause trouble.
+def GR64 : RegisterClass<"X86", [i64], 64,
+ (add RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11,
+ RBX, R14, R15, R12, R13, RBP, RSP, RIP)>;
+
+// Segment registers for use by MOV instructions (and others) that have a
+// segment register as one operand. Always contain a 16-bit segment
+// descriptor.
+def SEGMENT_REG : RegisterClass<"X86", [i16], 16, (add CS, DS, SS, ES, FS, GS)>;
+
+// Debug registers.
+def DEBUG_REG : RegisterClass<"X86", [i32], 32, (sequence "DR%u", 0, 7)>;
+
+// Control registers.
+def CONTROL_REG : RegisterClass<"X86", [i64], 64, (sequence "CR%u", 0, 15)>;
+
+// GR8_ABCD_L, GR8_ABCD_H, GR16_ABCD, GR32_ABCD, GR64_ABCD - Subclasses of
+// GR8, GR16, GR32, and GR64 which contain just the "a" "b", "c", and "d"
+// registers. On x86-32, GR16_ABCD and GR32_ABCD are classes for registers
+// that support 8-bit subreg operations. On x86-64, GR16_ABCD, GR32_ABCD,
+// and GR64_ABCD are classes for registers that support 8-bit h-register
+// operations.
+def GR8_ABCD_L : RegisterClass<"X86", [i8], 8, (add AL, CL, DL, BL)>;
+def GR8_ABCD_H : RegisterClass<"X86", [i8], 8, (add AH, CH, DH, BH)>;
+def GR16_ABCD : RegisterClass<"X86", [i16], 16, (add AX, CX, DX, BX)>;
+def GR32_ABCD : RegisterClass<"X86", [i32], 32, (add EAX, ECX, EDX, EBX)>;
+def GR64_ABCD : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, RBX)>;
+def GR32_TC : RegisterClass<"X86", [i32], 32, (add EAX, ECX, EDX)>;
+def GR64_TC : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, RSI, RDI,
+ R8, R9, R11, RIP)>;
+def GR64_TCW64 : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX,
+ R8, R9, R10, R11, RIP)>;
+
+// GR8_NOREX - GR8 registers which do not require a REX prefix.
+def GR8_NOREX : RegisterClass<"X86", [i8], 8,
+ (add AL, CL, DL, AH, CH, DH, BL, BH)> {
+ let AltOrders = [(sub GR8_NOREX, AH, BH, CH, DH)];
+ let AltOrderSelect = [{
+ return MF.getSubtarget<X86Subtarget>().is64Bit();
+ }];
+}
+// GR16_NOREX - GR16 registers which do not require a REX prefix.
+def GR16_NOREX : RegisterClass<"X86", [i16], 16,
+ (add AX, CX, DX, SI, DI, BX, BP, SP)>;
+// GR32_NOREX - GR32 registers which do not require a REX prefix.
+def GR32_NOREX : RegisterClass<"X86", [i32], 32,
+ (add EAX, ECX, EDX, ESI, EDI, EBX, EBP, ESP)>;
+// GR64_NOREX - GR64 registers which do not require a REX prefix.
+def GR64_NOREX : RegisterClass<"X86", [i64], 64,
+ (add RAX, RCX, RDX, RSI, RDI, RBX, RBP, RSP, RIP)>;
+
+// GR32_NOAX - GR32 registers except EAX. Used by AddRegFrm of XCHG32 in
+// 64-bit mode to prevent the 0x90 NOP encoding. In 64-bit mode,
+// xchg %eax, %eax needs to clear the upper 32 bits of RAX, so it is not a NOP.
+def GR32_NOAX : RegisterClass<"X86", [i32], 32, (sub GR32, EAX)>;
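+// Concretely: picking, say, ECX gives "xchgl %ecx, %eax" the one-byte
+// AddRegFrm encoding 0x91; with EAX excluded from the class, the 0x90
+// byte (xchg %eax, %eax) can never be produced for this form.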
+
+// GR32_NOSP - GR32 registers except ESP.
+def GR32_NOSP : RegisterClass<"X86", [i32], 32, (sub GR32, ESP)>;
+
+// GR64_NOSP - GR64 registers except RSP (and RIP).
+def GR64_NOSP : RegisterClass<"X86", [i64], 64, (sub GR64, RSP, RIP)>;
+
+// GR32_NOREX_NOSP - GR32 registers which do not require a REX prefix except
+// ESP.
+def GR32_NOREX_NOSP : RegisterClass<"X86", [i32], 32,
+ (and GR32_NOREX, GR32_NOSP)>;
+
+// GR64_NOREX_NOSP - GR64_NOREX registers except RSP.
+def GR64_NOREX_NOSP : RegisterClass<"X86", [i64], 64,
+ (and GR64_NOREX, GR64_NOSP)>;
+
+// A class to support the 'A' assembler constraint: EAX then EDX.
+def GR32_AD : RegisterClass<"X86", [i32], 32, (add EAX, EDX)>;
+
+// Scalar SSE2 floating point registers.
+def FR32 : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 15)>;
+
+def FR64 : RegisterClass<"X86", [f64], 64, (add FR32)>;
+
+def FR128 : RegisterClass<"X86", [i128, f128], 128, (add FR32)>;
+
+
+// FIXME: This sets up the floating point register files as though they are f64
+// values, though they really are f80 values. This will cause us to spill
+// values as 64-bit quantities instead of 80-bit quantities, which is much much
+// faster on common hardware. In reality, this should be controlled by a
+// command line option or something.
+
+def RFP32 : RegisterClass<"X86",[f32], 32, (sequence "FP%u", 0, 6)>;
+def RFP64 : RegisterClass<"X86",[f64], 32, (add RFP32)>;
+def RFP80 : RegisterClass<"X86",[f80], 32, (add RFP32)>;
+
+// Floating point stack registers (these are not allocatable by the
+// register allocator - the floating point stackifier is responsible
+// for transforming FPn allocations to STn registers)
+def RST : RegisterClass<"X86", [f80, f64, f32], 32, (sequence "ST%u", 0, 7)> {
+ let isAllocatable = 0;
+}
+
+// Generic vector registers: VR64 and VR128.
+// Ensure that float types are declared first - only float is legal on SSE1.
+def VR64: RegisterClass<"X86", [x86mmx], 64, (sequence "MM%u", 0, 7)>;
+def VR128 : RegisterClass<"X86", [v4f32, v2f64, v16i8, v8i16, v4i32, v2i64],
+ 128, (add FR32)>;
+def VR256 : RegisterClass<"X86", [v8f32, v4f64, v32i8, v16i16, v8i32, v4i64],
+ 256, (sequence "YMM%u", 0, 15)>;
+
+// Status flags registers.
+def CCR : RegisterClass<"X86", [i32], 32, (add EFLAGS)> {
+ let CopyCost = -1; // Don't allow copying of status registers.
+ let isAllocatable = 0;
+}
+def FPCCR : RegisterClass<"X86", [i16], 16, (add FPSW)> {
+ let CopyCost = -1; // Don't allow copying of status registers.
+ let isAllocatable = 0;
+}
+
+// AVX-512 vector/mask registers.
+def VR512 : RegisterClass<"X86", [v16f32, v8f64, v64i8, v32i16, v16i32, v8i64],
+ 512, (sequence "ZMM%u", 0, 31)>;
+
+// Scalar AVX-512 floating point registers.
+def FR32X : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 31)>;
+
+def FR64X : RegisterClass<"X86", [f64], 64, (add FR32X)>;
+
+// Extended VR128 and VR256 for AVX-512 instructions
+def VR128X : RegisterClass<"X86", [v4f32, v2f64, v16i8, v8i16, v4i32, v2i64],
+ 128, (add FR32X)>;
+def VR256X : RegisterClass<"X86", [v8f32, v4f64, v32i8, v16i16, v8i32, v4i64],
+ 256, (sequence "YMM%u", 0, 31)>;
+
+// Mask registers
+def VK1 : RegisterClass<"X86", [i1], 8, (sequence "K%u", 0, 7)> {let Size = 8;}
+def VK2 : RegisterClass<"X86", [v2i1], 8, (add VK1)> {let Size = 8;}
+def VK4 : RegisterClass<"X86", [v4i1], 8, (add VK2)> {let Size = 8;}
+def VK8 : RegisterClass<"X86", [v8i1], 8, (add VK4)> {let Size = 8;}
+def VK16 : RegisterClass<"X86", [v16i1], 16, (add VK8)> {let Size = 16;}
+def VK32 : RegisterClass<"X86", [v32i1], 32, (add VK16)> {let Size = 32;}
+def VK64 : RegisterClass<"X86", [v64i1], 64, (add VK32)> {let Size = 64;}
+
+def VK1WM : RegisterClass<"X86", [i1], 8, (sub VK1, K0)> {let Size = 8;}
+def VK2WM : RegisterClass<"X86", [v2i1], 8, (sub VK2, K0)> {let Size = 8;}
+def VK4WM : RegisterClass<"X86", [v4i1], 8, (sub VK4, K0)> {let Size = 8;}
+def VK8WM : RegisterClass<"X86", [v8i1], 8, (sub VK8, K0)> {let Size = 8;}
+def VK16WM : RegisterClass<"X86", [v16i1], 16, (add VK8WM)> {let Size = 16;}
+def VK32WM : RegisterClass<"X86", [v32i1], 32, (add VK16WM)> {let Size = 32;}
+def VK64WM : RegisterClass<"X86", [v64i1], 64, (add VK32WM)> {let Size = 64;}
+
+// Bound registers
+def BNDR : RegisterClass<"X86", [v2i64], 128, (sequence "BND%u", 0, 3)>;
diff --git a/contrib/llvm/lib/Target/X86/X86SchedHaswell.td b/contrib/llvm/lib/Target/X86/X86SchedHaswell.td
new file mode 100644
index 0000000..677e824
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86SchedHaswell.td
@@ -0,0 +1,2147 @@
+//=- X86SchedHaswell.td - X86 Haswell Scheduling -------------*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for Haswell to support instruction
+// scheduling and other instruction cost heuristics.
+//
+//===----------------------------------------------------------------------===//
+
+def HaswellModel : SchedMachineModel {
+ // All x86 instructions are modeled as a single micro-op, and HW can decode 4
+ // instructions per cycle.
+ let IssueWidth = 4;
+ let MicroOpBufferSize = 192; // Based on the reorder buffer.
+ let LoadLatency = 4;
+ let MispredictPenalty = 16;
+
+ // Based on the LSD (loop-stream detector) queue size and benchmarking data.
+ let LoopMicroOpBufferSize = 50;
+
+ // FIXME: SSE4 and AVX are unimplemented. This flag is set to allow
+ // the scheduler to assign a default model to unrecognized opcodes.
+ let CompleteModel = 0;
+}
+
+let SchedModel = HaswellModel in {
+
+// Haswell can issue micro-ops to 8 different ports in one cycle.
+
+// Ports 0, 1, 5, and 6 handle all computation.
+// Port 4 gets the data half of stores. Store data can be available later than
+// the store address, but since we don't model the latency of stores, we can
+// ignore that.
+// Ports 2 and 3 are identical. They handle loads and the address half of
+// stores. Port 7 can handle address calculations.
+def HWPort0 : ProcResource<1>;
+def HWPort1 : ProcResource<1>;
+def HWPort2 : ProcResource<1>;
+def HWPort3 : ProcResource<1>;
+def HWPort4 : ProcResource<1>;
+def HWPort5 : ProcResource<1>;
+def HWPort6 : ProcResource<1>;
+def HWPort7 : ProcResource<1>;
+
+// Many micro-ops are capable of issuing on multiple ports.
+def HWPort01 : ProcResGroup<[HWPort0, HWPort1]>;
+def HWPort23 : ProcResGroup<[HWPort2, HWPort3]>;
+def HWPort237 : ProcResGroup<[HWPort2, HWPort3, HWPort7]>;
+def HWPort04 : ProcResGroup<[HWPort0, HWPort4]>;
+def HWPort05 : ProcResGroup<[HWPort0, HWPort5]>;
+def HWPort06 : ProcResGroup<[HWPort0, HWPort6]>;
+def HWPort15 : ProcResGroup<[HWPort1, HWPort5]>;
+def HWPort16 : ProcResGroup<[HWPort1, HWPort6]>;
+def HWPort56 : ProcResGroup<[HWPort5, HWPort6]>;
+def HWPort015 : ProcResGroup<[HWPort0, HWPort1, HWPort5]>;
+def HWPort056 : ProcResGroup<[HWPort0, HWPort5, HWPort6]>;
+def HWPort0156 : ProcResGroup<[HWPort0, HWPort1, HWPort5, HWPort6]>;
+
+// 60 Entry Unified Scheduler
+def HWPortAny : ProcResGroup<[HWPort0, HWPort1, HWPort2, HWPort3, HWPort4,
+ HWPort5, HWPort6, HWPort7]> {
+ let BufferSize=60;
+}
+
+// Integer division issued on port 0.
+def HWDivider : ProcResource<1>;
+
+// Loads are 4 cycles, so ReadAfterLd registers needn't be available until 4
+// cycles after the memory operand.
+def : ReadAdvance<ReadAfterLd, 4>;
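+// E.g. the ADC/CMOV r,m variants below pair Write2P0156_Lat2Ld with
+// ReadAfterLd, so a producer of the register operand may finish up to 4
+// cycles after the instruction issues without adding stall cycles.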
+
+// Many SchedWrites are defined in pairs with and without a folded load.
+// Instructions with folded loads are usually micro-fused, so they only appear
+// as two micro-ops when queued in the reservation station.
+// This multiclass defines the resource usage for variants with and without
+// folded loads.
+multiclass HWWriteResPair<X86FoldableSchedWrite SchedRW,
+ ProcResourceKind ExePort,
+ int Lat> {
+  // The register variant uses a single cycle on ExePort.
+ def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; }
+
+ // Memory variant also uses a cycle on port 2/3 and adds 4 cycles to the
+ // latency.
+ def : WriteRes<SchedRW.Folded, [HWPort23, ExePort]> {
+ let Latency = !add(Lat, 4);
+ }
+}
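+
+// For illustration, the first pair instantiated below,
+//   defm : HWWriteResPair<WriteALU, HWPort0156, 1>;
+// expands to roughly (WriteALU's folded variant is WriteALULd):
+//   def : WriteRes<WriteALU,   [HWPort0156]>           { let Latency = 1; }
+//   def : WriteRes<WriteALULd, [HWPort23, HWPort0156]> { let Latency = 5; }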
+
+// A folded store needs a cycle on port 4 for the store data, but it does not
+// need an extra port 2/3 cycle to recompute the address.
+def : WriteRes<WriteRMW, [HWPort4]>;
+
+// Store_addr on 237.
+// Store_data on 4.
+def : WriteRes<WriteStore, [HWPort237, HWPort4]>;
+def : WriteRes<WriteLoad, [HWPort23]> { let Latency = 4; }
+def : WriteRes<WriteMove, [HWPort0156]>;
+def : WriteRes<WriteZero, []>;
+
+defm : HWWriteResPair<WriteALU, HWPort0156, 1>;
+defm : HWWriteResPair<WriteIMul, HWPort1, 3>;
+def : WriteRes<WriteIMulH, []> { let Latency = 3; }
+defm : HWWriteResPair<WriteShift, HWPort06, 1>;
+defm : HWWriteResPair<WriteJump, HWPort06, 1>;
+
+// This is for simple LEAs with one or two input operands.
+// The complex ones can only execute on port 1, and they require two cycles on
+// the port to read all inputs. We don't model that.
+def : WriteRes<WriteLEA, [HWPort15]>;
+
+// This is quite rough, latency depends on the dividend.
+def : WriteRes<WriteIDiv, [HWPort0, HWDivider]> {
+ let Latency = 25;
+ let ResourceCycles = [1, 10];
+}
+def : WriteRes<WriteIDivLd, [HWPort23, HWPort0, HWDivider]> {
+ let Latency = 29;
+ let ResourceCycles = [1, 1, 10];
+}
+
+// Scalar and vector floating point.
+defm : HWWriteResPair<WriteFAdd, HWPort1, 3>;
+defm : HWWriteResPair<WriteFMul, HWPort0, 5>;
+defm : HWWriteResPair<WriteFDiv, HWPort0, 12>; // 10-14 cycles.
+defm : HWWriteResPair<WriteFRcp, HWPort0, 5>;
+defm : HWWriteResPair<WriteFRsqrt, HWPort0, 5>;
+defm : HWWriteResPair<WriteFSqrt, HWPort0, 15>;
+defm : HWWriteResPair<WriteCvtF2I, HWPort1, 3>;
+defm : HWWriteResPair<WriteCvtI2F, HWPort1, 4>;
+defm : HWWriteResPair<WriteCvtF2F, HWPort1, 3>;
+defm : HWWriteResPair<WriteFShuffle, HWPort5, 1>;
+defm : HWWriteResPair<WriteFBlend, HWPort015, 1>;
+defm : HWWriteResPair<WriteFShuffle256, HWPort5, 3>;
+
+def : WriteRes<WriteFVarBlend, [HWPort5]> {
+ let Latency = 2;
+ let ResourceCycles = [2];
+}
+def : WriteRes<WriteFVarBlendLd, [HWPort5, HWPort23]> {
+ let Latency = 6;
+ let ResourceCycles = [2, 1];
+}
+
+// Vector integer operations.
+defm : HWWriteResPair<WriteVecShift, HWPort0, 1>;
+defm : HWWriteResPair<WriteVecLogic, HWPort015, 1>;
+defm : HWWriteResPair<WriteVecALU, HWPort15, 1>;
+defm : HWWriteResPair<WriteVecIMul, HWPort0, 5>;
+defm : HWWriteResPair<WriteShuffle, HWPort5, 1>;
+defm : HWWriteResPair<WriteBlend, HWPort15, 1>;
+defm : HWWriteResPair<WriteShuffle256, HWPort5, 3>;
+
+def : WriteRes<WriteVarBlend, [HWPort5]> {
+ let Latency = 2;
+ let ResourceCycles = [2];
+}
+def : WriteRes<WriteVarBlendLd, [HWPort5, HWPort23]> {
+ let Latency = 6;
+ let ResourceCycles = [2, 1];
+}
+
+def : WriteRes<WriteVarVecShift, [HWPort0, HWPort5]> {
+ let Latency = 2;
+ let ResourceCycles = [2, 1];
+}
+def : WriteRes<WriteVarVecShiftLd, [HWPort0, HWPort5, HWPort23]> {
+ let Latency = 6;
+ let ResourceCycles = [2, 1, 1];
+}
+
+def : WriteRes<WriteMPSAD, [HWPort0, HWPort5]> {
+ let Latency = 6;
+ let ResourceCycles = [1, 2];
+}
+def : WriteRes<WriteMPSADLd, [HWPort23, HWPort0, HWPort5]> {
+ let Latency = 6;
+ let ResourceCycles = [1, 1, 2];
+}
+
+// String instructions.
+// Packed Compare Implicit Length Strings, Return Mask
+def : WriteRes<WritePCmpIStrM, [HWPort0]> {
+ let Latency = 10;
+ let ResourceCycles = [3];
+}
+def : WriteRes<WritePCmpIStrMLd, [HWPort0, HWPort23]> {
+ let Latency = 10;
+ let ResourceCycles = [3, 1];
+}
+
+// Packed Compare Explicit Length Strings, Return Mask
+def : WriteRes<WritePCmpEStrM, [HWPort0, HWPort16, HWPort5]> {
+ let Latency = 10;
+ let ResourceCycles = [3, 2, 4];
+}
+def : WriteRes<WritePCmpEStrMLd, [HWPort05, HWPort16, HWPort23]> {
+ let Latency = 10;
+ let ResourceCycles = [6, 2, 1];
+}
+
+// Packed Compare Implicit Length Strings, Return Index
+def : WriteRes<WritePCmpIStrI, [HWPort0]> {
+ let Latency = 11;
+ let ResourceCycles = [3];
+}
+def : WriteRes<WritePCmpIStrILd, [HWPort0, HWPort23]> {
+ let Latency = 11;
+ let ResourceCycles = [3, 1];
+}
+
+// Packed Compare Explicit Length Strings, Return Index
+def : WriteRes<WritePCmpEStrI, [HWPort05, HWPort16]> {
+ let Latency = 11;
+ let ResourceCycles = [6, 2];
+}
+def : WriteRes<WritePCmpEStrILd, [HWPort0, HWPort16, HWPort5, HWPort23]> {
+ let Latency = 11;
+ let ResourceCycles = [3, 2, 2, 1];
+}
+
+// AES Instructions.
+def : WriteRes<WriteAESDecEnc, [HWPort5]> {
+ let Latency = 7;
+ let ResourceCycles = [1];
+}
+def : WriteRes<WriteAESDecEncLd, [HWPort5, HWPort23]> {
+ let Latency = 7;
+ let ResourceCycles = [1, 1];
+}
+
+def : WriteRes<WriteAESIMC, [HWPort5]> {
+ let Latency = 14;
+ let ResourceCycles = [2];
+}
+def : WriteRes<WriteAESIMCLd, [HWPort5, HWPort23]> {
+ let Latency = 14;
+ let ResourceCycles = [2, 1];
+}
+
+def : WriteRes<WriteAESKeyGen, [HWPort0, HWPort5]> {
+ let Latency = 10;
+ let ResourceCycles = [2, 8];
+}
+def : WriteRes<WriteAESKeyGenLd, [HWPort0, HWPort5, HWPort23]> {
+ let Latency = 10;
+ let ResourceCycles = [2, 7, 1];
+}
+
+// Carry-less multiplication instructions.
+def : WriteRes<WriteCLMul, [HWPort0, HWPort5]> {
+ let Latency = 7;
+ let ResourceCycles = [2, 1];
+}
+def : WriteRes<WriteCLMulLd, [HWPort0, HWPort5, HWPort23]> {
+ let Latency = 7;
+ let ResourceCycles = [2, 1, 1];
+}
+
+def : WriteRes<WriteSystem, [HWPort0156]> { let Latency = 100; }
+def : WriteRes<WriteMicrocoded, [HWPort0156]> { let Latency = 100; }
+def : WriteRes<WriteFence, [HWPort23, HWPort4]>;
+def : WriteRes<WriteNop, []>;
+
+//================ Exceptions ================//
+
+//-- Specific Scheduling Models --//
+
+// Starting with P0.
+def WriteP0 : SchedWriteRes<[HWPort0]>;
+
+def WriteP0_P1_Lat4 : SchedWriteRes<[HWPort0, HWPort1]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1, 1];
+}
+
+def WriteP0_P1_Lat4Ld : SchedWriteRes<[HWPort0, HWPort1, HWPort23]> {
+ let Latency = 8;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1, 1, 1];
+}
+
+def WriteP01 : SchedWriteRes<[HWPort01]>;
+
+def Write2P01 : SchedWriteRes<[HWPort01]> {
+ let NumMicroOps = 2;
+}
+def Write3P01 : SchedWriteRes<[HWPort01]> {
+ let NumMicroOps = 3;
+}
+
+def WriteP015 : SchedWriteRes<[HWPort015]>;
+
+def WriteP01_P5 : SchedWriteRes<[HWPort01, HWPort5]> {
+ let NumMicroOps = 2;
+}
+def WriteP06 : SchedWriteRes<[HWPort06]>;
+
+def Write2P06 : SchedWriteRes<[HWPort06]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+
+def Write3P06_Lat2 : SchedWriteRes<[HWPort06]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+ let ResourceCycles = [3];
+}
+
+def WriteP0156_P23 : SchedWriteRes<[HWPort0156, HWPort23]> {
+ let NumMicroOps = 2;
+}
+
+def Write2P0156_P23 : SchedWriteRes<[HWPort0156, HWPort23]> {
+ let NumMicroOps = 3;
+ let ResourceCycles = [2, 1];
+}
+
+def Write2P0156_Lat2 : SchedWriteRes<[HWPort0156]> {
+ let Latency = 2;
+ let ResourceCycles = [2];
+}
+def Write2P0156_Lat2Ld : SchedWriteRes<[HWPort0156, HWPort23]> {
+ let Latency = 6;
+ let ResourceCycles = [2, 1];
+}
+
+def Write5P0156 : SchedWriteRes<[HWPort0156]> {
+ let NumMicroOps = 5;
+ let ResourceCycles = [5];
+}
+
+def WriteP0156_2P237_P4 : SchedWriteRes<[HWPort0156, HWPort237, HWPort4]> {
+ let Latency = 1;
+ let ResourceCycles = [1, 2, 1];
+}
+
+def Write2P0156_2P237_P4 : SchedWriteRes<[HWPort0156, HWPort237, HWPort4]> {
+ let Latency = 1;
+ let ResourceCycles = [2, 2, 1];
+}
+
+def Write3P0156_2P237_P4 : SchedWriteRes<[HWPort0156, HWPort237, HWPort4]> {
+ let Latency = 1;
+ let ResourceCycles = [3, 2, 1];
+}
+
+// Starting with P1.
+def WriteP1 : SchedWriteRes<[HWPort1]>;
+
+def WriteP1_P23 : SchedWriteRes<[HWPort1, HWPort23]> {
+ let NumMicroOps = 2;
+}
+def WriteP1_Lat3 : SchedWriteRes<[HWPort1]> {
+ let Latency = 3;
+}
+def WriteP1_Lat3Ld : SchedWriteRes<[HWPort1, HWPort23]> {
+ let Latency = 7;
+}
+
+def Write2P1 : SchedWriteRes<[HWPort1]> {
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+def Write2P1_P23 : SchedWriteRes<[HWPort1, HWPort23]> {
+ let NumMicroOps = 3;
+ let ResourceCycles = [2, 1];
+}
+def WriteP15 : SchedWriteRes<[HWPort15]>;
+def WriteP15Ld : SchedWriteRes<[HWPort15, HWPort23]> {
+ let Latency = 4;
+}
+
+def WriteP1_P5_Lat4 : SchedWriteRes<[HWPort1, HWPort5]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1, 1];
+}
+
+def WriteP1_P5_Lat4Ld : SchedWriteRes<[HWPort1, HWPort5, HWPort23]> {
+ let Latency = 8;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1, 1, 1];
+}
+
+def WriteP1_P5_Lat6 : SchedWriteRes<[HWPort1, HWPort5]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1, 1];
+}
+
+def WriteP1_P5_Lat6Ld : SchedWriteRes<[HWPort1, HWPort5, HWPort23]> {
+ let Latency = 10;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1, 1, 1];
+}
+
+// Starting with P2.
+def Write2P237_P4 : SchedWriteRes<[HWPort237, HWPort4]> {
+ let Latency = 1;
+ let ResourceCycles = [2, 1];
+}
+
+// Starting with P5.
+def WriteP5 : SchedWriteRes<[HWPort5]>;
+def WriteP5Ld : SchedWriteRes<[HWPort5, HWPort23]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1, 1];
+}
+
+// Notation:
+// - r: register.
+// - mm: 64-bit mmx register.
+// - x: 128-bit xmm register.
+// - (x)mm: mmx or xmm register.
+// - y: 256-bit ymm register.
+// - v: any vector register.
+// - m: memory.
+
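+// E.g., read "IMUL16rmi" below as the r,m,i form of the 16-bit IMUL
+// (register destination, memory and immediate sources), and "ADD32mr" as
+// the 32-bit ADD with a memory destination and register source.
+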
+//=== Integer Instructions ===//
+//-- Move instructions --//
+
+// MOV.
+// r16,m.
+def : InstRW<[WriteALULd], (instregex "MOV16rm")>;
+
+// MOVSX, MOVZX.
+// r,m.
+def : InstRW<[WriteLoad], (instregex "MOV(S|Z)X32rm(8|16)")>;
+
+// CMOVcc.
+// r,r.
+def : InstRW<[Write2P0156_Lat2],
+ (instregex "CMOV(O|NO|B|AE|E|NE|BE|A|S|NS|P|NP|L|GE|LE|G)(16|32|64)rr")>;
+// r,m.
+def : InstRW<[Write2P0156_Lat2Ld, ReadAfterLd],
+ (instregex "CMOV(O|NO|B|AE|E|NE|BE|A|S|NS|P|NP|L|GE|LE|G)(16|32|64)rm")>;
+
+// XCHG.
+// r,r.
+def WriteXCHG : SchedWriteRes<[HWPort0156]> {
+ let Latency = 2;
+ let ResourceCycles = [3];
+}
+
+def : InstRW<[WriteXCHG], (instregex "XCHG(8|16|32|64)rr", "XCHG(16|32|64)ar")>;
+
+// r,m.
+def WriteXCHGrm : SchedWriteRes<[]> {
+ let Latency = 21;
+ let NumMicroOps = 8;
+}
+def : InstRW<[WriteXCHGrm], (instregex "XCHG(8|16|32|64)rm")>;
+
+// XLAT.
+def WriteXLAT : SchedWriteRes<[]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+}
+def : InstRW<[WriteXLAT], (instregex "XLAT")>;
+
+// PUSH.
+// m.
+def : InstRW<[Write2P237_P4], (instregex "PUSH(16|32)rmm")>;
+
+// PUSHF.
+def WritePushF : SchedWriteRes<[HWPort1, HWPort4, HWPort237, HWPort06]> {
+ let NumMicroOps = 4;
+}
+def : InstRW<[WritePushF], (instregex "PUSHF(16|32)")>;
+
+// PUSHA.
+def WritePushA : SchedWriteRes<[]> {
+ let NumMicroOps = 19;
+}
+def : InstRW<[WritePushA], (instregex "PUSHA(16|32)")>;
+
+// POP.
+// m.
+def : InstRW<[Write2P237_P4], (instregex "POP(16|32)rmm")>;
+
+// POPF.
+def WritePopF : SchedWriteRes<[]> {
+ let NumMicroOps = 9;
+}
+def : InstRW<[WritePopF], (instregex "POPF(16|32)")>;
+
+// POPA.
+def WritePopA : SchedWriteRes<[]> {
+ let NumMicroOps = 18;
+}
+def : InstRW<[WritePopA], (instregex "POPA(16|32)")>;
+
+// LAHF SAHF.
+def : InstRW<[WriteP06], (instregex "(S|L)AHF")>;
+
+// BSWAP.
+// r32.
+def WriteBSwap32 : SchedWriteRes<[HWPort15]>;
+def : InstRW<[WriteBSwap32], (instregex "BSWAP32r")>;
+
+// r64.
+def WriteBSwap64 : SchedWriteRes<[HWPort06, HWPort15]> {
+ let NumMicroOps = 2;
+}
+def : InstRW<[WriteBSwap64], (instregex "BSWAP64r")>;
+
+// MOVBE.
+// r16,m16 / r64,m64.
+def : InstRW<[Write2P0156_Lat2Ld], (instregex "MOVBE(16|64)rm")>;
+
+// r32, m32.
+def WriteMoveBE32rm : SchedWriteRes<[HWPort15, HWPort23]> {
+ let NumMicroOps = 2;
+}
+def : InstRW<[WriteMoveBE32rm], (instregex "MOVBE32rm")>;
+
+// m16,r16.
+def WriteMoveBE16mr : SchedWriteRes<[HWPort06, HWPort237, HWPort4]> {
+ let NumMicroOps = 3;
+}
+def : InstRW<[WriteMoveBE16mr], (instregex "MOVBE16mr")>;
+
+// m32,r32.
+def WriteMoveBE32mr : SchedWriteRes<[HWPort15, HWPort237, HWPort4]> {
+ let NumMicroOps = 3;
+}
+def : InstRW<[WriteMoveBE32mr], (instregex "MOVBE32mr")>;
+
+// m64,r64.
+def WriteMoveBE64mr : SchedWriteRes<[HWPort06, HWPort15, HWPort237, HWPort4]> {
+ let NumMicroOps = 4;
+}
+def : InstRW<[WriteMoveBE64mr], (instregex "MOVBE64mr")>;
+
+//-- Arithmetic instructions --//
+
+// ADD SUB.
+// m,r/i.
+def : InstRW<[Write2P0156_2P237_P4],
+ (instregex "(ADD|SUB)(8|16|32|64)m(r|i)",
+ "(ADD|SUB)(8|16|32|64)mi8", "(ADD|SUB)64mi32")>;
+
+// ADC SBB.
+// r,r/i.
+def : InstRW<[Write2P0156_Lat2], (instregex "(ADC|SBB)(8|16|32|64)r(r|i)",
+ "(ADC|SBB)(16|32|64)ri8",
+ "(ADC|SBB)64ri32",
+ "(ADC|SBB)(8|16|32|64)rr_REV")>;
+
+// r,m.
+def : InstRW<[Write2P0156_Lat2Ld, ReadAfterLd], (instregex "(ADC|SBB)(8|16|32|64)rm")>;
+
+// m,r/i.
+def : InstRW<[Write3P0156_2P237_P4],
+ (instregex "(ADC|SBB)(8|16|32|64)m(r|i)",
+ "(ADC|SBB)(16|32|64)mi8",
+ "(ADC|SBB)64mi32")>;
+
+// INC DEC NOT NEG.
+// m.
+def : InstRW<[WriteP0156_2P237_P4],
+ (instregex "(INC|DEC|NOT|NEG)(8|16|32|64)m",
+ "(INC|DEC)64(16|32)m")>;
+
+// MUL IMUL.
+// r16.
+def WriteMul16 : SchedWriteRes<[HWPort1, HWPort0156]> {
+ let Latency = 4;
+ let NumMicroOps = 4;
+}
+def : InstRW<[WriteMul16], (instregex "IMUL16r", "MUL16r")>;
+
+// m16.
+def WriteMul16Ld : SchedWriteRes<[HWPort1, HWPort0156, HWPort23]> {
+ let Latency = 8;
+ let NumMicroOps = 5;
+}
+def : InstRW<[WriteMul16Ld], (instregex "IMUL16m", "MUL16m")>;
+
+// r32.
+def WriteMul32 : SchedWriteRes<[HWPort1, HWPort0156]> {
+ let Latency = 4;
+ let NumMicroOps = 3;
+}
+def : InstRW<[WriteMul32], (instregex "IMUL32r", "MUL32r")>;
+
+// m32.
+def WriteMul32Ld : SchedWriteRes<[HWPort1, HWPort0156, HWPort23]> {
+ let Latency = 8;
+ let NumMicroOps = 4;
+}
+def : InstRW<[WriteMul32Ld], (instregex "IMUL32m", "MUL32m")>;
+
+// r64.
+def WriteMul64 : SchedWriteRes<[HWPort1, HWPort6]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+}
+def : InstRW<[WriteMul64], (instregex "IMUL64r", "MUL64r")>;
+
+// m64.
+def WriteMul64Ld : SchedWriteRes<[HWPort1, HWPort6, HWPort23]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+}
+def : InstRW<[WriteMul64Ld], (instregex "IMUL64m", "MUL64m")>;
+
+// r16,r16.
+def WriteMul16rri : SchedWriteRes<[HWPort1, HWPort0156]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+def : InstRW<[WriteMul16rri], (instregex "IMUL16rri", "IMUL16rri8")>;
+
+// r16,m16.
+def WriteMul16rmi : SchedWriteRes<[HWPort1, HWPort0156, HWPort23]> {
+ let Latency = 8;
+ let NumMicroOps = 3;
+}
+def : InstRW<[WriteMul16rmi], (instregex "IMUL16rmi", "IMUL16rmi8")>;
+
+// MULX.
+// r32,r32,r32.
+def WriteMulX32 : SchedWriteRes<[HWPort1, HWPort056]> {
+ let Latency = 4;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1, 2];
+}
+def : InstRW<[WriteMulX32], (instregex "MULX32rr")>;
+
+// r32,r32,m32.
+def WriteMulX32Ld : SchedWriteRes<[HWPort1, HWPort056, HWPort23]> {
+ let Latency = 8;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1, 2, 1];
+}
+def : InstRW<[WriteMulX32Ld], (instregex "MULX32rm")>;
+
+// r64,r64,r64.
+def WriteMulX64 : SchedWriteRes<[HWPort1, HWPort6]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+def : InstRW<[WriteMulX64], (instregex "MULX64rr")>;
+
+// r64,r64,m64.
+def WriteMulX64Ld : SchedWriteRes<[HWPort1, HWPort6, HWPort23]> {
+ let Latency = 8;
+ let NumMicroOps = 3;
+}
+def : InstRW<[WriteMulX64Ld], (instregex "MULX64rm")>;
+
+// DIV.
+// r8.
+def WriteDiv8 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> {
+ let Latency = 22;
+ let NumMicroOps = 9;
+}
+def : InstRW<[WriteDiv8], (instregex "DIV8r")>;
+
+// r16.
+def WriteDiv16 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> {
+ let Latency = 23;
+ let NumMicroOps = 10;
+}
+def : InstRW<[WriteDiv16], (instregex "DIV16r")>;
+
+// r32.
+def WriteDiv32 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> {
+ let Latency = 22;
+ let NumMicroOps = 10;
+}
+def : InstRW<[WriteDiv32], (instregex "DIV32r")>;
+
+// r64.
+def WriteDiv64 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> {
+ let Latency = 32;
+ let NumMicroOps = 36;
+}
+def : InstRW<[WriteDiv64], (instregex "DIV64r")>;
+
+// IDIV.
+// r8.
+def WriteIDiv8 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> {
+ let Latency = 23;
+ let NumMicroOps = 9;
+}
+def : InstRW<[WriteIDiv8], (instregex "IDIV8r")>;
+
+// r16.
+def WriteIDiv16 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> {
+ let Latency = 23;
+ let NumMicroOps = 10;
+}
+def : InstRW<[WriteIDiv16], (instregex "IDIV16r")>;
+
+// r32.
+def WriteIDiv32 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> {
+ let Latency = 22;
+ let NumMicroOps = 9;
+}
+def : InstRW<[WriteIDiv32], (instregex "IDIV32r")>;
+
+// r64.
+def WriteIDiv64 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> {
+ let Latency = 39;
+ let NumMicroOps = 59;
+}
+def : InstRW<[WriteIDiv64], (instregex "IDIV64r")>;
+
+//-- Logic instructions --//
+
+// AND OR XOR.
+// m,r/i.
+def : InstRW<[Write2P0156_2P237_P4],
+ (instregex "(AND|OR|XOR)(8|16|32|64)m(r|i)",
+ "(AND|OR|XOR)(8|16|32|64)mi8", "(AND|OR|XOR)64mi32")>;
+
+// SHR SHL SAR.
+// m,i.
+def WriteShiftRMW : SchedWriteRes<[HWPort06, HWPort237, HWPort4]> {
+ let NumMicroOps = 4;
+ let ResourceCycles = [2, 1, 1];
+}
+def : InstRW<[WriteShiftRMW], (instregex "S(A|H)(R|L)(8|16|32|64)m(i|1)")>;
+
+// r,cl.
+def : InstRW<[Write3P06_Lat2], (instregex "S(A|H)(R|L)(8|16|32|64)rCL")>;
+
+// m,cl.
+def WriteShiftClLdRMW : SchedWriteRes<[HWPort06, HWPort23, HWPort4]> {
+ let NumMicroOps = 6;
+ let ResourceCycles = [3, 2, 1];
+}
+def : InstRW<[WriteShiftClLdRMW], (instregex "S(A|H)(R|L)(8|16|32|64)mCL")>;
+
+// ROR ROL.
+// r,1.
+def : InstRW<[Write2P06], (instregex "RO(R|L)(8|16|32|64)r1")>;
+
+// m,i.
+def WriteRotateRMW : SchedWriteRes<[HWPort06, HWPort237, HWPort4]> {
+ let NumMicroOps = 5;
+ let ResourceCycles = [2, 2, 1];
+}
+def : InstRW<[WriteRotateRMW], (instregex "RO(R|L)(8|16|32|64)mi")>;
+
+// r,cl.
+def : InstRW<[Write3P06_Lat2], (instregex "RO(R|L)(8|16|32|64)rCL")>;
+
+// m,cl.
+def WriteRotateRMWCL : SchedWriteRes<[]> {
+ let NumMicroOps = 6;
+}
+def : InstRW<[WriteRotateRMWCL], (instregex "RO(R|L)(8|16|32|64)mCL")>;
+
+// RCR RCL.
+// r,1.
+def WriteRCr1 : SchedWriteRes<[HWPort06, HWPort0156]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2, 1];
+}
+def : InstRW<[WriteRCr1], (instregex "RC(R|L)(8|16|32|64)r1")>;
+
+// m,1.
+def WriteRCm1 : SchedWriteRes<[]> {
+ let NumMicroOps = 6;
+}
+def : InstRW<[WriteRCm1], (instregex "RC(R|L)(8|16|32|64)m1")>;
+
+// r,i.
+def WriteRCri : SchedWriteRes<[HWPort0156]> {
+ let Latency = 6;
+ let NumMicroOps = 8;
+}
+def : InstRW<[WriteRCri], (instregex "RC(R|L)(8|16|32|64)r(i|CL)")>;
+
+// m,i.
+def WriteRCmi : SchedWriteRes<[]> {
+ let NumMicroOps = 11;
+}
+def : InstRW<[WriteRCmi], (instregex "RC(R|L)(8|16|32|64)m(i|CL)")>;
+
+// SHRD SHLD.
+// r,r,i.
+def WriteShDrr : SchedWriteRes<[HWPort1]> {
+ let Latency = 3;
+}
+def : InstRW<[WriteShDrr], (instregex "SH(R|L)D(16|32|64)rri8")>;
+
+// m,r,i.
+def WriteShDmr : SchedWriteRes<[]> {
+ let NumMicroOps = 5;
+}
+def : InstRW<[WriteShDmr], (instregex "SH(R|L)D(16|32|64)mri8")>;
+
+// r,r,cl.
+def WriteShlDCL : SchedWriteRes<[HWPort0156]> {
+ let Latency = 3;
+ let NumMicroOps = 4;
+}
+def : InstRW<[WriteShlDCL], (instregex "SHLD(16|32|64)rrCL")>;
+
+// r,r,cl.
+def WriteShrDCL : SchedWriteRes<[HWPort0156]> {
+ let Latency = 4;
+ let NumMicroOps = 4;
+}
+def : InstRW<[WriteShrDCL], (instregex "SHRD(16|32|64)rrCL")>;
+
+// m,r,cl.
+def WriteShDmrCL : SchedWriteRes<[]> {
+ let NumMicroOps = 7;
+}
+def : InstRW<[WriteShDmrCL], (instregex "SH(R|L)D(16|32|64)mrCL")>;
+
+// BT.
+// r,r/i.
+def : InstRW<[WriteShift], (instregex "BT(16|32|64)r(r|i8)")>;
+
+// m,r.
+def WriteBTmr : SchedWriteRes<[]> {
+ let NumMicroOps = 10;
+}
+def : InstRW<[WriteBTmr], (instregex "BT(16|32|64)mr")>;
+
+// m,i.
+def : InstRW<[WriteShiftLd], (instregex "BT(16|32|64)mi8")>;
+
+// BTR BTS BTC.
+// r,r,i.
+def : InstRW<[WriteShift], (instregex "BT(R|S|C)(16|32|64)r(r|i8)")>;
+
+// m,r.
+def WriteBTRSCmr : SchedWriteRes<[]> {
+ let NumMicroOps = 11;
+}
+def : InstRW<[WriteBTRSCmr], (instregex "BT(R|S|C)(16|32|64)mr")>;
+
+// m,i.
+def : InstRW<[WriteShiftLd], (instregex "BT(R|S|C)(16|32|64)mi8")>;
+
+// BSF BSR.
+// r,r.
+def : InstRW<[WriteP1_Lat3], (instregex "BS(R|F)(16|32|64)rr")>;
+// r,m.
+def : InstRW<[WriteP1_Lat3Ld], (instregex "BS(R|F)(16|32|64)rm")>;
+
+// SETcc.
+// r.
+def : InstRW<[WriteShift],
+ (instregex "SET(O|NO|B|AE|E|NE|BE|A|S|NS|P|NP|L|GE|LE|G)r")>;
+// m.
+def WriteSetCCm : SchedWriteRes<[HWPort06, HWPort237, HWPort4]> {
+ let NumMicroOps = 3;
+}
+def : InstRW<[WriteSetCCm],
+ (instregex "SET(O|NO|B|AE|E|NE|BE|A|S|NS|P|NP|L|GE|LE|G)m")>;
+
+// CLD STD.
+def WriteCldStd : SchedWriteRes<[HWPort15, HWPort6]> {
+ let NumMicroOps = 3;
+}
+def : InstRW<[WriteCldStd], (instregex "STD", "CLD")>;
+
+// LZCNT TZCNT.
+// r,r.
+def : InstRW<[WriteP1_Lat3], (instregex "(L|T)ZCNT(16|32|64)rr")>;
+// r,m.
+def : InstRW<[WriteP1_Lat3Ld], (instregex "(L|T)ZCNT(16|32|64)rm")>;
+
+// ANDN.
+// r,r.
+def : InstRW<[WriteP15], (instregex "ANDN(32|64)rr")>;
+// r,m.
+def : InstRW<[WriteP15Ld], (instregex "ANDN(32|64)rm")>;
+
+// BLSI BLSMSK BLSR.
+// r,r.
+def : InstRW<[WriteP15], (instregex "BLS(I|MSK|R)(32|64)rr")>;
+// r,m.
+def : InstRW<[WriteP15Ld], (instregex "BLS(I|MSK|R)(32|64)rm")>;
+
+// BEXTR.
+// r,r,r.
+def : InstRW<[Write2P0156_Lat2], (instregex "BEXTR(32|64)rr")>;
+// r,m,r.
+def : InstRW<[Write2P0156_Lat2Ld], (instregex "BEXTR(32|64)rm")>;
+
+// BZHI.
+// r,r,r.
+def : InstRW<[WriteP15], (instregex "BZHI(32|64)rr")>;
+// r,m,r.
+def : InstRW<[WriteP15Ld], (instregex "BZHI(32|64)rm")>;
+
+// PDEP PEXT.
+// r,r,r.
+def : InstRW<[WriteP1_Lat3], (instregex "PDEP(32|64)rr", "PEXT(32|64)rr")>;
+// r,m,r.
+def : InstRW<[WriteP1_Lat3Ld], (instregex "PDEP(32|64)rm", "PEXT(32|64)rm")>;
+
+//-- Control transfer instructions --//
+
+// J(E|R)CXZ.
+def WriteJCXZ : SchedWriteRes<[HWPort0156, HWPort6]> {
+ let NumMicroOps = 2;
+}
+def : InstRW<[WriteJCXZ], (instregex "JCXZ", "JECXZ_(32|64)", "JRCXZ")>;
+
+// LOOP.
+def WriteLOOP : SchedWriteRes<[]> {
+ let NumMicroOps = 7;
+}
+def : InstRW<[WriteLOOP], (instregex "LOOP")>;
+
+// LOOP(N)E
+def WriteLOOPE : SchedWriteRes<[]> {
+ let NumMicroOps = 11;
+}
+def : InstRW<[WriteLOOPE], (instregex "LOOPE", "LOOPNE")>;
+
+// CALL.
+// r.
+def WriteCALLr : SchedWriteRes<[HWPort237, HWPort4, HWPort6]> {
+ let NumMicroOps = 3;
+}
+def : InstRW<[WriteCALLr], (instregex "CALL(16|32)r")>;
+
+// m.
+def WriteCALLm : SchedWriteRes<[HWPort237, HWPort4, HWPort6]> {
+ let NumMicroOps = 4;
+ let ResourceCycles = [2, 1, 1];
+}
+def : InstRW<[WriteCALLm], (instregex "CALL(16|32)m")>;
+
+// RET.
+def WriteRET : SchedWriteRes<[HWPort237, HWPort6]> {
+ let NumMicroOps = 2;
+}
+def : InstRW<[WriteRET], (instregex "RET(L|Q|W)", "LRET(L|Q|W)")>;
+
+// i.
+def WriteRETI : SchedWriteRes<[HWPort23, HWPort6, HWPort015]> {
+ let NumMicroOps = 4;
+ let ResourceCycles = [1, 2, 1];
+}
+def : InstRW<[WriteRETI], (instregex "RETI(L|Q|W)", "LRETI(L|Q|W)")>;
+
+// BOUND.
+// r,m.
+def WriteBOUND : SchedWriteRes<[]> {
+ let NumMicroOps = 15;
+}
+def : InstRW<[WriteBOUND], (instregex "BOUNDS(16|32)rm")>;
+
+// INTO.
+def WriteINTO : SchedWriteRes<[]> {
+ let NumMicroOps = 4;
+}
+def : InstRW<[WriteINTO], (instregex "INTO")>;
+
+//-- String instructions --//
+
+// LODSB/W.
+def : InstRW<[Write2P0156_P23], (instregex "LODS(B|W)")>;
+
+// LODSD/Q.
+def : InstRW<[WriteP0156_P23], (instregex "LODS(L|Q)")>;
+
+// STOS.
+def WriteSTOS : SchedWriteRes<[HWPort23, HWPort0156, HWPort4]> {
+ let NumMicroOps = 3;
+}
+def : InstRW<[WriteSTOS], (instregex "STOS(B|L|Q|W)")>;
+
+// MOVS.
+def WriteMOVS : SchedWriteRes<[HWPort23, HWPort4, HWPort0156]> {
+ let Latency = 4;
+ let NumMicroOps = 5;
+ let ResourceCycles = [2, 1, 2];
+}
+def : InstRW<[WriteMOVS], (instregex "MOVS(B|L|Q|W)")>;
+
+// SCAS.
+def : InstRW<[Write2P0156_P23], (instregex "SCAS(B|W|L|Q)")>;
+
+// CMPS.
+def WriteCMPS : SchedWriteRes<[HWPort23, HWPort0156]> {
+ let Latency = 4;
+ let NumMicroOps = 5;
+ let ResourceCycles = [2, 3];
+}
+def : InstRW<[WriteCMPS], (instregex "CMPS(B|L|Q|W)")>;
+
+//-- Synchronization instructions --//
+
+// XADD.
+def WriteXADD : SchedWriteRes<[]> {
+ let NumMicroOps = 5;
+}
+def : InstRW<[WriteXADD], (instregex "XADD(8|16|32|64)rm")>;
+
+// CMPXCHG.
+def WriteCMPXCHG : SchedWriteRes<[]> {
+ let NumMicroOps = 6;
+}
+def : InstRW<[WriteCMPXCHG], (instregex "CMPXCHG(8|16|32|64)rm")>;
+
+// CMPXCHG8B.
+def WriteCMPXCHG8B : SchedWriteRes<[]> {
+ let NumMicroOps = 15;
+}
+def : InstRW<[WriteCMPXCHG8B], (instregex "CMPXCHG8B")>;
+
+// CMPXCHG16B.
+def WriteCMPXCHG16B : SchedWriteRes<[]> {
+ let NumMicroOps = 22;
+}
+def : InstRW<[WriteCMPXCHG16B], (instregex "CMPXCHG16B")>;
+
+//-- Other --//
+
+// PAUSE.
+def WritePAUSE : SchedWriteRes<[HWPort05, HWPort6]> {
+ let NumMicroOps = 5;
+ let ResourceCycles = [1, 3];
+}
+def : InstRW<[WritePAUSE], (instregex "PAUSE")>;
+
+// LEAVE.
+def : InstRW<[Write2P0156_P23], (instregex "LEAVE")>;
+
+// XGETBV.
+def WriteXGETBV : SchedWriteRes<[]> {
+ let NumMicroOps = 8;
+}
+def : InstRW<[WriteXGETBV], (instregex "XGETBV")>;
+
+// RDTSC.
+def WriteRDTSC : SchedWriteRes<[]> {
+ let NumMicroOps = 15;
+}
+def : InstRW<[WriteRDTSC], (instregex "RDTSC")>;
+
+// RDPMC.
+def WriteRDPMC : SchedWriteRes<[]> {
+ let NumMicroOps = 34;
+}
+def : InstRW<[WriteRDPMC], (instregex "RDPMC")>;
+
+// RDRAND.
+def WriteRDRAND : SchedWriteRes<[HWPort23, HWPort015]> {
+ let NumMicroOps = 17;
+ let ResourceCycles = [1, 16];
+}
+def : InstRW<[WriteRDRAND], (instregex "RDRAND(16|32|64)r")>;
+
+//=== Floating Point x87 Instructions ===//
+//-- Move instructions --//
+
+// FLD.
+// m80.
+def : InstRW<[WriteP01], (instregex "LD_Frr")>;
+
+def WriteLD_F80m : SchedWriteRes<[HWPort01, HWPort23]> {
+ let Latency = 4;
+ let NumMicroOps = 4;
+ let ResourceCycles = [2, 2];
+}
+def : InstRW<[WriteLD_F80m], (instregex "LD_F80m")>;
+
+// FBLD.
+// m80.
+def WriteFBLD : SchedWriteRes<[]> {
+ let Latency = 47;
+ let NumMicroOps = 43;
+}
+def : InstRW<[WriteFBLD], (instregex "FBLDm")>;
+
+// FST(P).
+// r.
+def : InstRW<[WriteP01], (instregex "ST_(F|FP)rr")>;
+
+// m80.
+def WriteST_FP80m : SchedWriteRes<[HWPort0156, HWPort23, HWPort4]> {
+ let NumMicroOps = 7;
+ let ResourceCycles = [3, 2, 2];
+}
+def : InstRW<[WriteST_FP80m], (instregex "ST_FP80m")>;
+
+// FBSTP.
+// m80.
+def WriteFBSTP : SchedWriteRes<[]> {
+ let NumMicroOps = 226;
+}
+def : InstRW<[WriteFBSTP], (instregex "FBSTPm")>;
+
+// FXCHG.
+def : InstRW<[WriteNop], (instregex "XCH_F")>;
+
+// FILD.
+def WriteFILD : SchedWriteRes<[HWPort01, HWPort23]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+def : InstRW<[WriteFILD], (instregex "ILD_F(16|32|64)m")>;
+
+// FIST(P) FISTTP.
+def WriteFIST : SchedWriteRes<[HWPort1, HWPort23, HWPort4]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+}
+def : InstRW<[WriteFIST], (instregex "IST_(F|FP)(16|32)m")>;
+
+// FLDZ.
+def : InstRW<[WriteP01], (instregex "LD_F0")>;
+
+// FLD1.
+def : InstRW<[Write2P01], (instregex "LD_F1")>;
+
+// FLDPI FLDL2E etc.
+def : InstRW<[Write2P01], (instregex "FLDPI", "FLDL2(T|E)" "FLDL(G|N)2")>;
+
+// FCMOVcc.
+def WriteFCMOVcc : SchedWriteRes<[HWPort0, HWPort5]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2, 1];
+}
+def : InstRW<[WriteFCMOVcc], (instregex "CMOV(B|BE|P|NB|NBE|NE|NP)_F")>;
+
+// FNSTSW.
+// AX.
+def WriteFNSTSW : SchedWriteRes<[HWPort0, HWPort0156]> {
+ let NumMicroOps = 2;
+}
+def : InstRW<[WriteFNSTSW], (instregex "FNSTSW16r")>;
+
+// m16.
+def WriteFNSTSWm : SchedWriteRes<[HWPort0, HWPort4, HWPort237]> {
+ let Latency = 6;
+ let NumMicroOps = 3;
+}
+def : InstRW<[WriteFNSTSWm], (instregex "FNSTSWm")>;
+
+// FLDCW.
+def WriteFLDCW : SchedWriteRes<[HWPort01, HWPort23, HWPort6]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+}
+def : InstRW<[WriteFLDCW], (instregex "FLDCW16m")>;
+
+// FNSTCW.
+def WriteFNSTCW : SchedWriteRes<[HWPort237, HWPort4, HWPort6]> {
+ let NumMicroOps = 3;
+}
+def : InstRW<[WriteFNSTCW], (instregex "FNSTCW16m")>;
+
+// FINCSTP FDECSTP.
+def : InstRW<[WriteP01], (instregex "FINCSTP", "FDECSTP")>;
+
+// FFREE.
+def : InstRW<[WriteP01], (instregex "FFREE")>;
+
+// FNSAVE.
+def WriteFNSAVE : SchedWriteRes<[]> {
+ let NumMicroOps = 147;
+}
+def : InstRW<[WriteFNSAVE], (instregex "FSAVEm")>;
+
+// FRSTOR.
+def WriteFRSTOR : SchedWriteRes<[]> {
+ let NumMicroOps = 90;
+}
+def : InstRW<[WriteFRSTOR], (instregex "FRSTORm")>;
+
+//-- Arithmetic instructions --//
+
+// FABS.
+def : InstRW<[WriteP0], (instregex "ABS_F")>;
+
+// FCHS.
+def : InstRW<[WriteP0], (instregex "CHS_F")>;
+
+// FCOM(P) FUCOM(P).
+// r.
+def : InstRW<[WriteP1], (instregex "COM_FST0r", "COMP_FST0r", "UCOM_Fr",
+ "UCOM_FPr")>;
+// m.
+def : InstRW<[WriteP1_P23], (instregex "FCOM(32|64)m", "FCOMP(32|64)m")>;
+
+// FCOMPP FUCOMPP.
+// r.
+def : InstRW<[Write2P01], (instregex "FCOMPP", "UCOM_FPPr")>;
+
+// FCOMI(P) FUCOMI(P).
+// r.
+def : InstRW<[Write3P01], (instregex "COM_FIr", "COM_FIPr", "UCOM_FIr",
+ "UCOM_FIPr")>;
+
+// FICOM(P).
+def : InstRW<[Write2P1_P23], (instregex "FICOM(16|32)m", "FICOMP(16|32)m")>;
+
+// FTST.
+def : InstRW<[WriteP1], (instregex "TST_F")>;
+
+// FXAM.
+def : InstRW<[Write2P1], (instregex "FXAM")>;
+
+// FPREM.
+def WriteFPREM : SchedWriteRes<[]> {
+ let Latency = 19;
+ let NumMicroOps = 28;
+}
+def : InstRW<[WriteFPREM], (instregex "FPREM")>;
+
+// FPREM1.
+def WriteFPREM1 : SchedWriteRes<[]> {
+ let Latency = 27;
+ let NumMicroOps = 41;
+}
+def : InstRW<[WriteFPREM1], (instregex "FPREM1")>;
+
+// FRNDINT.
+def WriteFRNDINT : SchedWriteRes<[]> {
+ let Latency = 11;
+ let NumMicroOps = 17;
+}
+def : InstRW<[WriteFRNDINT], (instregex "FRNDINT")>;
+
+//-- Math instructions --//
+
+// FSCALE.
+def WriteFSCALE : SchedWriteRes<[]> {
+ let Latency = 75; // 49-125
+ let NumMicroOps = 50; // 25-75
+}
+def : InstRW<[WriteFSCALE], (instregex "FSCALE")>;
+
+// FXTRACT.
+def WriteFXTRACT : SchedWriteRes<[]> {
+ let Latency = 15;
+ let NumMicroOps = 17;
+}
+def : InstRW<[WriteFXTRACT], (instregex "FXTRACT")>;
+
+//-- Other instructions --//
+
+// FNOP.
+def : InstRW<[WriteP01], (instregex "FNOP")>;
+
+// WAIT.
+def : InstRW<[Write2P01], (instregex "WAIT")>;
+
+// FNCLEX.
+def : InstRW<[Write5P0156], (instregex "FNCLEX")>;
+
+// FNINIT.
+def WriteFNINIT : SchedWriteRes<[]> {
+ let NumMicroOps = 26;
+}
+def : InstRW<[WriteFNINIT], (instregex "FNINIT")>;
+
+//=== Integer MMX and XMM Instructions ===//
+//-- Move instructions --//
+
+// MOVD.
+// r32/64 <- (x)mm.
+def : InstRW<[WriteP0], (instregex "MMX_MOVD64grr", "MMX_MOVD64from64rr",
+ "VMOVPDI2DIrr", "MOVPDI2DIrr")>;
+
+// (x)mm <- r32/64.
+def : InstRW<[WriteP5], (instregex "MMX_MOVD64rr", "MMX_MOVD64to64rr",
+ "VMOVDI2PDIrr", "MOVDI2PDIrr")>;
+
+// MOVQ.
+// r64 <- (x)mm.
+def : InstRW<[WriteP0], (instregex "VMOVPQIto64rr")>;
+
+// (x)mm <- r64.
+def : InstRW<[WriteP5], (instregex "VMOV64toPQIrr", "VMOVZQI2PQIrr")>;
+
+// (x)mm <- (x)mm.
+def : InstRW<[WriteP015], (instregex "MMX_MOVQ64rr")>;
+
+// (V)MOVDQA/U.
+// x <- x.
+def : InstRW<[WriteP015], (instregex "MOVDQ(A|U)rr", "VMOVDQ(A|U)rr",
+ "MOVDQ(A|U)rr_REV", "VMOVDQ(A|U)rr_REV",
+ "VMOVDQ(A|U)Yrr", "VMOVDQ(A|U)Yrr_REV")>;
+
+// MOVDQ2Q.
+def : InstRW<[WriteP01_P5], (instregex "MMX_MOVDQ2Qrr")>;
+
+// MOVQ2DQ.
+def : InstRW<[WriteP015], (instregex "MMX_MOVQ2DQrr")>;
+
+// PACKSSWB/DW.
+// mm <- mm.
+def WriteMMXPACKSSrr : SchedWriteRes<[HWPort5]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+ let ResourceCycles = [3];
+}
+def : InstRW<[WriteMMXPACKSSrr], (instregex "MMX_PACKSSDWirr",
+ "MMX_PACKSSWBirr", "MMX_PACKUSWBirr")>;
+
+// mm <- m64.
+def WriteMMXPACKSSrm : SchedWriteRes<[HWPort23, HWPort5]> {
+ let Latency = 4;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1, 3];
+}
+def : InstRW<[WriteMMXPACKSSrm], (instregex "MMX_PACKSSDWirm",
+ "MMX_PACKSSWBirm", "MMX_PACKUSWBirm")>;
+
+// VPMOVSX/ZX BW BD BQ WD WQ DQ.
+// y <- x.
+def WriteVPMOVSX : SchedWriteRes<[HWPort5]> {
+ let Latency = 3;
+ let NumMicroOps = 1;
+}
+def : InstRW<[WriteVPMOVSX], (instregex "VPMOV(SX|ZX)(BW|BD|BQ|WD|WQ|DQ)Yrr")>;
+
+// PBLENDW.
+// x,x,i / v,v,v,i
+def WritePBLENDWr : SchedWriteRes<[HWPort5]>;
+def : InstRW<[WritePBLENDWr], (instregex "(V?)PBLENDW(Y?)rri")>;
+
+// x,m,i / v,v,m,i
+def WritePBLENDWm : SchedWriteRes<[HWPort5, HWPort23]> {
+ let NumMicroOps = 2;
+ let Latency = 4;
+ let ResourceCycles = [1, 1];
+}
+def : InstRW<[WritePBLENDWm, ReadAfterLd], (instregex "(V?)PBLENDW(Y?)rmi")>;
+
+// VPBLENDD.
+// v,v,v,i.
+def WriteVPBLENDDr : SchedWriteRes<[HWPort015]>;
+def : InstRW<[WriteVPBLENDDr], (instregex "VPBLENDD(Y?)rri")>;
+
+// v,v,m,i
+def WriteVPBLENDDm : SchedWriteRes<[HWPort015, HWPort23]> {
+ let NumMicroOps = 2;
+ let Latency = 4;
+ let ResourceCycles = [1, 1];
+}
+def : InstRW<[WriteVPBLENDDm, ReadAfterLd], (instregex "VPBLENDD(Y?)rmi")>;
+
+// MASKMOVQ.
+def WriteMASKMOVQ : SchedWriteRes<[HWPort0, HWPort4, HWPort23]> {
+ let Latency = 13;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1, 1, 2];
+}
+def : InstRW<[WriteMASKMOVQ], (instregex "MMX_MASKMOVQ(64)?")>;
+
+// MASKMOVDQU.
+def WriteMASKMOVDQU : SchedWriteRes<[HWPort04, HWPort56, HWPort23]> {
+ let Latency = 14;
+ let NumMicroOps = 10;
+ let ResourceCycles = [4, 2, 4];
+}
+def : InstRW<[WriteMASKMOVDQU], (instregex "(V?)MASKMOVDQU(64)?")>;
+
+// VPMASKMOV D/Q.
+// v,v,m.
+def WriteVPMASKMOVr : SchedWriteRes<[HWPort5, HWPort23]> {
+ let Latency = 4;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2, 1];
+}
+def : InstRW<[WriteVPMASKMOVr, ReadAfterLd],
+ (instregex "VPMASKMOV(D|Q)(Y?)rm")>;
+
+// m, v,v.
+def WriteVPMASKMOVm : SchedWriteRes<[HWPort0, HWPort1, HWPort4, HWPort23]> {
+ let Latency = 13;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1, 1, 1, 1];
+}
+def : InstRW<[WriteVPMASKMOVm], (instregex "VPMASKMOV(D|Q)(Y?)mr")>;
+
+// PMOVMSKB.
+def WritePMOVMSKB : SchedWriteRes<[HWPort0]> {
+ let Latency = 3;
+}
+def : InstRW<[WritePMOVMSKB], (instregex "(V|MMX_)?PMOVMSKB(Y?)rr")>;
+
+// PEXTR B/W/D/Q.
+// r32,x,i.
+def WritePEXTRr : SchedWriteRes<[HWPort0, HWPort5]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1, 1];
+}
+def : InstRW<[WritePEXTRr], (instregex "PEXTR(B|W|D|Q)rr", "MMX_PEXTRWirri")>;
+
+// m8,x,i.
+def WritePEXTRm : SchedWriteRes<[HWPort23, HWPort4, HWPort5]> {
+ let NumMicroOps = 3;
+ let ResourceCycles = [1, 1, 1];
+}
+def : InstRW<[WritePEXTRm], (instregex "PEXTR(B|W|D|Q)mr")>;
+
+// VPBROADCAST B/W.
+// x, m8/16.
+def WriteVPBROADCAST128Ld : SchedWriteRes<[HWPort01, HWPort23, HWPort5]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1, 1, 1];
+}
+def : InstRW<[WriteVPBROADCAST128Ld, ReadAfterLd],
+ (instregex "VPBROADCAST(B|W)rm")>;
+
+// y, m8/16
+def WriteVPBROADCAST256Ld : SchedWriteRes<[HWPort01, HWPort23, HWPort5]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1, 1, 1];
+}
+def : InstRW<[WriteVPBROADCAST256Ld, ReadAfterLd],
+ (instregex "VPBROADCAST(B|W)Yrm")>;
+
+// VPGATHERDD.
+// x.
+def WriteVPGATHERDD128 : SchedWriteRes<[]> {
+ let NumMicroOps = 20;
+}
+def : InstRW<[WriteVPGATHERDD128, ReadAfterLd], (instregex "VPGATHERDDrm")>;
+
+// y.
+def WriteVPGATHERDD256 : SchedWriteRes<[]> {
+ let NumMicroOps = 34;
+}
+def : InstRW<[WriteVPGATHERDD256, ReadAfterLd], (instregex "VPGATHERDDYrm")>;
+
+// VPGATHERQD.
+// x.
+def WriteVPGATHERQD128 : SchedWriteRes<[]> {
+ let NumMicroOps = 15;
+}
+def : InstRW<[WriteVPGATHERQD128, ReadAfterLd], (instregex "VPGATHERQDrm")>;
+
+// y.
+def WriteVPGATHERQD256 : SchedWriteRes<[]> {
+ let NumMicroOps = 22;
+}
+def : InstRW<[WriteVPGATHERQD256, ReadAfterLd], (instregex "VPGATHERQDYrm")>;
+
+// VPGATHERDQ.
+// x.
+def WriteVPGATHERDQ128 : SchedWriteRes<[]> {
+ let NumMicroOps = 12;
+}
+def : InstRW<[WriteVPGATHERDQ128, ReadAfterLd], (instregex "VPGATHERDQrm")>;
+
+// y.
+def WriteVPGATHERDQ256 : SchedWriteRes<[]> {
+ let NumMicroOps = 20;
+}
+def : InstRW<[WriteVPGATHERDQ256, ReadAfterLd], (instregex "VPGATHERDQYrm")>;
+
+// VPGATHERQQ.
+// x.
+def WriteVPGATHERQQ128 : SchedWriteRes<[]> {
+ let NumMicroOps = 14;
+}
+def : InstRW<[WriteVPGATHERQQ128, ReadAfterLd], (instregex "VPGATHERQQrm")>;
+
+// y.
+def WriteVPGATHERQQ256 : SchedWriteRes<[]> {
+ let NumMicroOps = 22;
+}
+def : InstRW<[WriteVPGATHERQQ256, ReadAfterLd], (instregex "VPGATHERQQYrm")>;
+
+//-- Arithmetic instructions --//
+
+// PHADD|PHSUB (S) W/D.
+// v <- v,v.
+def WritePHADDSUBr : SchedWriteRes<[HWPort1, HWPort5]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1, 2];
+}
+def : InstRW<[WritePHADDSUBr], (instregex "MMX_PHADD(W?)rr64",
+ "MMX_PHADDSWrr64",
+ "MMX_PHSUB(W|D)rr64",
+ "MMX_PHSUBSWrr64",
+ "(V?)PH(ADD|SUB)(W|D)(Y?)rr",
+ "(V?)PH(ADD|SUB)SWrr(256)?")>;
+
+// v <- v,m.
+def WritePHADDSUBm : SchedWriteRes<[HWPort1, HWPort5, HWPort23]> {
+ let Latency = 6;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1, 2, 1];
+}
+def : InstRW<[WritePHADDSUBm, ReadAfterLd],
+ (instregex "MMX_PHADD(W?)rm64",
+ "MMX_PHADDSWrm64",
+ "MMX_PHSUB(W|D)rm64",
+ "MMX_PHSUBSWrm64",
+ "(V?)PH(ADD|SUB)(W|D)(Y?)rm",
+ "(V?)PH(ADD|SUB)SWrm(128|256)?")>;
+
+// PCMPGTQ.
+// v <- v,v.
+def WritePCMPGTQr : SchedWriteRes<[HWPort0]> {
+ let Latency = 5;
+ let NumMicroOps = 1;
+}
+def : InstRW<[WritePCMPGTQr], (instregex "(V?)PCMPGTQ(Y?)rr")>;
+
+// v <- v,m.
+def WritePCMPGTQm : SchedWriteRes<[HWPort0, HWPort23]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1, 1];
+}
+def : InstRW<[WritePCMPGTQm, ReadAfterLd], (instregex "(V?)PCMPGTQ(Y?)rm")>;
+
+// PMULLD.
+// x,x / y,y,y.
+def WritePMULLDr : SchedWriteRes<[HWPort0]> {
+ let Latency = 10;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+def : InstRW<[WritePMULLDr], (instregex "(V?)PMULLD(Y?)rr")>;
+
+// x,m / y,y,m.
+def WritePMULLDm : SchedWriteRes<[HWPort0, HWPort23]> {
+ let Latency = 10;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2, 1];
+}
+def : InstRW<[WritePMULLDm, ReadAfterLd], (instregex "(V?)PMULLD(Y?)rm")>;
+
+//-- Logic instructions --//
+
+// PTEST.
+// v,v.
+def WritePTESTr : SchedWriteRes<[HWPort0, HWPort5]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1, 1];
+}
+def : InstRW<[WritePTESTr], (instregex "(V?)PTEST(Y?)rr")>;
+
+// v,m.
+def WritePTESTm : SchedWriteRes<[HWPort0, HWPort5, HWPort23]> {
+ let Latency = 6;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1, 1, 1];
+}
+def : InstRW<[WritePTESTm, ReadAfterLd], (instregex "(V?)PTEST(Y?)rm")>;
+
+// PSLL,PSRL,PSRA W/D/Q.
+// x,x / v,v,x.
+def WritePShift : SchedWriteRes<[HWPort0, HWPort5]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1, 1];
+}
+def : InstRW<[WritePShift], (instregex "(V?)PS(LL|RL|RA)(W|D|Q)(Y?)rr")>;
+
+// PSLL,PSRL DQ.
+def : InstRW<[WriteP5], (instregex "(V?)PS(R|L)LDQ(Y?)ri")>;
+
+//-- Other --//
+
+// EMMS.
+def WriteEMMS : SchedWriteRes<[]> {
+ let Latency = 13;
+ let NumMicroOps = 31;
+}
+def : InstRW<[WriteEMMS], (instregex "MMX_EMMS")>;
+
+//=== Floating Point XMM and YMM Instructions ===//
+//-- Move instructions --//
+
+// MOVMSKP S/D.
+// r32 <- x.
+def WriteMOVMSKPr : SchedWriteRes<[HWPort0]> {
+ let Latency = 3;
+}
+def : InstRW<[WriteMOVMSKPr], (instregex "(V?)MOVMSKP(S|D)rr")>;
+
+// r32 <- y.
+def WriteVMOVMSKPYr : SchedWriteRes<[HWPort0]> {
+ let Latency = 2;
+}
+def : InstRW<[WriteVMOVMSKPYr], (instregex "VMOVMSKP(S|D)Yrr")>;
+
+// VPERM2F128.
+def : InstRW<[WriteFShuffle256], (instregex "VPERM2F128rr")>;
+def : InstRW<[WriteFShuffle256Ld, ReadAfterLd], (instregex "VPERM2F128rm")>;
+
+// BLENDVP S/D.
+def : InstRW<[WriteFVarBlend], (instregex "BLENDVP(S|D)rr0")>;
+def : InstRW<[WriteFVarBlendLd, ReadAfterLd], (instregex "BLENDVP(S|D)rm0")>;
+
+// VBROADCASTF128.
+def : InstRW<[WriteLoad], (instregex "VBROADCASTF128")>;
+
+// EXTRACTPS.
+// r32,x,i.
+def WriteEXTRACTPSr : SchedWriteRes<[HWPort0, HWPort5]> {
+ let NumMicroOps = 2;
+ let ResourceCycles = [1, 1];
+}
+def : InstRW<[WriteEXTRACTPSr], (instregex "(V?)EXTRACTPSrr")>;
+
+// m32,x,i.
+def WriteEXTRACTPSm : SchedWriteRes<[HWPort0, HWPort5, HWPort23]> {
+ let Latency = 4;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1, 1, 1];
+}
+def : InstRW<[WriteEXTRACTPSm], (instregex "(V?)EXTRACTPSmr")>;
+
+// VEXTRACTF128.
+// x,y,i.
+def : InstRW<[WriteFShuffle256], (instregex "VEXTRACTF128rr")>;
+
+// m128,y,i.
+def WriteVEXTRACTF128m : SchedWriteRes<[HWPort23, HWPort4]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1, 1];
+}
+def : InstRW<[WriteVEXTRACTF128m], (instregex "VEXTRACTF128mr")>;
+
+// VINSERTF128.
+// y,y,x,i.
+def : InstRW<[WriteFShuffle256], (instregex "VINSERTF128rr")>;
+
+// y,y,m128,i.
+def WriteVINSERTF128m : SchedWriteRes<[HWPort015, HWPort23]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1, 1];
+}
+def : InstRW<[WriteVINSERTF128m, ReadAfterLd], (instregex "VINSERTF128rm")>;
+
+// VMASKMOVP S/D.
+// v,v,m.
+def WriteVMASKMOVPrm : SchedWriteRes<[HWPort5, HWPort23]> {
+ let Latency = 4;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2, 1];
+}
+def : InstRW<[WriteVMASKMOVPrm], (instregex "VMASKMOVP(S|D)(Y?)rm")>;
+
+// m128,x,x.
+def WriteVMASKMOVPmr : SchedWriteRes<[HWPort0, HWPort1, HWPort4, HWPort23]> {
+ let Latency = 13;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1, 1, 1, 1];
+}
+def : InstRW<[WriteVMASKMOVPmr], (instregex "VMASKMOVP(S|D)mr")>;
+
+// m256,y,y.
+def WriteVMASKMOVPYmr : SchedWriteRes<[HWPort0, HWPort1, HWPort4, HWPort23]> {
+ let Latency = 14;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1, 1, 1, 1];
+}
+def : InstRW<[WriteVMASKMOVPYmr], (instregex "VMASKMOVP(S|D)Ymr")>;
+
+// VGATHERDPS.
+// x.
+def WriteVGATHERDPS128 : SchedWriteRes<[]> {
+ let NumMicroOps = 20;
+}
+def : InstRW<[WriteVGATHERDPS128, ReadAfterLd], (instregex "VGATHERDPSrm")>;
+
+// y.
+def WriteVGATHERDPS256 : SchedWriteRes<[]> {
+ let NumMicroOps = 34;
+}
+def : InstRW<[WriteVGATHERDPS256, ReadAfterLd], (instregex "VGATHERDPSYrm")>;
+
+// VGATHERQPS.
+// x.
+def WriteVGATHERQPS128 : SchedWriteRes<[]> {
+ let NumMicroOps = 15;
+}
+def : InstRW<[WriteVGATHERQPS128, ReadAfterLd], (instregex "VGATHERQPSrm")>;
+
+// y.
+def WriteVGATHERQPS256 : SchedWriteRes<[]> {
+ let NumMicroOps = 22;
+}
+def : InstRW<[WriteVGATHERQPS256, ReadAfterLd], (instregex "VGATHERQPSYrm")>;
+
+// VGATHERDPD.
+// x.
+def WriteVGATHERDPD128 : SchedWriteRes<[]> {
+ let NumMicroOps = 12;
+}
+def : InstRW<[WriteVGATHERDPD128, ReadAfterLd], (instregex "VGATHERDPDrm")>;
+
+// y.
+def WriteVGATHERDPD256 : SchedWriteRes<[]> {
+ let NumMicroOps = 20;
+}
+def : InstRW<[WriteVGATHERDPD256, ReadAfterLd], (instregex "VGATHERDPDYrm")>;
+
+// VGATHERQPD.
+// x.
+def WriteVGATHERQPD128 : SchedWriteRes<[]> {
+ let NumMicroOps = 14;
+}
+def : InstRW<[WriteVGATHERQPD128, ReadAfterLd], (instregex "VGATHERQPDrm")>;
+
+// y.
+def WriteVGATHERQPD256 : SchedWriteRes<[]> {
+ let NumMicroOps = 22;
+}
+def : InstRW<[WriteVGATHERQPD256, ReadAfterLd], (instregex "VGATHERQPDYrm")>;
+
+//-- Conversion instructions --//
+
+// CVTPD2PS.
+// x,x.
+def : InstRW<[WriteP1_P5_Lat4], (instregex "(V?)CVTPD2PSrr")>;
+
+// x,m128.
+def : InstRW<[WriteP1_P5_Lat4Ld], (instregex "(V?)CVTPD2PS(X?)rm")>;
+
+// x,y.
+def WriteCVTPD2PSYrr : SchedWriteRes<[HWPort1, HWPort5]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1, 1];
+}
+def : InstRW<[WriteCVTPD2PSYrr], (instregex "(V?)CVTPD2PSYrr")>;
+
+// x,m256.
+def WriteCVTPD2PSYrm : SchedWriteRes<[HWPort1, HWPort5, HWPort23]> {
+ let Latency = 9;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1, 1, 1];
+}
+def : InstRW<[WriteCVTPD2PSYrm], (instregex "(V?)CVTPD2PSYrm")>;
+
+// CVTSD2SS.
+// x,x.
+def : InstRW<[WriteP1_P5_Lat4], (instregex "(Int_)?(V)?CVTSD2SSrr")>;
+
+// x,m64.
+def : InstRW<[WriteP1_P5_Lat4Ld], (instregex "(Int_)?(V)?CVTSD2SSrm")>;
+
+// CVTPS2PD.
+// x,x.
+def WriteCVTPS2PDrr : SchedWriteRes<[HWPort0, HWPort5]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1, 1];
+}
+def : InstRW<[WriteCVTPS2PDrr], (instregex "(V?)CVTPS2PDrr")>;
+
+// x,m64.
+// y,m128.
+def WriteCVTPS2PDrm : SchedWriteRes<[HWPort0, HWPort23]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1, 1];
+}
+def : InstRW<[WriteCVTPS2PDrm], (instregex "(V?)CVTPS2PD(Y?)rm")>;
+
+// y,x.
+def WriteVCVTPS2PDYrr : SchedWriteRes<[HWPort0, HWPort5]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1, 1];
+}
+def : InstRW<[WriteVCVTPS2PDYrr], (instregex "VCVTPS2PDYrr")>;
+
+// CVTSS2SD.
+// x,x.
+def WriteCVTSS2SDrr : SchedWriteRes<[HWPort0, HWPort5]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1, 1];
+}
+def : InstRW<[WriteCVTSS2SDrr], (instregex "(Int_)?(V?)CVTSS2SDrr")>;
+
+// x,m32.
+def WriteCVTSS2SDrm : SchedWriteRes<[HWPort0, HWPort23]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1, 1];
+}
+def : InstRW<[WriteCVTSS2SDrm], (instregex "(Int_)?(V?)CVTSS2SDrm")>;
+
+// CVTDQ2PD.
+// x,x.
+def : InstRW<[WriteP1_P5_Lat4], (instregex "(V)?CVTDQ2PDrr")>;
+
+// y,x.
+def : InstRW<[WriteP1_P5_Lat6], (instregex "VCVTDQ2PDYrr")>;
+
+// CVT(T)PD2DQ.
+// x,x.
+def : InstRW<[WriteP1_P5_Lat4], (instregex "(V?)CVT(T?)PD2DQrr")>;
+// x,m128.
+def : InstRW<[WriteP1_P5_Lat4Ld], (instregex "(V?)CVT(T?)PD2DQrm")>;
+// x,y.
+def : InstRW<[WriteP1_P5_Lat6], (instregex "VCVT(T?)PD2DQYrr")>;
+// x,m256.
+def : InstRW<[WriteP1_P5_Lat6Ld], (instregex "VCVT(T?)PD2DQYrm")>;
+
+// CVT(T)PS2PI.
+// mm,x.
+def : InstRW<[WriteP1_P5_Lat4], (instregex "MMX_CVT(T?)PS2PIirr")>;
+
+// CVTPI2PD.
+// x,mm.
+def : InstRW<[WriteP1_P5_Lat4], (instregex "MMX_CVT(T?)PI2PDirr")>;
+
+// CVT(T)PD2PI.
+// mm,x.
+def : InstRW<[WriteP1_P5_Lat4], (instregex "MMX_CVT(T?)PD2PIirr")>;
+
+// CVTSI2SS.
+// x,r32.
+def : InstRW<[WriteP1_P5_Lat4], (instregex "(Int_)?(V?)CVT(T?)SI2SS(64)?rr")>;
+
+// CVT(T)SS2SI.
+// r32,x.
+def : InstRW<[WriteP0_P1_Lat4], (instregex "(Int_)?(V?)CVT(T?)SS2SI(64)?rr")>;
+// r32,m32.
+def : InstRW<[WriteP0_P1_Lat4Ld], (instregex "(Int_)?(V?)CVT(T?)SS2SI(64)?rm")>;
+
+// CVTSI2SD.
+// x,r32/64.
+def : InstRW<[WriteP0_P1_Lat4], (instregex "(Int_)?(V?)CVTSI2SD(64)?rr")>;
+
+// CVTSD2SI.
+// r32/64
+def : InstRW<[WriteP0_P1_Lat4], (instregex "(Int_)?(V?)CVT(T?)SD2SI(64)?rr")>;
+// r32,m32.
+def : InstRW<[WriteP0_P1_Lat4Ld], (instregex "(Int_)?(V?)CVT(T?)SD2SI(64)?rm")>;
+
+// VCVTPS2PH.
+// x,v,i.
+def : InstRW<[WriteP1_P5_Lat4], (instregex "VCVTPS2PH(Y?)rr")>;
+// m,v,i.
+def : InstRW<[WriteP1_P5_Lat4Ld, WriteRMW], (instregex "VCVTPS2PH(Y?)mr")>;
+
+// VCVTPH2PS.
+// v,x.
+def : InstRW<[WriteP1_P5_Lat4], (instregex "VCVTPH2PS(Y?)rr")>;
+
+//-- Arithmetic instructions --//
+
+// HADD, HSUB PS/PD
+// x,x / v,v,v.
+def WriteHADDSUBPr : SchedWriteRes<[HWPort1, HWPort5]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1, 2];
+}
+def : InstRW<[WriteHADDSUBPr], (instregex "(V?)H(ADD|SUB)P(S|D)(Y?)rr")>;
+
+// x,m / v,v,m.
+def WriteHADDSUBPm : SchedWriteRes<[HWPort1, HWPort5, HWPort23]> {
+ let Latency = 9;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1, 2, 1];
+}
+def : InstRW<[WriteHADDSUBPm], (instregex "(V?)H(ADD|SUB)P(S|D)(Y?)rm")>;
+
+// MUL SS/SD PS/PD.
+// x,x / v,v,v.
+def WriteMULr : SchedWriteRes<[HWPort01]> {
+ let Latency = 5;
+}
+def : InstRW<[WriteMULr], (instregex "(V?)MUL(P|S)(S|D)rr")>;
+
+// x,m / v,v,m.
+def WriteMULm : SchedWriteRes<[HWPort01, HWPort23]> {
+ let Latency = 9;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1, 1];
+}
+def : InstRW<[WriteMULm], (instregex "(V?)MUL(P|S)(S|D)rm")>;
+
+// VDIVPS.
+// y,y,y.
+def WriteVDIVPSYrr : SchedWriteRes<[HWPort0, HWPort15]> {
+ let Latency = 19; // 18-21 cycles.
+ let NumMicroOps = 3;
+ let ResourceCycles = [2, 1];
+}
+def : InstRW<[WriteVDIVPSYrr], (instregex "VDIVPSYrr")>;
+
+// y,y,m256.
+def WriteVDIVPSYrm : SchedWriteRes<[HWPort0, HWPort15, HWPort23]> {
+ let Latency = 23; // 18-21 + 4 cycles.
+ let NumMicroOps = 4;
+ let ResourceCycles = [2, 1, 1];
+}
+def : InstRW<[WriteVDIVPSYrm, ReadAfterLd], (instregex "VDIVPSYrm")>;
+
+// VDIVPD.
+// y,y,y.
+def WriteVDIVPDYrr : SchedWriteRes<[HWPort0, HWPort15]> {
+ let Latency = 27; // 19-35 cycles.
+ let NumMicroOps = 3;
+ let ResourceCycles = [2, 1];
+}
+def : InstRW<[WriteVDIVPDYrr], (instregex "VDIVPDYrr")>;
+
+// y,y,m256.
+def WriteVDIVPDYrm : SchedWriteRes<[HWPort0, HWPort15, HWPort23]> {
+ let Latency = 31; // 19-35 + 4 cycles.
+ let NumMicroOps = 4;
+ let ResourceCycles = [2, 1, 1];
+}
+def : InstRW<[WriteVDIVPDYrm, ReadAfterLd], (instregex "VDIVPDYrm")>;
+
+// VRCPPS.
+// y,y.
+def WriteVRCPPSr : SchedWriteRes<[HWPort0, HWPort15]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2, 1];
+}
+def : InstRW<[WriteVRCPPSr], (instregex "VRCPPSYr(_Int)?")>;
+
+// y,m256.
+def WriteVRCPPSm : SchedWriteRes<[HWPort0, HWPort15, HWPort23]> {
+ let Latency = 11;
+ let NumMicroOps = 4;
+ let ResourceCycles = [2, 1, 1];
+}
+def : InstRW<[WriteVRCPPSm], (instregex "VRCPPSYm(_Int)?")>;
+
+// ROUND SS/SD PS/PD.
+// v,v,i.
+def WriteROUNDr : SchedWriteRes<[HWPort1]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+def : InstRW<[WriteROUNDr], (instregex "(V?)ROUND(Y?)(S|P)(S|D)r(_Int)?")>;
+
+// v,m,i.
+def WriteROUNDm : SchedWriteRes<[HWPort1, HWPort23]> {
+ let Latency = 10;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2, 1];
+}
+def : InstRW<[WriteROUNDm], (instregex "(V?)ROUND(Y?)(S|P)(S|D)m(_Int)?")>;
+
+// DPPS.
+// x,x,i / v,v,v,i.
+def WriteDPPSr : SchedWriteRes<[HWPort0, HWPort1, HWPort5]> {
+ let Latency = 14;
+ let NumMicroOps = 4;
+ let ResourceCycles = [2, 1, 1];
+}
+def : InstRW<[WriteDPPSr], (instregex "(V?)DPPS(Y?)rri")>;
+
+// x,m,i / v,v,m,i.
+def WriteDPPSm : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort23, HWPort6]> {
+ let Latency = 18;
+ let NumMicroOps = 6;
+ let ResourceCycles = [2, 1, 1, 1, 1];
+}
+def : InstRW<[WriteDPPSm, ReadAfterLd], (instregex "(V?)DPPS(Y?)rmi")>;
+
+// DPPD.
+// x,x,i.
+def WriteDPPDr : SchedWriteRes<[HWPort0, HWPort1, HWPort5]> {
+ let Latency = 9;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1, 1, 1];
+}
+def : InstRW<[WriteDPPDr], (instregex "(V?)DPPDrri")>;
+
+// x,m,i.
+def WriteDPPDm : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort23]> {
+ let Latency = 13;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1, 1, 1, 1];
+}
+def : InstRW<[WriteDPPDm], (instregex "(V?)DPPDrmi")>;
+
+// VFMADD.
+// v,v,v.
+def WriteFMADDr : SchedWriteRes<[HWPort01]> {
+ let Latency = 5;
+ let NumMicroOps = 1;
+}
+def : InstRW<[WriteFMADDr],
+ (instregex
+ // 3p forms.
+ "VF(N?)M(ADD|SUB|ADDSUB|SUBADD)P(S|D)(r213|r132|r231)r(Y)?",
+ // 3s forms.
+ "VF(N?)M(ADD|SUB)S(S|D)(r132|r231|r213)r",
+ // 4s/4s_int forms.
+ "VF(N?)M(ADD|SUB)S(S|D)4rr(_REV|_Int)?",
+ // 4p forms.
+ "VF(N?)M(ADD|SUB)P(S|D)4rr(Y)?(_REV)?")>;
+
+// v,v,m.
+def WriteFMADDm : SchedWriteRes<[HWPort01, HWPort23]> {
+ let Latency = 9;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1, 1];
+}
+def : InstRW<[WriteFMADDm],
+ (instregex
+ // 3p forms.
+ "VF(N?)M(ADD|SUB|ADDSUB|SUBADD)P(S|D)(r213|r132|r231)m(Y)?",
+ // 3s forms.
+ "VF(N?)M(ADD|SUB)S(S|D)(r132|r231|r213)m",
+ // 4s/4s_int forms.
+ "VF(N?)M(ADD|SUB)S(S|D)4(rm|mr)(_Int)?",
+ // 4p forms.
+ "VF(N?)M(ADD|SUB)P(S|D)4(rm|mr)(Y)?")>;
+
+//-- Math instructions --//
+
+// VSQRTPS.
+// y,y.
+def WriteVSQRTPSYr : SchedWriteRes<[HWPort0, HWPort15]> {
+ let Latency = 19;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2, 1];
+}
+def : InstRW<[WriteVSQRTPSYr], (instregex "VSQRTPSYr")>;
+
+// y,m256.
+def WriteVSQRTPSYm : SchedWriteRes<[HWPort0, HWPort15, HWPort23]> {
+ let Latency = 23;
+ let NumMicroOps = 4;
+ let ResourceCycles = [2, 1, 1];
+}
+def : InstRW<[WriteVSQRTPSYm], (instregex "VSQRTPSYm")>;
+
+// VSQRTPD.
+// y,y.
+def WriteVSQRTPDYr : SchedWriteRes<[HWPort0, HWPort15]> {
+ let Latency = 28;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2, 1];
+}
+def : InstRW<[WriteVSQRTPDYr], (instregex "VSQRTPDYr")>;
+
+// y,m256.
+def WriteVSQRTPDYm : SchedWriteRes<[HWPort0, HWPort15, HWPort23]> {
+ let Latency = 32;
+ let NumMicroOps = 4;
+ let ResourceCycles = [2, 1, 1];
+}
+def : InstRW<[WriteVSQRTPDYm], (instregex "VSQRTPDYm")>;
+
+// RSQRT SS/PS.
+// x,x.
+def WriteRSQRTr : SchedWriteRes<[HWPort0]> {
+ let Latency = 5;
+}
+def : InstRW<[WriteRSQRTr], (instregex "(V?)RSQRT(SS|PS)r(_Int)?")>;
+
+// x,m128.
+def WriteRSQRTm : SchedWriteRes<[HWPort0, HWPort23]> {
+ let Latency = 9;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1, 1];
+}
+def : InstRW<[WriteRSQRTm], (instregex "(V?)RSQRT(SS|PS)m(_Int)?")>;
+
+// RSQRTPS 256.
+// y,y.
+def WriteRSQRTPSYr : SchedWriteRes<[HWPort0, HWPort15]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2, 1];
+}
+def : InstRW<[WriteRSQRTPSYr], (instregex "VRSQRTPSYr(_Int)?")>;
+
+// y,m256.
+def WriteRSQRTPSYm : SchedWriteRes<[HWPort0, HWPort15, HWPort23]> {
+ let Latency = 11;
+ let NumMicroOps = 4;
+ let ResourceCycles = [2, 1, 1];
+}
+def : InstRW<[WriteRSQRTPSYm], (instregex "VRSQRTPSYm(_Int)?")>;
+
+//-- Logic instructions --//
+
+// AND, ANDN, OR, XOR PS/PD.
+// x,x / v,v,v.
+def : InstRW<[WriteP5], (instregex "(V?)(AND|ANDN|OR|XOR)P(S|D)(Y?)rr")>;
+// x,m / v,v,m.
+def : InstRW<[WriteP5Ld, ReadAfterLd],
+ (instregex "(V?)(AND|ANDN|OR|XOR)P(S|D)(Y?)rm")>;
+
+//-- Other instructions --//
+
+// VZEROUPPER.
+def WriteVZEROUPPER : SchedWriteRes<[]> {
+ let NumMicroOps = 4;
+}
+def : InstRW<[WriteVZEROUPPER], (instregex "VZEROUPPER")>;
+
+// VZEROALL.
+def WriteVZEROALL : SchedWriteRes<[]> {
+ let NumMicroOps = 12;
+}
+def : InstRW<[WriteVZEROALL], (instregex "VZEROALL")>;
+
+// LDMXCSR.
+def WriteLDMXCSR : SchedWriteRes<[HWPort0, HWPort6, HWPort23]> {
+ let Latency = 6;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1, 1, 1];
+}
+def : InstRW<[WriteLDMXCSR], (instregex "(V)?LDMXCSR")>;
+
+// STMXCSR.
+def WriteSTMXCSR : SchedWriteRes<[HWPort0, HWPort4, HWPort6, HWPort237]> {
+ let Latency = 7;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1, 1, 1, 1];
+}
+def : InstRW<[WriteSTMXCSR], (instregex "(V)?STMXCSR")>;
+
+} // SchedModel
diff --git a/contrib/llvm/lib/Target/X86/X86SchedSandyBridge.td b/contrib/llvm/lib/Target/X86/X86SchedSandyBridge.td
new file mode 100644
index 0000000..eca65c2
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86SchedSandyBridge.td
@@ -0,0 +1,250 @@
+//=- X86SchedSandyBridge.td - X86 Sandy Bridge Scheduling ----*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for Sandy Bridge to support instruction
+// scheduling and other instruction cost heuristics.
+//
+//===----------------------------------------------------------------------===//
+
+def SandyBridgeModel : SchedMachineModel {
+ // All x86 instructions are modeled as a single micro-op, and SB can decode 4
+ // instructions per cycle.
+ // FIXME: Identify instructions that aren't a single fused micro-op.
+ let IssueWidth = 4;
+ let MicroOpBufferSize = 168; // Based on the reorder buffer.
+ let LoadLatency = 4;
+ let MispredictPenalty = 16;
+
+ // Based on the LSD (loop-stream detector) queue size.
+ let LoopMicroOpBufferSize = 28;
+
+ // FIXME: SSE4 and AVX are unimplemented. This flag is set to allow
+ // the scheduler to assign a default model to unrecognized opcodes.
+ let CompleteModel = 0;
+}
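+
+// For illustration, a machine model like this one is attached to a CPU
+// elsewhere (in X86.td) along these lines; the feature list is elided here:
+//
+//   def : ProcessorModel<"sandybridge", SandyBridgeModel,
+//                        [/* subtarget feature list */]>;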
+
+let SchedModel = SandyBridgeModel in {
+
+// Sandy Bridge can issue micro-ops to 6 different ports in one cycle.
+
+// Ports 0, 1, and 5 handle all computation.
+def SBPort0 : ProcResource<1>;
+def SBPort1 : ProcResource<1>;
+def SBPort5 : ProcResource<1>;
+
+// Ports 2 and 3 are identical. They handle loads and the address half of
+// stores.
+def SBPort23 : ProcResource<2>;
+
+// Port 4 gets the data half of stores. Store data can be available later than
+// the store address, but since we don't model the latency of stores, we can
+// ignore that.
+def SBPort4 : ProcResource<1>;
+
+// Many micro-ops are capable of issuing on multiple ports.
+def SBPort05 : ProcResGroup<[SBPort0, SBPort5]>;
+def SBPort15 : ProcResGroup<[SBPort1, SBPort5]>;
+def SBPort015 : ProcResGroup<[SBPort0, SBPort1, SBPort5]>;
+
+// 54 Entry Unified Scheduler
+def SBPortAny : ProcResGroup<[SBPort0, SBPort1, SBPort23, SBPort4, SBPort5]> {
+ let BufferSize = 54;
+}
+
+// Integer division issued on port 0.
+def SBDivider : ProcResource<1>;
+
+// Loads are 4 cycles, so ReadAfterLd registers needn't be available until 4
+// cycles after the memory operand.
+def : ReadAdvance<ReadAfterLd, 4>;
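+// In effect, a value feeding a ReadAfterLd operand may arrive up to 4 cycles
+// after the consuming instruction issues without delaying it, since the load
+// occupies those cycles anyway.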
+
+// Many SchedWrites are defined in pairs with and without a folded load.
+// Instructions with folded loads are usually micro-fused, so they only appear
+// as two micro-ops when queued in the reservation station.
+// This multiclass defines the resource usage for variants with and without
+// folded loads.
+multiclass SBWriteResPair<X86FoldableSchedWrite SchedRW,
+ ProcResourceKind ExePort,
+ int Lat> {
+ // Register variant is using a single cycle on ExePort.
+ def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; }
+
+ // Memory variant also uses a cycle on port 2/3 and adds 4 cycles to the
+ // latency.
+ def : WriteRes<SchedRW.Folded, [SBPort23, ExePort]> {
+ let Latency = !add(Lat, 4);
+ }
+}
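+
+// For example, "defm : SBWriteResPair<WriteFAdd, SBPort1, 3>;" expands to:
+//
+//   def : WriteRes<WriteFAdd,   [SBPort1]>           { let Latency = 3; }
+//   def : WriteRes<WriteFAddLd, [SBPort23, SBPort1]> { let Latency = 7; }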
+
+// A folded store needs a cycle on port 4 for the store data, but it does not
+// need an extra port 2/3 cycle to recompute the address.
+def : WriteRes<WriteRMW, [SBPort4]>;
+
+def : WriteRes<WriteStore, [SBPort23, SBPort4]>;
+def : WriteRes<WriteLoad, [SBPort23]> { let Latency = 4; }
+def : WriteRes<WriteMove, [SBPort015]>;
+def : WriteRes<WriteZero, []>;
+
+defm : SBWriteResPair<WriteALU, SBPort015, 1>;
+defm : SBWriteResPair<WriteIMul, SBPort1, 3>;
+def : WriteRes<WriteIMulH, []> { let Latency = 3; }
+defm : SBWriteResPair<WriteShift, SBPort05, 1>;
+defm : SBWriteResPair<WriteJump, SBPort5, 1>;
+
+// This is for simple LEAs with one or two input operands.
+// The complex ones can only execute on port 1, and they require two cycles on
+// the port to read all inputs. We don't model that.
+def : WriteRes<WriteLEA, [SBPort15]>;
+
+// This is quite rough, latency depends on the dividend.
+def : WriteRes<WriteIDiv, [SBPort0, SBDivider]> {
+ let Latency = 25;
+ let ResourceCycles = [1, 10];
+}
+def : WriteRes<WriteIDivLd, [SBPort23, SBPort0, SBDivider]> {
+ let Latency = 29;
+ let ResourceCycles = [1, 1, 10];
+}
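+
+// Note that ResourceCycles = [1, 10] above means one cycle on the issue port
+// and ten busy cycles on SBDivider, so back-to-back divisions are throughput
+// limited by the divider even though they can issue a cycle apart.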
+
+// Scalar and vector floating point.
+defm : SBWriteResPair<WriteFAdd, SBPort1, 3>;
+defm : SBWriteResPair<WriteFMul, SBPort0, 5>;
+defm : SBWriteResPair<WriteFDiv, SBPort0, 12>; // 10-14 cycles.
+defm : SBWriteResPair<WriteFRcp, SBPort0, 5>;
+defm : SBWriteResPair<WriteFRsqrt, SBPort0, 5>;
+defm : SBWriteResPair<WriteFSqrt, SBPort0, 15>;
+defm : SBWriteResPair<WriteCvtF2I, SBPort1, 3>;
+defm : SBWriteResPair<WriteCvtI2F, SBPort1, 4>;
+defm : SBWriteResPair<WriteCvtF2F, SBPort1, 3>;
+defm : SBWriteResPair<WriteFShuffle, SBPort5, 1>;
+defm : SBWriteResPair<WriteFBlend, SBPort05, 1>;
+def : WriteRes<WriteFVarBlend, [SBPort0, SBPort5]> {
+ let Latency = 2;
+ let ResourceCycles = [1, 1];
+}
+def : WriteRes<WriteFVarBlendLd, [SBPort0, SBPort5, SBPort23]> {
+ let Latency = 6;
+ let ResourceCycles = [1, 1, 1];
+}
+
+// Vector integer operations.
+defm : SBWriteResPair<WriteVecShift, SBPort05, 1>;
+defm : SBWriteResPair<WriteVecLogic, SBPort015, 1>;
+defm : SBWriteResPair<WriteVecALU, SBPort15, 1>;
+defm : SBWriteResPair<WriteVecIMul, SBPort0, 5>;
+defm : SBWriteResPair<WriteShuffle, SBPort15, 1>;
+defm : SBWriteResPair<WriteBlend, SBPort15, 1>;
+def : WriteRes<WriteVarBlend, [SBPort1, SBPort5]> {
+ let Latency = 2;
+ let ResourceCycles = [1, 1];
+}
+def : WriteRes<WriteVarBlendLd, [SBPort1, SBPort5, SBPort23]> {
+ let Latency = 6;
+ let ResourceCycles = [1, 1, 1];
+}
+def : WriteRes<WriteMPSAD, [SBPort0, SBPort1, SBPort5]> {
+ let Latency = 6;
+ let ResourceCycles = [1, 1, 1];
+}
+def : WriteRes<WriteMPSADLd, [SBPort0, SBPort1, SBPort5, SBPort23]> {
+ let Latency = 6;
+ let ResourceCycles = [1, 1, 1, 1];
+}
+
+// String instructions.
+// Packed Compare Implicit Length Strings, Return Mask
+def : WriteRes<WritePCmpIStrM, [SBPort015]> {
+ let Latency = 11;
+ let ResourceCycles = [3];
+}
+def : WriteRes<WritePCmpIStrMLd, [SBPort015, SBPort23]> {
+ let Latency = 11;
+ let ResourceCycles = [3, 1];
+}
+
+// Packed Compare Explicit Length Strings, Return Mask
+def : WriteRes<WritePCmpEStrM, [SBPort015]> {
+ let Latency = 11;
+ let ResourceCycles = [8];
+}
+def : WriteRes<WritePCmpEStrMLd, [SBPort015, SBPort23]> {
+ let Latency = 11;
+ let ResourceCycles = [7, 1];
+}
+
+// Packed Compare Implicit Length Strings, Return Index
+def : WriteRes<WritePCmpIStrI, [SBPort015]> {
+ let Latency = 3;
+ let ResourceCycles = [3];
+}
+def : WriteRes<WritePCmpIStrILd, [SBPort015, SBPort23]> {
+ let Latency = 3;
+ let ResourceCycles = [3, 1];
+}
+
+// Packed Compare Explicit Length Strings, Return Index
+def : WriteRes<WritePCmpEStrI, [SBPort015]> {
+ let Latency = 4;
+ let ResourceCycles = [8];
+}
+def : WriteRes<WritePCmpEStrILd, [SBPort015, SBPort23]> {
+ let Latency = 4;
+ let ResourceCycles = [7, 1];
+}
+
+// AES Instructions.
+def : WriteRes<WriteAESDecEnc, [SBPort015]> {
+ let Latency = 8;
+ let ResourceCycles = [2];
+}
+def : WriteRes<WriteAESDecEncLd, [SBPort015, SBPort23]> {
+ let Latency = 8;
+ let ResourceCycles = [2, 1];
+}
+
+def : WriteRes<WriteAESIMC, [SBPort015]> {
+ let Latency = 8;
+ let ResourceCycles = [2];
+}
+def : WriteRes<WriteAESIMCLd, [SBPort015, SBPort23]> {
+ let Latency = 8;
+ let ResourceCycles = [2, 1];
+}
+
+def : WriteRes<WriteAESKeyGen, [SBPort015]> {
+ let Latency = 8;
+ let ResourceCycles = [11];
+}
+def : WriteRes<WriteAESKeyGenLd, [SBPort015, SBPort23]> {
+ let Latency = 8;
+ let ResourceCycles = [10, 1];
+}
+
+// Carry-less multiplication instructions.
+def : WriteRes<WriteCLMul, [SBPort015]> {
+ let Latency = 14;
+ let ResourceCycles = [18];
+}
+def : WriteRes<WriteCLMulLd, [SBPort015, SBPort23]> {
+ let Latency = 14;
+ let ResourceCycles = [17, 1];
+}
+
+def : WriteRes<WriteSystem, [SBPort015]> { let Latency = 100; }
+def : WriteRes<WriteMicrocoded, [SBPort015]> { let Latency = 100; }
+def : WriteRes<WriteFence, [SBPort23, SBPort4]>;
+def : WriteRes<WriteNop, []>;
+
+// AVX2 is not supported on this architecture, but we should define the basic
+// scheduling resources anyway.
+defm : SBWriteResPair<WriteFShuffle256, SBPort0, 1>;
+defm : SBWriteResPair<WriteShuffle256, SBPort0, 1>;
+defm : SBWriteResPair<WriteVarVecShift, SBPort0, 1>;
+} // SchedModel
diff --git a/contrib/llvm/lib/Target/X86/X86Schedule.td b/contrib/llvm/lib/Target/X86/X86Schedule.td
new file mode 100644
index 0000000..a261356
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86Schedule.td
@@ -0,0 +1,650 @@
+//===-- X86Schedule.td - X86 Scheduling Definitions --------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// InstrSchedModel annotations for out-of-order CPUs.
+//
+// These annotations are independent of the itinerary classes defined below.
+
+// Instructions with folded loads need to read the memory operand immediately,
+// but other register operands don't have to be read until the load is ready.
+// These operands are marked with ReadAfterLd.
+def ReadAfterLd : SchedRead;
+
+// Instructions with both a load and a store folded are modeled as a folded
+// load + WriteRMW.
+def WriteRMW : SchedWrite;
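+
+// For example, a load-op form such as "addl (%rdi), %eax" can be scheduled as
+// [WriteALULd, ReadAfterLd], while a load-op-store form such as
+// "addl %eax, (%rdi)" is modeled as the folded load plus WriteRMW for the
+// store half.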
+
+// Most instructions can fold loads, so almost every SchedWrite comes in two
+// variants: With and without a folded load.
+// An X86FoldableSchedWrite holds a reference to the corresponding SchedWrite
+// with a folded load.
+class X86FoldableSchedWrite : SchedWrite {
+ // The SchedWrite to use when a load is folded into the instruction.
+ SchedWrite Folded;
+}
+
+// Multiclass that produces a linked pair of SchedWrites.
+multiclass X86SchedWritePair {
+ // Register-Memory operation.
+ def Ld : SchedWrite;
+ // Register-Register operation.
+ def NAME : X86FoldableSchedWrite {
+ let Folded = !cast<SchedWrite>(NAME#"Ld");
+ }
+}
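+
+// For example, "defm WriteALU : X86SchedWritePair;" below creates WriteALU
+// and WriteALULd, with WriteALU.Folded pointing at WriteALULd, so a processor
+// model can define resources for both variants together (see SBWriteResPair).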
+
+// Arithmetic.
+defm WriteALU : X86SchedWritePair; // Simple integer ALU op.
+defm WriteIMul : X86SchedWritePair; // Integer multiplication.
+def WriteIMulH : SchedWrite; // Integer multiplication, high part.
+defm WriteIDiv : X86SchedWritePair; // Integer division.
+def WriteLEA : SchedWrite; // LEA instructions can't fold loads.
+
+// Integer shifts and rotates.
+defm WriteShift : X86SchedWritePair;
+
+// Loads, stores, and moves, not folded with other operations.
+def WriteLoad : SchedWrite;
+def WriteStore : SchedWrite;
+def WriteMove : SchedWrite;
+
+// Idioms that clear a register, like xorps %xmm0, %xmm0.
+// These can often bypass execution ports completely.
+def WriteZero : SchedWrite;
+
+// Branches don't produce values, so they have no latency, but they still
+// consume resources. Indirect branches can fold loads.
+defm WriteJump : X86SchedWritePair;
+
+// Floating point. This covers both scalar and vector operations.
+defm WriteFAdd : X86SchedWritePair; // Floating point add/sub/compare.
+defm WriteFMul : X86SchedWritePair; // Floating point multiplication.
+defm WriteFDiv : X86SchedWritePair; // Floating point division.
+defm WriteFSqrt : X86SchedWritePair; // Floating point square root.
+defm WriteFRcp : X86SchedWritePair; // Floating point reciprocal estimate.
+defm WriteFRsqrt : X86SchedWritePair; // Floating point reciprocal square root estimate.
+defm WriteFMA : X86SchedWritePair; // Fused Multiply Add.
+defm WriteFShuffle : X86SchedWritePair; // Floating point vector shuffles.
+defm WriteFBlend : X86SchedWritePair; // Floating point vector blends.
+defm WriteFVarBlend : X86SchedWritePair; // Fp vector variable blends.
+
+// FMA Scheduling helper class.
+class FMASC { X86FoldableSchedWrite Sched = WriteFAdd; }
+
+// Vector integer operations.
+defm WriteVecALU : X86SchedWritePair; // Vector integer ALU op, no logicals.
+defm WriteVecShift : X86SchedWritePair; // Vector integer shifts.
+defm WriteVecIMul : X86SchedWritePair; // Vector integer multiply.
+defm WriteShuffle : X86SchedWritePair; // Vector shuffles.
+defm WriteBlend : X86SchedWritePair; // Vector blends.
+defm WriteVarBlend : X86SchedWritePair; // Vector variable blends.
+defm WriteMPSAD : X86SchedWritePair; // Vector MPSAD.
+
+// Vector bitwise operations.
+// These are often used on both floating point and integer vectors.
+defm WriteVecLogic : X86SchedWritePair; // Vector and/or/xor.
+
+// Conversion between integer and float.
+defm WriteCvtF2I : X86SchedWritePair; // Float -> Integer.
+defm WriteCvtI2F : X86SchedWritePair; // Integer -> Float.
+defm WriteCvtF2F : X86SchedWritePair; // Float -> Float size conversion.
+
+// String instructions.
+// Packed Compare Implicit Length Strings, Return Mask
+defm WritePCmpIStrM : X86SchedWritePair;
+// Packed Compare Explicit Length Strings, Return Mask
+defm WritePCmpEStrM : X86SchedWritePair;
+// Packed Compare Implicit Length Strings, Return Index
+defm WritePCmpIStrI : X86SchedWritePair;
+// Packed Compare Explicit Length Strings, Return Index
+defm WritePCmpEStrI : X86SchedWritePair;
+
+// AES instructions.
+defm WriteAESDecEnc : X86SchedWritePair; // Decryption, encryption.
+defm WriteAESIMC : X86SchedWritePair; // InvMixColumn.
+defm WriteAESKeyGen : X86SchedWritePair; // Key Generation.
+
+// Carry-less multiplication instructions.
+defm WriteCLMul : X86SchedWritePair;
+
+// Catch-all for expensive system instructions.
+def WriteSystem : SchedWrite;
+
+// AVX2.
+defm WriteFShuffle256 : X86SchedWritePair; // Fp 256-bit width vector shuffles.
+defm WriteShuffle256 : X86SchedWritePair; // 256-bit width vector shuffles.
+defm WriteVarVecShift : X86SchedWritePair; // Variable vector shifts.
+
+// Old microcoded instructions that nobody uses.
+def WriteMicrocoded : SchedWrite;
+
+// Fence instructions.
+def WriteFence : SchedWrite;
+
+// Nop, not very useful except that it provides a model for nops!
+def WriteNop : SchedWrite;
+
+//===----------------------------------------------------------------------===//
+// Instruction Itinerary classes used for X86
+def IIC_ALU_MEM : InstrItinClass;
+def IIC_ALU_NONMEM : InstrItinClass;
+def IIC_LEA : InstrItinClass;
+def IIC_LEA_16 : InstrItinClass;
+def IIC_MUL8 : InstrItinClass;
+def IIC_MUL16_MEM : InstrItinClass;
+def IIC_MUL16_REG : InstrItinClass;
+def IIC_MUL32_MEM : InstrItinClass;
+def IIC_MUL32_REG : InstrItinClass;
+def IIC_MUL64 : InstrItinClass;
+// imul by al, ax, eax, rax
+def IIC_IMUL8 : InstrItinClass;
+def IIC_IMUL16_MEM : InstrItinClass;
+def IIC_IMUL16_REG : InstrItinClass;
+def IIC_IMUL32_MEM : InstrItinClass;
+def IIC_IMUL32_REG : InstrItinClass;
+def IIC_IMUL64 : InstrItinClass;
+// imul reg by reg|mem
+def IIC_IMUL16_RM : InstrItinClass;
+def IIC_IMUL16_RR : InstrItinClass;
+def IIC_IMUL32_RM : InstrItinClass;
+def IIC_IMUL32_RR : InstrItinClass;
+def IIC_IMUL64_RM : InstrItinClass;
+def IIC_IMUL64_RR : InstrItinClass;
+// imul reg = reg/mem * imm
+def IIC_IMUL16_RMI : InstrItinClass;
+def IIC_IMUL16_RRI : InstrItinClass;
+def IIC_IMUL32_RMI : InstrItinClass;
+def IIC_IMUL32_RRI : InstrItinClass;
+def IIC_IMUL64_RMI : InstrItinClass;
+def IIC_IMUL64_RRI : InstrItinClass;
+// div
+def IIC_DIV8_MEM : InstrItinClass;
+def IIC_DIV8_REG : InstrItinClass;
+def IIC_DIV16 : InstrItinClass;
+def IIC_DIV32 : InstrItinClass;
+def IIC_DIV64 : InstrItinClass;
+// idiv
+def IIC_IDIV8 : InstrItinClass;
+def IIC_IDIV16 : InstrItinClass;
+def IIC_IDIV32 : InstrItinClass;
+def IIC_IDIV64 : InstrItinClass;
+// neg/not/inc/dec
+def IIC_UNARY_REG : InstrItinClass;
+def IIC_UNARY_MEM : InstrItinClass;
+// add/sub/and/or/xor/sbc/cmp/test
+def IIC_BIN_MEM : InstrItinClass;
+def IIC_BIN_NONMEM : InstrItinClass;
+// adc/sbc
+def IIC_BIN_CARRY_MEM : InstrItinClass;
+def IIC_BIN_CARRY_NONMEM : InstrItinClass;
+// shift/rotate
+def IIC_SR : InstrItinClass;
+// shift double
+def IIC_SHD16_REG_IM : InstrItinClass;
+def IIC_SHD16_REG_CL : InstrItinClass;
+def IIC_SHD16_MEM_IM : InstrItinClass;
+def IIC_SHD16_MEM_CL : InstrItinClass;
+def IIC_SHD32_REG_IM : InstrItinClass;
+def IIC_SHD32_REG_CL : InstrItinClass;
+def IIC_SHD32_MEM_IM : InstrItinClass;
+def IIC_SHD32_MEM_CL : InstrItinClass;
+def IIC_SHD64_REG_IM : InstrItinClass;
+def IIC_SHD64_REG_CL : InstrItinClass;
+def IIC_SHD64_MEM_IM : InstrItinClass;
+def IIC_SHD64_MEM_CL : InstrItinClass;
+// cmov
+def IIC_CMOV16_RM : InstrItinClass;
+def IIC_CMOV16_RR : InstrItinClass;
+def IIC_CMOV32_RM : InstrItinClass;
+def IIC_CMOV32_RR : InstrItinClass;
+def IIC_CMOV64_RM : InstrItinClass;
+def IIC_CMOV64_RR : InstrItinClass;
+// set
+def IIC_SET_R : InstrItinClass;
+def IIC_SET_M : InstrItinClass;
+// jmp/jcc/jcxz
+def IIC_Jcc : InstrItinClass;
+def IIC_JCXZ : InstrItinClass;
+def IIC_JMP_REL : InstrItinClass;
+def IIC_JMP_REG : InstrItinClass;
+def IIC_JMP_MEM : InstrItinClass;
+def IIC_JMP_FAR_MEM : InstrItinClass;
+def IIC_JMP_FAR_PTR : InstrItinClass;
+// loop
+def IIC_LOOP : InstrItinClass;
+def IIC_LOOPE : InstrItinClass;
+def IIC_LOOPNE : InstrItinClass;
+// call
+def IIC_CALL_RI : InstrItinClass;
+def IIC_CALL_MEM : InstrItinClass;
+def IIC_CALL_FAR_MEM : InstrItinClass;
+def IIC_CALL_FAR_PTR : InstrItinClass;
+// ret
+def IIC_RET : InstrItinClass;
+def IIC_RET_IMM : InstrItinClass;
+// sign extension movs
+def IIC_MOVSX : InstrItinClass;
+def IIC_MOVSX_R16_R8 : InstrItinClass;
+def IIC_MOVSX_R16_M8 : InstrItinClass;
+def IIC_MOVSX_R16_R16 : InstrItinClass;
+def IIC_MOVSX_R32_R32 : InstrItinClass;
+// zero extension movs
+def IIC_MOVZX : InstrItinClass;
+def IIC_MOVZX_R16_R8 : InstrItinClass;
+def IIC_MOVZX_R16_M8 : InstrItinClass;
+
+def IIC_REP_MOVS : InstrItinClass;
+def IIC_REP_STOS : InstrItinClass;
+
+// SSE scalar/parallel binary operations
+def IIC_SSE_ALU_F32S_RR : InstrItinClass;
+def IIC_SSE_ALU_F32S_RM : InstrItinClass;
+def IIC_SSE_ALU_F64S_RR : InstrItinClass;
+def IIC_SSE_ALU_F64S_RM : InstrItinClass;
+def IIC_SSE_MUL_F32S_RR : InstrItinClass;
+def IIC_SSE_MUL_F32S_RM : InstrItinClass;
+def IIC_SSE_MUL_F64S_RR : InstrItinClass;
+def IIC_SSE_MUL_F64S_RM : InstrItinClass;
+def IIC_SSE_DIV_F32S_RR : InstrItinClass;
+def IIC_SSE_DIV_F32S_RM : InstrItinClass;
+def IIC_SSE_DIV_F64S_RR : InstrItinClass;
+def IIC_SSE_DIV_F64S_RM : InstrItinClass;
+def IIC_SSE_ALU_F32P_RR : InstrItinClass;
+def IIC_SSE_ALU_F32P_RM : InstrItinClass;
+def IIC_SSE_ALU_F64P_RR : InstrItinClass;
+def IIC_SSE_ALU_F64P_RM : InstrItinClass;
+def IIC_SSE_MUL_F32P_RR : InstrItinClass;
+def IIC_SSE_MUL_F32P_RM : InstrItinClass;
+def IIC_SSE_MUL_F64P_RR : InstrItinClass;
+def IIC_SSE_MUL_F64P_RM : InstrItinClass;
+def IIC_SSE_DIV_F32P_RR : InstrItinClass;
+def IIC_SSE_DIV_F32P_RM : InstrItinClass;
+def IIC_SSE_DIV_F64P_RR : InstrItinClass;
+def IIC_SSE_DIV_F64P_RM : InstrItinClass;
+
+def IIC_SSE_COMIS_RR : InstrItinClass;
+def IIC_SSE_COMIS_RM : InstrItinClass;
+
+def IIC_SSE_HADDSUB_RR : InstrItinClass;
+def IIC_SSE_HADDSUB_RM : InstrItinClass;
+
+def IIC_SSE_BIT_P_RR : InstrItinClass;
+def IIC_SSE_BIT_P_RM : InstrItinClass;
+
+def IIC_SSE_INTALU_P_RR : InstrItinClass;
+def IIC_SSE_INTALU_P_RM : InstrItinClass;
+def IIC_SSE_INTALUQ_P_RR : InstrItinClass;
+def IIC_SSE_INTALUQ_P_RM : InstrItinClass;
+
+def IIC_SSE_INTMUL_P_RR : InstrItinClass;
+def IIC_SSE_INTMUL_P_RM : InstrItinClass;
+
+def IIC_SSE_INTSH_P_RR : InstrItinClass;
+def IIC_SSE_INTSH_P_RM : InstrItinClass;
+def IIC_SSE_INTSH_P_RI : InstrItinClass;
+
+def IIC_SSE_INTSHDQ_P_RI : InstrItinClass;
+
+def IIC_SSE_SHUFP : InstrItinClass;
+def IIC_SSE_PSHUF_RI : InstrItinClass;
+def IIC_SSE_PSHUF_MI : InstrItinClass;
+
+def IIC_SSE_UNPCK : InstrItinClass;
+
+def IIC_SSE_MOVMSK : InstrItinClass;
+def IIC_SSE_MASKMOV : InstrItinClass;
+
+def IIC_SSE_PEXTRW : InstrItinClass;
+def IIC_SSE_PINSRW : InstrItinClass;
+
+def IIC_SSE_PABS_RR : InstrItinClass;
+def IIC_SSE_PABS_RM : InstrItinClass;
+
+def IIC_SSE_SQRTPS_RR : InstrItinClass;
+def IIC_SSE_SQRTPS_RM : InstrItinClass;
+def IIC_SSE_SQRTSS_RR : InstrItinClass;
+def IIC_SSE_SQRTSS_RM : InstrItinClass;
+def IIC_SSE_SQRTPD_RR : InstrItinClass;
+def IIC_SSE_SQRTPD_RM : InstrItinClass;
+def IIC_SSE_SQRTSD_RR : InstrItinClass;
+def IIC_SSE_SQRTSD_RM : InstrItinClass;
+
+def IIC_SSE_RSQRTPS_RR : InstrItinClass;
+def IIC_SSE_RSQRTPS_RM : InstrItinClass;
+def IIC_SSE_RSQRTSS_RR : InstrItinClass;
+def IIC_SSE_RSQRTSS_RM : InstrItinClass;
+
+def IIC_SSE_RCPP_RR : InstrItinClass;
+def IIC_SSE_RCPP_RM : InstrItinClass;
+def IIC_SSE_RCPS_RR : InstrItinClass;
+def IIC_SSE_RCPS_RM : InstrItinClass;
+
+def IIC_SSE_MOV_S_RR : InstrItinClass;
+def IIC_SSE_MOV_S_RM : InstrItinClass;
+def IIC_SSE_MOV_S_MR : InstrItinClass;
+
+def IIC_SSE_MOVA_P_RR : InstrItinClass;
+def IIC_SSE_MOVA_P_RM : InstrItinClass;
+def IIC_SSE_MOVA_P_MR : InstrItinClass;
+
+def IIC_SSE_MOVU_P_RR : InstrItinClass;
+def IIC_SSE_MOVU_P_RM : InstrItinClass;
+def IIC_SSE_MOVU_P_MR : InstrItinClass;
+
+def IIC_SSE_MOVDQ : InstrItinClass;
+def IIC_SSE_MOVD_ToGP : InstrItinClass;
+def IIC_SSE_MOVQ_RR : InstrItinClass;
+
+def IIC_SSE_MOV_LH : InstrItinClass;
+
+def IIC_SSE_LDDQU : InstrItinClass;
+
+def IIC_SSE_MOVNT : InstrItinClass;
+
+def IIC_SSE_PHADDSUBD_RR : InstrItinClass;
+def IIC_SSE_PHADDSUBD_RM : InstrItinClass;
+def IIC_SSE_PHADDSUBSW_RR : InstrItinClass;
+def IIC_SSE_PHADDSUBSW_RM : InstrItinClass;
+def IIC_SSE_PHADDSUBW_RR : InstrItinClass;
+def IIC_SSE_PHADDSUBW_RM : InstrItinClass;
+def IIC_SSE_PSHUFB_RR : InstrItinClass;
+def IIC_SSE_PSHUFB_RM : InstrItinClass;
+def IIC_SSE_PSIGN_RR : InstrItinClass;
+def IIC_SSE_PSIGN_RM : InstrItinClass;
+
+def IIC_SSE_PMADD : InstrItinClass;
+def IIC_SSE_PMULHRSW : InstrItinClass;
+def IIC_SSE_PALIGNRR : InstrItinClass;
+def IIC_SSE_PALIGNRM : InstrItinClass;
+def IIC_SSE_MWAIT : InstrItinClass;
+def IIC_SSE_MONITOR : InstrItinClass;
+
+def IIC_SSE_PREFETCH : InstrItinClass;
+def IIC_SSE_PAUSE : InstrItinClass;
+def IIC_SSE_LFENCE : InstrItinClass;
+def IIC_SSE_MFENCE : InstrItinClass;
+def IIC_SSE_SFENCE : InstrItinClass;
+def IIC_SSE_LDMXCSR : InstrItinClass;
+def IIC_SSE_STMXCSR : InstrItinClass;
+
+def IIC_SSE_CVT_PD_RR : InstrItinClass;
+def IIC_SSE_CVT_PD_RM : InstrItinClass;
+def IIC_SSE_CVT_PS_RR : InstrItinClass;
+def IIC_SSE_CVT_PS_RM : InstrItinClass;
+def IIC_SSE_CVT_PI2PS_RR : InstrItinClass;
+def IIC_SSE_CVT_PI2PS_RM : InstrItinClass;
+def IIC_SSE_CVT_Scalar_RR : InstrItinClass;
+def IIC_SSE_CVT_Scalar_RM : InstrItinClass;
+def IIC_SSE_CVT_SS2SI32_RM : InstrItinClass;
+def IIC_SSE_CVT_SS2SI32_RR : InstrItinClass;
+def IIC_SSE_CVT_SS2SI64_RM : InstrItinClass;
+def IIC_SSE_CVT_SS2SI64_RR : InstrItinClass;
+def IIC_SSE_CVT_SD2SI_RM : InstrItinClass;
+def IIC_SSE_CVT_SD2SI_RR : InstrItinClass;
+
+// MMX
+def IIC_MMX_MOV_MM_RM : InstrItinClass;
+def IIC_MMX_MOV_REG_MM : InstrItinClass;
+def IIC_MMX_MOVQ_RM : InstrItinClass;
+def IIC_MMX_MOVQ_RR : InstrItinClass;
+
+def IIC_MMX_ALU_RM : InstrItinClass;
+def IIC_MMX_ALU_RR : InstrItinClass;
+def IIC_MMX_ALUQ_RM : InstrItinClass;
+def IIC_MMX_ALUQ_RR : InstrItinClass;
+def IIC_MMX_PHADDSUBW_RM : InstrItinClass;
+def IIC_MMX_PHADDSUBW_RR : InstrItinClass;
+def IIC_MMX_PHADDSUBD_RM : InstrItinClass;
+def IIC_MMX_PHADDSUBD_RR : InstrItinClass;
+def IIC_MMX_PMUL : InstrItinClass;
+def IIC_MMX_MISC_FUNC_MEM : InstrItinClass;
+def IIC_MMX_MISC_FUNC_REG : InstrItinClass;
+def IIC_MMX_PSADBW : InstrItinClass;
+def IIC_MMX_SHIFT_RI : InstrItinClass;
+def IIC_MMX_SHIFT_RM : InstrItinClass;
+def IIC_MMX_SHIFT_RR : InstrItinClass;
+def IIC_MMX_UNPCK_H_RM : InstrItinClass;
+def IIC_MMX_UNPCK_H_RR : InstrItinClass;
+def IIC_MMX_UNPCK_L : InstrItinClass;
+def IIC_MMX_PCK_RM : InstrItinClass;
+def IIC_MMX_PCK_RR : InstrItinClass;
+def IIC_MMX_PSHUF : InstrItinClass;
+def IIC_MMX_PEXTR : InstrItinClass;
+def IIC_MMX_PINSRW : InstrItinClass;
+def IIC_MMX_MASKMOV : InstrItinClass;
+
+def IIC_MMX_CVT_PD_RR : InstrItinClass;
+def IIC_MMX_CVT_PD_RM : InstrItinClass;
+def IIC_MMX_CVT_PS_RR : InstrItinClass;
+def IIC_MMX_CVT_PS_RM : InstrItinClass;
+
+def IIC_CMPX_LOCK : InstrItinClass;
+def IIC_CMPX_LOCK_8 : InstrItinClass;
+def IIC_CMPX_LOCK_8B : InstrItinClass;
+def IIC_CMPX_LOCK_16B : InstrItinClass;
+
+def IIC_XADD_LOCK_MEM : InstrItinClass;
+def IIC_XADD_LOCK_MEM8 : InstrItinClass;
+
+def IIC_FILD : InstrItinClass;
+def IIC_FLD : InstrItinClass;
+def IIC_FLD80 : InstrItinClass;
+def IIC_FST : InstrItinClass;
+def IIC_FST80 : InstrItinClass;
+def IIC_FIST : InstrItinClass;
+def IIC_FLDZ : InstrItinClass;
+def IIC_FUCOM : InstrItinClass;
+def IIC_FUCOMI : InstrItinClass;
+def IIC_FCOMI : InstrItinClass;
+def IIC_FNSTSW : InstrItinClass;
+def IIC_FNSTCW : InstrItinClass;
+def IIC_FLDCW : InstrItinClass;
+def IIC_FNINIT : InstrItinClass;
+def IIC_FFREE : InstrItinClass;
+def IIC_FNCLEX : InstrItinClass;
+def IIC_WAIT : InstrItinClass;
+def IIC_FXAM : InstrItinClass;
+def IIC_FNOP : InstrItinClass;
+def IIC_FLDL : InstrItinClass;
+def IIC_F2XM1 : InstrItinClass;
+def IIC_FYL2X : InstrItinClass;
+def IIC_FPTAN : InstrItinClass;
+def IIC_FPATAN : InstrItinClass;
+def IIC_FXTRACT : InstrItinClass;
+def IIC_FPREM1 : InstrItinClass;
+def IIC_FPSTP : InstrItinClass;
+def IIC_FPREM : InstrItinClass;
+def IIC_FYL2XP1 : InstrItinClass;
+def IIC_FSINCOS : InstrItinClass;
+def IIC_FRNDINT : InstrItinClass;
+def IIC_FSCALE : InstrItinClass;
+def IIC_FCOMPP : InstrItinClass;
+def IIC_FXSAVE : InstrItinClass;
+def IIC_FXRSTOR : InstrItinClass;
+
+def IIC_FXCH : InstrItinClass;
+
+// System instructions
+def IIC_CPUID : InstrItinClass;
+def IIC_INT : InstrItinClass;
+def IIC_INT3 : InstrItinClass;
+def IIC_INVD : InstrItinClass;
+def IIC_INVLPG : InstrItinClass;
+def IIC_IRET : InstrItinClass;
+def IIC_HLT : InstrItinClass;
+def IIC_LXS : InstrItinClass;
+def IIC_LTR : InstrItinClass;
+def IIC_RDTSC : InstrItinClass;
+def IIC_RSM : InstrItinClass;
+def IIC_SIDT : InstrItinClass;
+def IIC_SGDT : InstrItinClass;
+def IIC_SLDT : InstrItinClass;
+def IIC_STR : InstrItinClass;
+def IIC_SWAPGS : InstrItinClass;
+def IIC_SYSCALL : InstrItinClass;
+def IIC_SYS_ENTER_EXIT : InstrItinClass;
+def IIC_IN_RR : InstrItinClass;
+def IIC_IN_RI : InstrItinClass;
+def IIC_OUT_RR : InstrItinClass;
+def IIC_OUT_IR : InstrItinClass;
+def IIC_INS : InstrItinClass;
+def IIC_MOV_REG_DR : InstrItinClass;
+def IIC_MOV_DR_REG : InstrItinClass;
+def IIC_MOV_REG_CR : InstrItinClass;
+def IIC_MOV_CR_REG : InstrItinClass;
+def IIC_MOV_REG_SR : InstrItinClass;
+def IIC_MOV_MEM_SR : InstrItinClass;
+def IIC_MOV_SR_REG : InstrItinClass;
+def IIC_MOV_SR_MEM : InstrItinClass;
+def IIC_LAR_RM : InstrItinClass;
+def IIC_LAR_RR : InstrItinClass;
+def IIC_LSL_RM : InstrItinClass;
+def IIC_LSL_RR : InstrItinClass;
+def IIC_LGDT : InstrItinClass;
+def IIC_LIDT : InstrItinClass;
+def IIC_LLDT_REG : InstrItinClass;
+def IIC_LLDT_MEM : InstrItinClass;
+def IIC_PUSH_CS : InstrItinClass;
+def IIC_PUSH_SR : InstrItinClass;
+def IIC_POP_SR : InstrItinClass;
+def IIC_POP_SR_SS : InstrItinClass;
+def IIC_VERR : InstrItinClass;
+def IIC_VERW_REG : InstrItinClass;
+def IIC_VERW_MEM : InstrItinClass;
+def IIC_WRMSR : InstrItinClass;
+def IIC_RDMSR : InstrItinClass;
+def IIC_RDPMC : InstrItinClass;
+def IIC_SMSW : InstrItinClass;
+def IIC_LMSW_REG : InstrItinClass;
+def IIC_LMSW_MEM : InstrItinClass;
+def IIC_ENTER : InstrItinClass;
+def IIC_LEAVE : InstrItinClass;
+def IIC_POP_MEM : InstrItinClass;
+def IIC_POP_REG16 : InstrItinClass;
+def IIC_POP_REG : InstrItinClass;
+def IIC_POP_F : InstrItinClass;
+def IIC_POP_FD : InstrItinClass;
+def IIC_POP_A : InstrItinClass;
+def IIC_PUSH_IMM : InstrItinClass;
+def IIC_PUSH_MEM : InstrItinClass;
+def IIC_PUSH_REG : InstrItinClass;
+def IIC_PUSH_F : InstrItinClass;
+def IIC_PUSH_A : InstrItinClass;
+def IIC_BSWAP : InstrItinClass;
+def IIC_BIT_SCAN_MEM : InstrItinClass;
+def IIC_BIT_SCAN_REG : InstrItinClass;
+def IIC_MOVS : InstrItinClass;
+def IIC_STOS : InstrItinClass;
+def IIC_SCAS : InstrItinClass;
+def IIC_CMPS : InstrItinClass;
+def IIC_MOV : InstrItinClass;
+def IIC_MOV_MEM : InstrItinClass;
+def IIC_AHF : InstrItinClass;
+def IIC_BT_MI : InstrItinClass;
+def IIC_BT_MR : InstrItinClass;
+def IIC_BT_RI : InstrItinClass;
+def IIC_BT_RR : InstrItinClass;
+def IIC_BTX_MI : InstrItinClass;
+def IIC_BTX_MR : InstrItinClass;
+def IIC_BTX_RI : InstrItinClass;
+def IIC_BTX_RR : InstrItinClass;
+def IIC_XCHG_REG : InstrItinClass;
+def IIC_XCHG_MEM : InstrItinClass;
+def IIC_XADD_REG : InstrItinClass;
+def IIC_XADD_MEM : InstrItinClass;
+def IIC_CMPXCHG_MEM : InstrItinClass;
+def IIC_CMPXCHG_REG : InstrItinClass;
+def IIC_CMPXCHG_MEM8 : InstrItinClass;
+def IIC_CMPXCHG_REG8 : InstrItinClass;
+def IIC_CMPXCHG_8B : InstrItinClass;
+def IIC_CMPXCHG_16B : InstrItinClass;
+def IIC_LODS : InstrItinClass;
+def IIC_OUTS : InstrItinClass;
+def IIC_CLC : InstrItinClass;
+def IIC_CLD : InstrItinClass;
+def IIC_CLI : InstrItinClass;
+def IIC_CMC : InstrItinClass;
+def IIC_CLTS : InstrItinClass;
+def IIC_STC : InstrItinClass;
+def IIC_STI : InstrItinClass;
+def IIC_STD : InstrItinClass;
+def IIC_XLAT : InstrItinClass;
+def IIC_AAA : InstrItinClass;
+def IIC_AAD : InstrItinClass;
+def IIC_AAM : InstrItinClass;
+def IIC_AAS : InstrItinClass;
+def IIC_DAA : InstrItinClass;
+def IIC_DAS : InstrItinClass;
+def IIC_BOUND : InstrItinClass;
+def IIC_ARPL_REG : InstrItinClass;
+def IIC_ARPL_MEM : InstrItinClass;
+def IIC_MOVBE : InstrItinClass;
+def IIC_AES : InstrItinClass;
+def IIC_BLEND_MEM : InstrItinClass;
+def IIC_BLEND_NOMEM : InstrItinClass;
+def IIC_CBW : InstrItinClass;
+def IIC_CRC32_REG : InstrItinClass;
+def IIC_CRC32_MEM : InstrItinClass;
+def IIC_SSE_DPPD_RR : InstrItinClass;
+def IIC_SSE_DPPD_RM : InstrItinClass;
+def IIC_SSE_DPPS_RR : InstrItinClass;
+def IIC_SSE_DPPS_RM : InstrItinClass;
+def IIC_MMX_EMMS : InstrItinClass;
+def IIC_SSE_EXTRACTPS_RR : InstrItinClass;
+def IIC_SSE_EXTRACTPS_RM : InstrItinClass;
+def IIC_SSE_INSERTPS_RR : InstrItinClass;
+def IIC_SSE_INSERTPS_RM : InstrItinClass;
+def IIC_SSE_MPSADBW_RR : InstrItinClass;
+def IIC_SSE_MPSADBW_RM : InstrItinClass;
+def IIC_SSE_PMULLD_RR : InstrItinClass;
+def IIC_SSE_PMULLD_RM : InstrItinClass;
+def IIC_SSE_ROUNDPS_REG : InstrItinClass;
+def IIC_SSE_ROUNDPS_MEM : InstrItinClass;
+def IIC_SSE_ROUNDPD_REG : InstrItinClass;
+def IIC_SSE_ROUNDPD_MEM : InstrItinClass;
+def IIC_SSE_POPCNT_RR : InstrItinClass;
+def IIC_SSE_POPCNT_RM : InstrItinClass;
+def IIC_SSE_PCLMULQDQ_RR : InstrItinClass;
+def IIC_SSE_PCLMULQDQ_RM : InstrItinClass;
+
+def IIC_NOP : InstrItinClass;
+
+//===----------------------------------------------------------------------===//
+// Processor instruction itineraries.
+
+// IssueWidth is analogous to the number of decode units. Core and its
+// descendants, including Nehalem and SandyBridge, have 4 decoders.
+// Resources beyond the decoder operate on micro-ops and are buffered
+// so adjacent micro-ops don't directly compete.
+//
+// MicroOpBufferSize > 1 indicates that RAW dependencies can be
+// decoded in the same cycle. The value 32 is a somewhat arbitrary
+// number of in-flight instructions.
+//
+// HighLatency=10 is optimistic. X86InstrInfo::isHighLatencyDef
+// indicates high latency opcodes. Alternatively, InstrItinData
+// entries may be included here to define specific operand
+// latencies. Since these latencies are not used for pipeline hazards,
+// they do not need to be exact.
+//
+// The GenericModel contains no instruction itineraries.
+def GenericModel : SchedMachineModel {
+ let IssueWidth = 4;
+ let MicroOpBufferSize = 32;
+ let LoadLatency = 4;
+ let HighLatency = 10;
+ let PostRAScheduler = 0;
+}
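+
+// As a sketch of how this model gets used (the CPU name below is made up for
+// illustration, not taken from this file), a processor definition elsewhere
+// opts in via ProcessorModel:
+//   def : ProcessorModel<"some-generic-cpu", GenericModel, []>;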
+
+include "X86ScheduleAtom.td"
+include "X86SchedSandyBridge.td"
+include "X86SchedHaswell.td"
+include "X86ScheduleSLM.td"
+include "X86ScheduleBtVer2.td"
+
diff --git a/contrib/llvm/lib/Target/X86/X86ScheduleAtom.td b/contrib/llvm/lib/Target/X86/X86ScheduleAtom.td
new file mode 100644
index 0000000..4c559c9
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86ScheduleAtom.td
@@ -0,0 +1,549 @@
+//===- X86ScheduleAtom.td - X86 Atom Scheduling Definitions -*- tablegen -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the in-order Intel Atom
+// (Saltwell, 32nm / Bonnell, 45nm) processors.
+//
+//===----------------------------------------------------------------------===//
+
+//
+// Scheduling information derived from the "Intel 64 and IA-32 Architectures
+// Optimization Reference Manual", Chapter 13, Section 4.
+// Functional Units
+// Port 0
+def Port0 : FuncUnit; // ALU: ALU0, shift/rotate, load/store
+                          // SIMD/FP: SIMD ALU, Shuffle, SIMD/FP multiply, divide
+def Port1 : FuncUnit; // ALU: ALU1, bit processing, jump, and LEA
+ // SIMD/FP: SIMD ALU, FP Adder
+
+def AtomItineraries : ProcessorItineraries<
+ [ Port0, Port1 ],
+ [], [
+ // P0 only
+ // InstrItinData<class, [InstrStage<N, [P0]>] >,
+ // P0 or P1
+ // InstrItinData<class, [InstrStage<N, [P0, P1]>] >,
+ // P0 and P1
+ // InstrItinData<class, [InstrStage<N, [P0], 0>, InstrStage<N, [P1]>] >,
+ //
+ // Default is 1 cycle, port0 or port1
+ InstrItinData<IIC_ALU_MEM, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_ALU_NONMEM, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_LEA, [InstrStage<1, [Port1]>] >,
+ InstrItinData<IIC_LEA_16, [InstrStage<2, [Port0, Port1]>] >,
+ // mul
+ InstrItinData<IIC_MUL8, [InstrStage<7, [Port0, Port1]>] >,
+ InstrItinData<IIC_MUL16_MEM, [InstrStage<8, [Port0, Port1]>] >,
+ InstrItinData<IIC_MUL16_REG, [InstrStage<7, [Port0, Port1]>] >,
+ InstrItinData<IIC_MUL32_MEM, [InstrStage<7, [Port0, Port1]>] >,
+ InstrItinData<IIC_MUL32_REG, [InstrStage<6, [Port0, Port1]>] >,
+ InstrItinData<IIC_MUL64, [InstrStage<12, [Port0, Port1]>] >,
+ // imul by al, ax, eax, rax
+ InstrItinData<IIC_IMUL8, [InstrStage<7, [Port0, Port1]>] >,
+ InstrItinData<IIC_IMUL16_MEM, [InstrStage<8, [Port0, Port1]>] >,
+ InstrItinData<IIC_IMUL16_REG, [InstrStage<7, [Port0, Port1]>] >,
+ InstrItinData<IIC_IMUL32_MEM, [InstrStage<7, [Port0, Port1]>] >,
+ InstrItinData<IIC_IMUL32_REG, [InstrStage<6, [Port0, Port1]>] >,
+ InstrItinData<IIC_IMUL64, [InstrStage<12, [Port0, Port1]>] >,
+ // imul reg by reg|mem
+ InstrItinData<IIC_IMUL16_RM, [InstrStage<7, [Port0, Port1]>] >,
+ InstrItinData<IIC_IMUL16_RR, [InstrStage<6, [Port0, Port1]>] >,
+ InstrItinData<IIC_IMUL32_RM, [InstrStage<5, [Port0]>] >,
+ InstrItinData<IIC_IMUL32_RR, [InstrStage<5, [Port0]>] >,
+ InstrItinData<IIC_IMUL64_RM, [InstrStage<12, [Port0, Port1]>] >,
+ InstrItinData<IIC_IMUL64_RR, [InstrStage<12, [Port0, Port1]>] >,
+ // imul reg = reg/mem * imm
+ InstrItinData<IIC_IMUL16_RRI, [InstrStage<6, [Port0, Port1]>] >,
+ InstrItinData<IIC_IMUL32_RRI, [InstrStage<5, [Port0]>] >,
+ InstrItinData<IIC_IMUL64_RRI, [InstrStage<14, [Port0, Port1]>] >,
+ InstrItinData<IIC_IMUL16_RMI, [InstrStage<7, [Port0, Port1]>] >,
+ InstrItinData<IIC_IMUL32_RMI, [InstrStage<5, [Port0]>] >,
+ InstrItinData<IIC_IMUL64_RMI, [InstrStage<14, [Port0, Port1]>] >,
+ // idiv
+ InstrItinData<IIC_IDIV8, [InstrStage<62, [Port0, Port1]>] >,
+ InstrItinData<IIC_IDIV16, [InstrStage<62, [Port0, Port1]>] >,
+ InstrItinData<IIC_IDIV32, [InstrStage<62, [Port0, Port1]>] >,
+ InstrItinData<IIC_IDIV64, [InstrStage<130, [Port0, Port1]>] >,
+ // div
+ InstrItinData<IIC_DIV8_REG, [InstrStage<50, [Port0, Port1]>] >,
+ InstrItinData<IIC_DIV8_MEM, [InstrStage<68, [Port0, Port1]>] >,
+ InstrItinData<IIC_DIV16, [InstrStage<50, [Port0, Port1]>] >,
+ InstrItinData<IIC_DIV32, [InstrStage<50, [Port0, Port1]>] >,
+ InstrItinData<IIC_DIV64, [InstrStage<130, [Port0, Port1]>] >,
+ // neg/not/inc/dec
+ InstrItinData<IIC_UNARY_REG, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_UNARY_MEM, [InstrStage<1, [Port0]>] >,
+ // add/sub/and/or/xor/cmp/test
+ InstrItinData<IIC_BIN_NONMEM, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_BIN_MEM, [InstrStage<1, [Port0]>] >,
+ // adc/sbc
+ InstrItinData<IIC_BIN_CARRY_NONMEM, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_BIN_CARRY_MEM, [InstrStage<1, [Port0]>] >,
+ // shift/rotate
+ InstrItinData<IIC_SR, [InstrStage<1, [Port0]>] >,
+ // shift double
+ InstrItinData<IIC_SHD16_REG_IM, [InstrStage<6, [Port0, Port1]>] >,
+ InstrItinData<IIC_SHD16_REG_CL, [InstrStage<6, [Port0, Port1]>] >,
+ InstrItinData<IIC_SHD16_MEM_IM, [InstrStage<6, [Port0, Port1]>] >,
+ InstrItinData<IIC_SHD16_MEM_CL, [InstrStage<6, [Port0, Port1]>] >,
+ InstrItinData<IIC_SHD32_REG_IM, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_SHD32_REG_CL, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_SHD32_MEM_IM, [InstrStage<4, [Port0, Port1]>] >,
+ InstrItinData<IIC_SHD32_MEM_CL, [InstrStage<4, [Port0, Port1]>] >,
+ InstrItinData<IIC_SHD64_REG_IM, [InstrStage<9, [Port0, Port1]>] >,
+ InstrItinData<IIC_SHD64_REG_CL, [InstrStage<8, [Port0, Port1]>] >,
+ InstrItinData<IIC_SHD64_MEM_IM, [InstrStage<9, [Port0, Port1]>] >,
+ InstrItinData<IIC_SHD64_MEM_CL, [InstrStage<9, [Port0, Port1]>] >,
+ // cmov
+ InstrItinData<IIC_CMOV16_RM, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_CMOV16_RR, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_CMOV32_RM, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_CMOV32_RR, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_CMOV64_RM, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_CMOV64_RR, [InstrStage<1, [Port0, Port1]>] >,
+ // set
+ InstrItinData<IIC_SET_M, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_SET_R, [InstrStage<1, [Port0, Port1]>] >,
+ // jcc
+ InstrItinData<IIC_Jcc, [InstrStage<1, [Port1]>] >,
+ // jcxz/jecxz/jrcxz
+ InstrItinData<IIC_JCXZ, [InstrStage<4, [Port0, Port1]>] >,
+ // jmp rel
+ InstrItinData<IIC_JMP_REL, [InstrStage<1, [Port1]>] >,
+ // jmp indirect
+ InstrItinData<IIC_JMP_REG, [InstrStage<1, [Port1]>] >,
+ InstrItinData<IIC_JMP_MEM, [InstrStage<2, [Port0, Port1]>] >,
+ // jmp far
+ InstrItinData<IIC_JMP_FAR_MEM, [InstrStage<32, [Port0, Port1]>] >,
+ InstrItinData<IIC_JMP_FAR_PTR, [InstrStage<31, [Port0, Port1]>] >,
+ // loop/loope/loopne
+ InstrItinData<IIC_LOOP, [InstrStage<18, [Port0, Port1]>] >,
+ InstrItinData<IIC_LOOPE, [InstrStage<8, [Port0, Port1]>] >,
+ InstrItinData<IIC_LOOPNE, [InstrStage<17, [Port0, Port1]>] >,
+ // call - all but reg/imm
+ InstrItinData<IIC_CALL_RI, [InstrStage<1, [Port0], 0>,
+ InstrStage<1, [Port1]>] >,
+ InstrItinData<IIC_CALL_MEM, [InstrStage<15, [Port0, Port1]>] >,
+ InstrItinData<IIC_CALL_FAR_MEM, [InstrStage<40, [Port0, Port1]>] >,
+ InstrItinData<IIC_CALL_FAR_PTR, [InstrStage<39, [Port0, Port1]>] >,
+  // ret
+ InstrItinData<IIC_RET, [InstrStage<79, [Port0, Port1]>] >,
+ InstrItinData<IIC_RET_IMM, [InstrStage<1, [Port0], 0>, InstrStage<1, [Port1]>] >,
+  // sign extension movs
+  InstrItinData<IIC_MOVSX, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_MOVSX_R16_R8, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_MOVSX_R16_M8, [InstrStage<3, [Port0, Port1]>] >,
+ InstrItinData<IIC_MOVSX_R16_R16, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_MOVSX_R32_R32, [InstrStage<1, [Port0, Port1]>] >,
+  // zero extension movs
+  InstrItinData<IIC_MOVZX, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_MOVZX_R16_R8, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_MOVZX_R16_M8, [InstrStage<3, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_REP_MOVS, [InstrStage<75, [Port0, Port1]>] >,
+ InstrItinData<IIC_REP_STOS, [InstrStage<74, [Port0, Port1]>] >,
+
+ // SSE binary operations
+ // arithmetic fp scalar
+ InstrItinData<IIC_SSE_ALU_F32S_RR, [InstrStage<5, [Port1]>] >,
+ InstrItinData<IIC_SSE_ALU_F32S_RM, [InstrStage<5, [Port0], 0>,
+ InstrStage<5, [Port1]>] >,
+ InstrItinData<IIC_SSE_ALU_F64S_RR, [InstrStage<5, [Port1]>] >,
+ InstrItinData<IIC_SSE_ALU_F64S_RM, [InstrStage<5, [Port0], 0>,
+ InstrStage<5, [Port1]>] >,
+ InstrItinData<IIC_SSE_MUL_F32S_RR, [InstrStage<4, [Port0]>] >,
+ InstrItinData<IIC_SSE_MUL_F32S_RM, [InstrStage<4, [Port0]>] >,
+ InstrItinData<IIC_SSE_MUL_F64S_RR, [InstrStage<5, [Port0]>] >,
+ InstrItinData<IIC_SSE_MUL_F64S_RM, [InstrStage<5, [Port0]>] >,
+ InstrItinData<IIC_SSE_DIV_F32S_RR, [InstrStage<34, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_DIV_F32S_RM, [InstrStage<34, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_DIV_F64S_RR, [InstrStage<62, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_DIV_F64S_RM, [InstrStage<62, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_SSE_COMIS_RR, [InstrStage<9, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_COMIS_RM, [InstrStage<10, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_SSE_HADDSUB_RR, [InstrStage<8, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_HADDSUB_RM, [InstrStage<9, [Port0, Port1]>] >,
+
+ // arithmetic fp parallel
+ InstrItinData<IIC_SSE_ALU_F32P_RR, [InstrStage<5, [Port1]>] >,
+ InstrItinData<IIC_SSE_ALU_F32P_RM, [InstrStage<5, [Port0], 0>,
+ InstrStage<5, [Port1]>] >,
+ InstrItinData<IIC_SSE_ALU_F64P_RR, [InstrStage<6, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_ALU_F64P_RM, [InstrStage<7, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_MUL_F32P_RR, [InstrStage<5, [Port0]>] >,
+ InstrItinData<IIC_SSE_MUL_F32P_RM, [InstrStage<5, [Port0]>] >,
+ InstrItinData<IIC_SSE_MUL_F64P_RR, [InstrStage<9, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_MUL_F64P_RM, [InstrStage<10, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_DIV_F32P_RR, [InstrStage<70, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_DIV_F32P_RM, [InstrStage<70, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_DIV_F64P_RR, [InstrStage<125, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_DIV_F64P_RM, [InstrStage<125, [Port0, Port1]>] >,
+
+ // bitwise parallel
+ InstrItinData<IIC_SSE_BIT_P_RR, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_BIT_P_RM, [InstrStage<1, [Port0]>] >,
+
+ // arithmetic int parallel
+ InstrItinData<IIC_SSE_INTALU_P_RR, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_INTALU_P_RM, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_SSE_INTALUQ_P_RR, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_INTALUQ_P_RM, [InstrStage<3, [Port0, Port1]>] >,
+
+ // multiply int parallel
+ InstrItinData<IIC_SSE_INTMUL_P_RR, [InstrStage<5, [Port0]>] >,
+ InstrItinData<IIC_SSE_INTMUL_P_RM, [InstrStage<5, [Port0]>] >,
+
+ // shift parallel
+ InstrItinData<IIC_SSE_INTSH_P_RR, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_INTSH_P_RM, [InstrStage<3, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_INTSH_P_RI, [InstrStage<1, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_SSE_INTSHDQ_P_RI, [InstrStage<1, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_SSE_SHUFP, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_SSE_PSHUF_RI, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_SSE_PSHUF_MI, [InstrStage<1, [Port0]>] >,
+
+ InstrItinData<IIC_SSE_UNPCK, [InstrStage<1, [Port0]>] >,
+
+ InstrItinData<IIC_SSE_SQRTPS_RR, [InstrStage<70, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_SQRTPS_RM, [InstrStage<70, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_SQRTSS_RR, [InstrStage<34, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_SQRTSS_RM, [InstrStage<34, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_SSE_SQRTPD_RR, [InstrStage<125, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_SQRTPD_RM, [InstrStage<125, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_SQRTSD_RR, [InstrStage<62, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_SQRTSD_RM, [InstrStage<62, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_SSE_RSQRTPS_RR, [InstrStage<9, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_RSQRTPS_RM, [InstrStage<10, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_RSQRTSS_RR, [InstrStage<4, [Port0]>] >,
+ InstrItinData<IIC_SSE_RSQRTSS_RM, [InstrStage<4, [Port0]>] >,
+
+ InstrItinData<IIC_SSE_RCPP_RR, [InstrStage<9, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_RCPP_RM, [InstrStage<10, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_RCPS_RR, [InstrStage<4, [Port0]>] >,
+ InstrItinData<IIC_SSE_RCPS_RM, [InstrStage<4, [Port0]>] >,
+
+ InstrItinData<IIC_SSE_MOVMSK, [InstrStage<3, [Port0]>] >,
+ InstrItinData<IIC_SSE_MASKMOV, [InstrStage<2, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_SSE_PEXTRW, [InstrStage<4, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_PINSRW, [InstrStage<1, [Port0]>] >,
+
+ InstrItinData<IIC_SSE_PABS_RR, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_PABS_RM, [InstrStage<1, [Port0]>] >,
+
+ InstrItinData<IIC_SSE_MOV_S_RR, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_MOV_S_RM, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_SSE_MOV_S_MR, [InstrStage<1, [Port0]>] >,
+
+ InstrItinData<IIC_SSE_MOVA_P_RR, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_MOVA_P_RM, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_SSE_MOVA_P_MR, [InstrStage<1, [Port0]>] >,
+
+ InstrItinData<IIC_SSE_MOVU_P_RR, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_MOVU_P_RM, [InstrStage<3, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_MOVU_P_MR, [InstrStage<2, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_SSE_MOV_LH, [InstrStage<1, [Port0]>] >,
+
+ InstrItinData<IIC_SSE_LDDQU, [InstrStage<3, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_SSE_MOVDQ, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_SSE_MOVD_ToGP, [InstrStage<3, [Port0]>] >,
+ InstrItinData<IIC_SSE_MOVQ_RR, [InstrStage<1, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_SSE_MOVNT, [InstrStage<1, [Port0]>] >,
+
+ InstrItinData<IIC_SSE_PREFETCH, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_SSE_PAUSE, [InstrStage<17, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_LFENCE, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_MFENCE, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_SSE_SFENCE, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_SSE_LDMXCSR, [InstrStage<5, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_STMXCSR, [InstrStage<15, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_SSE_PHADDSUBD_RR, [InstrStage<3, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_PHADDSUBD_RM, [InstrStage<4, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_PHADDSUBSW_RR, [InstrStage<7, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_PHADDSUBSW_RM, [InstrStage<8, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_PHADDSUBW_RR, [InstrStage<7, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_PHADDSUBW_RM, [InstrStage<8, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_PSHUFB_RR, [InstrStage<4, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_PSHUFB_RM, [InstrStage<5, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_PSIGN_RR, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_PSIGN_RM, [InstrStage<1, [Port0]>] >,
+
+ InstrItinData<IIC_SSE_PMADD, [InstrStage<5, [Port0]>] >,
+ InstrItinData<IIC_SSE_PMULHRSW, [InstrStage<5, [Port0]>] >,
+ InstrItinData<IIC_SSE_PALIGNRR, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_SSE_PALIGNRM, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_SSE_MWAIT, [InstrStage<46, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_MONITOR, [InstrStage<45, [Port0, Port1]>] >,
+
+ // conversions
+ // to/from PD ...
+ InstrItinData<IIC_SSE_CVT_PD_RR, [InstrStage<7, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_CVT_PD_RM, [InstrStage<8, [Port0, Port1]>] >,
+ // to/from PS except to/from PD and PS2PI
+ InstrItinData<IIC_SSE_CVT_PS_RR, [InstrStage<6, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_CVT_PS_RM, [InstrStage<7, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_CVT_Scalar_RR, [InstrStage<6, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_CVT_Scalar_RM, [InstrStage<7, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_CVT_SS2SI32_RR, [InstrStage<8, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_CVT_SS2SI32_RM, [InstrStage<9, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_CVT_SS2SI64_RR, [InstrStage<9, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_CVT_SS2SI64_RM, [InstrStage<10, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_CVT_SD2SI_RR, [InstrStage<8, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_CVT_SD2SI_RM, [InstrStage<9, [Port0, Port1]>] >,
+
+ // MMX MOVs
+ InstrItinData<IIC_MMX_MOV_MM_RM, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_MMX_MOV_REG_MM, [InstrStage<3, [Port0]>] >,
+ InstrItinData<IIC_MMX_MOVQ_RM, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_MMX_MOVQ_RR, [InstrStage<1, [Port0, Port1]>] >,
+ // other MMX
+ InstrItinData<IIC_MMX_ALU_RM, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_MMX_ALU_RR, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_MMX_ALUQ_RM, [InstrStage<3, [Port0, Port1]>] >,
+ InstrItinData<IIC_MMX_ALUQ_RR, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_MMX_PHADDSUBW_RM, [InstrStage<6, [Port0, Port1]>] >,
+ InstrItinData<IIC_MMX_PHADDSUBW_RR, [InstrStage<5, [Port0, Port1]>] >,
+ InstrItinData<IIC_MMX_PHADDSUBD_RM, [InstrStage<4, [Port0, Port1]>] >,
+ InstrItinData<IIC_MMX_PHADDSUBD_RR, [InstrStage<3, [Port0, Port1]>] >,
+ InstrItinData<IIC_MMX_PMUL, [InstrStage<4, [Port0]>] >,
+ InstrItinData<IIC_MMX_MISC_FUNC_MEM, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_MMX_MISC_FUNC_REG, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_MMX_PSADBW, [InstrStage<4, [Port0, Port1]>] >,
+ InstrItinData<IIC_MMX_SHIFT_RI, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_MMX_SHIFT_RM, [InstrStage<3, [Port0, Port1]>] >,
+ InstrItinData<IIC_MMX_SHIFT_RR, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_MMX_UNPCK_H_RM, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_MMX_UNPCK_H_RR, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_MMX_UNPCK_L, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_MMX_PCK_RM, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_MMX_PCK_RR, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_MMX_PSHUF, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_MMX_PEXTR, [InstrStage<4, [Port0, Port1]>] >,
+ InstrItinData<IIC_MMX_PINSRW, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_MMX_MASKMOV, [InstrStage<1, [Port0]>] >,
+ // conversions
+ // from/to PD
+ InstrItinData<IIC_MMX_CVT_PD_RR, [InstrStage<7, [Port0, Port1]>] >,
+ InstrItinData<IIC_MMX_CVT_PD_RM, [InstrStage<8, [Port0, Port1]>] >,
+ // from/to PI
+ InstrItinData<IIC_MMX_CVT_PS_RR, [InstrStage<5, [Port1]>] >,
+ InstrItinData<IIC_MMX_CVT_PS_RM, [InstrStage<5, [Port0], 0>,
+ InstrStage<5, [Port1]>]>,
+
+ InstrItinData<IIC_CMPX_LOCK, [InstrStage<14, [Port0, Port1]>] >,
+ InstrItinData<IIC_CMPX_LOCK_8, [InstrStage<6, [Port0, Port1]>] >,
+ InstrItinData<IIC_CMPX_LOCK_8B, [InstrStage<18, [Port0, Port1]>] >,
+ InstrItinData<IIC_CMPX_LOCK_16B, [InstrStage<22, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_XADD_LOCK_MEM, [InstrStage<2, [Port0, Port1]>] >,
+  InstrItinData<IIC_XADD_LOCK_MEM8, [InstrStage<3, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_FILD, [InstrStage<5, [Port0], 0>, InstrStage<5, [Port1]>] >,
+ InstrItinData<IIC_FLD, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_FLD80, [InstrStage<4, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_FST, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_FST80, [InstrStage<5, [Port0, Port1]>] >,
+ InstrItinData<IIC_FIST, [InstrStage<6, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_FLDZ, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_FUCOM, [InstrStage<1, [Port1]>] >,
+ InstrItinData<IIC_FUCOMI, [InstrStage<9, [Port0, Port1]>] >,
+ InstrItinData<IIC_FCOMI, [InstrStage<9, [Port0, Port1]>] >,
+ InstrItinData<IIC_FNSTSW, [InstrStage<10, [Port0, Port1]>] >,
+ InstrItinData<IIC_FNSTCW, [InstrStage<8, [Port0, Port1]>] >,
+ InstrItinData<IIC_FLDCW, [InstrStage<5, [Port0, Port1]>] >,
+ InstrItinData<IIC_FNINIT, [InstrStage<63, [Port0, Port1]>] >,
+ InstrItinData<IIC_FFREE, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_FNCLEX, [InstrStage<25, [Port0, Port1]>] >,
+ InstrItinData<IIC_WAIT, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_FXAM, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_FNOP, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_FLDL, [InstrStage<10, [Port0, Port1]>] >,
+ InstrItinData<IIC_F2XM1, [InstrStage<99, [Port0, Port1]>] >,
+ InstrItinData<IIC_FYL2X, [InstrStage<146, [Port0, Port1]>] >,
+ InstrItinData<IIC_FPTAN, [InstrStage<168, [Port0, Port1]>] >,
+ InstrItinData<IIC_FPATAN, [InstrStage<183, [Port0, Port1]>] >,
+ InstrItinData<IIC_FXTRACT, [InstrStage<25, [Port0, Port1]>] >,
+ InstrItinData<IIC_FPREM1, [InstrStage<71, [Port0, Port1]>] >,
+ InstrItinData<IIC_FPSTP, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_FPREM, [InstrStage<55, [Port0, Port1]>] >,
+ InstrItinData<IIC_FYL2XP1, [InstrStage<147, [Port0, Port1]>] >,
+ InstrItinData<IIC_FSINCOS, [InstrStage<174, [Port0, Port1]>] >,
+ InstrItinData<IIC_FRNDINT, [InstrStage<46, [Port0, Port1]>] >,
+ InstrItinData<IIC_FSCALE, [InstrStage<77, [Port0, Port1]>] >,
+ InstrItinData<IIC_FCOMPP, [InstrStage<1, [Port1]>] >,
+ InstrItinData<IIC_FXSAVE, [InstrStage<140, [Port0, Port1]>] >,
+ InstrItinData<IIC_FXRSTOR, [InstrStage<141, [Port0, Port1]>] >,
+ InstrItinData<IIC_FXCH, [InstrStage<1, [Port0], 0>, InstrStage<1, [Port1]>] >,
+
+ // System instructions
+ InstrItinData<IIC_CPUID, [InstrStage<121, [Port0, Port1]>] >,
+ InstrItinData<IIC_INT, [InstrStage<127, [Port0, Port1]>] >,
+ InstrItinData<IIC_INT3, [InstrStage<130, [Port0, Port1]>] >,
+ InstrItinData<IIC_INVD, [InstrStage<1003, [Port0, Port1]>] >,
+ InstrItinData<IIC_INVLPG, [InstrStage<71, [Port0, Port1]>] >,
+ InstrItinData<IIC_IRET, [InstrStage<109, [Port0, Port1]>] >,
+ InstrItinData<IIC_HLT, [InstrStage<121, [Port0, Port1]>] >,
+ InstrItinData<IIC_LXS, [InstrStage<10, [Port0, Port1]>] >,
+ InstrItinData<IIC_LTR, [InstrStage<83, [Port0, Port1]>] >,
+ InstrItinData<IIC_RDTSC, [InstrStage<30, [Port0, Port1]>] >,
+ InstrItinData<IIC_RSM, [InstrStage<741, [Port0, Port1]>] >,
+ InstrItinData<IIC_SIDT, [InstrStage<4, [Port0, Port1]>] >,
+ InstrItinData<IIC_SGDT, [InstrStage<4, [Port0, Port1]>] >,
+ InstrItinData<IIC_SLDT, [InstrStage<3, [Port0, Port1]>] >,
+ InstrItinData<IIC_STR, [InstrStage<3, [Port0, Port1]>] >,
+ InstrItinData<IIC_SWAPGS, [InstrStage<22, [Port0, Port1]>] >,
+ InstrItinData<IIC_SYSCALL, [InstrStage<96, [Port0, Port1]>] >,
+ InstrItinData<IIC_SYS_ENTER_EXIT, [InstrStage<88, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_IN_RR, [InstrStage<94, [Port0, Port1]>] >,
+ InstrItinData<IIC_IN_RI, [InstrStage<92, [Port0, Port1]>] >,
+ InstrItinData<IIC_OUT_RR, [InstrStage<68, [Port0, Port1]>] >,
+ InstrItinData<IIC_OUT_IR, [InstrStage<72, [Port0, Port1]>] >,
+ InstrItinData<IIC_INS, [InstrStage<59, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_MOV_REG_DR, [InstrStage<88, [Port0, Port1]>] >,
+ InstrItinData<IIC_MOV_DR_REG, [InstrStage<123, [Port0, Port1]>] >,
+ // worst case for mov REG_CRx
+ InstrItinData<IIC_MOV_REG_CR, [InstrStage<12, [Port0, Port1]>] >,
+ InstrItinData<IIC_MOV_CR_REG, [InstrStage<136, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_MOV_REG_SR, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_MOV_MEM_SR, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_MOV_SR_REG, [InstrStage<21, [Port0, Port1]>] >,
+ InstrItinData<IIC_MOV_SR_MEM, [InstrStage<26, [Port0, Port1]>] >,
+ // LAR
+ InstrItinData<IIC_LAR_RM, [InstrStage<50, [Port0, Port1]>] >,
+ InstrItinData<IIC_LAR_RR, [InstrStage<54, [Port0, Port1]>] >,
+ // LSL
+ InstrItinData<IIC_LSL_RM, [InstrStage<46, [Port0, Port1]>] >,
+ InstrItinData<IIC_LSL_RR, [InstrStage<49, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_LGDT, [InstrStage<44, [Port0, Port1]>] >,
+ InstrItinData<IIC_LIDT, [InstrStage<44, [Port0, Port1]>] >,
+ InstrItinData<IIC_LLDT_REG, [InstrStage<60, [Port0, Port1]>] >,
+ InstrItinData<IIC_LLDT_MEM, [InstrStage<64, [Port0, Port1]>] >,
+ // push control register, segment registers
+ InstrItinData<IIC_PUSH_CS, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_PUSH_SR, [InstrStage<2, [Port0, Port1]>] >,
+ // pop control register, segment registers
+ InstrItinData<IIC_POP_SR, [InstrStage<29, [Port0, Port1]>] >,
+ InstrItinData<IIC_POP_SR_SS, [InstrStage<48, [Port0, Port1]>] >,
+ // VERR, VERW
+ InstrItinData<IIC_VERR, [InstrStage<41, [Port0, Port1]>] >,
+ InstrItinData<IIC_VERW_REG, [InstrStage<51, [Port0, Port1]>] >,
+ InstrItinData<IIC_VERW_MEM, [InstrStage<50, [Port0, Port1]>] >,
+ // WRMSR, RDMSR
+ InstrItinData<IIC_WRMSR, [InstrStage<202, [Port0, Port1]>] >,
+ InstrItinData<IIC_RDMSR, [InstrStage<78, [Port0, Port1]>] >,
+ InstrItinData<IIC_RDPMC, [InstrStage<46, [Port0, Port1]>] >,
+ // SMSW, LMSW
+ InstrItinData<IIC_SMSW, [InstrStage<9, [Port0, Port1]>] >,
+ InstrItinData<IIC_LMSW_REG, [InstrStage<69, [Port0, Port1]>] >,
+ InstrItinData<IIC_LMSW_MEM, [InstrStage<67, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_ENTER, [InstrStage<32, [Port0, Port1]>] >,
+ InstrItinData<IIC_LEAVE, [InstrStage<2, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_POP_MEM, [InstrStage<3, [Port0, Port1]>] >,
+ InstrItinData<IIC_POP_REG16, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_POP_REG, [InstrStage<1, [Port0], 0>,
+ InstrStage<1, [Port1]>] >,
+ InstrItinData<IIC_POP_F, [InstrStage<32, [Port0, Port1]>] >,
+ InstrItinData<IIC_POP_FD, [InstrStage<26, [Port0, Port1]>] >,
+ InstrItinData<IIC_POP_A, [InstrStage<9, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_PUSH_IMM, [InstrStage<1, [Port0], 0>,
+ InstrStage<1, [Port1]>] >,
+ InstrItinData<IIC_PUSH_MEM, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_PUSH_REG, [InstrStage<1, [Port0], 0>,
+ InstrStage<1, [Port1]>] >,
+ InstrItinData<IIC_PUSH_F, [InstrStage<9, [Port0, Port1]>] >,
+ InstrItinData<IIC_PUSH_A, [InstrStage<8, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_BSWAP, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_BIT_SCAN_MEM, [InstrStage<16, [Port0, Port1]>] >,
+ InstrItinData<IIC_BIT_SCAN_REG, [InstrStage<16, [Port0, Port1]>] >,
+ InstrItinData<IIC_MOVS, [InstrStage<3, [Port0, Port1]>] >,
+ InstrItinData<IIC_STOS, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_SCAS, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_CMPS, [InstrStage<3, [Port0, Port1]>] >,
+ InstrItinData<IIC_MOV, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_MOV_MEM, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_AHF, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_BT_MI, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_BT_MR, [InstrStage<9, [Port0, Port1]>] >,
+ InstrItinData<IIC_BT_RI, [InstrStage<1, [Port1]>] >,
+ InstrItinData<IIC_BT_RR, [InstrStage<1, [Port1]>] >,
+ InstrItinData<IIC_BTX_MI, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_BTX_MR, [InstrStage<11, [Port0, Port1]>] >,
+ InstrItinData<IIC_BTX_RI, [InstrStage<1, [Port1]>] >,
+ InstrItinData<IIC_BTX_RR, [InstrStage<1, [Port1]>] >,
+ InstrItinData<IIC_XCHG_REG, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_XCHG_MEM, [InstrStage<3, [Port0, Port1]>] >,
+ InstrItinData<IIC_XADD_REG, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_XADD_MEM, [InstrStage<3, [Port0, Port1]>] >,
+ InstrItinData<IIC_CMPXCHG_MEM, [InstrStage<14, [Port0, Port1]>] >,
+ InstrItinData<IIC_CMPXCHG_REG, [InstrStage<15, [Port0, Port1]>] >,
+ InstrItinData<IIC_CMPXCHG_MEM8, [InstrStage<6, [Port0, Port1]>] >,
+ InstrItinData<IIC_CMPXCHG_REG8, [InstrStage<9, [Port0, Port1]>] >,
+ InstrItinData<IIC_CMPXCHG_8B, [InstrStage<18, [Port0, Port1]>] >,
+ InstrItinData<IIC_CMPXCHG_16B, [InstrStage<22, [Port0, Port1]>] >,
+ InstrItinData<IIC_LODS, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_OUTS, [InstrStage<74, [Port0, Port1]>] >,
+ InstrItinData<IIC_CLC, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_CLD, [InstrStage<3, [Port0, Port1]>] >,
+ InstrItinData<IIC_CLI, [InstrStage<14, [Port0, Port1]>] >,
+ InstrItinData<IIC_CMC, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_CLTS, [InstrStage<33, [Port0, Port1]>] >,
+ InstrItinData<IIC_STC, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_STI, [InstrStage<17, [Port0, Port1]>] >,
+ InstrItinData<IIC_STD, [InstrStage<21, [Port0, Port1]>] >,
+ InstrItinData<IIC_XLAT, [InstrStage<6, [Port0, Port1]>] >,
+ InstrItinData<IIC_AAA, [InstrStage<13, [Port0, Port1]>] >,
+ InstrItinData<IIC_AAD, [InstrStage<7, [Port0, Port1]>] >,
+ InstrItinData<IIC_AAM, [InstrStage<21, [Port0, Port1]>] >,
+ InstrItinData<IIC_AAS, [InstrStage<13, [Port0, Port1]>] >,
+ InstrItinData<IIC_DAA, [InstrStage<18, [Port0, Port1]>] >,
+ InstrItinData<IIC_DAS, [InstrStage<20, [Port0, Port1]>] >,
+ InstrItinData<IIC_BOUND, [InstrStage<11, [Port0, Port1]>] >,
+ InstrItinData<IIC_ARPL_REG, [InstrStage<24, [Port0, Port1]>] >,
+ InstrItinData<IIC_ARPL_MEM, [InstrStage<23, [Port0, Port1]>] >,
+ InstrItinData<IIC_MOVBE, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_CBW, [InstrStage<4, [Port0, Port1]>] >,
+ InstrItinData<IIC_MMX_EMMS, [InstrStage<5, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_NOP, [InstrStage<1, [Port0, Port1]>] >
+ ]>;
+
+// Atom machine model.
+def AtomModel : SchedMachineModel {
+ let IssueWidth = 2; // Allows 2 instructions per scheduling group.
+ let MicroOpBufferSize = 0; // In-order execution, always hide latency.
+  let LoadLatency = 3; // Expected cycles, may be overridden by OperandCycles.
+  let HighLatency = 30; // Expected, may be overridden by OperandCycles.
+
+ // On the Atom, the throughput for taken branches is 2 cycles. For small
+ // simple loops, expand by a small factor to hide the backedge cost.
+ let LoopMicroOpBufferSize = 10;
+ let PostRAScheduler = 1;
+
+ let Itineraries = AtomItineraries;
+}
diff --git a/contrib/llvm/lib/Target/X86/X86ScheduleBtVer2.td b/contrib/llvm/lib/Target/X86/X86ScheduleBtVer2.td
new file mode 100644
index 0000000..ce1ece3
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86ScheduleBtVer2.td
@@ -0,0 +1,341 @@
+//=- X86ScheduleBtVer2.td - X86 BtVer2 (Jaguar) Scheduling ---*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for AMD btver2 (Jaguar) to support
+// instruction scheduling and other instruction cost heuristics. Based on the
+// AMD Software Optimization Guide for AMD Family 16h Processors and its
+// instruction latency appendix.
+//
+//===----------------------------------------------------------------------===//
+
+def BtVer2Model : SchedMachineModel {
+ // All x86 instructions are modeled as a single micro-op, and btver2 can
+ // decode 2 instructions per cycle.
+ let IssueWidth = 2;
+ let MicroOpBufferSize = 64; // Retire Control Unit
+  let LoadLatency = 5; // FPU load latency (worst case; integer loads take 3 cycles)
+ let HighLatency = 25;
+  let MispredictPenalty = 14; // Minimum branch misprediction penalty
+ let PostRAScheduler = 1;
+
+  // FIXME: SSE4/AVX support is unimplemented. This flag is set to allow
+ // the scheduler to assign a default model to unrecognized opcodes.
+ let CompleteModel = 0;
+}
+
+let SchedModel = BtVer2Model in {
+
+// Jaguar can issue up to 6 micro-ops in one cycle.
+def JALU0 : ProcResource<1>; // Integer Pipe0: integer ALU0 (also handles FP->INT jam)
+def JALU1 : ProcResource<1>; // Integer Pipe1: integer ALU1/MUL/DIV
+def JLAGU : ProcResource<1>; // Integer Pipe2: LAGU
+def JSAGU : ProcResource<1>; // Integer Pipe3: SAGU (also handles 3-operand LEA)
+def JFPU0 : ProcResource<1>; // Vector/FPU Pipe0: VALU0/VIMUL/FPA
+def JFPU1 : ProcResource<1>; // Vector/FPU Pipe1: VALU1/STC/FPM
+
+// Any pipe. FIXME: needed until we can properly discriminate between
+// int/FPU loads, stores, and moves.
+def JAny : ProcResGroup<[JALU0, JALU1, JLAGU, JSAGU, JFPU0, JFPU1]>;
+
+// Integer Pipe Scheduler
+def JALU01 : ProcResGroup<[JALU0, JALU1]> {
+  let BufferSize = 20;
+}
+
+// AGU Pipe Scheduler
+def JLSAGU : ProcResGroup<[JLAGU, JSAGU]> {
+  let BufferSize = 12;
+}
+
+// Fpu Pipe Scheduler
+def JFPU01 : ProcResGroup<[JFPU0, JFPU1]> {
+  let BufferSize = 18;
+}
+
+def JDiv : ProcResource<1>; // integer division
+def JMul : ProcResource<1>; // integer multiplication
+def JVALU0 : ProcResource<1>; // vector integer
+def JVALU1 : ProcResource<1>; // vector integer
+def JVIMUL : ProcResource<1>; // vector integer multiplication
+def JSTC : ProcResource<1>; // vector store/convert
+def JFPM : ProcResource<1>; // FP multiplication
+def JFPA : ProcResource<1>; // FP addition
+
+// Integer loads are 3 cycles, so ReadAfterLd registers needn't be available until 3
+// cycles after the memory operand.
+def : ReadAdvance<ReadAfterLd, 3>;
+
+// Many SchedWrites are defined in pairs with and without a folded load.
+// Instructions with folded loads are usually micro-fused, so they only appear
+// as two micro-ops when dispatched by the schedulers.
+// This multiclass defines the resource usage for variants with and without
+// folded loads.
+multiclass JWriteResIntPair<X86FoldableSchedWrite SchedRW,
+ ProcResourceKind ExePort,
+ int Lat> {
+  // The register variant uses a single cycle on ExePort.
+ def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; }
+
+ // Memory variant also uses a cycle on JLAGU and adds 3 cycles to the
+ // latency.
+ def : WriteRes<SchedRW.Folded, [JLAGU, ExePort]> {
+ let Latency = !add(Lat, 3);
+ }
+}
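+
+// For illustration, a later "defm : JWriteResIntPair<WriteALU, JALU01, 1>;"
+// should expand to roughly (assuming WriteALU.Folded resolves to WriteALULd,
+// as set up in X86Schedule.td):
+//   def : WriteRes<WriteALU, [JALU01]> { let Latency = 1; }
+//   def : WriteRes<WriteALULd, [JLAGU, JALU01]> { let Latency = 4; }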
+
+multiclass JWriteResFpuPair<X86FoldableSchedWrite SchedRW,
+ ProcResourceKind ExePort,
+ int Lat> {
+  // The register variant uses a single cycle on ExePort.
+ def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; }
+
+ // Memory variant also uses a cycle on JLAGU and adds 5 cycles to the
+ // latency.
+ def : WriteRes<SchedRW.Folded, [JLAGU, ExePort]> {
+ let Latency = !add(Lat, 5);
+ }
+}
+
+// A folded store needs a cycle on the SAGU for the store data.
+def : WriteRes<WriteRMW, [JSAGU]>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Arithmetic.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : JWriteResIntPair<WriteALU, JALU01, 1>;
+defm : JWriteResIntPair<WriteIMul, JALU1, 3>;
+
+def : WriteRes<WriteIMulH, [JALU1]> {
+ let Latency = 6;
+ let ResourceCycles = [4];
+}
+
+// FIXME: 8/16-bit divisions
+def : WriteRes<WriteIDiv, [JALU1, JDiv]> {
+ let Latency = 25;
+ let ResourceCycles = [1, 25];
+}
+def : WriteRes<WriteIDivLd, [JALU1, JLAGU, JDiv]> {
+ let Latency = 41;
+ let ResourceCycles = [1, 1, 25];
+}
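+
+// Note: ResourceCycles entries are parallel to the resource list, so
+// [1, 1, 25] above means one cycle on JALU1, one on JLAGU, and 25 busy
+// cycles on the JDiv unit; Latency is when the result becomes available.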
+
+// This is for simple LEAs with one or two input operands.
+// FIXME: SAGU 3-operand LEA
+def : WriteRes<WriteLEA, [JALU01]>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Integer shifts and rotates.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : JWriteResIntPair<WriteShift, JALU01, 1>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Loads, stores, and moves, not folded with other operations.
+// FIXME: Split x86 and SSE load/store/moves
+////////////////////////////////////////////////////////////////////////////////
+
+def : WriteRes<WriteLoad, [JLAGU]> { let Latency = 5; }
+def : WriteRes<WriteStore, [JSAGU]>;
+def : WriteRes<WriteMove, [JAny]>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Idioms that clear a register, like xorps %xmm0, %xmm0.
+// These can often bypass execution ports completely.
+////////////////////////////////////////////////////////////////////////////////
+
+def : WriteRes<WriteZero, []>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Branches don't produce values, so they have no latency, but they still
+// consume resources. Indirect branches can fold loads.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : JWriteResIntPair<WriteJump, JALU01, 1>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Floating point. This covers both scalar and vector operations.
+// FIXME: should we bother splitting JFPU pipe + unit stages for fast instructions?
+// FIXME: Double precision latencies
+// FIXME: SS vs PS latencies
+// FIXME: ymm latencies
+////////////////////////////////////////////////////////////////////////////////
+
+defm : JWriteResFpuPair<WriteFAdd, JFPU0, 3>;
+defm : JWriteResFpuPair<WriteFMul, JFPU1, 2>;
+defm : JWriteResFpuPair<WriteFRcp, JFPU1, 2>;
+defm : JWriteResFpuPair<WriteFRsqrt, JFPU1, 2>;
+defm : JWriteResFpuPair<WriteFShuffle, JFPU01, 1>;
+defm : JWriteResFpuPair<WriteFBlend, JFPU01, 1>;
+defm : JWriteResFpuPair<WriteFShuffle256, JFPU01, 1>;
+
+def : WriteRes<WriteFSqrt, [JFPU1, JLAGU, JFPM]> {
+ let Latency = 21;
+ let ResourceCycles = [1, 1, 21];
+}
+def : WriteRes<WriteFSqrtLd, [JFPU1, JLAGU, JFPM]> {
+ let Latency = 26;
+ let ResourceCycles = [1, 1, 21];
+}
+
+def : WriteRes<WriteFDiv, [JFPU1, JLAGU, JFPM]> {
+ let Latency = 19;
+ let ResourceCycles = [1, 1, 19];
+}
+def : WriteRes<WriteFDivLd, [JFPU1, JLAGU, JFPM]> {
+ let Latency = 24;
+ let ResourceCycles = [1, 1, 19];
+}
+
+// FIXME: integer pipes
+defm : JWriteResFpuPair<WriteCvtF2I, JFPU1, 3>; // Float -> Integer.
+defm : JWriteResFpuPair<WriteCvtI2F, JFPU1, 3>; // Integer -> Float.
+defm : JWriteResFpuPair<WriteCvtF2F, JFPU1, 3>; // Float -> Float size conversion.
+
+def : WriteRes<WriteFVarBlend, [JFPU01]> {
+ let Latency = 2;
+ let ResourceCycles = [2];
+}
+def : WriteRes<WriteFVarBlendLd, [JLAGU, JFPU01]> {
+ let Latency = 7;
+ let ResourceCycles = [1, 2];
+}
+
+// Vector integer operations.
+defm : JWriteResFpuPair<WriteVecALU, JFPU01, 1>;
+defm : JWriteResFpuPair<WriteVecShift, JFPU01, 1>;
+defm : JWriteResFpuPair<WriteVecIMul, JFPU0, 2>;
+defm : JWriteResFpuPair<WriteShuffle, JFPU01, 1>;
+defm : JWriteResFpuPair<WriteBlend, JFPU01, 1>;
+defm : JWriteResFpuPair<WriteVecLogic, JFPU01, 1>;
+defm : JWriteResFpuPair<WriteShuffle256, JFPU01, 1>;
+
+def : WriteRes<WriteVarBlend, [JFPU01]> {
+ let Latency = 2;
+ let ResourceCycles = [2];
+}
+def : WriteRes<WriteVarBlendLd, [JLAGU, JFPU01]> {
+ let Latency = 7;
+ let ResourceCycles = [1, 2];
+}
+
+// FIXME: why do we need to define AVX2 resources on a CPU that doesn't have AVX2?
+def : WriteRes<WriteVarVecShift, [JFPU01]> {
+ let Latency = 1;
+ let ResourceCycles = [1];
+}
+def : WriteRes<WriteVarVecShiftLd, [JLAGU, JFPU01]> {
+ let Latency = 6;
+ let ResourceCycles = [1, 1];
+}
+
+def : WriteRes<WriteMPSAD, [JFPU0]> {
+ let Latency = 3;
+ let ResourceCycles = [2];
+}
+def : WriteRes<WriteMPSADLd, [JLAGU, JFPU0]> {
+ let Latency = 8;
+ let ResourceCycles = [1, 2];
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// String instructions.
+// Packed Compare Implicit Length Strings, Return Mask
+// FIXME: approximate latencies + pipe dependencies
+////////////////////////////////////////////////////////////////////////////////
+
+def : WriteRes<WritePCmpIStrM, [JFPU01]> {
+ let Latency = 7;
+ let ResourceCycles = [2];
+}
+def : WriteRes<WritePCmpIStrMLd, [JLAGU, JFPU01]> {
+ let Latency = 12;
+ let ResourceCycles = [1, 2];
+}
+
+// Packed Compare Explicit Length Strings, Return Mask
+def : WriteRes<WritePCmpEStrM, [JFPU01]> {
+ let Latency = 13;
+ let ResourceCycles = [5];
+}
+def : WriteRes<WritePCmpEStrMLd, [JLAGU, JFPU01]> {
+ let Latency = 18;
+ let ResourceCycles = [1, 5];
+}
+
+// Packed Compare Implicit Length Strings, Return Index
+def : WriteRes<WritePCmpIStrI, [JFPU01]> {
+ let Latency = 6;
+ let ResourceCycles = [2];
+}
+def : WriteRes<WritePCmpIStrILd, [JLAGU, JFPU01]> {
+ let Latency = 11;
+ let ResourceCycles = [1, 2];
+}
+
+// Packed Compare Explicit Length Strings, Return Index
+def : WriteRes<WritePCmpEStrI, [JFPU01]> {
+ let Latency = 13;
+ let ResourceCycles = [5];
+}
+def : WriteRes<WritePCmpEStrILd, [JLAGU, JFPU01]> {
+ let Latency = 18;
+ let ResourceCycles = [1, 5];
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// AES Instructions.
+////////////////////////////////////////////////////////////////////////////////
+
+def : WriteRes<WriteAESDecEnc, [JFPU01, JVIMUL]> {
+ let Latency = 3;
+ let ResourceCycles = [1, 1];
+}
+def : WriteRes<WriteAESDecEncLd, [JFPU01, JLAGU, JVIMUL]> {
+ let Latency = 8;
+ let ResourceCycles = [1, 1, 1];
+}
+
+def : WriteRes<WriteAESIMC, [JVIMUL]> {
+ let Latency = 2;
+ let ResourceCycles = [1];
+}
+def : WriteRes<WriteAESIMCLd, [JLAGU, JVIMUL]> {
+ let Latency = 7;
+ let ResourceCycles = [1, 1];
+}
+
+def : WriteRes<WriteAESKeyGen, [JVIMUL]> {
+ let Latency = 2;
+ let ResourceCycles = [1];
+}
+def : WriteRes<WriteAESKeyGenLd, [JLAGU, JVIMUL]> {
+ let Latency = 7;
+ let ResourceCycles = [1, 1];
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Carry-less multiplication instructions.
+////////////////////////////////////////////////////////////////////////////////
+
+def : WriteRes<WriteCLMul, [JVIMUL]> {
+ let Latency = 2;
+ let ResourceCycles = [1];
+}
+def : WriteRes<WriteCLMulLd, [JLAGU, JVIMUL]> {
+ let Latency = 7;
+ let ResourceCycles = [1, 1];
+}
+
+// FIXME: pipe for system/microcode?
+def : WriteRes<WriteSystem, [JAny]> { let Latency = 100; }
+def : WriteRes<WriteMicrocoded, [JAny]> { let Latency = 100; }
+def : WriteRes<WriteFence, [JSAGU]>;
+def : WriteRes<WriteNop, []>;
+} // SchedModel
+
diff --git a/contrib/llvm/lib/Target/X86/X86ScheduleSLM.td b/contrib/llvm/lib/Target/X86/X86ScheduleSLM.td
new file mode 100644
index 0000000..f95d4fa
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86ScheduleSLM.td
@@ -0,0 +1,233 @@
+//=- X86ScheduleSLM.td - X86 Silvermont Scheduling -----------*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for Intel Silvermont to support
+// instruction scheduling and other instruction cost heuristics.
+//
+//===----------------------------------------------------------------------===//
+
+def SLMModel : SchedMachineModel {
+ // All x86 instructions are modeled as a single micro-op, and SLM can decode 2
+ // instructions per cycle.
+ let IssueWidth = 2;
+ let MicroOpBufferSize = 32; // Based on the reorder buffer.
+ let LoadLatency = 3;
+ let MispredictPenalty = 10;
+ let PostRAScheduler = 1;
+
+ // For small loops, expand by a small factor to hide the backedge cost.
+ let LoopMicroOpBufferSize = 10;
+
+ // FIXME: SSE4 is unimplemented. This flag is set to allow
+ // the scheduler to assign a default model to unrecognized opcodes.
+ let CompleteModel = 0;
+}
+
+let SchedModel = SLMModel in {
+
+// Silvermont has 5 reservation stations for micro-ops.
+
+def IEC_RSV0 : ProcResource<1>;
+def IEC_RSV1 : ProcResource<1>;
+def FPC_RSV0 : ProcResource<1> { let BufferSize = 1; }
+def FPC_RSV1 : ProcResource<1> { let BufferSize = 1; }
+def MEC_RSV : ProcResource<1>;
+
+// Many micro-ops are capable of issuing on multiple ports.
+def IEC_RSV01 : ProcResGroup<[IEC_RSV0, IEC_RSV1]>;
+def FPC_RSV01 : ProcResGroup<[FPC_RSV0, FPC_RSV1]>;
+
+def SMDivider : ProcResource<1>;
+def SMFPMultiplier : ProcResource<1>;
+def SMFPDivider : ProcResource<1>;
+
+// Loads are 3 cycles, so ReadAfterLd registers needn't be available until 3
+// cycles after the memory operand.
+def : ReadAdvance<ReadAfterLd, 3>;
+
+// Many SchedWrites are defined in pairs with and without a folded load.
+// Instructions with folded loads are usually micro-fused, so they only appear
+// as two micro-ops when queued in the reservation station.
+// This multiclass defines the resource usage for variants with and without
+// folded loads.
+multiclass SMWriteResPair<X86FoldableSchedWrite SchedRW,
+ ProcResourceKind ExePort,
+ int Lat> {
+  // The register variant uses a single cycle on ExePort.
+ def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; }
+
+ // Memory variant also uses a cycle on MEC_RSV and adds 3 cycles to the
+ // latency.
+ def : WriteRes<SchedRW.Folded, [MEC_RSV, ExePort]> {
+ let Latency = !add(Lat, 3);
+ }
+}
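+
+// For example (a sketch of the expansion): "defm : SMWriteResPair<WriteALU,
+// IEC_RSV01, 1>;" below yields a 1-cycle WriteALU on IEC_RSV01 plus a
+// 4-cycle WriteALULd that also takes a cycle on MEC_RSV for the load.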
+
+// A folded store needs a cycle on MEC_RSV for the store data, but it does not
+// need an extra port cycle to recompute the address.
+def : WriteRes<WriteRMW, [MEC_RSV]>;
+
+def : WriteRes<WriteStore, [IEC_RSV01, MEC_RSV]>;
+def : WriteRes<WriteLoad, [MEC_RSV]> { let Latency = 3; }
+def : WriteRes<WriteMove, [IEC_RSV01]>;
+def : WriteRes<WriteZero, []>;
+
+defm : SMWriteResPair<WriteALU, IEC_RSV01, 1>;
+defm : SMWriteResPair<WriteIMul, IEC_RSV1, 3>;
+defm : SMWriteResPair<WriteShift, IEC_RSV0, 1>;
+defm : SMWriteResPair<WriteJump, IEC_RSV1, 1>;
+
+// This is for simple LEAs with one or two input operands.
+// The complex ones can only execute on port 1, and they require two cycles on
+// the port to read all inputs. We don't model that.
+def : WriteRes<WriteLEA, [IEC_RSV1]>;
+
+// This is quite rough; the latency depends on the dividend.
+def : WriteRes<WriteIDiv, [IEC_RSV01, SMDivider]> {
+ let Latency = 25;
+ let ResourceCycles = [1, 25];
+}
+def : WriteRes<WriteIDivLd, [MEC_RSV, IEC_RSV01, SMDivider]> {
+ let Latency = 29;
+ let ResourceCycles = [1, 1, 25];
+}
+
+// Scalar and vector floating point.
+defm : SMWriteResPair<WriteFAdd, FPC_RSV1, 3>;
+defm : SMWriteResPair<WriteFRcp, FPC_RSV0, 5>;
+defm : SMWriteResPair<WriteFRsqrt, FPC_RSV0, 5>;
+defm : SMWriteResPair<WriteFSqrt, FPC_RSV0, 15>;
+defm : SMWriteResPair<WriteCvtF2I, FPC_RSV01, 4>;
+defm : SMWriteResPair<WriteCvtI2F, FPC_RSV01, 4>;
+defm : SMWriteResPair<WriteCvtF2F, FPC_RSV01, 4>;
+defm : SMWriteResPair<WriteFShuffle, FPC_RSV0, 1>;
+defm : SMWriteResPair<WriteFBlend, FPC_RSV0, 1>;
+
+// This is quite rough; the latency depends on the precision.
+def : WriteRes<WriteFMul, [FPC_RSV0, SMFPMultiplier]> {
+ let Latency = 5;
+ let ResourceCycles = [1, 2];
+}
+def : WriteRes<WriteFMulLd, [MEC_RSV, FPC_RSV0, SMFPMultiplier]> {
+ let Latency = 8;
+ let ResourceCycles = [1, 1, 2];
+}
+
+def : WriteRes<WriteFDiv, [FPC_RSV0, SMFPDivider]> {
+ let Latency = 34;
+ let ResourceCycles = [1, 34];
+}
+def : WriteRes<WriteFDivLd, [MEC_RSV, FPC_RSV0, SMFPDivider]> {
+ let Latency = 37;
+ let ResourceCycles = [1, 1, 34];
+}
+
+// Vector integer operations.
+defm : SMWriteResPair<WriteVecShift, FPC_RSV0, 1>;
+defm : SMWriteResPair<WriteVecLogic, FPC_RSV01, 1>;
+defm : SMWriteResPair<WriteVecALU, FPC_RSV01, 1>;
+defm : SMWriteResPair<WriteVecIMul, FPC_RSV0, 4>;
+defm : SMWriteResPair<WriteShuffle, FPC_RSV0, 1>;
+defm : SMWriteResPair<WriteBlend, FPC_RSV0, 1>;
+defm : SMWriteResPair<WriteMPSAD, FPC_RSV0, 7>;
+
+// String instructions.
+// Packed Compare Implicit Length Strings, Return Mask
+def : WriteRes<WritePCmpIStrM, [FPC_RSV0]> {
+ let Latency = 13;
+ let ResourceCycles = [13];
+}
+def : WriteRes<WritePCmpIStrMLd, [FPC_RSV0, MEC_RSV]> {
+ let Latency = 13;
+ let ResourceCycles = [13, 1];
+}
+
+// Packed Compare Explicit Length Strings, Return Mask
+def : WriteRes<WritePCmpEStrM, [FPC_RSV0]> {
+ let Latency = 17;
+ let ResourceCycles = [17];
+}
+def : WriteRes<WritePCmpEStrMLd, [FPC_RSV0, MEC_RSV]> {
+ let Latency = 17;
+ let ResourceCycles = [17, 1];
+}
+
+// Packed Compare Implicit Length Strings, Return Index
+def : WriteRes<WritePCmpIStrI, [FPC_RSV0]> {
+ let Latency = 17;
+ let ResourceCycles = [17];
+}
+def : WriteRes<WritePCmpIStrILd, [FPC_RSV0, MEC_RSV]> {
+ let Latency = 17;
+ let ResourceCycles = [17, 1];
+}
+
+// Packed Compare Explicit Length Strings, Return Index
+def : WriteRes<WritePCmpEStrI, [FPC_RSV0]> {
+ let Latency = 21;
+ let ResourceCycles = [21];
+}
+def : WriteRes<WritePCmpEStrILd, [FPC_RSV0, MEC_RSV]> {
+ let Latency = 21;
+ let ResourceCycles = [21, 1];
+}
+
+// AES Instructions.
+def : WriteRes<WriteAESDecEnc, [FPC_RSV0]> {
+ let Latency = 8;
+ let ResourceCycles = [5];
+}
+def : WriteRes<WriteAESDecEncLd, [FPC_RSV0, MEC_RSV]> {
+ let Latency = 8;
+ let ResourceCycles = [5, 1];
+}
+
+def : WriteRes<WriteAESIMC, [FPC_RSV0]> {
+ let Latency = 8;
+ let ResourceCycles = [5];
+}
+def : WriteRes<WriteAESIMCLd, [FPC_RSV0, MEC_RSV]> {
+ let Latency = 8;
+ let ResourceCycles = [5, 1];
+}
+
+def : WriteRes<WriteAESKeyGen, [FPC_RSV0]> {
+ let Latency = 8;
+ let ResourceCycles = [5];
+}
+def : WriteRes<WriteAESKeyGenLd, [FPC_RSV0, MEC_RSV]> {
+ let Latency = 8;
+ let ResourceCycles = [5, 1];
+}
+
+// Carry-less multiplication instructions.
+def : WriteRes<WriteCLMul, [FPC_RSV0]> {
+ let Latency = 10;
+ let ResourceCycles = [10];
+}
+def : WriteRes<WriteCLMulLd, [FPC_RSV0, MEC_RSV]> {
+ let Latency = 10;
+ let ResourceCycles = [10, 1];
+}
+
+def : WriteRes<WriteSystem, [FPC_RSV0]> { let Latency = 100; }
+def : WriteRes<WriteMicrocoded, [FPC_RSV0]> { let Latency = 100; }
+def : WriteRes<WriteFence, [MEC_RSV]>;
+def : WriteRes<WriteNop, []>;
+
+// AVX is not supported on this architecture, but we should define the basic
+// scheduling resources anyway.
+def : WriteRes<WriteIMulH, [FPC_RSV0]>;
+defm : SMWriteResPair<WriteVarBlend, FPC_RSV0, 1>;
+defm : SMWriteResPair<WriteFVarBlend, FPC_RSV0, 1>;
+defm : SMWriteResPair<WriteFShuffle256, FPC_RSV0, 1>;
+defm : SMWriteResPair<WriteShuffle256, FPC_RSV0, 1>;
+defm : SMWriteResPair<WriteVarVecShift, FPC_RSV0, 1>;
+} // SchedModel
diff --git a/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp b/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
new file mode 100644
index 0000000..b1a0161
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
@@ -0,0 +1,284 @@
+//===-- X86SelectionDAGInfo.cpp - X86 SelectionDAG Info -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the X86SelectionDAGInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86InstrInfo.h"
+#include "X86ISelLowering.h"
+#include "X86RegisterInfo.h"
+#include "X86Subtarget.h"
+#include "X86SelectionDAGInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/Target/TargetLowering.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-selectiondag-info"
+
+bool X86SelectionDAGInfo::isBaseRegConflictPossible(
+ SelectionDAG &DAG, ArrayRef<unsigned> ClobberSet) const {
+ // We cannot use TRI->hasBasePointer() until *after* we select all basic
+ // blocks. Legalization may introduce new stack temporaries with large
+ // alignment requirements. Fall back to generic code if there are any
+ // dynamic stack adjustments (hopefully rare) and the base pointer would
+ // conflict if we had to use it.
+ MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+ if (!MFI->hasVarSizedObjects() && !MFI->hasOpaqueSPAdjustment())
+ return false;
+
+ const X86RegisterInfo *TRI = static_cast<const X86RegisterInfo *>(
+ DAG.getSubtarget().getRegisterInfo());
+ unsigned BaseReg = TRI->getBaseRegister();
+ for (unsigned R : ClobberSet)
+ if (BaseReg == R)
+ return true;
+ return false;
+}
+
+SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
+ SelectionDAG &DAG, SDLoc dl, SDValue Chain, SDValue Dst, SDValue Src,
+ SDValue Size, unsigned Align, bool isVolatile,
+ MachinePointerInfo DstPtrInfo) const {
+ ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+ const X86Subtarget &Subtarget =
+ DAG.getMachineFunction().getSubtarget<X86Subtarget>();
+
+#ifndef NDEBUG
+ // If the base register might conflict with our physical registers, bail out.
+ const unsigned ClobberSet[] = {X86::RCX, X86::RAX, X86::RDI,
+ X86::ECX, X86::EAX, X86::EDI};
+ assert(!isBaseRegConflictPossible(DAG, ClobberSet));
+#endif
+
+  // If writing to a segment-relative address space, use the default lowering.
+ if (DstPtrInfo.getAddrSpace() >= 256)
+ return SDValue();
+
+ // If not DWORD aligned or size is more than the threshold, call the library.
+ // The libc version is likely to be faster for these cases. It can use the
+  // address value and run-time information about the CPU.
+ if ((Align & 3) != 0 || !ConstantSize ||
+ ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold()) {
+ // Check to see if there is a specialized entry-point for memory zeroing.
+ ConstantSDNode *V = dyn_cast<ConstantSDNode>(Src);
+
+    if (const char *bzeroEntry =
+            (V && V->isNullValue()) ? Subtarget.getBZeroEntry() : nullptr) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ EVT IntPtr = TLI.getPointerTy(DAG.getDataLayout());
+ Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
+ TargetLowering::ArgListTy Args;
+ TargetLowering::ArgListEntry Entry;
+ Entry.Node = Dst;
+ Entry.Ty = IntPtrTy;
+ Args.push_back(Entry);
+ Entry.Node = Size;
+ Args.push_back(Entry);
+
+ TargetLowering::CallLoweringInfo CLI(DAG);
+ CLI.setDebugLoc(dl).setChain(Chain)
+ .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
+ DAG.getExternalSymbol(bzeroEntry, IntPtr), std::move(Args),
+ 0)
+ .setDiscardResult();
+
+ std::pair<SDValue,SDValue> CallResult = TLI.LowerCallTo(CLI);
+ return CallResult.second;
+ }
+
+ // Otherwise have the target-independent code call memset.
+ return SDValue();
+ }
+
+ uint64_t SizeVal = ConstantSize->getZExtValue();
+ SDValue InFlag;
+ EVT AVT;
+ SDValue Count;
+ ConstantSDNode *ValC = dyn_cast<ConstantSDNode>(Src);
+ unsigned BytesLeft = 0;
+ bool TwoRepStos = false;
+ if (ValC) {
+ unsigned ValReg;
+ uint64_t Val = ValC->getZExtValue() & 255;
+
+ // If the value is a constant, then we can potentially use larger sets.
+ switch (Align & 3) {
+ case 2: // WORD aligned
+ AVT = MVT::i16;
+ ValReg = X86::AX;
+ Val = (Val << 8) | Val;
+ break;
+ case 0: // DWORD aligned
+ AVT = MVT::i32;
+ ValReg = X86::EAX;
+ Val = (Val << 8) | Val;
+ Val = (Val << 16) | Val;
+ if (Subtarget.is64Bit() && ((Align & 0x7) == 0)) { // QWORD aligned
+ AVT = MVT::i64;
+ ValReg = X86::RAX;
+ Val = (Val << 32) | Val;
+ }
+ break;
+ default: // Byte aligned
+ AVT = MVT::i8;
+ ValReg = X86::AL;
+ Count = DAG.getIntPtrConstant(SizeVal, dl);
+ break;
+ }
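+
+    // For example, with Val = 0xAB and a DWORD-aligned destination on
+    // x86-64, the replication above produces 0xABAB, then 0xABABABAB, and
+    // finally the QWORD pattern 0xABABABABABABABAB broadcast through RAX.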
+
+ if (AVT.bitsGT(MVT::i8)) {
+ unsigned UBytes = AVT.getSizeInBits() / 8;
+ Count = DAG.getIntPtrConstant(SizeVal / UBytes, dl);
+ BytesLeft = SizeVal % UBytes;
+ }
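+
+    // E.g. a 25-byte DWORD-aligned memset uses AVT = MVT::i32 here, giving
+    // Count = 25 / 4 = 6 rep-stos iterations and BytesLeft = 1 byte for the
+    // trailing fixup emitted below.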
+
+ Chain = DAG.getCopyToReg(Chain, dl, ValReg, DAG.getConstant(Val, dl, AVT),
+ InFlag);
+ InFlag = Chain.getValue(1);
+ } else {
+ AVT = MVT::i8;
+ Count = DAG.getIntPtrConstant(SizeVal, dl);
+ Chain = DAG.getCopyToReg(Chain, dl, X86::AL, Src, InFlag);
+ InFlag = Chain.getValue(1);
+ }
+
+ Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RCX : X86::ECX,
+ Count, InFlag);
+ InFlag = Chain.getValue(1);
+ Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RDI : X86::EDI,
+ Dst, InFlag);
+ InFlag = Chain.getValue(1);
+
+ SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue Ops[] = { Chain, DAG.getValueType(AVT), InFlag };
+ Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops);
+
+ if (TwoRepStos) {
+ InFlag = Chain.getValue(1);
+ Count = Size;
+ EVT CVT = Count.getValueType();
+ SDValue Left = DAG.getNode(ISD::AND, dl, CVT, Count,
+ DAG.getConstant((AVT == MVT::i64) ? 7 : 3, dl,
+ CVT));
+ Chain = DAG.getCopyToReg(Chain, dl, (CVT == MVT::i64) ? X86::RCX : X86::ECX,
+ Left, InFlag);
+ InFlag = Chain.getValue(1);
+ Tys = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue Ops[] = { Chain, DAG.getValueType(MVT::i8), InFlag };
+ Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops);
+ } else if (BytesLeft) {
+    // Handle the last 1-7 bytes.
+ unsigned Offset = SizeVal - BytesLeft;
+ EVT AddrVT = Dst.getValueType();
+ EVT SizeVT = Size.getValueType();
+
+ Chain = DAG.getMemset(Chain, dl,
+ DAG.getNode(ISD::ADD, dl, AddrVT, Dst,
+ DAG.getConstant(Offset, dl, AddrVT)),
+ Src,
+ DAG.getConstant(BytesLeft, dl, SizeVT),
+ Align, isVolatile, false,
+ DstPtrInfo.getWithOffset(Offset));
+ }
+
+  // TODO: Use a TokenFactor, as in memcpy, instead of a single chain.
+ return Chain;
+}
+
+SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy(
+ SelectionDAG &DAG, SDLoc dl, SDValue Chain, SDValue Dst, SDValue Src,
+ SDValue Size, unsigned Align, bool isVolatile, bool AlwaysInline,
+ MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
+ // This requires the copy size to be a constant, preferably
+ // within a subtarget-specific limit.
+ ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+ const X86Subtarget &Subtarget =
+ DAG.getMachineFunction().getSubtarget<X86Subtarget>();
+ if (!ConstantSize)
+ return SDValue();
+ uint64_t SizeVal = ConstantSize->getZExtValue();
+ if (!AlwaysInline && SizeVal > Subtarget.getMaxInlineSizeThreshold())
+ return SDValue();
+
+  // If not DWORD aligned, it is more efficient to call the library. However,
+  // if calling the library is not allowed (AlwaysInline), then soldier on as
+  // the code generated here is better than the long load-store sequence we
+  // would otherwise get.
+ if (!AlwaysInline && (Align & 3) != 0)
+ return SDValue();
+
+ // If to a segment-relative address space, use the default lowering.
+ if (DstPtrInfo.getAddrSpace() >= 256 ||
+ SrcPtrInfo.getAddrSpace() >= 256)
+ return SDValue();
+
+ // If the base register might conflict with our physical registers, bail out.
+ const unsigned ClobberSet[] = {X86::RCX, X86::RSI, X86::RDI,
+ X86::ECX, X86::ESI, X86::EDI};
+ if (isBaseRegConflictPossible(DAG, ClobberSet))
+ return SDValue();
+
+ MVT AVT;
+ if (Align & 1)
+ AVT = MVT::i8;
+ else if (Align & 2)
+ AVT = MVT::i16;
+ else if (Align & 4)
+ // DWORD aligned
+ AVT = MVT::i32;
+ else
+ // QWORD aligned
+ AVT = Subtarget.is64Bit() ? MVT::i64 : MVT::i32;
+
+ unsigned UBytes = AVT.getSizeInBits() / 8;
+ unsigned CountVal = SizeVal / UBytes;
+ SDValue Count = DAG.getIntPtrConstant(CountVal, dl);
+ unsigned BytesLeft = SizeVal % UBytes;
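+  // E.g. a DWORD-aligned (Align = 4), 25-byte copy picks AVT = MVT::i32, so
+  // UBytes = 4, CountVal = 25 / 4 = 6 (24 bytes via REP MOVSD) and
+  // BytesLeft = 1; the last byte is copied by the getMemcpy call below.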
+
+ SDValue InFlag;
+ Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RCX : X86::ECX,
+ Count, InFlag);
+ InFlag = Chain.getValue(1);
+ Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RDI : X86::EDI,
+ Dst, InFlag);
+ InFlag = Chain.getValue(1);
+ Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RSI : X86::ESI,
+ Src, InFlag);
+ InFlag = Chain.getValue(1);
+
+ SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue Ops[] = { Chain, DAG.getValueType(AVT), InFlag };
+ SDValue RepMovs = DAG.getNode(X86ISD::REP_MOVS, dl, Tys, Ops);
+
+ SmallVector<SDValue, 4> Results;
+ Results.push_back(RepMovs);
+ if (BytesLeft) {
+ // Handle the last 1 - 7 bytes.
+ unsigned Offset = SizeVal - BytesLeft;
+ EVT DstVT = Dst.getValueType();
+ EVT SrcVT = Src.getValueType();
+ EVT SizeVT = Size.getValueType();
+ Results.push_back(DAG.getMemcpy(Chain, dl,
+ DAG.getNode(ISD::ADD, dl, DstVT, Dst,
+ DAG.getConstant(Offset, dl,
+ DstVT)),
+ DAG.getNode(ISD::ADD, dl, SrcVT, Src,
+ DAG.getConstant(Offset, dl,
+ SrcVT)),
+ DAG.getConstant(BytesLeft, dl, SizeVT),
+ Align, isVolatile, AlwaysInline, false,
+ DstPtrInfo.getWithOffset(Offset),
+ SrcPtrInfo.getWithOffset(Offset)));
+ }
+
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Results);
+}
diff --git a/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.h b/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.h
new file mode 100644
index 0000000..961bd8c
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.h
@@ -0,0 +1,52 @@
+//===-- X86SelectionDAGInfo.h - X86 SelectionDAG Info -----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the X86 subclass for TargetSelectionDAGInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86SELECTIONDAGINFO_H
+#define LLVM_LIB_TARGET_X86_X86SELECTIONDAGINFO_H
+
+#include "llvm/Target/TargetSelectionDAGInfo.h"
+
+namespace llvm {
+
+class X86TargetLowering;
+class X86TargetMachine;
+class X86Subtarget;
+
+class X86SelectionDAGInfo : public TargetSelectionDAGInfo {
+ /// Returns true if it is possible for the base register to conflict with the
+ /// given set of clobbers for a memory intrinsic.
+ bool isBaseRegConflictPossible(SelectionDAG &DAG,
+ ArrayRef<unsigned> ClobberSet) const;
+
+public:
+ explicit X86SelectionDAGInfo() = default;
+
+ SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl,
+ SDValue Chain,
+ SDValue Dst, SDValue Src,
+ SDValue Size, unsigned Align,
+ bool isVolatile,
+ MachinePointerInfo DstPtrInfo) const override;
+
+ SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl,
+ SDValue Chain,
+ SDValue Dst, SDValue Src,
+ SDValue Size, unsigned Align,
+ bool isVolatile, bool AlwaysInline,
+ MachinePointerInfo DstPtrInfo,
+ MachinePointerInfo SrcPtrInfo) const override;
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp b/contrib/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp
new file mode 100644
index 0000000..ef16c5b
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp
@@ -0,0 +1,190 @@
+//===-- X86ShuffleDecodeConstantPool.cpp - X86 shuffle decode -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Define several functions to decode x86 specific shuffle semantics using
+// constants from the constant pool.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86ShuffleDecodeConstantPool.h"
+#include "Utils/X86ShuffleDecode.h"
+#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/IR/Constants.h"
+
+//===----------------------------------------------------------------------===//
+// Vector Mask Decoding
+//===----------------------------------------------------------------------===//
+
+namespace llvm {
+
+void DecodePSHUFBMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) {
+ Type *MaskTy = C->getType();
+ // It is not an error for the PSHUFB mask to not be a vector of i8 because the
+ // constant pool uniques constants by their bit representation.
+ // e.g. the following take up the same space in the constant pool:
+ // i128 -170141183420855150465331762880109871104
+ //
+ // <2 x i64> <i64 -9223372034707292160, i64 -9223372034707292160>
+ //
+ // <4 x i32> <i32 -2147483648, i32 -2147483648,
+ // i32 -2147483648, i32 -2147483648>
+
+#ifndef NDEBUG
+ unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits();
+  assert((MaskTySize == 128 || MaskTySize == 256 || MaskTySize == 512) &&
+         "Unexpected vector size.");
+#endif
+
+ // This is a straightforward byte vector.
+ if (MaskTy->isVectorTy() && MaskTy->getVectorElementType()->isIntegerTy(8)) {
+ int NumElements = MaskTy->getVectorNumElements();
+ ShuffleMask.reserve(NumElements);
+
+ for (int i = 0; i < NumElements; ++i) {
+      // For AVX vectors with 32 bytes, the base of the shuffle is the
+      // 16-byte lane of the vector we're inside.
+ int Base = i & ~0xf;
+ Constant *COp = C->getAggregateElement(i);
+ if (!COp) {
+ ShuffleMask.clear();
+ return;
+ } else if (isa<UndefValue>(COp)) {
+ ShuffleMask.push_back(SM_SentinelUndef);
+ continue;
+ }
+ uint64_t Element = cast<ConstantInt>(COp)->getZExtValue();
+ // If the high bit (7) of the byte is set, the element is zeroed.
+ if (Element & (1 << 7))
+ ShuffleMask.push_back(SM_SentinelZero);
+ else {
+ // Only the least significant 4 bits of the byte are used.
+ int Index = Base + (Element & 0xf);
+ ShuffleMask.push_back(Index);
+ }
+ }
+ }
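+
+  // For illustration: in a 32-byte (256-bit) mask, element i = 17 with byte
+  // value 0x03 has Base = 16 and decodes to index 16 + 3 = 19, staying within
+  // the second 128-bit lane; any byte with bit 7 set (e.g. 0x80) decodes to
+  // SM_SentinelZero.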
+ // TODO: Handle funny-looking vectors too.
+}
+
+void DecodeVPERMILPMask(const Constant *C, unsigned ElSize,
+ SmallVectorImpl<int> &ShuffleMask) {
+ Type *MaskTy = C->getType();
+  // It is not an error for the VPERMILP mask to not be a vector of i8 because the
+ // constant pool uniques constants by their bit representation.
+ // e.g. the following take up the same space in the constant pool:
+ // i128 -170141183420855150465331762880109871104
+ //
+ // <2 x i64> <i64 -9223372034707292160, i64 -9223372034707292160>
+ //
+ // <4 x i32> <i32 -2147483648, i32 -2147483648,
+ // i32 -2147483648, i32 -2147483648>
+
+ unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits();
+
+ if (MaskTySize != 128 && MaskTySize != 256) // FIXME: Add support for AVX-512.
+ return;
+
+ // Only support vector types.
+ if (!MaskTy->isVectorTy())
+ return;
+
+  // Make sure it's an integer type.
+ Type *VecEltTy = MaskTy->getVectorElementType();
+ if (!VecEltTy->isIntegerTy())
+ return;
+
+ // Support any element type from byte up to element size.
+  // This is necessary primarily because 64-bit elements get split into
+  // 32-bit ones in the constant pool on 32-bit targets.
+ unsigned EltTySize = VecEltTy->getIntegerBitWidth();
+ if (EltTySize < 8 || EltTySize > ElSize)
+ return;
+
+ unsigned NumElements = MaskTySize / ElSize;
+ assert((NumElements == 2 || NumElements == 4 || NumElements == 8) &&
+ "Unexpected number of vector elements.");
+ ShuffleMask.reserve(NumElements);
+ unsigned NumElementsPerLane = 128 / ElSize;
+ unsigned Factor = ElSize / EltTySize;
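+  // E.g. a 256-bit VPERMILPD mask (ElSize = 64) stored as <8 x i32> on a
+  // 32-bit target has EltTySize = 32 and Factor = 2, so logical mask element
+  // i is read from aggregate element i * 2 (its low 32 bits suffice, since
+  // only bit 1 is consumed below).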
+
+ for (unsigned i = 0; i < NumElements; ++i) {
+ Constant *COp = C->getAggregateElement(i * Factor);
+ if (!COp) {
+ ShuffleMask.clear();
+ return;
+ } else if (isa<UndefValue>(COp)) {
+ ShuffleMask.push_back(SM_SentinelUndef);
+ continue;
+ }
+ int Index = i & ~(NumElementsPerLane - 1);
+ uint64_t Element = cast<ConstantInt>(COp)->getZExtValue();
+ if (ElSize == 64)
+ Index += (Element >> 1) & 0x1;
+ else
+ Index += Element & 0x3;
+ ShuffleMask.push_back(Index);
+ }
+
+ // TODO: Handle funny-looking vectors too.
+}
+
+void DecodeVPERMVMask(const Constant *C, MVT VT,
+ SmallVectorImpl<int> &ShuffleMask) {
+ Type *MaskTy = C->getType();
+ if (MaskTy->isVectorTy()) {
+ unsigned NumElements = MaskTy->getVectorNumElements();
+ if (NumElements == VT.getVectorNumElements()) {
+ for (unsigned i = 0; i < NumElements; ++i) {
+ Constant *COp = C->getAggregateElement(i);
+ if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp))) {
+ ShuffleMask.clear();
+ return;
+ }
+ if (isa<UndefValue>(COp))
+ ShuffleMask.push_back(SM_SentinelUndef);
+ else {
+ uint64_t Element = cast<ConstantInt>(COp)->getZExtValue();
+          // Shift as 64-bit to avoid overflow for wide vectors.
+          Element &= (1ULL << NumElements) - 1;
+ ShuffleMask.push_back(Element);
+ }
+ }
+ }
+ return;
+ }
+ // Scalar value; just broadcast it
+ if (!isa<ConstantInt>(C))
+ return;
+ uint64_t Element = cast<ConstantInt>(C)->getZExtValue();
+ int NumElements = VT.getVectorNumElements();
+  // Shift as 64-bit to avoid overflow for wide vectors.
+  Element &= (1ULL << NumElements) - 1;
+ for (int i = 0; i < NumElements; ++i)
+ ShuffleMask.push_back(Element);
+}
+
+void DecodeVPERMV3Mask(const Constant *C, MVT VT,
+ SmallVectorImpl<int> &ShuffleMask) {
+ Type *MaskTy = C->getType();
+ unsigned NumElements = MaskTy->getVectorNumElements();
+ if (NumElements == VT.getVectorNumElements()) {
+ for (unsigned i = 0; i < NumElements; ++i) {
+ Constant *COp = C->getAggregateElement(i);
+ if (!COp) {
+ ShuffleMask.clear();
+ return;
+ }
+ if (isa<UndefValue>(COp))
+ ShuffleMask.push_back(SM_SentinelUndef);
+ else {
+ uint64_t Element = cast<ConstantInt>(COp)->getZExtValue();
+        // Shift as 64-bit to avoid overflow for wide vectors.
+        Element &= (1ULL << (NumElements * 2)) - 1;
+ ShuffleMask.push_back(Element);
+ }
+ }
+ }
+}
+} // llvm namespace
diff --git a/contrib/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.h b/contrib/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.h
new file mode 100644
index 0000000..bcf4632
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.h
@@ -0,0 +1,45 @@
+//===-- X86ShuffleDecodeConstantPool.h - X86 shuffle decode -----*-C++-*---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Define several functions to decode x86 specific shuffle semantics using
+// constants from the constant pool.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86SHUFFLEDECODECONSTANTPOOL_H
+#define LLVM_LIB_TARGET_X86_X86SHUFFLEDECODECONSTANTPOOL_H
+
+#include "llvm/ADT/SmallVector.h"
+
+//===----------------------------------------------------------------------===//
+// Vector Mask Decoding
+//===----------------------------------------------------------------------===//
+
+namespace llvm {
+class Constant;
+class MVT;
+
+/// \brief Decode a PSHUFB mask from an IR-level vector constant.
+void DecodePSHUFBMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask);
+
+/// \brief Decode a VPERMILP variable mask from an IR-level vector constant.
+void DecodeVPERMILPMask(const Constant *C, unsigned ElSize,
+ SmallVectorImpl<int> &ShuffleMask);
+
+/// \brief Decode a VPERM W/D/Q/PS/PD mask from an IR-level vector constant.
+void DecodeVPERMVMask(const Constant *C, MVT VT,
+ SmallVectorImpl<int> &ShuffleMask);
+
+/// \brief Decode a VPERMT2 W/D/Q/PS/PD mask from an IR-level vector constant.
+void DecodeVPERMV3Mask(const Constant *C, MVT VT,
+ SmallVectorImpl<int> &ShuffleMask);
+
+} // llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86Subtarget.cpp b/contrib/llvm/lib/Target/X86/X86Subtarget.cpp
new file mode 100644
index 0000000..8ef08c9
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86Subtarget.cpp
@@ -0,0 +1,343 @@
+//===-- X86Subtarget.cpp - X86 Subtarget Information ----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the X86 specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86Subtarget.h"
+#include "X86InstrInfo.h"
+#include "X86TargetMachine.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Host.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+
+#if defined(_MSC_VER)
+#include <intrin.h>
+#endif
+
+using namespace llvm;
+
+#define DEBUG_TYPE "subtarget"
+
+#define GET_SUBTARGETINFO_TARGET_DESC
+#define GET_SUBTARGETINFO_CTOR
+#include "X86GenSubtargetInfo.inc"
+
+// Temporary option to control early if-conversion for x86 while adding machine
+// models.
+static cl::opt<bool>
+X86EarlyIfConv("x86-early-ifcvt", cl::Hidden,
+ cl::desc("Enable early if-conversion on X86"));
+
+
+/// Classify a blockaddress reference for the current subtarget according to how
+/// we should reference it in a non-pcrel context.
+unsigned char X86Subtarget::ClassifyBlockAddressReference() const {
+ if (isPICStyleGOT()) // 32-bit ELF targets.
+ return X86II::MO_GOTOFF;
+
+ if (isPICStyleStubPIC()) // Darwin/32 in PIC mode.
+ return X86II::MO_PIC_BASE_OFFSET;
+
+ // Direct static reference to label.
+ return X86II::MO_NO_FLAG;
+}
+
+/// Classify a global variable reference for the current subtarget according to
+/// how we should reference it in a non-pcrel context.
+unsigned char X86Subtarget::
+ClassifyGlobalReference(const GlobalValue *GV, const TargetMachine &TM) const {
+  // DLLImport only exists on Windows; it is implemented as a load from a
+ // DLLIMPORT stub.
+ if (GV->hasDLLImportStorageClass())
+ return X86II::MO_DLLIMPORT;
+
+ bool isDef = GV->isStrongDefinitionForLinker();
+
+ // X86-64 in PIC mode.
+ if (isPICStyleRIPRel()) {
+ // Large model never uses stubs.
+ if (TM.getCodeModel() == CodeModel::Large)
+ return X86II::MO_NO_FLAG;
+
+ if (isTargetDarwin()) {
+ // If symbol visibility is hidden, the extra load is not needed if
+ // target is x86-64 or the symbol is definitely defined in the current
+ // translation unit.
+ if (GV->hasDefaultVisibility() && !isDef)
+ return X86II::MO_GOTPCREL;
+ } else if (!isTargetWin64()) {
+ assert(isTargetELF() && "Unknown rip-relative target");
+
+      // An extra load is needed for all externally visible symbols.
+ if (!GV->hasLocalLinkage() && GV->hasDefaultVisibility())
+ return X86II::MO_GOTPCREL;
+ }
+
+ return X86II::MO_NO_FLAG;
+ }
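+
+  // For illustration: on x86-64 ELF in PIC mode (and outside the large code
+  // model), the block above classifies an externally visible global with
+  // default visibility as MO_GOTPCREL (indirected through the GOT), while
+  // local or hidden symbols fall through to MO_NO_FLAG and are referenced
+  // directly rip-relative.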
+
+ if (isPICStyleGOT()) { // 32-bit ELF targets.
+    // An extra load is needed for all externally visible symbols.
+ if (GV->hasLocalLinkage() || GV->hasHiddenVisibility())
+ return X86II::MO_GOTOFF;
+ return X86II::MO_GOT;
+ }
+
+ if (isPICStyleStubPIC()) { // Darwin/32 in PIC mode.
+ // Determine whether we have a stub reference and/or whether the reference
+ // is relative to the PIC base or not.
+
+ // If this is a strong reference to a definition, it is definitely not
+ // through a stub.
+ if (isDef)
+ return X86II::MO_PIC_BASE_OFFSET;
+
+ // Unless we have a symbol with hidden visibility, we have to go through a
+ // normal $non_lazy_ptr stub because this symbol might be resolved late.
+ if (!GV->hasHiddenVisibility()) // Non-hidden $non_lazy_ptr reference.
+ return X86II::MO_DARWIN_NONLAZY_PIC_BASE;
+
+ // If symbol visibility is hidden, we have a stub for common symbol
+ // references and external declarations.
+ if (GV->isDeclarationForLinker() || GV->hasCommonLinkage()) {
+ // Hidden $non_lazy_ptr reference.
+ return X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE;
+ }
+
+ // Otherwise, no stub.
+ return X86II::MO_PIC_BASE_OFFSET;
+ }
+
+ if (isPICStyleStubNoDynamic()) { // Darwin/32 in -mdynamic-no-pic mode.
+ // Determine whether we have a stub reference.
+
+ // If this is a strong reference to a definition, it is definitely not
+ // through a stub.
+ if (isDef)
+ return X86II::MO_NO_FLAG;
+
+ // Unless we have a symbol with hidden visibility, we have to go through a
+ // normal $non_lazy_ptr stub because this symbol might be resolved late.
+ if (!GV->hasHiddenVisibility()) // Non-hidden $non_lazy_ptr reference.
+ return X86II::MO_DARWIN_NONLAZY;
+
+ // Otherwise, no stub.
+ return X86II::MO_NO_FLAG;
+ }
+
+ // Direct static reference to global.
+ return X86II::MO_NO_FLAG;
+}
+
+
+/// This function returns the name of a function which has an interface like
+/// the non-standard bzero function, if such a function exists on the
+/// current subtarget and it is considered preferable over memset with zero
+/// passed as the second argument. Otherwise it returns null.
+const char *X86Subtarget::getBZeroEntry() const {
+ // Darwin 10 has a __bzero entry point for this purpose.
+ if (getTargetTriple().isMacOSX() &&
+ !getTargetTriple().isMacOSXVersionLT(10, 6))
+ return "__bzero";
+
+ return nullptr;
+}
+
+bool X86Subtarget::hasSinCos() const {
+ return getTargetTriple().isMacOSX() &&
+ !getTargetTriple().isMacOSXVersionLT(10, 9) &&
+ is64Bit();
+}
+
+/// Return true if the subtarget allows calls to immediate address.
+bool X86Subtarget::IsLegalToCallImmediateAddr(const TargetMachine &TM) const {
+ // FIXME: I386 PE/COFF supports PC relative calls using IMAGE_REL_I386_REL32
+ // but WinCOFFObjectWriter::RecordRelocation cannot emit them. Once it does,
+ // the following check for Win32 should be removed.
+ if (In64BitMode || isTargetWin32())
+ return false;
+ return isTargetELF() || TM.getRelocationModel() == Reloc::Static;
+}
+
+void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
+ std::string CPUName = CPU;
+ if (CPUName.empty())
+ CPUName = "generic";
+
+ // Make sure 64-bit features are available in 64-bit mode. (But make sure
+ // SSE2 can be turned off explicitly.)
+ std::string FullFS = FS;
+ if (In64BitMode) {
+ if (!FullFS.empty())
+ FullFS = "+64bit,+sse2," + FullFS;
+ else
+ FullFS = "+64bit,+sse2";
+ }
+
+ // LAHF/SAHF are always supported in non-64-bit mode.
+ if (!In64BitMode) {
+ if (!FullFS.empty())
+ FullFS = "+sahf," + FullFS;
+ else
+ FullFS = "+sahf";
+ }
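+  // For illustration: CPU = "corei7" with FS = "+avx" in 64-bit mode yields
+  // FullFS = "+64bit,+sse2,+avx"; the same FS in 32-bit mode yields
+  // FullFS = "+sahf,+avx".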
+
+
+ // Parse features string and set the CPU.
+ ParseSubtargetFeatures(CPUName, FullFS);
+
+  // All CPUs that implement SSE4.2 or SSE4A support unaligned accesses of
+  // 16 bytes and under, and such accesses are reasonably fast. These features
+  // were introduced with Intel's Nehalem/Silvermont and AMD's Family 10h
+  // micro-architectures, respectively.
+ if (hasSSE42() || hasSSE4A())
+ IsUAMem16Slow = false;
+
+ InstrItins = getInstrItineraryForCPU(CPUName);
+
+  // It's important to keep the MCSubtargetInfo feature bits in sync with the
+  // target data structure, which is shared with the MC code emitter, etc.
+ if (In64BitMode)
+ ToggleFeature(X86::Mode64Bit);
+ else if (In32BitMode)
+ ToggleFeature(X86::Mode32Bit);
+ else if (In16BitMode)
+ ToggleFeature(X86::Mode16Bit);
+ else
+ llvm_unreachable("Not 16-bit, 32-bit or 64-bit mode!");
+
+ DEBUG(dbgs() << "Subtarget features: SSELevel " << X86SSELevel
+ << ", 3DNowLevel " << X863DNowLevel
+ << ", 64bit " << HasX86_64 << "\n");
+ assert((!In64BitMode || HasX86_64) &&
+ "64-bit code requested on a subtarget that doesn't support it!");
+
+  // Stack alignment is 16 bytes on Darwin, Linux and Solaris (both 32- and
+  // 64-bit) and for all 64-bit targets.
+ if (StackAlignOverride)
+ stackAlignment = StackAlignOverride;
+ else if (isTargetDarwin() || isTargetLinux() || isTargetSolaris() ||
+ In64BitMode)
+ stackAlignment = 16;
+}
+
+void X86Subtarget::initializeEnvironment() {
+ X86SSELevel = NoSSE;
+ X863DNowLevel = NoThreeDNow;
+ HasCMov = false;
+ HasX86_64 = false;
+ HasPOPCNT = false;
+ HasSSE4A = false;
+ HasAES = false;
+ HasFXSR = false;
+ HasXSAVE = false;
+ HasXSAVEOPT = false;
+ HasXSAVEC = false;
+ HasXSAVES = false;
+ HasPCLMUL = false;
+ HasFMA = false;
+ HasFMA4 = false;
+ HasXOP = false;
+ HasTBM = false;
+ HasMOVBE = false;
+ HasRDRAND = false;
+ HasF16C = false;
+ HasFSGSBase = false;
+ HasLZCNT = false;
+ HasBMI = false;
+ HasBMI2 = false;
+ HasRTM = false;
+ HasHLE = false;
+ HasERI = false;
+ HasCDI = false;
+ HasPFI = false;
+ HasDQI = false;
+ HasBWI = false;
+ HasVLX = false;
+ HasADX = false;
+ HasPKU = false;
+ HasSHA = false;
+ HasPRFCHW = false;
+ HasRDSEED = false;
+ HasLAHFSAHF = false;
+ HasMPX = false;
+ IsBTMemSlow = false;
+ IsSHLDSlow = false;
+ IsUAMem16Slow = false;
+ IsUAMem32Slow = false;
+ HasSSEUnalignedMem = false;
+ HasCmpxchg16b = false;
+ UseLeaForSP = false;
+ HasSlowDivide32 = false;
+ HasSlowDivide64 = false;
+ PadShortFunctions = false;
+ CallRegIndirect = false;
+ LEAUsesAG = false;
+ SlowLEA = false;
+ SlowIncDec = false;
+ stackAlignment = 4;
+ // FIXME: this is a known good value for Yonah. How about others?
+ MaxInlineSizeThreshold = 128;
+ UseSoftFloat = false;
+}
+
+X86Subtarget &X86Subtarget::initializeSubtargetDependencies(StringRef CPU,
+ StringRef FS) {
+ initializeEnvironment();
+ initSubtargetFeatures(CPU, FS);
+ return *this;
+}
+
+X86Subtarget::X86Subtarget(const Triple &TT, const std::string &CPU,
+ const std::string &FS, const X86TargetMachine &TM,
+ unsigned StackAlignOverride)
+ : X86GenSubtargetInfo(TT, CPU, FS), X86ProcFamily(Others),
+ PICStyle(PICStyles::None), TargetTriple(TT),
+ StackAlignOverride(StackAlignOverride),
+ In64BitMode(TargetTriple.getArch() == Triple::x86_64),
+ In32BitMode(TargetTriple.getArch() == Triple::x86 &&
+ TargetTriple.getEnvironment() != Triple::CODE16),
+ In16BitMode(TargetTriple.getArch() == Triple::x86 &&
+ TargetTriple.getEnvironment() == Triple::CODE16),
+ TSInfo(), InstrInfo(initializeSubtargetDependencies(CPU, FS)),
+ TLInfo(TM, *this), FrameLowering(*this, getStackAlignment()) {
+ // Determine the PICStyle based on the target selected.
+ if (TM.getRelocationModel() == Reloc::Static) {
+ // Unless we're in PIC or DynamicNoPIC mode, set the PIC style to None.
+ setPICStyle(PICStyles::None);
+ } else if (is64Bit()) {
+ // PIC in 64 bit mode is always rip-rel.
+ setPICStyle(PICStyles::RIPRel);
+ } else if (isTargetCOFF()) {
+ setPICStyle(PICStyles::None);
+ } else if (isTargetDarwin()) {
+ if (TM.getRelocationModel() == Reloc::PIC_)
+ setPICStyle(PICStyles::StubPIC);
+ else {
+ assert(TM.getRelocationModel() == Reloc::DynamicNoPIC);
+ setPICStyle(PICStyles::StubDynamicNoPIC);
+ }
+ } else if (isTargetELF()) {
+ setPICStyle(PICStyles::GOT);
+ }
+}
+
+bool X86Subtarget::enableEarlyIfConversion() const {
+ return hasCMov() && X86EarlyIfConv;
+}
+
diff --git a/contrib/llvm/lib/Target/X86/X86Subtarget.h b/contrib/llvm/lib/Target/X86/X86Subtarget.h
new file mode 100644
index 0000000..13d1026
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86Subtarget.h
@@ -0,0 +1,546 @@
+//===-- X86Subtarget.h - Define Subtarget for the X86 ----------*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the X86 specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86SUBTARGET_H
+#define LLVM_LIB_TARGET_X86_X86SUBTARGET_H
+
+#include "X86FrameLowering.h"
+#include "X86ISelLowering.h"
+#include "X86InstrInfo.h"
+#include "X86SelectionDAGInfo.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <string>
+
+#define GET_SUBTARGETINFO_HEADER
+#include "X86GenSubtargetInfo.inc"
+
+namespace llvm {
+class GlobalValue;
+class StringRef;
+class TargetMachine;
+
+/// The X86 backend supports a number of different styles of PIC.
+///
+namespace PICStyles {
+enum Style {
+ StubPIC, // Used on i386-darwin in -fPIC mode.
+ StubDynamicNoPIC, // Used on i386-darwin in -mdynamic-no-pic mode.
+ GOT, // Used on many 32-bit unices in -fPIC mode.
+ RIPRel, // Used on X86-64 when not in -static mode.
+ None // Set when in -static mode (not PIC or DynamicNoPIC mode).
+};
+}
+
+class X86Subtarget final : public X86GenSubtargetInfo {
+
+protected:
+ enum X86SSEEnum {
+ NoSSE, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2, AVX512F
+ };
+
+ enum X863DNowEnum {
+ NoThreeDNow, MMX, ThreeDNow, ThreeDNowA
+ };
+
+ enum X86ProcFamilyEnum {
+ Others, IntelAtom, IntelSLM
+ };
+
+  /// X86 processor family: Intel Atom, Intel SLM, and others.
+ X86ProcFamilyEnum X86ProcFamily;
+
+ /// Which PIC style to use
+ PICStyles::Style PICStyle;
+
+ /// SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, or none supported.
+ X86SSEEnum X86SSELevel;
+
+ /// MMX, 3DNow, 3DNow Athlon, or none supported.
+ X863DNowEnum X863DNowLevel;
+
+  /// True if this processor has conditional move instructions
+  /// (generally Pentium Pro and later).
+ bool HasCMov;
+
+ /// True if the processor supports X86-64 instructions.
+ bool HasX86_64;
+
+ /// True if the processor supports POPCNT.
+ bool HasPOPCNT;
+
+ /// True if the processor supports SSE4A instructions.
+ bool HasSSE4A;
+
+ /// Target has AES instructions
+ bool HasAES;
+
+ /// Target has FXSAVE/FXRESTOR instructions
+ bool HasFXSR;
+
+ /// Target has XSAVE instructions
+ bool HasXSAVE;
+ /// Target has XSAVEOPT instructions
+ bool HasXSAVEOPT;
+ /// Target has XSAVEC instructions
+ bool HasXSAVEC;
+ /// Target has XSAVES instructions
+ bool HasXSAVES;
+
+ /// Target has carry-less multiplication
+ bool HasPCLMUL;
+
+ /// Target has 3-operand fused multiply-add
+ bool HasFMA;
+
+ /// Target has 4-operand fused multiply-add
+ bool HasFMA4;
+
+ /// Target has XOP instructions
+ bool HasXOP;
+
+ /// Target has TBM instructions.
+ bool HasTBM;
+
+ /// True if the processor has the MOVBE instruction.
+ bool HasMOVBE;
+
+ /// True if the processor has the RDRAND instruction.
+ bool HasRDRAND;
+
+ /// Processor has 16-bit floating point conversion instructions.
+ bool HasF16C;
+
+  /// Processor has FS/GS base instructions.
+ bool HasFSGSBase;
+
+ /// Processor has LZCNT instruction.
+ bool HasLZCNT;
+
+ /// Processor has BMI1 instructions.
+ bool HasBMI;
+
+ /// Processor has BMI2 instructions.
+ bool HasBMI2;
+
+ /// Processor has RTM instructions.
+ bool HasRTM;
+
+ /// Processor has HLE.
+ bool HasHLE;
+
+ /// Processor has ADX instructions.
+ bool HasADX;
+
+ /// Processor has SHA instructions.
+ bool HasSHA;
+
+ /// Processor has PRFCHW instructions.
+ bool HasPRFCHW;
+
+ /// Processor has RDSEED instructions.
+ bool HasRDSEED;
+
+ /// Processor has LAHF/SAHF instructions.
+ bool HasLAHFSAHF;
+
+  /// True if BT (bit test) instructions on memory operands are slow.
+ bool IsBTMemSlow;
+
+ /// True if SHLD instructions are slow.
+ bool IsSHLDSlow;
+
+ /// True if unaligned memory accesses of 16-bytes are slow.
+ bool IsUAMem16Slow;
+
+ /// True if unaligned memory accesses of 32-bytes are slow.
+ bool IsUAMem32Slow;
+
+ /// True if SSE operations can have unaligned memory operands.
+ /// This may require setting a configuration bit in the processor.
+ bool HasSSEUnalignedMem;
+
+ /// True if this processor has the CMPXCHG16B instruction;
+ /// this is true for most x86-64 chips, but not the first AMD chips.
+ bool HasCmpxchg16b;
+
+ /// True if the LEA instruction should be used for adjusting
+ /// the stack pointer. This is an optimization for Intel Atom processors.
+ bool UseLeaForSP;
+
+ /// True if 8-bit divisions are significantly faster than
+ /// 32-bit divisions and should be used when possible.
+ bool HasSlowDivide32;
+
+  /// True if 16-bit divisions are significantly faster than
+  /// 64-bit divisions and should be used when possible.
+ bool HasSlowDivide64;
+
+  /// True if short functions should be padded to prevent
+ /// a stall when returning too early.
+ bool PadShortFunctions;
+
+  /// True if calls with a memory operand should be converted
+  /// to a register-based indirect call.
+ bool CallRegIndirect;
+
+ /// True if the LEA instruction inputs have to be ready at address generation
+ /// (AG) time.
+ bool LEAUsesAG;
+
+ /// True if the LEA instruction with certain arguments is slow
+ bool SlowLEA;
+
+ /// True if INC and DEC instructions are slow when writing to flags
+ bool SlowIncDec;
+
+ /// Processor has AVX-512 PreFetch Instructions
+ bool HasPFI;
+
+ /// Processor has AVX-512 Exponential and Reciprocal Instructions
+ bool HasERI;
+
+ /// Processor has AVX-512 Conflict Detection Instructions
+ bool HasCDI;
+
+ /// Processor has AVX-512 Doubleword and Quadword instructions
+ bool HasDQI;
+
+ /// Processor has AVX-512 Byte and Word instructions
+ bool HasBWI;
+
+  /// Processor has AVX-512 Vector Length eXtensions
+ bool HasVLX;
+
+  /// Processor has PKU extensions
+ bool HasPKU;
+
+  /// Processor supports MPX - Memory Protection Extensions
+ bool HasMPX;
+
+ /// Use software floating point for code generation.
+ bool UseSoftFloat;
+
+  /// The minimum alignment of the stack frame that is known to hold on entry
+  /// to the function and which must be maintained by every function.
+ unsigned stackAlignment;
+
+  /// Max. memset / memcpy size that is turned into rep/movs, rep/stos ops.
+ unsigned MaxInlineSizeThreshold;
+
+ /// What processor and OS we're targeting.
+ Triple TargetTriple;
+
+ /// Instruction itineraries for scheduling
+ InstrItineraryData InstrItins;
+
+private:
+
+ /// Override the stack alignment.
+ unsigned StackAlignOverride;
+
+ /// True if compiling for 64-bit, false for 16-bit or 32-bit.
+ bool In64BitMode;
+
+ /// True if compiling for 32-bit, false for 16-bit or 64-bit.
+ bool In32BitMode;
+
+ /// True if compiling for 16-bit, false for 32-bit or 64-bit.
+ bool In16BitMode;
+
+ X86SelectionDAGInfo TSInfo;
+ // Ordering here is important. X86InstrInfo initializes X86RegisterInfo which
+ // X86TargetLowering needs.
+ X86InstrInfo InstrInfo;
+ X86TargetLowering TLInfo;
+ X86FrameLowering FrameLowering;
+
+public:
+ /// This constructor initializes the data members to match that
+ /// of the specified triple.
+ ///
+ X86Subtarget(const Triple &TT, const std::string &CPU, const std::string &FS,
+ const X86TargetMachine &TM, unsigned StackAlignOverride);
+
+ const X86TargetLowering *getTargetLowering() const override {
+ return &TLInfo;
+ }
+ const X86InstrInfo *getInstrInfo() const override { return &InstrInfo; }
+ const X86FrameLowering *getFrameLowering() const override {
+ return &FrameLowering;
+ }
+ const X86SelectionDAGInfo *getSelectionDAGInfo() const override {
+ return &TSInfo;
+ }
+ const X86RegisterInfo *getRegisterInfo() const override {
+ return &getInstrInfo()->getRegisterInfo();
+ }
+
+  /// Returns the minimum alignment of the stack frame that is known to hold
+  /// on entry to the function and which must be maintained by every function
+  /// for this subtarget.
+ unsigned getStackAlignment() const { return stackAlignment; }
+
+ /// Returns the maximum memset / memcpy size
+ /// that still makes it profitable to inline the call.
+ unsigned getMaxInlineSizeThreshold() const { return MaxInlineSizeThreshold; }
+
+  /// Parses the features string, setting the specified subtarget options.
+  /// The definition of this function is auto-generated by tblgen.
+ void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+
+private:
+ /// Initialize the full set of dependencies so we can use an initializer
+ /// list for X86Subtarget.
+ X86Subtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS);
+ void initializeEnvironment();
+ void initSubtargetFeatures(StringRef CPU, StringRef FS);
+public:
+ /// Is this x86_64? (disregarding specific ABI / programming model)
+ bool is64Bit() const {
+ return In64BitMode;
+ }
+
+ bool is32Bit() const {
+ return In32BitMode;
+ }
+
+ bool is16Bit() const {
+ return In16BitMode;
+ }
+
+ /// Is this x86_64 with the ILP32 programming model (x32 ABI)?
+ bool isTarget64BitILP32() const {
+ return In64BitMode && (TargetTriple.getEnvironment() == Triple::GNUX32 ||
+ TargetTriple.isOSNaCl());
+ }
+
+ /// Is this x86_64 with the LP64 programming model (standard AMD64, no x32)?
+ bool isTarget64BitLP64() const {
+ return In64BitMode && (TargetTriple.getEnvironment() != Triple::GNUX32 &&
+ !TargetTriple.isOSNaCl());
+ }
+
+ PICStyles::Style getPICStyle() const { return PICStyle; }
+ void setPICStyle(PICStyles::Style Style) { PICStyle = Style; }
+
+ bool hasCMov() const { return HasCMov; }
+ bool hasSSE1() const { return X86SSELevel >= SSE1; }
+ bool hasSSE2() const { return X86SSELevel >= SSE2; }
+ bool hasSSE3() const { return X86SSELevel >= SSE3; }
+ bool hasSSSE3() const { return X86SSELevel >= SSSE3; }
+ bool hasSSE41() const { return X86SSELevel >= SSE41; }
+ bool hasSSE42() const { return X86SSELevel >= SSE42; }
+ bool hasAVX() const { return X86SSELevel >= AVX; }
+ bool hasAVX2() const { return X86SSELevel >= AVX2; }
+ bool hasAVX512() const { return X86SSELevel >= AVX512F; }
+ bool hasFp256() const { return hasAVX(); }
+ bool hasInt256() const { return hasAVX2(); }
+ bool hasSSE4A() const { return HasSSE4A; }
+ bool hasMMX() const { return X863DNowLevel >= MMX; }
+ bool has3DNow() const { return X863DNowLevel >= ThreeDNow; }
+ bool has3DNowA() const { return X863DNowLevel >= ThreeDNowA; }
+ bool hasPOPCNT() const { return HasPOPCNT; }
+ bool hasAES() const { return HasAES; }
+ bool hasFXSR() const { return HasFXSR; }
+ bool hasXSAVE() const { return HasXSAVE; }
+ bool hasXSAVEOPT() const { return HasXSAVEOPT; }
+ bool hasXSAVEC() const { return HasXSAVEC; }
+ bool hasXSAVES() const { return HasXSAVES; }
+ bool hasPCLMUL() const { return HasPCLMUL; }
+  // Prefer FMA4 to FMA - it's better for commutation/memory folding and
+ // has equal or better performance on all supported targets.
+ bool hasFMA() const { return HasFMA && !HasFMA4; }
+ bool hasFMA4() const { return HasFMA4; }
+ bool hasAnyFMA() const { return hasFMA() || hasFMA4() || hasAVX512(); }
+ bool hasXOP() const { return HasXOP; }
+ bool hasTBM() const { return HasTBM; }
+ bool hasMOVBE() const { return HasMOVBE; }
+ bool hasRDRAND() const { return HasRDRAND; }
+ bool hasF16C() const { return HasF16C; }
+ bool hasFSGSBase() const { return HasFSGSBase; }
+ bool hasLZCNT() const { return HasLZCNT; }
+ bool hasBMI() const { return HasBMI; }
+ bool hasBMI2() const { return HasBMI2; }
+ bool hasRTM() const { return HasRTM; }
+ bool hasHLE() const { return HasHLE; }
+ bool hasADX() const { return HasADX; }
+ bool hasSHA() const { return HasSHA; }
+ bool hasPRFCHW() const { return HasPRFCHW; }
+ bool hasRDSEED() const { return HasRDSEED; }
+ bool hasLAHFSAHF() const { return HasLAHFSAHF; }
+ bool isBTMemSlow() const { return IsBTMemSlow; }
+ bool isSHLDSlow() const { return IsSHLDSlow; }
+ bool isUnalignedMem16Slow() const { return IsUAMem16Slow; }
+ bool isUnalignedMem32Slow() const { return IsUAMem32Slow; }
+ bool hasSSEUnalignedMem() const { return HasSSEUnalignedMem; }
+ bool hasCmpxchg16b() const { return HasCmpxchg16b; }
+ bool useLeaForSP() const { return UseLeaForSP; }
+ bool hasSlowDivide32() const { return HasSlowDivide32; }
+ bool hasSlowDivide64() const { return HasSlowDivide64; }
+ bool padShortFunctions() const { return PadShortFunctions; }
+ bool callRegIndirect() const { return CallRegIndirect; }
+ bool LEAusesAG() const { return LEAUsesAG; }
+ bool slowLEA() const { return SlowLEA; }
+ bool slowIncDec() const { return SlowIncDec; }
+ bool hasCDI() const { return HasCDI; }
+ bool hasPFI() const { return HasPFI; }
+ bool hasERI() const { return HasERI; }
+ bool hasDQI() const { return HasDQI; }
+ bool hasBWI() const { return HasBWI; }
+ bool hasVLX() const { return HasVLX; }
+ bool hasPKU() const { return HasPKU; }
+ bool hasMPX() const { return HasMPX; }
+
+ bool isAtom() const { return X86ProcFamily == IntelAtom; }
+ bool isSLM() const { return X86ProcFamily == IntelSLM; }
+ bool useSoftFloat() const { return UseSoftFloat; }
+
+ const Triple &getTargetTriple() const { return TargetTriple; }
+
+ bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); }
+ bool isTargetFreeBSD() const { return TargetTriple.isOSFreeBSD(); }
+ bool isTargetDragonFly() const { return TargetTriple.isOSDragonFly(); }
+ bool isTargetSolaris() const { return TargetTriple.isOSSolaris(); }
+ bool isTargetPS4() const { return TargetTriple.isPS4(); }
+
+ bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); }
+ bool isTargetCOFF() const { return TargetTriple.isOSBinFormatCOFF(); }
+ bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); }
+
+ bool isTargetLinux() const { return TargetTriple.isOSLinux(); }
+ bool isTargetAndroid() const { return TargetTriple.isAndroid(); }
+ bool isTargetNaCl() const { return TargetTriple.isOSNaCl(); }
+ bool isTargetNaCl32() const { return isTargetNaCl() && !is64Bit(); }
+ bool isTargetNaCl64() const { return isTargetNaCl() && is64Bit(); }
+ bool isTargetMCU() const { return TargetTriple.isOSIAMCU(); }
+
+ bool isTargetWindowsMSVC() const {
+ return TargetTriple.isWindowsMSVCEnvironment();
+ }
+
+ bool isTargetKnownWindowsMSVC() const {
+ return TargetTriple.isKnownWindowsMSVCEnvironment();
+ }
+
+ bool isTargetWindowsCoreCLR() const {
+ return TargetTriple.isWindowsCoreCLREnvironment();
+ }
+
+ bool isTargetWindowsCygwin() const {
+ return TargetTriple.isWindowsCygwinEnvironment();
+ }
+
+ bool isTargetWindowsGNU() const {
+ return TargetTriple.isWindowsGNUEnvironment();
+ }
+
+ bool isTargetWindowsItanium() const {
+ return TargetTriple.isWindowsItaniumEnvironment();
+ }
+
+ bool isTargetCygMing() const { return TargetTriple.isOSCygMing(); }
+
+ bool isOSWindows() const { return TargetTriple.isOSWindows(); }
+
+ bool isTargetWin64() const {
+ return In64BitMode && TargetTriple.isOSWindows();
+ }
+
+ bool isTargetWin32() const {
+ return !In64BitMode && (isTargetCygMing() || isTargetKnownWindowsMSVC());
+ }
+
+ bool isPICStyleSet() const { return PICStyle != PICStyles::None; }
+ bool isPICStyleGOT() const { return PICStyle == PICStyles::GOT; }
+ bool isPICStyleRIPRel() const { return PICStyle == PICStyles::RIPRel; }
+
+ bool isPICStyleStubPIC() const {
+ return PICStyle == PICStyles::StubPIC;
+ }
+
+ bool isPICStyleStubNoDynamic() const {
+ return PICStyle == PICStyles::StubDynamicNoPIC;
+ }
+ bool isPICStyleStubAny() const {
+ return PICStyle == PICStyles::StubDynamicNoPIC ||
+ PICStyle == PICStyles::StubPIC;
+ }
+
+ bool isCallingConvWin64(CallingConv::ID CC) const {
+ switch (CC) {
+ // On Win64, all these conventions just use the default convention.
+ case CallingConv::C:
+ case CallingConv::Fast:
+ case CallingConv::X86_FastCall:
+ case CallingConv::X86_StdCall:
+ case CallingConv::X86_ThisCall:
+ case CallingConv::X86_VectorCall:
+ case CallingConv::Intel_OCL_BI:
+ return isTargetWin64();
+ // This convention allows using the Win64 convention on other targets.
+ case CallingConv::X86_64_Win64:
+ return true;
+ // This convention allows using the SysV convention on Windows targets.
+ case CallingConv::X86_64_SysV:
+ return false;
+ // Otherwise, who knows what this is.
+ default:
+ return false;
+ }
+ }
+
+  /// Classify a global variable reference for the current subtarget according
+  /// to how we should reference it in a non-pcrel context.
+  unsigned char ClassifyGlobalReference(const GlobalValue *GV,
+                                        const TargetMachine &TM) const;
+
+ /// Classify a blockaddress reference for the current subtarget according to
+ /// how we should reference it in a non-pcrel context.
+ unsigned char ClassifyBlockAddressReference() const;
+
+ /// Return true if the subtarget allows calls to immediate address.
+ bool IsLegalToCallImmediateAddr(const TargetMachine &TM) const;
+
+ /// This function returns the name of a function which has an interface
+ /// like the non-standard bzero function, if such a function exists on
+  /// the current subtarget and it is considered preferable over
+ /// memset with zero passed as the second argument. Otherwise it
+ /// returns null.
+ const char *getBZeroEntry() const;
+
+ /// This function returns true if the target has sincos() routine in its
+ /// compiler runtime or math libraries.
+ bool hasSinCos() const;
+
+ /// Enable the MachineScheduler pass for all X86 subtargets.
+ bool enableMachineScheduler() const override { return true; }
+
+ bool enableEarlyIfConversion() const override;
+
+ /// Return the instruction itineraries based on the subtarget selection.
+ const InstrItineraryData *getInstrItineraryData() const override {
+ return &InstrItins;
+ }
+
+ AntiDepBreakMode getAntiDepBreakMode() const override {
+ return TargetSubtargetInfo::ANTIDEP_CRITICAL;
+ }
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp b/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp
new file mode 100644
index 0000000..0e7e4c0
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -0,0 +1,280 @@
+//===-- X86TargetMachine.cpp - Define TargetMachine for the X86 -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the X86 specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86TargetMachine.h"
+#include "X86.h"
+#include "X86TargetObjectFile.h"
+#include "X86TargetTransformInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+static cl::opt<bool> EnableMachineCombinerPass("x86-machine-combiner",
+ cl::desc("Enable the machine combiner pass"),
+ cl::init(true), cl::Hidden);
+
+namespace llvm {
+void initializeWinEHStatePassPass(PassRegistry &);
+}
+
+extern "C" void LLVMInitializeX86Target() {
+ // Register the target.
+ RegisterTargetMachine<X86TargetMachine> X(TheX86_32Target);
+ RegisterTargetMachine<X86TargetMachine> Y(TheX86_64Target);
+
+ PassRegistry &PR = *PassRegistry::getPassRegistry();
+ initializeWinEHStatePassPass(PR);
+}
+
+static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
+ if (TT.isOSBinFormatMachO()) {
+ if (TT.getArch() == Triple::x86_64)
+ return make_unique<X86_64MachoTargetObjectFile>();
+ return make_unique<TargetLoweringObjectFileMachO>();
+ }
+
+ if (TT.isOSLinux() || TT.isOSNaCl())
+ return make_unique<X86LinuxNaClTargetObjectFile>();
+ if (TT.isOSBinFormatELF())
+ return make_unique<X86ELFTargetObjectFile>();
+ if (TT.isKnownWindowsMSVCEnvironment() || TT.isWindowsCoreCLREnvironment())
+ return make_unique<X86WindowsTargetObjectFile>();
+ if (TT.isOSBinFormatCOFF())
+ return make_unique<TargetLoweringObjectFileCOFF>();
+ llvm_unreachable("unknown subtarget type");
+}
+
+static std::string computeDataLayout(const Triple &TT) {
+ // X86 is little endian
+ std::string Ret = "e";
+
+ Ret += DataLayout::getManglingComponent(TT);
+ // X86 and x32 have 32 bit pointers.
+ if ((TT.isArch64Bit() &&
+ (TT.getEnvironment() == Triple::GNUX32 || TT.isOSNaCl())) ||
+ !TT.isArch64Bit())
+ Ret += "-p:32:32";
+
+  // Some ABIs align 64-bit integers and doubles to 64 bits, others to 32.
+ if (TT.isArch64Bit() || TT.isOSWindows() || TT.isOSNaCl())
+ Ret += "-i64:64";
+ else
+ Ret += "-f64:32:64";
+
+ // Some ABIs align long double to 128 bits, others to 32.
+ if (TT.isOSNaCl())
+ ; // No f80
+ else if (TT.isArch64Bit() || TT.isOSDarwin())
+ Ret += "-f80:128";
+ else
+ Ret += "-f80:32";
+
+ // The registers can hold 8, 16, 32 or, in x86-64, 64 bits.
+ if (TT.isArch64Bit())
+ Ret += "-n8:16:32:64";
+ else
+ Ret += "-n8:16:32";
+
+ // The stack is aligned to 32 bits on some ABIs and 128 bits on others.
+ if (!TT.isArch64Bit() && TT.isOSWindows())
+ Ret += "-a:0:32-S32";
+ else
+ Ret += "-S128";
+
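+  // For illustration (assuming getManglingComponent yields "-m:e" for ELF and
+  // "-m:x" for 32-bit MSVC COFF): x86_64-unknown-linux-gnu computes to
+  //   "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+  // and i686-pc-windows-msvc to
+  //   "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32".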
+ return Ret;
+}
+
+/// X86TargetMachine ctor - Create an X86 target.
+///
+X86TargetMachine::X86TargetMachine(const Target &T, const Triple &TT,
+ StringRef CPU, StringRef FS,
+ const TargetOptions &Options,
+ Reloc::Model RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL)
+ : LLVMTargetMachine(T, computeDataLayout(TT), TT, CPU, FS, Options, RM, CM,
+ OL),
+ TLOF(createTLOF(getTargetTriple())),
+ Subtarget(TT, CPU, FS, *this, Options.StackAlignmentOverride) {
+  // The Windows stack unwinder gets confused when execution flow "falls
+  // through" after a call to a 'noreturn' function. To prevent that, we emit
+  // a trap for 'unreachable' IR instructions (which, on X86, happens to be
+  // the 'ud2' instruction).
+ if (Subtarget.isTargetWin64())
+ this->Options.TrapUnreachable = true;
+
+ // By default (and when -ffast-math is on), enable estimate codegen for
+ // everything except scalar division. By default, use 1 refinement step for
+ // all operations. Defaults may be overridden by using command-line options.
+ // Scalar division estimates are disabled because they break too much
+ // real-world code. These defaults match GCC behavior.
+ this->Options.Reciprocals.setDefaults("sqrtf", true, 1);
+ this->Options.Reciprocals.setDefaults("divf", false, 1);
+ this->Options.Reciprocals.setDefaults("vec-sqrtf", true, 1);
+ this->Options.Reciprocals.setDefaults("vec-divf", true, 1);
+
+ initAsmInfo();
+}
+
+X86TargetMachine::~X86TargetMachine() {}
+
+const X86Subtarget *
+X86TargetMachine::getSubtargetImpl(const Function &F) const {
+ Attribute CPUAttr = F.getFnAttribute("target-cpu");
+ Attribute FSAttr = F.getFnAttribute("target-features");
+
+ std::string CPU = !CPUAttr.hasAttribute(Attribute::None)
+ ? CPUAttr.getValueAsString().str()
+ : TargetCPU;
+ std::string FS = !FSAttr.hasAttribute(Attribute::None)
+ ? FSAttr.getValueAsString().str()
+ : TargetFS;
+
+ // FIXME: This is related to the code below to reset the target options,
+ // we need to know whether or not the soft float flag is set on the
+ // function before we can generate a subtarget. We also need to use
+ // it as a key for the subtarget since that can be the only difference
+ // between two functions.
+ bool SoftFloat =
+ F.hasFnAttribute("use-soft-float") &&
+ F.getFnAttribute("use-soft-float").getValueAsString() == "true";
+ // If the soft float attribute is set on the function turn on the soft float
+ // subtarget feature.
+ if (SoftFloat)
+ FS += FS.empty() ? "+soft-float" : ",+soft-float";
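+
+  // For illustration: a function with "target-cpu"="haswell",
+  // "target-features"="+avx2" and "use-soft-float"="true" is keyed below as
+  // "haswell+avx2,+soft-float", so it gets a different X86Subtarget than an
+  // otherwise identical hard-float function.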
+
+ auto &I = SubtargetMap[CPU + FS];
+ if (!I) {
+ // This needs to be done before we create a new subtarget since any
+ // creation will depend on the TM and the code generation flags on the
+ // function that reside in TargetOptions.
+ resetTargetOptions(F);
+ I = llvm::make_unique<X86Subtarget>(TargetTriple, CPU, FS, *this,
+ Options.StackAlignmentOverride);
+ }
+ return I.get();
+}
+
+//===----------------------------------------------------------------------===//
+// Command line options for x86
+//===----------------------------------------------------------------------===//
+static cl::opt<bool>
+UseVZeroUpper("x86-use-vzeroupper", cl::Hidden,
+ cl::desc("Minimize AVX to SSE transition penalty"),
+ cl::init(true));
+
+//===----------------------------------------------------------------------===//
+// X86 TTI query.
+//===----------------------------------------------------------------------===//
+
+TargetIRAnalysis X86TargetMachine::getTargetIRAnalysis() {
+ return TargetIRAnalysis([this](const Function &F) {
+ return TargetTransformInfo(X86TTIImpl(this, F));
+ });
+}
+
+
+//===----------------------------------------------------------------------===//
+// Pass Pipeline Configuration
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// X86 Code Generator Pass Configuration Options.
+class X86PassConfig : public TargetPassConfig {
+public:
+ X86PassConfig(X86TargetMachine *TM, PassManagerBase &PM)
+ : TargetPassConfig(TM, PM) {}
+
+ X86TargetMachine &getX86TargetMachine() const {
+ return getTM<X86TargetMachine>();
+ }
+
+ void addIRPasses() override;
+ bool addInstSelector() override;
+ bool addILPOpts() override;
+ bool addPreISel() override;
+ void addPreRegAlloc() override;
+ void addPostRegAlloc() override;
+ void addPreEmitPass() override;
+ void addPreSched2() override;
+};
+} // namespace
+
+TargetPassConfig *X86TargetMachine::createPassConfig(PassManagerBase &PM) {
+ return new X86PassConfig(this, PM);
+}
+
+void X86PassConfig::addIRPasses() {
+ addPass(createAtomicExpandPass(&getX86TargetMachine()));
+
+ TargetPassConfig::addIRPasses();
+}
+
+bool X86PassConfig::addInstSelector() {
+ // Install an instruction selector.
+ addPass(createX86ISelDag(getX86TargetMachine(), getOptLevel()));
+
+ // For ELF, cleanup any local-dynamic TLS accesses.
+ if (TM->getTargetTriple().isOSBinFormatELF() &&
+ getOptLevel() != CodeGenOpt::None)
+ addPass(createCleanupLocalDynamicTLSPass());
+
+ addPass(createX86GlobalBaseRegPass());
+
+ return false;
+}
+
+bool X86PassConfig::addILPOpts() {
+ addPass(&EarlyIfConverterID);
+ if (EnableMachineCombinerPass)
+ addPass(&MachineCombinerID);
+ return true;
+}
+
+bool X86PassConfig::addPreISel() {
+ // Only add this pass for 32-bit x86 Windows.
+ const Triple &TT = TM->getTargetTriple();
+ if (TT.isOSWindows() && TT.getArch() == Triple::x86)
+ addPass(createX86WinEHStatePass());
+ return true;
+}
+
+void X86PassConfig::addPreRegAlloc() {
+ if (getOptLevel() != CodeGenOpt::None)
+ addPass(createX86OptimizeLEAs());
+
+ addPass(createX86CallFrameOptimization());
+}
+
+void X86PassConfig::addPostRegAlloc() {
+ addPass(createX86FloatingPointStackifierPass());
+}
+
+void X86PassConfig::addPreSched2() { addPass(createX86ExpandPseudoPass()); }
+
+void X86PassConfig::addPreEmitPass() {
+ if (getOptLevel() != CodeGenOpt::None)
+ addPass(createExecutionDependencyFixPass(&X86::VR128RegClass));
+
+ if (UseVZeroUpper)
+ addPass(createX86IssueVZeroUpperPass());
+
+ if (getOptLevel() != CodeGenOpt::None) {
+ addPass(createX86PadShortFunctions());
+ addPass(createX86FixupLEAs());
+ }
+}
diff --git a/contrib/llvm/lib/Target/X86/X86TargetMachine.h b/contrib/llvm/lib/Target/X86/X86TargetMachine.h
new file mode 100644
index 0000000..2629556
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86TargetMachine.h
@@ -0,0 +1,49 @@
+//===-- X86TargetMachine.h - Define TargetMachine for the X86 ---*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the X86 specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86TARGETMACHINE_H
+#define LLVM_LIB_TARGET_X86_X86TARGETMACHINE_H
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+class StringRef;
+
+class X86TargetMachine final : public LLVMTargetMachine {
+ std::unique_ptr<TargetLoweringObjectFile> TLOF;
+ X86Subtarget Subtarget;
+
+ mutable StringMap<std::unique_ptr<X86Subtarget>> SubtargetMap;
+
+public:
+ X86TargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+ StringRef FS, const TargetOptions &Options, Reloc::Model RM,
+ CodeModel::Model CM, CodeGenOpt::Level OL);
+ ~X86TargetMachine() override;
+ const X86Subtarget *getSubtargetImpl(const Function &F) const override;
+
+ TargetIRAnalysis getTargetIRAnalysis() override;
+
+ // Set up the pass pipeline.
+ TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+ TargetLoweringObjectFile *getObjFileLowering() const override {
+ return TLOF.get();
+ }
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp
new file mode 100644
index 0000000..782768d
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp
@@ -0,0 +1,175 @@
+//===-- X86TargetObjectFile.cpp - X86 Object Info -------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86TargetObjectFile.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSectionCOFF.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/COFF.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Target/TargetLowering.h"
+
+using namespace llvm;
+using namespace dwarf;
+
+const MCExpr *X86_64MachoTargetObjectFile::getTTypeGlobalReference(
+ const GlobalValue *GV, unsigned Encoding, Mangler &Mang,
+ const TargetMachine &TM, MachineModuleInfo *MMI,
+ MCStreamer &Streamer) const {
+
+ // On Darwin/X86-64, we can reference dwarf symbols with foo@GOTPCREL+4, which
+ // is an indirect pc-relative reference.
+ if ((Encoding & DW_EH_PE_indirect) && (Encoding & DW_EH_PE_pcrel)) {
+ const MCSymbol *Sym = TM.getSymbol(GV, Mang);
+ const MCExpr *Res =
+ MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_GOTPCREL, getContext());
+ const MCExpr *Four = MCConstantExpr::create(4, getContext());
+ return MCBinaryExpr::createAdd(Res, Four, getContext());
+ }
+
+ return TargetLoweringObjectFileMachO::getTTypeGlobalReference(
+ GV, Encoding, Mang, TM, MMI, Streamer);
+}
+
+MCSymbol *X86_64MachoTargetObjectFile::getCFIPersonalitySymbol(
+ const GlobalValue *GV, Mangler &Mang, const TargetMachine &TM,
+ MachineModuleInfo *MMI) const {
+ return TM.getSymbol(GV, Mang);
+}
+
+const MCExpr *X86_64MachoTargetObjectFile::getIndirectSymViaGOTPCRel(
+ const MCSymbol *Sym, const MCValue &MV, int64_t Offset,
+ MachineModuleInfo *MMI, MCStreamer &Streamer) const {
+ // On Darwin/X86-64, we need to use foo@GOTPCREL+4 to access the got entry
+ // from a data section. In case there's an additional offset, then use
+ // foo@GOTPCREL+4+<offset>.
+ unsigned FinalOff = Offset+MV.getConstant()+4;
+ const MCExpr *Res =
+ MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_GOTPCREL, getContext());
+ const MCExpr *Off = MCConstantExpr::create(FinalOff, getContext());
+ return MCBinaryExpr::createAdd(Res, Off, getContext());
+}
+
+const MCExpr *X86ELFTargetObjectFile::getDebugThreadLocalSymbol(
+ const MCSymbol *Sym) const {
+ return MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_DTPOFF, getContext());
+}
+
+void
+X86LinuxNaClTargetObjectFile::Initialize(MCContext &Ctx,
+ const TargetMachine &TM) {
+ TargetLoweringObjectFileELF::Initialize(Ctx, TM);
+ InitializeELF(TM.Options.UseInitArray);
+}
+
+const MCExpr *X86WindowsTargetObjectFile::getExecutableRelativeSymbol(
+ const ConstantExpr *CE, Mangler &Mang, const TargetMachine &TM) const {
+  // We are looking for the difference of two symbols; we need a subtraction
+  // operation.
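+  // A matching constant expression looks roughly like this (illustrative):
+  //   sub (ptrtoint (i8* @foo to i32), ptrtoint (i8* @__ImageBase to i32))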
+ const SubOperator *Sub = dyn_cast<SubOperator>(CE);
+ if (!Sub)
+ return nullptr;
+
+  // Symbols must first be numbers before we can subtract them; we need to
+  // see a ptrtoint on both subtraction operands.
+ const PtrToIntOperator *SubLHS =
+ dyn_cast<PtrToIntOperator>(Sub->getOperand(0));
+ const PtrToIntOperator *SubRHS =
+ dyn_cast<PtrToIntOperator>(Sub->getOperand(1));
+ if (!SubLHS || !SubRHS)
+ return nullptr;
+
+  // Our symbols should exist in address space zero; cowardly no-op
+  // otherwise.
+ if (SubLHS->getPointerAddressSpace() != 0 ||
+ SubRHS->getPointerAddressSpace() != 0)
+ return nullptr;
+
+ // Both ptrtoint instructions must wrap global objects:
+ // - Only global variables are eligible for image relative relocations.
+ // - The subtrahend refers to the special symbol __ImageBase, a GlobalVariable.
+ const auto *GOLHS = dyn_cast<GlobalObject>(SubLHS->getPointerOperand());
+ const auto *GVRHS = dyn_cast<GlobalVariable>(SubRHS->getPointerOperand());
+ if (!GOLHS || !GVRHS)
+ return nullptr;
+
+ // We expect __ImageBase to be a global variable without a section, externally
+ // defined.
+ //
+ // It should look something like this: @__ImageBase = external constant i8
+ if (GVRHS->isThreadLocal() || GVRHS->getName() != "__ImageBase" ||
+ !GVRHS->hasExternalLinkage() || GVRHS->hasInitializer() ||
+ GVRHS->hasSection())
+ return nullptr;
+
+  // An image-relative, thread-local symbol makes no sense.
+ if (GOLHS->isThreadLocal())
+ return nullptr;
+
+ return MCSymbolRefExpr::create(TM.getSymbol(GOLHS, Mang),
+ MCSymbolRefExpr::VK_COFF_IMGREL32,
+ getContext());
+}
+
+static std::string APIntToHexString(const APInt &AI) {
+ unsigned Width = (AI.getBitWidth() / 8) * 2;
+ std::string HexString = utohexstr(AI.getLimitedValue(), /*LowerCase=*/true);
+ unsigned Size = HexString.size();
+ assert(Width >= Size && "hex string is too large!");
+ HexString.insert(HexString.begin(), Width - Size, '0');
+
+ return HexString;
+}
+
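+// Aggregate elements are emitted from the highest index down; e.g. the
+// vector <4 x i32> <i32 1, i32 2, i32 3, i32 4> yields
+// "00000004000000030000000200000001".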
+static std::string scalarConstantToHexString(const Constant *C) {
+ Type *Ty = C->getType();
+ if (isa<UndefValue>(C)) {
+ return APIntToHexString(APInt::getNullValue(Ty->getPrimitiveSizeInBits()));
+ } else if (const auto *CFP = dyn_cast<ConstantFP>(C)) {
+ return APIntToHexString(CFP->getValueAPF().bitcastToAPInt());
+ } else if (const auto *CI = dyn_cast<ConstantInt>(C)) {
+ return APIntToHexString(CI->getValue());
+ } else {
+ unsigned NumElements;
+ if (isa<VectorType>(Ty))
+ NumElements = Ty->getVectorNumElements();
+ else
+ NumElements = Ty->getArrayNumElements();
+ std::string HexString;
+ for (int I = NumElements - 1, E = -1; I != E; --I)
+ HexString += scalarConstantToHexString(C->getAggregateElement(I));
+ return HexString;
+ }
+}
+
+MCSection *X86WindowsTargetObjectFile::getSectionForConstant(
+ const DataLayout &DL, SectionKind Kind, const Constant *C) const {
+ if (Kind.isMergeableConst() && C) {
+ const unsigned Characteristics = COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+ COFF::IMAGE_SCN_MEM_READ |
+ COFF::IMAGE_SCN_LNK_COMDAT;
+ std::string COMDATSymName;
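+    // e.g. a double constant 1.0 produces the MSVC-style COMDAT symbol name
+    // __real@3ff0000000000000, and a 16-byte constant an __xmm@... name.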
+ if (Kind.isMergeableConst4() || Kind.isMergeableConst8())
+ COMDATSymName = "__real@" + scalarConstantToHexString(C);
+ else if (Kind.isMergeableConst16())
+ COMDATSymName = "__xmm@" + scalarConstantToHexString(C);
+
+ if (!COMDATSymName.empty())
+ return getContext().getCOFFSection(".rdata", Characteristics, Kind,
+ COMDATSymName,
+ COFF::IMAGE_COMDAT_SELECT_ANY);
+ }
+
+ return TargetLoweringObjectFile::getSectionForConstant(DL, Kind, C);
+}
diff --git a/contrib/llvm/lib/Target/X86/X86TargetObjectFile.h b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.h
new file mode 100644
index 0000000..6b2448c
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.h
@@ -0,0 +1,67 @@
+//===-- X86TargetObjectFile.h - X86 Object Info -----------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86TARGETOBJECTFILE_H
+#define LLVM_LIB_TARGET_X86_X86TARGETOBJECTFILE_H
+
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+
+namespace llvm {
+
+ /// X86_64MachoTargetObjectFile - This TLOF implementation is used for Darwin
+ /// x86-64.
+ class X86_64MachoTargetObjectFile : public TargetLoweringObjectFileMachO {
+ public:
+ const MCExpr *
+ getTTypeGlobalReference(const GlobalValue *GV, unsigned Encoding,
+ Mangler &Mang, const TargetMachine &TM,
+ MachineModuleInfo *MMI,
+ MCStreamer &Streamer) const override;
+
+ // getCFIPersonalitySymbol - The symbol that gets passed to
+ // .cfi_personality.
+ MCSymbol *getCFIPersonalitySymbol(const GlobalValue *GV, Mangler &Mang,
+ const TargetMachine &TM,
+ MachineModuleInfo *MMI) const override;
+
+ const MCExpr *getIndirectSymViaGOTPCRel(const MCSymbol *Sym,
+ const MCValue &MV, int64_t Offset,
+ MachineModuleInfo *MMI,
+ MCStreamer &Streamer) const override;
+ };
+
+  /// \brief This implementation is used for X86 ELF targets that don't
+ /// have a further specialization.
+ class X86ELFTargetObjectFile : public TargetLoweringObjectFileELF {
+ /// \brief Describe a TLS variable address within debug info.
+ const MCExpr *getDebugThreadLocalSymbol(const MCSymbol *Sym) const override;
+ };
+
+  /// X86LinuxNaClTargetObjectFile - This implementation is used for Linux and
+ /// Native Client on x86 and x86-64.
+ class X86LinuxNaClTargetObjectFile : public X86ELFTargetObjectFile {
+ void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
+ };
+
+ /// \brief This implementation is used for Windows targets on x86 and x86-64.
+ class X86WindowsTargetObjectFile : public TargetLoweringObjectFileCOFF {
+ const MCExpr *
+ getExecutableRelativeSymbol(const ConstantExpr *CE, Mangler &Mang,
+ const TargetMachine &TM) const override;
+
+ /// \brief Given a mergeable constant with the specified size and relocation
+ /// information, return a section that it should be placed in.
+ MCSection *getSectionForConstant(const DataLayout &DL, SectionKind Kind,
+ const Constant *C) const override;
+ };
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
new file mode 100644
index 0000000..2e7bbb2
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -0,0 +1,1487 @@
+//===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements a TargetTransformInfo analysis pass specific to the
+/// X86 target machine. It uses the target's detailed information to provide
+/// more precise answers to certain TTI queries, while letting the target
+/// independent and default TTI implementations handle the rest.
+///
+//===----------------------------------------------------------------------===//
+
+#include "X86TargetTransformInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/BasicTTIImpl.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/CostTable.h"
+#include "llvm/Target/TargetLowering.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86tti"
+
+//===----------------------------------------------------------------------===//
+//
+// X86 cost model.
+//
+//===----------------------------------------------------------------------===//
+
+TargetTransformInfo::PopcntSupportKind
+X86TTIImpl::getPopcntSupport(unsigned TyWidth) {
+ assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
+ // TODO: Currently the __builtin_popcount() implementation using SSE3
+ // instructions is inefficient. Once the problem is fixed, we should
+ // call ST->hasSSE3() instead of ST->hasPOPCNT().
+ return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software;
+}
+
+unsigned X86TTIImpl::getNumberOfRegisters(bool Vector) {
+ if (Vector && !ST->hasSSE1())
+ return 0;
+
+ if (ST->is64Bit()) {
+ if (Vector && ST->hasAVX512())
+ return 32;
+ return 16;
+ }
+ return 8;
+}
+
+unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) {
+ if (Vector) {
+ if (ST->hasAVX512()) return 512;
+ if (ST->hasAVX()) return 256;
+ if (ST->hasSSE1()) return 128;
+ return 0;
+ }
+
+ if (ST->is64Bit())
+ return 64;
+
+ return 32;
+}
+
+unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) {
+  // If the loop will not be vectorized, don't interleave it. Let the regular
+  // unroller handle it instead, which saves the overflow check and memory
+  // check cost.
+ if (VF == 1)
+ return 1;
+
+ if (ST->isAtom())
+ return 1;
+
+ // Sandybridge and Haswell have multiple execution ports and pipelined
+ // vector units.
+ if (ST->hasAVX())
+ return 4;
+
+ return 2;
+}
+
+int X86TTIImpl::getArithmeticInstrCost(
+ unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info,
+ TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo,
+ TTI::OperandValueProperties Opd2PropInfo) {
+ // Legalize the type.
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
+
+ int ISD = TLI->InstructionOpcodeToISD(Opcode);
+ assert(ISD && "Invalid opcode");
+
+ if (ISD == ISD::SDIV &&
+ Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
+ Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
+    // On X86, a vector signed division by a power-of-two constant is
+    // normally expanded to the sequence SRA + SRL + ADD + SRA.
+    // The OperandValue properties may not be the same as those of the
+    // previous operation; conservatively assume OP_None.
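+    // e.g. sdiv <4 x i32> %x, <splat of 8> is priced below as two AShr, one
+    // LShr and one Add on the same type.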
+ int Cost = 2 * getArithmeticInstrCost(Instruction::AShr, Ty, Op1Info,
+ Op2Info, TargetTransformInfo::OP_None,
+ TargetTransformInfo::OP_None);
+ Cost += getArithmeticInstrCost(Instruction::LShr, Ty, Op1Info, Op2Info,
+ TargetTransformInfo::OP_None,
+ TargetTransformInfo::OP_None);
+ Cost += getArithmeticInstrCost(Instruction::Add, Ty, Op1Info, Op2Info,
+ TargetTransformInfo::OP_None,
+ TargetTransformInfo::OP_None);
+
+ return Cost;
+ }
+
+ static const CostTblEntry AVX2UniformConstCostTable[] = {
+ { ISD::SRA, MVT::v4i64, 4 }, // 2 x psrad + shuffle.
+
+ { ISD::SDIV, MVT::v16i16, 6 }, // vpmulhw sequence
+ { ISD::UDIV, MVT::v16i16, 6 }, // vpmulhuw sequence
+ { ISD::SDIV, MVT::v8i32, 15 }, // vpmuldq sequence
+ { ISD::UDIV, MVT::v8i32, 15 }, // vpmuludq sequence
+ };
+
+ if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
+ ST->hasAVX2()) {
+ if (const auto *Entry = CostTableLookup(AVX2UniformConstCostTable, ISD,
+ LT.second))
+ return LT.first * Entry->Cost;
+ }
+
+ static const CostTblEntry AVX512CostTable[] = {
+ { ISD::SHL, MVT::v16i32, 1 },
+ { ISD::SRL, MVT::v16i32, 1 },
+ { ISD::SRA, MVT::v16i32, 1 },
+ { ISD::SHL, MVT::v8i64, 1 },
+ { ISD::SRL, MVT::v8i64, 1 },
+ { ISD::SRA, MVT::v8i64, 1 },
+ };
+
+ if (ST->hasAVX512()) {
+ if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
+ return LT.first * Entry->Cost;
+ }
+
+ static const CostTblEntry AVX2CostTable[] = {
    // Shifts on v4i64/v8i32 are legal on AVX2 even though we mark them as
    // custom, so that we can detect the cases where the shift amount is a
    // scalar.
+ { ISD::SHL, MVT::v4i32, 1 },
+ { ISD::SRL, MVT::v4i32, 1 },
+ { ISD::SRA, MVT::v4i32, 1 },
+ { ISD::SHL, MVT::v8i32, 1 },
+ { ISD::SRL, MVT::v8i32, 1 },
+ { ISD::SRA, MVT::v8i32, 1 },
+ { ISD::SHL, MVT::v2i64, 1 },
+ { ISD::SRL, MVT::v2i64, 1 },
+ { ISD::SHL, MVT::v4i64, 1 },
+ { ISD::SRL, MVT::v4i64, 1 },
+ };
+
+ // Look for AVX2 lowering tricks.
+ if (ST->hasAVX2()) {
+ if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
+ (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
+ Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
+ // On AVX2, a packed v16i16 shift left by a constant build_vector
+ // is lowered into a vector multiply (vpmullw).
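+      // e.g. a shift by the build_vector <i16 0, i16 1, ...> becomes a
+      // single vpmullw by <i16 1, i16 2, ...>.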
+ return LT.first;
+
+ if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
+ return LT.first * Entry->Cost;
+ }
+
+ static const CostTblEntry XOPCostTable[] = {
+    // 128-bit shifts take 1 cycle, but right shifts require negation beforehand.
+ { ISD::SHL, MVT::v16i8, 1 },
+ { ISD::SRL, MVT::v16i8, 2 },
+ { ISD::SRA, MVT::v16i8, 2 },
+ { ISD::SHL, MVT::v8i16, 1 },
+ { ISD::SRL, MVT::v8i16, 2 },
+ { ISD::SRA, MVT::v8i16, 2 },
+ { ISD::SHL, MVT::v4i32, 1 },
+ { ISD::SRL, MVT::v4i32, 2 },
+ { ISD::SRA, MVT::v4i32, 2 },
+ { ISD::SHL, MVT::v2i64, 1 },
+ { ISD::SRL, MVT::v2i64, 2 },
+ { ISD::SRA, MVT::v2i64, 2 },
+    // 256-bit shifts require splitting if AVX2 didn't catch them above.
+ { ISD::SHL, MVT::v32i8, 2 },
+ { ISD::SRL, MVT::v32i8, 4 },
+ { ISD::SRA, MVT::v32i8, 4 },
+ { ISD::SHL, MVT::v16i16, 2 },
+ { ISD::SRL, MVT::v16i16, 4 },
+ { ISD::SRA, MVT::v16i16, 4 },
+ { ISD::SHL, MVT::v8i32, 2 },
+ { ISD::SRL, MVT::v8i32, 4 },
+ { ISD::SRA, MVT::v8i32, 4 },
+ { ISD::SHL, MVT::v4i64, 2 },
+ { ISD::SRL, MVT::v4i64, 4 },
+ { ISD::SRA, MVT::v4i64, 4 },
+ };
+
+ // Look for XOP lowering tricks.
+ if (ST->hasXOP()) {
+ if (const auto *Entry = CostTableLookup(XOPCostTable, ISD, LT.second))
+ return LT.first * Entry->Cost;
+ }
+
+ static const CostTblEntry AVX2CustomCostTable[] = {
+ { ISD::SHL, MVT::v32i8, 11 }, // vpblendvb sequence.
+ { ISD::SHL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence.
+
+ { ISD::SRL, MVT::v32i8, 11 }, // vpblendvb sequence.
+ { ISD::SRL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence.
+
+ { ISD::SRA, MVT::v32i8, 24 }, // vpblendvb sequence.
+ { ISD::SRA, MVT::v16i16, 10 }, // extend/vpsravd/pack sequence.
+ { ISD::SRA, MVT::v2i64, 4 }, // srl/xor/sub sequence.
+ { ISD::SRA, MVT::v4i64, 4 }, // srl/xor/sub sequence.
+
+ // Vectorizing division is a bad idea. See the SSE2 table for more comments.
+ { ISD::SDIV, MVT::v32i8, 32*20 },
+ { ISD::SDIV, MVT::v16i16, 16*20 },
+ { ISD::SDIV, MVT::v8i32, 8*20 },
+ { ISD::SDIV, MVT::v4i64, 4*20 },
+ { ISD::UDIV, MVT::v32i8, 32*20 },
+ { ISD::UDIV, MVT::v16i16, 16*20 },
+ { ISD::UDIV, MVT::v8i32, 8*20 },
+ { ISD::UDIV, MVT::v4i64, 4*20 },
+ };
+
+ // Look for AVX2 lowering tricks for custom cases.
+ if (ST->hasAVX2()) {
+ if (const auto *Entry = CostTableLookup(AVX2CustomCostTable, ISD,
+ LT.second))
+ return LT.first * Entry->Cost;
+ }
+
+ static const CostTblEntry
+ SSE2UniformConstCostTable[] = {
+ // We don't correctly identify costs of casts because they are marked as
+ // custom.
+ // Constant splats are cheaper for the following instructions.
+ { ISD::SHL, MVT::v16i8, 1 }, // psllw.
+ { ISD::SHL, MVT::v32i8, 2 }, // psllw.
+ { ISD::SHL, MVT::v8i16, 1 }, // psllw.
+ { ISD::SHL, MVT::v16i16, 2 }, // psllw.
+ { ISD::SHL, MVT::v4i32, 1 }, // pslld
+ { ISD::SHL, MVT::v8i32, 2 }, // pslld
+ { ISD::SHL, MVT::v2i64, 1 }, // psllq.
+ { ISD::SHL, MVT::v4i64, 2 }, // psllq.
+
+ { ISD::SRL, MVT::v16i8, 1 }, // psrlw.
+ { ISD::SRL, MVT::v32i8, 2 }, // psrlw.
+ { ISD::SRL, MVT::v8i16, 1 }, // psrlw.
+ { ISD::SRL, MVT::v16i16, 2 }, // psrlw.
+ { ISD::SRL, MVT::v4i32, 1 }, // psrld.
+ { ISD::SRL, MVT::v8i32, 2 }, // psrld.
+ { ISD::SRL, MVT::v2i64, 1 }, // psrlq.
+ { ISD::SRL, MVT::v4i64, 2 }, // psrlq.
+
+ { ISD::SRA, MVT::v16i8, 4 }, // psrlw, pand, pxor, psubb.
+ { ISD::SRA, MVT::v32i8, 8 }, // psrlw, pand, pxor, psubb.
+ { ISD::SRA, MVT::v8i16, 1 }, // psraw.
+ { ISD::SRA, MVT::v16i16, 2 }, // psraw.
+ { ISD::SRA, MVT::v4i32, 1 }, // psrad.
+ { ISD::SRA, MVT::v8i32, 2 }, // psrad.
+ { ISD::SRA, MVT::v2i64, 4 }, // 2 x psrad + shuffle.
+ { ISD::SRA, MVT::v4i64, 8 }, // 2 x psrad + shuffle.
+
+ { ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence
+ { ISD::UDIV, MVT::v8i16, 6 }, // pmulhuw sequence
+ { ISD::SDIV, MVT::v4i32, 19 }, // pmuludq sequence
+ { ISD::UDIV, MVT::v4i32, 15 }, // pmuludq sequence
+ };
+
+ if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
+ ST->hasSSE2()) {
+ // pmuldq sequence.
+ if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41())
+ return LT.first * 15;
+
+ if (const auto *Entry = CostTableLookup(SSE2UniformConstCostTable, ISD,
+ LT.second))
+ return LT.first * Entry->Cost;
+ }
+
+ if (ISD == ISD::SHL &&
+ Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) {
+ MVT VT = LT.second;
+    // A vector shift left by a non-uniform constant can be lowered into a
+    // vector multiply (pmullw/pmulld).
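+    // e.g. shl <8 x i16> %x, <i16 1, i16 2, ...> becomes a single pmullw by
+    // <i16 2, i16 4, ...>, so only the legalization factor is charged.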
+ if ((VT == MVT::v8i16 && ST->hasSSE2()) ||
+ (VT == MVT::v4i32 && ST->hasSSE41()))
+ return LT.first;
+
+    // v16i16 and v8i32 shifts by non-uniform constants are lowered into a
+    // sequence of extract + two vector multiplies + insert.
+ if ((VT == MVT::v8i32 || VT == MVT::v16i16) &&
+ (ST->hasAVX() && !ST->hasAVX2()))
+ ISD = ISD::MUL;
+
+    // A vector shift left by a non-uniform constant is converted into a
+    // vector multiply; the new multiply is eventually lowered into a
+    // sequence of shuffles and 2 x pmuludq.
+ if (VT == MVT::v4i32 && ST->hasSSE2())
+ ISD = ISD::MUL;
+ }
+
+ static const CostTblEntry SSE2CostTable[] = {
+ // We don't correctly identify costs of casts because they are marked as
+ // custom.
+    // In some cases where the shift amount is a scalar, we could generate
+    // better code. Unfortunately, when that is the case the value (the
+    // splat) gets hoisted out of the loop, making it invisible to ISel. The
+    // cost model must return worst-case assumptions because it is used for
+    // vectorization and we don't want to make vectorized code worse than
+    // scalar code.
+ { ISD::SHL, MVT::v16i8, 26 }, // cmpgtb sequence.
+ { ISD::SHL, MVT::v32i8, 2*26 }, // cmpgtb sequence.
+ { ISD::SHL, MVT::v8i16, 32 }, // cmpgtb sequence.
+ { ISD::SHL, MVT::v16i16, 2*32 }, // cmpgtb sequence.
+ { ISD::SHL, MVT::v4i32, 2*5 }, // We optimized this using mul.
+ { ISD::SHL, MVT::v8i32, 2*2*5 }, // We optimized this using mul.
+ { ISD::SHL, MVT::v2i64, 4 }, // splat+shuffle sequence.
+ { ISD::SHL, MVT::v4i64, 2*4 }, // splat+shuffle sequence.
+
+ { ISD::SRL, MVT::v16i8, 26 }, // cmpgtb sequence.
+ { ISD::SRL, MVT::v32i8, 2*26 }, // cmpgtb sequence.
+ { ISD::SRL, MVT::v8i16, 32 }, // cmpgtb sequence.
+ { ISD::SRL, MVT::v16i16, 2*32 }, // cmpgtb sequence.
+ { ISD::SRL, MVT::v4i32, 16 }, // Shift each lane + blend.
+ { ISD::SRL, MVT::v8i32, 2*16 }, // Shift each lane + blend.
+ { ISD::SRL, MVT::v2i64, 4 }, // splat+shuffle sequence.
+ { ISD::SRL, MVT::v4i64, 2*4 }, // splat+shuffle sequence.
+
+ { ISD::SRA, MVT::v16i8, 54 }, // unpacked cmpgtb sequence.
+ { ISD::SRA, MVT::v32i8, 2*54 }, // unpacked cmpgtb sequence.
+ { ISD::SRA, MVT::v8i16, 32 }, // cmpgtb sequence.
+ { ISD::SRA, MVT::v16i16, 2*32 }, // cmpgtb sequence.
+ { ISD::SRA, MVT::v4i32, 16 }, // Shift each lane + blend.
+ { ISD::SRA, MVT::v8i32, 2*16 }, // Shift each lane + blend.
+ { ISD::SRA, MVT::v2i64, 12 }, // srl/xor/sub sequence.
+ { ISD::SRA, MVT::v4i64, 2*12 }, // srl/xor/sub sequence.
+
+ // It is not a good idea to vectorize division. We have to scalarize it and
+    // in the process we will often end up having to spill regular registers.
+    // The overhead of division is going to dominate most kernels anyway, so
+    // try hard to prevent vectorization of division - it is generally a bad
+    // idea. Assume somewhat arbitrarily that we have to be able
+ // to hide "20 cycles" for each lane.
+ { ISD::SDIV, MVT::v16i8, 16*20 },
+ { ISD::SDIV, MVT::v8i16, 8*20 },
+ { ISD::SDIV, MVT::v4i32, 4*20 },
+ { ISD::SDIV, MVT::v2i64, 2*20 },
+ { ISD::UDIV, MVT::v16i8, 16*20 },
+ { ISD::UDIV, MVT::v8i16, 8*20 },
+ { ISD::UDIV, MVT::v4i32, 4*20 },
+ { ISD::UDIV, MVT::v2i64, 2*20 },
+ };
+
+ if (ST->hasSSE2()) {
+ if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second))
+ return LT.first * Entry->Cost;
+ }
+
+ static const CostTblEntry AVX1CostTable[] = {
+ // We don't have to scalarize unsupported ops. We can issue two half-sized
+ // operations and we only need to extract the upper YMM half.
+ // Two ops + 1 extract + 1 insert = 4.
+ { ISD::MUL, MVT::v16i16, 4 },
+ { ISD::MUL, MVT::v8i32, 4 },
+ { ISD::SUB, MVT::v8i32, 4 },
+ { ISD::ADD, MVT::v8i32, 4 },
+ { ISD::SUB, MVT::v4i64, 4 },
+ { ISD::ADD, MVT::v4i64, 4 },
+    // A v4i64 multiply is custom lowered as two split v2i64 vectors that are
+    // then lowered as a series of long multiplies (3), shifts (4) and adds (2).
+    // Because we believe v4i64 to be a legal type, we must also include the
+    // split factor of two in the cost table. Therefore, the cost here is 18
+    // instead of 9.
+ { ISD::MUL, MVT::v4i64, 18 },
+ };
+
+ // Look for AVX1 lowering tricks.
+ if (ST->hasAVX() && !ST->hasAVX2()) {
+ MVT VT = LT.second;
+
+ if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, VT))
+ return LT.first * Entry->Cost;
+ }
+
+ // Custom lowering of vectors.
+ static const CostTblEntry CustomLowered[] = {
+    // A v2i64/v4i64 multiply is custom lowered as a series of long
+    // multiplies (3), shifts (4) and adds (2).
+ { ISD::MUL, MVT::v2i64, 9 },
+ { ISD::MUL, MVT::v4i64, 9 },
+ };
+ if (const auto *Entry = CostTableLookup(CustomLowered, ISD, LT.second))
+ return LT.first * Entry->Cost;
+
+ // Special lowering of v4i32 mul on sse2, sse3: Lower v4i32 mul as 2x shuffle,
+ // 2x pmuludq, 2x shuffle.
+ if (ISD == ISD::MUL && LT.second == MVT::v4i32 && ST->hasSSE2() &&
+ !ST->hasSSE41())
+ return LT.first * 6;
+
+ // Fallback to the default implementation.
+ return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info);
+}
+
+int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
+ Type *SubTp) {
+ // We only estimate the cost of reverse and alternate shuffles.
+ if (Kind != TTI::SK_Reverse && Kind != TTI::SK_Alternate)
+ return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
+
+ if (Kind == TTI::SK_Reverse) {
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
+ int Cost = 1;
+ if (LT.second.getSizeInBits() > 128)
+ Cost = 3; // Extract + insert + copy.
+
+    // Multiply by the number of parts.
+ return Cost * LT.first;
+ }
+
+ if (Kind == TTI::SK_Alternate) {
+ // 64-bit packed float vectors (v2f32) are widened to type v4f32.
+ // 64-bit packed integer vectors (v2i32) are promoted to type v2i64.
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
+
+ // The backend knows how to generate a single VEX.256 version of
+ // instruction VPBLENDW if the target supports AVX2.
+ if (ST->hasAVX2() && LT.second == MVT::v16i16)
+ return LT.first;
+
+ static const CostTblEntry AVXAltShuffleTbl[] = {
+ {ISD::VECTOR_SHUFFLE, MVT::v4i64, 1}, // vblendpd
+ {ISD::VECTOR_SHUFFLE, MVT::v4f64, 1}, // vblendpd
+
+ {ISD::VECTOR_SHUFFLE, MVT::v8i32, 1}, // vblendps
+ {ISD::VECTOR_SHUFFLE, MVT::v8f32, 1}, // vblendps
+
+ // This shuffle is custom lowered into a sequence of:
+      // 2x vextractf128, 2x vpblendw, 1x vinsertf128
+ {ISD::VECTOR_SHUFFLE, MVT::v16i16, 5},
+
+ // This shuffle is custom lowered into a long sequence of:
+      // 2x vextractf128, 4x vpshufb, 2x vpor, 1x vinsertf128
+ {ISD::VECTOR_SHUFFLE, MVT::v32i8, 9}
+ };
+
+ if (ST->hasAVX())
+ if (const auto *Entry = CostTableLookup(AVXAltShuffleTbl,
+ ISD::VECTOR_SHUFFLE, LT.second))
+ return LT.first * Entry->Cost;
+
+ static const CostTblEntry SSE41AltShuffleTbl[] = {
+ // These are lowered into movsd.
+ {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
+
+      // Packed float vectors with four elements are lowered into BLENDI DAG
+      // nodes. A v4i32/v4f32 BLENDI generates a single 'blendps'/'blendpd'.
+ {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
+
+ // This shuffle generates a single pshufw.
+ {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
+
+ // There is no instruction that matches a v16i8 alternate shuffle.
+ // The backend will expand it into the sequence 'pshufb + pshufb + or'.
+ {ISD::VECTOR_SHUFFLE, MVT::v16i8, 3}
+ };
+
+ if (ST->hasSSE41())
+    if (const auto *Entry = CostTableLookup(SSE41AltShuffleTbl,
+                                            ISD::VECTOR_SHUFFLE, LT.second))
+ return LT.first * Entry->Cost;
+
+ static const CostTblEntry SSSE3AltShuffleTbl[] = {
+ {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, // movsd
+ {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, // movsd
+
+      // SSSE3 doesn't have 'blendps' (it was introduced in SSE4.1). The
+      // following shuffles are expanded into the sequence 'shufps + pshufd'.
+ {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
+ {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
+
+ {ISD::VECTOR_SHUFFLE, MVT::v8i16, 3}, // pshufb + pshufb + or
+ {ISD::VECTOR_SHUFFLE, MVT::v16i8, 3} // pshufb + pshufb + or
+ };
+
+ if (ST->hasSSSE3())
+ if (const auto *Entry = CostTableLookup(SSSE3AltShuffleTbl,
+ ISD::VECTOR_SHUFFLE, LT.second))
+ return LT.first * Entry->Cost;
+
+ static const CostTblEntry SSEAltShuffleTbl[] = {
+ {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, // movsd
+ {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, // movsd
+
+ {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2}, // shufps + pshufd
+ {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2}, // shufps + pshufd
+
+ // This is expanded into a long sequence of four extract + four insert.
+ {ISD::VECTOR_SHUFFLE, MVT::v8i16, 8}, // 4 x pextrw + 4 pinsrw.
+
+ // 8 x (pinsrw + pextrw + and + movb + movzb + or)
+ {ISD::VECTOR_SHUFFLE, MVT::v16i8, 48}
+ };
+
+ // Fall-back (SSE3 and SSE2).
+ if (const auto *Entry = CostTableLookup(SSEAltShuffleTbl,
+ ISD::VECTOR_SHUFFLE, LT.second))
+ return LT.first * Entry->Cost;
+ return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
+ }
+
+ return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
+}
+
+int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
+ int ISD = TLI->InstructionOpcodeToISD(Opcode);
+ assert(ISD && "Invalid opcode");
+
+  // FIXME: Need a better design of the cost table to handle non-simple types
+  // with potentially massive combinations (elem_num x src_type x dst_type).
+
+ static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = {
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
+ { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 },
+ { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 },
+ { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 },
+ { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 },
+
+ { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
+ { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f64, 1 },
+ { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f64, 1 },
+ { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 1 },
+ { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f32, 1 },
+ { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f32, 1 },
+ };
+
+ static const TypeConversionCostTblEntry AVX512FConversionTbl[] = {
+ { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 1 },
+ { ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, 3 },
+ { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 1 },
+
+ { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 1 },
+ { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 1 },
+ { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 1 },
+ { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 1 },
+
+ // v16i1 -> v16i32 - load + broadcast
+ { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 },
+
+ { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 1 },
+ { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 1 },
+ { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
+ { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
+ { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, 1 },
+ { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, 1 },
+ { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 1 },
+ { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 1 },
+
+ { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 },
+ { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 2 },
+ { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 2 },
+ { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 },
+ { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 },
+ { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i8, 2 },
+ { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 },
+ { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 },
+
+ { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 },
+ { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 2 },
+ { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 2 },
+ { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 },
+ { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
+ { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 },
+ { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 },
+ { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 },
+ { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i8, 2 },
+ { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 2 },
+ { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 2 },
+ { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i8, 2 },
+ { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i16, 2 },
+ { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 2 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 5 },
+ { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 2 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 },
+ { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 12 },
+ { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 26 },
+
+ { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 },
+ { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
+ { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 1 },
+ { ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f32, 1 },
+ };
+
+ static const TypeConversionCostTblEntry AVX2ConversionTbl[] = {
+ { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
+ { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 3 },
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 3 },
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 3 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 3 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 3 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 3 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
+
+ { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 2 },
+ { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2 },
+ { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 2 },
+ { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2 },
+ { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 2 },
+ { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 4 },
+
+ { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 3 },
+ { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 3 },
+
+ { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 8 },
+ };
+
+ static const TypeConversionCostTblEntry AVXConversionTbl[] = {
+ { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
+ { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 7 },
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 4 },
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 7 },
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 4 },
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 4 },
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 4 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 6 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 4 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 6 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 4 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 6 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 4 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 4 },
+
+ { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 4 },
+ { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 4 },
+ { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 4 },
+ { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 4 },
+ { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 },
+ { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 4 },
+ { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 9 },
+
+ { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i1, 8 },
+ { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 8 },
+ { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 5 },
+ { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 },
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 3 },
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
+ { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i1, 3 },
+ { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i8, 3 },
+ { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i16, 3 },
+ { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 },
+
+ { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i1, 6 },
+ { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 5 },
+ { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 5 },
+ { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 9 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 7 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 2 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 6 },
+ { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i1, 7 },
+ { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i8, 2 },
+ { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i16, 2 },
+ { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 6 },
+ // The generic code to compute the scalar overhead is currently broken.
+    // Work around this limitation by estimating the scalarization overhead
+ // here. We have roughly 10 instructions per scalar element.
+ // Multiply that by the vector width.
+ // FIXME: remove that when PR19268 is fixed.
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 },
+ { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 4*10 },
+
+ { ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f32, 7 },
+ { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 1 },
+    // This node is expanded into scalarized operations, but BasicTTI is
+    // overly optimistic when estimating its cost. It computes 3 per element
+    // (one vector-extract, one scalar conversion and one vector-insert). The
+    // problem is that the inserts form a read-modify-write chain, so latency
+    // should be factored in too. Inflate the cost per element by 1.
+ { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 8*4 },
+ { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 4*4 },
+ };
+
+ static const TypeConversionCostTblEntry SSE41ConversionTbl[] = {
+ { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 4 },
+ { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 4 },
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
+ { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
+ { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
+ { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 4 },
+ { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 4 },
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 2 },
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 2 },
+ { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 1 },
+ { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 1 },
+ { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
+ { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
+ { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
+ { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
+ { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i8, 1 },
+ { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i8, 2 },
+
+ { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 6 },
+ { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 3 },
+ { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 },
+ { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 30 },
+ { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 },
+ { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 1 },
+ { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 },
+ { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 1 },
+ { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 2 },
+ };
+
+ static const TypeConversionCostTblEntry SSE2ConversionTbl[] = {
+ // These are somewhat magic numbers justified by looking at the output of
+ // Intel's IACA, running some kernels and making sure when we take
+ // legalization into account the throughput will be overestimated.
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
+ { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 },
+ { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 },
+ { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 },
+ { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
+ // There are faster sequences for float conversions.
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 8 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 },
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 },
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 15 },
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 },
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 },
+
+ { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 6 },
+ { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 8 },
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 3 },
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 4 },
+ { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
+ { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 2 },
+ { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 9 },
+ { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 12 },
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 6 },
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 6 },
+ { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
+ { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 3 },
+ { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 3 },
+ { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
+ { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
+ { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 2 },
+ { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i8, 1 },
+ { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i8, 6 },
+
+ { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 10 },
+ { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 },
+ { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 3 },
+ { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 7 },
+ { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 4 },
+ { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 3 },
+ { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 },
+ { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2 },
+ { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 4 },
+ };
+
+ std::pair<int, MVT> LTSrc = TLI->getTypeLegalizationCost(DL, Src);
+ std::pair<int, MVT> LTDest = TLI->getTypeLegalizationCost(DL, Dst);
+
+ if (ST->hasSSE2() && !ST->hasAVX()) {
+ if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
+ LTDest.second, LTSrc.second))
+ return LTSrc.first * Entry->Cost;
+ }
+
+ EVT SrcTy = TLI->getValueType(DL, Src);
+ EVT DstTy = TLI->getValueType(DL, Dst);
+
+ // The function getSimpleVT only handles simple value types.
+ if (!SrcTy.isSimple() || !DstTy.isSimple())
+ return BaseT::getCastInstrCost(Opcode, Dst, Src);
+
+ if (ST->hasDQI())
+ if (const auto *Entry = ConvertCostTableLookup(AVX512DQConversionTbl, ISD,
+ DstTy.getSimpleVT(),
+ SrcTy.getSimpleVT()))
+ return Entry->Cost;
+
+ if (ST->hasAVX512())
+ if (const auto *Entry = ConvertCostTableLookup(AVX512FConversionTbl, ISD,
+ DstTy.getSimpleVT(),
+ SrcTy.getSimpleVT()))
+ return Entry->Cost;
+
+ if (ST->hasAVX2()) {
+ if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
+ DstTy.getSimpleVT(),
+ SrcTy.getSimpleVT()))
+ return Entry->Cost;
+ }
+
+ if (ST->hasAVX()) {
+ if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
+ DstTy.getSimpleVT(),
+ SrcTy.getSimpleVT()))
+ return Entry->Cost;
+ }
+
+ if (ST->hasSSE41()) {
+ if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
+ DstTy.getSimpleVT(),
+ SrcTy.getSimpleVT()))
+ return Entry->Cost;
+ }
+
+ if (ST->hasSSE2()) {
+ if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
+ DstTy.getSimpleVT(),
+ SrcTy.getSimpleVT()))
+ return Entry->Cost;
+ }
+
+ return BaseT::getCastInstrCost(Opcode, Dst, Src);
+}
+
+int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) {
+ // Legalize the type.
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
+
+ MVT MTy = LT.second;
+
+ int ISD = TLI->InstructionOpcodeToISD(Opcode);
+ assert(ISD && "Invalid opcode");
+
+ static const CostTblEntry SSE42CostTbl[] = {
+ { ISD::SETCC, MVT::v2f64, 1 },
+ { ISD::SETCC, MVT::v4f32, 1 },
+ { ISD::SETCC, MVT::v2i64, 1 },
+ { ISD::SETCC, MVT::v4i32, 1 },
+ { ISD::SETCC, MVT::v8i16, 1 },
+ { ISD::SETCC, MVT::v16i8, 1 },
+ };
+
+ static const CostTblEntry AVX1CostTbl[] = {
+ { ISD::SETCC, MVT::v4f64, 1 },
+ { ISD::SETCC, MVT::v8f32, 1 },
+ // AVX1 does not support 8-wide integer compare.
+ { ISD::SETCC, MVT::v4i64, 4 },
+ { ISD::SETCC, MVT::v8i32, 4 },
+ { ISD::SETCC, MVT::v16i16, 4 },
+ { ISD::SETCC, MVT::v32i8, 4 },
+ };
+
+ static const CostTblEntry AVX2CostTbl[] = {
+ { ISD::SETCC, MVT::v4i64, 1 },
+ { ISD::SETCC, MVT::v8i32, 1 },
+ { ISD::SETCC, MVT::v16i16, 1 },
+ { ISD::SETCC, MVT::v32i8, 1 },
+ };
+
+ static const CostTblEntry AVX512CostTbl[] = {
+ { ISD::SETCC, MVT::v8i64, 1 },
+ { ISD::SETCC, MVT::v16i32, 1 },
+ { ISD::SETCC, MVT::v8f64, 1 },
+ { ISD::SETCC, MVT::v16f32, 1 },
+ };
+
+ if (ST->hasAVX512())
+ if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+
+ if (ST->hasAVX2())
+ if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+
+ if (ST->hasAVX())
+ if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+
+ if (ST->hasSSE42())
+ if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+
+ return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy);
+}
+
+int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
+ assert(Val->isVectorTy() && "This must be a vector type");
+
+ if (Index != -1U) {
+ // Legalize the type.
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);
+
+ // This type is legalized to a scalar type.
+ if (!LT.second.isVector())
+ return 0;
+
+ // The type may be split. Normalize the index to the new type.
+ unsigned Width = LT.second.getVectorNumElements();
+ Index = Index % Width;
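+    // e.g. element 5 of a v8i32 that splits into two v4i32 halves becomes
+    // element 1 of the second half.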
+
+ // Floating point scalars are already located in index #0.
+ if (Val->getScalarType()->isFloatingPointTy() && Index == 0)
+ return 0;
+ }
+
+ return BaseT::getVectorInstrCost(Opcode, Val, Index);
+}
+
+int X86TTIImpl::getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) {
+  assert(Ty->isVectorTy() && "Can only scalarize vectors");
+ int Cost = 0;
+
+ for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) {
+ if (Insert)
+ Cost += getVectorInstrCost(Instruction::InsertElement, Ty, i);
+ if (Extract)
+ Cost += getVectorInstrCost(Instruction::ExtractElement, Ty, i);
+ }
+
+ return Cost;
+}
+
+int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+ unsigned AddressSpace) {
+ // Handle non-power-of-two vectors such as <3 x float>
+ if (VectorType *VTy = dyn_cast<VectorType>(Src)) {
+ unsigned NumElem = VTy->getVectorNumElements();
+
+ // Handle a few common cases:
+ // <3 x float>
+ if (NumElem == 3 && VTy->getScalarSizeInBits() == 32)
+ // Cost = 64 bit store + extract + 32 bit store.
+ return 3;
+
+ // <3 x double>
+ if (NumElem == 3 && VTy->getScalarSizeInBits() == 64)
+ // Cost = 128 bit store + unpack + 64 bit store.
+ return 3;
+
+ // Assume that all other non-power-of-two numbers are scalarized.
+ if (!isPowerOf2_32(NumElem)) {
+ int Cost = BaseT::getMemoryOpCost(Opcode, VTy->getScalarType(), Alignment,
+ AddressSpace);
+ int SplitCost = getScalarizationOverhead(Src, Opcode == Instruction::Load,
+ Opcode == Instruction::Store);
+ return NumElem * Cost + SplitCost;
+ }
+ }
+
+ // Legalize the type.
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
+ assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
+ "Invalid Opcode");
+
+ // Each load/store unit costs 1.
+ int Cost = LT.first * 1;
+
+  // On Sandybridge, 256-bit load/stores are double-pumped
+  // (but not on Haswell).
+  if (LT.second.getSizeInBits() > 128 && !ST->hasAVX2())
+    Cost *= 2;
+
+ return Cost;
+}
+
+int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
+ unsigned Alignment,
+ unsigned AddressSpace) {
+ VectorType *SrcVTy = dyn_cast<VectorType>(SrcTy);
+ if (!SrcVTy)
+    // For a scalar, take the regular cost without the mask.
+ return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace);
+
+ unsigned NumElem = SrcVTy->getVectorNumElements();
+ VectorType *MaskTy =
+ VectorType::get(Type::getInt8Ty(getGlobalContext()), NumElem);
+ if ((Opcode == Instruction::Load && !isLegalMaskedLoad(SrcVTy)) ||
+ (Opcode == Instruction::Store && !isLegalMaskedStore(SrcVTy)) ||
+ !isPowerOf2_32(NumElem)) {
+ // Scalarization
+ int MaskSplitCost = getScalarizationOverhead(MaskTy, false, true);
+ int ScalarCompareCost = getCmpSelInstrCost(
+ Instruction::ICmp, Type::getInt8Ty(getGlobalContext()), nullptr);
+ int BranchCost = getCFInstrCost(Instruction::Br);
+ int MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
+
+ int ValueSplitCost = getScalarizationOverhead(
+ SrcVTy, Opcode == Instruction::Load, Opcode == Instruction::Store);
+ int MemopCost =
+ NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
+ Alignment, AddressSpace);
+ return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
+ }
+
+ // Legalize the type.
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, SrcVTy);
+ auto VT = TLI->getValueType(DL, SrcVTy);
+ int Cost = 0;
+ if (VT.isSimple() && LT.second != VT.getSimpleVT() &&
+ LT.second.getVectorNumElements() == NumElem)
+ // Promotion requires expand/truncate for data and a shuffle for mask.
+ Cost += getShuffleCost(TTI::SK_Alternate, SrcVTy, 0, nullptr) +
+ getShuffleCost(TTI::SK_Alternate, MaskTy, 0, nullptr);
+
+ else if (LT.second.getVectorNumElements() > NumElem) {
+ VectorType *NewMaskTy = VectorType::get(MaskTy->getVectorElementType(),
+ LT.second.getVectorNumElements());
+    // Expanding requires filling the mask with zeroes.
+ Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, 0, MaskTy);
+ }
+  if (!ST->hasAVX512())
+    return Cost + LT.first * 4; // Each maskmov costs 4.
+
+  // AVX-512 masked load/store is cheaper.
+  return Cost + LT.first;
+}
+
+int X86TTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) {
+ // Address computations in vectorized code with non-consecutive addresses will
+ // likely result in more instructions compared to scalar code where the
+ // computation can more often be merged into the index mode. The resulting
+ // extra micro-ops can significantly decrease throughput.
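+  // e.g. a vectorized gather of A[B[i]] must compute a vector of addresses,
+  // while the scalar loop folds the index into the addressing mode.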
+ unsigned NumVectorInstToHideOverhead = 10;
+
+ if (Ty->isVectorTy() && IsComplex)
+ return NumVectorInstToHideOverhead;
+
+ return BaseT::getAddressComputationCost(Ty, IsComplex);
+}
+
+int X86TTIImpl::getReductionCost(unsigned Opcode, Type *ValTy,
+ bool IsPairwise) {
+
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
+
+ MVT MTy = LT.second;
+
+ int ISD = TLI->InstructionOpcodeToISD(Opcode);
+ assert(ISD && "Invalid opcode");
+
+  // We use the Intel Architecture Code Analyzer (IACA) to measure the
+  // throughput and use that as the cost.
+
+ static const CostTblEntry SSE42CostTblPairWise[] = {
+ { ISD::FADD, MVT::v2f64, 2 },
+ { ISD::FADD, MVT::v4f32, 4 },
+ { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
+ { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.5".
+ { ISD::ADD, MVT::v8i16, 5 },
+ };
+
+ static const CostTblEntry AVX1CostTblPairWise[] = {
+ { ISD::FADD, MVT::v4f32, 4 },
+ { ISD::FADD, MVT::v4f64, 5 },
+ { ISD::FADD, MVT::v8f32, 7 },
+ { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
+ { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.5".
+ { ISD::ADD, MVT::v4i64, 5 }, // The data reported by the IACA tool is "4.8".
+ { ISD::ADD, MVT::v8i16, 5 },
+ { ISD::ADD, MVT::v8i32, 5 },
+ };
+
+ static const CostTblEntry SSE42CostTblNoPairWise[] = {
+ { ISD::FADD, MVT::v2f64, 2 },
+ { ISD::FADD, MVT::v4f32, 4 },
+ { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
+ { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.3".
+ { ISD::ADD, MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3".
+ };
+
+ static const CostTblEntry AVX1CostTblNoPairWise[] = {
+ { ISD::FADD, MVT::v4f32, 3 },
+ { ISD::FADD, MVT::v4f64, 3 },
+ { ISD::FADD, MVT::v8f32, 4 },
+ { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
+ { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "2.8".
+ { ISD::ADD, MVT::v4i64, 3 },
+ { ISD::ADD, MVT::v8i16, 4 },
+ { ISD::ADD, MVT::v8i32, 5 },
+ };
+
+ if (IsPairwise) {
+ if (ST->hasAVX())
+ if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy))
+ return LT.first * Entry->Cost;
+
+ if (ST->hasSSE42())
+ if (const auto *Entry = CostTableLookup(SSE42CostTblPairWise, ISD, MTy))
+ return LT.first * Entry->Cost;
+ } else {
+ if (ST->hasAVX())
+ if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
+ return LT.first * Entry->Cost;
+
+ if (ST->hasSSE42())
+ if (const auto *Entry = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy))
+ return LT.first * Entry->Cost;
+ }
+
+ return BaseT::getReductionCost(Opcode, ValTy, IsPairwise);
+}
+
+/// \brief Calculate the cost of materializing a 64-bit value. This helper
+/// method might only calculate a fraction of a larger immediate. Therefore it
+/// is valid to return a cost of ZERO.
+int X86TTIImpl::getIntImmCost(int64_t Val) {
+ if (Val == 0)
+ return TTI::TCC_Free;
+
+ if (isInt<32>(Val))
+ return TTI::TCC_Basic;
+
+ return 2 * TTI::TCC_Basic;
+}
+
+int X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
+ assert(Ty->isIntegerTy());
+
+ unsigned BitSize = Ty->getPrimitiveSizeInBits();
+ if (BitSize == 0)
+ return ~0U;
+
+  // Never hoist constants larger than 128 bits, because this might lead to
+  // incorrect code generation or assertions in codegen.
+  // FIXME: Create a cost model for types larger than i128 once the codegen
+  // issues have been fixed.
+ if (BitSize > 128)
+ return TTI::TCC_Free;
+
+ if (Imm == 0)
+ return TTI::TCC_Free;
+
+ // Sign-extend all constants to a multiple of 64-bit.
+ APInt ImmVal = Imm;
+ if (BitSize & 0x3f)
+ ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
+
+ // Split the constant into 64-bit chunks and calculate the cost for each
+ // chunk.
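+  // e.g. a 96-bit immediate is sign-extended to 128 bits and priced as two
+  // 64-bit chunks.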
+ int Cost = 0;
+ for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
+ APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
+ int64_t Val = Tmp.getSExtValue();
+ Cost += getIntImmCost(Val);
+ }
+  // We need at least one instruction to materialize the constant.
+ return std::max(1, Cost);
+}
+
+int X86TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
+ Type *Ty) {
+ assert(Ty->isIntegerTy());
+
+ unsigned BitSize = Ty->getPrimitiveSizeInBits();
+ // There is no cost model for constants with a bit size of 0. Return TCC_Free
+ // here, so that constant hoisting will ignore this constant.
+ if (BitSize == 0)
+ return TTI::TCC_Free;
+
+ unsigned ImmIdx = ~0U;
+ switch (Opcode) {
+ default:
+ return TTI::TCC_Free;
+ case Instruction::GetElementPtr:
+ // Always hoist the base address of a GetElementPtr. This prevents the
+ // creation of new constants for every base constant that gets constant
+ // folded with the offset.
+ if (Idx == 0)
+ return 2 * TTI::TCC_Basic;
+ return TTI::TCC_Free;
+ case Instruction::Store:
+ ImmIdx = 0;
+ break;
+ case Instruction::ICmp:
+ // This is an imperfect hack to prevent constant hoisting of
+ // compares that might be trying to check if a 64-bit value fits in
+ // 32-bits. The backend can optimize these cases using a right shift by 32.
+    // Ideally we would check the compare predicate here. There are also
+    // other similar immediates the backend can use shifts for.
+ if (Idx == 1 && Imm.getBitWidth() == 64) {
+ uint64_t ImmVal = Imm.getZExtValue();
+ if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff)
+ return TTI::TCC_Free;
+ }
+ ImmIdx = 1;
+ break;
+ case Instruction::And:
+ // We support 64-bit ANDs with immediates with 32-bits of leading zeroes
+ // by using a 32-bit operation with implicit zero extension. Detect such
+ // immediates here as the normal path expects bit 31 to be sign extended.
+ if (Idx == 1 && Imm.getBitWidth() == 64 && isUInt<32>(Imm.getZExtValue()))
+ return TTI::TCC_Free;
+ // Fallthrough
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Mul:
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::Or:
+ case Instruction::Xor:
+ ImmIdx = 1;
+ break;
+ // Always return TCC_Free for the shift value of a shift instruction.
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ if (Idx == 1)
+ return TTI::TCC_Free;
+ break;
+ case Instruction::Trunc:
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::IntToPtr:
+ case Instruction::PtrToInt:
+ case Instruction::BitCast:
+ case Instruction::PHI:
+ case Instruction::Call:
+ case Instruction::Select:
+ case Instruction::Ret:
+ case Instruction::Load:
+ break;
+ }
+
+ if (Idx == ImmIdx) {
+ int NumConstants = (BitSize + 63) / 64;
+ int Cost = X86TTIImpl::getIntImmCost(Imm, Ty);
+ return (Cost <= NumConstants * TTI::TCC_Basic)
+ ? static_cast<int>(TTI::TCC_Free)
+ : Cost;
+ }
+
+ return X86TTIImpl::getIntImmCost(Imm, Ty);
+}
+
+int X86TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
+ Type *Ty) {
+ assert(Ty->isIntegerTy());
+
+ unsigned BitSize = Ty->getPrimitiveSizeInBits();
+ // There is no cost model for constants with a bit size of 0. Return TCC_Free
+ // here, so that constant hoisting will ignore this constant.
+ if (BitSize == 0)
+ return TTI::TCC_Free;
+
+ switch (IID) {
+ default:
+ return TTI::TCC_Free;
+ case Intrinsic::sadd_with_overflow:
+ case Intrinsic::uadd_with_overflow:
+ case Intrinsic::ssub_with_overflow:
+ case Intrinsic::usub_with_overflow:
+ case Intrinsic::smul_with_overflow:
+ case Intrinsic::umul_with_overflow:
+ if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<32>(Imm.getSExtValue()))
+ return TTI::TCC_Free;
+ break;
+ case Intrinsic::experimental_stackmap:
+ if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
+ return TTI::TCC_Free;
+ break;
+ case Intrinsic::experimental_patchpoint_void:
+ case Intrinsic::experimental_patchpoint_i64:
+ if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
+ return TTI::TCC_Free;
+ break;
+ }
+ return X86TTIImpl::getIntImmCost(Imm, Ty);
+}
+
+// Return the average cost of a gather/scatter instruction; this may be
+// refined later.
+int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, Value *Ptr,
+ unsigned Alignment, unsigned AddressSpace) {
+
+ assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost");
+ unsigned VF = SrcVTy->getVectorNumElements();
+
+  // Try to reduce the index size from 64 bits (the default for GEP) to 32.
+  // This is essential for VF 16. If the index can't be reduced to 32 bits,
+  // the operation will use 16 x 64-bit indices, which do not fit in a zmm
+  // register and need to be split. Also check that the base pointer is the
+  // same for all lanes, and that there's at most one variable index.
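+  // e.g. a GEP whose single variable index is sign-extended from i32 can
+  // keep 32-bit indices, so a VF-16 gather fits in one zmm register.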
+ auto getIndexSizeInBits = [](Value *Ptr, const DataLayout& DL) {
+ unsigned IndexSize = DL.getPointerSizeInBits();
+ GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
+ if (IndexSize < 64 || !GEP)
+ return IndexSize;
+
+ unsigned NumOfVarIndices = 0;
+ Value *Ptrs = GEP->getPointerOperand();
+ if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs))
+ return IndexSize;
+ for (unsigned i = 1; i < GEP->getNumOperands(); ++i) {
+ if (isa<Constant>(GEP->getOperand(i)))
+ continue;
+ Type *IndxTy = GEP->getOperand(i)->getType();
+ if (IndxTy->isVectorTy())
+ IndxTy = IndxTy->getVectorElementType();
+ if ((IndxTy->getPrimitiveSizeInBits() == 64 &&
+ !isa<SExtInst>(GEP->getOperand(i))) ||
+ ++NumOfVarIndices > 1)
+ return IndexSize; // 64
+ }
+ return (unsigned)32;
+ };
+
+  // Try to reduce IndexSize to 32 bits for a 16-element vector. By default
+  // the IndexSize is equal to the pointer size.
+ unsigned IndexSize = (VF >= 16) ? getIndexSizeInBits(Ptr, DL) :
+ DL.getPointerSizeInBits();
+
+ Type *IndexVTy = VectorType::get(IntegerType::get(getGlobalContext(),
+ IndexSize), VF);
+ std::pair<int, MVT> IdxsLT = TLI->getTypeLegalizationCost(DL, IndexVTy);
+ std::pair<int, MVT> SrcLT = TLI->getTypeLegalizationCost(DL, SrcVTy);
+ int SplitFactor = std::max(IdxsLT.first, SrcLT.first);
+ if (SplitFactor > 1) {
+ // Handle splitting of vector of pointers
+ Type *SplitSrcTy = VectorType::get(SrcVTy->getScalarType(), VF / SplitFactor);
+ return SplitFactor * getGSVectorCost(Opcode, SplitSrcTy, Ptr, Alignment,
+ AddressSpace);
+ }
+
+  // The gather / scatter cost is given by Intel architects. It is a rough
+  // number since we are looking at one instruction at a time.
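+  // For example (illustrative): with VF == 16 and a scalar load cost of 1,
+  // the estimated gather cost below is 2 + 16 * 1 = 18.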
+ const int GSOverhead = 2;
+ return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
+ Alignment, AddressSpace);
+}
+
+/// Return the cost of full scalarization of a gather/scatter operation.
+///
+/// Opcode - Load or Store instruction.
+/// SrcVTy - The type of the data vector that should be gathered or scattered.
+/// VariableMask - The mask is non-constant at compile time.
+/// Alignment - Alignment for one element.
+/// AddressSpace - The address space of the pointer(s).
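+///
+/// The returned cost is the sum of the scalar memory accesses, the mask
+/// unpacking cost (when the mask is variable), and the vector insert/extract
+/// overhead.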
+///
+int X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy,
+ bool VariableMask, unsigned Alignment,
+ unsigned AddressSpace) {
+ unsigned VF = SrcVTy->getVectorNumElements();
+
+ int MaskUnpackCost = 0;
+ if (VariableMask) {
+ VectorType *MaskTy =
+ VectorType::get(Type::getInt1Ty(getGlobalContext()), VF);
+ MaskUnpackCost = getScalarizationOverhead(MaskTy, false, true);
+ int ScalarCompareCost =
+ getCmpSelInstrCost(Instruction::ICmp, Type::getInt1Ty(getGlobalContext()),
+ nullptr);
+ int BranchCost = getCFInstrCost(Instruction::Br);
+ MaskUnpackCost += VF * (BranchCost + ScalarCompareCost);
+ }
+
+ // The cost of the scalar loads/stores.
+ int MemoryOpCost = VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
+ Alignment, AddressSpace);
+
+ int InsertExtractCost = 0;
+ if (Opcode == Instruction::Load)
+ for (unsigned i = 0; i < VF; ++i)
+ // Add the cost of inserting each scalar load into the vector
+ InsertExtractCost +=
+ getVectorInstrCost(Instruction::InsertElement, SrcVTy, i);
+ else
+ for (unsigned i = 0; i < VF; ++i)
+ // Add the cost of extracting each element out of the data vector
+ InsertExtractCost +=
+ getVectorInstrCost(Instruction::ExtractElement, SrcVTy, i);
+
+ return MemoryOpCost + MaskUnpackCost + InsertExtractCost;
+}
+
+/// Calculate the cost of Gather / Scatter operation
+int X86TTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *SrcVTy,
+ Value *Ptr, bool VariableMask,
+ unsigned Alignment) {
+ assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
+ unsigned VF = SrcVTy->getVectorNumElements();
+ PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
+ if (!PtrTy && Ptr->getType()->isVectorTy())
+ PtrTy = dyn_cast<PointerType>(Ptr->getType()->getVectorElementType());
+ assert(PtrTy && "Unexpected type for Ptr argument");
+ unsigned AddressSpace = PtrTy->getAddressSpace();
+
+ bool Scalarize = false;
+ if ((Opcode == Instruction::Load && !isLegalMaskedGather(SrcVTy)) ||
+ (Opcode == Instruction::Store && !isLegalMaskedScatter(SrcVTy)))
+ Scalarize = true;
+  // Gather / Scatter for vector 2 is not profitable on KNL / SKX.
+  // A vector-4 gather/scatter instruction does not exist on KNL.
+  // We could extend it to 8 elements, but zeroing the upper bits of
+  // the mask vector will add more instructions. Right now we give the
+  // scalar cost of vector-4 for KNL. TODO: check whether the gather/scatter
+  // instruction is better in the VariableMask case.
+ if (VF == 2 || (VF == 4 && !ST->hasVLX()))
+ Scalarize = true;
+
+ if (Scalarize)
+ return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment, AddressSpace);
+
+ return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace);
+}
+
+bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) {
+ Type *ScalarTy = DataTy->getScalarType();
+ int DataWidth = isa<PointerType>(ScalarTy) ?
+ DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits();
+
+ return (DataWidth >= 32 && ST->hasAVX2());
+}
+
+bool X86TTIImpl::isLegalMaskedStore(Type *DataType) {
+ return isLegalMaskedLoad(DataType);
+}
+
+bool X86TTIImpl::isLegalMaskedGather(Type *DataTy) {
+  // This function is currently called in two cases: from the Loop Vectorizer
+  // and from the Scalarizer.
+  // When the Loop Vectorizer asks about the legality of the feature,
+  // the vectorization factor is not calculated yet. The Loop Vectorizer
+  // sends a scalar type, and the decision is based on the width of the
+  // scalar element.
+  // Later on, the cost model will estimate the usage of this intrinsic based
+  // on the vector type.
+  // The Scalarizer asks again about legality. It sends a vector type.
+  // In this case we can reject non-power-of-2 vectors.
+ if (isa<VectorType>(DataTy) && !isPowerOf2_32(DataTy->getVectorNumElements()))
+ return false;
+ Type *ScalarTy = DataTy->getScalarType();
+ int DataWidth = isa<PointerType>(ScalarTy) ?
+ DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits();
+
+ // AVX-512 allows gather and scatter
+ return DataWidth >= 32 && ST->hasAVX512();
+}
+
+bool X86TTIImpl::isLegalMaskedScatter(Type *DataType) {
+ return isLegalMaskedGather(DataType);
+}
+
+bool X86TTIImpl::areInlineCompatible(const Function *Caller,
+ const Function *Callee) const {
+ const TargetMachine &TM = getTLI()->getTargetMachine();
+
+  // Treat this as a subset check on subtarget features: the caller must
+  // support every feature the callee requires.
+ const FeatureBitset &CallerBits =
+ TM.getSubtargetImpl(*Caller)->getFeatureBits();
+ const FeatureBitset &CalleeBits =
+ TM.getSubtargetImpl(*Callee)->getFeatureBits();
+
+ // FIXME: This is likely too limiting as it will include subtarget features
+ // that we might not care about for inlining, but it is conservatively
+ // correct.
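+  // For example (illustrative): a caller compiled for avx2+fma may inline a
+  // callee that requires only avx2, but not a callee that also requires
+  // avx512f.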
+ return (CallerBits & CalleeBits) == CalleeBits;
+}
diff --git a/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.h b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.h
new file mode 100644
index 0000000..adb745e
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -0,0 +1,109 @@
+//===-- X86TargetTransformInfo.h - X86 specific TTI -------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file declares a TargetTransformInfo::Concept conforming object
+/// specific to the X86 target machine. It uses the target's detailed
+/// information to provide more precise answers to certain TTI queries, while
+/// letting the target-independent and default TTI implementations handle the
+/// rest.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86TARGETTRANSFORMINFO_H
+#define LLVM_LIB_TARGET_X86_X86TARGETTRANSFORMINFO_H
+
+#include "X86.h"
+#include "X86TargetMachine.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/BasicTTIImpl.h"
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm {
+
+class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
+ typedef BasicTTIImplBase<X86TTIImpl> BaseT;
+ typedef TargetTransformInfo TTI;
+ friend BaseT;
+
+ const X86Subtarget *ST;
+ const X86TargetLowering *TLI;
+
+ int getScalarizationOverhead(Type *Ty, bool Insert, bool Extract);
+
+ const X86Subtarget *getST() const { return ST; }
+ const X86TargetLowering *getTLI() const { return TLI; }
+
+public:
+ explicit X86TTIImpl(const X86TargetMachine *TM, const Function &F)
+ : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
+ TLI(ST->getTargetLowering()) {}
+
+ // Provide value semantics. MSVC requires that we spell all of these out.
+ X86TTIImpl(const X86TTIImpl &Arg)
+ : BaseT(static_cast<const BaseT &>(Arg)), ST(Arg.ST), TLI(Arg.TLI) {}
+ X86TTIImpl(X86TTIImpl &&Arg)
+ : BaseT(std::move(static_cast<BaseT &>(Arg))), ST(std::move(Arg.ST)),
+ TLI(std::move(Arg.TLI)) {}
+
+ /// \name Scalar TTI Implementations
+ /// @{
+ TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth);
+
+ /// @}
+
+ /// \name Vector TTI Implementations
+ /// @{
+
+ unsigned getNumberOfRegisters(bool Vector);
+ unsigned getRegisterBitWidth(bool Vector);
+ unsigned getMaxInterleaveFactor(unsigned VF);
+ int getArithmeticInstrCost(
+ unsigned Opcode, Type *Ty,
+ TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
+ TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
+ TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
+ TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None);
+ int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp);
+ int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src);
+ int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy);
+ int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
+ int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+ unsigned AddressSpace);
+ int getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+ unsigned AddressSpace);
+ int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr,
+ bool VariableMask, unsigned Alignment);
+ int getAddressComputationCost(Type *PtrTy, bool IsComplex);
+
+ int getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm);
+
+ int getIntImmCost(int64_t);
+
+ int getIntImmCost(const APInt &Imm, Type *Ty);
+
+ int getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty);
+ int getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
+ Type *Ty);
+ bool isLegalMaskedLoad(Type *DataType);
+ bool isLegalMaskedStore(Type *DataType);
+ bool isLegalMaskedGather(Type *DataType);
+ bool isLegalMaskedScatter(Type *DataType);
+ bool areInlineCompatible(const Function *Caller,
+ const Function *Callee) const;
+private:
+ int getGSScalarCost(unsigned Opcode, Type *DataTy, bool VariableMask,
+ unsigned Alignment, unsigned AddressSpace);
+ int getGSVectorCost(unsigned Opcode, Type *DataTy, Value *Ptr,
+ unsigned Alignment, unsigned AddressSpace);
+
+ /// @}
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86VZeroUpper.cpp b/contrib/llvm/lib/Target/X86/X86VZeroUpper.cpp
new file mode 100644
index 0000000..6925b27
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86VZeroUpper.cpp
@@ -0,0 +1,320 @@
+//===-- X86VZeroUpper.cpp - AVX vzeroupper instruction inserter -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the pass which inserts x86 AVX vzeroupper instructions
+// before calls to SSE encoded functions. This avoids transition latency
+// penalty when transferring control between AVX encoded instructions and old
+// SSE encoding mode.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-vzeroupper"
+
+STATISTIC(NumVZU, "Number of vzeroupper instructions inserted");
+
+namespace {
+
+ class VZeroUpperInserter : public MachineFunctionPass {
+ public:
+
+ VZeroUpperInserter() : MachineFunctionPass(ID) {}
+ bool runOnMachineFunction(MachineFunction &MF) override;
+    const char *getPassName() const override { return "X86 vzeroupper inserter"; }
+
+ private:
+
+ void processBasicBlock(MachineBasicBlock &MBB);
+ void insertVZeroUpper(MachineBasicBlock::iterator I,
+ MachineBasicBlock &MBB);
+ void addDirtySuccessor(MachineBasicBlock &MBB);
+
+ typedef enum { PASS_THROUGH, EXITS_CLEAN, EXITS_DIRTY } BlockExitState;
+ static const char* getBlockExitStateName(BlockExitState ST);
+
+ // Core algorithm state:
+ // BlockState - Each block is either:
+ // - PASS_THROUGH: There are neither YMM dirtying instructions nor
+ // vzeroupper instructions in this block.
+ // - EXITS_CLEAN: There is (or will be) a vzeroupper instruction in this
+ // block that will ensure that YMM is clean on exit.
+ // - EXITS_DIRTY: An instruction in the block dirties YMM and no
+ // subsequent vzeroupper in the block clears it.
+ //
+ // AddedToDirtySuccessors - This flag is raised when a block is added to the
+ // DirtySuccessors list to ensure that it's not
+ // added multiple times.
+ //
+  // FirstUnguardedCall - Records the location of the first unguarded call in
+  //                      each basic block that may need to be guarded by a
+  //                      vzeroupper. We won't know whether it actually needs
+  //                      to be guarded until we discover a predecessor that
+  //                      is EXITS_DIRTY.
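+  // For example (illustrative): if a predecessor exits dirty and this block's
+  // first call was left unguarded, a vzeroupper is inserted at
+  // FirstUnguardedCall when the block is pulled off the DirtySuccessors
+  // worklist.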
+ struct BlockState {
+ BlockState() : ExitState(PASS_THROUGH), AddedToDirtySuccessors(false) {}
+ BlockExitState ExitState;
+ bool AddedToDirtySuccessors;
+ MachineBasicBlock::iterator FirstUnguardedCall;
+ };
+ typedef SmallVector<BlockState, 8> BlockStateMap;
+ typedef SmallVector<MachineBasicBlock*, 8> DirtySuccessorsWorkList;
+
+ BlockStateMap BlockStates;
+ DirtySuccessorsWorkList DirtySuccessors;
+ bool EverMadeChange;
+ const TargetInstrInfo *TII;
+
+ static char ID;
+ };
+
+ char VZeroUpperInserter::ID = 0;
+}
+
+FunctionPass *llvm::createX86IssueVZeroUpperPass() {
+ return new VZeroUpperInserter();
+}
+
+const char* VZeroUpperInserter::getBlockExitStateName(BlockExitState ST) {
+ switch (ST) {
+ case PASS_THROUGH: return "Pass-through";
+ case EXITS_DIRTY: return "Exits-dirty";
+ case EXITS_CLEAN: return "Exits-clean";
+ }
+ llvm_unreachable("Invalid block exit state.");
+}
+
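+// Only YMM0-YMM15 need to be checked: the pass bails out early on AVX-512
+// targets (see runOnMachineFunction), so YMM16-YMM31 never occur here.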
+static bool isYmmReg(unsigned Reg) {
+ return (Reg >= X86::YMM0 && Reg <= X86::YMM15);
+}
+
+static bool checkFnHasLiveInYmm(MachineRegisterInfo &MRI) {
+ for (MachineRegisterInfo::livein_iterator I = MRI.livein_begin(),
+ E = MRI.livein_end(); I != E; ++I)
+ if (isYmmReg(I->first))
+ return true;
+
+ return false;
+}
+
+static bool clobbersAllYmmRegs(const MachineOperand &MO) {
+ for (unsigned reg = X86::YMM0; reg <= X86::YMM15; ++reg) {
+ if (!MO.clobbersPhysReg(reg))
+ return false;
+ }
+ return true;
+}
+
+static bool hasYmmReg(MachineInstr *MI) {
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+ if (MI->isCall() && MO.isRegMask() && !clobbersAllYmmRegs(MO))
+ return true;
+ if (!MO.isReg())
+ continue;
+ if (MO.isDebug())
+ continue;
+ if (isYmmReg(MO.getReg()))
+ return true;
+ }
+ return false;
+}
+
+/// callClobbersAnyYmmReg - Check if any YMM register will be clobbered by
+/// this call instruction.
+static bool callClobbersAnyYmmReg(MachineInstr *MI) {
+ assert(MI->isCall() && "Can only be called on call instructions.");
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+ if (!MO.isRegMask())
+ continue;
+ for (unsigned reg = X86::YMM0; reg <= X86::YMM15; ++reg) {
+ if (MO.clobbersPhysReg(reg))
+ return true;
+ }
+ }
+ return false;
+}
+
+// Insert a vzeroupper instruction before I.
+void VZeroUpperInserter::insertVZeroUpper(MachineBasicBlock::iterator I,
+ MachineBasicBlock &MBB) {
+ DebugLoc dl = I->getDebugLoc();
+ BuildMI(MBB, I, dl, TII->get(X86::VZEROUPPER));
+ ++NumVZU;
+ EverMadeChange = true;
+}
+
+// Add MBB to the DirtySuccessors list if it hasn't already been added.
+void VZeroUpperInserter::addDirtySuccessor(MachineBasicBlock &MBB) {
+ if (!BlockStates[MBB.getNumber()].AddedToDirtySuccessors) {
+ DirtySuccessors.push_back(&MBB);
+ BlockStates[MBB.getNumber()].AddedToDirtySuccessors = true;
+ }
+}
+
+/// processBasicBlock - Loop over all of the instructions in the basic block,
+/// inserting vzeroupper instructions before function calls.
+void VZeroUpperInserter::processBasicBlock(MachineBasicBlock &MBB) {
+
+  // Start by assuming that the block is PASS_THROUGH, which implies no
+  // unguarded calls.
+ BlockExitState CurState = PASS_THROUGH;
+ BlockStates[MBB.getNumber()].FirstUnguardedCall = MBB.end();
+
+ for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) {
+ MachineInstr *MI = I;
+ bool isControlFlow = MI->isCall() || MI->isReturn();
+
+ // Shortcut: don't need to check regular instructions in dirty state.
+ if (!isControlFlow && CurState == EXITS_DIRTY)
+ continue;
+
+ if (hasYmmReg(MI)) {
+ // We found a ymm-using instruction; this could be an AVX instruction,
+ // or it could be control flow.
+ CurState = EXITS_DIRTY;
+ continue;
+ }
+
+ // Check for control-flow out of the current function (which might
+ // indirectly execute SSE instructions).
+ if (!isControlFlow)
+ continue;
+
+    // If the call won't clobber any YMM register, skip it as well. This
+    // usually happens on helper function calls (such as '_chkstk', '_ftol2')
+    // where the standard calling convention is not used (RegMask is not used
+    // to mark registers clobbered, and register usage (def/imp-def/use) is
+    // well-defined and explicitly specified).
+ if (MI->isCall() && !callClobbersAnyYmmReg(MI))
+ continue;
+
+ // The VZEROUPPER instruction resets the upper 128 bits of all Intel AVX
+ // registers. This instruction has zero latency. In addition, the processor
+ // changes back to Clean state, after which execution of Intel SSE
+ // instructions or Intel AVX instructions has no transition penalty. Add
+ // the VZEROUPPER instruction before any function call/return that might
+ // execute SSE code.
+ // FIXME: In some cases, we may want to move the VZEROUPPER into a
+ // predecessor block.
+ if (CurState == EXITS_DIRTY) {
+      // After the inserted VZEROUPPER the state becomes clean again, but
+      // other YMM-using instructions may appear before subsequent calls or
+      // even before the end of the BB.
+ insertVZeroUpper(I, MBB);
+ CurState = EXITS_CLEAN;
+ } else if (CurState == PASS_THROUGH) {
+ // If this block is currently in pass-through state and we encounter a
+ // call then whether we need a vzeroupper or not depends on whether this
+ // block has successors that exit dirty. Record the location of the call,
+ // and set the state to EXITS_CLEAN, but do not insert the vzeroupper yet.
+ // It will be inserted later if necessary.
+ BlockStates[MBB.getNumber()].FirstUnguardedCall = I;
+ CurState = EXITS_CLEAN;
+ }
+ }
+
+ DEBUG(dbgs() << "MBB #" << MBB.getNumber() << " exit state: "
+ << getBlockExitStateName(CurState) << '\n');
+
+ if (CurState == EXITS_DIRTY)
+ for (MachineBasicBlock::succ_iterator SI = MBB.succ_begin(),
+ SE = MBB.succ_end();
+ SI != SE; ++SI)
+ addDirtySuccessor(**SI);
+
+ BlockStates[MBB.getNumber()].ExitState = CurState;
+}
+
+/// runOnMachineFunction - Loop over all of the basic blocks, inserting
+/// vzeroupper instructions before function calls.
+bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) {
+ const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
+ if (!ST.hasAVX() || ST.hasAVX512())
+ return false;
+ TII = ST.getInstrInfo();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ EverMadeChange = false;
+
+ bool FnHasLiveInYmm = checkFnHasLiveInYmm(MRI);
+
+ // Fast check: if the function doesn't use any ymm registers, we don't need
+ // to insert any VZEROUPPER instructions. This is constant-time, so it is
+ // cheap in the common case of no ymm use.
+ bool YMMUsed = FnHasLiveInYmm;
+ if (!YMMUsed) {
+ const TargetRegisterClass *RC = &X86::VR256RegClass;
+ for (TargetRegisterClass::iterator i = RC->begin(), e = RC->end(); i != e;
+ i++) {
+ if (!MRI.reg_nodbg_empty(*i)) {
+ YMMUsed = true;
+ break;
+ }
+ }
+ }
+ if (!YMMUsed) {
+ return false;
+ }
+
+ assert(BlockStates.empty() && DirtySuccessors.empty() &&
+ "X86VZeroUpper state should be clear");
+ BlockStates.resize(MF.getNumBlockIDs());
+
+ // Process all blocks. This will compute block exit states, record the first
+ // unguarded call in each block, and add successors of dirty blocks to the
+ // DirtySuccessors list.
+ for (MachineBasicBlock &MBB : MF)
+ processBasicBlock(MBB);
+
+  // If any YMM registers are live into this function, add the entry block to
+  // the DirtySuccessors list.
+ if (FnHasLiveInYmm)
+ addDirtySuccessor(MF.front());
+
+  // Re-visit all blocks that are successors of EXITS_DIRTY blocks. Add
+ // vzeroupper instructions to unguarded calls, and propagate EXITS_DIRTY
+ // through PASS_THROUGH blocks.
+ while (!DirtySuccessors.empty()) {
+ MachineBasicBlock &MBB = *DirtySuccessors.back();
+ DirtySuccessors.pop_back();
+ BlockState &BBState = BlockStates[MBB.getNumber()];
+
+ // MBB is a successor of a dirty block, so its first call needs to be
+ // guarded.
+ if (BBState.FirstUnguardedCall != MBB.end())
+ insertVZeroUpper(BBState.FirstUnguardedCall, MBB);
+
+ // If this successor was a pass-through block then it is now dirty, and its
+ // successors need to be added to the worklist (if they haven't been
+ // already).
+ if (BBState.ExitState == PASS_THROUGH) {
+ DEBUG(dbgs() << "MBB #" << MBB.getNumber()
+ << " was Pass-through, is now Dirty-out.\n");
+ for (MachineBasicBlock::succ_iterator SI = MBB.succ_begin(),
+ SE = MBB.succ_end();
+ SI != SE; ++SI)
+ addDirtySuccessor(**SI);
+ }
+ }
+
+ BlockStates.clear();
+ return EverMadeChange;
+}
diff --git a/contrib/llvm/lib/Target/X86/X86WinEHState.cpp b/contrib/llvm/lib/Target/X86/X86WinEHState.cpp
new file mode 100644
index 0000000..dce94a9
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86WinEHState.cpp
@@ -0,0 +1,456 @@
+//===-- X86WinEHState - Insert EH state updates for win32 exceptions ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// All functions using an MSVC EH personality use an explicitly updated state
+// number stored in an exception registration stack object. The registration
+// object is linked into a thread-local chain of registrations stored at fs:00.
+// This pass adds the registration object and EH state updates.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/WinEHFuncInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+#define DEBUG_TYPE "winehstate"
+
+namespace llvm { void initializeWinEHStatePassPass(PassRegistry &); }
+
+namespace {
+class WinEHStatePass : public FunctionPass {
+public:
+ static char ID; // Pass identification, replacement for typeid.
+
+ WinEHStatePass() : FunctionPass(ID) {
+ initializeWinEHStatePassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &Fn) override;
+
+ bool doInitialization(Module &M) override;
+
+ bool doFinalization(Module &M) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+ const char *getPassName() const override {
+ return "Windows 32-bit x86 EH state insertion";
+ }
+
+private:
+ void emitExceptionRegistrationRecord(Function *F);
+
+ void linkExceptionRegistration(IRBuilder<> &Builder, Function *Handler);
+ void unlinkExceptionRegistration(IRBuilder<> &Builder);
+ void addStateStores(Function &F, WinEHFuncInfo &FuncInfo);
+ void insertStateNumberStore(Value *ParentRegNode, Instruction *IP, int State);
+
+ Value *emitEHLSDA(IRBuilder<> &Builder, Function *F);
+
+ Function *generateLSDAInEAXThunk(Function *ParentFunc);
+
+ // Module-level type getters.
+ Type *getEHLinkRegistrationType();
+ Type *getSEHRegistrationType();
+ Type *getCXXEHRegistrationType();
+
+ // Per-module data.
+ Module *TheModule = nullptr;
+ StructType *EHLinkRegistrationTy = nullptr;
+ StructType *CXXEHRegistrationTy = nullptr;
+ StructType *SEHRegistrationTy = nullptr;
+ Function *FrameRecover = nullptr;
+ Function *FrameAddress = nullptr;
+ Function *FrameEscape = nullptr;
+
+ // Per-function state
+ EHPersonality Personality = EHPersonality::Unknown;
+ Function *PersonalityFn = nullptr;
+
+ /// The stack allocation containing all EH data, including the link in the
+ /// fs:00 chain and the current state.
+ AllocaInst *RegNode = nullptr;
+
+ /// Struct type of RegNode. Used for GEPing.
+ Type *RegNodeTy = nullptr;
+
+ /// The index of the state field of RegNode.
+ int StateFieldIndex = ~0U;
+
+ /// The linked list node subobject inside of RegNode.
+ Value *Link = nullptr;
+};
+}
+
+FunctionPass *llvm::createX86WinEHStatePass() { return new WinEHStatePass(); }
+
+char WinEHStatePass::ID = 0;
+
+INITIALIZE_PASS(WinEHStatePass, "x86-winehstate",
+ "Insert stores for EH state numbers", false, false)
+
+bool WinEHStatePass::doInitialization(Module &M) {
+ TheModule = &M;
+ FrameEscape = Intrinsic::getDeclaration(TheModule, Intrinsic::localescape);
+ FrameRecover = Intrinsic::getDeclaration(TheModule, Intrinsic::localrecover);
+ FrameAddress = Intrinsic::getDeclaration(TheModule, Intrinsic::frameaddress);
+ return false;
+}
+
+bool WinEHStatePass::doFinalization(Module &M) {
+ assert(TheModule == &M);
+ TheModule = nullptr;
+ EHLinkRegistrationTy = nullptr;
+ CXXEHRegistrationTy = nullptr;
+ SEHRegistrationTy = nullptr;
+ FrameEscape = nullptr;
+ FrameRecover = nullptr;
+ FrameAddress = nullptr;
+ return false;
+}
+
+void WinEHStatePass::getAnalysisUsage(AnalysisUsage &AU) const {
+ // This pass should only insert a stack allocation, memory accesses, and
+ // localrecovers.
+ AU.setPreservesCFG();
+}
+
+bool WinEHStatePass::runOnFunction(Function &F) {
+ // Check the personality. Do nothing if this personality doesn't use funclets.
+ if (!F.hasPersonalityFn())
+ return false;
+ PersonalityFn =
+ dyn_cast<Function>(F.getPersonalityFn()->stripPointerCasts());
+ if (!PersonalityFn)
+ return false;
+ Personality = classifyEHPersonality(PersonalityFn);
+ if (!isFuncletEHPersonality(Personality))
+ return false;
+
+ // Skip this function if there are no EH pads and we aren't using IR-level
+ // outlining.
+ bool HasPads = false;
+ for (BasicBlock &BB : F) {
+ if (BB.isEHPad()) {
+ HasPads = true;
+ break;
+ }
+ }
+ if (!HasPads)
+ return false;
+
+ // Disable frame pointer elimination in this function.
+ // FIXME: Do the nested handlers need to keep the parent ebp in ebp, or can we
+ // use an arbitrary register?
+ F.addFnAttr("no-frame-pointer-elim", "true");
+
+ emitExceptionRegistrationRecord(&F);
+
+ // The state numbers calculated here in IR must agree with what we calculate
+ // later on for the MachineFunction. In particular, if an IR pass deletes an
+ // unreachable EH pad after this point before machine CFG construction, we
+ // will be in trouble. If this assumption is ever broken, we should turn the
+ // numbers into an immutable analysis pass.
+ WinEHFuncInfo FuncInfo;
+ addStateStores(F, FuncInfo);
+
+ // Reset per-function state.
+ PersonalityFn = nullptr;
+ Personality = EHPersonality::Unknown;
+ return true;
+}
+
+/// Get the common EH registration subobject:
+/// typedef _EXCEPTION_DISPOSITION (*PEXCEPTION_ROUTINE)(
+/// _EXCEPTION_RECORD *, void *, _CONTEXT *, void *);
+/// struct EHRegistrationNode {
+/// EHRegistrationNode *Next;
+/// PEXCEPTION_ROUTINE Handler;
+/// };
+Type *WinEHStatePass::getEHLinkRegistrationType() {
+ if (EHLinkRegistrationTy)
+ return EHLinkRegistrationTy;
+ LLVMContext &Context = TheModule->getContext();
+ EHLinkRegistrationTy = StructType::create(Context, "EHRegistrationNode");
+ Type *FieldTys[] = {
+ EHLinkRegistrationTy->getPointerTo(0), // EHRegistrationNode *Next
+ Type::getInt8PtrTy(Context) // EXCEPTION_DISPOSITION (*Handler)(...)
+ };
+ EHLinkRegistrationTy->setBody(FieldTys, false);
+ return EHLinkRegistrationTy;
+}
+
+/// The __CxxFrameHandler3 registration node:
+/// struct CXXExceptionRegistration {
+/// void *SavedESP;
+/// EHRegistrationNode SubRecord;
+/// int32_t TryLevel;
+/// };
+Type *WinEHStatePass::getCXXEHRegistrationType() {
+ if (CXXEHRegistrationTy)
+ return CXXEHRegistrationTy;
+ LLVMContext &Context = TheModule->getContext();
+ Type *FieldTys[] = {
+ Type::getInt8PtrTy(Context), // void *SavedESP
+ getEHLinkRegistrationType(), // EHRegistrationNode SubRecord
+ Type::getInt32Ty(Context) // int32_t TryLevel
+ };
+ CXXEHRegistrationTy =
+ StructType::create(FieldTys, "CXXExceptionRegistration");
+ return CXXEHRegistrationTy;
+}
+
+/// The _except_handler3/4 registration node:
+/// struct EH4ExceptionRegistration {
+/// void *SavedESP;
+/// _EXCEPTION_POINTERS *ExceptionPointers;
+/// EHRegistrationNode SubRecord;
+/// int32_t EncodedScopeTable;
+/// int32_t TryLevel;
+/// };
+Type *WinEHStatePass::getSEHRegistrationType() {
+ if (SEHRegistrationTy)
+ return SEHRegistrationTy;
+ LLVMContext &Context = TheModule->getContext();
+ Type *FieldTys[] = {
+ Type::getInt8PtrTy(Context), // void *SavedESP
+ Type::getInt8PtrTy(Context), // void *ExceptionPointers
+ getEHLinkRegistrationType(), // EHRegistrationNode SubRecord
+ Type::getInt32Ty(Context), // int32_t EncodedScopeTable
+ Type::getInt32Ty(Context) // int32_t TryLevel
+ };
+ SEHRegistrationTy = StructType::create(FieldTys, "SEHExceptionRegistration");
+ return SEHRegistrationTy;
+}
+
+// Emit an exception registration record. These are stack allocations with the
+// common subobject of two pointers: the previous registration record (the old
+// fs:00) and the personality function for the current frame. The data before
+// and after that is personality function specific.
+void WinEHStatePass::emitExceptionRegistrationRecord(Function *F) {
+ assert(Personality == EHPersonality::MSVC_CXX ||
+ Personality == EHPersonality::MSVC_X86SEH);
+
+ StringRef PersonalityName = PersonalityFn->getName();
+ IRBuilder<> Builder(&F->getEntryBlock(), F->getEntryBlock().begin());
+ Type *Int8PtrType = Builder.getInt8PtrTy();
+ if (Personality == EHPersonality::MSVC_CXX) {
+ RegNodeTy = getCXXEHRegistrationType();
+ RegNode = Builder.CreateAlloca(RegNodeTy);
+ // SavedESP = llvm.stacksave()
+ Value *SP = Builder.CreateCall(
+ Intrinsic::getDeclaration(TheModule, Intrinsic::stacksave), {});
+ Builder.CreateStore(SP, Builder.CreateStructGEP(RegNodeTy, RegNode, 0));
+ // TryLevel = -1
+ StateFieldIndex = 2;
+ insertStateNumberStore(RegNode, &*Builder.GetInsertPoint(), -1);
+ // Handler = __ehhandler$F
+ Function *Trampoline = generateLSDAInEAXThunk(F);
+ Link = Builder.CreateStructGEP(RegNodeTy, RegNode, 1);
+ linkExceptionRegistration(Builder, Trampoline);
+ } else if (Personality == EHPersonality::MSVC_X86SEH) {
+ // If _except_handler4 is in use, some additional guard checks and prologue
+ // stuff is required.
+ bool UseStackGuard = (PersonalityName == "_except_handler4");
+ RegNodeTy = getSEHRegistrationType();
+ RegNode = Builder.CreateAlloca(RegNodeTy);
+ // SavedESP = llvm.stacksave()
+ Value *SP = Builder.CreateCall(
+ Intrinsic::getDeclaration(TheModule, Intrinsic::stacksave), {});
+ Builder.CreateStore(SP, Builder.CreateStructGEP(RegNodeTy, RegNode, 0));
+ // TryLevel = -2 / -1
+ StateFieldIndex = 4;
+ insertStateNumberStore(RegNode, &*Builder.GetInsertPoint(),
+ UseStackGuard ? -2 : -1);
+ // ScopeTable = llvm.x86.seh.lsda(F)
+ Value *FI8 = Builder.CreateBitCast(F, Int8PtrType);
+ Value *LSDA = Builder.CreateCall(
+ Intrinsic::getDeclaration(TheModule, Intrinsic::x86_seh_lsda), FI8);
+ Type *Int32Ty = Type::getInt32Ty(TheModule->getContext());
+ LSDA = Builder.CreatePtrToInt(LSDA, Int32Ty);
+ // If using _except_handler4, xor the address of the table with
+ // __security_cookie.
+ if (UseStackGuard) {
+ Value *Cookie =
+ TheModule->getOrInsertGlobal("__security_cookie", Int32Ty);
+ Value *Val = Builder.CreateLoad(Int32Ty, Cookie);
+ LSDA = Builder.CreateXor(LSDA, Val);
+ }
+ Builder.CreateStore(LSDA, Builder.CreateStructGEP(RegNodeTy, RegNode, 3));
+ Link = Builder.CreateStructGEP(RegNodeTy, RegNode, 2);
+ linkExceptionRegistration(Builder, PersonalityFn);
+ } else {
+ llvm_unreachable("unexpected personality function");
+ }
+
+ // Insert an unlink before all returns.
+ for (BasicBlock &BB : *F) {
+ TerminatorInst *T = BB.getTerminator();
+ if (!isa<ReturnInst>(T))
+ continue;
+ Builder.SetInsertPoint(T);
+ unlinkExceptionRegistration(Builder);
+ }
+}
+
+Value *WinEHStatePass::emitEHLSDA(IRBuilder<> &Builder, Function *F) {
+ Value *FI8 = Builder.CreateBitCast(F, Type::getInt8PtrTy(F->getContext()));
+ return Builder.CreateCall(
+ Intrinsic::getDeclaration(TheModule, Intrinsic::x86_seh_lsda), FI8);
+}
+
+/// Generate a thunk that puts the LSDA of ParentFunc in EAX and then calls
+/// PersonalityFn, forwarding the parameters passed to PEXCEPTION_ROUTINE:
+/// typedef _EXCEPTION_DISPOSITION (*PEXCEPTION_ROUTINE)(
+/// _EXCEPTION_RECORD *, void *, _CONTEXT *, void *);
+/// We essentially want this code:
+/// movl $lsda, %eax
+/// jmpl ___CxxFrameHandler3
+Function *WinEHStatePass::generateLSDAInEAXThunk(Function *ParentFunc) {
+ LLVMContext &Context = ParentFunc->getContext();
+ Type *Int32Ty = Type::getInt32Ty(Context);
+ Type *Int8PtrType = Type::getInt8PtrTy(Context);
+ Type *ArgTys[5] = {Int8PtrType, Int8PtrType, Int8PtrType, Int8PtrType,
+ Int8PtrType};
+ FunctionType *TrampolineTy =
+ FunctionType::get(Int32Ty, makeArrayRef(&ArgTys[0], 4),
+ /*isVarArg=*/false);
+ FunctionType *TargetFuncTy =
+ FunctionType::get(Int32Ty, makeArrayRef(&ArgTys[0], 5),
+ /*isVarArg=*/false);
+ Function *Trampoline =
+ Function::Create(TrampolineTy, GlobalValue::InternalLinkage,
+ Twine("__ehhandler$") + GlobalValue::getRealLinkageName(
+ ParentFunc->getName()),
+ TheModule);
+ BasicBlock *EntryBB = BasicBlock::Create(Context, "entry", Trampoline);
+ IRBuilder<> Builder(EntryBB);
+ Value *LSDA = emitEHLSDA(Builder, ParentFunc);
+ Value *CastPersonality =
+ Builder.CreateBitCast(PersonalityFn, TargetFuncTy->getPointerTo());
+ auto AI = Trampoline->arg_begin();
+ Value *Args[5] = {LSDA, &*AI++, &*AI++, &*AI++, &*AI++};
+ CallInst *Call = Builder.CreateCall(CastPersonality, Args);
+ // Can't use musttail due to prototype mismatch, but we can use tail.
+ Call->setTailCall(true);
+ // Set inreg so we pass it in EAX.
+ Call->addAttribute(1, Attribute::InReg);
+ Builder.CreateRet(Call);
+ return Trampoline;
+}
+
+void WinEHStatePass::linkExceptionRegistration(IRBuilder<> &Builder,
+ Function *Handler) {
+ // Emit the .safeseh directive for this function.
+ Handler->addFnAttr("safeseh");
+
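+  // The stores below push Link onto the fs:00 registration chain, roughly
+  // (illustrative):
+  //   Link->Handler = Handler;
+  //   Link->Next = *(EHRegistrationNode **)fs:0;
+  //   *(EHRegistrationNode **)fs:0 = Link;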
+ Type *LinkTy = getEHLinkRegistrationType();
+ // Handler = Handler
+ Value *HandlerI8 = Builder.CreateBitCast(Handler, Builder.getInt8PtrTy());
+ Builder.CreateStore(HandlerI8, Builder.CreateStructGEP(LinkTy, Link, 1));
+ // Next = [fs:00]
+ Constant *FSZero =
+ Constant::getNullValue(LinkTy->getPointerTo()->getPointerTo(257));
+ Value *Next = Builder.CreateLoad(FSZero);
+ Builder.CreateStore(Next, Builder.CreateStructGEP(LinkTy, Link, 0));
+ // [fs:00] = Link
+ Builder.CreateStore(Link, FSZero);
+}
+
+void WinEHStatePass::unlinkExceptionRegistration(IRBuilder<> &Builder) {
+ // Clone Link into the current BB for better address mode folding.
+ if (auto *GEP = dyn_cast<GetElementPtrInst>(Link)) {
+ GEP = cast<GetElementPtrInst>(GEP->clone());
+ Builder.Insert(GEP);
+ Link = GEP;
+ }
+ Type *LinkTy = getEHLinkRegistrationType();
+ // [fs:00] = Link->Next
+ Value *Next =
+ Builder.CreateLoad(Builder.CreateStructGEP(LinkTy, Link, 0));
+ Constant *FSZero =
+ Constant::getNullValue(LinkTy->getPointerTo()->getPointerTo(257));
+ Builder.CreateStore(Next, FSZero);
+}
+
+void WinEHStatePass::addStateStores(Function &F, WinEHFuncInfo &FuncInfo) {
+ // Mark the registration node. The backend needs to know which alloca it is so
+ // that it can recover the original frame pointer.
+ IRBuilder<> Builder(RegNode->getParent(), std::next(RegNode->getIterator()));
+ Value *RegNodeI8 = Builder.CreateBitCast(RegNode, Builder.getInt8PtrTy());
+ Builder.CreateCall(
+ Intrinsic::getDeclaration(TheModule, Intrinsic::x86_seh_ehregnode),
+ {RegNodeI8});
+
+ // Calculate state numbers.
+ if (isAsynchronousEHPersonality(Personality))
+ calculateSEHStateNumbers(&F, FuncInfo);
+ else
+ calculateWinCXXEHStateNumbers(&F, FuncInfo);
+
+ // Iterate all the instructions and emit state number stores.
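+  // For example (illustrative): an invoke whose unwind destination has state
+  // 2 gets a 'store i32 2' to the state field of the registration node right
+  // before it.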
+ DenseMap<BasicBlock *, ColorVector> BlockColors = colorEHFunclets(F);
+ for (BasicBlock &BB : F) {
+ // Figure out what state we should assign calls in this block.
+ int BaseState = -1;
+ auto &BBColors = BlockColors[&BB];
+
+ assert(BBColors.size() == 1 &&
+ "multi-color BB not removed by preparation");
+ BasicBlock *FuncletEntryBB = BBColors.front();
+ if (auto *FuncletPad =
+ dyn_cast<FuncletPadInst>(FuncletEntryBB->getFirstNonPHI())) {
+ auto BaseStateI = FuncInfo.FuncletBaseStateMap.find(FuncletPad);
+ if (BaseStateI != FuncInfo.FuncletBaseStateMap.end())
+ BaseState = BaseStateI->second;
+ }
+
+ for (Instruction &I : BB) {
+ if (auto *CI = dyn_cast<CallInst>(&I)) {
+        // Possibly throwing call instructions have no actions to take after
+        // an unwind. Ensure they are in the base state.
+ if (CI->doesNotThrow())
+ continue;
+ insertStateNumberStore(RegNode, CI, BaseState);
+ } else if (auto *II = dyn_cast<InvokeInst>(&I)) {
+ // Look up the state number of the landingpad this unwinds to.
+ assert(FuncInfo.InvokeStateMap.count(II) && "invoke has no state!");
+ int State = FuncInfo.InvokeStateMap[II];
+ insertStateNumberStore(RegNode, II, State);
+ }
+ }
+ }
+}
+
+void WinEHStatePass::insertStateNumberStore(Value *ParentRegNode,
+ Instruction *IP, int State) {
+ IRBuilder<> Builder(IP);
+ Value *StateField =
+ Builder.CreateStructGEP(RegNodeTy, ParentRegNode, StateFieldIndex);
+ Builder.CreateStore(Builder.getInt32(State), StateField);
+}