summaryrefslogtreecommitdiffstats
path: root/contrib/llvm/lib/Target/X86
diff options
context:
space:
mode:
Diffstat (limited to 'contrib/llvm/lib/Target/X86')
-rw-r--r--contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp2518
-rw-r--r--contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp814
-rw-r--r--contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.h131
-rw-r--r--contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c1676
-rw-r--r--contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h588
-rw-r--r--contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h398
-rw-r--r--contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp218
-rw-r--r--contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h87
-rw-r--r--contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp511
-rw-r--r--contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.h25
-rw-r--r--contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp208
-rw-r--r--contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h96
-rw-r--r--contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp476
-rw-r--r--contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h643
-rw-r--r--contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp227
-rw-r--r--contrib/llvm/lib/Target/X86/MCTargetDesc/X86FixupKinds.h33
-rw-r--r--contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp158
-rw-r--r--contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h58
-rw-r--r--contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp1239
-rw-r--r--contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp442
-rw-r--r--contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h114
-rw-r--r--contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp586
-rw-r--r--contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp79
-rw-r--r--contrib/llvm/lib/Target/X86/TargetInfo/X86TargetInfo.cpp23
-rw-r--r--contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp217
-rw-r--r--contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h71
-rw-r--r--contrib/llvm/lib/Target/X86/X86.h80
-rw-r--r--contrib/llvm/lib/Target/X86/X86.td357
-rw-r--r--contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp755
-rw-r--r--contrib/llvm/lib/Target/X86/X86AsmPrinter.h79
-rw-r--r--contrib/llvm/lib/Target/X86/X86COFFMachineModuleInfo.cpp19
-rw-r--r--contrib/llvm/lib/Target/X86/X86COFFMachineModuleInfo.h46
-rw-r--r--contrib/llvm/lib/Target/X86/X86CallingConv.td537
-rw-r--r--contrib/llvm/lib/Target/X86/X86CodeEmitter.cpp1507
-rw-r--r--contrib/llvm/lib/Target/X86/X86FastISel.cpp2427
-rw-r--r--contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp253
-rw-r--r--contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp1768
-rw-r--r--contrib/llvm/lib/Target/X86/X86FrameLowering.cpp1835
-rw-r--r--contrib/llvm/lib/Target/X86/X86FrameLowering.h103
-rw-r--r--contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp2720
-rw-r--r--contrib/llvm/lib/Target/X86/X86ISelLowering.cpp18598
-rw-r--r--contrib/llvm/lib/Target/X86/X86ISelLowering.h957
-rw-r--r--contrib/llvm/lib/Target/X86/X86Instr3DNow.td103
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrArithmetic.td1361
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrBuilder.h183
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrCMovSetCC.td111
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrCompiler.td1839
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrControl.td287
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrExtension.td186
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrFMA.td368
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrFPStack.td692
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrFormats.td660
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td432
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrInfo.cpp4896
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrInfo.h419
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrInfo.td2197
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrMMX.td624
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrSSE.td8379
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrSVM.td62
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td978
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrSystem.td529
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrTSX.td39
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrVMX.td66
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrXOP.td311
-rw-r--r--contrib/llvm/lib/Target/X86/X86JITInfo.cpp581
-rw-r--r--contrib/llvm/lib/Target/X86/X86JITInfo.h81
-rw-r--r--contrib/llvm/lib/Target/X86/X86MCInstLower.cpp794
-rw-r--r--contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp14
-rw-r--r--contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h145
-rw-r--r--contrib/llvm/lib/Target/X86/X86PadShortFunction.cpp212
-rw-r--r--contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp691
-rw-r--r--contrib/llvm/lib/Target/X86/X86RegisterInfo.h143
-rw-r--r--contrib/llvm/lib/Target/X86/X86RegisterInfo.td423
-rw-r--r--contrib/llvm/lib/Target/X86/X86Relocations.h52
-rw-r--r--contrib/llvm/lib/Target/X86/X86SchedHaswell.td126
-rw-r--r--contrib/llvm/lib/Target/X86/X86SchedSandyBridge.td122
-rw-r--r--contrib/llvm/lib/Target/X86/X86Schedule.td573
-rw-r--r--contrib/llvm/lib/Target/X86/X86ScheduleAtom.td530
-rw-r--r--contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp270
-rw-r--r--contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.h56
-rw-r--r--contrib/llvm/lib/Target/X86/X86Subtarget.cpp518
-rw-r--r--contrib/llvm/lib/Target/X86/X86Subtarget.h383
-rw-r--r--contrib/llvm/lib/Target/X86/X86TargetMachine.cpp232
-rw-r--r--contrib/llvm/lib/Target/X86/X86TargetMachine.h137
-rw-r--r--contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp49
-rw-r--r--contrib/llvm/lib/Target/X86/X86TargetObjectFile.h43
-rw-r--r--contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp530
-rw-r--r--contrib/llvm/lib/Target/X86/X86VZeroUpper.cpp293
88 files changed, 75397 insertions, 0 deletions
diff --git a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
new file mode 100644
index 0000000..68908ab
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -0,0 +1,2518 @@
+//===-- X86AsmParser.cpp - Parse X86 assembly to MCInst instructions ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/X86BaseInfo.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCParser/MCAsmParser.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCTargetAsmParser.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace {
+struct X86Operand;
+
+static const char OpPrecedence[] = {
+ 0, // IC_PLUS
+ 0, // IC_MINUS
+ 1, // IC_MULTIPLY
+ 1, // IC_DIVIDE
+ 2, // IC_RPAREN
+ 3, // IC_LPAREN
+ 0, // IC_IMM
+ 0 // IC_REGISTER
+};
+
+class X86AsmParser : public MCTargetAsmParser {
+ MCSubtargetInfo &STI;
+ MCAsmParser &Parser;
+ ParseInstructionInfo *InstInfo;
+private:
+ enum InfixCalculatorTok {
+ IC_PLUS = 0,
+ IC_MINUS,
+ IC_MULTIPLY,
+ IC_DIVIDE,
+ IC_RPAREN,
+ IC_LPAREN,
+ IC_IMM,
+ IC_REGISTER
+ };
+
+ class InfixCalculator {
+ typedef std::pair< InfixCalculatorTok, int64_t > ICToken;
+ SmallVector<InfixCalculatorTok, 4> InfixOperatorStack;
+ SmallVector<ICToken, 4> PostfixStack;
+
+ public:
+ int64_t popOperand() {
+ assert (!PostfixStack.empty() && "Poped an empty stack!");
+ ICToken Op = PostfixStack.pop_back_val();
+ assert ((Op.first == IC_IMM || Op.first == IC_REGISTER)
+ && "Expected and immediate or register!");
+ return Op.second;
+ }
+ void pushOperand(InfixCalculatorTok Op, int64_t Val = 0) {
+ assert ((Op == IC_IMM || Op == IC_REGISTER) &&
+ "Unexpected operand!");
+ PostfixStack.push_back(std::make_pair(Op, Val));
+ }
+
+ void popOperator() { InfixOperatorStack.pop_back_val(); }
+ void pushOperator(InfixCalculatorTok Op) {
+ // Push the new operator if the stack is empty.
+ if (InfixOperatorStack.empty()) {
+ InfixOperatorStack.push_back(Op);
+ return;
+ }
+
+ // Push the new operator if it has a higher precedence than the operator
+ // on the top of the stack or the operator on the top of the stack is a
+ // left parentheses.
+ unsigned Idx = InfixOperatorStack.size() - 1;
+ InfixCalculatorTok StackOp = InfixOperatorStack[Idx];
+ if (OpPrecedence[Op] > OpPrecedence[StackOp] || StackOp == IC_LPAREN) {
+ InfixOperatorStack.push_back(Op);
+ return;
+ }
+
+ // The operator on the top of the stack has higher precedence than the
+ // new operator.
+ unsigned ParenCount = 0;
+ while (1) {
+ // Nothing to process.
+ if (InfixOperatorStack.empty())
+ break;
+
+ Idx = InfixOperatorStack.size() - 1;
+ StackOp = InfixOperatorStack[Idx];
+ if (!(OpPrecedence[StackOp] >= OpPrecedence[Op] || ParenCount))
+ break;
+
+ // If we have an even parentheses count and we see a left parentheses,
+ // then stop processing.
+ if (!ParenCount && StackOp == IC_LPAREN)
+ break;
+
+ if (StackOp == IC_RPAREN) {
+ ++ParenCount;
+ InfixOperatorStack.pop_back_val();
+ } else if (StackOp == IC_LPAREN) {
+ --ParenCount;
+ InfixOperatorStack.pop_back_val();
+ } else {
+ InfixOperatorStack.pop_back_val();
+ PostfixStack.push_back(std::make_pair(StackOp, 0));
+ }
+ }
+ // Push the new operator.
+ InfixOperatorStack.push_back(Op);
+ }
+ int64_t execute() {
+ // Push any remaining operators onto the postfix stack.
+ while (!InfixOperatorStack.empty()) {
+ InfixCalculatorTok StackOp = InfixOperatorStack.pop_back_val();
+ if (StackOp != IC_LPAREN && StackOp != IC_RPAREN)
+ PostfixStack.push_back(std::make_pair(StackOp, 0));
+ }
+
+ if (PostfixStack.empty())
+ return 0;
+
+ SmallVector<ICToken, 16> OperandStack;
+ for (unsigned i = 0, e = PostfixStack.size(); i != e; ++i) {
+ ICToken Op = PostfixStack[i];
+ if (Op.first == IC_IMM || Op.first == IC_REGISTER) {
+ OperandStack.push_back(Op);
+ } else {
+ assert (OperandStack.size() > 1 && "Too few operands.");
+ int64_t Val;
+ ICToken Op2 = OperandStack.pop_back_val();
+ ICToken Op1 = OperandStack.pop_back_val();
+ switch (Op.first) {
+ default:
+ report_fatal_error("Unexpected operator!");
+ break;
+ case IC_PLUS:
+ Val = Op1.second + Op2.second;
+ OperandStack.push_back(std::make_pair(IC_IMM, Val));
+ break;
+ case IC_MINUS:
+ Val = Op1.second - Op2.second;
+ OperandStack.push_back(std::make_pair(IC_IMM, Val));
+ break;
+ case IC_MULTIPLY:
+ assert (Op1.first == IC_IMM && Op2.first == IC_IMM &&
+ "Multiply operation with an immediate and a register!");
+ Val = Op1.second * Op2.second;
+ OperandStack.push_back(std::make_pair(IC_IMM, Val));
+ break;
+ case IC_DIVIDE:
+ assert (Op1.first == IC_IMM && Op2.first == IC_IMM &&
+ "Divide operation with an immediate and a register!");
+ assert (Op2.second != 0 && "Division by zero!");
+ Val = Op1.second / Op2.second;
+ OperandStack.push_back(std::make_pair(IC_IMM, Val));
+ break;
+ }
+ }
+ }
+ assert (OperandStack.size() == 1 && "Expected a single result.");
+ return OperandStack.pop_back_val().second;
+ }
+ };
+
+ enum IntelExprState {
+ IES_PLUS,
+ IES_MINUS,
+ IES_MULTIPLY,
+ IES_DIVIDE,
+ IES_LBRAC,
+ IES_RBRAC,
+ IES_LPAREN,
+ IES_RPAREN,
+ IES_REGISTER,
+ IES_INTEGER,
+ IES_IDENTIFIER,
+ IES_ERROR
+ };
+
+ class IntelExprStateMachine {
+ IntelExprState State, PrevState;
+ unsigned BaseReg, IndexReg, TmpReg, Scale;
+ int64_t Imm;
+ const MCExpr *Sym;
+ StringRef SymName;
+ bool StopOnLBrac, AddImmPrefix;
+ InfixCalculator IC;
+ InlineAsmIdentifierInfo Info;
+ public:
+ IntelExprStateMachine(int64_t imm, bool stoponlbrac, bool addimmprefix) :
+ State(IES_PLUS), PrevState(IES_ERROR), BaseReg(0), IndexReg(0), TmpReg(0),
+ Scale(1), Imm(imm), Sym(0), StopOnLBrac(stoponlbrac),
+ AddImmPrefix(addimmprefix) { Info.clear(); }
+
+ unsigned getBaseReg() { return BaseReg; }
+ unsigned getIndexReg() { return IndexReg; }
+ unsigned getScale() { return Scale; }
+ const MCExpr *getSym() { return Sym; }
+ StringRef getSymName() { return SymName; }
+ int64_t getImm() { return Imm + IC.execute(); }
+ bool isValidEndState() { return State == IES_RBRAC; }
+ bool getStopOnLBrac() { return StopOnLBrac; }
+ bool getAddImmPrefix() { return AddImmPrefix; }
+ bool hadError() { return State == IES_ERROR; }
+
+ InlineAsmIdentifierInfo &getIdentifierInfo() {
+ return Info;
+ }
+
+ void onPlus() {
+ IntelExprState CurrState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_INTEGER:
+ case IES_RPAREN:
+ case IES_REGISTER:
+ State = IES_PLUS;
+ IC.pushOperator(IC_PLUS);
+ if (CurrState == IES_REGISTER && PrevState != IES_MULTIPLY) {
+ // If we already have a BaseReg, then assume this is the IndexReg with
+ // a scale of 1.
+ if (!BaseReg) {
+ BaseReg = TmpReg;
+ } else {
+ assert (!IndexReg && "BaseReg/IndexReg already set!");
+ IndexReg = TmpReg;
+ Scale = 1;
+ }
+ }
+ break;
+ }
+ PrevState = CurrState;
+ }
+ void onMinus() {
+ IntelExprState CurrState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_PLUS:
+ case IES_MULTIPLY:
+ case IES_DIVIDE:
+ case IES_LPAREN:
+ case IES_RPAREN:
+ case IES_LBRAC:
+ case IES_RBRAC:
+ case IES_INTEGER:
+ case IES_REGISTER:
+ State = IES_MINUS;
+ // Only push the minus operator if it is not a unary operator.
+ if (!(CurrState == IES_PLUS || CurrState == IES_MINUS ||
+ CurrState == IES_MULTIPLY || CurrState == IES_DIVIDE ||
+ CurrState == IES_LPAREN || CurrState == IES_LBRAC))
+ IC.pushOperator(IC_MINUS);
+ if (CurrState == IES_REGISTER && PrevState != IES_MULTIPLY) {
+ // If we already have a BaseReg, then assume this is the IndexReg with
+ // a scale of 1.
+ if (!BaseReg) {
+ BaseReg = TmpReg;
+ } else {
+ assert (!IndexReg && "BaseReg/IndexReg already set!");
+ IndexReg = TmpReg;
+ Scale = 1;
+ }
+ }
+ break;
+ }
+ PrevState = CurrState;
+ }
+ void onRegister(unsigned Reg) {
+ IntelExprState CurrState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_PLUS:
+ case IES_LPAREN:
+ State = IES_REGISTER;
+ TmpReg = Reg;
+ IC.pushOperand(IC_REGISTER);
+ break;
+ case IES_MULTIPLY:
+ // Index Register - Scale * Register
+ if (PrevState == IES_INTEGER) {
+ assert (!IndexReg && "IndexReg already set!");
+ State = IES_REGISTER;
+ IndexReg = Reg;
+ // Get the scale and replace the 'Scale * Register' with '0'.
+ Scale = IC.popOperand();
+ IC.pushOperand(IC_IMM);
+ IC.popOperator();
+ } else {
+ State = IES_ERROR;
+ }
+ break;
+ }
+ PrevState = CurrState;
+ }
+ void onIdentifierExpr(const MCExpr *SymRef, StringRef SymRefName) {
+ PrevState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_PLUS:
+ case IES_MINUS:
+ State = IES_INTEGER;
+ Sym = SymRef;
+ SymName = SymRefName;
+ IC.pushOperand(IC_IMM);
+ break;
+ }
+ }
+ void onInteger(int64_t TmpInt) {
+ IntelExprState CurrState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_PLUS:
+ case IES_MINUS:
+ case IES_DIVIDE:
+ case IES_MULTIPLY:
+ case IES_LPAREN:
+ State = IES_INTEGER;
+ if (PrevState == IES_REGISTER && CurrState == IES_MULTIPLY) {
+ // Index Register - Register * Scale
+ assert (!IndexReg && "IndexReg already set!");
+ IndexReg = TmpReg;
+ Scale = TmpInt;
+ // Get the scale and replace the 'Register * Scale' with '0'.
+ IC.popOperator();
+ } else if ((PrevState == IES_PLUS || PrevState == IES_MINUS ||
+ PrevState == IES_MULTIPLY || PrevState == IES_DIVIDE ||
+ PrevState == IES_LPAREN || PrevState == IES_LBRAC) &&
+ CurrState == IES_MINUS) {
+ // Unary minus. No need to pop the minus operand because it was never
+ // pushed.
+ IC.pushOperand(IC_IMM, -TmpInt); // Push -Imm.
+ } else {
+ IC.pushOperand(IC_IMM, TmpInt);
+ }
+ break;
+ }
+ PrevState = CurrState;
+ }
+ void onStar() {
+ PrevState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_INTEGER:
+ case IES_REGISTER:
+ case IES_RPAREN:
+ State = IES_MULTIPLY;
+ IC.pushOperator(IC_MULTIPLY);
+ break;
+ }
+ }
+ void onDivide() {
+ PrevState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_INTEGER:
+ case IES_RPAREN:
+ State = IES_DIVIDE;
+ IC.pushOperator(IC_DIVIDE);
+ break;
+ }
+ }
+ void onLBrac() {
+ PrevState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_RBRAC:
+ State = IES_PLUS;
+ IC.pushOperator(IC_PLUS);
+ break;
+ }
+ }
+ void onRBrac() {
+ IntelExprState CurrState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_INTEGER:
+ case IES_REGISTER:
+ case IES_RPAREN:
+ State = IES_RBRAC;
+ if (CurrState == IES_REGISTER && PrevState != IES_MULTIPLY) {
+ // If we already have a BaseReg, then assume this is the IndexReg with
+ // a scale of 1.
+ if (!BaseReg) {
+ BaseReg = TmpReg;
+ } else {
+ assert (!IndexReg && "BaseReg/IndexReg already set!");
+ IndexReg = TmpReg;
+ Scale = 1;
+ }
+ }
+ break;
+ }
+ PrevState = CurrState;
+ }
+ void onLParen() {
+ IntelExprState CurrState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_PLUS:
+ case IES_MINUS:
+ case IES_MULTIPLY:
+ case IES_DIVIDE:
+ case IES_LPAREN:
+ // FIXME: We don't handle this type of unary minus, yet.
+ if ((PrevState == IES_PLUS || PrevState == IES_MINUS ||
+ PrevState == IES_MULTIPLY || PrevState == IES_DIVIDE ||
+ PrevState == IES_LPAREN || PrevState == IES_LBRAC) &&
+ CurrState == IES_MINUS) {
+ State = IES_ERROR;
+ break;
+ }
+ State = IES_LPAREN;
+ IC.pushOperator(IC_LPAREN);
+ break;
+ }
+ PrevState = CurrState;
+ }
+ void onRParen() {
+ PrevState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_INTEGER:
+ case IES_REGISTER:
+ case IES_RPAREN:
+ State = IES_RPAREN;
+ IC.pushOperator(IC_RPAREN);
+ break;
+ }
+ }
+ };
+
+ MCAsmParser &getParser() const { return Parser; }
+
+ MCAsmLexer &getLexer() const { return Parser.getLexer(); }
+
+ bool Error(SMLoc L, const Twine &Msg,
+ ArrayRef<SMRange> Ranges = None,
+ bool MatchingInlineAsm = false) {
+ if (MatchingInlineAsm) return true;
+ return Parser.Error(L, Msg, Ranges);
+ }
+
+ X86Operand *ErrorOperand(SMLoc Loc, StringRef Msg) {
+ Error(Loc, Msg);
+ return 0;
+ }
+
+ X86Operand *ParseOperand();
+ X86Operand *ParseATTOperand();
+ X86Operand *ParseIntelOperand();
+ X86Operand *ParseIntelOffsetOfOperator();
+ X86Operand *ParseIntelDotOperator(const MCExpr *Disp, const MCExpr *&NewDisp);
+ X86Operand *ParseIntelOperator(unsigned OpKind);
+ X86Operand *ParseIntelMemOperand(unsigned SegReg, int64_t ImmDisp,
+ SMLoc StartLoc);
+ X86Operand *ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End);
+ X86Operand *ParseIntelBracExpression(unsigned SegReg, SMLoc Start,
+ int64_t ImmDisp, unsigned Size);
+ X86Operand *ParseIntelIdentifier(const MCExpr *&Val, StringRef &Identifier,
+ InlineAsmIdentifierInfo &Info,
+ bool IsUnevaluatedOperand, SMLoc &End);
+
+ X86Operand *ParseMemOperand(unsigned SegReg, SMLoc StartLoc);
+
+ X86Operand *CreateMemForInlineAsm(unsigned SegReg, const MCExpr *Disp,
+ unsigned BaseReg, unsigned IndexReg,
+ unsigned Scale, SMLoc Start, SMLoc End,
+ unsigned Size, StringRef Identifier,
+ InlineAsmIdentifierInfo &Info);
+
+ bool ParseDirectiveWord(unsigned Size, SMLoc L);
+ bool ParseDirectiveCode(StringRef IDVal, SMLoc L);
+
+ bool processInstruction(MCInst &Inst,
+ const SmallVectorImpl<MCParsedAsmOperand*> &Ops);
+
+ bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+ SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+ MCStreamer &Out, unsigned &ErrorInfo,
+ bool MatchingInlineAsm);
+
+ /// isSrcOp - Returns true if operand is either (%rsi) or %ds:%(rsi)
+ /// in 64bit mode or (%esi) or %es:(%esi) in 32bit mode.
+ bool isSrcOp(X86Operand &Op);
+
+ /// isDstOp - Returns true if operand is either (%rdi) or %es:(%rdi)
+ /// in 64bit mode or (%edi) or %es:(%edi) in 32bit mode.
+ bool isDstOp(X86Operand &Op);
+
+ bool is64BitMode() const {
+ // FIXME: Can tablegen auto-generate this?
+ return (STI.getFeatureBits() & X86::Mode64Bit) != 0;
+ }
+ void SwitchMode() {
+ unsigned FB = ComputeAvailableFeatures(STI.ToggleFeature(X86::Mode64Bit));
+ setAvailableFeatures(FB);
+ }
+
+ bool isParsingIntelSyntax() {
+ return getParser().getAssemblerDialect();
+ }
+
+ /// @name Auto-generated Matcher Functions
+ /// {
+
+#define GET_ASSEMBLER_HEADER
+#include "X86GenAsmMatcher.inc"
+
+ /// }
+
+public:
+ X86AsmParser(MCSubtargetInfo &sti, MCAsmParser &parser)
+ : MCTargetAsmParser(), STI(sti), Parser(parser), InstInfo(0) {
+
+ // Initialize the set of available features.
+ setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
+ }
+ virtual bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc);
+
+ virtual bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+ SMLoc NameLoc,
+ SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+
+ virtual bool ParseDirective(AsmToken DirectiveID);
+};
+} // end anonymous namespace
+
+/// @name Auto-generated Match Functions
+/// {
+
+static unsigned MatchRegisterName(StringRef Name);
+
+/// }
+
+static bool isImmSExti16i8Value(uint64_t Value) {
+ return (( Value <= 0x000000000000007FULL)||
+ (0x000000000000FF80ULL <= Value && Value <= 0x000000000000FFFFULL)||
+ (0xFFFFFFFFFFFFFF80ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL));
+}
+
+static bool isImmSExti32i8Value(uint64_t Value) {
+ return (( Value <= 0x000000000000007FULL)||
+ (0x00000000FFFFFF80ULL <= Value && Value <= 0x00000000FFFFFFFFULL)||
+ (0xFFFFFFFFFFFFFF80ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL));
+}
+
+static bool isImmZExtu32u8Value(uint64_t Value) {
+ return (Value <= 0x00000000000000FFULL);
+}
+
+static bool isImmSExti64i8Value(uint64_t Value) {
+ return (( Value <= 0x000000000000007FULL)||
+ (0xFFFFFFFFFFFFFF80ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL));
+}
+
+static bool isImmSExti64i32Value(uint64_t Value) {
+ return (( Value <= 0x000000007FFFFFFFULL)||
+ (0xFFFFFFFF80000000ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL));
+}
+namespace {
+
+/// X86Operand - Instances of this class represent a parsed X86 machine
+/// instruction.
+struct X86Operand : public MCParsedAsmOperand {
+ enum KindTy {
+ Token,
+ Register,
+ Immediate,
+ Memory
+ } Kind;
+
+ SMLoc StartLoc, EndLoc;
+ SMLoc OffsetOfLoc;
+ StringRef SymName;
+ void *OpDecl;
+ bool AddressOf;
+
+ struct TokOp {
+ const char *Data;
+ unsigned Length;
+ };
+
+ struct RegOp {
+ unsigned RegNo;
+ };
+
+ struct ImmOp {
+ const MCExpr *Val;
+ };
+
+ struct MemOp {
+ unsigned SegReg;
+ const MCExpr *Disp;
+ unsigned BaseReg;
+ unsigned IndexReg;
+ unsigned Scale;
+ unsigned Size;
+ };
+
+ union {
+ struct TokOp Tok;
+ struct RegOp Reg;
+ struct ImmOp Imm;
+ struct MemOp Mem;
+ };
+
+ X86Operand(KindTy K, SMLoc Start, SMLoc End)
+ : Kind(K), StartLoc(Start), EndLoc(End) {}
+
+ StringRef getSymName() { return SymName; }
+ void *getOpDecl() { return OpDecl; }
+
+ /// getStartLoc - Get the location of the first token of this operand.
+ SMLoc getStartLoc() const { return StartLoc; }
+ /// getEndLoc - Get the location of the last token of this operand.
+ SMLoc getEndLoc() const { return EndLoc; }
+ /// getLocRange - Get the range between the first and last token of this
+ /// operand.
+ SMRange getLocRange() const { return SMRange(StartLoc, EndLoc); }
+ /// getOffsetOfLoc - Get the location of the offset operator.
+ SMLoc getOffsetOfLoc() const { return OffsetOfLoc; }
+
+ virtual void print(raw_ostream &OS) const {}
+
+ StringRef getToken() const {
+ assert(Kind == Token && "Invalid access!");
+ return StringRef(Tok.Data, Tok.Length);
+ }
+ void setTokenValue(StringRef Value) {
+ assert(Kind == Token && "Invalid access!");
+ Tok.Data = Value.data();
+ Tok.Length = Value.size();
+ }
+
+ unsigned getReg() const {
+ assert(Kind == Register && "Invalid access!");
+ return Reg.RegNo;
+ }
+
+ const MCExpr *getImm() const {
+ assert(Kind == Immediate && "Invalid access!");
+ return Imm.Val;
+ }
+
+ const MCExpr *getMemDisp() const {
+ assert(Kind == Memory && "Invalid access!");
+ return Mem.Disp;
+ }
+ unsigned getMemSegReg() const {
+ assert(Kind == Memory && "Invalid access!");
+ return Mem.SegReg;
+ }
+ unsigned getMemBaseReg() const {
+ assert(Kind == Memory && "Invalid access!");
+ return Mem.BaseReg;
+ }
+ unsigned getMemIndexReg() const {
+ assert(Kind == Memory && "Invalid access!");
+ return Mem.IndexReg;
+ }
+ unsigned getMemScale() const {
+ assert(Kind == Memory && "Invalid access!");
+ return Mem.Scale;
+ }
+
+ bool isToken() const {return Kind == Token; }
+
+ bool isImm() const { return Kind == Immediate; }
+
+ bool isImmSExti16i8() const {
+ if (!isImm())
+ return false;
+
+ // If this isn't a constant expr, just assume it fits and let relaxation
+ // handle it.
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ if (!CE)
+ return true;
+
+ // Otherwise, check the value is in a range that makes sense for this
+ // extension.
+ return isImmSExti16i8Value(CE->getValue());
+ }
+ bool isImmSExti32i8() const {
+ if (!isImm())
+ return false;
+
+ // If this isn't a constant expr, just assume it fits and let relaxation
+ // handle it.
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ if (!CE)
+ return true;
+
+ // Otherwise, check the value is in a range that makes sense for this
+ // extension.
+ return isImmSExti32i8Value(CE->getValue());
+ }
+ bool isImmZExtu32u8() const {
+ if (!isImm())
+ return false;
+
+ // If this isn't a constant expr, just assume it fits and let relaxation
+ // handle it.
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ if (!CE)
+ return true;
+
+ // Otherwise, check the value is in a range that makes sense for this
+ // extension.
+ return isImmZExtu32u8Value(CE->getValue());
+ }
+ bool isImmSExti64i8() const {
+ if (!isImm())
+ return false;
+
+ // If this isn't a constant expr, just assume it fits and let relaxation
+ // handle it.
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ if (!CE)
+ return true;
+
+ // Otherwise, check the value is in a range that makes sense for this
+ // extension.
+ return isImmSExti64i8Value(CE->getValue());
+ }
+ bool isImmSExti64i32() const {
+ if (!isImm())
+ return false;
+
+ // If this isn't a constant expr, just assume it fits and let relaxation
+ // handle it.
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ if (!CE)
+ return true;
+
+ // Otherwise, check the value is in a range that makes sense for this
+ // extension.
+ return isImmSExti64i32Value(CE->getValue());
+ }
+
+ bool isOffsetOf() const {
+ return OffsetOfLoc.getPointer();
+ }
+
+ bool needAddressOf() const {
+ return AddressOf;
+ }
+
+ bool isMem() const { return Kind == Memory; }
+ bool isMem8() const {
+ return Kind == Memory && (!Mem.Size || Mem.Size == 8);
+ }
+ bool isMem16() const {
+ return Kind == Memory && (!Mem.Size || Mem.Size == 16);
+ }
+ bool isMem32() const {
+ return Kind == Memory && (!Mem.Size || Mem.Size == 32);
+ }
+ bool isMem64() const {
+ return Kind == Memory && (!Mem.Size || Mem.Size == 64);
+ }
+ bool isMem80() const {
+ return Kind == Memory && (!Mem.Size || Mem.Size == 80);
+ }
+ bool isMem128() const {
+ return Kind == Memory && (!Mem.Size || Mem.Size == 128);
+ }
+ bool isMem256() const {
+ return Kind == Memory && (!Mem.Size || Mem.Size == 256);
+ }
+
+ bool isMemVX32() const {
+ return Kind == Memory && (!Mem.Size || Mem.Size == 32) &&
+ getMemIndexReg() >= X86::XMM0 && getMemIndexReg() <= X86::XMM15;
+ }
+ bool isMemVY32() const {
+ return Kind == Memory && (!Mem.Size || Mem.Size == 32) &&
+ getMemIndexReg() >= X86::YMM0 && getMemIndexReg() <= X86::YMM15;
+ }
+ bool isMemVX64() const {
+ return Kind == Memory && (!Mem.Size || Mem.Size == 64) &&
+ getMemIndexReg() >= X86::XMM0 && getMemIndexReg() <= X86::XMM15;
+ }
+ bool isMemVY64() const {
+ return Kind == Memory && (!Mem.Size || Mem.Size == 64) &&
+ getMemIndexReg() >= X86::YMM0 && getMemIndexReg() <= X86::YMM15;
+ }
+
+ bool isAbsMem() const {
+ return Kind == Memory && !getMemSegReg() && !getMemBaseReg() &&
+ !getMemIndexReg() && getMemScale() == 1;
+ }
+
+ bool isReg() const { return Kind == Register; }
+
+ void addExpr(MCInst &Inst, const MCExpr *Expr) const {
+ // Add as immediates when possible.
+ if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr))
+ Inst.addOperand(MCOperand::CreateImm(CE->getValue()));
+ else
+ Inst.addOperand(MCOperand::CreateExpr(Expr));
+ }
+
+ void addRegOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateReg(getReg()));
+ }
+
+ void addImmOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ addExpr(Inst, getImm());
+ }
+
+ void addMem8Operands(MCInst &Inst, unsigned N) const {
+ addMemOperands(Inst, N);
+ }
+ void addMem16Operands(MCInst &Inst, unsigned N) const {
+ addMemOperands(Inst, N);
+ }
+ void addMem32Operands(MCInst &Inst, unsigned N) const {
+ addMemOperands(Inst, N);
+ }
+ void addMem64Operands(MCInst &Inst, unsigned N) const {
+ addMemOperands(Inst, N);
+ }
+ void addMem80Operands(MCInst &Inst, unsigned N) const {
+ addMemOperands(Inst, N);
+ }
+ void addMem128Operands(MCInst &Inst, unsigned N) const {
+ addMemOperands(Inst, N);
+ }
+ void addMem256Operands(MCInst &Inst, unsigned N) const {
+ addMemOperands(Inst, N);
+ }
+ void addMemVX32Operands(MCInst &Inst, unsigned N) const {
+ addMemOperands(Inst, N);
+ }
+ void addMemVY32Operands(MCInst &Inst, unsigned N) const {
+ addMemOperands(Inst, N);
+ }
+ void addMemVX64Operands(MCInst &Inst, unsigned N) const {
+ addMemOperands(Inst, N);
+ }
+ void addMemVY64Operands(MCInst &Inst, unsigned N) const {
+ addMemOperands(Inst, N);
+ }
+
+ void addMemOperands(MCInst &Inst, unsigned N) const {
+ assert((N == 5) && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateReg(getMemBaseReg()));
+ Inst.addOperand(MCOperand::CreateImm(getMemScale()));
+ Inst.addOperand(MCOperand::CreateReg(getMemIndexReg()));
+ addExpr(Inst, getMemDisp());
+ Inst.addOperand(MCOperand::CreateReg(getMemSegReg()));
+ }
+
+ void addAbsMemOperands(MCInst &Inst, unsigned N) const {
+ assert((N == 1) && "Invalid number of operands!");
+ // Add as immediates when possible.
+ if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getMemDisp()))
+ Inst.addOperand(MCOperand::CreateImm(CE->getValue()));
+ else
+ Inst.addOperand(MCOperand::CreateExpr(getMemDisp()));
+ }
+
+ static X86Operand *CreateToken(StringRef Str, SMLoc Loc) {
+ SMLoc EndLoc = SMLoc::getFromPointer(Loc.getPointer() + Str.size());
+ X86Operand *Res = new X86Operand(Token, Loc, EndLoc);
+ Res->Tok.Data = Str.data();
+ Res->Tok.Length = Str.size();
+ return Res;
+ }
+
+ static X86Operand *CreateReg(unsigned RegNo, SMLoc StartLoc, SMLoc EndLoc,
+ bool AddressOf = false,
+ SMLoc OffsetOfLoc = SMLoc(),
+ StringRef SymName = StringRef(),
+ void *OpDecl = 0) {
+ X86Operand *Res = new X86Operand(Register, StartLoc, EndLoc);
+ Res->Reg.RegNo = RegNo;
+ Res->AddressOf = AddressOf;
+ Res->OffsetOfLoc = OffsetOfLoc;
+ Res->SymName = SymName;
+ Res->OpDecl = OpDecl;
+ return Res;
+ }
+
+ static X86Operand *CreateImm(const MCExpr *Val, SMLoc StartLoc, SMLoc EndLoc){
+ X86Operand *Res = new X86Operand(Immediate, StartLoc, EndLoc);
+ Res->Imm.Val = Val;
+ return Res;
+ }
+
+ /// Create an absolute memory operand.
+ static X86Operand *CreateMem(const MCExpr *Disp, SMLoc StartLoc, SMLoc EndLoc,
+ unsigned Size = 0, StringRef SymName = StringRef(),
+ void *OpDecl = 0) {
+ X86Operand *Res = new X86Operand(Memory, StartLoc, EndLoc);
+ Res->Mem.SegReg = 0;
+ Res->Mem.Disp = Disp;
+ Res->Mem.BaseReg = 0;
+ Res->Mem.IndexReg = 0;
+ Res->Mem.Scale = 1;
+ Res->Mem.Size = Size;
+ Res->SymName = SymName;
+ Res->OpDecl = OpDecl;
+ Res->AddressOf = false;
+ return Res;
+ }
+
+ /// Create a generalized memory operand.
+ static X86Operand *CreateMem(unsigned SegReg, const MCExpr *Disp,
+ unsigned BaseReg, unsigned IndexReg,
+ unsigned Scale, SMLoc StartLoc, SMLoc EndLoc,
+ unsigned Size = 0,
+ StringRef SymName = StringRef(),
+ void *OpDecl = 0) {
+ // We should never just have a displacement, that should be parsed as an
+ // absolute memory operand.
+ assert((SegReg || BaseReg || IndexReg) && "Invalid memory operand!");
+
+ // The scale should always be one of {1,2,4,8}.
+ assert(((Scale == 1 || Scale == 2 || Scale == 4 || Scale == 8)) &&
+ "Invalid scale!");
+ X86Operand *Res = new X86Operand(Memory, StartLoc, EndLoc);
+ Res->Mem.SegReg = SegReg;
+ Res->Mem.Disp = Disp;
+ Res->Mem.BaseReg = BaseReg;
+ Res->Mem.IndexReg = IndexReg;
+ Res->Mem.Scale = Scale;
+ Res->Mem.Size = Size;
+ Res->SymName = SymName;
+ Res->OpDecl = OpDecl;
+ Res->AddressOf = false;
+ return Res;
+ }
+};
+
+} // end anonymous namespace.
+
+bool X86AsmParser::isSrcOp(X86Operand &Op) {
+ unsigned basereg = is64BitMode() ? X86::RSI : X86::ESI;
+
+ return (Op.isMem() &&
+ (Op.Mem.SegReg == 0 || Op.Mem.SegReg == X86::DS) &&
+ isa<MCConstantExpr>(Op.Mem.Disp) &&
+ cast<MCConstantExpr>(Op.Mem.Disp)->getValue() == 0 &&
+ Op.Mem.BaseReg == basereg && Op.Mem.IndexReg == 0);
+}
+
+bool X86AsmParser::isDstOp(X86Operand &Op) {
+ unsigned basereg = is64BitMode() ? X86::RDI : X86::EDI;
+
+ return Op.isMem() &&
+ (Op.Mem.SegReg == 0 || Op.Mem.SegReg == X86::ES) &&
+ isa<MCConstantExpr>(Op.Mem.Disp) &&
+ cast<MCConstantExpr>(Op.Mem.Disp)->getValue() == 0 &&
+ Op.Mem.BaseReg == basereg && Op.Mem.IndexReg == 0;
+}
+
+bool X86AsmParser::ParseRegister(unsigned &RegNo,
+ SMLoc &StartLoc, SMLoc &EndLoc) {
+ RegNo = 0;
+ const AsmToken &PercentTok = Parser.getTok();
+ StartLoc = PercentTok.getLoc();
+
+ // If we encounter a %, ignore it. This code handles registers with and
+ // without the prefix, unprefixed registers can occur in cfi directives.
+ if (!isParsingIntelSyntax() && PercentTok.is(AsmToken::Percent))
+ Parser.Lex(); // Eat percent token.
+
+ const AsmToken &Tok = Parser.getTok();
+ EndLoc = Tok.getEndLoc();
+
+ if (Tok.isNot(AsmToken::Identifier)) {
+ if (isParsingIntelSyntax()) return true;
+ return Error(StartLoc, "invalid register name",
+ SMRange(StartLoc, EndLoc));
+ }
+
+ RegNo = MatchRegisterName(Tok.getString());
+
+ // If the match failed, try the register name as lowercase.
+ if (RegNo == 0)
+ RegNo = MatchRegisterName(Tok.getString().lower());
+
+ if (!is64BitMode()) {
+ // FIXME: This should be done using Requires<In32BitMode> and
+ // Requires<In64BitMode> so "eiz" usage in 64-bit instructions can be also
+ // checked.
+ // FIXME: Check AH, CH, DH, BH cannot be used in an instruction requiring a
+ // REX prefix.
+ if (RegNo == X86::RIZ ||
+ X86MCRegisterClasses[X86::GR64RegClassID].contains(RegNo) ||
+ X86II::isX86_64NonExtLowByteReg(RegNo) ||
+ X86II::isX86_64ExtendedReg(RegNo))
+ return Error(StartLoc, "register %"
+ + Tok.getString() + " is only available in 64-bit mode",
+ SMRange(StartLoc, EndLoc));
+ }
+
+ // Parse "%st" as "%st(0)" and "%st(1)", which is multiple tokens.
+ if (RegNo == 0 && (Tok.getString() == "st" || Tok.getString() == "ST")) {
+ RegNo = X86::ST0;
+ Parser.Lex(); // Eat 'st'
+
+ // Check to see if we have '(4)' after %st.
+ if (getLexer().isNot(AsmToken::LParen))
+ return false;
+ // Lex the paren.
+ getParser().Lex();
+
+ const AsmToken &IntTok = Parser.getTok();
+ if (IntTok.isNot(AsmToken::Integer))
+ return Error(IntTok.getLoc(), "expected stack index");
+ switch (IntTok.getIntVal()) {
+ case 0: RegNo = X86::ST0; break;
+ case 1: RegNo = X86::ST1; break;
+ case 2: RegNo = X86::ST2; break;
+ case 3: RegNo = X86::ST3; break;
+ case 4: RegNo = X86::ST4; break;
+ case 5: RegNo = X86::ST5; break;
+ case 6: RegNo = X86::ST6; break;
+ case 7: RegNo = X86::ST7; break;
+ default: return Error(IntTok.getLoc(), "invalid stack index");
+ }
+
+ if (getParser().Lex().isNot(AsmToken::RParen))
+ return Error(Parser.getTok().getLoc(), "expected ')'");
+
+ EndLoc = Parser.getTok().getEndLoc();
+ Parser.Lex(); // Eat ')'
+ return false;
+ }
+
+ EndLoc = Parser.getTok().getEndLoc();
+
+ // If this is "db[0-7]", match it as an alias
+ // for dr[0-7].
+ if (RegNo == 0 && Tok.getString().size() == 3 &&
+ Tok.getString().startswith("db")) {
+ switch (Tok.getString()[2]) {
+ case '0': RegNo = X86::DR0; break;
+ case '1': RegNo = X86::DR1; break;
+ case '2': RegNo = X86::DR2; break;
+ case '3': RegNo = X86::DR3; break;
+ case '4': RegNo = X86::DR4; break;
+ case '5': RegNo = X86::DR5; break;
+ case '6': RegNo = X86::DR6; break;
+ case '7': RegNo = X86::DR7; break;
+ }
+
+ if (RegNo != 0) {
+ EndLoc = Parser.getTok().getEndLoc();
+ Parser.Lex(); // Eat it.
+ return false;
+ }
+ }
+
+ if (RegNo == 0) {
+ if (isParsingIntelSyntax()) return true;
+ return Error(StartLoc, "invalid register name",
+ SMRange(StartLoc, EndLoc));
+ }
+
+ Parser.Lex(); // Eat identifier token.
+ return false;
+}
+
+X86Operand *X86AsmParser::ParseOperand() {
+ if (isParsingIntelSyntax())
+ return ParseIntelOperand();
+ return ParseATTOperand();
+}
+
+/// getIntelMemOperandSize - Return intel memory operand size.
+static unsigned getIntelMemOperandSize(StringRef OpStr) {
+ unsigned Size = StringSwitch<unsigned>(OpStr)
+ .Cases("BYTE", "byte", 8)
+ .Cases("WORD", "word", 16)
+ .Cases("DWORD", "dword", 32)
+ .Cases("QWORD", "qword", 64)
+ .Cases("XWORD", "xword", 80)
+ .Cases("XMMWORD", "xmmword", 128)
+ .Cases("YMMWORD", "ymmword", 256)
+ .Default(0);
+ return Size;
+}
+
+X86Operand *
+X86AsmParser::CreateMemForInlineAsm(unsigned SegReg, const MCExpr *Disp,
+ unsigned BaseReg, unsigned IndexReg,
+ unsigned Scale, SMLoc Start, SMLoc End,
+ unsigned Size, StringRef Identifier,
+ InlineAsmIdentifierInfo &Info){
+ if (isa<MCSymbolRefExpr>(Disp)) {
+ // If this is not a VarDecl then assume it is a FuncDecl or some other label
+ // reference. We need an 'r' constraint here, so we need to create register
+ // operand to ensure proper matching. Just pick a GPR based on the size of
+ // a pointer.
+ if (!Info.IsVarDecl) {
+ unsigned RegNo = is64BitMode() ? X86::RBX : X86::EBX;
+ return X86Operand::CreateReg(RegNo, Start, End, /*AddressOf=*/true,
+ SMLoc(), Identifier, Info.OpDecl);
+ }
+ if (!Size) {
+ Size = Info.Type * 8; // Size is in terms of bits in this context.
+ if (Size)
+ InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_SizeDirective, Start,
+ /*Len=*/0, Size));
+ }
+ }
+
+ // When parsing inline assembly we set the base register to a non-zero value
+ // if we don't know the actual value at this time. This is necessary to
+ // get the matching correct in some cases.
+ BaseReg = BaseReg ? BaseReg : 1;
+ return X86Operand::CreateMem(SegReg, Disp, BaseReg, IndexReg, Scale, Start,
+ End, Size, Identifier, Info.OpDecl);
+}
+
+static void
+RewriteIntelBracExpression(SmallVectorImpl<AsmRewrite> *AsmRewrites,
+ StringRef SymName, int64_t ImmDisp,
+ int64_t FinalImmDisp, SMLoc &BracLoc,
+ SMLoc &StartInBrac, SMLoc &End) {
+ // Remove the '[' and ']' from the IR string.
+ AsmRewrites->push_back(AsmRewrite(AOK_Skip, BracLoc, 1));
+ AsmRewrites->push_back(AsmRewrite(AOK_Skip, End, 1));
+
+ // If ImmDisp is non-zero, then we parsed a displacement before the
+ // bracketed expression (i.e., ImmDisp [ BaseReg + Scale*IndexReg + Disp])
+ // If ImmDisp doesn't match the displacement computed by the state machine
+ // then we have an additional displacement in the bracketed expression.
+ if (ImmDisp != FinalImmDisp) {
+ if (ImmDisp) {
+ // We have an immediate displacement before the bracketed expression.
+ // Adjust this to match the final immediate displacement.
+ bool Found = false;
+ for (SmallVectorImpl<AsmRewrite>::iterator I = AsmRewrites->begin(),
+ E = AsmRewrites->end(); I != E; ++I) {
+ if ((*I).Loc.getPointer() > BracLoc.getPointer())
+ continue;
+ if ((*I).Kind == AOK_ImmPrefix || (*I).Kind == AOK_Imm) {
+ assert (!Found && "ImmDisp already rewritten.");
+ (*I).Kind = AOK_Imm;
+ (*I).Len = BracLoc.getPointer() - (*I).Loc.getPointer();
+ (*I).Val = FinalImmDisp;
+ Found = true;
+ break;
+ }
+ }
+ assert (Found && "Unable to rewrite ImmDisp.");
+ } else {
+ // We have a symbolic and an immediate displacement, but no displacement
+ // before the bracketed expression. Put the immediate displacement
+ // before the bracketed expression.
+ AsmRewrites->push_back(AsmRewrite(AOK_Imm, BracLoc, 0, FinalImmDisp));
+ }
+ }
+ // Remove all the ImmPrefix rewrites within the brackets.
+ for (SmallVectorImpl<AsmRewrite>::iterator I = AsmRewrites->begin(),
+ E = AsmRewrites->end(); I != E; ++I) {
+ if ((*I).Loc.getPointer() < StartInBrac.getPointer())
+ continue;
+ if ((*I).Kind == AOK_ImmPrefix)
+ (*I).Kind = AOK_Delete;
+ }
+ const char *SymLocPtr = SymName.data();
+ // Skip everything before the symbol.
+ if (unsigned Len = SymLocPtr - StartInBrac.getPointer()) {
+ assert(Len > 0 && "Expected a non-negative length.");
+ AsmRewrites->push_back(AsmRewrite(AOK_Skip, StartInBrac, Len));
+ }
+ // Skip everything after the symbol.
+ if (unsigned Len = End.getPointer() - (SymLocPtr + SymName.size())) {
+ SMLoc Loc = SMLoc::getFromPointer(SymLocPtr + SymName.size());
+ assert(Len > 0 && "Expected a non-negative length.");
+ AsmRewrites->push_back(AsmRewrite(AOK_Skip, Loc, Len));
+ }
+}
+
+X86Operand *
+X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
+ const AsmToken &Tok = Parser.getTok();
+
+ bool Done = false;
+ while (!Done) {
+ bool UpdateLocLex = true;
+
+ // The period in the dot operator (e.g., [ebx].foo.bar) is parsed as an
+ // identifier. Don't try an parse it as a register.
+ if (Tok.getString().startswith("."))
+ break;
+
+ // If we're parsing an immediate expression, we don't expect a '['.
+ if (SM.getStopOnLBrac() && getLexer().getKind() == AsmToken::LBrac)
+ break;
+
+ switch (getLexer().getKind()) {
+ default: {
+ if (SM.isValidEndState()) {
+ Done = true;
+ break;
+ }
+ return ErrorOperand(Tok.getLoc(), "Unexpected token!");
+ }
+ case AsmToken::EndOfStatement: {
+ Done = true;
+ break;
+ }
+ case AsmToken::Identifier: {
+ // This could be a register or a symbolic displacement.
+ unsigned TmpReg;
+ const MCExpr *Val;
+ SMLoc IdentLoc = Tok.getLoc();
+ StringRef Identifier = Tok.getString();
+ if(!ParseRegister(TmpReg, IdentLoc, End)) {
+ SM.onRegister(TmpReg);
+ UpdateLocLex = false;
+ break;
+ } else {
+ if (!isParsingInlineAsm()) {
+ if (getParser().parsePrimaryExpr(Val, End))
+ return ErrorOperand(Tok.getLoc(), "Unexpected identifier!");
+ } else {
+ InlineAsmIdentifierInfo &Info = SM.getIdentifierInfo();
+ if (X86Operand *Err = ParseIntelIdentifier(Val, Identifier, Info,
+ /*Unevaluated*/ false, End))
+ return Err;
+ }
+ SM.onIdentifierExpr(Val, Identifier);
+ UpdateLocLex = false;
+ break;
+ }
+ return ErrorOperand(Tok.getLoc(), "Unexpected identifier!");
+ }
+ case AsmToken::Integer:
+ if (isParsingInlineAsm() && SM.getAddImmPrefix())
+ InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_ImmPrefix,
+ Tok.getLoc()));
+ SM.onInteger(Tok.getIntVal());
+ break;
+ case AsmToken::Plus: SM.onPlus(); break;
+ case AsmToken::Minus: SM.onMinus(); break;
+ case AsmToken::Star: SM.onStar(); break;
+ case AsmToken::Slash: SM.onDivide(); break;
+ case AsmToken::LBrac: SM.onLBrac(); break;
+ case AsmToken::RBrac: SM.onRBrac(); break;
+ case AsmToken::LParen: SM.onLParen(); break;
+ case AsmToken::RParen: SM.onRParen(); break;
+ }
+ if (SM.hadError())
+ return ErrorOperand(Tok.getLoc(), "Unexpected token!");
+
+ if (!Done && UpdateLocLex) {
+ End = Tok.getLoc();
+ Parser.Lex(); // Consume the token.
+ }
+ }
+ return 0;
+}
+
+X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg, SMLoc Start,
+ int64_t ImmDisp,
+ unsigned Size) {
+ const AsmToken &Tok = Parser.getTok();
+ SMLoc BracLoc = Tok.getLoc(), End = Tok.getEndLoc();
+ if (getLexer().isNot(AsmToken::LBrac))
+ return ErrorOperand(BracLoc, "Expected '[' token!");
+ Parser.Lex(); // Eat '['
+
+ SMLoc StartInBrac = Tok.getLoc();
+ // Parse [ Symbol + ImmDisp ] and [ BaseReg + Scale*IndexReg + ImmDisp ]. We
+ // may have already parsed an immediate displacement before the bracketed
+ // expression.
+ IntelExprStateMachine SM(ImmDisp, /*StopOnLBrac=*/false, /*AddImmPrefix=*/true);
+ if (X86Operand *Err = ParseIntelExpression(SM, End))
+ return Err;
+
+ const MCExpr *Disp;
+ if (const MCExpr *Sym = SM.getSym()) {
+ // A symbolic displacement.
+ Disp = Sym;
+ if (isParsingInlineAsm())
+ RewriteIntelBracExpression(InstInfo->AsmRewrites, SM.getSymName(),
+ ImmDisp, SM.getImm(), BracLoc, StartInBrac,
+ End);
+ } else {
+ // An immediate displacement only.
+ Disp = MCConstantExpr::Create(SM.getImm(), getContext());
+ }
+
+ // Parse the dot operator (e.g., [ebx].foo.bar).
+ if (Tok.getString().startswith(".")) {
+ const MCExpr *NewDisp;
+ if (X86Operand *Err = ParseIntelDotOperator(Disp, NewDisp))
+ return Err;
+
+ End = Tok.getEndLoc();
+ Parser.Lex(); // Eat the field.
+ Disp = NewDisp;
+ }
+
+ int BaseReg = SM.getBaseReg();
+ int IndexReg = SM.getIndexReg();
+ int Scale = SM.getScale();
+ if (!isParsingInlineAsm()) {
+ // handle [-42]
+ if (!BaseReg && !IndexReg) {
+ if (!SegReg)
+ return X86Operand::CreateMem(Disp, Start, End, Size);
+ else
+ return X86Operand::CreateMem(SegReg, Disp, 0, 0, 1, Start, End, Size);
+ }
+ return X86Operand::CreateMem(SegReg, Disp, BaseReg, IndexReg, Scale, Start,
+ End, Size);
+ }
+
+ InlineAsmIdentifierInfo &Info = SM.getIdentifierInfo();
+ return CreateMemForInlineAsm(SegReg, Disp, BaseReg, IndexReg, Scale, Start,
+ End, Size, SM.getSymName(), Info);
+}
+
+// Inline assembly may use variable names with namespace alias qualifiers.
+X86Operand *X86AsmParser::ParseIntelIdentifier(const MCExpr *&Val,
+ StringRef &Identifier,
+ InlineAsmIdentifierInfo &Info,
+ bool IsUnevaluatedOperand,
+ SMLoc &End) {
+ assert (isParsingInlineAsm() && "Expected to be parsing inline assembly.");
+ Val = 0;
+
+ StringRef LineBuf(Identifier.data());
+ SemaCallback->LookupInlineAsmIdentifier(LineBuf, Info, IsUnevaluatedOperand);
+
+ const AsmToken &Tok = Parser.getTok();
+
+ // Advance the token stream until the end of the current token is
+ // after the end of what the frontend claimed.
+ const char *EndPtr = Tok.getLoc().getPointer() + LineBuf.size();
+ while (true) {
+ End = Tok.getEndLoc();
+ getLexer().Lex();
+
+ assert(End.getPointer() <= EndPtr && "frontend claimed part of a token?");
+ if (End.getPointer() == EndPtr) break;
+ }
+
+ // Create the symbol reference.
+ Identifier = LineBuf;
+ MCSymbol *Sym = getContext().GetOrCreateSymbol(Identifier);
+ MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None;
+ Val = MCSymbolRefExpr::Create(Sym, Variant, getParser().getContext());
+ return 0;
+}
+
+/// ParseIntelMemOperand - Parse intel style memory operand.
+X86Operand *X86AsmParser::ParseIntelMemOperand(unsigned SegReg,
+ int64_t ImmDisp,
+ SMLoc Start) {
+ const AsmToken &Tok = Parser.getTok();
+ SMLoc End;
+
+ unsigned Size = getIntelMemOperandSize(Tok.getString());
+ if (Size) {
+ Parser.Lex(); // Eat operand size (e.g., byte, word).
+ if (Tok.getString() != "PTR" && Tok.getString() != "ptr")
+ return ErrorOperand(Start, "Expected 'PTR' or 'ptr' token!");
+ Parser.Lex(); // Eat ptr.
+ }
+
+ // Parse ImmDisp [ BaseReg + Scale*IndexReg + Disp ].
+ if (getLexer().is(AsmToken::Integer)) {
+ if (isParsingInlineAsm())
+ InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_ImmPrefix,
+ Tok.getLoc()));
+ int64_t ImmDisp = Tok.getIntVal();
+ Parser.Lex(); // Eat the integer.
+ if (getLexer().isNot(AsmToken::LBrac))
+ return ErrorOperand(Start, "Expected '[' token!");
+ return ParseIntelBracExpression(SegReg, Start, ImmDisp, Size);
+ }
+
+ if (getLexer().is(AsmToken::LBrac))
+ return ParseIntelBracExpression(SegReg, Start, ImmDisp, Size);
+
+ if (!ParseRegister(SegReg, Start, End)) {
+ // Handel SegReg : [ ... ]
+ if (getLexer().isNot(AsmToken::Colon))
+ return ErrorOperand(Start, "Expected ':' token!");
+ Parser.Lex(); // Eat :
+ if (getLexer().isNot(AsmToken::LBrac))
+ return ErrorOperand(Start, "Expected '[' token!");
+ return ParseIntelBracExpression(SegReg, Start, ImmDisp, Size);
+ }
+
+ const MCExpr *Val;
+ if (!isParsingInlineAsm()) {
+ if (getParser().parsePrimaryExpr(Val, End))
+ return ErrorOperand(Tok.getLoc(), "Unexpected token!");
+
+ return X86Operand::CreateMem(Val, Start, End, Size);
+ }
+
+ InlineAsmIdentifierInfo Info;
+ StringRef Identifier = Tok.getString();
+ if (X86Operand *Err = ParseIntelIdentifier(Val, Identifier, Info,
+ /*Unevaluated*/ false, End))
+ return Err;
+ return CreateMemForInlineAsm(/*SegReg=*/0, Val, /*BaseReg=*/0,/*IndexReg=*/0,
+ /*Scale=*/1, Start, End, Size, Identifier, Info);
+}
+
+/// Parse the '.' operator.
+X86Operand *X86AsmParser::ParseIntelDotOperator(const MCExpr *Disp,
+ const MCExpr *&NewDisp) {
+ const AsmToken &Tok = Parser.getTok();
+ int64_t OrigDispVal, DotDispVal;
+
+ // FIXME: Handle non-constant expressions.
+ if (const MCConstantExpr *OrigDisp = dyn_cast<MCConstantExpr>(Disp))
+ OrigDispVal = OrigDisp->getValue();
+ else
+ return ErrorOperand(Tok.getLoc(), "Non-constant offsets are not supported!");
+
+ // Drop the '.'.
+ StringRef DotDispStr = Tok.getString().drop_front(1);
+
+ // .Imm gets lexed as a real.
+ if (Tok.is(AsmToken::Real)) {
+ APInt DotDisp;
+ DotDispStr.getAsInteger(10, DotDisp);
+ DotDispVal = DotDisp.getZExtValue();
+ } else if (isParsingInlineAsm() && Tok.is(AsmToken::Identifier)) {
+ unsigned DotDisp;
+ std::pair<StringRef, StringRef> BaseMember = DotDispStr.split('.');
+ if (SemaCallback->LookupInlineAsmField(BaseMember.first, BaseMember.second,
+ DotDisp))
+ return ErrorOperand(Tok.getLoc(), "Unable to lookup field reference!");
+ DotDispVal = DotDisp;
+ } else
+ return ErrorOperand(Tok.getLoc(), "Unexpected token type!");
+
+ if (isParsingInlineAsm() && Tok.is(AsmToken::Identifier)) {
+ SMLoc Loc = SMLoc::getFromPointer(DotDispStr.data());
+ unsigned Len = DotDispStr.size();
+ unsigned Val = OrigDispVal + DotDispVal;
+ InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_DotOperator, Loc, Len,
+ Val));
+ }
+
+ NewDisp = MCConstantExpr::Create(OrigDispVal + DotDispVal, getContext());
+ return 0;
+}
+
+/// Parse the 'offset' operator. This operator is used to specify the
+/// location rather then the content of a variable.
+X86Operand *X86AsmParser::ParseIntelOffsetOfOperator() {
+ const AsmToken &Tok = Parser.getTok();
+ SMLoc OffsetOfLoc = Tok.getLoc();
+ Parser.Lex(); // Eat offset.
+
+ const MCExpr *Val;
+ InlineAsmIdentifierInfo Info;
+ SMLoc Start = Tok.getLoc(), End;
+ StringRef Identifier = Tok.getString();
+ if (X86Operand *Err = ParseIntelIdentifier(Val, Identifier, Info,
+ /*Unevaluated*/ false, End))
+ return Err;
+
+ // Don't emit the offset operator.
+ InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_Skip, OffsetOfLoc, 7));
+
+ // The offset operator will have an 'r' constraint, thus we need to create
+ // register operand to ensure proper matching. Just pick a GPR based on
+ // the size of a pointer.
+ unsigned RegNo = is64BitMode() ? X86::RBX : X86::EBX;
+ return X86Operand::CreateReg(RegNo, Start, End, /*GetAddress=*/true,
+ OffsetOfLoc, Identifier, Info.OpDecl);
+}
+
+enum IntelOperatorKind {
+ IOK_LENGTH,
+ IOK_SIZE,
+ IOK_TYPE
+};
+
+/// Parse the 'LENGTH', 'TYPE' and 'SIZE' operators. The LENGTH operator
+/// returns the number of elements in an array. It returns the value 1 for
+/// non-array variables. The SIZE operator returns the size of a C or C++
+/// variable. A variable's size is the product of its LENGTH and TYPE. The
+/// TYPE operator returns the size of a C or C++ type or variable. If the
+/// variable is an array, TYPE returns the size of a single element.
+X86Operand *X86AsmParser::ParseIntelOperator(unsigned OpKind) {
+ const AsmToken &Tok = Parser.getTok();
+ SMLoc TypeLoc = Tok.getLoc();
+ Parser.Lex(); // Eat operator.
+
+ const MCExpr *Val = 0;
+ InlineAsmIdentifierInfo Info;
+ SMLoc Start = Tok.getLoc(), End;
+ StringRef Identifier = Tok.getString();
+ if (X86Operand *Err = ParseIntelIdentifier(Val, Identifier, Info,
+ /*Unevaluated*/ true, End))
+ return Err;
+
+ unsigned CVal = 0;
+ switch(OpKind) {
+ default: llvm_unreachable("Unexpected operand kind!");
+ case IOK_LENGTH: CVal = Info.Length; break;
+ case IOK_SIZE: CVal = Info.Size; break;
+ case IOK_TYPE: CVal = Info.Type; break;
+ }
+
+ // Rewrite the type operator and the C or C++ type or variable in terms of an
+ // immediate. E.g. TYPE foo -> $$4
+ unsigned Len = End.getPointer() - TypeLoc.getPointer();
+ InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_Imm, TypeLoc, Len, CVal));
+
+ const MCExpr *Imm = MCConstantExpr::Create(CVal, getContext());
+ return X86Operand::CreateImm(Imm, Start, End);
+}
+
+X86Operand *X86AsmParser::ParseIntelOperand() {
+ const AsmToken &Tok = Parser.getTok();
+ SMLoc Start = Tok.getLoc(), End;
+
+ // Offset, length, type and size operators.
+ if (isParsingInlineAsm()) {
+ StringRef AsmTokStr = Tok.getString();
+ if (AsmTokStr == "offset" || AsmTokStr == "OFFSET")
+ return ParseIntelOffsetOfOperator();
+ if (AsmTokStr == "length" || AsmTokStr == "LENGTH")
+ return ParseIntelOperator(IOK_LENGTH);
+ if (AsmTokStr == "size" || AsmTokStr == "SIZE")
+ return ParseIntelOperator(IOK_SIZE);
+ if (AsmTokStr == "type" || AsmTokStr == "TYPE")
+ return ParseIntelOperator(IOK_TYPE);
+ }
+
+ // Immediate.
+ if (getLexer().is(AsmToken::Integer) || getLexer().is(AsmToken::Minus) ||
+ getLexer().is(AsmToken::LParen)) {
+ AsmToken StartTok = Tok;
+ IntelExprStateMachine SM(/*Imm=*/0, /*StopOnLBrac=*/true,
+ /*AddImmPrefix=*/false);
+ if (X86Operand *Err = ParseIntelExpression(SM, End))
+ return Err;
+
+ int64_t Imm = SM.getImm();
+ if (isParsingInlineAsm()) {
+ unsigned Len = Tok.getLoc().getPointer() - Start.getPointer();
+ if (StartTok.getString().size() == Len)
+ // Just add a prefix if this wasn't a complex immediate expression.
+ InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_ImmPrefix, Start));
+ else
+ // Otherwise, rewrite the complex expression as a single immediate.
+ InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_Imm, Start, Len, Imm));
+ }
+
+ if (getLexer().isNot(AsmToken::LBrac)) {
+ const MCExpr *ImmExpr = MCConstantExpr::Create(Imm, getContext());
+ return X86Operand::CreateImm(ImmExpr, Start, End);
+ }
+
+ // Only positive immediates are valid.
+ if (Imm < 0)
+ return ErrorOperand(Start, "expected a positive immediate displacement "
+ "before bracketed expr.");
+
+ // Parse ImmDisp [ BaseReg + Scale*IndexReg + Disp ].
+ return ParseIntelMemOperand(/*SegReg=*/0, Imm, Start);
+ }
+
+ // Register.
+ unsigned RegNo = 0;
+ if (!ParseRegister(RegNo, Start, End)) {
+ // If this is a segment register followed by a ':', then this is the start
+ // of a memory reference, otherwise this is a normal register reference.
+ if (getLexer().isNot(AsmToken::Colon))
+ return X86Operand::CreateReg(RegNo, Start, End);
+
+ getParser().Lex(); // Eat the colon.
+ return ParseIntelMemOperand(/*SegReg=*/RegNo, /*Disp=*/0, Start);
+ }
+
+ // Memory operand.
+ return ParseIntelMemOperand(/*SegReg=*/0, /*Disp=*/0, Start);
+}
+
+X86Operand *X86AsmParser::ParseATTOperand() {
+ switch (getLexer().getKind()) {
+ default:
+ // Parse a memory operand with no segment register.
+ return ParseMemOperand(0, Parser.getTok().getLoc());
+ case AsmToken::Percent: {
+ // Read the register.
+ unsigned RegNo;
+ SMLoc Start, End;
+ if (ParseRegister(RegNo, Start, End)) return 0;
+ if (RegNo == X86::EIZ || RegNo == X86::RIZ) {
+ Error(Start, "%eiz and %riz can only be used as index registers",
+ SMRange(Start, End));
+ return 0;
+ }
+
+ // If this is a segment register followed by a ':', then this is the start
+ // of a memory reference, otherwise this is a normal register reference.
+ if (getLexer().isNot(AsmToken::Colon))
+ return X86Operand::CreateReg(RegNo, Start, End);
+
+ getParser().Lex(); // Eat the colon.
+ return ParseMemOperand(RegNo, Start);
+ }
+ case AsmToken::Dollar: {
+ // $42 -> immediate.
+ SMLoc Start = Parser.getTok().getLoc(), End;
+ Parser.Lex();
+ const MCExpr *Val;
+ if (getParser().parseExpression(Val, End))
+ return 0;
+ return X86Operand::CreateImm(Val, Start, End);
+ }
+ }
+}
+
+/// ParseMemOperand: segment: disp(basereg, indexreg, scale). The '%ds:' prefix
+/// has already been parsed if present.
+X86Operand *X86AsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) {
+
+ // We have to disambiguate a parenthesized expression "(4+5)" from the start
+ // of a memory operand with a missing displacement "(%ebx)" or "(,%eax)". The
+ // only way to do this without lookahead is to eat the '(' and see what is
+ // after it.
+ const MCExpr *Disp = MCConstantExpr::Create(0, getParser().getContext());
+ if (getLexer().isNot(AsmToken::LParen)) {
+ SMLoc ExprEnd;
+ if (getParser().parseExpression(Disp, ExprEnd)) return 0;
+
+ // After parsing the base expression we could either have a parenthesized
+ // memory address or not. If not, return now. If so, eat the (.
+ if (getLexer().isNot(AsmToken::LParen)) {
+ // Unless we have a segment register, treat this as an immediate.
+ if (SegReg == 0)
+ return X86Operand::CreateMem(Disp, MemStart, ExprEnd);
+ return X86Operand::CreateMem(SegReg, Disp, 0, 0, 1, MemStart, ExprEnd);
+ }
+
+ // Eat the '('.
+ Parser.Lex();
+ } else {
+ // Okay, we have a '('. We don't know if this is an expression or not, but
+ // so we have to eat the ( to see beyond it.
+ SMLoc LParenLoc = Parser.getTok().getLoc();
+ Parser.Lex(); // Eat the '('.
+
+ if (getLexer().is(AsmToken::Percent) || getLexer().is(AsmToken::Comma)) {
+ // Nothing to do here, fall into the code below with the '(' part of the
+ // memory operand consumed.
+ } else {
+ SMLoc ExprEnd;
+
+ // It must be an parenthesized expression, parse it now.
+ if (getParser().parseParenExpression(Disp, ExprEnd))
+ return 0;
+
+ // After parsing the base expression we could either have a parenthesized
+ // memory address or not. If not, return now. If so, eat the (.
+ if (getLexer().isNot(AsmToken::LParen)) {
+ // Unless we have a segment register, treat this as an immediate.
+ if (SegReg == 0)
+ return X86Operand::CreateMem(Disp, LParenLoc, ExprEnd);
+ return X86Operand::CreateMem(SegReg, Disp, 0, 0, 1, MemStart, ExprEnd);
+ }
+
+ // Eat the '('.
+ Parser.Lex();
+ }
+ }
+
+ // If we reached here, then we just ate the ( of the memory operand. Process
+ // the rest of the memory operand.
+ unsigned BaseReg = 0, IndexReg = 0, Scale = 1;
+ SMLoc IndexLoc;
+
+ if (getLexer().is(AsmToken::Percent)) {
+ SMLoc StartLoc, EndLoc;
+ if (ParseRegister(BaseReg, StartLoc, EndLoc)) return 0;
+ if (BaseReg == X86::EIZ || BaseReg == X86::RIZ) {
+ Error(StartLoc, "eiz and riz can only be used as index registers",
+ SMRange(StartLoc, EndLoc));
+ return 0;
+ }
+ }
+
+ if (getLexer().is(AsmToken::Comma)) {
+ Parser.Lex(); // Eat the comma.
+ IndexLoc = Parser.getTok().getLoc();
+
+ // Following the comma we should have either an index register, or a scale
+ // value. We don't support the later form, but we want to parse it
+ // correctly.
+ //
+ // Not that even though it would be completely consistent to support syntax
+ // like "1(%eax,,1)", the assembler doesn't. Use "eiz" or "riz" for this.
+ if (getLexer().is(AsmToken::Percent)) {
+ SMLoc L;
+ if (ParseRegister(IndexReg, L, L)) return 0;
+
+ if (getLexer().isNot(AsmToken::RParen)) {
+ // Parse the scale amount:
+ // ::= ',' [scale-expression]
+ if (getLexer().isNot(AsmToken::Comma)) {
+ Error(Parser.getTok().getLoc(),
+ "expected comma in scale expression");
+ return 0;
+ }
+ Parser.Lex(); // Eat the comma.
+
+ if (getLexer().isNot(AsmToken::RParen)) {
+ SMLoc Loc = Parser.getTok().getLoc();
+
+ int64_t ScaleVal;
+ if (getParser().parseAbsoluteExpression(ScaleVal)){
+ Error(Loc, "expected scale expression");
+ return 0;
+ }
+
+ // Validate the scale amount.
+ if (ScaleVal != 1 && ScaleVal != 2 && ScaleVal != 4 && ScaleVal != 8){
+ Error(Loc, "scale factor in address must be 1, 2, 4 or 8");
+ return 0;
+ }
+ Scale = (unsigned)ScaleVal;
+ }
+ }
+ } else if (getLexer().isNot(AsmToken::RParen)) {
+ // A scale amount without an index is ignored.
+ // index.
+ SMLoc Loc = Parser.getTok().getLoc();
+
+ int64_t Value;
+ if (getParser().parseAbsoluteExpression(Value))
+ return 0;
+
+ if (Value != 1)
+ Warning(Loc, "scale factor without index register is ignored");
+ Scale = 1;
+ }
+ }
+
+ // Ok, we've eaten the memory operand, verify we have a ')' and eat it too.
+ if (getLexer().isNot(AsmToken::RParen)) {
+ Error(Parser.getTok().getLoc(), "unexpected token in memory operand");
+ return 0;
+ }
+ SMLoc MemEnd = Parser.getTok().getEndLoc();
+ Parser.Lex(); // Eat the ')'.
+
+ // If we have both a base register and an index register make sure they are
+ // both 64-bit or 32-bit registers.
+ // To support VSIB, IndexReg can be 128-bit or 256-bit registers.
+ if (BaseReg != 0 && IndexReg != 0) {
+ if (X86MCRegisterClasses[X86::GR64RegClassID].contains(BaseReg) &&
+ (X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg) ||
+ X86MCRegisterClasses[X86::GR32RegClassID].contains(IndexReg)) &&
+ IndexReg != X86::RIZ) {
+ Error(IndexLoc, "index register is 32-bit, but base register is 64-bit");
+ return 0;
+ }
+ if (X86MCRegisterClasses[X86::GR32RegClassID].contains(BaseReg) &&
+ (X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg) ||
+ X86MCRegisterClasses[X86::GR64RegClassID].contains(IndexReg)) &&
+ IndexReg != X86::EIZ){
+ Error(IndexLoc, "index register is 64-bit, but base register is 32-bit");
+ return 0;
+ }
+ }
+
+ return X86Operand::CreateMem(SegReg, Disp, BaseReg, IndexReg, Scale,
+ MemStart, MemEnd);
+}
+
+bool X86AsmParser::
+ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc,
+ SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ InstInfo = &Info;
+ StringRef PatchedName = Name;
+
+ // FIXME: Hack to recognize setneb as setne.
+ if (PatchedName.startswith("set") && PatchedName.endswith("b") &&
+ PatchedName != "setb" && PatchedName != "setnb")
+ PatchedName = PatchedName.substr(0, Name.size()-1);
+
+ // FIXME: Hack to recognize cmp<comparison code>{ss,sd,ps,pd}.
+ const MCExpr *ExtraImmOp = 0;
+ if ((PatchedName.startswith("cmp") || PatchedName.startswith("vcmp")) &&
+ (PatchedName.endswith("ss") || PatchedName.endswith("sd") ||
+ PatchedName.endswith("ps") || PatchedName.endswith("pd"))) {
+ bool IsVCMP = PatchedName[0] == 'v';
+ unsigned SSECCIdx = IsVCMP ? 4 : 3;
+ unsigned SSEComparisonCode = StringSwitch<unsigned>(
+ PatchedName.slice(SSECCIdx, PatchedName.size() - 2))
+ .Case("eq", 0x00)
+ .Case("lt", 0x01)
+ .Case("le", 0x02)
+ .Case("unord", 0x03)
+ .Case("neq", 0x04)
+ .Case("nlt", 0x05)
+ .Case("nle", 0x06)
+ .Case("ord", 0x07)
+ /* AVX only from here */
+ .Case("eq_uq", 0x08)
+ .Case("nge", 0x09)
+ .Case("ngt", 0x0A)
+ .Case("false", 0x0B)
+ .Case("neq_oq", 0x0C)
+ .Case("ge", 0x0D)
+ .Case("gt", 0x0E)
+ .Case("true", 0x0F)
+ .Case("eq_os", 0x10)
+ .Case("lt_oq", 0x11)
+ .Case("le_oq", 0x12)
+ .Case("unord_s", 0x13)
+ .Case("neq_us", 0x14)
+ .Case("nlt_uq", 0x15)
+ .Case("nle_uq", 0x16)
+ .Case("ord_s", 0x17)
+ .Case("eq_us", 0x18)
+ .Case("nge_uq", 0x19)
+ .Case("ngt_uq", 0x1A)
+ .Case("false_os", 0x1B)
+ .Case("neq_os", 0x1C)
+ .Case("ge_oq", 0x1D)
+ .Case("gt_oq", 0x1E)
+ .Case("true_us", 0x1F)
+ .Default(~0U);
+ if (SSEComparisonCode != ~0U && (IsVCMP || SSEComparisonCode < 8)) {
+ ExtraImmOp = MCConstantExpr::Create(SSEComparisonCode,
+ getParser().getContext());
+ if (PatchedName.endswith("ss")) {
+ PatchedName = IsVCMP ? "vcmpss" : "cmpss";
+ } else if (PatchedName.endswith("sd")) {
+ PatchedName = IsVCMP ? "vcmpsd" : "cmpsd";
+ } else if (PatchedName.endswith("ps")) {
+ PatchedName = IsVCMP ? "vcmpps" : "cmpps";
+ } else {
+ assert(PatchedName.endswith("pd") && "Unexpected mnemonic!");
+ PatchedName = IsVCMP ? "vcmppd" : "cmppd";
+ }
+ }
+ }
+
+ Operands.push_back(X86Operand::CreateToken(PatchedName, NameLoc));
+
+ if (ExtraImmOp && !isParsingIntelSyntax())
+ Operands.push_back(X86Operand::CreateImm(ExtraImmOp, NameLoc, NameLoc));
+
+ // Determine whether this is an instruction prefix.
+ bool isPrefix =
+ Name == "lock" || Name == "rep" ||
+ Name == "repe" || Name == "repz" ||
+ Name == "repne" || Name == "repnz" ||
+ Name == "rex64" || Name == "data16";
+
+
+ // This does the actual operand parsing. Don't parse any more if we have a
+ // prefix juxtaposed with an operation like "lock incl 4(%rax)", because we
+ // just want to parse the "lock" as the first instruction and the "incl" as
+ // the next one.
+ if (getLexer().isNot(AsmToken::EndOfStatement) && !isPrefix) {
+
+ // Parse '*' modifier.
+ if (getLexer().is(AsmToken::Star)) {
+ SMLoc Loc = Parser.getTok().getLoc();
+ Operands.push_back(X86Operand::CreateToken("*", Loc));
+ Parser.Lex(); // Eat the star.
+ }
+
+ // Read the first operand.
+ if (X86Operand *Op = ParseOperand())
+ Operands.push_back(Op);
+ else {
+ Parser.eatToEndOfStatement();
+ return true;
+ }
+
+ while (getLexer().is(AsmToken::Comma)) {
+ Parser.Lex(); // Eat the comma.
+
+ // Parse and remember the operand.
+ if (X86Operand *Op = ParseOperand())
+ Operands.push_back(Op);
+ else {
+ Parser.eatToEndOfStatement();
+ return true;
+ }
+ }
+
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ SMLoc Loc = getLexer().getLoc();
+ Parser.eatToEndOfStatement();
+ return Error(Loc, "unexpected token in argument list");
+ }
+ }
+
+ if (getLexer().is(AsmToken::EndOfStatement))
+ Parser.Lex(); // Consume the EndOfStatement
+ else if (isPrefix && getLexer().is(AsmToken::Slash))
+ Parser.Lex(); // Consume the prefix separator Slash
+
+ if (ExtraImmOp && isParsingIntelSyntax())
+ Operands.push_back(X86Operand::CreateImm(ExtraImmOp, NameLoc, NameLoc));
+
+ // This is a terrible hack to handle "out[bwl]? %al, (%dx)" ->
+ // "outb %al, %dx". Out doesn't take a memory form, but this is a widely
+ // documented form in various unofficial manuals, so a lot of code uses it.
+ if ((Name == "outb" || Name == "outw" || Name == "outl" || Name == "out") &&
+ Operands.size() == 3) {
+ X86Operand &Op = *(X86Operand*)Operands.back();
+ if (Op.isMem() && Op.Mem.SegReg == 0 &&
+ isa<MCConstantExpr>(Op.Mem.Disp) &&
+ cast<MCConstantExpr>(Op.Mem.Disp)->getValue() == 0 &&
+ Op.Mem.BaseReg == MatchRegisterName("dx") && Op.Mem.IndexReg == 0) {
+ SMLoc Loc = Op.getEndLoc();
+ Operands.back() = X86Operand::CreateReg(Op.Mem.BaseReg, Loc, Loc);
+ delete &Op;
+ }
+ }
+ // Same hack for "in[bwl]? (%dx), %al" -> "inb %dx, %al".
+ if ((Name == "inb" || Name == "inw" || Name == "inl" || Name == "in") &&
+ Operands.size() == 3) {
+ X86Operand &Op = *(X86Operand*)Operands.begin()[1];
+ if (Op.isMem() && Op.Mem.SegReg == 0 &&
+ isa<MCConstantExpr>(Op.Mem.Disp) &&
+ cast<MCConstantExpr>(Op.Mem.Disp)->getValue() == 0 &&
+ Op.Mem.BaseReg == MatchRegisterName("dx") && Op.Mem.IndexReg == 0) {
+ SMLoc Loc = Op.getEndLoc();
+ Operands.begin()[1] = X86Operand::CreateReg(Op.Mem.BaseReg, Loc, Loc);
+ delete &Op;
+ }
+ }
+ // Transform "ins[bwl] %dx, %es:(%edi)" into "ins[bwl]"
+ if (Name.startswith("ins") && Operands.size() == 3 &&
+ (Name == "insb" || Name == "insw" || Name == "insl")) {
+ X86Operand &Op = *(X86Operand*)Operands.begin()[1];
+ X86Operand &Op2 = *(X86Operand*)Operands.begin()[2];
+ if (Op.isReg() && Op.getReg() == X86::DX && isDstOp(Op2)) {
+ Operands.pop_back();
+ Operands.pop_back();
+ delete &Op;
+ delete &Op2;
+ }
+ }
+
+ // Transform "outs[bwl] %ds:(%esi), %dx" into "out[bwl]"
+ if (Name.startswith("outs") && Operands.size() == 3 &&
+ (Name == "outsb" || Name == "outsw" || Name == "outsl")) {
+ X86Operand &Op = *(X86Operand*)Operands.begin()[1];
+ X86Operand &Op2 = *(X86Operand*)Operands.begin()[2];
+ if (isSrcOp(Op) && Op2.isReg() && Op2.getReg() == X86::DX) {
+ Operands.pop_back();
+ Operands.pop_back();
+ delete &Op;
+ delete &Op2;
+ }
+ }
+
+ // Transform "movs[bwl] %ds:(%esi), %es:(%edi)" into "movs[bwl]"
+ if (Name.startswith("movs") && Operands.size() == 3 &&
+ (Name == "movsb" || Name == "movsw" || Name == "movsl" ||
+ (is64BitMode() && Name == "movsq"))) {
+ X86Operand &Op = *(X86Operand*)Operands.begin()[1];
+ X86Operand &Op2 = *(X86Operand*)Operands.begin()[2];
+ if (isSrcOp(Op) && isDstOp(Op2)) {
+ Operands.pop_back();
+ Operands.pop_back();
+ delete &Op;
+ delete &Op2;
+ }
+ }
+ // Transform "lods[bwl] %ds:(%esi),{%al,%ax,%eax,%rax}" into "lods[bwl]"
+ if (Name.startswith("lods") && Operands.size() == 3 &&
+ (Name == "lods" || Name == "lodsb" || Name == "lodsw" ||
+ Name == "lodsl" || (is64BitMode() && Name == "lodsq"))) {
+ X86Operand *Op1 = static_cast<X86Operand*>(Operands[1]);
+ X86Operand *Op2 = static_cast<X86Operand*>(Operands[2]);
+ if (isSrcOp(*Op1) && Op2->isReg()) {
+ const char *ins;
+ unsigned reg = Op2->getReg();
+ bool isLods = Name == "lods";
+ if (reg == X86::AL && (isLods || Name == "lodsb"))
+ ins = "lodsb";
+ else if (reg == X86::AX && (isLods || Name == "lodsw"))
+ ins = "lodsw";
+ else if (reg == X86::EAX && (isLods || Name == "lodsl"))
+ ins = "lodsl";
+ else if (reg == X86::RAX && (isLods || Name == "lodsq"))
+ ins = "lodsq";
+ else
+ ins = NULL;
+ if (ins != NULL) {
+ Operands.pop_back();
+ Operands.pop_back();
+ delete Op1;
+ delete Op2;
+ if (Name != ins)
+ static_cast<X86Operand*>(Operands[0])->setTokenValue(ins);
+ }
+ }
+ }
+ // Transform "stos[bwl] {%al,%ax,%eax,%rax},%es:(%edi)" into "stos[bwl]"
+ if (Name.startswith("stos") && Operands.size() == 3 &&
+ (Name == "stos" || Name == "stosb" || Name == "stosw" ||
+ Name == "stosl" || (is64BitMode() && Name == "stosq"))) {
+ X86Operand *Op1 = static_cast<X86Operand*>(Operands[1]);
+ X86Operand *Op2 = static_cast<X86Operand*>(Operands[2]);
+ if (isDstOp(*Op2) && Op1->isReg()) {
+ const char *ins;
+ unsigned reg = Op1->getReg();
+ bool isStos = Name == "stos";
+ if (reg == X86::AL && (isStos || Name == "stosb"))
+ ins = "stosb";
+ else if (reg == X86::AX && (isStos || Name == "stosw"))
+ ins = "stosw";
+ else if (reg == X86::EAX && (isStos || Name == "stosl"))
+ ins = "stosl";
+ else if (reg == X86::RAX && (isStos || Name == "stosq"))
+ ins = "stosq";
+ else
+ ins = NULL;
+ if (ins != NULL) {
+ Operands.pop_back();
+ Operands.pop_back();
+ delete Op1;
+ delete Op2;
+ if (Name != ins)
+ static_cast<X86Operand*>(Operands[0])->setTokenValue(ins);
+ }
+ }
+ }
+
+ // FIXME: Hack to handle recognize s{hr,ar,hl} $1, <op>. Canonicalize to
+ // "shift <op>".
+ if ((Name.startswith("shr") || Name.startswith("sar") ||
+ Name.startswith("shl") || Name.startswith("sal") ||
+ Name.startswith("rcl") || Name.startswith("rcr") ||
+ Name.startswith("rol") || Name.startswith("ror")) &&
+ Operands.size() == 3) {
+ if (isParsingIntelSyntax()) {
+ // Intel syntax
+ X86Operand *Op1 = static_cast<X86Operand*>(Operands[2]);
+ if (Op1->isImm() && isa<MCConstantExpr>(Op1->getImm()) &&
+ cast<MCConstantExpr>(Op1->getImm())->getValue() == 1) {
+ delete Operands[2];
+ Operands.pop_back();
+ }
+ } else {
+ X86Operand *Op1 = static_cast<X86Operand*>(Operands[1]);
+ if (Op1->isImm() && isa<MCConstantExpr>(Op1->getImm()) &&
+ cast<MCConstantExpr>(Op1->getImm())->getValue() == 1) {
+ delete Operands[1];
+ Operands.erase(Operands.begin() + 1);
+ }
+ }
+ }
+
+ // Transforms "int $3" into "int3" as a size optimization. We can't write an
+ // instalias with an immediate operand yet.
+ if (Name == "int" && Operands.size() == 2) {
+ X86Operand *Op1 = static_cast<X86Operand*>(Operands[1]);
+ if (Op1->isImm() && isa<MCConstantExpr>(Op1->getImm()) &&
+ cast<MCConstantExpr>(Op1->getImm())->getValue() == 3) {
+ delete Operands[1];
+ Operands.erase(Operands.begin() + 1);
+ static_cast<X86Operand*>(Operands[0])->setTokenValue("int3");
+ }
+ }
+
+ return false;
+}
+
+static bool convertToSExti8(MCInst &Inst, unsigned Opcode, unsigned Reg,
+ bool isCmp) {
+ MCInst TmpInst;
+ TmpInst.setOpcode(Opcode);
+ if (!isCmp)
+ TmpInst.addOperand(MCOperand::CreateReg(Reg));
+ TmpInst.addOperand(MCOperand::CreateReg(Reg));
+ TmpInst.addOperand(Inst.getOperand(0));
+ Inst = TmpInst;
+ return true;
+}
+
+static bool convert16i16to16ri8(MCInst &Inst, unsigned Opcode,
+ bool isCmp = false) {
+ if (!Inst.getOperand(0).isImm() ||
+ !isImmSExti16i8Value(Inst.getOperand(0).getImm()))
+ return false;
+
+ return convertToSExti8(Inst, Opcode, X86::AX, isCmp);
+}
+
+static bool convert32i32to32ri8(MCInst &Inst, unsigned Opcode,
+ bool isCmp = false) {
+ if (!Inst.getOperand(0).isImm() ||
+ !isImmSExti32i8Value(Inst.getOperand(0).getImm()))
+ return false;
+
+ return convertToSExti8(Inst, Opcode, X86::EAX, isCmp);
+}
+
+static bool convert64i32to64ri8(MCInst &Inst, unsigned Opcode,
+ bool isCmp = false) {
+ if (!Inst.getOperand(0).isImm() ||
+ !isImmSExti64i8Value(Inst.getOperand(0).getImm()))
+ return false;
+
+ return convertToSExti8(Inst, Opcode, X86::RAX, isCmp);
+}
+
+bool X86AsmParser::
+processInstruction(MCInst &Inst,
+ const SmallVectorImpl<MCParsedAsmOperand*> &Ops) {
+ switch (Inst.getOpcode()) {
+ default: return false;
+ case X86::AND16i16: return convert16i16to16ri8(Inst, X86::AND16ri8);
+ case X86::AND32i32: return convert32i32to32ri8(Inst, X86::AND32ri8);
+ case X86::AND64i32: return convert64i32to64ri8(Inst, X86::AND64ri8);
+ case X86::XOR16i16: return convert16i16to16ri8(Inst, X86::XOR16ri8);
+ case X86::XOR32i32: return convert32i32to32ri8(Inst, X86::XOR32ri8);
+ case X86::XOR64i32: return convert64i32to64ri8(Inst, X86::XOR64ri8);
+ case X86::OR16i16: return convert16i16to16ri8(Inst, X86::OR16ri8);
+ case X86::OR32i32: return convert32i32to32ri8(Inst, X86::OR32ri8);
+ case X86::OR64i32: return convert64i32to64ri8(Inst, X86::OR64ri8);
+ case X86::CMP16i16: return convert16i16to16ri8(Inst, X86::CMP16ri8, true);
+ case X86::CMP32i32: return convert32i32to32ri8(Inst, X86::CMP32ri8, true);
+ case X86::CMP64i32: return convert64i32to64ri8(Inst, X86::CMP64ri8, true);
+ case X86::ADD16i16: return convert16i16to16ri8(Inst, X86::ADD16ri8);
+ case X86::ADD32i32: return convert32i32to32ri8(Inst, X86::ADD32ri8);
+ case X86::ADD64i32: return convert64i32to64ri8(Inst, X86::ADD64ri8);
+ case X86::SUB16i16: return convert16i16to16ri8(Inst, X86::SUB16ri8);
+ case X86::SUB32i32: return convert32i32to32ri8(Inst, X86::SUB32ri8);
+ case X86::SUB64i32: return convert64i32to64ri8(Inst, X86::SUB64ri8);
+ case X86::ADC16i16: return convert16i16to16ri8(Inst, X86::ADC16ri8);
+ case X86::ADC32i32: return convert32i32to32ri8(Inst, X86::ADC32ri8);
+ case X86::ADC64i32: return convert64i32to64ri8(Inst, X86::ADC64ri8);
+ case X86::SBB16i16: return convert16i16to16ri8(Inst, X86::SBB16ri8);
+ case X86::SBB32i32: return convert32i32to32ri8(Inst, X86::SBB32ri8);
+ case X86::SBB64i32: return convert64i32to64ri8(Inst, X86::SBB64ri8);
+ }
+}
+
+static const char *getSubtargetFeatureName(unsigned Val);
+bool X86AsmParser::
+MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+ SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+ MCStreamer &Out, unsigned &ErrorInfo,
+ bool MatchingInlineAsm) {
+ assert(!Operands.empty() && "Unexpect empty operand list!");
+ X86Operand *Op = static_cast<X86Operand*>(Operands[0]);
+ assert(Op->isToken() && "Leading operand should always be a mnemonic!");
+ ArrayRef<SMRange> EmptyRanges = None;
+
+ // First, handle aliases that expand to multiple instructions.
+ // FIXME: This should be replaced with a real .td file alias mechanism.
+ // Also, MatchInstructionImpl should actually *do* the EmitInstruction
+ // call.
+ if (Op->getToken() == "fstsw" || Op->getToken() == "fstcw" ||
+ Op->getToken() == "fstsww" || Op->getToken() == "fstcww" ||
+ Op->getToken() == "finit" || Op->getToken() == "fsave" ||
+ Op->getToken() == "fstenv" || Op->getToken() == "fclex") {
+ MCInst Inst;
+ Inst.setOpcode(X86::WAIT);
+ Inst.setLoc(IDLoc);
+ if (!MatchingInlineAsm)
+ Out.EmitInstruction(Inst);
+
+ const char *Repl =
+ StringSwitch<const char*>(Op->getToken())
+ .Case("finit", "fninit")
+ .Case("fsave", "fnsave")
+ .Case("fstcw", "fnstcw")
+ .Case("fstcww", "fnstcw")
+ .Case("fstenv", "fnstenv")
+ .Case("fstsw", "fnstsw")
+ .Case("fstsww", "fnstsw")
+ .Case("fclex", "fnclex")
+ .Default(0);
+ assert(Repl && "Unknown wait-prefixed instruction");
+ delete Operands[0];
+ Operands[0] = X86Operand::CreateToken(Repl, IDLoc);
+ }
+
+ bool WasOriginallyInvalidOperand = false;
+ MCInst Inst;
+
+ // First, try a direct match.
+ switch (MatchInstructionImpl(Operands, Inst,
+ ErrorInfo, MatchingInlineAsm,
+ isParsingIntelSyntax())) {
+ default: break;
+ case Match_Success:
+ // Some instructions need post-processing to, for example, tweak which
+ // encoding is selected. Loop on it while changes happen so the
+ // individual transformations can chain off each other.
+ if (!MatchingInlineAsm)
+ while (processInstruction(Inst, Operands))
+ ;
+
+ Inst.setLoc(IDLoc);
+ if (!MatchingInlineAsm)
+ Out.EmitInstruction(Inst);
+ Opcode = Inst.getOpcode();
+ return false;
+ case Match_MissingFeature: {
+ assert(ErrorInfo && "Unknown missing feature!");
+ // Special case the error message for the very common case where only
+ // a single subtarget feature is missing.
+ std::string Msg = "instruction requires:";
+ unsigned Mask = 1;
+ for (unsigned i = 0; i < (sizeof(ErrorInfo)*8-1); ++i) {
+ if (ErrorInfo & Mask) {
+ Msg += " ";
+ Msg += getSubtargetFeatureName(ErrorInfo & Mask);
+ }
+ Mask <<= 1;
+ }
+ return Error(IDLoc, Msg, EmptyRanges, MatchingInlineAsm);
+ }
+ case Match_InvalidOperand:
+ WasOriginallyInvalidOperand = true;
+ break;
+ case Match_MnemonicFail:
+ break;
+ }
+
+ // FIXME: Ideally, we would only attempt suffix matches for things which are
+ // valid prefixes, and we could just infer the right unambiguous
+ // type. However, that requires substantially more matcher support than the
+ // following hack.
+
+ // Change the operand to point to a temporary token.
+ StringRef Base = Op->getToken();
+ SmallString<16> Tmp;
+ Tmp += Base;
+ Tmp += ' ';
+ Op->setTokenValue(Tmp.str());
+
+ // If this instruction starts with an 'f', then it is a floating point stack
+ // instruction. These come in up to three forms for 32-bit, 64-bit, and
+ // 80-bit floating point, which use the suffixes s,l,t respectively.
+ //
+ // Otherwise, we assume that this may be an integer instruction, which comes
+ // in 8/16/32/64-bit forms using the b,w,l,q suffixes respectively.
+ const char *Suffixes = Base[0] != 'f' ? "bwlq" : "slt\0";
+
+ // Check for the various suffix matches.
+ Tmp[Base.size()] = Suffixes[0];
+ unsigned ErrorInfoIgnore;
+ unsigned ErrorInfoMissingFeature = 0; // Init suppresses compiler warnings.
+ unsigned Match1, Match2, Match3, Match4;
+
+ Match1 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore,
+ isParsingIntelSyntax());
+ // If this returned as a missing feature failure, remember that.
+ if (Match1 == Match_MissingFeature)
+ ErrorInfoMissingFeature = ErrorInfoIgnore;
+ Tmp[Base.size()] = Suffixes[1];
+ Match2 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore,
+ isParsingIntelSyntax());
+ // If this returned as a missing feature failure, remember that.
+ if (Match2 == Match_MissingFeature)
+ ErrorInfoMissingFeature = ErrorInfoIgnore;
+ Tmp[Base.size()] = Suffixes[2];
+ Match3 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore,
+ isParsingIntelSyntax());
+ // If this returned as a missing feature failure, remember that.
+ if (Match3 == Match_MissingFeature)
+ ErrorInfoMissingFeature = ErrorInfoIgnore;
+ Tmp[Base.size()] = Suffixes[3];
+ Match4 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore,
+ isParsingIntelSyntax());
+ // If this returned as a missing feature failure, remember that.
+ if (Match4 == Match_MissingFeature)
+ ErrorInfoMissingFeature = ErrorInfoIgnore;
+
+ // Restore the old token.
+ Op->setTokenValue(Base);
+
+ // If exactly one matched, then we treat that as a successful match (and the
+ // instruction will already have been filled in correctly, since the failing
+ // matches won't have modified it).
+ unsigned NumSuccessfulMatches =
+ (Match1 == Match_Success) + (Match2 == Match_Success) +
+ (Match3 == Match_Success) + (Match4 == Match_Success);
+ if (NumSuccessfulMatches == 1) {
+ Inst.setLoc(IDLoc);
+ if (!MatchingInlineAsm)
+ Out.EmitInstruction(Inst);
+ Opcode = Inst.getOpcode();
+ return false;
+ }
+
+ // Otherwise, the match failed, try to produce a decent error message.
+
+ // If we had multiple suffix matches, then identify this as an ambiguous
+ // match.
+ if (NumSuccessfulMatches > 1) {
+ char MatchChars[4];
+ unsigned NumMatches = 0;
+ if (Match1 == Match_Success) MatchChars[NumMatches++] = Suffixes[0];
+ if (Match2 == Match_Success) MatchChars[NumMatches++] = Suffixes[1];
+ if (Match3 == Match_Success) MatchChars[NumMatches++] = Suffixes[2];
+ if (Match4 == Match_Success) MatchChars[NumMatches++] = Suffixes[3];
+
+ SmallString<126> Msg;
+ raw_svector_ostream OS(Msg);
+ OS << "ambiguous instructions require an explicit suffix (could be ";
+ for (unsigned i = 0; i != NumMatches; ++i) {
+ if (i != 0)
+ OS << ", ";
+ if (i + 1 == NumMatches)
+ OS << "or ";
+ OS << "'" << Base << MatchChars[i] << "'";
+ }
+ OS << ")";
+ Error(IDLoc, OS.str(), EmptyRanges, MatchingInlineAsm);
+ return true;
+ }
+
+ // Okay, we know that none of the variants matched successfully.
+
+ // If all of the instructions reported an invalid mnemonic, then the original
+ // mnemonic was invalid.
+ if ((Match1 == Match_MnemonicFail) && (Match2 == Match_MnemonicFail) &&
+ (Match3 == Match_MnemonicFail) && (Match4 == Match_MnemonicFail)) {
+ if (!WasOriginallyInvalidOperand) {
+ ArrayRef<SMRange> Ranges = MatchingInlineAsm ? EmptyRanges :
+ Op->getLocRange();
+ return Error(IDLoc, "invalid instruction mnemonic '" + Base + "'",
+ Ranges, MatchingInlineAsm);
+ }
+
+ // Recover location info for the operand if we know which was the problem.
+ if (ErrorInfo != ~0U) {
+ if (ErrorInfo >= Operands.size())
+ return Error(IDLoc, "too few operands for instruction",
+ EmptyRanges, MatchingInlineAsm);
+
+ X86Operand *Operand = (X86Operand*)Operands[ErrorInfo];
+ if (Operand->getStartLoc().isValid()) {
+ SMRange OperandRange = Operand->getLocRange();
+ return Error(Operand->getStartLoc(), "invalid operand for instruction",
+ OperandRange, MatchingInlineAsm);
+ }
+ }
+
+ return Error(IDLoc, "invalid operand for instruction", EmptyRanges,
+ MatchingInlineAsm);
+ }
+
+ // If one instruction matched with a missing feature, report this as a
+ // missing feature.
+ if ((Match1 == Match_MissingFeature) + (Match2 == Match_MissingFeature) +
+ (Match3 == Match_MissingFeature) + (Match4 == Match_MissingFeature) == 1){
+ std::string Msg = "instruction requires:";
+ unsigned Mask = 1;
+ for (unsigned i = 0; i < (sizeof(ErrorInfoMissingFeature)*8-1); ++i) {
+ if (ErrorInfoMissingFeature & Mask) {
+ Msg += " ";
+ Msg += getSubtargetFeatureName(ErrorInfoMissingFeature & Mask);
+ }
+ Mask <<= 1;
+ }
+ return Error(IDLoc, Msg, EmptyRanges, MatchingInlineAsm);
+ }
+
+ // If one instruction matched with an invalid operand, report this as an
+ // operand failure.
+ if ((Match1 == Match_InvalidOperand) + (Match2 == Match_InvalidOperand) +
+ (Match3 == Match_InvalidOperand) + (Match4 == Match_InvalidOperand) == 1){
+ Error(IDLoc, "invalid operand for instruction", EmptyRanges,
+ MatchingInlineAsm);
+ return true;
+ }
+
+ // If all of these were an outright failure, report it in a useless way.
+ Error(IDLoc, "unknown use of instruction mnemonic without a size suffix",
+ EmptyRanges, MatchingInlineAsm);
+ return true;
+}
+
+
+bool X86AsmParser::ParseDirective(AsmToken DirectiveID) {
+ StringRef IDVal = DirectiveID.getIdentifier();
+ if (IDVal == ".word")
+ return ParseDirectiveWord(2, DirectiveID.getLoc());
+ else if (IDVal.startswith(".code"))
+ return ParseDirectiveCode(IDVal, DirectiveID.getLoc());
+ else if (IDVal.startswith(".att_syntax")) {
+ getParser().setAssemblerDialect(0);
+ return false;
+ } else if (IDVal.startswith(".intel_syntax")) {
+ getParser().setAssemblerDialect(1);
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ if(Parser.getTok().getString() == "noprefix") {
+ // FIXME : Handle noprefix
+ Parser.Lex();
+ } else
+ return true;
+ }
+ return false;
+ }
+ return true;
+}
+
+/// ParseDirectiveWord
+/// ::= .word [ expression (, expression)* ]
+bool X86AsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) {
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ for (;;) {
+ const MCExpr *Value;
+ if (getParser().parseExpression(Value))
+ return true;
+
+ getParser().getStreamer().EmitValue(Value, Size);
+
+ if (getLexer().is(AsmToken::EndOfStatement))
+ break;
+
+ // FIXME: Improve diagnostic.
+ if (getLexer().isNot(AsmToken::Comma))
+ return Error(L, "unexpected token in directive");
+ Parser.Lex();
+ }
+ }
+
+ Parser.Lex();
+ return false;
+}
+
+/// ParseDirectiveCode
+/// ::= .code32 | .code64
+bool X86AsmParser::ParseDirectiveCode(StringRef IDVal, SMLoc L) {
+ if (IDVal == ".code32") {
+ Parser.Lex();
+ if (is64BitMode()) {
+ SwitchMode();
+ getParser().getStreamer().EmitAssemblerFlag(MCAF_Code32);
+ }
+ } else if (IDVal == ".code64") {
+ Parser.Lex();
+ if (!is64BitMode()) {
+ SwitchMode();
+ getParser().getStreamer().EmitAssemblerFlag(MCAF_Code64);
+ }
+ } else {
+ return Error(L, "unexpected directive " + IDVal);
+ }
+
+ return false;
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeX86AsmParser() {
+ RegisterMCAsmParser<X86AsmParser> X(TheX86_32Target);
+ RegisterMCAsmParser<X86AsmParser> Y(TheX86_64Target);
+}
+
+#define GET_REGISTER_MATCHER
+#define GET_MATCHER_IMPLEMENTATION
+#define GET_SUBTARGET_FEATURE_NAME
+#include "X86GenAsmMatcher.inc"
diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp b/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp
new file mode 100644
index 0000000..ca6f80c
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp
@@ -0,0 +1,814 @@
+//===-- X86Disassembler.cpp - Disassembler for x86 and x86_64 -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is part of the X86 Disassembler.
+// It contains code to translate the data produced by the decoder into
+// MCInsts.
+// Documentation for the disassembler can be found in X86Disassembler.h.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86Disassembler.h"
+#include "X86DisassemblerDecoder.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDisassembler.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MemoryObject.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define GET_REGINFO_ENUM
+#include "X86GenRegisterInfo.inc"
+#define GET_INSTRINFO_ENUM
+#include "X86GenInstrInfo.inc"
+
+using namespace llvm;
+using namespace llvm::X86Disassembler;
+
+void x86DisassemblerDebug(const char *file,
+ unsigned line,
+ const char *s) {
+ dbgs() << file << ":" << line << ": " << s;
+}
+
+const char *x86DisassemblerGetInstrName(unsigned Opcode, const void *mii) {
+ const MCInstrInfo *MII = static_cast<const MCInstrInfo *>(mii);
+ return MII->getName(Opcode);
+}
+
+#define debug(s) DEBUG(x86DisassemblerDebug(__FILE__, __LINE__, s));
+
+namespace llvm {
+
+// Fill-ins to make the compiler happy. These constants are never actually
+// assigned; they are just filler to make an automatically-generated switch
+// statement work.
+namespace X86 {
+ enum {
+ BX_SI = 500,
+ BX_DI = 501,
+ BP_SI = 502,
+ BP_DI = 503,
+ sib = 504,
+ sib64 = 505
+ };
+}
+
+extern Target TheX86_32Target, TheX86_64Target;
+
+}
+
+static bool translateInstruction(MCInst &target,
+ InternalInstruction &source,
+ const MCDisassembler *Dis);
+
+X86GenericDisassembler::X86GenericDisassembler(const MCSubtargetInfo &STI,
+ DisassemblerMode mode,
+ const MCInstrInfo *MII)
+ : MCDisassembler(STI), MII(MII), fMode(mode) {}
+
+X86GenericDisassembler::~X86GenericDisassembler() {
+ delete MII;
+}
+
+/// regionReader - a callback function that wraps the readByte method from
+/// MemoryObject.
+///
+/// @param arg - The generic callback parameter. In this case, this should
+/// be a pointer to a MemoryObject.
+/// @param byte - A pointer to the byte to be read.
+/// @param address - The address to be read.
+static int regionReader(const void* arg, uint8_t* byte, uint64_t address) {
+ const MemoryObject* region = static_cast<const MemoryObject*>(arg);
+ return region->readByte(address, byte);
+}
+
+/// logger - a callback function that wraps the operator<< method from
+/// raw_ostream.
+///
+/// @param arg - The generic callback parameter. This should be a pointe
+/// to a raw_ostream.
+/// @param log - A string to be logged. logger() adds a newline.
+static void logger(void* arg, const char* log) {
+ if (!arg)
+ return;
+
+ raw_ostream &vStream = *(static_cast<raw_ostream*>(arg));
+ vStream << log << "\n";
+}
+
+//
+// Public interface for the disassembler
+//
+
+MCDisassembler::DecodeStatus
+X86GenericDisassembler::getInstruction(MCInst &instr,
+ uint64_t &size,
+ const MemoryObject &region,
+ uint64_t address,
+ raw_ostream &vStream,
+ raw_ostream &cStream) const {
+ CommentStream = &cStream;
+
+ InternalInstruction internalInstr;
+
+ dlog_t loggerFn = logger;
+ if (&vStream == &nulls())
+ loggerFn = 0; // Disable logging completely if it's going to nulls().
+
+ int ret = decodeInstruction(&internalInstr,
+ regionReader,
+ (const void*)&region,
+ loggerFn,
+ (void*)&vStream,
+ (const void*)MII,
+ address,
+ fMode);
+
+ if (ret) {
+ size = internalInstr.readerCursor - address;
+ return Fail;
+ }
+ else {
+ size = internalInstr.length;
+ return (!translateInstruction(instr, internalInstr, this)) ?
+ Success : Fail;
+ }
+}
+
+//
+// Private code that translates from struct InternalInstructions to MCInsts.
+//
+
+/// translateRegister - Translates an internal register to the appropriate LLVM
+/// register, and appends it as an operand to an MCInst.
+///
+/// @param mcInst - The MCInst to append to.
+/// @param reg - The Reg to append.
+static void translateRegister(MCInst &mcInst, Reg reg) {
+#define ENTRY(x) X86::x,
+ uint8_t llvmRegnums[] = {
+ ALL_REGS
+ 0
+ };
+#undef ENTRY
+
+ uint8_t llvmRegnum = llvmRegnums[reg];
+ mcInst.addOperand(MCOperand::CreateReg(llvmRegnum));
+}
+
+/// tryAddingSymbolicOperand - trys to add a symbolic operand in place of the
+/// immediate Value in the MCInst.
+///
+/// @param Value - The immediate Value, has had any PC adjustment made by
+/// the caller.
+/// @param isBranch - If the instruction is a branch instruction
+/// @param Address - The starting address of the instruction
+/// @param Offset - The byte offset to this immediate in the instruction
+/// @param Width - The byte width of this immediate in the instruction
+///
+/// If the getOpInfo() function was set when setupForSymbolicDisassembly() was
+/// called then that function is called to get any symbolic information for the
+/// immediate in the instruction using the Address, Offset and Width. If that
+/// returns non-zero then the symbolic information it returns is used to create
+/// an MCExpr and that is added as an operand to the MCInst. If getOpInfo()
+/// returns zero and isBranch is true then a symbol look up for immediate Value
+/// is done and if a symbol is found an MCExpr is created with that, else
+/// an MCExpr with the immediate Value is created. This function returns true
+/// if it adds an operand to the MCInst and false otherwise.
+static bool tryAddingSymbolicOperand(int64_t Value, bool isBranch,
+ uint64_t Address, uint64_t Offset,
+ uint64_t Width, MCInst &MI,
+ const MCDisassembler *Dis) {
+ LLVMOpInfoCallback getOpInfo = Dis->getLLVMOpInfoCallback();
+ struct LLVMOpInfo1 SymbolicOp;
+ memset(&SymbolicOp, '\0', sizeof(struct LLVMOpInfo1));
+ SymbolicOp.Value = Value;
+ void *DisInfo = Dis->getDisInfoBlock();
+
+ if (!getOpInfo ||
+ !getOpInfo(DisInfo, Address, Offset, Width, 1, &SymbolicOp)) {
+ // Clear SymbolicOp.Value from above and also all other fields.
+ memset(&SymbolicOp, '\0', sizeof(struct LLVMOpInfo1));
+ LLVMSymbolLookupCallback SymbolLookUp = Dis->getLLVMSymbolLookupCallback();
+ if (!SymbolLookUp)
+ return false;
+ uint64_t ReferenceType;
+ if (isBranch)
+ ReferenceType = LLVMDisassembler_ReferenceType_In_Branch;
+ else
+ ReferenceType = LLVMDisassembler_ReferenceType_InOut_None;
+ const char *ReferenceName;
+ const char *Name = SymbolLookUp(DisInfo, Value, &ReferenceType, Address,
+ &ReferenceName);
+ if (Name) {
+ SymbolicOp.AddSymbol.Name = Name;
+ SymbolicOp.AddSymbol.Present = true;
+ }
+ // For branches always create an MCExpr so it gets printed as hex address.
+ else if (isBranch) {
+ SymbolicOp.Value = Value;
+ }
+ if(ReferenceType == LLVMDisassembler_ReferenceType_Out_SymbolStub)
+ (*Dis->CommentStream) << "symbol stub for: " << ReferenceName;
+ if (!Name && !isBranch)
+ return false;
+ }
+
+ MCContext *Ctx = Dis->getMCContext();
+ const MCExpr *Add = NULL;
+ if (SymbolicOp.AddSymbol.Present) {
+ if (SymbolicOp.AddSymbol.Name) {
+ StringRef Name(SymbolicOp.AddSymbol.Name);
+ MCSymbol *Sym = Ctx->GetOrCreateSymbol(Name);
+ Add = MCSymbolRefExpr::Create(Sym, *Ctx);
+ } else {
+ Add = MCConstantExpr::Create((int)SymbolicOp.AddSymbol.Value, *Ctx);
+ }
+ }
+
+ const MCExpr *Sub = NULL;
+ if (SymbolicOp.SubtractSymbol.Present) {
+ if (SymbolicOp.SubtractSymbol.Name) {
+ StringRef Name(SymbolicOp.SubtractSymbol.Name);
+ MCSymbol *Sym = Ctx->GetOrCreateSymbol(Name);
+ Sub = MCSymbolRefExpr::Create(Sym, *Ctx);
+ } else {
+ Sub = MCConstantExpr::Create((int)SymbolicOp.SubtractSymbol.Value, *Ctx);
+ }
+ }
+
+ const MCExpr *Off = NULL;
+ if (SymbolicOp.Value != 0)
+ Off = MCConstantExpr::Create(SymbolicOp.Value, *Ctx);
+
+ const MCExpr *Expr;
+ if (Sub) {
+ const MCExpr *LHS;
+ if (Add)
+ LHS = MCBinaryExpr::CreateSub(Add, Sub, *Ctx);
+ else
+ LHS = MCUnaryExpr::CreateMinus(Sub, *Ctx);
+ if (Off != 0)
+ Expr = MCBinaryExpr::CreateAdd(LHS, Off, *Ctx);
+ else
+ Expr = LHS;
+ } else if (Add) {
+ if (Off != 0)
+ Expr = MCBinaryExpr::CreateAdd(Add, Off, *Ctx);
+ else
+ Expr = Add;
+ } else {
+ if (Off != 0)
+ Expr = Off;
+ else
+ Expr = MCConstantExpr::Create(0, *Ctx);
+ }
+
+ MI.addOperand(MCOperand::CreateExpr(Expr));
+
+ return true;
+}
+
+/// tryAddingPcLoadReferenceComment - trys to add a comment as to what is being
+/// referenced by a load instruction with the base register that is the rip.
+/// These can often be addresses in a literal pool. The Address of the
+/// instruction and its immediate Value are used to determine the address
+/// being referenced in the literal pool entry. The SymbolLookUp call back will
+/// return a pointer to a literal 'C' string if the referenced address is an
+/// address into a section with 'C' string literals.
+static void tryAddingPcLoadReferenceComment(uint64_t Address, uint64_t Value,
+ const void *Decoder) {
+ const MCDisassembler *Dis = static_cast<const MCDisassembler*>(Decoder);
+ LLVMSymbolLookupCallback SymbolLookUp = Dis->getLLVMSymbolLookupCallback();
+ if (SymbolLookUp) {
+ void *DisInfo = Dis->getDisInfoBlock();
+ uint64_t ReferenceType = LLVMDisassembler_ReferenceType_In_PCrel_Load;
+ const char *ReferenceName;
+ (void)SymbolLookUp(DisInfo, Value, &ReferenceType, Address, &ReferenceName);
+ if(ReferenceType == LLVMDisassembler_ReferenceType_Out_LitPool_CstrAddr)
+ (*Dis->CommentStream) << "literal pool for: " << ReferenceName;
+ }
+}
+
+/// translateImmediate - Appends an immediate operand to an MCInst.
+///
+/// @param mcInst - The MCInst to append to.
+/// @param immediate - The immediate value to append.
+/// @param operand - The operand, as stored in the descriptor table.
+/// @param insn - The internal instruction.
+static void translateImmediate(MCInst &mcInst, uint64_t immediate,
+ const OperandSpecifier &operand,
+ InternalInstruction &insn,
+ const MCDisassembler *Dis) {
+ // Sign-extend the immediate if necessary.
+
+ OperandType type = (OperandType)operand.type;
+
+ bool isBranch = false;
+ uint64_t pcrel = 0;
+ if (type == TYPE_RELv) {
+ isBranch = true;
+ pcrel = insn.startLocation +
+ insn.immediateOffset + insn.immediateSize;
+ switch (insn.displacementSize) {
+ default:
+ break;
+ case 1:
+ type = TYPE_MOFFS8;
+ break;
+ case 2:
+ type = TYPE_MOFFS16;
+ break;
+ case 4:
+ type = TYPE_MOFFS32;
+ break;
+ case 8:
+ type = TYPE_MOFFS64;
+ break;
+ }
+ }
+ // By default sign-extend all X86 immediates based on their encoding.
+ else if (type == TYPE_IMM8 || type == TYPE_IMM16 || type == TYPE_IMM32 ||
+ type == TYPE_IMM64) {
+ uint32_t Opcode = mcInst.getOpcode();
+ switch (operand.encoding) {
+ default:
+ break;
+ case ENCODING_IB:
+ // Special case those X86 instructions that use the imm8 as a set of
+ // bits, bit count, etc. and are not sign-extend.
+ if (Opcode != X86::BLENDPSrri && Opcode != X86::BLENDPDrri &&
+ Opcode != X86::PBLENDWrri && Opcode != X86::MPSADBWrri &&
+ Opcode != X86::DPPSrri && Opcode != X86::DPPDrri &&
+ Opcode != X86::INSERTPSrr && Opcode != X86::VBLENDPSYrri &&
+ Opcode != X86::VBLENDPSYrmi && Opcode != X86::VBLENDPDYrri &&
+ Opcode != X86::VBLENDPDYrmi && Opcode != X86::VPBLENDWrri &&
+ Opcode != X86::VMPSADBWrri && Opcode != X86::VDPPSYrri &&
+ Opcode != X86::VDPPSYrmi && Opcode != X86::VDPPDrri &&
+ Opcode != X86::VINSERTPSrr)
+ type = TYPE_MOFFS8;
+ break;
+ case ENCODING_IW:
+ type = TYPE_MOFFS16;
+ break;
+ case ENCODING_ID:
+ type = TYPE_MOFFS32;
+ break;
+ case ENCODING_IO:
+ type = TYPE_MOFFS64;
+ break;
+ }
+ }
+
+ switch (type) {
+ case TYPE_XMM32:
+ case TYPE_XMM64:
+ case TYPE_XMM128:
+ mcInst.addOperand(MCOperand::CreateReg(X86::XMM0 + (immediate >> 4)));
+ return;
+ case TYPE_XMM256:
+ mcInst.addOperand(MCOperand::CreateReg(X86::YMM0 + (immediate >> 4)));
+ return;
+ case TYPE_REL8:
+ isBranch = true;
+ pcrel = insn.startLocation + insn.immediateOffset + insn.immediateSize;
+ // fall through to sign extend the immediate if needed.
+ case TYPE_MOFFS8:
+ if(immediate & 0x80)
+ immediate |= ~(0xffull);
+ break;
+ case TYPE_MOFFS16:
+ if(immediate & 0x8000)
+ immediate |= ~(0xffffull);
+ break;
+ case TYPE_REL32:
+ case TYPE_REL64:
+ isBranch = true;
+ pcrel = insn.startLocation + insn.immediateOffset + insn.immediateSize;
+ // fall through to sign extend the immediate if needed.
+ case TYPE_MOFFS32:
+ if(immediate & 0x80000000)
+ immediate |= ~(0xffffffffull);
+ break;
+ case TYPE_MOFFS64:
+ default:
+ // operand is 64 bits wide. Do nothing.
+ break;
+ }
+
+ if(!tryAddingSymbolicOperand(immediate + pcrel, isBranch, insn.startLocation,
+ insn.immediateOffset, insn.immediateSize,
+ mcInst, Dis))
+ mcInst.addOperand(MCOperand::CreateImm(immediate));
+}
+
+/// translateRMRegister - Translates a register stored in the R/M field of the
+/// ModR/M byte to its LLVM equivalent and appends it to an MCInst.
+/// @param mcInst - The MCInst to append to.
+/// @param insn - The internal instruction to extract the R/M field
+/// from.
+/// @return - 0 on success; -1 otherwise
+static bool translateRMRegister(MCInst &mcInst,
+ InternalInstruction &insn) {
+ if (insn.eaBase == EA_BASE_sib || insn.eaBase == EA_BASE_sib64) {
+ debug("A R/M register operand may not have a SIB byte");
+ return true;
+ }
+
+ switch (insn.eaBase) {
+ default:
+ debug("Unexpected EA base register");
+ return true;
+ case EA_BASE_NONE:
+ debug("EA_BASE_NONE for ModR/M base");
+ return true;
+#define ENTRY(x) case EA_BASE_##x:
+ ALL_EA_BASES
+#undef ENTRY
+ debug("A R/M register operand may not have a base; "
+ "the operand must be a register.");
+ return true;
+#define ENTRY(x) \
+ case EA_REG_##x: \
+ mcInst.addOperand(MCOperand::CreateReg(X86::x)); break;
+ ALL_REGS
+#undef ENTRY
+ }
+
+ return false;
+}
+
+/// translateRMMemory - Translates a memory operand stored in the Mod and R/M
+/// fields of an internal instruction (and possibly its SIB byte) to a memory
+/// operand in LLVM's format, and appends it to an MCInst.
+///
+/// @param mcInst - The MCInst to append to.
+/// @param insn - The instruction to extract Mod, R/M, and SIB fields
+/// from.
+/// @return - 0 on success; nonzero otherwise
+static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn,
+ const MCDisassembler *Dis) {
+ // Addresses in an MCInst are represented as five operands:
+ // 1. basereg (register) The R/M base, or (if there is a SIB) the
+ // SIB base
+ // 2. scaleamount (immediate) 1, or (if there is a SIB) the specified
+ // scale amount
+ // 3. indexreg (register) x86_registerNONE, or (if there is a SIB)
+ // the index (which is multiplied by the
+ // scale amount)
+ // 4. displacement (immediate) 0, or the displacement if there is one
+ // 5. segmentreg (register) x86_registerNONE for now, but could be set
+ // if we have segment overrides
+
+ MCOperand baseReg;
+ MCOperand scaleAmount;
+ MCOperand indexReg;
+ MCOperand displacement;
+ MCOperand segmentReg;
+ uint64_t pcrel = 0;
+
+ if (insn.eaBase == EA_BASE_sib || insn.eaBase == EA_BASE_sib64) {
+ if (insn.sibBase != SIB_BASE_NONE) {
+ switch (insn.sibBase) {
+ default:
+ debug("Unexpected sibBase");
+ return true;
+#define ENTRY(x) \
+ case SIB_BASE_##x: \
+ baseReg = MCOperand::CreateReg(X86::x); break;
+ ALL_SIB_BASES
+#undef ENTRY
+ }
+ } else {
+ baseReg = MCOperand::CreateReg(0);
+ }
+
+ // Check whether we are handling VSIB addressing mode for GATHER.
+ // If sibIndex was set to SIB_INDEX_NONE, index offset is 4 and
+ // we should use SIB_INDEX_XMM4|YMM4 for VSIB.
+ // I don't see a way to get the correct IndexReg in readSIB:
+ // We can tell whether it is VSIB or SIB after instruction ID is decoded,
+ // but instruction ID may not be decoded yet when calling readSIB.
+ uint32_t Opcode = mcInst.getOpcode();
+ bool IndexIs128 = (Opcode == X86::VGATHERDPDrm ||
+ Opcode == X86::VGATHERDPDYrm ||
+ Opcode == X86::VGATHERQPDrm ||
+ Opcode == X86::VGATHERDPSrm ||
+ Opcode == X86::VGATHERQPSrm ||
+ Opcode == X86::VPGATHERDQrm ||
+ Opcode == X86::VPGATHERDQYrm ||
+ Opcode == X86::VPGATHERQQrm ||
+ Opcode == X86::VPGATHERDDrm ||
+ Opcode == X86::VPGATHERQDrm);
+ bool IndexIs256 = (Opcode == X86::VGATHERQPDYrm ||
+ Opcode == X86::VGATHERDPSYrm ||
+ Opcode == X86::VGATHERQPSYrm ||
+ Opcode == X86::VPGATHERQQYrm ||
+ Opcode == X86::VPGATHERDDYrm ||
+ Opcode == X86::VPGATHERQDYrm);
+ if (IndexIs128 || IndexIs256) {
+ unsigned IndexOffset = insn.sibIndex -
+ (insn.addressSize == 8 ? SIB_INDEX_RAX:SIB_INDEX_EAX);
+ SIBIndex IndexBase = IndexIs256 ? SIB_INDEX_YMM0 : SIB_INDEX_XMM0;
+ insn.sibIndex = (SIBIndex)(IndexBase +
+ (insn.sibIndex == SIB_INDEX_NONE ? 4 : IndexOffset));
+ }
+
+ if (insn.sibIndex != SIB_INDEX_NONE) {
+ switch (insn.sibIndex) {
+ default:
+ debug("Unexpected sibIndex");
+ return true;
+#define ENTRY(x) \
+ case SIB_INDEX_##x: \
+ indexReg = MCOperand::CreateReg(X86::x); break;
+ EA_BASES_32BIT
+ EA_BASES_64BIT
+ REGS_XMM
+ REGS_YMM
+#undef ENTRY
+ }
+ } else {
+ indexReg = MCOperand::CreateReg(0);
+ }
+
+ scaleAmount = MCOperand::CreateImm(insn.sibScale);
+ } else {
+ switch (insn.eaBase) {
+ case EA_BASE_NONE:
+ if (insn.eaDisplacement == EA_DISP_NONE) {
+ debug("EA_BASE_NONE and EA_DISP_NONE for ModR/M base");
+ return true;
+ }
+ if (insn.mode == MODE_64BIT){
+ pcrel = insn.startLocation +
+ insn.displacementOffset + insn.displacementSize;
+ tryAddingPcLoadReferenceComment(insn.startLocation +
+ insn.displacementOffset,
+ insn.displacement + pcrel, Dis);
+ baseReg = MCOperand::CreateReg(X86::RIP); // Section 2.2.1.6
+ }
+ else
+ baseReg = MCOperand::CreateReg(0);
+
+ indexReg = MCOperand::CreateReg(0);
+ break;
+ case EA_BASE_BX_SI:
+ baseReg = MCOperand::CreateReg(X86::BX);
+ indexReg = MCOperand::CreateReg(X86::SI);
+ break;
+ case EA_BASE_BX_DI:
+ baseReg = MCOperand::CreateReg(X86::BX);
+ indexReg = MCOperand::CreateReg(X86::DI);
+ break;
+ case EA_BASE_BP_SI:
+ baseReg = MCOperand::CreateReg(X86::BP);
+ indexReg = MCOperand::CreateReg(X86::SI);
+ break;
+ case EA_BASE_BP_DI:
+ baseReg = MCOperand::CreateReg(X86::BP);
+ indexReg = MCOperand::CreateReg(X86::DI);
+ break;
+ default:
+ indexReg = MCOperand::CreateReg(0);
+ switch (insn.eaBase) {
+ default:
+ debug("Unexpected eaBase");
+ return true;
+ // Here, we will use the fill-ins defined above. However,
+ // BX_SI, BX_DI, BP_SI, and BP_DI are all handled above and
+ // sib and sib64 were handled in the top-level if, so they're only
+ // placeholders to keep the compiler happy.
+#define ENTRY(x) \
+ case EA_BASE_##x: \
+ baseReg = MCOperand::CreateReg(X86::x); break;
+ ALL_EA_BASES
+#undef ENTRY
+#define ENTRY(x) case EA_REG_##x:
+ ALL_REGS
+#undef ENTRY
+ debug("A R/M memory operand may not be a register; "
+ "the base field must be a base.");
+ return true;
+ }
+ }
+
+ scaleAmount = MCOperand::CreateImm(1);
+ }
+
+ displacement = MCOperand::CreateImm(insn.displacement);
+
+ static const uint8_t segmentRegnums[SEG_OVERRIDE_max] = {
+ 0, // SEG_OVERRIDE_NONE
+ X86::CS,
+ X86::SS,
+ X86::DS,
+ X86::ES,
+ X86::FS,
+ X86::GS
+ };
+
+ segmentReg = MCOperand::CreateReg(segmentRegnums[insn.segmentOverride]);
+
+ mcInst.addOperand(baseReg);
+ mcInst.addOperand(scaleAmount);
+ mcInst.addOperand(indexReg);
+ if(!tryAddingSymbolicOperand(insn.displacement + pcrel, false,
+ insn.startLocation, insn.displacementOffset,
+ insn.displacementSize, mcInst, Dis))
+ mcInst.addOperand(displacement);
+ mcInst.addOperand(segmentReg);
+ return false;
+}
+
+/// translateRM - Translates an operand stored in the R/M (and possibly SIB)
+/// byte of an instruction to LLVM form, and appends it to an MCInst.
+///
+/// @param mcInst - The MCInst to append to.
+/// @param operand - The operand, as stored in the descriptor table.
+/// @param insn - The instruction to extract Mod, R/M, and SIB fields
+/// from.
+/// @return - 0 on success; nonzero otherwise
+static bool translateRM(MCInst &mcInst, const OperandSpecifier &operand,
+ InternalInstruction &insn, const MCDisassembler *Dis) {
+ switch (operand.type) {
+ default:
+ debug("Unexpected type for a R/M operand");
+ return true;
+ case TYPE_R8:
+ case TYPE_R16:
+ case TYPE_R32:
+ case TYPE_R64:
+ case TYPE_Rv:
+ case TYPE_MM:
+ case TYPE_MM32:
+ case TYPE_MM64:
+ case TYPE_XMM:
+ case TYPE_XMM32:
+ case TYPE_XMM64:
+ case TYPE_XMM128:
+ case TYPE_XMM256:
+ case TYPE_DEBUGREG:
+ case TYPE_CONTROLREG:
+ return translateRMRegister(mcInst, insn);
+ case TYPE_M:
+ case TYPE_M8:
+ case TYPE_M16:
+ case TYPE_M32:
+ case TYPE_M64:
+ case TYPE_M128:
+ case TYPE_M256:
+ case TYPE_M512:
+ case TYPE_Mv:
+ case TYPE_M32FP:
+ case TYPE_M64FP:
+ case TYPE_M80FP:
+ case TYPE_M16INT:
+ case TYPE_M32INT:
+ case TYPE_M64INT:
+ case TYPE_M1616:
+ case TYPE_M1632:
+ case TYPE_M1664:
+ case TYPE_LEA:
+ return translateRMMemory(mcInst, insn, Dis);
+ }
+}
+
+/// translateFPRegister - Translates a stack position on the FPU stack to its
+/// LLVM form, and appends it to an MCInst.
+///
+/// @param mcInst - The MCInst to append to.
+/// @param stackPos - The stack position to translate.
+/// @return - 0 on success; nonzero otherwise.
+static bool translateFPRegister(MCInst &mcInst,
+ uint8_t stackPos) {
+ if (stackPos >= 8) {
+ debug("Invalid FP stack position");
+ return true;
+ }
+
+ mcInst.addOperand(MCOperand::CreateReg(X86::ST0 + stackPos));
+
+ return false;
+}
+
+/// translateOperand - Translates an operand stored in an internal instruction
+/// to LLVM's format and appends it to an MCInst.
+///
+/// @param mcInst - The MCInst to append to.
+/// @param operand - The operand, as stored in the descriptor table.
+/// @param insn - The internal instruction.
+/// @return - false on success; true otherwise.
+static bool translateOperand(MCInst &mcInst, const OperandSpecifier &operand,
+ InternalInstruction &insn,
+ const MCDisassembler *Dis) {
+ switch (operand.encoding) {
+ default:
+ debug("Unhandled operand encoding during translation");
+ return true;
+ case ENCODING_REG:
+ translateRegister(mcInst, insn.reg);
+ return false;
+ case ENCODING_RM:
+ return translateRM(mcInst, operand, insn, Dis);
+ case ENCODING_CB:
+ case ENCODING_CW:
+ case ENCODING_CD:
+ case ENCODING_CP:
+ case ENCODING_CO:
+ case ENCODING_CT:
+ debug("Translation of code offsets isn't supported.");
+ return true;
+ case ENCODING_IB:
+ case ENCODING_IW:
+ case ENCODING_ID:
+ case ENCODING_IO:
+ case ENCODING_Iv:
+ case ENCODING_Ia:
+ translateImmediate(mcInst,
+ insn.immediates[insn.numImmediatesTranslated++],
+ operand,
+ insn,
+ Dis);
+ return false;
+ case ENCODING_RB:
+ case ENCODING_RW:
+ case ENCODING_RD:
+ case ENCODING_RO:
+ translateRegister(mcInst, insn.opcodeRegister);
+ return false;
+ case ENCODING_I:
+ return translateFPRegister(mcInst, insn.opcodeModifier);
+ case ENCODING_Rv:
+ translateRegister(mcInst, insn.opcodeRegister);
+ return false;
+ case ENCODING_VVVV:
+ translateRegister(mcInst, insn.vvvv);
+ return false;
+ case ENCODING_DUP:
+ return translateOperand(mcInst, insn.operands[operand.type - TYPE_DUP0],
+ insn, Dis);
+ }
+}
+
+/// translateInstruction - Translates an internal instruction and all its
+/// operands to an MCInst.
+///
+/// @param mcInst - The MCInst to populate with the instruction's data.
+/// @param insn - The internal instruction.
+/// @return - false on success; true otherwise.
+static bool translateInstruction(MCInst &mcInst,
+ InternalInstruction &insn,
+ const MCDisassembler *Dis) {
+ if (!insn.spec) {
+ debug("Instruction has no specification");
+ return true;
+ }
+
+ mcInst.setOpcode(insn.instructionID);
+
+ int index;
+
+ insn.numImmediatesTranslated = 0;
+
+ for (index = 0; index < X86_MAX_OPERANDS; ++index) {
+ if (insn.operands[index].encoding != ENCODING_NONE) {
+ if (translateOperand(mcInst, insn.operands[index], insn, Dis)) {
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
+
+static MCDisassembler *createX86_32Disassembler(const Target &T,
+ const MCSubtargetInfo &STI) {
+ return new X86Disassembler::X86GenericDisassembler(STI, MODE_32BIT,
+ T.createMCInstrInfo());
+}
+
+static MCDisassembler *createX86_64Disassembler(const Target &T,
+ const MCSubtargetInfo &STI) {
+ return new X86Disassembler::X86GenericDisassembler(STI, MODE_64BIT,
+ T.createMCInstrInfo());
+}
+
+extern "C" void LLVMInitializeX86Disassembler() {
+ // Register the disassembler.
+ TargetRegistry::RegisterMCDisassembler(TheX86_32Target,
+ createX86_32Disassembler);
+ TargetRegistry::RegisterMCDisassembler(TheX86_64Target,
+ createX86_64Disassembler);
+}
diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.h b/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.h
new file mode 100644
index 0000000..b92427a
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.h
@@ -0,0 +1,131 @@
+//===-- X86Disassembler.h - Disassembler for x86 and x86_64 -----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// The X86 disassembler is a table-driven disassembler for the 16-, 32-, and
+// 64-bit X86 instruction sets. The main decode sequence for an assembly
+// instruction in this disassembler is:
+//
+// 1. Read the prefix bytes and determine the attributes of the instruction.
+// These attributes, recorded in enum attributeBits
+// (X86DisassemblerDecoderCommon.h), form a bitmask. The table CONTEXTS_SYM
+// provides a mapping from bitmasks to contexts, which are represented by
+// enum InstructionContext (ibid.).
+//
+// 2. Read the opcode, and determine what kind of opcode it is. The
+// disassembler distinguishes four kinds of opcodes, which are enumerated in
+// OpcodeType (X86DisassemblerDecoderCommon.h): one-byte (0xnn), two-byte
+// (0x0f 0xnn), three-byte-38 (0x0f 0x38 0xnn), or three-byte-3a
+// (0x0f 0x3a 0xnn). Mandatory prefixes are treated as part of the context.
+//
+// 3. Depending on the opcode type, look in one of four ClassDecision structures
+// (X86DisassemblerDecoderCommon.h). Use the opcode class to determine which
+// OpcodeDecision (ibid.) to look the opcode in. Look up the opcode, to get
+// a ModRMDecision (ibid.).
+//
+// 4. Some instructions, such as escape opcodes or extended opcodes, or even
+// instructions that have ModRM*Reg / ModRM*Mem forms in LLVM, need the
+// ModR/M byte to complete decode. The ModRMDecision's type is an entry from
+// ModRMDecisionType (X86DisassemblerDecoderCommon.h) that indicates if the
+// ModR/M byte is required and how to interpret it.
+//
+// 5. After resolving the ModRMDecision, the disassembler has a unique ID
+// of type InstrUID (X86DisassemblerDecoderCommon.h). Looking this ID up in
+// INSTRUCTIONS_SYM yields the name of the instruction and the encodings and
+// meanings of its operands.
+//
+// 6. For each operand, its encoding is an entry from OperandEncoding
+// (X86DisassemblerDecoderCommon.h) and its type is an entry from
+// OperandType (ibid.). The encoding indicates how to read it from the
+// instruction; the type indicates how to interpret the value once it has
+// been read. For example, a register operand could be stored in the R/M
+// field of the ModR/M byte, the REG field of the ModR/M byte, or added to
+// the main opcode. This is orthogonal from its meaning (an GPR or an XMM
+// register, for instance). Given this information, the operands can be
+// extracted and interpreted.
+//
+// 7. As the last step, the disassembler translates the instruction information
+// and operands into a format understandable by the client - in this case, an
+// MCInst for use by the MC infrastructure.
+//
+// The disassembler is broken broadly into two parts: the table emitter that
+// emits the instruction decode tables discussed above during compilation, and
+// the disassembler itself. The table emitter is documented in more detail in
+// utils/TableGen/X86DisassemblerEmitter.h.
+//
+// X86Disassembler.h contains the public interface for the disassembler,
+// adhering to the MCDisassembler interface.
+// X86Disassembler.cpp contains the code responsible for step 7, and for
+// invoking the decoder to execute steps 1-6.
+// X86DisassemblerDecoderCommon.h contains the definitions needed by both the
+// table emitter and the disassembler.
+// X86DisassemblerDecoder.h contains the public interface of the decoder,
+// factored out into C for possible use by other projects.
+// X86DisassemblerDecoder.c contains the source code of the decoder, which is
+// responsible for steps 1-6.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86DISASSEMBLER_H
+#define X86DISASSEMBLER_H
+
+#define INSTRUCTION_SPECIFIER_FIELDS \
+ uint16_t operands;
+
+#define INSTRUCTION_IDS \
+ uint16_t instructionIDs;
+
+#include "X86DisassemblerDecoderCommon.h"
+
+#undef INSTRUCTION_SPECIFIER_FIELDS
+#undef INSTRUCTION_IDS
+
+#include "llvm/MC/MCDisassembler.h"
+
+namespace llvm {
+
+class MCInst;
+class MCInstrInfo;
+class MCSubtargetInfo;
+class MemoryObject;
+class raw_ostream;
+
+namespace X86Disassembler {
+
+/// X86GenericDisassembler - Generic disassembler for all X86 platforms.
+/// All each platform class should have to do is subclass the constructor, and
+/// provide a different disassemblerMode value.
+class X86GenericDisassembler : public MCDisassembler {
+ const MCInstrInfo *MII;
+public:
+ /// Constructor - Initializes the disassembler.
+ ///
+ /// @param mode - The X86 architecture mode to decode for.
+ X86GenericDisassembler(const MCSubtargetInfo &STI, DisassemblerMode mode,
+ const MCInstrInfo *MII);
+private:
+ ~X86GenericDisassembler();
+public:
+
+ /// getInstruction - See MCDisassembler.
+ DecodeStatus getInstruction(MCInst &instr,
+ uint64_t &size,
+ const MemoryObject &region,
+ uint64_t address,
+ raw_ostream &vStream,
+ raw_ostream &cStream) const;
+
+private:
+ DisassemblerMode fMode;
+};
+
+} // namespace X86Disassembler
+
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c
new file mode 100644
index 0000000..e40edba
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c
@@ -0,0 +1,1676 @@
+/*===-- X86DisassemblerDecoder.c - Disassembler decoder ------------*- C -*-===*
+ *
+ * The LLVM Compiler Infrastructure
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *
+ *===----------------------------------------------------------------------===*
+ *
+ * This file is part of the X86 Disassembler.
+ * It contains the implementation of the instruction decoder.
+ * Documentation for the disassembler can be found in X86Disassembler.h.
+ *
+ *===----------------------------------------------------------------------===*/
+
+#include <stdarg.h> /* for va_*() */
+#include <stdio.h> /* for vsnprintf() */
+#include <stdlib.h> /* for exit() */
+#include <string.h> /* for memset() */
+
+#include "X86DisassemblerDecoder.h"
+
+#include "X86GenDisassemblerTables.inc"
+
+#define TRUE 1
+#define FALSE 0
+
+typedef int8_t bool;
+
+#ifndef NDEBUG
+#define debug(s) do { x86DisassemblerDebug(__FILE__, __LINE__, s); } while (0)
+#else
+#define debug(s) do { } while (0)
+#endif
+
+
+/*
+ * contextForAttrs - Client for the instruction context table. Takes a set of
+ * attributes and returns the appropriate decode context.
+ *
+ * @param attrMask - Attributes, from the enumeration attributeBits.
+ * @return - The InstructionContext to use when looking up an
+ * an instruction with these attributes.
+ */
+static InstructionContext contextForAttrs(uint8_t attrMask) {
+ return CONTEXTS_SYM[attrMask];
+}
+
+/*
+ * modRMRequired - Reads the appropriate instruction table to determine whether
+ * the ModR/M byte is required to decode a particular instruction.
+ *
+ * @param type - The opcode type (i.e., how many bytes it has).
+ * @param insnContext - The context for the instruction, as returned by
+ * contextForAttrs.
+ * @param opcode - The last byte of the instruction's opcode, not counting
+ * ModR/M extensions and escapes.
+ * @return - TRUE if the ModR/M byte is required, FALSE otherwise.
+ */
+static int modRMRequired(OpcodeType type,
+ InstructionContext insnContext,
+ uint8_t opcode) {
+ const struct ContextDecision* decision = 0;
+
+ switch (type) {
+ case ONEBYTE:
+ decision = &ONEBYTE_SYM;
+ break;
+ case TWOBYTE:
+ decision = &TWOBYTE_SYM;
+ break;
+ case THREEBYTE_38:
+ decision = &THREEBYTE38_SYM;
+ break;
+ case THREEBYTE_3A:
+ decision = &THREEBYTE3A_SYM;
+ break;
+ case THREEBYTE_A6:
+ decision = &THREEBYTEA6_SYM;
+ break;
+ case THREEBYTE_A7:
+ decision = &THREEBYTEA7_SYM;
+ break;
+ }
+
+ return decision->opcodeDecisions[insnContext].modRMDecisions[opcode].
+ modrm_type != MODRM_ONEENTRY;
+}
+
+/*
+ * decode - Reads the appropriate instruction table to obtain the unique ID of
+ * an instruction.
+ *
+ * @param type - See modRMRequired().
+ * @param insnContext - See modRMRequired().
+ * @param opcode - See modRMRequired().
+ * @param modRM - The ModR/M byte if required, or any value if not.
+ * @return - The UID of the instruction, or 0 on failure.
+ */
+static InstrUID decode(OpcodeType type,
+ InstructionContext insnContext,
+ uint8_t opcode,
+ uint8_t modRM) {
+ const struct ModRMDecision* dec = 0;
+
+ switch (type) {
+ case ONEBYTE:
+ dec = &ONEBYTE_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
+ break;
+ case TWOBYTE:
+ dec = &TWOBYTE_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
+ break;
+ case THREEBYTE_38:
+ dec = &THREEBYTE38_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
+ break;
+ case THREEBYTE_3A:
+ dec = &THREEBYTE3A_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
+ break;
+ case THREEBYTE_A6:
+ dec = &THREEBYTEA6_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
+ break;
+ case THREEBYTE_A7:
+ dec = &THREEBYTEA7_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
+ break;
+ }
+
+ switch (dec->modrm_type) {
+ default:
+ debug("Corrupt table! Unknown modrm_type");
+ return 0;
+ case MODRM_ONEENTRY:
+ return modRMTable[dec->instructionIDs];
+ case MODRM_SPLITRM:
+ if (modFromModRM(modRM) == 0x3)
+ return modRMTable[dec->instructionIDs+1];
+ return modRMTable[dec->instructionIDs];
+ case MODRM_SPLITREG:
+ if (modFromModRM(modRM) == 0x3)
+ return modRMTable[dec->instructionIDs+((modRM & 0x38) >> 3)+8];
+ return modRMTable[dec->instructionIDs+((modRM & 0x38) >> 3)];
+ case MODRM_SPLITMISC:
+ if (modFromModRM(modRM) == 0x3)
+ return modRMTable[dec->instructionIDs+(modRM & 0x3f)+8];
+ return modRMTable[dec->instructionIDs+((modRM & 0x38) >> 3)];
+ case MODRM_FULL:
+ return modRMTable[dec->instructionIDs+modRM];
+ }
+}
+
+/*
+ * specifierForUID - Given a UID, returns the name and operand specification for
+ * that instruction.
+ *
+ * @param uid - The unique ID for the instruction. This should be returned by
+ * decode(); specifierForUID will not check bounds.
+ * @return - A pointer to the specification for that instruction.
+ */
+static const struct InstructionSpecifier *specifierForUID(InstrUID uid) {
+ return &INSTRUCTIONS_SYM[uid];
+}
+
+/*
+ * consumeByte - Uses the reader function provided by the user to consume one
+ * byte from the instruction's memory and advance the cursor.
+ *
+ * @param insn - The instruction with the reader function to use. The cursor
+ * for this instruction is advanced.
+ * @param byte - A pointer to a pre-allocated memory buffer to be populated
+ * with the data read.
+ * @return - 0 if the read was successful; nonzero otherwise.
+ */
+static int consumeByte(struct InternalInstruction* insn, uint8_t* byte) {
+ int ret = insn->reader(insn->readerArg, byte, insn->readerCursor);
+
+ if (!ret)
+ ++(insn->readerCursor);
+
+ return ret;
+}
+
+/*
+ * lookAtByte - Like consumeByte, but does not advance the cursor.
+ *
+ * @param insn - See consumeByte().
+ * @param byte - See consumeByte().
+ * @return - See consumeByte().
+ */
+static int lookAtByte(struct InternalInstruction* insn, uint8_t* byte) {
+ return insn->reader(insn->readerArg, byte, insn->readerCursor);
+}
+
+static void unconsumeByte(struct InternalInstruction* insn) {
+ insn->readerCursor--;
+}
+
+#define CONSUME_FUNC(name, type) \
+ static int name(struct InternalInstruction* insn, type* ptr) { \
+ type combined = 0; \
+ unsigned offset; \
+ for (offset = 0; offset < sizeof(type); ++offset) { \
+ uint8_t byte; \
+ int ret = insn->reader(insn->readerArg, \
+ &byte, \
+ insn->readerCursor + offset); \
+ if (ret) \
+ return ret; \
+ combined = combined | ((uint64_t)byte << (offset * 8)); \
+ } \
+ *ptr = combined; \
+ insn->readerCursor += sizeof(type); \
+ return 0; \
+ }
+
+/*
+ * consume* - Use the reader function provided by the user to consume data
+ * values of various sizes from the instruction's memory and advance the
+ * cursor appropriately. These readers perform endian conversion.
+ *
+ * @param insn - See consumeByte().
+ * @param ptr - A pointer to a pre-allocated memory of appropriate size to
+ * be populated with the data read.
+ * @return - See consumeByte().
+ */
+CONSUME_FUNC(consumeInt8, int8_t)
+CONSUME_FUNC(consumeInt16, int16_t)
+CONSUME_FUNC(consumeInt32, int32_t)
+CONSUME_FUNC(consumeUInt16, uint16_t)
+CONSUME_FUNC(consumeUInt32, uint32_t)
+CONSUME_FUNC(consumeUInt64, uint64_t)
+
+/*
+ * dbgprintf - Uses the logging function provided by the user to log a single
+ * message, typically without a carriage-return.
+ *
+ * @param insn - The instruction containing the logging function.
+ * @param format - See printf().
+ * @param ... - See printf().
+ */
+static void dbgprintf(struct InternalInstruction* insn,
+ const char* format,
+ ...) {
+ char buffer[256];
+ va_list ap;
+
+ if (!insn->dlog)
+ return;
+
+ va_start(ap, format);
+ (void)vsnprintf(buffer, sizeof(buffer), format, ap);
+ va_end(ap);
+
+ insn->dlog(insn->dlogArg, buffer);
+
+ return;
+}
+
+/*
+ * setPrefixPresent - Marks that a particular prefix is present at a particular
+ * location.
+ *
+ * @param insn - The instruction to be marked as having the prefix.
+ * @param prefix - The prefix that is present.
+ * @param location - The location where the prefix is located (in the address
+ * space of the instruction's reader).
+ */
+static void setPrefixPresent(struct InternalInstruction* insn,
+ uint8_t prefix,
+ uint64_t location)
+{
+ insn->prefixPresent[prefix] = 1;
+ insn->prefixLocations[prefix] = location;
+}
+
+/*
+ * isPrefixAtLocation - Queries an instruction to determine whether a prefix is
+ * present at a given location.
+ *
+ * @param insn - The instruction to be queried.
+ * @param prefix - The prefix.
+ * @param location - The location to query.
+ * @return - Whether the prefix is at that location.
+ */
+static BOOL isPrefixAtLocation(struct InternalInstruction* insn,
+ uint8_t prefix,
+ uint64_t location)
+{
+ if (insn->prefixPresent[prefix] == 1 &&
+ insn->prefixLocations[prefix] == location)
+ return TRUE;
+ else
+ return FALSE;
+}
+
+/*
+ * readPrefixes - Consumes all of an instruction's prefix bytes, and marks the
+ * instruction as having them. Also sets the instruction's default operand,
+ * address, and other relevant data sizes to report operands correctly.
+ *
+ * @param insn - The instruction whose prefixes are to be read.
+ * @return - 0 if the instruction could be read until the end of the prefix
+ * bytes, and no prefixes conflicted; nonzero otherwise.
+ */
+static int readPrefixes(struct InternalInstruction* insn) {
+ BOOL isPrefix = TRUE;
+ BOOL prefixGroups[4] = { FALSE };
+ uint64_t prefixLocation;
+ uint8_t byte = 0;
+
+ BOOL hasAdSize = FALSE;
+ BOOL hasOpSize = FALSE;
+
+ dbgprintf(insn, "readPrefixes()");
+
+ while (isPrefix) {
+ prefixLocation = insn->readerCursor;
+
+ if (consumeByte(insn, &byte))
+ return -1;
+
+ /*
+ * If the byte is a LOCK/REP/REPNE prefix and not a part of the opcode, then
+ * break and let it be disassembled as a normal "instruction".
+ */
+ if (insn->readerCursor - 1 == insn->startLocation
+ && (byte == 0xf0 || byte == 0xf2 || byte == 0xf3)) {
+ uint8_t nextByte;
+ if (byte == 0xf0)
+ break;
+ if (lookAtByte(insn, &nextByte))
+ return -1;
+ if (insn->mode == MODE_64BIT && (nextByte & 0xf0) == 0x40) {
+ if (consumeByte(insn, &nextByte))
+ return -1;
+ if (lookAtByte(insn, &nextByte))
+ return -1;
+ unconsumeByte(insn);
+ }
+ if (nextByte != 0x0f && nextByte != 0x90)
+ break;
+ }
+
+ switch (byte) {
+ case 0xf0: /* LOCK */
+ case 0xf2: /* REPNE/REPNZ */
+ case 0xf3: /* REP or REPE/REPZ */
+ if (prefixGroups[0])
+ dbgprintf(insn, "Redundant Group 1 prefix");
+ prefixGroups[0] = TRUE;
+ setPrefixPresent(insn, byte, prefixLocation);
+ break;
+ case 0x2e: /* CS segment override -OR- Branch not taken */
+ case 0x36: /* SS segment override -OR- Branch taken */
+ case 0x3e: /* DS segment override */
+ case 0x26: /* ES segment override */
+ case 0x64: /* FS segment override */
+ case 0x65: /* GS segment override */
+ switch (byte) {
+ case 0x2e:
+ insn->segmentOverride = SEG_OVERRIDE_CS;
+ break;
+ case 0x36:
+ insn->segmentOverride = SEG_OVERRIDE_SS;
+ break;
+ case 0x3e:
+ insn->segmentOverride = SEG_OVERRIDE_DS;
+ break;
+ case 0x26:
+ insn->segmentOverride = SEG_OVERRIDE_ES;
+ break;
+ case 0x64:
+ insn->segmentOverride = SEG_OVERRIDE_FS;
+ break;
+ case 0x65:
+ insn->segmentOverride = SEG_OVERRIDE_GS;
+ break;
+ default:
+ debug("Unhandled override");
+ return -1;
+ }
+ if (prefixGroups[1])
+ dbgprintf(insn, "Redundant Group 2 prefix");
+ prefixGroups[1] = TRUE;
+ setPrefixPresent(insn, byte, prefixLocation);
+ break;
+ case 0x66: /* Operand-size override */
+ if (prefixGroups[2])
+ dbgprintf(insn, "Redundant Group 3 prefix");
+ prefixGroups[2] = TRUE;
+ hasOpSize = TRUE;
+ setPrefixPresent(insn, byte, prefixLocation);
+ break;
+ case 0x67: /* Address-size override */
+ if (prefixGroups[3])
+ dbgprintf(insn, "Redundant Group 4 prefix");
+ prefixGroups[3] = TRUE;
+ hasAdSize = TRUE;
+ setPrefixPresent(insn, byte, prefixLocation);
+ break;
+ default: /* Not a prefix byte */
+ isPrefix = FALSE;
+ break;
+ }
+
+ if (isPrefix)
+ dbgprintf(insn, "Found prefix 0x%hhx", byte);
+ }
+
+ insn->vexSize = 0;
+
+ if (byte == 0xc4) {
+ uint8_t byte1;
+
+ if (lookAtByte(insn, &byte1)) {
+ dbgprintf(insn, "Couldn't read second byte of VEX");
+ return -1;
+ }
+
+ if (insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0) {
+ insn->vexSize = 3;
+ insn->necessaryPrefixLocation = insn->readerCursor - 1;
+ }
+ else {
+ unconsumeByte(insn);
+ insn->necessaryPrefixLocation = insn->readerCursor - 1;
+ }
+
+ if (insn->vexSize == 3) {
+ insn->vexPrefix[0] = byte;
+ consumeByte(insn, &insn->vexPrefix[1]);
+ consumeByte(insn, &insn->vexPrefix[2]);
+
+ /* We simulate the REX prefix for simplicity's sake */
+
+ if (insn->mode == MODE_64BIT) {
+ insn->rexPrefix = 0x40
+ | (wFromVEX3of3(insn->vexPrefix[2]) << 3)
+ | (rFromVEX2of3(insn->vexPrefix[1]) << 2)
+ | (xFromVEX2of3(insn->vexPrefix[1]) << 1)
+ | (bFromVEX2of3(insn->vexPrefix[1]) << 0);
+ }
+
+ switch (ppFromVEX3of3(insn->vexPrefix[2]))
+ {
+ default:
+ break;
+ case VEX_PREFIX_66:
+ hasOpSize = TRUE;
+ break;
+ }
+
+ dbgprintf(insn, "Found VEX prefix 0x%hhx 0x%hhx 0x%hhx", insn->vexPrefix[0], insn->vexPrefix[1], insn->vexPrefix[2]);
+ }
+ }
+ else if (byte == 0xc5) {
+ uint8_t byte1;
+
+ if (lookAtByte(insn, &byte1)) {
+ dbgprintf(insn, "Couldn't read second byte of VEX");
+ return -1;
+ }
+
+ if (insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0) {
+ insn->vexSize = 2;
+ }
+ else {
+ unconsumeByte(insn);
+ }
+
+ if (insn->vexSize == 2) {
+ insn->vexPrefix[0] = byte;
+ consumeByte(insn, &insn->vexPrefix[1]);
+
+ if (insn->mode == MODE_64BIT) {
+ insn->rexPrefix = 0x40
+ | (rFromVEX2of2(insn->vexPrefix[1]) << 2);
+ }
+
+ switch (ppFromVEX2of2(insn->vexPrefix[1]))
+ {
+ default:
+ break;
+ case VEX_PREFIX_66:
+ hasOpSize = TRUE;
+ break;
+ }
+
+ dbgprintf(insn, "Found VEX prefix 0x%hhx 0x%hhx", insn->vexPrefix[0], insn->vexPrefix[1]);
+ }
+ }
+ else {
+ if (insn->mode == MODE_64BIT) {
+ if ((byte & 0xf0) == 0x40) {
+ uint8_t opcodeByte;
+
+ if (lookAtByte(insn, &opcodeByte) || ((opcodeByte & 0xf0) == 0x40)) {
+ dbgprintf(insn, "Redundant REX prefix");
+ return -1;
+ }
+
+ insn->rexPrefix = byte;
+ insn->necessaryPrefixLocation = insn->readerCursor - 2;
+
+ dbgprintf(insn, "Found REX prefix 0x%hhx", byte);
+ } else {
+ unconsumeByte(insn);
+ insn->necessaryPrefixLocation = insn->readerCursor - 1;
+ }
+ } else {
+ unconsumeByte(insn);
+ insn->necessaryPrefixLocation = insn->readerCursor - 1;
+ }
+ }
+
+ if (insn->mode == MODE_16BIT) {
+ insn->registerSize = (hasOpSize ? 4 : 2);
+ insn->addressSize = (hasAdSize ? 4 : 2);
+ insn->displacementSize = (hasAdSize ? 4 : 2);
+ insn->immediateSize = (hasOpSize ? 4 : 2);
+ } else if (insn->mode == MODE_32BIT) {
+ insn->registerSize = (hasOpSize ? 2 : 4);
+ insn->addressSize = (hasAdSize ? 2 : 4);
+ insn->displacementSize = (hasAdSize ? 2 : 4);
+ insn->immediateSize = (hasOpSize ? 2 : 4);
+ } else if (insn->mode == MODE_64BIT) {
+ if (insn->rexPrefix && wFromREX(insn->rexPrefix)) {
+ insn->registerSize = 8;
+ insn->addressSize = (hasAdSize ? 4 : 8);
+ insn->displacementSize = 4;
+ insn->immediateSize = 4;
+ } else if (insn->rexPrefix) {
+ insn->registerSize = (hasOpSize ? 2 : 4);
+ insn->addressSize = (hasAdSize ? 4 : 8);
+ insn->displacementSize = (hasOpSize ? 2 : 4);
+ insn->immediateSize = (hasOpSize ? 2 : 4);
+ } else {
+ insn->registerSize = (hasOpSize ? 2 : 4);
+ insn->addressSize = (hasAdSize ? 4 : 8);
+ insn->displacementSize = (hasOpSize ? 2 : 4);
+ insn->immediateSize = (hasOpSize ? 2 : 4);
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * readOpcode - Reads the opcode (excepting the ModR/M byte in the case of
+ * extended or escape opcodes).
+ *
+ * @param insn - The instruction whose opcode is to be read.
+ * @return - 0 if the opcode could be read successfully; nonzero otherwise.
+ */
+static int readOpcode(struct InternalInstruction* insn) {
+ /* Determine the length of the primary opcode */
+
+ uint8_t current;
+
+ dbgprintf(insn, "readOpcode()");
+
+ insn->opcodeType = ONEBYTE;
+
+ if (insn->vexSize == 3)
+ {
+ switch (mmmmmFromVEX2of3(insn->vexPrefix[1]))
+ {
+ default:
+ dbgprintf(insn, "Unhandled m-mmmm field for instruction (0x%hhx)", mmmmmFromVEX2of3(insn->vexPrefix[1]));
+ return -1;
+ case 0:
+ break;
+ case VEX_LOB_0F:
+ insn->twoByteEscape = 0x0f;
+ insn->opcodeType = TWOBYTE;
+ return consumeByte(insn, &insn->opcode);
+ case VEX_LOB_0F38:
+ insn->twoByteEscape = 0x0f;
+ insn->threeByteEscape = 0x38;
+ insn->opcodeType = THREEBYTE_38;
+ return consumeByte(insn, &insn->opcode);
+ case VEX_LOB_0F3A:
+ insn->twoByteEscape = 0x0f;
+ insn->threeByteEscape = 0x3a;
+ insn->opcodeType = THREEBYTE_3A;
+ return consumeByte(insn, &insn->opcode);
+ }
+ }
+ else if (insn->vexSize == 2)
+ {
+ insn->twoByteEscape = 0x0f;
+ insn->opcodeType = TWOBYTE;
+ return consumeByte(insn, &insn->opcode);
+ }
+
+ if (consumeByte(insn, &current))
+ return -1;
+
+ if (current == 0x0f) {
+ dbgprintf(insn, "Found a two-byte escape prefix (0x%hhx)", current);
+
+ insn->twoByteEscape = current;
+
+ if (consumeByte(insn, &current))
+ return -1;
+
+ if (current == 0x38) {
+ dbgprintf(insn, "Found a three-byte escape prefix (0x%hhx)", current);
+
+ insn->threeByteEscape = current;
+
+ if (consumeByte(insn, &current))
+ return -1;
+
+ insn->opcodeType = THREEBYTE_38;
+ } else if (current == 0x3a) {
+ dbgprintf(insn, "Found a three-byte escape prefix (0x%hhx)", current);
+
+ insn->threeByteEscape = current;
+
+ if (consumeByte(insn, &current))
+ return -1;
+
+ insn->opcodeType = THREEBYTE_3A;
+ } else if (current == 0xa6) {
+ dbgprintf(insn, "Found a three-byte escape prefix (0x%hhx)", current);
+
+ insn->threeByteEscape = current;
+
+ if (consumeByte(insn, &current))
+ return -1;
+
+ insn->opcodeType = THREEBYTE_A6;
+ } else if (current == 0xa7) {
+ dbgprintf(insn, "Found a three-byte escape prefix (0x%hhx)", current);
+
+ insn->threeByteEscape = current;
+
+ if (consumeByte(insn, &current))
+ return -1;
+
+ insn->opcodeType = THREEBYTE_A7;
+ } else {
+ dbgprintf(insn, "Didn't find a three-byte escape prefix");
+
+ insn->opcodeType = TWOBYTE;
+ }
+ }
+
+ /*
+ * At this point we have consumed the full opcode.
+ * Anything we consume from here on must be unconsumed.
+ */
+
+ insn->opcode = current;
+
+ return 0;
+}
+
+static int readModRM(struct InternalInstruction* insn);
+
+/*
+ * getIDWithAttrMask - Determines the ID of an instruction, consuming
+ * the ModR/M byte as appropriate for extended and escape opcodes,
+ * and using a supplied attribute mask.
+ *
+ * @param instructionID - A pointer whose target is filled in with the ID of the
+ * instruction.
+ * @param insn - The instruction whose ID is to be determined.
+ * @param attrMask - The attribute mask to search.
+ * @return - 0 if the ModR/M could be read when needed or was not
+ * needed; nonzero otherwise.
+ */
+static int getIDWithAttrMask(uint16_t* instructionID,
+ struct InternalInstruction* insn,
+ uint8_t attrMask) {
+ BOOL hasModRMExtension;
+
+ uint8_t instructionClass;
+
+ instructionClass = contextForAttrs(attrMask);
+
+ hasModRMExtension = modRMRequired(insn->opcodeType,
+ instructionClass,
+ insn->opcode);
+
+ if (hasModRMExtension) {
+ if (readModRM(insn))
+ return -1;
+
+ *instructionID = decode(insn->opcodeType,
+ instructionClass,
+ insn->opcode,
+ insn->modRM);
+ } else {
+ *instructionID = decode(insn->opcodeType,
+ instructionClass,
+ insn->opcode,
+ 0);
+ }
+
+ return 0;
+}
+
+/*
+ * is16BitEquivalent - Determines whether two instruction names refer to
+ * equivalent instructions but one is 16-bit whereas the other is not.
+ *
+ * @param orig - The instruction that is not 16-bit
+ * @param equiv - The instruction that is 16-bit
+ */
+static BOOL is16BitEquivalent(const char* orig, const char* equiv) {
+ off_t i;
+
+ for (i = 0;; i++) {
+ if (orig[i] == '\0' && equiv[i] == '\0')
+ return TRUE;
+ if (orig[i] == '\0' || equiv[i] == '\0')
+ return FALSE;
+ if (orig[i] != equiv[i]) {
+ if ((orig[i] == 'Q' || orig[i] == 'L') && equiv[i] == 'W')
+ continue;
+ if ((orig[i] == '6' || orig[i] == '3') && equiv[i] == '1')
+ continue;
+ if ((orig[i] == '4' || orig[i] == '2') && equiv[i] == '6')
+ continue;
+ return FALSE;
+ }
+ }
+}
+
+/*
+ * getID - Determines the ID of an instruction, consuming the ModR/M byte as
+ * appropriate for extended and escape opcodes. Determines the attributes and
+ * context for the instruction before doing so.
+ *
+ * @param insn - The instruction whose ID is to be determined.
+ * @return - 0 if the ModR/M could be read when needed or was not needed;
+ * nonzero otherwise.
+ */
+static int getID(struct InternalInstruction* insn, const void *miiArg) {
+ uint8_t attrMask;
+ uint16_t instructionID;
+
+ dbgprintf(insn, "getID()");
+
+ attrMask = ATTR_NONE;
+
+ if (insn->mode == MODE_64BIT)
+ attrMask |= ATTR_64BIT;
+
+ if (insn->vexSize) {
+ attrMask |= ATTR_VEX;
+
+ if (insn->vexSize == 3) {
+ switch (ppFromVEX3of3(insn->vexPrefix[2])) {
+ case VEX_PREFIX_66:
+ attrMask |= ATTR_OPSIZE;
+ break;
+ case VEX_PREFIX_F3:
+ attrMask |= ATTR_XS;
+ break;
+ case VEX_PREFIX_F2:
+ attrMask |= ATTR_XD;
+ break;
+ }
+
+ if (lFromVEX3of3(insn->vexPrefix[2]))
+ attrMask |= ATTR_VEXL;
+ }
+ else if (insn->vexSize == 2) {
+ switch (ppFromVEX2of2(insn->vexPrefix[1])) {
+ case VEX_PREFIX_66:
+ attrMask |= ATTR_OPSIZE;
+ break;
+ case VEX_PREFIX_F3:
+ attrMask |= ATTR_XS;
+ break;
+ case VEX_PREFIX_F2:
+ attrMask |= ATTR_XD;
+ break;
+ }
+
+ if (lFromVEX2of2(insn->vexPrefix[1]))
+ attrMask |= ATTR_VEXL;
+ }
+ else {
+ return -1;
+ }
+ }
+ else {
+ if (isPrefixAtLocation(insn, 0x66, insn->necessaryPrefixLocation))
+ attrMask |= ATTR_OPSIZE;
+ else if (isPrefixAtLocation(insn, 0x67, insn->necessaryPrefixLocation))
+ attrMask |= ATTR_ADSIZE;
+ else if (isPrefixAtLocation(insn, 0xf3, insn->necessaryPrefixLocation))
+ attrMask |= ATTR_XS;
+ else if (isPrefixAtLocation(insn, 0xf2, insn->necessaryPrefixLocation))
+ attrMask |= ATTR_XD;
+ }
+
+ if (insn->rexPrefix & 0x08)
+ attrMask |= ATTR_REXW;
+
+ if (getIDWithAttrMask(&instructionID, insn, attrMask))
+ return -1;
+
+ /* The following clauses compensate for limitations of the tables. */
+
+ if ((attrMask & ATTR_VEXL) && (attrMask & ATTR_REXW) &&
+ !(attrMask & ATTR_OPSIZE)) {
+ /*
+ * Some VEX instructions ignore the L-bit, but use the W-bit. Normally L-bit
+ * has precedence since there are no L-bit with W-bit entries in the tables.
+ * So if the L-bit isn't significant we should use the W-bit instead.
+ * We only need to do this if the instruction doesn't specify OpSize since
+ * there is a VEX_L_W_OPSIZE table.
+ */
+
+ const struct InstructionSpecifier *spec;
+ uint16_t instructionIDWithWBit;
+ const struct InstructionSpecifier *specWithWBit;
+
+ spec = specifierForUID(instructionID);
+
+ if (getIDWithAttrMask(&instructionIDWithWBit,
+ insn,
+ (attrMask & (~ATTR_VEXL)) | ATTR_REXW)) {
+ insn->instructionID = instructionID;
+ insn->spec = spec;
+ return 0;
+ }
+
+ specWithWBit = specifierForUID(instructionIDWithWBit);
+
+ if (instructionID != instructionIDWithWBit) {
+ insn->instructionID = instructionIDWithWBit;
+ insn->spec = specWithWBit;
+ } else {
+ insn->instructionID = instructionID;
+ insn->spec = spec;
+ }
+ return 0;
+ }
+
+ if (insn->prefixPresent[0x66] && !(attrMask & ATTR_OPSIZE)) {
+ /*
+ * The instruction tables make no distinction between instructions that
+ * allow OpSize anywhere (i.e., 16-bit operations) and that need it in a
+ * particular spot (i.e., many MMX operations). In general we're
+ * conservative, but in the specific case where OpSize is present but not
+ * in the right place we check if there's a 16-bit operation.
+ */
+
+ const struct InstructionSpecifier *spec;
+ uint16_t instructionIDWithOpsize;
+ const char *specName, *specWithOpSizeName;
+
+ spec = specifierForUID(instructionID);
+
+ if (getIDWithAttrMask(&instructionIDWithOpsize,
+ insn,
+ attrMask | ATTR_OPSIZE)) {
+ /*
+ * ModRM required with OpSize but not present; give up and return version
+ * without OpSize set
+ */
+
+ insn->instructionID = instructionID;
+ insn->spec = spec;
+ return 0;
+ }
+
+ specName = x86DisassemblerGetInstrName(instructionID, miiArg);
+ specWithOpSizeName =
+ x86DisassemblerGetInstrName(instructionIDWithOpsize, miiArg);
+
+ if (is16BitEquivalent(specName, specWithOpSizeName)) {
+ insn->instructionID = instructionIDWithOpsize;
+ insn->spec = specifierForUID(instructionIDWithOpsize);
+ } else {
+ insn->instructionID = instructionID;
+ insn->spec = spec;
+ }
+ return 0;
+ }
+
+ if (insn->opcodeType == ONEBYTE && insn->opcode == 0x90 &&
+ insn->rexPrefix & 0x01) {
+ /*
+ * NOOP shouldn't decode as NOOP if REX.b is set. Instead
+ * it should decode as XCHG %r8, %eax.
+ */
+
+ const struct InstructionSpecifier *spec;
+ uint16_t instructionIDWithNewOpcode;
+ const struct InstructionSpecifier *specWithNewOpcode;
+
+ spec = specifierForUID(instructionID);
+
+ /* Borrow opcode from one of the other XCHGar opcodes */
+ insn->opcode = 0x91;
+
+ if (getIDWithAttrMask(&instructionIDWithNewOpcode,
+ insn,
+ attrMask)) {
+ insn->opcode = 0x90;
+
+ insn->instructionID = instructionID;
+ insn->spec = spec;
+ return 0;
+ }
+
+ specWithNewOpcode = specifierForUID(instructionIDWithNewOpcode);
+
+ /* Change back */
+ insn->opcode = 0x90;
+
+ insn->instructionID = instructionIDWithNewOpcode;
+ insn->spec = specWithNewOpcode;
+
+ return 0;
+ }
+
+ insn->instructionID = instructionID;
+ insn->spec = specifierForUID(insn->instructionID);
+
+ return 0;
+}
+
+/*
+ * readSIB - Consumes the SIB byte to determine addressing information for an
+ * instruction.
+ *
+ * @param insn - The instruction whose SIB byte is to be read.
+ * @return - 0 if the SIB byte was successfully read; nonzero otherwise.
+ */
+static int readSIB(struct InternalInstruction* insn) {
+ SIBIndex sibIndexBase = 0;
+ SIBBase sibBaseBase = 0;
+ uint8_t index, base;
+
+ dbgprintf(insn, "readSIB()");
+
+ if (insn->consumedSIB)
+ return 0;
+
+ insn->consumedSIB = TRUE;
+
+ switch (insn->addressSize) {
+ case 2:
+ dbgprintf(insn, "SIB-based addressing doesn't work in 16-bit mode");
+ return -1;
+ break;
+ case 4:
+ sibIndexBase = SIB_INDEX_EAX;
+ sibBaseBase = SIB_BASE_EAX;
+ break;
+ case 8:
+ sibIndexBase = SIB_INDEX_RAX;
+ sibBaseBase = SIB_BASE_RAX;
+ break;
+ }
+
+ if (consumeByte(insn, &insn->sib))
+ return -1;
+
+ index = indexFromSIB(insn->sib) | (xFromREX(insn->rexPrefix) << 3);
+
+ switch (index) {
+ case 0x4:
+ insn->sibIndex = SIB_INDEX_NONE;
+ break;
+ default:
+ insn->sibIndex = (SIBIndex)(sibIndexBase + index);
+ if (insn->sibIndex == SIB_INDEX_sib ||
+ insn->sibIndex == SIB_INDEX_sib64)
+ insn->sibIndex = SIB_INDEX_NONE;
+ break;
+ }
+
+ switch (scaleFromSIB(insn->sib)) {
+ case 0:
+ insn->sibScale = 1;
+ break;
+ case 1:
+ insn->sibScale = 2;
+ break;
+ case 2:
+ insn->sibScale = 4;
+ break;
+ case 3:
+ insn->sibScale = 8;
+ break;
+ }
+
+ base = baseFromSIB(insn->sib) | (bFromREX(insn->rexPrefix) << 3);
+
+ switch (base) {
+ case 0x5:
+ switch (modFromModRM(insn->modRM)) {
+ case 0x0:
+ insn->eaDisplacement = EA_DISP_32;
+ insn->sibBase = SIB_BASE_NONE;
+ break;
+ case 0x1:
+ insn->eaDisplacement = EA_DISP_8;
+ insn->sibBase = (insn->addressSize == 4 ?
+ SIB_BASE_EBP : SIB_BASE_RBP);
+ break;
+ case 0x2:
+ insn->eaDisplacement = EA_DISP_32;
+ insn->sibBase = (insn->addressSize == 4 ?
+ SIB_BASE_EBP : SIB_BASE_RBP);
+ break;
+ case 0x3:
+ debug("Cannot have Mod = 0b11 and a SIB byte");
+ return -1;
+ }
+ break;
+ default:
+ insn->sibBase = (SIBBase)(sibBaseBase + base);
+ break;
+ }
+
+ return 0;
+}
+
+/*
+ * readDisplacement - Consumes the displacement of an instruction.
+ *
+ * @param insn - The instruction whose displacement is to be read.
+ * @return - 0 if the displacement byte was successfully read; nonzero
+ * otherwise.
+ */
+static int readDisplacement(struct InternalInstruction* insn) {
+ int8_t d8;
+ int16_t d16;
+ int32_t d32;
+
+ dbgprintf(insn, "readDisplacement()");
+
+ if (insn->consumedDisplacement)
+ return 0;
+
+ insn->consumedDisplacement = TRUE;
+ insn->displacementOffset = insn->readerCursor - insn->startLocation;
+
+ switch (insn->eaDisplacement) {
+ case EA_DISP_NONE:
+ insn->consumedDisplacement = FALSE;
+ break;
+ case EA_DISP_8:
+ if (consumeInt8(insn, &d8))
+ return -1;
+ insn->displacement = d8;
+ break;
+ case EA_DISP_16:
+ if (consumeInt16(insn, &d16))
+ return -1;
+ insn->displacement = d16;
+ break;
+ case EA_DISP_32:
+ if (consumeInt32(insn, &d32))
+ return -1;
+ insn->displacement = d32;
+ break;
+ }
+
+ insn->consumedDisplacement = TRUE;
+ return 0;
+}
+
+/*
+ * readModRM - Consumes all addressing information (ModR/M byte, SIB byte, and
+ * displacement) for an instruction and interprets it.
+ *
+ * @param insn - The instruction whose addressing information is to be read.
+ * @return - 0 if the information was successfully read; nonzero otherwise.
+ */
+static int readModRM(struct InternalInstruction* insn) {
+ uint8_t mod, rm, reg;
+
+ dbgprintf(insn, "readModRM()");
+
+ if (insn->consumedModRM)
+ return 0;
+
+ if (consumeByte(insn, &insn->modRM))
+ return -1;
+ insn->consumedModRM = TRUE;
+
+ mod = modFromModRM(insn->modRM);
+ rm = rmFromModRM(insn->modRM);
+ reg = regFromModRM(insn->modRM);
+
+ /*
+ * This goes by insn->registerSize to pick the correct register, which messes
+ * up if we're using (say) XMM or 8-bit register operands. That gets fixed in
+ * fixupReg().
+ */
+ switch (insn->registerSize) {
+ case 2:
+ insn->regBase = MODRM_REG_AX;
+ insn->eaRegBase = EA_REG_AX;
+ break;
+ case 4:
+ insn->regBase = MODRM_REG_EAX;
+ insn->eaRegBase = EA_REG_EAX;
+ break;
+ case 8:
+ insn->regBase = MODRM_REG_RAX;
+ insn->eaRegBase = EA_REG_RAX;
+ break;
+ }
+
+ reg |= rFromREX(insn->rexPrefix) << 3;
+ rm |= bFromREX(insn->rexPrefix) << 3;
+
+ insn->reg = (Reg)(insn->regBase + reg);
+
+ switch (insn->addressSize) {
+ case 2:
+ insn->eaBaseBase = EA_BASE_BX_SI;
+
+ switch (mod) {
+ case 0x0:
+ if (rm == 0x6) {
+ insn->eaBase = EA_BASE_NONE;
+ insn->eaDisplacement = EA_DISP_16;
+ if (readDisplacement(insn))
+ return -1;
+ } else {
+ insn->eaBase = (EABase)(insn->eaBaseBase + rm);
+ insn->eaDisplacement = EA_DISP_NONE;
+ }
+ break;
+ case 0x1:
+ insn->eaBase = (EABase)(insn->eaBaseBase + rm);
+ insn->eaDisplacement = EA_DISP_8;
+ if (readDisplacement(insn))
+ return -1;
+ break;
+ case 0x2:
+ insn->eaBase = (EABase)(insn->eaBaseBase + rm);
+ insn->eaDisplacement = EA_DISP_16;
+ if (readDisplacement(insn))
+ return -1;
+ break;
+ case 0x3:
+ insn->eaBase = (EABase)(insn->eaRegBase + rm);
+ if (readDisplacement(insn))
+ return -1;
+ break;
+ }
+ break;
+ case 4:
+ case 8:
+ insn->eaBaseBase = (insn->addressSize == 4 ? EA_BASE_EAX : EA_BASE_RAX);
+
+ switch (mod) {
+ case 0x0:
+ insn->eaDisplacement = EA_DISP_NONE; /* readSIB may override this */
+ switch (rm) {
+ case 0x4:
+ case 0xc: /* in case REXW.b is set */
+ insn->eaBase = (insn->addressSize == 4 ?
+ EA_BASE_sib : EA_BASE_sib64);
+ readSIB(insn);
+ if (readDisplacement(insn))
+ return -1;
+ break;
+ case 0x5:
+ insn->eaBase = EA_BASE_NONE;
+ insn->eaDisplacement = EA_DISP_32;
+ if (readDisplacement(insn))
+ return -1;
+ break;
+ default:
+ insn->eaBase = (EABase)(insn->eaBaseBase + rm);
+ break;
+ }
+ break;
+ case 0x1:
+ case 0x2:
+ insn->eaDisplacement = (mod == 0x1 ? EA_DISP_8 : EA_DISP_32);
+ switch (rm) {
+ case 0x4:
+ case 0xc: /* in case REXW.b is set */
+ insn->eaBase = EA_BASE_sib;
+ readSIB(insn);
+ if (readDisplacement(insn))
+ return -1;
+ break;
+ default:
+ insn->eaBase = (EABase)(insn->eaBaseBase + rm);
+ if (readDisplacement(insn))
+ return -1;
+ break;
+ }
+ break;
+ case 0x3:
+ insn->eaDisplacement = EA_DISP_NONE;
+ insn->eaBase = (EABase)(insn->eaRegBase + rm);
+ break;
+ }
+ break;
+ } /* switch (insn->addressSize) */
+
+ return 0;
+}
+
+#define GENERIC_FIXUP_FUNC(name, base, prefix) \
+ static uint8_t name(struct InternalInstruction *insn, \
+ OperandType type, \
+ uint8_t index, \
+ uint8_t *valid) { \
+ *valid = 1; \
+ switch (type) { \
+ default: \
+ debug("Unhandled register type"); \
+ *valid = 0; \
+ return 0; \
+ case TYPE_Rv: \
+ return base + index; \
+ case TYPE_R8: \
+ if (insn->rexPrefix && \
+ index >= 4 && index <= 7) { \
+ return prefix##_SPL + (index - 4); \
+ } else { \
+ return prefix##_AL + index; \
+ } \
+ case TYPE_R16: \
+ return prefix##_AX + index; \
+ case TYPE_R32: \
+ return prefix##_EAX + index; \
+ case TYPE_R64: \
+ return prefix##_RAX + index; \
+ case TYPE_XMM256: \
+ return prefix##_YMM0 + index; \
+ case TYPE_XMM128: \
+ case TYPE_XMM64: \
+ case TYPE_XMM32: \
+ case TYPE_XMM: \
+ return prefix##_XMM0 + index; \
+ case TYPE_MM64: \
+ case TYPE_MM32: \
+ case TYPE_MM: \
+ if (index > 7) \
+ *valid = 0; \
+ return prefix##_MM0 + index; \
+ case TYPE_SEGMENTREG: \
+ if (index > 5) \
+ *valid = 0; \
+ return prefix##_ES + index; \
+ case TYPE_DEBUGREG: \
+ if (index > 7) \
+ *valid = 0; \
+ return prefix##_DR0 + index; \
+ case TYPE_CONTROLREG: \
+ if (index > 8) \
+ *valid = 0; \
+ return prefix##_CR0 + index; \
+ } \
+ }
+
+/*
+ * fixup*Value - Consults an operand type to determine the meaning of the
+ * reg or R/M field. If the operand is an XMM operand, for example, an
+ * operand would be XMM0 instead of AX, which readModRM() would otherwise
+ * misinterpret it as.
+ *
+ * @param insn - The instruction containing the operand.
+ * @param type - The operand type.
+ * @param index - The existing value of the field as reported by readModRM().
+ * @param valid - The address of a uint8_t. The target is set to 1 if the
+ * field is valid for the register class; 0 if not.
+ * @return - The proper value.
+ */
+GENERIC_FIXUP_FUNC(fixupRegValue, insn->regBase, MODRM_REG)
+GENERIC_FIXUP_FUNC(fixupRMValue, insn->eaRegBase, EA_REG)
+
+/*
+ * fixupReg - Consults an operand specifier to determine which of the
+ * fixup*Value functions to use in correcting readModRM()'ss interpretation.
+ *
+ * @param insn - See fixup*Value().
+ * @param op - The operand specifier.
+ * @return - 0 if fixup was successful; -1 if the register returned was
+ * invalid for its class.
+ */
+static int fixupReg(struct InternalInstruction *insn,
+ const struct OperandSpecifier *op) {
+ uint8_t valid;
+
+ dbgprintf(insn, "fixupReg()");
+
+ switch ((OperandEncoding)op->encoding) {
+ default:
+ debug("Expected a REG or R/M encoding in fixupReg");
+ return -1;
+ case ENCODING_VVVV:
+ insn->vvvv = (Reg)fixupRegValue(insn,
+ (OperandType)op->type,
+ insn->vvvv,
+ &valid);
+ if (!valid)
+ return -1;
+ break;
+ case ENCODING_REG:
+ insn->reg = (Reg)fixupRegValue(insn,
+ (OperandType)op->type,
+ insn->reg - insn->regBase,
+ &valid);
+ if (!valid)
+ return -1;
+ break;
+ case ENCODING_RM:
+ if (insn->eaBase >= insn->eaRegBase) {
+ insn->eaBase = (EABase)fixupRMValue(insn,
+ (OperandType)op->type,
+ insn->eaBase - insn->eaRegBase,
+ &valid);
+ if (!valid)
+ return -1;
+ }
+ break;
+ }
+
+ return 0;
+}
+
+/*
+ * readOpcodeModifier - Reads an operand from the opcode field of an
+ * instruction. Handles AddRegFrm instructions.
+ *
+ * @param insn - The instruction whose opcode field is to be read.
+ * @param inModRM - Indicates that the opcode field is to be read from the
+ * ModR/M extension; useful for escape opcodes
+ * @return - 0 on success; nonzero otherwise.
+ */
+static int readOpcodeModifier(struct InternalInstruction* insn) {
+ dbgprintf(insn, "readOpcodeModifier()");
+
+ if (insn->consumedOpcodeModifier)
+ return 0;
+
+ insn->consumedOpcodeModifier = TRUE;
+
+ switch (insn->spec->modifierType) {
+ default:
+ debug("Unknown modifier type.");
+ return -1;
+ case MODIFIER_NONE:
+ debug("No modifier but an operand expects one.");
+ return -1;
+ case MODIFIER_OPCODE:
+ insn->opcodeModifier = insn->opcode - insn->spec->modifierBase;
+ return 0;
+ case MODIFIER_MODRM:
+ insn->opcodeModifier = insn->modRM - insn->spec->modifierBase;
+ return 0;
+ }
+}
+
+/*
+ * readOpcodeRegister - Reads an operand from the opcode field of an
+ * instruction and interprets it appropriately given the operand width.
+ * Handles AddRegFrm instructions.
+ *
+ * @param insn - See readOpcodeModifier().
+ * @param size - The width (in bytes) of the register being specified.
+ * 1 means AL and friends, 2 means AX, 4 means EAX, and 8 means
+ * RAX.
+ * @return - 0 on success; nonzero otherwise.
+ */
+static int readOpcodeRegister(struct InternalInstruction* insn, uint8_t size) {
+ dbgprintf(insn, "readOpcodeRegister()");
+
+ if (readOpcodeModifier(insn))
+ return -1;
+
+ if (size == 0)
+ size = insn->registerSize;
+
+ switch (size) {
+ case 1:
+ insn->opcodeRegister = (Reg)(MODRM_REG_AL + ((bFromREX(insn->rexPrefix) << 3)
+ | insn->opcodeModifier));
+ if (insn->rexPrefix &&
+ insn->opcodeRegister >= MODRM_REG_AL + 0x4 &&
+ insn->opcodeRegister < MODRM_REG_AL + 0x8) {
+ insn->opcodeRegister = (Reg)(MODRM_REG_SPL
+ + (insn->opcodeRegister - MODRM_REG_AL - 4));
+ }
+
+ break;
+ case 2:
+ insn->opcodeRegister = (Reg)(MODRM_REG_AX
+ + ((bFromREX(insn->rexPrefix) << 3)
+ | insn->opcodeModifier));
+ break;
+ case 4:
+ insn->opcodeRegister = (Reg)(MODRM_REG_EAX
+ + ((bFromREX(insn->rexPrefix) << 3)
+ | insn->opcodeModifier));
+ break;
+ case 8:
+ insn->opcodeRegister = (Reg)(MODRM_REG_RAX
+ + ((bFromREX(insn->rexPrefix) << 3)
+ | insn->opcodeModifier));
+ break;
+ }
+
+ return 0;
+}
+
+/*
+ * readImmediate - Consumes an immediate operand from an instruction, given the
+ * desired operand size.
+ *
+ * @param insn - The instruction whose operand is to be read.
+ * @param size - The width (in bytes) of the operand.
+ * @return - 0 if the immediate was successfully consumed; nonzero
+ * otherwise.
+ */
+static int readImmediate(struct InternalInstruction* insn, uint8_t size) {
+ uint8_t imm8;
+ uint16_t imm16;
+ uint32_t imm32;
+ uint64_t imm64;
+
+ dbgprintf(insn, "readImmediate()");
+
+ if (insn->numImmediatesConsumed == 2) {
+ debug("Already consumed two immediates");
+ return -1;
+ }
+
+ if (size == 0)
+ size = insn->immediateSize;
+ else
+ insn->immediateSize = size;
+ insn->immediateOffset = insn->readerCursor - insn->startLocation;
+
+ switch (size) {
+ case 1:
+ if (consumeByte(insn, &imm8))
+ return -1;
+ insn->immediates[insn->numImmediatesConsumed] = imm8;
+ break;
+ case 2:
+ if (consumeUInt16(insn, &imm16))
+ return -1;
+ insn->immediates[insn->numImmediatesConsumed] = imm16;
+ break;
+ case 4:
+ if (consumeUInt32(insn, &imm32))
+ return -1;
+ insn->immediates[insn->numImmediatesConsumed] = imm32;
+ break;
+ case 8:
+ if (consumeUInt64(insn, &imm64))
+ return -1;
+ insn->immediates[insn->numImmediatesConsumed] = imm64;
+ break;
+ }
+
+ insn->numImmediatesConsumed++;
+
+ return 0;
+}
+
+/*
+ * readVVVV - Consumes vvvv from an instruction if it has a VEX prefix.
+ *
+ * @param insn - The instruction whose operand is to be read.
+ * @return - 0 if the vvvv was successfully consumed; nonzero
+ * otherwise.
+ */
+static int readVVVV(struct InternalInstruction* insn) {
+ dbgprintf(insn, "readVVVV()");
+
+ if (insn->vexSize == 3)
+ insn->vvvv = vvvvFromVEX3of3(insn->vexPrefix[2]);
+ else if (insn->vexSize == 2)
+ insn->vvvv = vvvvFromVEX2of2(insn->vexPrefix[1]);
+ else
+ return -1;
+
+ if (insn->mode != MODE_64BIT)
+ insn->vvvv &= 0x7;
+
+ return 0;
+}
+
+/*
+ * readOperands - Consults the specifier for an instruction and consumes all
+ * operands for that instruction, interpreting them as it goes.
+ *
+ * @param insn - The instruction whose operands are to be read and interpreted.
+ * @return - 0 if all operands could be read; nonzero otherwise.
+ */
+static int readOperands(struct InternalInstruction* insn) {
+ int index;
+ int hasVVVV, needVVVV;
+ int sawRegImm = 0;
+
+ dbgprintf(insn, "readOperands()");
+
+ /* If non-zero vvvv specified, need to make sure one of the operands
+ uses it. */
+ hasVVVV = !readVVVV(insn);
+ needVVVV = hasVVVV && (insn->vvvv != 0);
+
+ for (index = 0; index < X86_MAX_OPERANDS; ++index) {
+ switch (x86OperandSets[insn->spec->operands][index].encoding) {
+ case ENCODING_NONE:
+ break;
+ case ENCODING_REG:
+ case ENCODING_RM:
+ if (readModRM(insn))
+ return -1;
+ if (fixupReg(insn, &x86OperandSets[insn->spec->operands][index]))
+ return -1;
+ break;
+ case ENCODING_CB:
+ case ENCODING_CW:
+ case ENCODING_CD:
+ case ENCODING_CP:
+ case ENCODING_CO:
+ case ENCODING_CT:
+ dbgprintf(insn, "We currently don't hande code-offset encodings");
+ return -1;
+ case ENCODING_IB:
+ if (sawRegImm) {
+ /* Saw a register immediate so don't read again and instead split the
+ previous immediate. FIXME: This is a hack. */
+ insn->immediates[insn->numImmediatesConsumed] =
+ insn->immediates[insn->numImmediatesConsumed - 1] & 0xf;
+ ++insn->numImmediatesConsumed;
+ break;
+ }
+ if (readImmediate(insn, 1))
+ return -1;
+ if (x86OperandSets[insn->spec->operands][index].type == TYPE_IMM3 &&
+ insn->immediates[insn->numImmediatesConsumed - 1] > 7)
+ return -1;
+ if (x86OperandSets[insn->spec->operands][index].type == TYPE_IMM5 &&
+ insn->immediates[insn->numImmediatesConsumed - 1] > 31)
+ return -1;
+ if (x86OperandSets[insn->spec->operands][index].type == TYPE_XMM128 ||
+ x86OperandSets[insn->spec->operands][index].type == TYPE_XMM256)
+ sawRegImm = 1;
+ break;
+ case ENCODING_IW:
+ if (readImmediate(insn, 2))
+ return -1;
+ break;
+ case ENCODING_ID:
+ if (readImmediate(insn, 4))
+ return -1;
+ break;
+ case ENCODING_IO:
+ if (readImmediate(insn, 8))
+ return -1;
+ break;
+ case ENCODING_Iv:
+ if (readImmediate(insn, insn->immediateSize))
+ return -1;
+ break;
+ case ENCODING_Ia:
+ if (readImmediate(insn, insn->addressSize))
+ return -1;
+ break;
+ case ENCODING_RB:
+ if (readOpcodeRegister(insn, 1))
+ return -1;
+ break;
+ case ENCODING_RW:
+ if (readOpcodeRegister(insn, 2))
+ return -1;
+ break;
+ case ENCODING_RD:
+ if (readOpcodeRegister(insn, 4))
+ return -1;
+ break;
+ case ENCODING_RO:
+ if (readOpcodeRegister(insn, 8))
+ return -1;
+ break;
+ case ENCODING_Rv:
+ if (readOpcodeRegister(insn, 0))
+ return -1;
+ break;
+ case ENCODING_I:
+ if (readOpcodeModifier(insn))
+ return -1;
+ break;
+ case ENCODING_VVVV:
+ needVVVV = 0; /* Mark that we have found a VVVV operand. */
+ if (!hasVVVV)
+ return -1;
+ if (fixupReg(insn, &x86OperandSets[insn->spec->operands][index]))
+ return -1;
+ break;
+ case ENCODING_DUP:
+ break;
+ default:
+ dbgprintf(insn, "Encountered an operand with an unknown encoding.");
+ return -1;
+ }
+ }
+
+ /* If we didn't find ENCODING_VVVV operand, but non-zero vvvv present, fail */
+ if (needVVVV) return -1;
+
+ return 0;
+}
+
+/*
+ * decodeInstruction - Reads and interprets a full instruction provided by the
+ * user.
+ *
+ * @param insn - A pointer to the instruction to be populated. Must be
+ * pre-allocated.
+ * @param reader - The function to be used to read the instruction's bytes.
+ * @param readerArg - A generic argument to be passed to the reader to store
+ * any internal state.
+ * @param logger - If non-NULL, the function to be used to write log messages
+ * and warnings.
+ * @param loggerArg - A generic argument to be passed to the logger to store
+ * any internal state.
+ * @param startLoc - The address (in the reader's address space) of the first
+ * byte in the instruction.
+ * @param mode - The mode (real mode, IA-32e, or IA-32e in 64-bit mode) to
+ * decode the instruction in.
+ * @return - 0 if the instruction's memory could be read; nonzero if
+ * not.
+ */
+int decodeInstruction(struct InternalInstruction* insn,
+ byteReader_t reader,
+ const void* readerArg,
+ dlog_t logger,
+ void* loggerArg,
+ const void* miiArg,
+ uint64_t startLoc,
+ DisassemblerMode mode) {
+ memset(insn, 0, sizeof(struct InternalInstruction));
+
+ insn->reader = reader;
+ insn->readerArg = readerArg;
+ insn->dlog = logger;
+ insn->dlogArg = loggerArg;
+ insn->startLocation = startLoc;
+ insn->readerCursor = startLoc;
+ insn->mode = mode;
+ insn->numImmediatesConsumed = 0;
+
+ if (readPrefixes(insn) ||
+ readOpcode(insn) ||
+ getID(insn, miiArg) ||
+ insn->instructionID == 0 ||
+ readOperands(insn))
+ return -1;
+
+ insn->operands = &x86OperandSets[insn->spec->operands][0];
+
+ insn->length = insn->readerCursor - insn->startLocation;
+
+ dbgprintf(insn, "Read from 0x%llx to 0x%llx: length %zu",
+ startLoc, insn->readerCursor, insn->length);
+
+ if (insn->length > 15)
+ dbgprintf(insn, "Instruction exceeds 15-byte limit");
+
+ return 0;
+}
diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
new file mode 100644
index 0000000..407ead3
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
@@ -0,0 +1,588 @@
+/*===-- X86DisassemblerDecoderInternal.h - Disassembler decoder ---*- C -*-===*
+ *
+ * The LLVM Compiler Infrastructure
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *
+ *===----------------------------------------------------------------------===*
+ *
+ * This file is part of the X86 Disassembler.
+ * It contains the public interface of the instruction decoder.
+ * Documentation for the disassembler can be found in X86Disassembler.h.
+ *
+ *===----------------------------------------------------------------------===*/
+
+#ifndef X86DISASSEMBLERDECODER_H
+#define X86DISASSEMBLERDECODER_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define INSTRUCTION_SPECIFIER_FIELDS \
+ uint16_t operands;
+
+#define INSTRUCTION_IDS \
+ uint16_t instructionIDs;
+
+#include "X86DisassemblerDecoderCommon.h"
+
+#undef INSTRUCTION_SPECIFIER_FIELDS
+#undef INSTRUCTION_IDS
+
+/*
+ * Accessor functions for various fields of an Intel instruction
+ */
+#define modFromModRM(modRM) (((modRM) & 0xc0) >> 6)
+#define regFromModRM(modRM) (((modRM) & 0x38) >> 3)
+#define rmFromModRM(modRM) ((modRM) & 0x7)
+#define scaleFromSIB(sib) (((sib) & 0xc0) >> 6)
+#define indexFromSIB(sib) (((sib) & 0x38) >> 3)
+#define baseFromSIB(sib) ((sib) & 0x7)
+#define wFromREX(rex) (((rex) & 0x8) >> 3)
+#define rFromREX(rex) (((rex) & 0x4) >> 2)
+#define xFromREX(rex) (((rex) & 0x2) >> 1)
+#define bFromREX(rex) ((rex) & 0x1)
+
+#define rFromVEX2of3(vex) (((~(vex)) & 0x80) >> 7)
+#define xFromVEX2of3(vex) (((~(vex)) & 0x40) >> 6)
+#define bFromVEX2of3(vex) (((~(vex)) & 0x20) >> 5)
+#define mmmmmFromVEX2of3(vex) ((vex) & 0x1f)
+#define wFromVEX3of3(vex) (((vex) & 0x80) >> 7)
+#define vvvvFromVEX3of3(vex) (((~(vex)) & 0x78) >> 3)
+#define lFromVEX3of3(vex) (((vex) & 0x4) >> 2)
+#define ppFromVEX3of3(vex) ((vex) & 0x3)
+
+#define rFromVEX2of2(vex) (((~(vex)) & 0x80) >> 7)
+#define vvvvFromVEX2of2(vex) (((~(vex)) & 0x78) >> 3)
+#define lFromVEX2of2(vex) (((vex) & 0x4) >> 2)
+#define ppFromVEX2of2(vex) ((vex) & 0x3)
+
+/*
+ * These enums represent Intel registers for use by the decoder.
+ */
+
+#define REGS_8BIT \
+ ENTRY(AL) \
+ ENTRY(CL) \
+ ENTRY(DL) \
+ ENTRY(BL) \
+ ENTRY(AH) \
+ ENTRY(CH) \
+ ENTRY(DH) \
+ ENTRY(BH) \
+ ENTRY(R8B) \
+ ENTRY(R9B) \
+ ENTRY(R10B) \
+ ENTRY(R11B) \
+ ENTRY(R12B) \
+ ENTRY(R13B) \
+ ENTRY(R14B) \
+ ENTRY(R15B) \
+ ENTRY(SPL) \
+ ENTRY(BPL) \
+ ENTRY(SIL) \
+ ENTRY(DIL)
+
+#define EA_BASES_16BIT \
+ ENTRY(BX_SI) \
+ ENTRY(BX_DI) \
+ ENTRY(BP_SI) \
+ ENTRY(BP_DI) \
+ ENTRY(SI) \
+ ENTRY(DI) \
+ ENTRY(BP) \
+ ENTRY(BX) \
+ ENTRY(R8W) \
+ ENTRY(R9W) \
+ ENTRY(R10W) \
+ ENTRY(R11W) \
+ ENTRY(R12W) \
+ ENTRY(R13W) \
+ ENTRY(R14W) \
+ ENTRY(R15W)
+
+#define REGS_16BIT \
+ ENTRY(AX) \
+ ENTRY(CX) \
+ ENTRY(DX) \
+ ENTRY(BX) \
+ ENTRY(SP) \
+ ENTRY(BP) \
+ ENTRY(SI) \
+ ENTRY(DI) \
+ ENTRY(R8W) \
+ ENTRY(R9W) \
+ ENTRY(R10W) \
+ ENTRY(R11W) \
+ ENTRY(R12W) \
+ ENTRY(R13W) \
+ ENTRY(R14W) \
+ ENTRY(R15W)
+
+#define EA_BASES_32BIT \
+ ENTRY(EAX) \
+ ENTRY(ECX) \
+ ENTRY(EDX) \
+ ENTRY(EBX) \
+ ENTRY(sib) \
+ ENTRY(EBP) \
+ ENTRY(ESI) \
+ ENTRY(EDI) \
+ ENTRY(R8D) \
+ ENTRY(R9D) \
+ ENTRY(R10D) \
+ ENTRY(R11D) \
+ ENTRY(R12D) \
+ ENTRY(R13D) \
+ ENTRY(R14D) \
+ ENTRY(R15D)
+
+#define REGS_32BIT \
+ ENTRY(EAX) \
+ ENTRY(ECX) \
+ ENTRY(EDX) \
+ ENTRY(EBX) \
+ ENTRY(ESP) \
+ ENTRY(EBP) \
+ ENTRY(ESI) \
+ ENTRY(EDI) \
+ ENTRY(R8D) \
+ ENTRY(R9D) \
+ ENTRY(R10D) \
+ ENTRY(R11D) \
+ ENTRY(R12D) \
+ ENTRY(R13D) \
+ ENTRY(R14D) \
+ ENTRY(R15D)
+
+#define EA_BASES_64BIT \
+ ENTRY(RAX) \
+ ENTRY(RCX) \
+ ENTRY(RDX) \
+ ENTRY(RBX) \
+ ENTRY(sib64) \
+ ENTRY(RBP) \
+ ENTRY(RSI) \
+ ENTRY(RDI) \
+ ENTRY(R8) \
+ ENTRY(R9) \
+ ENTRY(R10) \
+ ENTRY(R11) \
+ ENTRY(R12) \
+ ENTRY(R13) \
+ ENTRY(R14) \
+ ENTRY(R15)
+
+#define REGS_64BIT \
+ ENTRY(RAX) \
+ ENTRY(RCX) \
+ ENTRY(RDX) \
+ ENTRY(RBX) \
+ ENTRY(RSP) \
+ ENTRY(RBP) \
+ ENTRY(RSI) \
+ ENTRY(RDI) \
+ ENTRY(R8) \
+ ENTRY(R9) \
+ ENTRY(R10) \
+ ENTRY(R11) \
+ ENTRY(R12) \
+ ENTRY(R13) \
+ ENTRY(R14) \
+ ENTRY(R15)
+
+#define REGS_MMX \
+ ENTRY(MM0) \
+ ENTRY(MM1) \
+ ENTRY(MM2) \
+ ENTRY(MM3) \
+ ENTRY(MM4) \
+ ENTRY(MM5) \
+ ENTRY(MM6) \
+ ENTRY(MM7)
+
+#define REGS_XMM \
+ ENTRY(XMM0) \
+ ENTRY(XMM1) \
+ ENTRY(XMM2) \
+ ENTRY(XMM3) \
+ ENTRY(XMM4) \
+ ENTRY(XMM5) \
+ ENTRY(XMM6) \
+ ENTRY(XMM7) \
+ ENTRY(XMM8) \
+ ENTRY(XMM9) \
+ ENTRY(XMM10) \
+ ENTRY(XMM11) \
+ ENTRY(XMM12) \
+ ENTRY(XMM13) \
+ ENTRY(XMM14) \
+ ENTRY(XMM15)
+
+#define REGS_YMM \
+ ENTRY(YMM0) \
+ ENTRY(YMM1) \
+ ENTRY(YMM2) \
+ ENTRY(YMM3) \
+ ENTRY(YMM4) \
+ ENTRY(YMM5) \
+ ENTRY(YMM6) \
+ ENTRY(YMM7) \
+ ENTRY(YMM8) \
+ ENTRY(YMM9) \
+ ENTRY(YMM10) \
+ ENTRY(YMM11) \
+ ENTRY(YMM12) \
+ ENTRY(YMM13) \
+ ENTRY(YMM14) \
+ ENTRY(YMM15)
+
+#define REGS_SEGMENT \
+ ENTRY(ES) \
+ ENTRY(CS) \
+ ENTRY(SS) \
+ ENTRY(DS) \
+ ENTRY(FS) \
+ ENTRY(GS)
+
+#define REGS_DEBUG \
+ ENTRY(DR0) \
+ ENTRY(DR1) \
+ ENTRY(DR2) \
+ ENTRY(DR3) \
+ ENTRY(DR4) \
+ ENTRY(DR5) \
+ ENTRY(DR6) \
+ ENTRY(DR7)
+
+#define REGS_CONTROL \
+ ENTRY(CR0) \
+ ENTRY(CR1) \
+ ENTRY(CR2) \
+ ENTRY(CR3) \
+ ENTRY(CR4) \
+ ENTRY(CR5) \
+ ENTRY(CR6) \
+ ENTRY(CR7) \
+ ENTRY(CR8)
+
+#define ALL_EA_BASES \
+ EA_BASES_16BIT \
+ EA_BASES_32BIT \
+ EA_BASES_64BIT
+
+#define ALL_SIB_BASES \
+ REGS_32BIT \
+ REGS_64BIT
+
+#define ALL_REGS \
+ REGS_8BIT \
+ REGS_16BIT \
+ REGS_32BIT \
+ REGS_64BIT \
+ REGS_MMX \
+ REGS_XMM \
+ REGS_YMM \
+ REGS_SEGMENT \
+ REGS_DEBUG \
+ REGS_CONTROL \
+ ENTRY(RIP)
+
+/*
+ * EABase - All possible values of the base field for effective-address
+ * computations, a.k.a. the Mod and R/M fields of the ModR/M byte. We
+ * distinguish between bases (EA_BASE_*) and registers that just happen to be
+ * referred to when Mod == 0b11 (EA_REG_*).
+ */
+typedef enum {
+ EA_BASE_NONE,
+#define ENTRY(x) EA_BASE_##x,
+ ALL_EA_BASES
+#undef ENTRY
+#define ENTRY(x) EA_REG_##x,
+ ALL_REGS
+#undef ENTRY
+ EA_max
+} EABase;
+
+/*
+ * SIBIndex - All possible values of the SIB index field.
+ * Borrows entries from ALL_EA_BASES with the special case that
+ * sib is synonymous with NONE.
+ * Vector SIB: index can be XMM or YMM.
+ */
+typedef enum {
+ SIB_INDEX_NONE,
+#define ENTRY(x) SIB_INDEX_##x,
+ ALL_EA_BASES
+ REGS_XMM
+ REGS_YMM
+#undef ENTRY
+ SIB_INDEX_max
+} SIBIndex;
+
+/*
+ * SIBBase - All possible values of the SIB base field.
+ */
+typedef enum {
+ SIB_BASE_NONE,
+#define ENTRY(x) SIB_BASE_##x,
+ ALL_SIB_BASES
+#undef ENTRY
+ SIB_BASE_max
+} SIBBase;
+
+/*
+ * EADisplacement - Possible displacement types for effective-address
+ * computations.
+ */
+typedef enum {
+ EA_DISP_NONE,
+ EA_DISP_8,
+ EA_DISP_16,
+ EA_DISP_32
+} EADisplacement;
+
+/*
+ * Reg - All possible values of the reg field in the ModR/M byte.
+ */
+typedef enum {
+#define ENTRY(x) MODRM_REG_##x,
+ ALL_REGS
+#undef ENTRY
+ MODRM_REG_max
+} Reg;
+
+/*
+ * SegmentOverride - All possible segment overrides.
+ */
+typedef enum {
+ SEG_OVERRIDE_NONE,
+ SEG_OVERRIDE_CS,
+ SEG_OVERRIDE_SS,
+ SEG_OVERRIDE_DS,
+ SEG_OVERRIDE_ES,
+ SEG_OVERRIDE_FS,
+ SEG_OVERRIDE_GS,
+ SEG_OVERRIDE_max
+} SegmentOverride;
+
+/*
+ * VEXLeadingOpcodeByte - Possible values for the VEX.m-mmmm field
+ */
+
+typedef enum {
+ VEX_LOB_0F = 0x1,
+ VEX_LOB_0F38 = 0x2,
+ VEX_LOB_0F3A = 0x3
+} VEXLeadingOpcodeByte;
+
+/*
+ * VEXPrefixCode - Possible values for the VEX.pp field
+ */
+
+typedef enum {
+ VEX_PREFIX_NONE = 0x0,
+ VEX_PREFIX_66 = 0x1,
+ VEX_PREFIX_F3 = 0x2,
+ VEX_PREFIX_F2 = 0x3
+} VEXPrefixCode;
+
+typedef uint8_t BOOL;
+
+/*
+ * byteReader_t - Type for the byte reader that the consumer must provide to
+ * the decoder. Reads a single byte from the instruction's address space.
+ * @param arg - A baton that the consumer can associate with any internal
+ * state that it needs.
+ * @param byte - A pointer to a single byte in memory that should be set to
+ * contain the value at address.
+ * @param address - The address in the instruction's address space that should
+ * be read from.
+ * @return - -1 if the byte cannot be read for any reason; 0 otherwise.
+ */
+typedef int (*byteReader_t)(const void* arg, uint8_t* byte, uint64_t address);
+
+/*
+ * dlog_t - Type for the logging function that the consumer can provide to
+ * get debugging output from the decoder.
+ * @param arg - A baton that the consumer can associate with any internal
+ * state that it needs.
+ * @param log - A string that contains the message. Will be reused after
+ * the logger returns.
+ */
+typedef void (*dlog_t)(void* arg, const char *log);
+
+/*
+ * The x86 internal instruction, which is produced by the decoder.
+ */
+struct InternalInstruction {
+ /* Reader interface (C) */
+ byteReader_t reader;
+ /* Opaque value passed to the reader */
+ const void* readerArg;
+ /* The address of the next byte to read via the reader */
+ uint64_t readerCursor;
+
+ /* Logger interface (C) */
+ dlog_t dlog;
+ /* Opaque value passed to the logger */
+ void* dlogArg;
+
+ /* General instruction information */
+
+ /* The mode to disassemble for (64-bit, protected, real) */
+ DisassemblerMode mode;
+ /* The start of the instruction, usable with the reader */
+ uint64_t startLocation;
+ /* The length of the instruction, in bytes */
+ size_t length;
+
+ /* Prefix state */
+
+ /* 1 if the prefix byte corresponding to the entry is present; 0 if not */
+ uint8_t prefixPresent[0x100];
+ /* contains the location (for use with the reader) of the prefix byte */
+ uint64_t prefixLocations[0x100];
+ /* The value of the VEX prefix, if present */
+ uint8_t vexPrefix[3];
+ /* The length of the VEX prefix (0 if not present) */
+ uint8_t vexSize;
+ /* The value of the REX prefix, if present */
+ uint8_t rexPrefix;
+ /* The location where a mandatory prefix would have to be (i.e., right before
+ the opcode, or right before the REX prefix if one is present) */
+ uint64_t necessaryPrefixLocation;
+ /* The segment override type */
+ SegmentOverride segmentOverride;
+
+ /* Sizes of various critical pieces of data, in bytes */
+ uint8_t registerSize;
+ uint8_t addressSize;
+ uint8_t displacementSize;
+ uint8_t immediateSize;
+
+ /* Offsets from the start of the instruction to the pieces of data, which is
+ needed to find relocation entries for adding symbolic operands */
+ uint8_t displacementOffset;
+ uint8_t immediateOffset;
+
+ /* opcode state */
+
+ /* The value of the two-byte escape prefix (usually 0x0f) */
+ uint8_t twoByteEscape;
+ /* The value of the three-byte escape prefix (usually 0x38 or 0x3a) */
+ uint8_t threeByteEscape;
+ /* The last byte of the opcode, not counting any ModR/M extension */
+ uint8_t opcode;
+ /* The ModR/M byte of the instruction, if it is an opcode extension */
+ uint8_t modRMExtension;
+
+ /* decode state */
+
+ /* The type of opcode, used for indexing into the array of decode tables */
+ OpcodeType opcodeType;
+ /* The instruction ID, extracted from the decode table */
+ uint16_t instructionID;
+ /* The specifier for the instruction, from the instruction info table */
+ const struct InstructionSpecifier *spec;
+
+ /* state for additional bytes, consumed during operand decode. Pattern:
+ consumed___ indicates that the byte was already consumed and does not
+ need to be consumed again */
+
+ /* The VEX.vvvv field, which contains a third register operand for some AVX
+ instructions */
+ Reg vvvv;
+
+ /* The ModR/M byte, which contains most register operands and some portion of
+ all memory operands */
+ BOOL consumedModRM;
+ uint8_t modRM;
+
+ /* The SIB byte, used for more complex 32- or 64-bit memory operands */
+ BOOL consumedSIB;
+ uint8_t sib;
+
+ /* The displacement, used for memory operands */
+ BOOL consumedDisplacement;
+ int32_t displacement;
+
+ /* Immediates. There can be two in some cases */
+ uint8_t numImmediatesConsumed;
+ uint8_t numImmediatesTranslated;
+ uint64_t immediates[2];
+
+ /* A register or immediate operand encoded into the opcode */
+ BOOL consumedOpcodeModifier;
+ uint8_t opcodeModifier;
+ Reg opcodeRegister;
+
+ /* Portions of the ModR/M byte */
+
+ /* These fields determine the allowable values for the ModR/M fields, which
+ depend on operand and address widths */
+ EABase eaBaseBase;
+ EABase eaRegBase;
+ Reg regBase;
+
+ /* The Mod and R/M fields can encode a base for an effective address, or a
+ register. These are separated into two fields here */
+ EABase eaBase;
+ EADisplacement eaDisplacement;
+ /* The reg field always encodes a register */
+ Reg reg;
+
+ /* SIB state */
+ SIBIndex sibIndex;
+ uint8_t sibScale;
+ SIBBase sibBase;
+
+ const struct OperandSpecifier *operands;
+};
+
+/* decodeInstruction - Decode one instruction and store the decoding results in
+ * a buffer provided by the consumer.
+ * @param insn - The buffer to store the instruction in. Allocated by the
+ * consumer.
+ * @param reader - The byteReader_t for the bytes to be read.
+ * @param readerArg - An argument to pass to the reader for storing context
+ * specific to the consumer. May be NULL.
+ * @param logger - The dlog_t to be used in printing status messages from the
+ * disassembler. May be NULL.
+ * @param loggerArg - An argument to pass to the logger for storing context
+ * specific to the logger. May be NULL.
+ * @param startLoc - The address (in the reader's address space) of the first
+ * byte in the instruction.
+ * @param mode - The mode (16-bit, 32-bit, 64-bit) to decode in.
+ * @return - Nonzero if there was an error during decode, 0 otherwise.
+ */
+int decodeInstruction(struct InternalInstruction* insn,
+ byteReader_t reader,
+ const void* readerArg,
+ dlog_t logger,
+ void* loggerArg,
+ const void* miiArg,
+ uint64_t startLoc,
+ DisassemblerMode mode);
+
+/* x86DisassemblerDebug - C-accessible function for printing a message to
+ * debugs()
+ * @param file - The name of the file printing the debug message.
+ * @param line - The line number that printed the debug message.
+ * @param s - The message to print.
+ */
+
+void x86DisassemblerDebug(const char *file,
+ unsigned line,
+ const char *s);
+
+const char *x86DisassemblerGetInstrName(unsigned Opcode, const void *mii);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h
new file mode 100644
index 0000000..23dfe4b
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h
@@ -0,0 +1,398 @@
+/*===-- X86DisassemblerDecoderCommon.h - Disassembler decoder -----*- C -*-===*
+ *
+ * The LLVM Compiler Infrastructure
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *
+ *===----------------------------------------------------------------------===*
+ *
+ * This file is part of the X86 Disassembler.
+ * It contains common definitions used by both the disassembler and the table
+ * generator.
+ * Documentation for the disassembler can be found in X86Disassembler.h.
+ *
+ *===----------------------------------------------------------------------===*/
+
+/*
+ * This header file provides those definitions that need to be shared between
+ * the decoder and the table generator in a C-friendly manner.
+ */
+
+#ifndef X86DISASSEMBLERDECODERCOMMON_H
+#define X86DISASSEMBLERDECODERCOMMON_H
+
+#include "llvm/Support/DataTypes.h"
+
+#define INSTRUCTIONS_SYM x86DisassemblerInstrSpecifiers
+#define CONTEXTS_SYM x86DisassemblerContexts
+#define ONEBYTE_SYM x86DisassemblerOneByteOpcodes
+#define TWOBYTE_SYM x86DisassemblerTwoByteOpcodes
+#define THREEBYTE38_SYM x86DisassemblerThreeByte38Opcodes
+#define THREEBYTE3A_SYM x86DisassemblerThreeByte3AOpcodes
+#define THREEBYTEA6_SYM x86DisassemblerThreeByteA6Opcodes
+#define THREEBYTEA7_SYM x86DisassemblerThreeByteA7Opcodes
+
+#define INSTRUCTIONS_STR "x86DisassemblerInstrSpecifiers"
+#define CONTEXTS_STR "x86DisassemblerContexts"
+#define ONEBYTE_STR "x86DisassemblerOneByteOpcodes"
+#define TWOBYTE_STR "x86DisassemblerTwoByteOpcodes"
+#define THREEBYTE38_STR "x86DisassemblerThreeByte38Opcodes"
+#define THREEBYTE3A_STR "x86DisassemblerThreeByte3AOpcodes"
+#define THREEBYTEA6_STR "x86DisassemblerThreeByteA6Opcodes"
+#define THREEBYTEA7_STR "x86DisassemblerThreeByteA7Opcodes"
+
+/*
+ * Attributes of an instruction that must be known before the opcode can be
+ * processed correctly. Most of these indicate the presence of particular
+ * prefixes, but ATTR_64BIT is simply an attribute of the decoding context.
+ */
+#define ATTRIBUTE_BITS \
+ ENUM_ENTRY(ATTR_NONE, 0x00) \
+ ENUM_ENTRY(ATTR_64BIT, 0x01) \
+ ENUM_ENTRY(ATTR_XS, 0x02) \
+ ENUM_ENTRY(ATTR_XD, 0x04) \
+ ENUM_ENTRY(ATTR_REXW, 0x08) \
+ ENUM_ENTRY(ATTR_OPSIZE, 0x10) \
+ ENUM_ENTRY(ATTR_ADSIZE, 0x20) \
+ ENUM_ENTRY(ATTR_VEX, 0x40) \
+ ENUM_ENTRY(ATTR_VEXL, 0x80)
+
+#define ENUM_ENTRY(n, v) n = v,
+enum attributeBits {
+ ATTRIBUTE_BITS
+ ATTR_max
+};
+#undef ENUM_ENTRY
+
+/*
+ * Combinations of the above attributes that are relevant to instruction
+ * decode. Although other combinations are possible, they can be reduced to
+ * these without affecting the ultimately decoded instruction.
+ */
+
+/* Class name Rank Rationale for rank assignment */
+#define INSTRUCTION_CONTEXTS \
+ ENUM_ENTRY(IC, 0, "says nothing about the instruction") \
+ ENUM_ENTRY(IC_64BIT, 1, "says the instruction applies in " \
+ "64-bit mode but no more") \
+ ENUM_ENTRY(IC_OPSIZE, 3, "requires an OPSIZE prefix, so " \
+ "operands change width") \
+ ENUM_ENTRY(IC_ADSIZE, 3, "requires an ADSIZE prefix, so " \
+ "operands change width") \
+ ENUM_ENTRY(IC_XD, 2, "may say something about the opcode " \
+ "but not the operands") \
+ ENUM_ENTRY(IC_XS, 2, "may say something about the opcode " \
+ "but not the operands") \
+ ENUM_ENTRY(IC_XD_OPSIZE, 3, "requires an OPSIZE prefix, so " \
+ "operands change width") \
+ ENUM_ENTRY(IC_XS_OPSIZE, 3, "requires an OPSIZE prefix, so " \
+ "operands change width") \
+ ENUM_ENTRY(IC_64BIT_REXW, 4, "requires a REX.W prefix, so operands "\
+ "change width; overrides IC_OPSIZE") \
+ ENUM_ENTRY(IC_64BIT_OPSIZE, 3, "Just as meaningful as IC_OPSIZE") \
+ ENUM_ENTRY(IC_64BIT_ADSIZE, 3, "Just as meaningful as IC_ADSIZE") \
+ ENUM_ENTRY(IC_64BIT_XD, 5, "XD instructions are SSE; REX.W is " \
+ "secondary") \
+ ENUM_ENTRY(IC_64BIT_XS, 5, "Just as meaningful as IC_64BIT_XD") \
+ ENUM_ENTRY(IC_64BIT_XD_OPSIZE, 3, "Just as meaningful as IC_XD_OPSIZE") \
+ ENUM_ENTRY(IC_64BIT_XS_OPSIZE, 3, "Just as meaningful as IC_XS_OPSIZE") \
+ ENUM_ENTRY(IC_64BIT_REXW_XS, 6, "OPSIZE could mean a different " \
+ "opcode") \
+ ENUM_ENTRY(IC_64BIT_REXW_XD, 6, "Just as meaningful as " \
+ "IC_64BIT_REXW_XS") \
+ ENUM_ENTRY(IC_64BIT_REXW_OPSIZE, 7, "The Dynamic Duo! Prefer over all " \
+ "else because this changes most " \
+ "operands' meaning") \
+ ENUM_ENTRY(IC_VEX, 1, "requires a VEX prefix") \
+ ENUM_ENTRY(IC_VEX_XS, 2, "requires VEX and the XS prefix") \
+ ENUM_ENTRY(IC_VEX_XD, 2, "requires VEX and the XD prefix") \
+ ENUM_ENTRY(IC_VEX_OPSIZE, 2, "requires VEX and the OpSize prefix") \
+ ENUM_ENTRY(IC_VEX_W, 3, "requires VEX and the W prefix") \
+ ENUM_ENTRY(IC_VEX_W_XS, 4, "requires VEX, W, and XS prefix") \
+ ENUM_ENTRY(IC_VEX_W_XD, 4, "requires VEX, W, and XD prefix") \
+ ENUM_ENTRY(IC_VEX_W_OPSIZE, 4, "requires VEX, W, and OpSize") \
+ ENUM_ENTRY(IC_VEX_L, 3, "requires VEX and the L prefix") \
+ ENUM_ENTRY(IC_VEX_L_XS, 4, "requires VEX and the L and XS prefix")\
+ ENUM_ENTRY(IC_VEX_L_XD, 4, "requires VEX and the L and XD prefix")\
+ ENUM_ENTRY(IC_VEX_L_OPSIZE, 4, "requires VEX, L, and OpSize") \
+ ENUM_ENTRY(IC_VEX_L_W_OPSIZE, 5, "requires VEX, L, W and OpSize")
+
+
+#define ENUM_ENTRY(n, r, d) n,
+typedef enum {
+ INSTRUCTION_CONTEXTS
+ IC_max
+} InstructionContext;
+#undef ENUM_ENTRY
+
+/*
+ * Opcode types, which determine which decode table to use, both in the Intel
+ * manual and also for the decoder.
+ */
+typedef enum {
+ ONEBYTE = 0,
+ TWOBYTE = 1,
+ THREEBYTE_38 = 2,
+ THREEBYTE_3A = 3,
+ THREEBYTE_A6 = 4,
+ THREEBYTE_A7 = 5
+} OpcodeType;
+
+/*
+ * The following structs are used for the hierarchical decode table. After
+ * determining the instruction's class (i.e., which IC_* constant applies to
+ * it), the decoder reads the opcode. Some instructions require specific
+ * values of the ModR/M byte, so the ModR/M byte indexes into the final table.
+ *
+ * If a ModR/M byte is not required, "required" is left unset, and the values
+ * for each instructionID are identical.
+ */
+
+typedef uint16_t InstrUID;
+
+/*
+ * ModRMDecisionType - describes the type of ModR/M decision, allowing the
+ * consumer to determine the number of entries in it.
+ *
+ * MODRM_ONEENTRY - No matter what the value of the ModR/M byte is, the decoded
+ * instruction is the same.
+ * MODRM_SPLITRM - If the ModR/M byte is between 0x00 and 0xbf, the opcode
+ * corresponds to one instruction; otherwise, it corresponds to
+ * a different instruction.
+ * MODRM_SPLITMISC- If the ModR/M byte is between 0x00 and 0xbf, ModR/M byte
+ * divided by 8 is used to select instruction; otherwise, each
+ * value of the ModR/M byte could correspond to a different
+ * instruction.
+ * MODRM_SPLITREG - ModR/M byte divided by 8 is used to select instruction. This
+ corresponds to instructions that use reg field as opcode
+ * MODRM_FULL - Potentially, each value of the ModR/M byte could correspond
+ * to a different instruction.
+ */
+
+#define MODRMTYPES \
+ ENUM_ENTRY(MODRM_ONEENTRY) \
+ ENUM_ENTRY(MODRM_SPLITRM) \
+ ENUM_ENTRY(MODRM_SPLITMISC) \
+ ENUM_ENTRY(MODRM_SPLITREG) \
+ ENUM_ENTRY(MODRM_FULL)
+
+#define ENUM_ENTRY(n) n,
+typedef enum {
+ MODRMTYPES
+ MODRM_max
+} ModRMDecisionType;
+#undef ENUM_ENTRY
+
+/*
+ * ModRMDecision - Specifies whether a ModR/M byte is needed and (if so) which
+ * instruction each possible value of the ModR/M byte corresponds to. Once
+ * this information is known, we have narrowed down to a single instruction.
+ */
+struct ModRMDecision {
+ uint8_t modrm_type;
+
+ /* The macro below must be defined wherever this file is included. */
+ INSTRUCTION_IDS
+};
+
+/*
+ * OpcodeDecision - Specifies which set of ModR/M->instruction tables to look at
+ * given a particular opcode.
+ */
+struct OpcodeDecision {
+ struct ModRMDecision modRMDecisions[256];
+};
+
+/*
+ * ContextDecision - Specifies which opcode->instruction tables to look at given
+ * a particular context (set of attributes). Since there are many possible
+ * contexts, the decoder first uses CONTEXTS_SYM to determine which context
+ * applies given a specific set of attributes. Hence there are only IC_max
+ * entries in this table, rather than 2^(ATTR_max).
+ */
+struct ContextDecision {
+ struct OpcodeDecision opcodeDecisions[IC_max];
+};
+
+/*
+ * Physical encodings of instruction operands.
+ */
+
+#define ENCODINGS \
+ ENUM_ENTRY(ENCODING_NONE, "") \
+ ENUM_ENTRY(ENCODING_REG, "Register operand in ModR/M byte.") \
+ ENUM_ENTRY(ENCODING_RM, "R/M operand in ModR/M byte.") \
+ ENUM_ENTRY(ENCODING_VVVV, "Register operand in VEX.vvvv byte.") \
+ ENUM_ENTRY(ENCODING_CB, "1-byte code offset (possible new CS value)") \
+ ENUM_ENTRY(ENCODING_CW, "2-byte") \
+ ENUM_ENTRY(ENCODING_CD, "4-byte") \
+ ENUM_ENTRY(ENCODING_CP, "6-byte") \
+ ENUM_ENTRY(ENCODING_CO, "8-byte") \
+ ENUM_ENTRY(ENCODING_CT, "10-byte") \
+ ENUM_ENTRY(ENCODING_IB, "1-byte immediate") \
+ ENUM_ENTRY(ENCODING_IW, "2-byte") \
+ ENUM_ENTRY(ENCODING_ID, "4-byte") \
+ ENUM_ENTRY(ENCODING_IO, "8-byte") \
+ ENUM_ENTRY(ENCODING_RB, "(AL..DIL, R8L..R15L) Register code added to " \
+ "the opcode byte") \
+ ENUM_ENTRY(ENCODING_RW, "(AX..DI, R8W..R15W)") \
+ ENUM_ENTRY(ENCODING_RD, "(EAX..EDI, R8D..R15D)") \
+ ENUM_ENTRY(ENCODING_RO, "(RAX..RDI, R8..R15)") \
+ ENUM_ENTRY(ENCODING_I, "Position on floating-point stack added to the " \
+ "opcode byte") \
+ \
+ ENUM_ENTRY(ENCODING_Iv, "Immediate of operand size") \
+ ENUM_ENTRY(ENCODING_Ia, "Immediate of address size") \
+ ENUM_ENTRY(ENCODING_Rv, "Register code of operand size added to the " \
+ "opcode byte") \
+ ENUM_ENTRY(ENCODING_DUP, "Duplicate of another operand; ID is encoded " \
+ "in type")
+
+#define ENUM_ENTRY(n, d) n,
+ typedef enum {
+ ENCODINGS
+ ENCODING_max
+ } OperandEncoding;
+#undef ENUM_ENTRY
+
+/*
+ * Semantic interpretations of instruction operands.
+ */
+
+#define TYPES \
+ ENUM_ENTRY(TYPE_NONE, "") \
+ ENUM_ENTRY(TYPE_REL8, "1-byte immediate address") \
+ ENUM_ENTRY(TYPE_REL16, "2-byte") \
+ ENUM_ENTRY(TYPE_REL32, "4-byte") \
+ ENUM_ENTRY(TYPE_REL64, "8-byte") \
+ ENUM_ENTRY(TYPE_PTR1616, "2+2-byte segment+offset address") \
+ ENUM_ENTRY(TYPE_PTR1632, "2+4-byte") \
+ ENUM_ENTRY(TYPE_PTR1664, "2+8-byte") \
+ ENUM_ENTRY(TYPE_R8, "1-byte register operand") \
+ ENUM_ENTRY(TYPE_R16, "2-byte") \
+ ENUM_ENTRY(TYPE_R32, "4-byte") \
+ ENUM_ENTRY(TYPE_R64, "8-byte") \
+ ENUM_ENTRY(TYPE_IMM8, "1-byte immediate operand") \
+ ENUM_ENTRY(TYPE_IMM16, "2-byte") \
+ ENUM_ENTRY(TYPE_IMM32, "4-byte") \
+ ENUM_ENTRY(TYPE_IMM64, "8-byte") \
+ ENUM_ENTRY(TYPE_IMM3, "1-byte immediate operand between 0 and 7") \
+ ENUM_ENTRY(TYPE_IMM5, "1-byte immediate operand between 0 and 31") \
+ ENUM_ENTRY(TYPE_RM8, "1-byte register or memory operand") \
+ ENUM_ENTRY(TYPE_RM16, "2-byte") \
+ ENUM_ENTRY(TYPE_RM32, "4-byte") \
+ ENUM_ENTRY(TYPE_RM64, "8-byte") \
+ ENUM_ENTRY(TYPE_M, "Memory operand") \
+ ENUM_ENTRY(TYPE_M8, "1-byte") \
+ ENUM_ENTRY(TYPE_M16, "2-byte") \
+ ENUM_ENTRY(TYPE_M32, "4-byte") \
+ ENUM_ENTRY(TYPE_M64, "8-byte") \
+ ENUM_ENTRY(TYPE_LEA, "Effective address") \
+ ENUM_ENTRY(TYPE_M128, "16-byte (SSE/SSE2)") \
+ ENUM_ENTRY(TYPE_M256, "256-byte (AVX)") \
+ ENUM_ENTRY(TYPE_M1616, "2+2-byte segment+offset address") \
+ ENUM_ENTRY(TYPE_M1632, "2+4-byte") \
+ ENUM_ENTRY(TYPE_M1664, "2+8-byte") \
+ ENUM_ENTRY(TYPE_M16_32, "2+4-byte two-part memory operand (LIDT, LGDT)") \
+ ENUM_ENTRY(TYPE_M16_16, "2+2-byte (BOUND)") \
+ ENUM_ENTRY(TYPE_M32_32, "4+4-byte (BOUND)") \
+ ENUM_ENTRY(TYPE_M16_64, "2+8-byte (LIDT, LGDT)") \
+ ENUM_ENTRY(TYPE_MOFFS8, "1-byte memory offset (relative to segment " \
+ "base)") \
+ ENUM_ENTRY(TYPE_MOFFS16, "2-byte") \
+ ENUM_ENTRY(TYPE_MOFFS32, "4-byte") \
+ ENUM_ENTRY(TYPE_MOFFS64, "8-byte") \
+ ENUM_ENTRY(TYPE_SREG, "Byte with single bit set: 0 = ES, 1 = CS, " \
+ "2 = SS, 3 = DS, 4 = FS, 5 = GS") \
+ ENUM_ENTRY(TYPE_M32FP, "32-bit IEE754 memory floating-point operand") \
+ ENUM_ENTRY(TYPE_M64FP, "64-bit") \
+ ENUM_ENTRY(TYPE_M80FP, "80-bit extended") \
+ ENUM_ENTRY(TYPE_M16INT, "2-byte memory integer operand for use in " \
+ "floating-point instructions") \
+ ENUM_ENTRY(TYPE_M32INT, "4-byte") \
+ ENUM_ENTRY(TYPE_M64INT, "8-byte") \
+ ENUM_ENTRY(TYPE_ST, "Position on the floating-point stack") \
+ ENUM_ENTRY(TYPE_MM, "MMX register operand") \
+ ENUM_ENTRY(TYPE_MM32, "4-byte MMX register or memory operand") \
+ ENUM_ENTRY(TYPE_MM64, "8-byte") \
+ ENUM_ENTRY(TYPE_XMM, "XMM register operand") \
+ ENUM_ENTRY(TYPE_XMM32, "4-byte XMM register or memory operand") \
+ ENUM_ENTRY(TYPE_XMM64, "8-byte") \
+ ENUM_ENTRY(TYPE_XMM128, "16-byte") \
+ ENUM_ENTRY(TYPE_XMM256, "32-byte") \
+ ENUM_ENTRY(TYPE_XMM0, "Implicit use of XMM0") \
+ ENUM_ENTRY(TYPE_SEGMENTREG, "Segment register operand") \
+ ENUM_ENTRY(TYPE_DEBUGREG, "Debug register operand") \
+ ENUM_ENTRY(TYPE_CONTROLREG, "Control register operand") \
+ \
+ ENUM_ENTRY(TYPE_Mv, "Memory operand of operand size") \
+ ENUM_ENTRY(TYPE_Rv, "Register operand of operand size") \
+ ENUM_ENTRY(TYPE_IMMv, "Immediate operand of operand size") \
+ ENUM_ENTRY(TYPE_RELv, "Immediate address of operand size") \
+ ENUM_ENTRY(TYPE_DUP0, "Duplicate of operand 0") \
+ ENUM_ENTRY(TYPE_DUP1, "operand 1") \
+ ENUM_ENTRY(TYPE_DUP2, "operand 2") \
+ ENUM_ENTRY(TYPE_DUP3, "operand 3") \
+ ENUM_ENTRY(TYPE_DUP4, "operand 4") \
+ ENUM_ENTRY(TYPE_M512, "512-bit FPU/MMX/XMM/MXCSR state")
+
+#define ENUM_ENTRY(n, d) n,
+typedef enum {
+ TYPES
+ TYPE_max
+} OperandType;
+#undef ENUM_ENTRY
+
+/*
+ * OperandSpecifier - The specification for how to extract and interpret one
+ * operand.
+ */
+struct OperandSpecifier {
+ uint8_t encoding;
+ uint8_t type;
+};
+
+/*
+ * Indicates where the opcode modifier (if any) is to be found. Extended
+ * opcodes with AddRegFrm have the opcode modifier in the ModR/M byte.
+ */
+
+#define MODIFIER_TYPES \
+ ENUM_ENTRY(MODIFIER_NONE) \
+ ENUM_ENTRY(MODIFIER_OPCODE) \
+ ENUM_ENTRY(MODIFIER_MODRM)
+
+#define ENUM_ENTRY(n) n,
+typedef enum {
+ MODIFIER_TYPES
+ MODIFIER_max
+} ModifierType;
+#undef ENUM_ENTRY
+
+#define X86_MAX_OPERANDS 5
+
+/*
+ * The specification for how to extract and interpret a full instruction and
+ * its operands.
+ */
+struct InstructionSpecifier {
+ uint8_t modifierType;
+ uint8_t modifierBase;
+
+ /* The macro below must be defined wherever this file is included. */
+ INSTRUCTION_SPECIFIER_FIELDS
+};
+
+/*
+ * Decoding mode for the Intel disassembler. 16-bit, 32-bit, and 64-bit mode
+ * are supported, and represent real mode, IA-32e, and IA-32e in 64-bit mode,
+ * respectively.
+ */
+typedef enum {
+ MODE_16BIT,
+ MODE_32BIT,
+ MODE_64BIT
+} DisassemblerMode;
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
new file mode 100644
index 0000000..e357710
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
@@ -0,0 +1,218 @@
+//===-- X86ATTInstPrinter.cpp - AT&T assembly instruction printing --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file includes code for rendering MCInst instances as AT&T-style
+// assembly.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "asm-printer"
+#include "X86ATTInstPrinter.h"
+#include "MCTargetDesc/X86BaseInfo.h"
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "X86InstComments.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/FormattedStream.h"
+#include <map>
+using namespace llvm;
+
+// Include the auto-generated portion of the assembly writer.
+#define PRINT_ALIAS_INSTR
+#include "X86GenAsmWriter.inc"
+
+void X86ATTInstPrinter::printRegName(raw_ostream &OS,
+ unsigned RegNo) const {
+ OS << markup("<reg:")
+ << '%' << getRegisterName(RegNo)
+ << markup(">");
+}
+
+void X86ATTInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
+ StringRef Annot) {
+ const MCInstrDesc &Desc = MII.get(MI->getOpcode());
+ uint64_t TSFlags = Desc.TSFlags;
+
+ if (TSFlags & X86II::LOCK)
+ OS << "\tlock\n";
+
+ // Try to print any aliases first.
+ if (!printAliasInstr(MI, OS))
+ printInstruction(MI, OS);
+
+ // Next always print the annotation.
+ printAnnotation(OS, Annot);
+
+ // If verbose assembly is enabled, we can print some informative comments.
+ if (CommentStream)
+ EmitAnyX86InstComments(MI, *CommentStream, getRegisterName);
+}
+
+void X86ATTInstPrinter::printSSECC(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ int64_t Imm = MI->getOperand(Op).getImm() & 0xf;
+ switch (Imm) {
+ default: llvm_unreachable("Invalid ssecc argument!");
+ case 0: O << "eq"; break;
+ case 1: O << "lt"; break;
+ case 2: O << "le"; break;
+ case 3: O << "unord"; break;
+ case 4: O << "neq"; break;
+ case 5: O << "nlt"; break;
+ case 6: O << "nle"; break;
+ case 7: O << "ord"; break;
+ case 8: O << "eq_uq"; break;
+ case 9: O << "nge"; break;
+ case 0xa: O << "ngt"; break;
+ case 0xb: O << "false"; break;
+ case 0xc: O << "neq_oq"; break;
+ case 0xd: O << "ge"; break;
+ case 0xe: O << "gt"; break;
+ case 0xf: O << "true"; break;
+ }
+}
+
+void X86ATTInstPrinter::printAVXCC(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ int64_t Imm = MI->getOperand(Op).getImm() & 0x1f;
+ switch (Imm) {
+ default: llvm_unreachable("Invalid avxcc argument!");
+ case 0: O << "eq"; break;
+ case 1: O << "lt"; break;
+ case 2: O << "le"; break;
+ case 3: O << "unord"; break;
+ case 4: O << "neq"; break;
+ case 5: O << "nlt"; break;
+ case 6: O << "nle"; break;
+ case 7: O << "ord"; break;
+ case 8: O << "eq_uq"; break;
+ case 9: O << "nge"; break;
+ case 0xa: O << "ngt"; break;
+ case 0xb: O << "false"; break;
+ case 0xc: O << "neq_oq"; break;
+ case 0xd: O << "ge"; break;
+ case 0xe: O << "gt"; break;
+ case 0xf: O << "true"; break;
+ case 0x10: O << "eq_os"; break;
+ case 0x11: O << "lt_oq"; break;
+ case 0x12: O << "le_oq"; break;
+ case 0x13: O << "unord_s"; break;
+ case 0x14: O << "neq_us"; break;
+ case 0x15: O << "nlt_uq"; break;
+ case 0x16: O << "nle_uq"; break;
+ case 0x17: O << "ord_s"; break;
+ case 0x18: O << "eq_us"; break;
+ case 0x19: O << "nge_uq"; break;
+ case 0x1a: O << "ngt_uq"; break;
+ case 0x1b: O << "false_os"; break;
+ case 0x1c: O << "neq_os"; break;
+ case 0x1d: O << "ge_oq"; break;
+ case 0x1e: O << "gt_oq"; break;
+ case 0x1f: O << "true_us"; break;
+ }
+}
+
+/// printPCRelImm - This is used to print an immediate value that ends up
+/// being encoded as a pc-relative value (e.g. for jumps and calls). These
+/// print slightly differently than normal immediates. For example, a $ is not
+/// emitted.
+void X86ATTInstPrinter::printPCRelImm(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ const MCOperand &Op = MI->getOperand(OpNo);
+ if (Op.isImm())
+ O << formatImm(Op.getImm());
+ else {
+ assert(Op.isExpr() && "unknown pcrel immediate operand");
+ // If a symbolic branch target was added as a constant expression then print
+ // that address in hex.
+ const MCConstantExpr *BranchTarget = dyn_cast<MCConstantExpr>(Op.getExpr());
+ int64_t Address;
+ if (BranchTarget && BranchTarget->EvaluateAsAbsolute(Address)) {
+ O << "0x";
+ O.write_hex(Address);
+ }
+ else {
+ // Otherwise, just print the expression.
+ O << *Op.getExpr();
+ }
+ }
+}
+
+void X86ATTInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ const MCOperand &Op = MI->getOperand(OpNo);
+ if (Op.isReg()) {
+ printRegName(O, Op.getReg());
+ } else if (Op.isImm()) {
+ // Print X86 immediates as signed values.
+ O << markup("<imm:")
+ << '$' << formatImm((int64_t)Op.getImm())
+ << markup(">");
+
+ if (CommentStream && (Op.getImm() > 255 || Op.getImm() < -256))
+ *CommentStream << format("imm = 0x%" PRIX64 "\n", (uint64_t)Op.getImm());
+
+ } else {
+ assert(Op.isExpr() && "unknown operand kind in printOperand");
+ O << markup("<imm:")
+ << '$' << *Op.getExpr()
+ << markup(">");
+ }
+}
+
+void X86ATTInstPrinter::printMemReference(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ const MCOperand &BaseReg = MI->getOperand(Op);
+ const MCOperand &IndexReg = MI->getOperand(Op+2);
+ const MCOperand &DispSpec = MI->getOperand(Op+3);
+ const MCOperand &SegReg = MI->getOperand(Op+4);
+
+ O << markup("<mem:");
+
+ // If this has a segment register, print it.
+ if (SegReg.getReg()) {
+ printOperand(MI, Op+4, O);
+ O << ':';
+ }
+
+ if (DispSpec.isImm()) {
+ int64_t DispVal = DispSpec.getImm();
+ if (DispVal || (!IndexReg.getReg() && !BaseReg.getReg()))
+ O << formatImm(DispVal);
+ } else {
+ assert(DispSpec.isExpr() && "non-immediate displacement for LEA?");
+ O << *DispSpec.getExpr();
+ }
+
+ if (IndexReg.getReg() || BaseReg.getReg()) {
+ O << '(';
+ if (BaseReg.getReg())
+ printOperand(MI, Op, O);
+
+ if (IndexReg.getReg()) {
+ O << ',';
+ printOperand(MI, Op+2, O);
+ unsigned ScaleVal = MI->getOperand(Op+1).getImm();
+ if (ScaleVal != 1) {
+ O << ','
+ << markup("<imm:")
+ << ScaleVal // never printed in hex.
+ << markup(">");
+ }
+ }
+ O << ')';
+ }
+
+ O << markup(">");
+}
diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h
new file mode 100644
index 0000000..8e09183
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h
@@ -0,0 +1,87 @@
+//==- X86ATTInstPrinter.h - Convert X86 MCInst to assembly syntax -*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an X86 MCInst to AT&T style .s file syntax.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86_ATT_INST_PRINTER_H
+#define X86_ATT_INST_PRINTER_H
+
+#include "llvm/MC/MCInstPrinter.h"
+
+namespace llvm {
+
+class MCOperand;
+
+class X86ATTInstPrinter : public MCInstPrinter {
+public:
+ X86ATTInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+ const MCRegisterInfo &MRI)
+ : MCInstPrinter(MAI, MII, MRI) {}
+
+ virtual void printRegName(raw_ostream &OS, unsigned RegNo) const;
+ virtual void printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot);
+
+ // Autogenerated by tblgen, returns true if we successfully printed an
+ // alias.
+ bool printAliasInstr(const MCInst *MI, raw_ostream &OS);
+
+ // Autogenerated by tblgen.
+ void printInstruction(const MCInst *MI, raw_ostream &OS);
+ static const char *getRegisterName(unsigned RegNo);
+
+ void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
+ void printMemReference(const MCInst *MI, unsigned Op, raw_ostream &OS);
+ void printSSECC(const MCInst *MI, unsigned Op, raw_ostream &OS);
+ void printAVXCC(const MCInst *MI, unsigned Op, raw_ostream &OS);
+ void printPCRelImm(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
+
+ void printopaquemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printMemReference(MI, OpNo, O);
+ }
+
+ void printi8mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printMemReference(MI, OpNo, O);
+ }
+ void printi16mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printMemReference(MI, OpNo, O);
+ }
+ void printi32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printMemReference(MI, OpNo, O);
+ }
+ void printi64mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printMemReference(MI, OpNo, O);
+ }
+ void printi128mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printMemReference(MI, OpNo, O);
+ }
+ void printi256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printMemReference(MI, OpNo, O);
+ }
+ void printf32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printMemReference(MI, OpNo, O);
+ }
+ void printf64mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printMemReference(MI, OpNo, O);
+ }
+ void printf80mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printMemReference(MI, OpNo, O);
+ }
+ void printf128mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printMemReference(MI, OpNo, O);
+ }
+ void printf256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printMemReference(MI, OpNo, O);
+ }
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp b/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp
new file mode 100644
index 0000000..0f6eeb1
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp
@@ -0,0 +1,511 @@
+//===-- X86InstComments.cpp - Generate verbose-asm comments for instrs ----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This defines functionality used to emit comments about X86 instructions to
+// an output stream for -fverbose-asm.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86InstComments.h"
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "Utils/X86ShuffleDecode.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// Top Level Entrypoint
+//===----------------------------------------------------------------------===//
+
+/// EmitAnyX86InstComments - This function decodes x86 instructions and prints
+/// newline terminated strings to the specified string if desired. This
+/// information is shown in disassembly dumps when verbose assembly is enabled.
+void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
+ const char *(*getRegName)(unsigned)) {
+ // If this is a shuffle operation, the switch should fill in this state.
+ SmallVector<int, 8> ShuffleMask;
+ const char *DestName = 0, *Src1Name = 0, *Src2Name = 0;
+
+ switch (MI->getOpcode()) {
+ case X86::INSERTPSrr:
+ case X86::VINSERTPSrr:
+ DestName = getRegName(MI->getOperand(0).getReg());
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ DecodeINSERTPSMask(MI->getOperand(3).getImm(), ShuffleMask);
+ break;
+
+ case X86::MOVLHPSrr:
+ case X86::VMOVLHPSrr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodeMOVLHPSMask(2, ShuffleMask);
+ break;
+
+ case X86::MOVHLPSrr:
+ case X86::VMOVHLPSrr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodeMOVHLPSMask(2, ShuffleMask);
+ break;
+
+ case X86::PALIGNR128rr:
+ case X86::VPALIGNR128rr:
+ Src1Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::PALIGNR128rm:
+ case X86::VPALIGNR128rm:
+ Src2Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodePALIGNRMask(MVT::v16i8,
+ MI->getOperand(MI->getNumOperands()-1).getImm(),
+ ShuffleMask);
+ break;
+ case X86::VPALIGNR256rr:
+ Src1Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::VPALIGNR256rm:
+ Src2Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodePALIGNRMask(MVT::v32i8,
+ MI->getOperand(MI->getNumOperands()-1).getImm(),
+ ShuffleMask);
+ break;
+
+ case X86::PSHUFDri:
+ case X86::VPSHUFDri:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ // FALL THROUGH.
+ case X86::PSHUFDmi:
+ case X86::VPSHUFDmi:
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodePSHUFMask(MVT::v4i32, MI->getOperand(MI->getNumOperands()-1).getImm(),
+ ShuffleMask);
+ break;
+ case X86::VPSHUFDYri:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ // FALL THROUGH.
+ case X86::VPSHUFDYmi:
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodePSHUFMask(MVT::v8i32, MI->getOperand(MI->getNumOperands()-1).getImm(),
+ ShuffleMask);
+ break;
+
+
+ case X86::PSHUFHWri:
+ case X86::VPSHUFHWri:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ // FALL THROUGH.
+ case X86::PSHUFHWmi:
+ case X86::VPSHUFHWmi:
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodePSHUFHWMask(MVT::v8i16,
+ MI->getOperand(MI->getNumOperands()-1).getImm(),
+ ShuffleMask);
+ break;
+ case X86::VPSHUFHWYri:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ // FALL THROUGH.
+ case X86::VPSHUFHWYmi:
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodePSHUFHWMask(MVT::v16i16,
+ MI->getOperand(MI->getNumOperands()-1).getImm(),
+ ShuffleMask);
+ break;
+ case X86::PSHUFLWri:
+ case X86::VPSHUFLWri:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ // FALL THROUGH.
+ case X86::PSHUFLWmi:
+ case X86::VPSHUFLWmi:
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodePSHUFLWMask(MVT::v8i16,
+ MI->getOperand(MI->getNumOperands()-1).getImm(),
+ ShuffleMask);
+ break;
+ case X86::VPSHUFLWYri:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ // FALL THROUGH.
+ case X86::VPSHUFLWYmi:
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodePSHUFLWMask(MVT::v16i16,
+ MI->getOperand(MI->getNumOperands()-1).getImm(),
+ ShuffleMask);
+ break;
+
+ case X86::PUNPCKHBWrr:
+ case X86::VPUNPCKHBWrr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::PUNPCKHBWrm:
+ case X86::VPUNPCKHBWrm:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodeUNPCKHMask(MVT::v16i8, ShuffleMask);
+ break;
+ case X86::VPUNPCKHBWYrr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::VPUNPCKHBWYrm:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodeUNPCKHMask(MVT::v32i8, ShuffleMask);
+ break;
+ case X86::PUNPCKHWDrr:
+ case X86::VPUNPCKHWDrr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::PUNPCKHWDrm:
+ case X86::VPUNPCKHWDrm:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodeUNPCKHMask(MVT::v8i16, ShuffleMask);
+ break;
+ case X86::VPUNPCKHWDYrr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::VPUNPCKHWDYrm:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodeUNPCKHMask(MVT::v16i16, ShuffleMask);
+ break;
+ case X86::PUNPCKHDQrr:
+ case X86::VPUNPCKHDQrr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::PUNPCKHDQrm:
+ case X86::VPUNPCKHDQrm:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodeUNPCKHMask(MVT::v4i32, ShuffleMask);
+ break;
+ case X86::VPUNPCKHDQYrr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::VPUNPCKHDQYrm:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodeUNPCKHMask(MVT::v8i32, ShuffleMask);
+ break;
+ case X86::PUNPCKHQDQrr:
+ case X86::VPUNPCKHQDQrr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::PUNPCKHQDQrm:
+ case X86::VPUNPCKHQDQrm:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodeUNPCKHMask(MVT::v2i64, ShuffleMask);
+ break;
+ case X86::VPUNPCKHQDQYrr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::VPUNPCKHQDQYrm:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodeUNPCKHMask(MVT::v4i64, ShuffleMask);
+ break;
+
+ case X86::PUNPCKLBWrr:
+ case X86::VPUNPCKLBWrr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::PUNPCKLBWrm:
+ case X86::VPUNPCKLBWrm:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodeUNPCKLMask(MVT::v16i8, ShuffleMask);
+ break;
+ case X86::VPUNPCKLBWYrr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::VPUNPCKLBWYrm:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodeUNPCKLMask(MVT::v32i8, ShuffleMask);
+ break;
+ case X86::PUNPCKLWDrr:
+ case X86::VPUNPCKLWDrr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::PUNPCKLWDrm:
+ case X86::VPUNPCKLWDrm:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodeUNPCKLMask(MVT::v8i16, ShuffleMask);
+ break;
+ case X86::VPUNPCKLWDYrr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::VPUNPCKLWDYrm:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodeUNPCKLMask(MVT::v16i16, ShuffleMask);
+ break;
+ case X86::PUNPCKLDQrr:
+ case X86::VPUNPCKLDQrr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::PUNPCKLDQrm:
+ case X86::VPUNPCKLDQrm:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodeUNPCKLMask(MVT::v4i32, ShuffleMask);
+ break;
+ case X86::VPUNPCKLDQYrr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::VPUNPCKLDQYrm:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodeUNPCKLMask(MVT::v8i32, ShuffleMask);
+ break;
+ case X86::PUNPCKLQDQrr:
+ case X86::VPUNPCKLQDQrr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::PUNPCKLQDQrm:
+ case X86::VPUNPCKLQDQrm:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodeUNPCKLMask(MVT::v2i64, ShuffleMask);
+ break;
+ case X86::VPUNPCKLQDQYrr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::VPUNPCKLQDQYrm:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodeUNPCKLMask(MVT::v4i64, ShuffleMask);
+ break;
+
+ case X86::SHUFPDrri:
+ case X86::VSHUFPDrri:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::SHUFPDrmi:
+ case X86::VSHUFPDrmi:
+ DecodeSHUFPMask(MVT::v2f64, MI->getOperand(MI->getNumOperands()-1).getImm(),
+ ShuffleMask);
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+ case X86::VSHUFPDYrri:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::VSHUFPDYrmi:
+ DecodeSHUFPMask(MVT::v4f64, MI->getOperand(MI->getNumOperands()-1).getImm(),
+ ShuffleMask);
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+
+ case X86::SHUFPSrri:
+ case X86::VSHUFPSrri:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::SHUFPSrmi:
+ case X86::VSHUFPSrmi:
+ DecodeSHUFPMask(MVT::v4f32, MI->getOperand(MI->getNumOperands()-1).getImm(),
+ ShuffleMask);
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+ case X86::VSHUFPSYrri:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::VSHUFPSYrmi:
+ DecodeSHUFPMask(MVT::v8f32, MI->getOperand(MI->getNumOperands()-1).getImm(),
+ ShuffleMask);
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+
+ case X86::UNPCKLPDrr:
+ case X86::VUNPCKLPDrr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::UNPCKLPDrm:
+ case X86::VUNPCKLPDrm:
+ DecodeUNPCKLMask(MVT::v2f64, ShuffleMask);
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+ case X86::VUNPCKLPDYrr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::VUNPCKLPDYrm:
+ DecodeUNPCKLMask(MVT::v4f64, ShuffleMask);
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+ case X86::UNPCKLPSrr:
+ case X86::VUNPCKLPSrr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::UNPCKLPSrm:
+ case X86::VUNPCKLPSrm:
+ DecodeUNPCKLMask(MVT::v4f32, ShuffleMask);
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+ case X86::VUNPCKLPSYrr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::VUNPCKLPSYrm:
+ DecodeUNPCKLMask(MVT::v8f32, ShuffleMask);
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+ case X86::UNPCKHPDrr:
+ case X86::VUNPCKHPDrr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::UNPCKHPDrm:
+ case X86::VUNPCKHPDrm:
+ DecodeUNPCKHMask(MVT::v2f64, ShuffleMask);
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+ case X86::VUNPCKHPDYrr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::VUNPCKHPDYrm:
+ DecodeUNPCKHMask(MVT::v4f64, ShuffleMask);
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+ case X86::UNPCKHPSrr:
+ case X86::VUNPCKHPSrr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::UNPCKHPSrm:
+ case X86::VUNPCKHPSrm:
+ DecodeUNPCKHMask(MVT::v4f32, ShuffleMask);
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+ case X86::VUNPCKHPSYrr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::VUNPCKHPSYrm:
+ DecodeUNPCKHMask(MVT::v8f32, ShuffleMask);
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+ case X86::VPERMILPSri:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ // FALL THROUGH.
+ case X86::VPERMILPSmi:
+ DecodePSHUFMask(MVT::v4f32, MI->getOperand(MI->getNumOperands()-1).getImm(),
+ ShuffleMask);
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+ case X86::VPERMILPSYri:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ // FALL THROUGH.
+ case X86::VPERMILPSYmi:
+ DecodePSHUFMask(MVT::v8f32, MI->getOperand(MI->getNumOperands()-1).getImm(),
+ ShuffleMask);
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+ case X86::VPERMILPDri:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ // FALL THROUGH.
+ case X86::VPERMILPDmi:
+ DecodePSHUFMask(MVT::v2f64, MI->getOperand(MI->getNumOperands()-1).getImm(),
+ ShuffleMask);
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+ case X86::VPERMILPDYri:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ // FALL THROUGH.
+ case X86::VPERMILPDYmi:
+ DecodePSHUFMask(MVT::v4f64, MI->getOperand(MI->getNumOperands()-1).getImm(),
+ ShuffleMask);
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+ case X86::VPERM2F128rr:
+ case X86::VPERM2I128rr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::VPERM2F128rm:
+ case X86::VPERM2I128rm:
+ // For instruction comments purpose, assume the 256-bit vector is v4i64.
+ DecodeVPERM2X128Mask(MVT::v4i64,
+ MI->getOperand(MI->getNumOperands()-1).getImm(),
+ ShuffleMask);
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+ case X86::VPERMQYri:
+ case X86::VPERMPDYri:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ // FALL THROUGH.
+ case X86::VPERMQYmi:
+ case X86::VPERMPDYmi:
+ DecodeVPERMMask(MI->getOperand(MI->getNumOperands()-1).getImm(),
+ ShuffleMask);
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+ }
+
+
+ // If this was a shuffle operation, print the shuffle mask.
+ if (!ShuffleMask.empty()) {
+ if (DestName == 0) DestName = Src1Name;
+ OS << (DestName ? DestName : "mem") << " = ";
+
+ // If the two sources are the same, canonicalize the input elements to be
+ // from the first src so that we get larger element spans.
+ if (Src1Name == Src2Name) {
+ for (unsigned i = 0, e = ShuffleMask.size(); i != e; ++i) {
+ if ((int)ShuffleMask[i] >= 0 && // Not sentinel.
+ ShuffleMask[i] >= (int)e) // From second mask.
+ ShuffleMask[i] -= e;
+ }
+ }
+
+ // The shuffle mask specifies which elements of the src1/src2 fill in the
+ // destination, with a few sentinel values. Loop through and print them
+ // out.
+ for (unsigned i = 0, e = ShuffleMask.size(); i != e; ++i) {
+ if (i != 0)
+ OS << ',';
+ if (ShuffleMask[i] == SM_SentinelZero) {
+ OS << "zero";
+ continue;
+ }
+
+ // Otherwise, it must come from src1 or src2. Print the span of elements
+ // that comes from this src.
+ bool isSrc1 = ShuffleMask[i] < (int)ShuffleMask.size();
+ const char *SrcName = isSrc1 ? Src1Name : Src2Name;
+ OS << (SrcName ? SrcName : "mem") << '[';
+ bool IsFirst = true;
+ while (i != e &&
+ (int)ShuffleMask[i] >= 0 &&
+ (ShuffleMask[i] < (int)ShuffleMask.size()) == isSrc1) {
+ if (!IsFirst)
+ OS << ',';
+ else
+ IsFirst = false;
+ OS << ShuffleMask[i] % ShuffleMask.size();
+ ++i;
+ }
+ OS << ']';
+ --i; // For loop increments element #.
+ }
+ //MI->print(OS, 0);
+ OS << "\n";
+ }
+
+}
diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.h b/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.h
new file mode 100644
index 0000000..13fdf9a
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.h
@@ -0,0 +1,25 @@
+//=- X86InstComments.h - Generate verbose-asm comments for instrs -*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This defines functionality used to emit comments about X86 instructions to
+// an output stream for -fverbose-asm.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86_INST_COMMENTS_H
+#define X86_INST_COMMENTS_H
+
+namespace llvm {
+ class MCInst;
+ class raw_ostream;
+ void EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
+ const char *(*getRegName)(unsigned));
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp b/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
new file mode 100644
index 0000000..141f4a4
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
@@ -0,0 +1,208 @@
+//===-- X86IntelInstPrinter.cpp - Intel assembly instruction printing -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file includes code for rendering MCInst instances as Intel-style
+// assembly.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "asm-printer"
+#include "X86IntelInstPrinter.h"
+#include "MCTargetDesc/X86BaseInfo.h"
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "X86InstComments.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+#include <cctype>
+using namespace llvm;
+
+#include "X86GenAsmWriter1.inc"
+
+void X86IntelInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
+ OS << getRegisterName(RegNo);
+}
+
+void X86IntelInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
+ StringRef Annot) {
+ const MCInstrDesc &Desc = MII.get(MI->getOpcode());
+ uint64_t TSFlags = Desc.TSFlags;
+
+ if (TSFlags & X86II::LOCK)
+ OS << "\tlock\n";
+
+ printInstruction(MI, OS);
+
+ // Next always print the annotation.
+ printAnnotation(OS, Annot);
+
+ // If verbose assembly is enabled, we can print some informative comments.
+ if (CommentStream)
+ EmitAnyX86InstComments(MI, *CommentStream, getRegisterName);
+}
+
+void X86IntelInstPrinter::printSSECC(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ int64_t Imm = MI->getOperand(Op).getImm() & 0xf;
+ switch (Imm) {
+ default: llvm_unreachable("Invalid ssecc argument!");
+ case 0: O << "eq"; break;
+ case 1: O << "lt"; break;
+ case 2: O << "le"; break;
+ case 3: O << "unord"; break;
+ case 4: O << "neq"; break;
+ case 5: O << "nlt"; break;
+ case 6: O << "nle"; break;
+ case 7: O << "ord"; break;
+ case 8: O << "eq_uq"; break;
+ case 9: O << "nge"; break;
+ case 0xa: O << "ngt"; break;
+ case 0xb: O << "false"; break;
+ case 0xc: O << "neq_oq"; break;
+ case 0xd: O << "ge"; break;
+ case 0xe: O << "gt"; break;
+ case 0xf: O << "true"; break;
+ }
+}
+
+void X86IntelInstPrinter::printAVXCC(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ int64_t Imm = MI->getOperand(Op).getImm() & 0x1f;
+ switch (Imm) {
+ default: llvm_unreachable("Invalid avxcc argument!");
+ case 0: O << "eq"; break;
+ case 1: O << "lt"; break;
+ case 2: O << "le"; break;
+ case 3: O << "unord"; break;
+ case 4: O << "neq"; break;
+ case 5: O << "nlt"; break;
+ case 6: O << "nle"; break;
+ case 7: O << "ord"; break;
+ case 8: O << "eq_uq"; break;
+ case 9: O << "nge"; break;
+ case 0xa: O << "ngt"; break;
+ case 0xb: O << "false"; break;
+ case 0xc: O << "neq_oq"; break;
+ case 0xd: O << "ge"; break;
+ case 0xe: O << "gt"; break;
+ case 0xf: O << "true"; break;
+ case 0x10: O << "eq_os"; break;
+ case 0x11: O << "lt_oq"; break;
+ case 0x12: O << "le_oq"; break;
+ case 0x13: O << "unord_s"; break;
+ case 0x14: O << "neq_us"; break;
+ case 0x15: O << "nlt_uq"; break;
+ case 0x16: O << "nle_uq"; break;
+ case 0x17: O << "ord_s"; break;
+ case 0x18: O << "eq_us"; break;
+ case 0x19: O << "nge_uq"; break;
+ case 0x1a: O << "ngt_uq"; break;
+ case 0x1b: O << "false_os"; break;
+ case 0x1c: O << "neq_os"; break;
+ case 0x1d: O << "ge_oq"; break;
+ case 0x1e: O << "gt_oq"; break;
+ case 0x1f: O << "true_us"; break;
+ }
+}
+
+/// printPCRelImm - This is used to print an immediate value that ends up
+/// being encoded as a pc-relative value.
+void X86IntelInstPrinter::printPCRelImm(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ const MCOperand &Op = MI->getOperand(OpNo);
+ if (Op.isImm())
+ O << Op.getImm();
+ else {
+ assert(Op.isExpr() && "unknown pcrel immediate operand");
+ // If a symbolic branch target was added as a constant expression then print
+ // that address in hex.
+ const MCConstantExpr *BranchTarget = dyn_cast<MCConstantExpr>(Op.getExpr());
+ int64_t Address;
+ if (BranchTarget && BranchTarget->EvaluateAsAbsolute(Address)) {
+ O << "0x";
+ O.write_hex(Address);
+ }
+ else {
+ // Otherwise, just print the expression.
+ O << *Op.getExpr();
+ }
+ }
+}
+
+static void PrintRegName(raw_ostream &O, StringRef RegName) {
+ for (unsigned i = 0, e = RegName.size(); i != e; ++i)
+ O << (char)toupper(RegName[i]);
+}
+
+void X86IntelInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ const MCOperand &Op = MI->getOperand(OpNo);
+ if (Op.isReg()) {
+ PrintRegName(O, getRegisterName(Op.getReg()));
+ } else if (Op.isImm()) {
+ O << Op.getImm();
+ } else {
+ assert(Op.isExpr() && "unknown operand kind in printOperand");
+ O << *Op.getExpr();
+ }
+}
+
+void X86IntelInstPrinter::printMemReference(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ const MCOperand &BaseReg = MI->getOperand(Op);
+ unsigned ScaleVal = MI->getOperand(Op+1).getImm();
+ const MCOperand &IndexReg = MI->getOperand(Op+2);
+ const MCOperand &DispSpec = MI->getOperand(Op+3);
+ const MCOperand &SegReg = MI->getOperand(Op+4);
+
+ // If this has a segment register, print it.
+ if (SegReg.getReg()) {
+ printOperand(MI, Op+4, O);
+ O << ':';
+ }
+
+ O << '[';
+
+ bool NeedPlus = false;
+ if (BaseReg.getReg()) {
+ printOperand(MI, Op, O);
+ NeedPlus = true;
+ }
+
+ if (IndexReg.getReg()) {
+ if (NeedPlus) O << " + ";
+ if (ScaleVal != 1)
+ O << ScaleVal << '*';
+ printOperand(MI, Op+2, O);
+ NeedPlus = true;
+ }
+
+ if (!DispSpec.isImm()) {
+ if (NeedPlus) O << " + ";
+ assert(DispSpec.isExpr() && "non-immediate displacement for LEA?");
+ O << *DispSpec.getExpr();
+ } else {
+ int64_t DispVal = DispSpec.getImm();
+ if (DispVal || (!IndexReg.getReg() && !BaseReg.getReg())) {
+ if (NeedPlus) {
+ if (DispVal > 0)
+ O << " + ";
+ else {
+ O << " - ";
+ DispVal = -DispVal;
+ }
+ }
+ O << DispVal;
+ }
+ }
+
+ O << ']';
+}
diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h b/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h
new file mode 100644
index 0000000..bb769eb
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h
@@ -0,0 +1,96 @@
+//= X86IntelInstPrinter.h - Convert X86 MCInst to assembly syntax -*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an X86 MCInst to Intel style .s file syntax.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86_INTEL_INST_PRINTER_H
+#define X86_INTEL_INST_PRINTER_H
+
+#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace llvm {
+
+class MCOperand;
+
+class X86IntelInstPrinter : public MCInstPrinter {
+public:
+ X86IntelInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+ const MCRegisterInfo &MRI)
+ : MCInstPrinter(MAI, MII, MRI) {}
+
+ virtual void printRegName(raw_ostream &OS, unsigned RegNo) const;
+ virtual void printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot);
+
+ // Autogenerated by tblgen.
+ void printInstruction(const MCInst *MI, raw_ostream &O);
+ static const char *getRegisterName(unsigned RegNo);
+
+ void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printMemReference(const MCInst *MI, unsigned Op, raw_ostream &O);
+ void printSSECC(const MCInst *MI, unsigned Op, raw_ostream &O);
+ void printAVXCC(const MCInst *MI, unsigned Op, raw_ostream &O);
+ void printPCRelImm(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+
+ void printopaquemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "OPAQUE PTR ";
+ printMemReference(MI, OpNo, O);
+ }
+
+ void printi8mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "BYTE PTR ";
+ printMemReference(MI, OpNo, O);
+ }
+ void printi16mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "WORD PTR ";
+ printMemReference(MI, OpNo, O);
+ }
+ void printi32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "DWORD PTR ";
+ printMemReference(MI, OpNo, O);
+ }
+ void printi64mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "QWORD PTR ";
+ printMemReference(MI, OpNo, O);
+ }
+ void printi128mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "XMMWORD PTR ";
+ printMemReference(MI, OpNo, O);
+ }
+ void printi256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "YMMWORD PTR ";
+ printMemReference(MI, OpNo, O);
+ }
+ void printf32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "DWORD PTR ";
+ printMemReference(MI, OpNo, O);
+ }
+ void printf64mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "QWORD PTR ";
+ printMemReference(MI, OpNo, O);
+ }
+ void printf80mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "XWORD PTR ";
+ printMemReference(MI, OpNo, O);
+ }
+ void printf128mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "XMMWORD PTR ";
+ printMemReference(MI, OpNo, O);
+ }
+ void printf256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "YMMWORD PTR ";
+ printMemReference(MI, OpNo, O);
+ }
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
new file mode 100644
index 0000000..598ddee
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -0,0 +1,476 @@
+//===-- X86AsmBackend.cpp - X86 Assembler Backend -------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/X86BaseInfo.h"
+#include "MCTargetDesc/X86FixupKinds.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCMachObjectWriter.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCSectionCOFF.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/Object/MachOFormat.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+// Option to allow disabling arithmetic relaxation to workaround PR9807, which
+// is useful when running bitwise comparison experiments on Darwin. We should be
+// able to remove this once PR9807 is resolved.
+static cl::opt<bool>
+MCDisableArithRelaxation("mc-x86-disable-arith-relaxation",
+ cl::desc("Disable relaxation of arithmetic instruction for X86"));
+
+static unsigned getFixupKindLog2Size(unsigned Kind) {
+ switch (Kind) {
+ default: llvm_unreachable("invalid fixup kind!");
+ case FK_PCRel_1:
+ case FK_SecRel_1:
+ case FK_Data_1: return 0;
+ case FK_PCRel_2:
+ case FK_SecRel_2:
+ case FK_Data_2: return 1;
+ case FK_PCRel_4:
+ case X86::reloc_riprel_4byte:
+ case X86::reloc_riprel_4byte_movq_load:
+ case X86::reloc_signed_4byte:
+ case X86::reloc_global_offset_table:
+ case FK_SecRel_4:
+ case FK_Data_4: return 2;
+ case FK_PCRel_8:
+ case FK_SecRel_8:
+ case FK_Data_8: return 3;
+ }
+}
+
+namespace {
+
+class X86ELFObjectWriter : public MCELFObjectTargetWriter {
+public:
+ X86ELFObjectWriter(bool is64Bit, uint8_t OSABI, uint16_t EMachine,
+ bool HasRelocationAddend, bool foobar)
+ : MCELFObjectTargetWriter(is64Bit, OSABI, EMachine, HasRelocationAddend) {}
+};
+
+class X86AsmBackend : public MCAsmBackend {
+ StringRef CPU;
+public:
+ X86AsmBackend(const Target &T, StringRef _CPU)
+ : MCAsmBackend(), CPU(_CPU) {}
+
+ unsigned getNumFixupKinds() const {
+ return X86::NumTargetFixupKinds;
+ }
+
+ const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const {
+ const static MCFixupKindInfo Infos[X86::NumTargetFixupKinds] = {
+ { "reloc_riprel_4byte", 0, 4 * 8, MCFixupKindInfo::FKF_IsPCRel },
+ { "reloc_riprel_4byte_movq_load", 0, 4 * 8, MCFixupKindInfo::FKF_IsPCRel},
+ { "reloc_signed_4byte", 0, 4 * 8, 0},
+ { "reloc_global_offset_table", 0, 4 * 8, 0}
+ };
+
+ if (Kind < FirstTargetFixupKind)
+ return MCAsmBackend::getFixupKindInfo(Kind);
+
+ assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
+ "Invalid kind!");
+ return Infos[Kind - FirstTargetFixupKind];
+ }
+
+ void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
+ uint64_t Value) const {
+ unsigned Size = 1 << getFixupKindLog2Size(Fixup.getKind());
+
+ assert(Fixup.getOffset() + Size <= DataSize &&
+ "Invalid fixup offset!");
+
+ // Check that uppper bits are either all zeros or all ones.
+ // Specifically ignore overflow/underflow as long as the leakage is
+ // limited to the lower bits. This is to remain compatible with
+ // other assemblers.
+ assert(isIntN(Size * 8 + 1, Value) &&
+ "Value does not fit in the Fixup field");
+
+ for (unsigned i = 0; i != Size; ++i)
+ Data[Fixup.getOffset() + i] = uint8_t(Value >> (i * 8));
+ }
+
+ bool mayNeedRelaxation(const MCInst &Inst) const;
+
+ bool fixupNeedsRelaxation(const MCFixup &Fixup,
+ uint64_t Value,
+ const MCRelaxableFragment *DF,
+ const MCAsmLayout &Layout) const;
+
+ void relaxInstruction(const MCInst &Inst, MCInst &Res) const;
+
+ bool writeNopData(uint64_t Count, MCObjectWriter *OW) const;
+};
+} // end anonymous namespace
+
+static unsigned getRelaxedOpcodeBranch(unsigned Op) {
+ switch (Op) {
+ default:
+ return Op;
+
+ case X86::JAE_1: return X86::JAE_4;
+ case X86::JA_1: return X86::JA_4;
+ case X86::JBE_1: return X86::JBE_4;
+ case X86::JB_1: return X86::JB_4;
+ case X86::JE_1: return X86::JE_4;
+ case X86::JGE_1: return X86::JGE_4;
+ case X86::JG_1: return X86::JG_4;
+ case X86::JLE_1: return X86::JLE_4;
+ case X86::JL_1: return X86::JL_4;
+ case X86::JMP_1: return X86::JMP_4;
+ case X86::JNE_1: return X86::JNE_4;
+ case X86::JNO_1: return X86::JNO_4;
+ case X86::JNP_1: return X86::JNP_4;
+ case X86::JNS_1: return X86::JNS_4;
+ case X86::JO_1: return X86::JO_4;
+ case X86::JP_1: return X86::JP_4;
+ case X86::JS_1: return X86::JS_4;
+ }
+}
+
+static unsigned getRelaxedOpcodeArith(unsigned Op) {
+ switch (Op) {
+ default:
+ return Op;
+
+ // IMUL
+ case X86::IMUL16rri8: return X86::IMUL16rri;
+ case X86::IMUL16rmi8: return X86::IMUL16rmi;
+ case X86::IMUL32rri8: return X86::IMUL32rri;
+ case X86::IMUL32rmi8: return X86::IMUL32rmi;
+ case X86::IMUL64rri8: return X86::IMUL64rri32;
+ case X86::IMUL64rmi8: return X86::IMUL64rmi32;
+
+ // AND
+ case X86::AND16ri8: return X86::AND16ri;
+ case X86::AND16mi8: return X86::AND16mi;
+ case X86::AND32ri8: return X86::AND32ri;
+ case X86::AND32mi8: return X86::AND32mi;
+ case X86::AND64ri8: return X86::AND64ri32;
+ case X86::AND64mi8: return X86::AND64mi32;
+
+ // OR
+ case X86::OR16ri8: return X86::OR16ri;
+ case X86::OR16mi8: return X86::OR16mi;
+ case X86::OR32ri8: return X86::OR32ri;
+ case X86::OR32mi8: return X86::OR32mi;
+ case X86::OR64ri8: return X86::OR64ri32;
+ case X86::OR64mi8: return X86::OR64mi32;
+
+ // XOR
+ case X86::XOR16ri8: return X86::XOR16ri;
+ case X86::XOR16mi8: return X86::XOR16mi;
+ case X86::XOR32ri8: return X86::XOR32ri;
+ case X86::XOR32mi8: return X86::XOR32mi;
+ case X86::XOR64ri8: return X86::XOR64ri32;
+ case X86::XOR64mi8: return X86::XOR64mi32;
+
+ // ADD
+ case X86::ADD16ri8: return X86::ADD16ri;
+ case X86::ADD16mi8: return X86::ADD16mi;
+ case X86::ADD32ri8: return X86::ADD32ri;
+ case X86::ADD32mi8: return X86::ADD32mi;
+ case X86::ADD64ri8: return X86::ADD64ri32;
+ case X86::ADD64mi8: return X86::ADD64mi32;
+
+ // SUB
+ case X86::SUB16ri8: return X86::SUB16ri;
+ case X86::SUB16mi8: return X86::SUB16mi;
+ case X86::SUB32ri8: return X86::SUB32ri;
+ case X86::SUB32mi8: return X86::SUB32mi;
+ case X86::SUB64ri8: return X86::SUB64ri32;
+ case X86::SUB64mi8: return X86::SUB64mi32;
+
+ // CMP
+ case X86::CMP16ri8: return X86::CMP16ri;
+ case X86::CMP16mi8: return X86::CMP16mi;
+ case X86::CMP32ri8: return X86::CMP32ri;
+ case X86::CMP32mi8: return X86::CMP32mi;
+ case X86::CMP64ri8: return X86::CMP64ri32;
+ case X86::CMP64mi8: return X86::CMP64mi32;
+
+ // PUSH
+ case X86::PUSHi8: return X86::PUSHi32;
+ case X86::PUSHi16: return X86::PUSHi32;
+ case X86::PUSH64i8: return X86::PUSH64i32;
+ case X86::PUSH64i16: return X86::PUSH64i32;
+ }
+}
+
+static unsigned getRelaxedOpcode(unsigned Op) {
+ unsigned R = getRelaxedOpcodeArith(Op);
+ if (R != Op)
+ return R;
+ return getRelaxedOpcodeBranch(Op);
+}
+
+bool X86AsmBackend::mayNeedRelaxation(const MCInst &Inst) const {
+ // Branches can always be relaxed.
+ if (getRelaxedOpcodeBranch(Inst.getOpcode()) != Inst.getOpcode())
+ return true;
+
+ if (MCDisableArithRelaxation)
+ return false;
+
+ // Check if this instruction is ever relaxable.
+ if (getRelaxedOpcodeArith(Inst.getOpcode()) == Inst.getOpcode())
+ return false;
+
+
+ // Check if it has an expression and is not RIP relative.
+ bool hasExp = false;
+ bool hasRIP = false;
+ for (unsigned i = 0; i < Inst.getNumOperands(); ++i) {
+ const MCOperand &Op = Inst.getOperand(i);
+ if (Op.isExpr())
+ hasExp = true;
+
+ if (Op.isReg() && Op.getReg() == X86::RIP)
+ hasRIP = true;
+ }
+
+ // FIXME: Why exactly do we need the !hasRIP? Is it just a limitation on
+ // how we do relaxations?
+ return hasExp && !hasRIP;
+}
+
+bool X86AsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup,
+ uint64_t Value,
+ const MCRelaxableFragment *DF,
+ const MCAsmLayout &Layout) const {
+ // Relax if the value is too big for a (signed) i8.
+ return int64_t(Value) != int64_t(int8_t(Value));
+}
+
+// FIXME: Can tblgen help at all here to verify there aren't other instructions
+// we can relax?
+void X86AsmBackend::relaxInstruction(const MCInst &Inst, MCInst &Res) const {
+ // The only relaxations X86 does is from a 1byte pcrel to a 4byte pcrel.
+ unsigned RelaxedOp = getRelaxedOpcode(Inst.getOpcode());
+
+ if (RelaxedOp == Inst.getOpcode()) {
+ SmallString<256> Tmp;
+ raw_svector_ostream OS(Tmp);
+ Inst.dump_pretty(OS);
+ OS << "\n";
+ report_fatal_error("unexpected instruction to relax: " + OS.str());
+ }
+
+ Res = Inst;
+ Res.setOpcode(RelaxedOp);
+}
+
+/// \brief Write a sequence of optimal nops to the output, covering \p Count
+/// bytes.
+/// \return - true on success, false on failure
+bool X86AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
+ static const uint8_t Nops[10][10] = {
+ // nop
+ {0x90},
+ // xchg %ax,%ax
+ {0x66, 0x90},
+ // nopl (%[re]ax)
+ {0x0f, 0x1f, 0x00},
+ // nopl 0(%[re]ax)
+ {0x0f, 0x1f, 0x40, 0x00},
+ // nopl 0(%[re]ax,%[re]ax,1)
+ {0x0f, 0x1f, 0x44, 0x00, 0x00},
+ // nopw 0(%[re]ax,%[re]ax,1)
+ {0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00},
+ // nopl 0L(%[re]ax)
+ {0x0f, 0x1f, 0x80, 0x00, 0x00, 0x00, 0x00},
+ // nopl 0L(%[re]ax,%[re]ax,1)
+ {0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
+ // nopw 0L(%[re]ax,%[re]ax,1)
+ {0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
+ // nopw %cs:0L(%[re]ax,%[re]ax,1)
+ {0x66, 0x2e, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
+ };
+
+ // This CPU doesnt support long nops. If needed add more.
+ // FIXME: Can we get this from the subtarget somehow?
+ if (CPU == "generic" || CPU == "i386" || CPU == "i486" || CPU == "i586" ||
+ CPU == "pentium" || CPU == "pentium-mmx" || CPU == "geode") {
+ for (uint64_t i = 0; i < Count; ++i)
+ OW->Write8(0x90);
+ return true;
+ }
+
+ // 15 is the longest single nop instruction. Emit as many 15-byte nops as
+ // needed, then emit a nop of the remaining length.
+ do {
+ const uint8_t ThisNopLength = (uint8_t) std::min(Count, (uint64_t) 15);
+ const uint8_t Prefixes = ThisNopLength <= 10 ? 0 : ThisNopLength - 10;
+ for (uint8_t i = 0; i < Prefixes; i++)
+ OW->Write8(0x66);
+ const uint8_t Rest = ThisNopLength - Prefixes;
+ for (uint8_t i = 0; i < Rest; i++)
+ OW->Write8(Nops[Rest - 1][i]);
+ Count -= ThisNopLength;
+ } while (Count != 0);
+
+ return true;
+}
+
+/* *** */
+
+namespace {
+class ELFX86AsmBackend : public X86AsmBackend {
+public:
+ uint8_t OSABI;
+ ELFX86AsmBackend(const Target &T, uint8_t _OSABI, StringRef CPU)
+ : X86AsmBackend(T, CPU), OSABI(_OSABI) {
+ HasReliableSymbolDifference = true;
+ }
+
+ virtual bool doesSectionRequireSymbols(const MCSection &Section) const {
+ const MCSectionELF &ES = static_cast<const MCSectionELF&>(Section);
+ return ES.getFlags() & ELF::SHF_MERGE;
+ }
+};
+
+class ELFX86_32AsmBackend : public ELFX86AsmBackend {
+public:
+ ELFX86_32AsmBackend(const Target &T, uint8_t OSABI, StringRef CPU)
+ : ELFX86AsmBackend(T, OSABI, CPU) {}
+
+ MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
+ return createX86ELFObjectWriter(OS, /*IsELF64*/ false, OSABI, ELF::EM_386);
+ }
+};
+
+class ELFX86_64AsmBackend : public ELFX86AsmBackend {
+public:
+ ELFX86_64AsmBackend(const Target &T, uint8_t OSABI, StringRef CPU)
+ : ELFX86AsmBackend(T, OSABI, CPU) {}
+
+ MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
+ return createX86ELFObjectWriter(OS, /*IsELF64*/ true, OSABI, ELF::EM_X86_64);
+ }
+};
+
+class WindowsX86AsmBackend : public X86AsmBackend {
+ bool Is64Bit;
+
+public:
+ WindowsX86AsmBackend(const Target &T, bool is64Bit, StringRef CPU)
+ : X86AsmBackend(T, CPU)
+ , Is64Bit(is64Bit) {
+ }
+
+ MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
+ return createX86WinCOFFObjectWriter(OS, Is64Bit);
+ }
+};
+
+class DarwinX86AsmBackend : public X86AsmBackend {
+public:
+ DarwinX86AsmBackend(const Target &T, StringRef CPU)
+ : X86AsmBackend(T, CPU) { }
+};
+
+class DarwinX86_32AsmBackend : public DarwinX86AsmBackend {
+public:
+ DarwinX86_32AsmBackend(const Target &T, StringRef CPU)
+ : DarwinX86AsmBackend(T, CPU) {}
+
+ MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
+ return createX86MachObjectWriter(OS, /*Is64Bit=*/false,
+ object::mach::CTM_i386,
+ object::mach::CSX86_ALL);
+ }
+};
+
+class DarwinX86_64AsmBackend : public DarwinX86AsmBackend {
+public:
+ DarwinX86_64AsmBackend(const Target &T, StringRef CPU)
+ : DarwinX86AsmBackend(T, CPU) {
+ HasReliableSymbolDifference = true;
+ }
+
+ MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
+ return createX86MachObjectWriter(OS, /*Is64Bit=*/true,
+ object::mach::CTM_x86_64,
+ object::mach::CSX86_ALL);
+ }
+
+ virtual bool doesSectionRequireSymbols(const MCSection &Section) const {
+ // Temporary labels in the string literals sections require symbols. The
+ // issue is that the x86_64 relocation format does not allow symbol +
+ // offset, and so the linker does not have enough information to resolve the
+ // access to the appropriate atom unless an external relocation is used. For
+ // non-cstring sections, we expect the compiler to use a non-temporary label
+ // for anything that could have an addend pointing outside the symbol.
+ //
+ // See <rdar://problem/4765733>.
+ const MCSectionMachO &SMO = static_cast<const MCSectionMachO&>(Section);
+ return SMO.getType() == MCSectionMachO::S_CSTRING_LITERALS;
+ }
+
+ virtual bool isSectionAtomizable(const MCSection &Section) const {
+ const MCSectionMachO &SMO = static_cast<const MCSectionMachO&>(Section);
+ // Fixed sized data sections are uniqued, they cannot be diced into atoms.
+ switch (SMO.getType()) {
+ default:
+ return true;
+
+ case MCSectionMachO::S_4BYTE_LITERALS:
+ case MCSectionMachO::S_8BYTE_LITERALS:
+ case MCSectionMachO::S_16BYTE_LITERALS:
+ case MCSectionMachO::S_LITERAL_POINTERS:
+ case MCSectionMachO::S_NON_LAZY_SYMBOL_POINTERS:
+ case MCSectionMachO::S_LAZY_SYMBOL_POINTERS:
+ case MCSectionMachO::S_MOD_INIT_FUNC_POINTERS:
+ case MCSectionMachO::S_MOD_TERM_FUNC_POINTERS:
+ case MCSectionMachO::S_INTERPOSING:
+ return false;
+ }
+ }
+};
+
+} // end anonymous namespace
+
+MCAsmBackend *llvm::createX86_32AsmBackend(const Target &T, StringRef TT, StringRef CPU) {
+ Triple TheTriple(TT);
+
+ if (TheTriple.isOSDarwin() || TheTriple.getEnvironment() == Triple::MachO)
+ return new DarwinX86_32AsmBackend(T, CPU);
+
+ if (TheTriple.isOSWindows() && TheTriple.getEnvironment() != Triple::ELF)
+ return new WindowsX86AsmBackend(T, false, CPU);
+
+ uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS());
+ return new ELFX86_32AsmBackend(T, OSABI, CPU);
+}
+
+MCAsmBackend *llvm::createX86_64AsmBackend(const Target &T, StringRef TT, StringRef CPU) {
+ Triple TheTriple(TT);
+
+ if (TheTriple.isOSDarwin() || TheTriple.getEnvironment() == Triple::MachO)
+ return new DarwinX86_64AsmBackend(T, CPU);
+
+ if (TheTriple.isOSWindows() && TheTriple.getEnvironment() != Triple::ELF)
+ return new WindowsX86AsmBackend(T, true, CPU);
+
+ uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS());
+ return new ELFX86_64AsmBackend(T, OSABI, CPU);
+}
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
new file mode 100644
index 0000000..d8f7278
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
@@ -0,0 +1,643 @@
+//===-- X86BaseInfo.h - Top level definitions for X86 -------- --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains small standalone helper functions and enum definitions for
+// the X86 target useful for the compiler back-end and the MC libraries.
+// As such, it deliberately does not include references to LLVM core
+// code gen types, passes, etc..
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86BASEINFO_H
+#define X86BASEINFO_H
+
+#include "X86MCTargetDesc.h"
+#include "llvm/Support/DataTypes.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/MC/MCInstrInfo.h"
+
+namespace llvm {
+
+namespace X86 {
+ // Enums for memory operand decoding. Each memory operand is represented with
+ // a 5 operand sequence in the form:
+ // [BaseReg, ScaleAmt, IndexReg, Disp, Segment]
+ // These enums help decode this.
+ enum {
+ AddrBaseReg = 0,
+ AddrScaleAmt = 1,
+ AddrIndexReg = 2,
+ AddrDisp = 3,
+
+ /// AddrSegmentReg - The operand # of the segment in the memory operand.
+ AddrSegmentReg = 4,
+
+ /// AddrNumOperands - Total number of operands in a memory reference.
+ AddrNumOperands = 5
+ };
+} // end namespace X86;
+
+/// X86II - This namespace holds all of the target specific flags that
+/// instruction info tracks.
+///
+namespace X86II {
+ /// Target Operand Flag enum.
+ enum TOF {
+ //===------------------------------------------------------------------===//
+ // X86 Specific MachineOperand flags.
+
+ MO_NO_FLAG,
+
+ /// MO_GOT_ABSOLUTE_ADDRESS - On a symbol operand, this represents a
+ /// relocation of:
+ /// SYMBOL_LABEL + [. - PICBASELABEL]
+ MO_GOT_ABSOLUTE_ADDRESS,
+
+ /// MO_PIC_BASE_OFFSET - On a symbol operand this indicates that the
+ /// immediate should get the value of the symbol minus the PIC base label:
+ /// SYMBOL_LABEL - PICBASELABEL
+ MO_PIC_BASE_OFFSET,
+
+ /// MO_GOT - On a symbol operand this indicates that the immediate is the
+ /// offset to the GOT entry for the symbol name from the base of the GOT.
+ ///
+ /// See the X86-64 ELF ABI supplement for more details.
+ /// SYMBOL_LABEL @GOT
+ MO_GOT,
+
+ /// MO_GOTOFF - On a symbol operand this indicates that the immediate is
+ /// the offset to the location of the symbol name from the base of the GOT.
+ ///
+ /// See the X86-64 ELF ABI supplement for more details.
+ /// SYMBOL_LABEL @GOTOFF
+ MO_GOTOFF,
+
+ /// MO_GOTPCREL - On a symbol operand this indicates that the immediate is
+ /// offset to the GOT entry for the symbol name from the current code
+ /// location.
+ ///
+ /// See the X86-64 ELF ABI supplement for more details.
+ /// SYMBOL_LABEL @GOTPCREL
+ MO_GOTPCREL,
+
+ /// MO_PLT - On a symbol operand this indicates that the immediate is
+ /// offset to the PLT entry of symbol name from the current code location.
+ ///
+ /// See the X86-64 ELF ABI supplement for more details.
+ /// SYMBOL_LABEL @PLT
+ MO_PLT,
+
+ /// MO_TLSGD - On a symbol operand this indicates that the immediate is
+ /// the offset of the GOT entry with the TLS index structure that contains
+ /// the module number and variable offset for the symbol. Used in the
+ /// general dynamic TLS access model.
+ ///
+ /// See 'ELF Handling for Thread-Local Storage' for more details.
+ /// SYMBOL_LABEL @TLSGD
+ MO_TLSGD,
+
+ /// MO_TLSLD - On a symbol operand this indicates that the immediate is
+ /// the offset of the GOT entry with the TLS index for the module that
+ /// contains the symbol. When this index is passed to a call to
+ /// __tls_get_addr, the function will return the base address of the TLS
+ /// block for the symbol. Used in the x86-64 local dynamic TLS access model.
+ ///
+ /// See 'ELF Handling for Thread-Local Storage' for more details.
+ /// SYMBOL_LABEL @TLSLD
+ MO_TLSLD,
+
+ /// MO_TLSLDM - On a symbol operand this indicates that the immediate is
+ /// the offset of the GOT entry with the TLS index for the module that
+ /// contains the symbol. When this index is passed to a call to
+ /// ___tls_get_addr, the function will return the base address of the TLS
+ /// block for the symbol. Used in the IA32 local dynamic TLS access model.
+ ///
+ /// See 'ELF Handling for Thread-Local Storage' for more details.
+ /// SYMBOL_LABEL @TLSLDM
+ MO_TLSLDM,
+
+ /// MO_GOTTPOFF - On a symbol operand this indicates that the immediate is
+ /// the offset of the GOT entry with the thread-pointer offset for the
+ /// symbol. Used in the x86-64 initial exec TLS access model.
+ ///
+ /// See 'ELF Handling for Thread-Local Storage' for more details.
+ /// SYMBOL_LABEL @GOTTPOFF
+ MO_GOTTPOFF,
+
+ /// MO_INDNTPOFF - On a symbol operand this indicates that the immediate is
+ /// the absolute address of the GOT entry with the negative thread-pointer
+ /// offset for the symbol. Used in the non-PIC IA32 initial exec TLS access
+ /// model.
+ ///
+ /// See 'ELF Handling for Thread-Local Storage' for more details.
+ /// SYMBOL_LABEL @INDNTPOFF
+ MO_INDNTPOFF,
+
+ /// MO_TPOFF - On a symbol operand this indicates that the immediate is
+ /// the thread-pointer offset for the symbol. Used in the x86-64 local
+ /// exec TLS access model.
+ ///
+ /// See 'ELF Handling for Thread-Local Storage' for more details.
+ /// SYMBOL_LABEL @TPOFF
+ MO_TPOFF,
+
+ /// MO_DTPOFF - On a symbol operand this indicates that the immediate is
+ /// the offset of the GOT entry with the TLS offset of the symbol. Used
+ /// in the local dynamic TLS access model.
+ ///
+ /// See 'ELF Handling for Thread-Local Storage' for more details.
+ /// SYMBOL_LABEL @DTPOFF
+ MO_DTPOFF,
+
+ /// MO_NTPOFF - On a symbol operand this indicates that the immediate is
+ /// the negative thread-pointer offset for the symbol. Used in the IA32
+ /// local exec TLS access model.
+ ///
+ /// See 'ELF Handling for Thread-Local Storage' for more details.
+ /// SYMBOL_LABEL @NTPOFF
+ MO_NTPOFF,
+
+ /// MO_GOTNTPOFF - On a symbol operand this indicates that the immediate is
+ /// the offset of the GOT entry with the negative thread-pointer offset for
+ /// the symbol. Used in the PIC IA32 initial exec TLS access model.
+ ///
+ /// See 'ELF Handling for Thread-Local Storage' for more details.
+ /// SYMBOL_LABEL @GOTNTPOFF
+ MO_GOTNTPOFF,
+
+ /// MO_DLLIMPORT - On a symbol operand "FOO", this indicates that the
+ /// reference is actually to the "__imp_FOO" symbol. This is used for
+ /// dllimport linkage on windows.
+ MO_DLLIMPORT,
+
+ /// MO_DARWIN_STUB - On a symbol operand "FOO", this indicates that the
+ /// reference is actually to the "FOO$stub" symbol. This is used for calls
+ /// and jumps to external functions on Tiger and earlier.
+ MO_DARWIN_STUB,
+
+ /// MO_DARWIN_NONLAZY - On a symbol operand "FOO", this indicates that the
+ /// reference is actually to the "FOO$non_lazy_ptr" symbol, which is a
+ /// non-PIC-base-relative reference to a non-hidden dyld lazy pointer stub.
+ MO_DARWIN_NONLAZY,
+
+ /// MO_DARWIN_NONLAZY_PIC_BASE - On a symbol operand "FOO", this indicates
+ /// that the reference is actually to "FOO$non_lazy_ptr - PICBASE", which is
+ /// a PIC-base-relative reference to a non-hidden dyld lazy pointer stub.
+ MO_DARWIN_NONLAZY_PIC_BASE,
+
+ /// MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE - On a symbol operand "FOO", this
+ /// indicates that the reference is actually to "FOO$non_lazy_ptr -PICBASE",
+ /// which is a PIC-base-relative reference to a hidden dyld lazy pointer
+ /// stub.
+ MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE,
+
+ /// MO_TLVP - On a symbol operand this indicates that the immediate is
+ /// some TLS offset.
+ ///
+ /// This is the TLS offset for the Darwin TLS mechanism.
+ MO_TLVP,
+
+ /// MO_TLVP_PIC_BASE - On a symbol operand this indicates that the immediate
+ /// is some TLS offset from the picbase.
+ ///
+ /// This is the 32-bit TLS offset for Darwin TLS in PIC mode.
+ MO_TLVP_PIC_BASE,
+
+ /// MO_SECREL - On a symbol operand this indicates that the immediate is
+ /// the offset from beginning of section.
+ ///
+ /// This is the TLS offset for the COFF/Windows TLS mechanism.
+ MO_SECREL
+ };
+
+ enum {
+ //===------------------------------------------------------------------===//
+ // Instruction encodings. These are the standard/most common forms for X86
+ // instructions.
+ //
+
+ // PseudoFrm - This represents an instruction that is a pseudo instruction
+ // or one that has not been implemented yet. It is illegal to code generate
+ // it, but tolerated for intermediate implementation stages.
+ Pseudo = 0,
+
+ /// Raw - This form is for instructions that don't have any operands, so
+ /// they are just a fixed opcode value, like 'leave'.
+ RawFrm = 1,
+
+ /// AddRegFrm - This form is used for instructions like 'push r32' that have
+ /// their one register operand added to their opcode.
+ AddRegFrm = 2,
+
+ /// MRMDestReg - This form is used for instructions that use the Mod/RM byte
+ /// to specify a destination, which in this case is a register.
+ ///
+ MRMDestReg = 3,
+
+ /// MRMDestMem - This form is used for instructions that use the Mod/RM byte
+ /// to specify a destination, which in this case is memory.
+ ///
+ MRMDestMem = 4,
+
+ /// MRMSrcReg - This form is used for instructions that use the Mod/RM byte
+ /// to specify a source, which in this case is a register.
+ ///
+ MRMSrcReg = 5,
+
+ /// MRMSrcMem - This form is used for instructions that use the Mod/RM byte
+ /// to specify a source, which in this case is memory.
+ ///
+ MRMSrcMem = 6,
+
+ /// MRM[0-7][rm] - These forms are used to represent instructions that use
+ /// a Mod/RM byte, and use the middle field to hold extended opcode
+ /// information. In the intel manual these are represented as /0, /1, ...
+ ///
+
+ // First, instructions that operate on a register r/m operand...
+ MRM0r = 16, MRM1r = 17, MRM2r = 18, MRM3r = 19, // Format /0 /1 /2 /3
+ MRM4r = 20, MRM5r = 21, MRM6r = 22, MRM7r = 23, // Format /4 /5 /6 /7
+
+ // Next, instructions that operate on a memory r/m operand...
+ MRM0m = 24, MRM1m = 25, MRM2m = 26, MRM3m = 27, // Format /0 /1 /2 /3
+ MRM4m = 28, MRM5m = 29, MRM6m = 30, MRM7m = 31, // Format /4 /5 /6 /7
+
+ // MRMInitReg - This form is used for instructions whose source and
+ // destinations are the same register.
+ MRMInitReg = 32,
+
+ //// MRM_XX - A mod/rm byte of exactly 0xXX.
+ MRM_C1 = 33, MRM_C2 = 34, MRM_C3 = 35, MRM_C4 = 36,
+ MRM_C8 = 37, MRM_C9 = 38, MRM_CA = 39, MRM_CB = 40,
+ MRM_E8 = 41, MRM_F0 = 42, MRM_F8 = 45, MRM_F9 = 46,
+ MRM_D0 = 47, MRM_D1 = 48, MRM_D4 = 49, MRM_D5 = 50,
+ MRM_D6 = 51, MRM_D8 = 52, MRM_D9 = 53, MRM_DA = 54,
+ MRM_DB = 55, MRM_DC = 56, MRM_DD = 57, MRM_DE = 58,
+ MRM_DF = 59,
+
+ /// RawFrmImm8 - This is used for the ENTER instruction, which has two
+ /// immediates, the first of which is a 16-bit immediate (specified by
+ /// the imm encoding) and the second is a 8-bit fixed value.
+ RawFrmImm8 = 43,
+
+ /// RawFrmImm16 - This is used for CALL FAR instructions, which have two
+ /// immediates, the first of which is a 16 or 32-bit immediate (specified by
+ /// the imm encoding) and the second is a 16-bit fixed value. In the AMD
+ /// manual, this operand is described as pntr16:32 and pntr16:16
+ RawFrmImm16 = 44,
+
+ FormMask = 63,
+
+ //===------------------------------------------------------------------===//
+ // Actual flags...
+
+ // OpSize - Set if this instruction requires an operand size prefix (0x66),
+ // which most often indicates that the instruction operates on 16 bit data
+ // instead of 32 bit data.
+ OpSize = 1 << 6,
+
+ // AsSize - Set if this instruction requires an operand size prefix (0x67),
+ // which most often indicates that the instruction address 16 bit address
+ // instead of 32 bit address (or 32 bit address in 64 bit mode).
+ AdSize = 1 << 7,
+
+ //===------------------------------------------------------------------===//
+ // Op0Mask - There are several prefix bytes that are used to form two byte
+ // opcodes. These are currently 0x0F, 0xF3, and 0xD8-0xDF. This mask is
+ // used to obtain the setting of this field. If no bits in this field is
+ // set, there is no prefix byte for obtaining a multibyte opcode.
+ //
+ Op0Shift = 8,
+ Op0Mask = 0x1F << Op0Shift,
+
+ // TB - TwoByte - Set if this instruction has a two byte opcode, which
+ // starts with a 0x0F byte before the real opcode.
+ TB = 1 << Op0Shift,
+
+ // REP - The 0xF3 prefix byte indicating repetition of the following
+ // instruction.
+ REP = 2 << Op0Shift,
+
+ // D8-DF - These escape opcodes are used by the floating point unit. These
+ // values must remain sequential.
+ D8 = 3 << Op0Shift, D9 = 4 << Op0Shift,
+ DA = 5 << Op0Shift, DB = 6 << Op0Shift,
+ DC = 7 << Op0Shift, DD = 8 << Op0Shift,
+ DE = 9 << Op0Shift, DF = 10 << Op0Shift,
+
+ // XS, XD - These prefix codes are for single and double precision scalar
+ // floating point operations performed in the SSE registers.
+ XD = 11 << Op0Shift, XS = 12 << Op0Shift,
+
+ // T8, TA, A6, A7 - Prefix after the 0x0F prefix.
+ T8 = 13 << Op0Shift, TA = 14 << Op0Shift,
+ A6 = 15 << Op0Shift, A7 = 16 << Op0Shift,
+
+ // T8XD - Prefix before and after 0x0F. Combination of T8 and XD.
+ T8XD = 17 << Op0Shift,
+
+ // T8XS - Prefix before and after 0x0F. Combination of T8 and XS.
+ T8XS = 18 << Op0Shift,
+
+ // TAXD - Prefix before and after 0x0F. Combination of TA and XD.
+ TAXD = 19 << Op0Shift,
+
+ // XOP8 - Prefix to include use of imm byte.
+ XOP8 = 20 << Op0Shift,
+
+ // XOP9 - Prefix to exclude use of imm byte.
+ XOP9 = 21 << Op0Shift,
+
+ //===------------------------------------------------------------------===//
+ // REX_W - REX prefixes are instruction prefixes used in 64-bit mode.
+ // They are used to specify GPRs and SSE registers, 64-bit operand size,
+ // etc. We only cares about REX.W and REX.R bits and only the former is
+ // statically determined.
+ //
+ REXShift = Op0Shift + 5,
+ REX_W = 1 << REXShift,
+
+ //===------------------------------------------------------------------===//
+ // This three-bit field describes the size of an immediate operand. Zero is
+ // unused so that we can tell if we forgot to set a value.
+ ImmShift = REXShift + 1,
+ ImmMask = 7 << ImmShift,
+ Imm8 = 1 << ImmShift,
+ Imm8PCRel = 2 << ImmShift,
+ Imm16 = 3 << ImmShift,
+ Imm16PCRel = 4 << ImmShift,
+ Imm32 = 5 << ImmShift,
+ Imm32PCRel = 6 << ImmShift,
+ Imm64 = 7 << ImmShift,
+
+ //===------------------------------------------------------------------===//
+ // FP Instruction Classification... Zero is non-fp instruction.
+
+ // FPTypeMask - Mask for all of the FP types...
+ FPTypeShift = ImmShift + 3,
+ FPTypeMask = 7 << FPTypeShift,
+
+ // NotFP - The default, set for instructions that do not use FP registers.
+ NotFP = 0 << FPTypeShift,
+
+ // ZeroArgFP - 0 arg FP instruction which implicitly pushes ST(0), f.e. fld0
+ ZeroArgFP = 1 << FPTypeShift,
+
+ // OneArgFP - 1 arg FP instructions which implicitly read ST(0), such as fst
+ OneArgFP = 2 << FPTypeShift,
+
+ // OneArgFPRW - 1 arg FP instruction which implicitly read ST(0) and write a
+ // result back to ST(0). For example, fcos, fsqrt, etc.
+ //
+ OneArgFPRW = 3 << FPTypeShift,
+
+ // TwoArgFP - 2 arg FP instructions which implicitly read ST(0), and an
+ // explicit argument, storing the result to either ST(0) or the implicit
+ // argument. For example: fadd, fsub, fmul, etc...
+ TwoArgFP = 4 << FPTypeShift,
+
+ // CompareFP - 2 arg FP instructions which implicitly read ST(0) and an
+ // explicit argument, but have no destination. Example: fucom, fucomi, ...
+ CompareFP = 5 << FPTypeShift,
+
+ // CondMovFP - "2 operand" floating point conditional move instructions.
+ CondMovFP = 6 << FPTypeShift,
+
+ // SpecialFP - Special instruction forms. Dispatch by opcode explicitly.
+ SpecialFP = 7 << FPTypeShift,
+
+ // Lock prefix
+ LOCKShift = FPTypeShift + 3,
+ LOCK = 1 << LOCKShift,
+
+ // Segment override prefixes. Currently we just need ability to address
+ // stuff in gs and fs segments.
+ SegOvrShift = LOCKShift + 1,
+ SegOvrMask = 3 << SegOvrShift,
+ FS = 1 << SegOvrShift,
+ GS = 2 << SegOvrShift,
+
+ // Execution domain for SSE instructions in bits 23, 24.
+ // 0 in bits 23-24 means normal, non-SSE instruction.
+ SSEDomainShift = SegOvrShift + 2,
+
+ OpcodeShift = SSEDomainShift + 2,
+
+ //===------------------------------------------------------------------===//
+ /// VEX - The opcode prefix used by AVX instructions
+ VEXShift = OpcodeShift + 8,
+ VEX = 1U << 0,
+
+ /// VEX_W - Has a opcode specific functionality, but is used in the same
+ /// way as REX_W is for regular SSE instructions.
+ VEX_W = 1U << 1,
+
+ /// VEX_4V - Used to specify an additional AVX/SSE register. Several 2
+ /// address instructions in SSE are represented as 3 address ones in AVX
+ /// and the additional register is encoded in VEX_VVVV prefix.
+ VEX_4V = 1U << 2,
+
+ /// VEX_4VOp3 - Similar to VEX_4V, but used on instructions that encode
+ /// operand 3 with VEX.vvvv.
+ VEX_4VOp3 = 1U << 3,
+
+ /// VEX_I8IMM - Specifies that the last register used in a AVX instruction,
+ /// must be encoded in the i8 immediate field. This usually happens in
+ /// instructions with 4 operands.
+ VEX_I8IMM = 1U << 4,
+
+ /// VEX_L - Stands for a bit in the VEX opcode prefix meaning the current
+ /// instruction uses 256-bit wide registers. This is usually auto detected
+ /// if a VR256 register is used, but some AVX instructions also have this
+ /// field marked when using a f256 memory references.
+ VEX_L = 1U << 5,
+
+ // VEX_LIG - Specifies that this instruction ignores the L-bit in the VEX
+ // prefix. Usually used for scalar instructions. Needed by disassembler.
+ VEX_LIG = 1U << 6,
+
+ /// Has3DNow0F0FOpcode - This flag indicates that the instruction uses the
+ /// wacky 0x0F 0x0F prefix for 3DNow! instructions. The manual documents
+ /// this as having a 0x0F prefix with a 0x0F opcode, and each instruction
+ /// storing a classifier in the imm8 field. To simplify our implementation,
+ /// we handle this by storeing the classifier in the opcode field and using
+ /// this flag to indicate that the encoder should do the wacky 3DNow! thing.
+ Has3DNow0F0FOpcode = 1U << 7,
+
+ /// MemOp4 - Used to indicate swapping of operand 3 and 4 to be encoded in
+ /// ModRM or I8IMM. This is used for FMA4 and XOP instructions.
+ MemOp4 = 1U << 8,
+
+ /// XOP - Opcode prefix used by XOP instructions.
+ XOP = 1U << 9
+
+ };
+
+ // getBaseOpcodeFor - This function returns the "base" X86 opcode for the
+ // specified machine instruction.
+ //
+ inline unsigned char getBaseOpcodeFor(uint64_t TSFlags) {
+ return TSFlags >> X86II::OpcodeShift;
+ }
+
+ inline bool hasImm(uint64_t TSFlags) {
+ return (TSFlags & X86II::ImmMask) != 0;
+ }
+
+ /// getSizeOfImm - Decode the "size of immediate" field from the TSFlags field
+ /// of the specified instruction.
+ inline unsigned getSizeOfImm(uint64_t TSFlags) {
+ switch (TSFlags & X86II::ImmMask) {
+ default: llvm_unreachable("Unknown immediate size");
+ case X86II::Imm8:
+ case X86II::Imm8PCRel: return 1;
+ case X86II::Imm16:
+ case X86II::Imm16PCRel: return 2;
+ case X86II::Imm32:
+ case X86II::Imm32PCRel: return 4;
+ case X86II::Imm64: return 8;
+ }
+ }
+
+ /// isImmPCRel - Return true if the immediate of the specified instruction's
+ /// TSFlags indicates that it is pc relative.
+ inline unsigned isImmPCRel(uint64_t TSFlags) {
+ switch (TSFlags & X86II::ImmMask) {
+ default: llvm_unreachable("Unknown immediate size");
+ case X86II::Imm8PCRel:
+ case X86II::Imm16PCRel:
+ case X86II::Imm32PCRel:
+ return true;
+ case X86II::Imm8:
+ case X86II::Imm16:
+ case X86II::Imm32:
+ case X86II::Imm64:
+ return false;
+ }
+ }
+
+ /// getOperandBias - compute any additional adjustment needed to
+ /// the offset to the start of the memory operand
+ /// in this instruction.
+ /// If this is a two-address instruction,skip one of the register operands.
+ /// FIXME: This should be handled during MCInst lowering.
+ inline int getOperandBias(const MCInstrDesc& Desc)
+ {
+ unsigned NumOps = Desc.getNumOperands();
+ unsigned CurOp = 0;
+ if (NumOps > 1 && Desc.getOperandConstraint(1, MCOI::TIED_TO) == 0)
+ ++CurOp;
+ else if (NumOps > 3 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0) {
+ assert(Desc.getOperandConstraint(NumOps - 1, MCOI::TIED_TO) == 1);
+ // Special case for GATHER with 2 TIED_TO operands
+ // Skip the first 2 operands: dst, mask_wb
+ CurOp += 2;
+ }
+ return CurOp;
+ }
+
+ /// getMemoryOperandNo - The function returns the MCInst operand # for the
+ /// first field of the memory operand. If the instruction doesn't have a
+ /// memory operand, this returns -1.
+ ///
+ /// Note that this ignores tied operands. If there is a tied register which
+ /// is duplicated in the MCInst (e.g. "EAX = addl EAX, [mem]") it is only
+ /// counted as one operand.
+ ///
+ inline int getMemoryOperandNo(uint64_t TSFlags, unsigned Opcode) {
+ switch (TSFlags & X86II::FormMask) {
+ case X86II::MRMInitReg:
+ // FIXME: Remove this form.
+ return -1;
+ default: llvm_unreachable("Unknown FormMask value in getMemoryOperandNo!");
+ case X86II::Pseudo:
+ case X86II::RawFrm:
+ case X86II::AddRegFrm:
+ case X86II::MRMDestReg:
+ case X86II::MRMSrcReg:
+ case X86II::RawFrmImm8:
+ case X86II::RawFrmImm16:
+ return -1;
+ case X86II::MRMDestMem:
+ return 0;
+ case X86II::MRMSrcMem: {
+ bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V;
+ bool HasMemOp4 = (TSFlags >> X86II::VEXShift) & X86II::MemOp4;
+ unsigned FirstMemOp = 1;
+ if (HasVEX_4V)
+ ++FirstMemOp;// Skip the register source (which is encoded in VEX_VVVV).
+ if (HasMemOp4)
+ ++FirstMemOp;// Skip the register source (which is encoded in I8IMM).
+
+ // FIXME: Maybe lea should have its own form? This is a horrible hack.
+ //if (Opcode == X86::LEA64r || Opcode == X86::LEA64_32r ||
+ // Opcode == X86::LEA16r || Opcode == X86::LEA32r)
+ return FirstMemOp;
+ }
+ case X86II::MRM0r: case X86II::MRM1r:
+ case X86II::MRM2r: case X86II::MRM3r:
+ case X86II::MRM4r: case X86II::MRM5r:
+ case X86II::MRM6r: case X86II::MRM7r:
+ return -1;
+ case X86II::MRM0m: case X86II::MRM1m:
+ case X86II::MRM2m: case X86II::MRM3m:
+ case X86II::MRM4m: case X86II::MRM5m:
+ case X86II::MRM6m: case X86II::MRM7m: {
+ bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V;
+ unsigned FirstMemOp = 0;
+ if (HasVEX_4V)
+ ++FirstMemOp;// Skip the register dest (which is encoded in VEX_VVVV).
+ return FirstMemOp;
+ }
+ case X86II::MRM_C1: case X86II::MRM_C2: case X86II::MRM_C3:
+ case X86II::MRM_C4: case X86II::MRM_C8: case X86II::MRM_C9:
+ case X86II::MRM_CA: case X86II::MRM_CB: case X86II::MRM_E8:
+ case X86II::MRM_F0: case X86II::MRM_F8: case X86II::MRM_F9:
+ case X86II::MRM_D0: case X86II::MRM_D1: case X86II::MRM_D4:
+ case X86II::MRM_D5: case X86II::MRM_D6: case X86II::MRM_D8:
+ case X86II::MRM_D9: case X86II::MRM_DA: case X86II::MRM_DB:
+ case X86II::MRM_DC: case X86II::MRM_DD: case X86II::MRM_DE:
+ case X86II::MRM_DF:
+ return -1;
+ }
+ }
+
+ /// isX86_64ExtendedReg - Is the MachineOperand a x86-64 extended (r8 or
+ /// higher) register? e.g. r8, xmm8, xmm13, etc.
+ inline bool isX86_64ExtendedReg(unsigned RegNo) {
+ switch (RegNo) {
+ default: break;
+ case X86::R8: case X86::R9: case X86::R10: case X86::R11:
+ case X86::R12: case X86::R13: case X86::R14: case X86::R15:
+ case X86::R8D: case X86::R9D: case X86::R10D: case X86::R11D:
+ case X86::R12D: case X86::R13D: case X86::R14D: case X86::R15D:
+ case X86::R8W: case X86::R9W: case X86::R10W: case X86::R11W:
+ case X86::R12W: case X86::R13W: case X86::R14W: case X86::R15W:
+ case X86::R8B: case X86::R9B: case X86::R10B: case X86::R11B:
+ case X86::R12B: case X86::R13B: case X86::R14B: case X86::R15B:
+ case X86::XMM8: case X86::XMM9: case X86::XMM10: case X86::XMM11:
+ case X86::XMM12: case X86::XMM13: case X86::XMM14: case X86::XMM15:
+ case X86::YMM8: case X86::YMM9: case X86::YMM10: case X86::YMM11:
+ case X86::YMM12: case X86::YMM13: case X86::YMM14: case X86::YMM15:
+ case X86::CR8: case X86::CR9: case X86::CR10: case X86::CR11:
+ case X86::CR12: case X86::CR13: case X86::CR14: case X86::CR15:
+ return true;
+ }
+ return false;
+ }
+
+ inline bool isX86_64NonExtLowByteReg(unsigned reg) {
+ return (reg == X86::SPL || reg == X86::BPL ||
+ reg == X86::SIL || reg == X86::DIL);
+ }
+}
+
+} // end namespace llvm;
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
new file mode 100644
index 0000000..de80dd8
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
@@ -0,0 +1,227 @@
+//===-- X86ELFObjectWriter.cpp - X86 ELF Writer ---------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/X86FixupKinds.h"
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+
+namespace {
+ class X86ELFObjectWriter : public MCELFObjectTargetWriter {
+ public:
+ X86ELFObjectWriter(bool IsELF64, uint8_t OSABI, uint16_t EMachine);
+
+ virtual ~X86ELFObjectWriter();
+ protected:
+ virtual unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup,
+ bool IsPCRel, bool IsRelocWithSymbol,
+ int64_t Addend) const;
+ };
+}
+
+X86ELFObjectWriter::X86ELFObjectWriter(bool IsELF64, uint8_t OSABI,
+ uint16_t EMachine)
+ : MCELFObjectTargetWriter(IsELF64, OSABI, EMachine,
+ // Only i386 uses Rel instead of RelA.
+ /*HasRelocationAddend*/ EMachine != ELF::EM_386) {}
+
+X86ELFObjectWriter::~X86ELFObjectWriter()
+{}
+
+unsigned X86ELFObjectWriter::GetRelocType(const MCValue &Target,
+ const MCFixup &Fixup,
+ bool IsPCRel,
+ bool IsRelocWithSymbol,
+ int64_t Addend) const {
+ // determine the type of the relocation
+
+ MCSymbolRefExpr::VariantKind Modifier = Target.isAbsolute() ?
+ MCSymbolRefExpr::VK_None : Target.getSymA()->getKind();
+ unsigned Type;
+ if (getEMachine() == ELF::EM_X86_64) {
+ if (IsPCRel) {
+ switch ((unsigned)Fixup.getKind()) {
+ default: llvm_unreachable("invalid fixup kind!");
+
+ case FK_Data_8: Type = ELF::R_X86_64_PC64; break;
+ case FK_Data_4: Type = ELF::R_X86_64_PC32; break;
+ case FK_Data_2: Type = ELF::R_X86_64_PC16; break;
+
+ case FK_PCRel_8:
+ assert(Modifier == MCSymbolRefExpr::VK_None);
+ Type = ELF::R_X86_64_PC64;
+ break;
+ case X86::reloc_signed_4byte:
+ case X86::reloc_riprel_4byte_movq_load:
+ case X86::reloc_riprel_4byte:
+ case FK_PCRel_4:
+ switch (Modifier) {
+ default:
+ llvm_unreachable("Unimplemented");
+ case MCSymbolRefExpr::VK_None:
+ Type = ELF::R_X86_64_PC32;
+ break;
+ case MCSymbolRefExpr::VK_PLT:
+ Type = ELF::R_X86_64_PLT32;
+ break;
+ case MCSymbolRefExpr::VK_GOTPCREL:
+ Type = ELF::R_X86_64_GOTPCREL;
+ break;
+ case MCSymbolRefExpr::VK_GOTTPOFF:
+ Type = ELF::R_X86_64_GOTTPOFF;
+ break;
+ case MCSymbolRefExpr::VK_TLSGD:
+ Type = ELF::R_X86_64_TLSGD;
+ break;
+ case MCSymbolRefExpr::VK_TLSLD:
+ Type = ELF::R_X86_64_TLSLD;
+ break;
+ }
+ break;
+ case FK_PCRel_2:
+ assert(Modifier == MCSymbolRefExpr::VK_None);
+ Type = ELF::R_X86_64_PC16;
+ break;
+ case FK_PCRel_1:
+ assert(Modifier == MCSymbolRefExpr::VK_None);
+ Type = ELF::R_X86_64_PC8;
+ break;
+ }
+ } else {
+ switch ((unsigned)Fixup.getKind()) {
+ default: llvm_unreachable("invalid fixup kind!");
+ case FK_Data_8: Type = ELF::R_X86_64_64; break;
+ case X86::reloc_signed_4byte:
+ switch (Modifier) {
+ default:
+ llvm_unreachable("Unimplemented");
+ case MCSymbolRefExpr::VK_None:
+ Type = ELF::R_X86_64_32S;
+ break;
+ case MCSymbolRefExpr::VK_GOT:
+ Type = ELF::R_X86_64_GOT32;
+ break;
+ case MCSymbolRefExpr::VK_GOTPCREL:
+ Type = ELF::R_X86_64_GOTPCREL;
+ break;
+ case MCSymbolRefExpr::VK_TPOFF:
+ Type = ELF::R_X86_64_TPOFF32;
+ break;
+ case MCSymbolRefExpr::VK_DTPOFF:
+ Type = ELF::R_X86_64_DTPOFF32;
+ break;
+ }
+ break;
+ case FK_Data_4:
+ Type = ELF::R_X86_64_32;
+ break;
+ case FK_Data_2: Type = ELF::R_X86_64_16; break;
+ case FK_PCRel_1:
+ case FK_Data_1: Type = ELF::R_X86_64_8; break;
+ }
+ }
+ } else if (getEMachine() == ELF::EM_386) {
+ if (IsPCRel) {
+ switch ((unsigned)Fixup.getKind()) {
+ default: llvm_unreachable("invalid fixup kind!");
+
+ case X86::reloc_global_offset_table:
+ Type = ELF::R_386_GOTPC;
+ break;
+
+ case X86::reloc_signed_4byte:
+ case FK_PCRel_4:
+ case FK_Data_4:
+ switch (Modifier) {
+ default:
+ llvm_unreachable("Unimplemented");
+ case MCSymbolRefExpr::VK_None:
+ Type = ELF::R_386_PC32;
+ break;
+ case MCSymbolRefExpr::VK_PLT:
+ Type = ELF::R_386_PLT32;
+ break;
+ }
+ break;
+ }
+ } else {
+ switch ((unsigned)Fixup.getKind()) {
+ default: llvm_unreachable("invalid fixup kind!");
+
+ case X86::reloc_global_offset_table:
+ Type = ELF::R_386_GOTPC;
+ break;
+
+ // FIXME: Should we avoid selecting reloc_signed_4byte in 32 bit mode
+ // instead?
+ case X86::reloc_signed_4byte:
+ case FK_PCRel_4:
+ case FK_Data_4:
+ switch (Modifier) {
+ default:
+ llvm_unreachable("Unimplemented");
+ case MCSymbolRefExpr::VK_None:
+ Type = ELF::R_386_32;
+ break;
+ case MCSymbolRefExpr::VK_GOT:
+ Type = ELF::R_386_GOT32;
+ break;
+ case MCSymbolRefExpr::VK_GOTOFF:
+ Type = ELF::R_386_GOTOFF;
+ break;
+ case MCSymbolRefExpr::VK_TLSGD:
+ Type = ELF::R_386_TLS_GD;
+ break;
+ case MCSymbolRefExpr::VK_TPOFF:
+ Type = ELF::R_386_TLS_LE_32;
+ break;
+ case MCSymbolRefExpr::VK_INDNTPOFF:
+ Type = ELF::R_386_TLS_IE;
+ break;
+ case MCSymbolRefExpr::VK_NTPOFF:
+ Type = ELF::R_386_TLS_LE;
+ break;
+ case MCSymbolRefExpr::VK_GOTNTPOFF:
+ Type = ELF::R_386_TLS_GOTIE;
+ break;
+ case MCSymbolRefExpr::VK_TLSLDM:
+ Type = ELF::R_386_TLS_LDM;
+ break;
+ case MCSymbolRefExpr::VK_DTPOFF:
+ Type = ELF::R_386_TLS_LDO_32;
+ break;
+ case MCSymbolRefExpr::VK_GOTTPOFF:
+ Type = ELF::R_386_TLS_IE_32;
+ break;
+ }
+ break;
+ case FK_Data_2: Type = ELF::R_386_16; break;
+ case FK_PCRel_1:
+ case FK_Data_1: Type = ELF::R_386_8; break;
+ }
+ }
+ } else
+ llvm_unreachable("Unsupported ELF machine type.");
+
+ return Type;
+}
+
+MCObjectWriter *llvm::createX86ELFObjectWriter(raw_ostream &OS,
+ bool IsELF64,
+ uint8_t OSABI,
+ uint16_t EMachine) {
+ MCELFObjectTargetWriter *MOTW =
+ new X86ELFObjectWriter(IsELF64, OSABI, EMachine);
+ return createELFObjectWriter(MOTW, OS, /*IsLittleEndian=*/true);
+}
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86FixupKinds.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86FixupKinds.h
new file mode 100644
index 0000000..f2e34cb
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86FixupKinds.h
@@ -0,0 +1,33 @@
+//===-- X86FixupKinds.h - X86 Specific Fixup Entries ------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_X86_X86FIXUPKINDS_H
+#define LLVM_X86_X86FIXUPKINDS_H
+
+#include "llvm/MC/MCFixup.h"
+
+namespace llvm {
+namespace X86 {
+enum Fixups {
+ reloc_riprel_4byte = FirstTargetFixupKind, // 32-bit rip-relative
+ reloc_riprel_4byte_movq_load, // 32-bit rip-relative in movq
+ reloc_signed_4byte, // 32-bit signed. Unlike FK_Data_4
+ // this will be sign extended at
+ // runtime.
+ reloc_global_offset_table, // 32-bit, relative to the start
+ // of the instruction. Used only
+ // for _GLOBAL_OFFSET_TABLE_.
+ // Marker
+ LastTargetFixupKind,
+ NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
+};
+}
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
new file mode 100644
index 0000000..7815ae9
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
@@ -0,0 +1,158 @@
+//===-- X86MCAsmInfo.cpp - X86 asm properties -----------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the X86MCAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86MCAsmInfo.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ELF.h"
+using namespace llvm;
+
+enum AsmWriterFlavorTy {
+ // Note: This numbering has to match the GCC assembler dialects for inline
+ // asm alternatives to work right.
+ ATT = 0, Intel = 1
+};
+
+static cl::opt<AsmWriterFlavorTy>
+AsmWriterFlavor("x86-asm-syntax", cl::init(ATT),
+ cl::desc("Choose style of code to emit from X86 backend:"),
+ cl::values(clEnumValN(ATT, "att", "Emit AT&T-style assembly"),
+ clEnumValN(Intel, "intel", "Emit Intel-style assembly"),
+ clEnumValEnd));
+
+static cl::opt<bool>
+MarkedJTDataRegions("mark-data-regions", cl::init(false),
+ cl::desc("Mark code section jump table data regions."),
+ cl::Hidden);
+
+void X86MCAsmInfoDarwin::anchor() { }
+
+X86MCAsmInfoDarwin::X86MCAsmInfoDarwin(const Triple &T) {
+ bool is64Bit = T.getArch() == Triple::x86_64;
+ if (is64Bit)
+ PointerSize = CalleeSaveStackSlotSize = 8;
+
+ AssemblerDialect = AsmWriterFlavor;
+
+ TextAlignFillValue = 0x90;
+
+ if (!is64Bit)
+ Data64bitsDirective = 0; // we can't emit a 64-bit unit
+
+ // Use ## as a comment string so that .s files generated by llvm can go
+ // through the GCC preprocessor without causing an error. This is needed
+ // because "clang foo.s" runs the C preprocessor, which is usually reserved
+ // for .S files on other systems. Perhaps this is because the file system
+ // wasn't always case preserving or something.
+ CommentString = "##";
+ PCSymbol = ".";
+
+ SupportsDebugInformation = true;
+ DwarfUsesInlineInfoSection = true;
+ UseDataRegionDirectives = MarkedJTDataRegions;
+
+ // Exceptions handling
+ ExceptionsType = ExceptionHandling::DwarfCFI;
+}
+
+X86_64MCAsmInfoDarwin::X86_64MCAsmInfoDarwin(const Triple &Triple)
+ : X86MCAsmInfoDarwin(Triple) {
+}
+
+void X86ELFMCAsmInfo::anchor() { }
+
+X86ELFMCAsmInfo::X86ELFMCAsmInfo(const Triple &T) {
+ bool is64Bit = T.getArch() == Triple::x86_64;
+ bool isX32 = T.getEnvironment() == Triple::GNUX32;
+
+ // For ELF, x86-64 pointer size depends on the ABI.
+ // For x86-64 without the x32 ABI, pointer size is 8. For x86 and for x86-64
+ // with the x32 ABI, pointer size remains the default 4.
+ PointerSize = (is64Bit && !isX32) ? 8 : 4;
+
+ // OTOH, stack slot size is always 8 for x86-64, even with the x32 ABI.
+ CalleeSaveStackSlotSize = is64Bit ? 8 : 4;
+
+ AssemblerDialect = AsmWriterFlavor;
+
+ TextAlignFillValue = 0x90;
+
+ PrivateGlobalPrefix = ".L";
+ WeakRefDirective = "\t.weak\t";
+ PCSymbol = ".";
+
+ // Set up DWARF directives
+ HasLEB128 = true; // Target asm supports leb128 directives (little-endian)
+
+ // Debug Information
+ SupportsDebugInformation = true;
+
+ // Exceptions handling
+ ExceptionsType = ExceptionHandling::DwarfCFI;
+
+ // OpenBSD and Bitrig have buggy support for .quad in 32-bit mode, just split
+ // into two .words.
+ if ((T.getOS() == Triple::OpenBSD || T.getOS() == Triple::Bitrig) &&
+ T.getArch() == Triple::x86)
+ Data64bitsDirective = 0;
+}
+
+const MCExpr *
+X86_64MCAsmInfoDarwin::getExprForPersonalitySymbol(const MCSymbol *Sym,
+ unsigned Encoding,
+ MCStreamer &Streamer) const {
+ MCContext &Context = Streamer.getContext();
+ const MCExpr *Res =
+ MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOTPCREL, Context);
+ const MCExpr *Four = MCConstantExpr::Create(4, Context);
+ return MCBinaryExpr::CreateAdd(Res, Four, Context);
+}
+
+const MCSection *X86ELFMCAsmInfo::
+getNonexecutableStackSection(MCContext &Ctx) const {
+ return Ctx.getELFSection(".note.GNU-stack", ELF::SHT_PROGBITS,
+ 0, SectionKind::getMetadata());
+}
+
+void X86MCAsmInfoMicrosoft::anchor() { }
+
+X86MCAsmInfoMicrosoft::X86MCAsmInfoMicrosoft(const Triple &Triple) {
+ if (Triple.getArch() == Triple::x86_64) {
+ GlobalPrefix = "";
+ PrivateGlobalPrefix = ".L";
+ }
+
+ AssemblerDialect = AsmWriterFlavor;
+
+ TextAlignFillValue = 0x90;
+}
+
+void X86MCAsmInfoGNUCOFF::anchor() { }
+
+X86MCAsmInfoGNUCOFF::X86MCAsmInfoGNUCOFF(const Triple &Triple) {
+ if (Triple.getArch() == Triple::x86_64) {
+ GlobalPrefix = "";
+ PrivateGlobalPrefix = ".L";
+ }
+
+ AssemblerDialect = AsmWriterFlavor;
+
+ TextAlignFillValue = 0x90;
+
+ // Exceptions handling
+ ExceptionsType = ExceptionHandling::DwarfCFI;
+}
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h
new file mode 100644
index 0000000..b6b70fd
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h
@@ -0,0 +1,58 @@
+//===-- X86MCAsmInfo.h - X86 asm properties --------------------*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the X86MCAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86TARGETASMINFO_H
+#define X86TARGETASMINFO_H
+
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCAsmInfoCOFF.h"
+#include "llvm/MC/MCAsmInfoDarwin.h"
+
+namespace llvm {
+ class Triple;
+
+ class X86MCAsmInfoDarwin : public MCAsmInfoDarwin {
+ virtual void anchor();
+ public:
+ explicit X86MCAsmInfoDarwin(const Triple &Triple);
+ };
+
+ struct X86_64MCAsmInfoDarwin : public X86MCAsmInfoDarwin {
+ explicit X86_64MCAsmInfoDarwin(const Triple &Triple);
+ virtual const MCExpr *
+ getExprForPersonalitySymbol(const MCSymbol *Sym,
+ unsigned Encoding,
+ MCStreamer &Streamer) const;
+ };
+
+ class X86ELFMCAsmInfo : public MCAsmInfo {
+ virtual void anchor();
+ public:
+ explicit X86ELFMCAsmInfo(const Triple &Triple);
+ virtual const MCSection *getNonexecutableStackSection(MCContext &Ctx) const;
+ };
+
+ class X86MCAsmInfoMicrosoft : public MCAsmInfoMicrosoft {
+ virtual void anchor();
+ public:
+ explicit X86MCAsmInfoMicrosoft(const Triple &Triple);
+ };
+
+ class X86MCAsmInfoGNUCOFF : public MCAsmInfoGNUCOFF {
+ virtual void anchor();
+ public:
+ explicit X86MCAsmInfoGNUCOFF(const Triple &Triple);
+ };
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
new file mode 100644
index 0000000..016af71
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
@@ -0,0 +1,1239 @@
+//===-- X86MCCodeEmitter.cpp - Convert X86 code to machine code -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the X86MCCodeEmitter class.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "mccodeemitter"
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "MCTargetDesc/X86BaseInfo.h"
+#include "MCTargetDesc/X86FixupKinds.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace {
+class X86MCCodeEmitter : public MCCodeEmitter {
+ X86MCCodeEmitter(const X86MCCodeEmitter &) LLVM_DELETED_FUNCTION;
+ void operator=(const X86MCCodeEmitter &) LLVM_DELETED_FUNCTION;
+ const MCInstrInfo &MCII;
+ const MCSubtargetInfo &STI;
+ MCContext &Ctx;
+public:
+ X86MCCodeEmitter(const MCInstrInfo &mcii, const MCSubtargetInfo &sti,
+ MCContext &ctx)
+ : MCII(mcii), STI(sti), Ctx(ctx) {
+ }
+
+ ~X86MCCodeEmitter() {}
+
+ bool is64BitMode() const {
+ // FIXME: Can tablegen auto-generate this?
+ return (STI.getFeatureBits() & X86::Mode64Bit) != 0;
+ }
+
+ bool is32BitMode() const {
+ // FIXME: Can tablegen auto-generate this?
+ return (STI.getFeatureBits() & X86::Mode64Bit) == 0;
+ }
+
+ unsigned GetX86RegNum(const MCOperand &MO) const {
+ return Ctx.getRegisterInfo().getEncodingValue(MO.getReg()) & 0x7;
+ }
+
+ // On regular x86, both XMM0-XMM7 and XMM8-XMM15 are encoded in the range
+ // 0-7 and the difference between the 2 groups is given by the REX prefix.
+ // In the VEX prefix, registers are seen sequencially from 0-15 and encoded
+ // in 1's complement form, example:
+ //
+ // ModRM field => XMM9 => 1
+ // VEX.VVVV => XMM9 => ~9
+ //
+ // See table 4-35 of Intel AVX Programming Reference for details.
+ unsigned char getVEXRegisterEncoding(const MCInst &MI,
+ unsigned OpNum) const {
+ unsigned SrcReg = MI.getOperand(OpNum).getReg();
+ unsigned SrcRegNum = GetX86RegNum(MI.getOperand(OpNum));
+ if (X86II::isX86_64ExtendedReg(SrcReg))
+ SrcRegNum |= 8;
+
+ // The registers represented through VEX_VVVV should
+ // be encoded in 1's complement form.
+ return (~SrcRegNum) & 0xf;
+ }
+
+ void EmitByte(unsigned char C, unsigned &CurByte, raw_ostream &OS) const {
+ OS << (char)C;
+ ++CurByte;
+ }
+
+ void EmitConstant(uint64_t Val, unsigned Size, unsigned &CurByte,
+ raw_ostream &OS) const {
+ // Output the constant in little endian byte order.
+ for (unsigned i = 0; i != Size; ++i) {
+ EmitByte(Val & 255, CurByte, OS);
+ Val >>= 8;
+ }
+ }
+
+ void EmitImmediate(const MCOperand &Disp, SMLoc Loc,
+ unsigned ImmSize, MCFixupKind FixupKind,
+ unsigned &CurByte, raw_ostream &OS,
+ SmallVectorImpl<MCFixup> &Fixups,
+ int ImmOffset = 0) const;
+
+ inline static unsigned char ModRMByte(unsigned Mod, unsigned RegOpcode,
+ unsigned RM) {
+ assert(Mod < 4 && RegOpcode < 8 && RM < 8 && "ModRM Fields out of range!");
+ return RM | (RegOpcode << 3) | (Mod << 6);
+ }
+
+ void EmitRegModRMByte(const MCOperand &ModRMReg, unsigned RegOpcodeFld,
+ unsigned &CurByte, raw_ostream &OS) const {
+ EmitByte(ModRMByte(3, RegOpcodeFld, GetX86RegNum(ModRMReg)), CurByte, OS);
+ }
+
+ void EmitSIBByte(unsigned SS, unsigned Index, unsigned Base,
+ unsigned &CurByte, raw_ostream &OS) const {
+ // SIB byte is in the same format as the ModRMByte.
+ EmitByte(ModRMByte(SS, Index, Base), CurByte, OS);
+ }
+
+
+ void EmitMemModRMByte(const MCInst &MI, unsigned Op,
+ unsigned RegOpcodeField,
+ uint64_t TSFlags, unsigned &CurByte, raw_ostream &OS,
+ SmallVectorImpl<MCFixup> &Fixups) const;
+
+ void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
+ SmallVectorImpl<MCFixup> &Fixups) const;
+
+ void EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, int MemOperand,
+ const MCInst &MI, const MCInstrDesc &Desc,
+ raw_ostream &OS) const;
+
+ void EmitSegmentOverridePrefix(uint64_t TSFlags, unsigned &CurByte,
+ int MemOperand, const MCInst &MI,
+ raw_ostream &OS) const;
+
+ void EmitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, int MemOperand,
+ const MCInst &MI, const MCInstrDesc &Desc,
+ raw_ostream &OS) const;
+};
+
+} // end anonymous namespace
+
+
+MCCodeEmitter *llvm::createX86MCCodeEmitter(const MCInstrInfo &MCII,
+ const MCRegisterInfo &MRI,
+ const MCSubtargetInfo &STI,
+ MCContext &Ctx) {
+ return new X86MCCodeEmitter(MCII, STI, Ctx);
+}
+
+/// isDisp8 - Return true if this signed displacement fits in a 8-bit
+/// sign-extended field.
+static bool isDisp8(int Value) {
+ return Value == (signed char)Value;
+}
+
+/// getImmFixupKind - Return the appropriate fixup kind to use for an immediate
+/// in an instruction with the specified TSFlags.
+static MCFixupKind getImmFixupKind(uint64_t TSFlags) {
+ unsigned Size = X86II::getSizeOfImm(TSFlags);
+ bool isPCRel = X86II::isImmPCRel(TSFlags);
+
+ return MCFixup::getKindForSize(Size, isPCRel);
+}
+
+/// Is32BitMemOperand - Return true if the specified instruction has
+/// a 32-bit memory operand. Op specifies the operand # of the memoperand.
+static bool Is32BitMemOperand(const MCInst &MI, unsigned Op) {
+ const MCOperand &BaseReg = MI.getOperand(Op+X86::AddrBaseReg);
+ const MCOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg);
+
+ if ((BaseReg.getReg() != 0 &&
+ X86MCRegisterClasses[X86::GR32RegClassID].contains(BaseReg.getReg())) ||
+ (IndexReg.getReg() != 0 &&
+ X86MCRegisterClasses[X86::GR32RegClassID].contains(IndexReg.getReg())))
+ return true;
+ return false;
+}
+
+/// Is64BitMemOperand - Return true if the specified instruction has
+/// a 64-bit memory operand. Op specifies the operand # of the memoperand.
+#ifndef NDEBUG
+static bool Is64BitMemOperand(const MCInst &MI, unsigned Op) {
+ const MCOperand &BaseReg = MI.getOperand(Op+X86::AddrBaseReg);
+ const MCOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg);
+
+ if ((BaseReg.getReg() != 0 &&
+ X86MCRegisterClasses[X86::GR64RegClassID].contains(BaseReg.getReg())) ||
+ (IndexReg.getReg() != 0 &&
+ X86MCRegisterClasses[X86::GR64RegClassID].contains(IndexReg.getReg())))
+ return true;
+ return false;
+}
+#endif
+
+/// Is16BitMemOperand - Return true if the specified instruction has
+/// a 16-bit memory operand. Op specifies the operand # of the memoperand.
+static bool Is16BitMemOperand(const MCInst &MI, unsigned Op) {
+ const MCOperand &BaseReg = MI.getOperand(Op+X86::AddrBaseReg);
+ const MCOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg);
+
+ if ((BaseReg.getReg() != 0 &&
+ X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg.getReg())) ||
+ (IndexReg.getReg() != 0 &&
+ X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg.getReg())))
+ return true;
+ return false;
+}
+
+/// StartsWithGlobalOffsetTable - Check if this expression starts with
+/// _GLOBAL_OFFSET_TABLE_ and if it is of the form
+/// _GLOBAL_OFFSET_TABLE_-symbol. This is needed to support PIC on ELF
+/// i386 as _GLOBAL_OFFSET_TABLE_ is magical. We check only simple case that
+/// are know to be used: _GLOBAL_OFFSET_TABLE_ by itself or at the start
+/// of a binary expression.
+enum GlobalOffsetTableExprKind {
+ GOT_None,
+ GOT_Normal,
+ GOT_SymDiff
+};
+static GlobalOffsetTableExprKind
+StartsWithGlobalOffsetTable(const MCExpr *Expr) {
+ const MCExpr *RHS = 0;
+ if (Expr->getKind() == MCExpr::Binary) {
+ const MCBinaryExpr *BE = static_cast<const MCBinaryExpr *>(Expr);
+ Expr = BE->getLHS();
+ RHS = BE->getRHS();
+ }
+
+ if (Expr->getKind() != MCExpr::SymbolRef)
+ return GOT_None;
+
+ const MCSymbolRefExpr *Ref = static_cast<const MCSymbolRefExpr*>(Expr);
+ const MCSymbol &S = Ref->getSymbol();
+ if (S.getName() != "_GLOBAL_OFFSET_TABLE_")
+ return GOT_None;
+ if (RHS && RHS->getKind() == MCExpr::SymbolRef)
+ return GOT_SymDiff;
+ return GOT_Normal;
+}
+
+static bool HasSecRelSymbolRef(const MCExpr *Expr) {
+ if (Expr->getKind() == MCExpr::SymbolRef) {
+ const MCSymbolRefExpr *Ref = static_cast<const MCSymbolRefExpr*>(Expr);
+ return Ref->getKind() == MCSymbolRefExpr::VK_SECREL;
+ }
+ return false;
+}
+
+void X86MCCodeEmitter::
+EmitImmediate(const MCOperand &DispOp, SMLoc Loc, unsigned Size,
+ MCFixupKind FixupKind, unsigned &CurByte, raw_ostream &OS,
+ SmallVectorImpl<MCFixup> &Fixups, int ImmOffset) const {
+ const MCExpr *Expr = NULL;
+ if (DispOp.isImm()) {
+ // If this is a simple integer displacement that doesn't require a
+ // relocation, emit it now.
+ if (FixupKind != FK_PCRel_1 &&
+ FixupKind != FK_PCRel_2 &&
+ FixupKind != FK_PCRel_4) {
+ EmitConstant(DispOp.getImm()+ImmOffset, Size, CurByte, OS);
+ return;
+ }
+ Expr = MCConstantExpr::Create(DispOp.getImm(), Ctx);
+ } else {
+ Expr = DispOp.getExpr();
+ }
+
+ // If we have an immoffset, add it to the expression.
+ if ((FixupKind == FK_Data_4 ||
+ FixupKind == FK_Data_8 ||
+ FixupKind == MCFixupKind(X86::reloc_signed_4byte))) {
+ GlobalOffsetTableExprKind Kind = StartsWithGlobalOffsetTable(Expr);
+ if (Kind != GOT_None) {
+ assert(ImmOffset == 0);
+
+ FixupKind = MCFixupKind(X86::reloc_global_offset_table);
+ if (Kind == GOT_Normal)
+ ImmOffset = CurByte;
+ } else if (Expr->getKind() == MCExpr::SymbolRef) {
+ if (HasSecRelSymbolRef(Expr)) {
+ FixupKind = MCFixupKind(FK_SecRel_4);
+ }
+ } else if (Expr->getKind() == MCExpr::Binary) {
+ const MCBinaryExpr *Bin = static_cast<const MCBinaryExpr*>(Expr);
+ if (HasSecRelSymbolRef(Bin->getLHS())
+ || HasSecRelSymbolRef(Bin->getRHS())) {
+ FixupKind = MCFixupKind(FK_SecRel_4);
+ }
+ }
+ }
+
+ // If the fixup is pc-relative, we need to bias the value to be relative to
+ // the start of the field, not the end of the field.
+ if (FixupKind == FK_PCRel_4 ||
+ FixupKind == MCFixupKind(X86::reloc_riprel_4byte) ||
+ FixupKind == MCFixupKind(X86::reloc_riprel_4byte_movq_load))
+ ImmOffset -= 4;
+ if (FixupKind == FK_PCRel_2)
+ ImmOffset -= 2;
+ if (FixupKind == FK_PCRel_1)
+ ImmOffset -= 1;
+
+ if (ImmOffset)
+ Expr = MCBinaryExpr::CreateAdd(Expr, MCConstantExpr::Create(ImmOffset, Ctx),
+ Ctx);
+
+ // Emit a symbolic constant as a fixup and 4 zeros.
+ Fixups.push_back(MCFixup::Create(CurByte, Expr, FixupKind, Loc));
+ EmitConstant(0, Size, CurByte, OS);
+}
+
+void X86MCCodeEmitter::EmitMemModRMByte(const MCInst &MI, unsigned Op,
+ unsigned RegOpcodeField,
+ uint64_t TSFlags, unsigned &CurByte,
+ raw_ostream &OS,
+ SmallVectorImpl<MCFixup> &Fixups) const{
+ const MCOperand &Disp = MI.getOperand(Op+X86::AddrDisp);
+ const MCOperand &Base = MI.getOperand(Op+X86::AddrBaseReg);
+ const MCOperand &Scale = MI.getOperand(Op+X86::AddrScaleAmt);
+ const MCOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg);
+ unsigned BaseReg = Base.getReg();
+
+ // Handle %rip relative addressing.
+ if (BaseReg == X86::RIP) { // [disp32+RIP] in X86-64 mode
+ assert(is64BitMode() && "Rip-relative addressing requires 64-bit mode");
+ assert(IndexReg.getReg() == 0 && "Invalid rip-relative address");
+ EmitByte(ModRMByte(0, RegOpcodeField, 5), CurByte, OS);
+
+ unsigned FixupKind = X86::reloc_riprel_4byte;
+
+ // movq loads are handled with a special relocation form which allows the
+ // linker to eliminate some loads for GOT references which end up in the
+ // same linkage unit.
+ if (MI.getOpcode() == X86::MOV64rm)
+ FixupKind = X86::reloc_riprel_4byte_movq_load;
+
+ // rip-relative addressing is actually relative to the *next* instruction.
+ // Since an immediate can follow the mod/rm byte for an instruction, this
+ // means that we need to bias the immediate field of the instruction with
+ // the size of the immediate field. If we have this case, add it into the
+ // expression to emit.
+ int ImmSize = X86II::hasImm(TSFlags) ? X86II::getSizeOfImm(TSFlags) : 0;
+
+ EmitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(FixupKind),
+ CurByte, OS, Fixups, -ImmSize);
+ return;
+ }
+
+ unsigned BaseRegNo = BaseReg ? GetX86RegNum(Base) : -1U;
+
+ // Determine whether a SIB byte is needed.
+ // If no BaseReg, issue a RIP relative instruction only if the MCE can
+ // resolve addresses on-the-fly, otherwise use SIB (Intel Manual 2A, table
+ // 2-7) and absolute references.
+
+ if (// The SIB byte must be used if there is an index register.
+ IndexReg.getReg() == 0 &&
+ // The SIB byte must be used if the base is ESP/RSP/R12, all of which
+ // encode to an R/M value of 4, which indicates that a SIB byte is
+ // present.
+ BaseRegNo != N86::ESP &&
+ // If there is no base register and we're in 64-bit mode, we need a SIB
+ // byte to emit an addr that is just 'disp32' (the non-RIP relative form).
+ (!is64BitMode() || BaseReg != 0)) {
+
+ if (BaseReg == 0) { // [disp32] in X86-32 mode
+ EmitByte(ModRMByte(0, RegOpcodeField, 5), CurByte, OS);
+ EmitImmediate(Disp, MI.getLoc(), 4, FK_Data_4, CurByte, OS, Fixups);
+ return;
+ }
+
+ // If the base is not EBP/ESP and there is no displacement, use simple
+ // indirect register encoding, this handles addresses like [EAX]. The
+ // encoding for [EBP] with no displacement means [disp32] so we handle it
+ // by emitting a displacement of 0 below.
+ if (Disp.isImm() && Disp.getImm() == 0 && BaseRegNo != N86::EBP) {
+ EmitByte(ModRMByte(0, RegOpcodeField, BaseRegNo), CurByte, OS);
+ return;
+ }
+
+ // Otherwise, if the displacement fits in a byte, encode as [REG+disp8].
+ if (Disp.isImm() && isDisp8(Disp.getImm())) {
+ EmitByte(ModRMByte(1, RegOpcodeField, BaseRegNo), CurByte, OS);
+ EmitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups);
+ return;
+ }
+
+ // Otherwise, emit the most general non-SIB encoding: [REG+disp32]
+ EmitByte(ModRMByte(2, RegOpcodeField, BaseRegNo), CurByte, OS);
+ EmitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(X86::reloc_signed_4byte), CurByte, OS,
+ Fixups);
+ return;
+ }
+
+ // We need a SIB byte, so start by outputting the ModR/M byte first
+ assert(IndexReg.getReg() != X86::ESP &&
+ IndexReg.getReg() != X86::RSP && "Cannot use ESP as index reg!");
+
+ bool ForceDisp32 = false;
+ bool ForceDisp8 = false;
+ if (BaseReg == 0) {
+ // If there is no base register, we emit the special case SIB byte with
+ // MOD=0, BASE=5, to JUST get the index, scale, and displacement.
+ EmitByte(ModRMByte(0, RegOpcodeField, 4), CurByte, OS);
+ ForceDisp32 = true;
+ } else if (!Disp.isImm()) {
+ // Emit the normal disp32 encoding.
+ EmitByte(ModRMByte(2, RegOpcodeField, 4), CurByte, OS);
+ ForceDisp32 = true;
+ } else if (Disp.getImm() == 0 &&
+ // Base reg can't be anything that ends up with '5' as the base
+ // reg, it is the magic [*] nomenclature that indicates no base.
+ BaseRegNo != N86::EBP) {
+ // Emit no displacement ModR/M byte
+ EmitByte(ModRMByte(0, RegOpcodeField, 4), CurByte, OS);
+ } else if (isDisp8(Disp.getImm())) {
+ // Emit the disp8 encoding.
+ EmitByte(ModRMByte(1, RegOpcodeField, 4), CurByte, OS);
+ ForceDisp8 = true; // Make sure to force 8 bit disp if Base=EBP
+ } else {
+ // Emit the normal disp32 encoding.
+ EmitByte(ModRMByte(2, RegOpcodeField, 4), CurByte, OS);
+ }
+
+ // Calculate what the SS field value should be...
+ static const unsigned SSTable[] = { ~0U, 0, 1, ~0U, 2, ~0U, ~0U, ~0U, 3 };
+ unsigned SS = SSTable[Scale.getImm()];
+
+ if (BaseReg == 0) {
+ // Handle the SIB byte for the case where there is no base, see Intel
+ // Manual 2A, table 2-7. The displacement has already been output.
+ unsigned IndexRegNo;
+ if (IndexReg.getReg())
+ IndexRegNo = GetX86RegNum(IndexReg);
+ else // Examples: [ESP+1*<noreg>+4] or [scaled idx]+disp32 (MOD=0,BASE=5)
+ IndexRegNo = 4;
+ EmitSIBByte(SS, IndexRegNo, 5, CurByte, OS);
+ } else {
+ unsigned IndexRegNo;
+ if (IndexReg.getReg())
+ IndexRegNo = GetX86RegNum(IndexReg);
+ else
+ IndexRegNo = 4; // For example [ESP+1*<noreg>+4]
+ EmitSIBByte(SS, IndexRegNo, GetX86RegNum(Base), CurByte, OS);
+ }
+
+ // Do we need to output a displacement?
+ if (ForceDisp8)
+ EmitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups);
+ else if (ForceDisp32 || Disp.getImm() != 0)
+ EmitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(X86::reloc_signed_4byte),
+ CurByte, OS, Fixups);
+}
+
+/// EmitVEXOpcodePrefix - AVX instructions are encoded using a opcode prefix
+/// called VEX.
+void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
+ int MemOperand, const MCInst &MI,
+ const MCInstrDesc &Desc,
+ raw_ostream &OS) const {
+ bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V;
+ bool HasVEX_4VOp3 = (TSFlags >> X86II::VEXShift) & X86II::VEX_4VOp3;
+ bool HasMemOp4 = (TSFlags >> X86II::VEXShift) & X86II::MemOp4;
+
+ // VEX_R: opcode externsion equivalent to REX.R in
+ // 1's complement (inverted) form
+ //
+ // 1: Same as REX_R=0 (must be 1 in 32-bit mode)
+ // 0: Same as REX_R=1 (64 bit mode only)
+ //
+ unsigned char VEX_R = 0x1;
+
+ // VEX_X: equivalent to REX.X, only used when a
+ // register is used for index in SIB Byte.
+ //
+ // 1: Same as REX.X=0 (must be 1 in 32-bit mode)
+ // 0: Same as REX.X=1 (64-bit mode only)
+ unsigned char VEX_X = 0x1;
+
+ // VEX_B:
+ //
+ // 1: Same as REX_B=0 (ignored in 32-bit mode)
+ // 0: Same as REX_B=1 (64 bit mode only)
+ //
+ unsigned char VEX_B = 0x1;
+
+ // VEX_W: opcode specific (use like REX.W, or used for
+ // opcode extension, or ignored, depending on the opcode byte)
+ unsigned char VEX_W = 0;
+
+ // XOP: Use XOP prefix byte 0x8f instead of VEX.
+ unsigned char XOP = 0;
+
+ // VEX_5M (VEX m-mmmmm field):
+ //
+ // 0b00000: Reserved for future use
+ // 0b00001: implied 0F leading opcode
+ // 0b00010: implied 0F 38 leading opcode bytes
+ // 0b00011: implied 0F 3A leading opcode bytes
+ // 0b00100-0b11111: Reserved for future use
+ // 0b01000: XOP map select - 08h instructions with imm byte
+ // 0b10001: XOP map select - 09h instructions with no imm byte
+ unsigned char VEX_5M = 0x1;
+
+ // VEX_4V (VEX vvvv field): a register specifier
+ // (in 1's complement form) or 1111 if unused.
+ unsigned char VEX_4V = 0xf;
+
+ // VEX_L (Vector Length):
+ //
+ // 0: scalar or 128-bit vector
+ // 1: 256-bit vector
+ //
+ unsigned char VEX_L = 0;
+
+ // VEX_PP: opcode extension providing equivalent
+ // functionality of a SIMD prefix
+ //
+ // 0b00: None
+ // 0b01: 66
+ // 0b10: F3
+ // 0b11: F2
+ //
+ unsigned char VEX_PP = 0;
+
+ // Encode the operand size opcode prefix as needed.
+ if (TSFlags & X86II::OpSize)
+ VEX_PP = 0x01;
+
+ if ((TSFlags >> X86II::VEXShift) & X86II::VEX_W)
+ VEX_W = 1;
+
+ if ((TSFlags >> X86II::VEXShift) & X86II::XOP)
+ XOP = 1;
+
+ if ((TSFlags >> X86II::VEXShift) & X86II::VEX_L)
+ VEX_L = 1;
+
+ switch (TSFlags & X86II::Op0Mask) {
+ default: llvm_unreachable("Invalid prefix!");
+ case X86II::T8: // 0F 38
+ VEX_5M = 0x2;
+ break;
+ case X86II::TA: // 0F 3A
+ VEX_5M = 0x3;
+ break;
+ case X86II::T8XS: // F3 0F 38
+ VEX_PP = 0x2;
+ VEX_5M = 0x2;
+ break;
+ case X86II::T8XD: // F2 0F 38
+ VEX_PP = 0x3;
+ VEX_5M = 0x2;
+ break;
+ case X86II::TAXD: // F2 0F 3A
+ VEX_PP = 0x3;
+ VEX_5M = 0x3;
+ break;
+ case X86II::XS: // F3 0F
+ VEX_PP = 0x2;
+ break;
+ case X86II::XD: // F2 0F
+ VEX_PP = 0x3;
+ break;
+ case X86II::XOP8:
+ VEX_5M = 0x8;
+ break;
+ case X86II::XOP9:
+ VEX_5M = 0x9;
+ break;
+ case X86II::A6: // Bypass: Not used by VEX
+ case X86II::A7: // Bypass: Not used by VEX
+ case X86II::TB: // Bypass: Not used by VEX
+ case 0:
+ break; // No prefix!
+ }
+
+
+ // Classify VEX_B, VEX_4V, VEX_R, VEX_X
+ unsigned NumOps = Desc.getNumOperands();
+ unsigned CurOp = 0;
+ if (NumOps > 1 && Desc.getOperandConstraint(1, MCOI::TIED_TO) == 0)
+ ++CurOp;
+ else if (NumOps > 3 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0) {
+ assert(Desc.getOperandConstraint(NumOps - 1, MCOI::TIED_TO) == 1);
+ // Special case for GATHER with 2 TIED_TO operands
+ // Skip the first 2 operands: dst, mask_wb
+ CurOp += 2;
+ }
+
+ switch (TSFlags & X86II::FormMask) {
+ case X86II::MRMInitReg: llvm_unreachable("FIXME: Remove this!");
+ case X86II::MRMDestMem: {
+ // MRMDestMem instructions forms:
+ // MemAddr, src1(ModR/M)
+ // MemAddr, src1(VEX_4V), src2(ModR/M)
+ // MemAddr, src1(ModR/M), imm8
+ //
+ if (X86II::isX86_64ExtendedReg(MI.getOperand(X86::AddrBaseReg).getReg()))
+ VEX_B = 0x0;
+ if (X86II::isX86_64ExtendedReg(MI.getOperand(X86::AddrIndexReg).getReg()))
+ VEX_X = 0x0;
+
+ CurOp = X86::AddrNumOperands;
+ if (HasVEX_4V)
+ VEX_4V = getVEXRegisterEncoding(MI, CurOp++);
+
+ const MCOperand &MO = MI.getOperand(CurOp);
+ if (MO.isReg() && X86II::isX86_64ExtendedReg(MO.getReg()))
+ VEX_R = 0x0;
+ break;
+ }
+ case X86II::MRMSrcMem:
+ // MRMSrcMem instructions forms:
+ // src1(ModR/M), MemAddr
+ // src1(ModR/M), src2(VEX_4V), MemAddr
+ // src1(ModR/M), MemAddr, imm8
+ // src1(ModR/M), MemAddr, src2(VEX_I8IMM)
+ //
+ // FMA4:
+ // dst(ModR/M.reg), src1(VEX_4V), src2(ModR/M), src3(VEX_I8IMM)
+ // dst(ModR/M.reg), src1(VEX_4V), src2(VEX_I8IMM), src3(ModR/M),
+ if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp++).getReg()))
+ VEX_R = 0x0;
+
+ if (HasVEX_4V)
+ VEX_4V = getVEXRegisterEncoding(MI, CurOp);
+
+ if (X86II::isX86_64ExtendedReg(
+ MI.getOperand(MemOperand+X86::AddrBaseReg).getReg()))
+ VEX_B = 0x0;
+ if (X86II::isX86_64ExtendedReg(
+ MI.getOperand(MemOperand+X86::AddrIndexReg).getReg()))
+ VEX_X = 0x0;
+
+ if (HasVEX_4VOp3)
+ // Instruction format for 4VOp3:
+ // src1(ModR/M), MemAddr, src3(VEX_4V)
+ // CurOp points to start of the MemoryOperand,
+ // it skips TIED_TO operands if exist, then increments past src1.
+ // CurOp + X86::AddrNumOperands will point to src3.
+ VEX_4V = getVEXRegisterEncoding(MI, CurOp+X86::AddrNumOperands);
+ break;
+ case X86II::MRM0m: case X86II::MRM1m:
+ case X86II::MRM2m: case X86II::MRM3m:
+ case X86II::MRM4m: case X86II::MRM5m:
+ case X86II::MRM6m: case X86II::MRM7m: {
+ // MRM[0-9]m instructions forms:
+ // MemAddr
+ // src1(VEX_4V), MemAddr
+ if (HasVEX_4V)
+ VEX_4V = getVEXRegisterEncoding(MI, 0);
+
+ if (X86II::isX86_64ExtendedReg(
+ MI.getOperand(MemOperand+X86::AddrBaseReg).getReg()))
+ VEX_B = 0x0;
+ if (X86II::isX86_64ExtendedReg(
+ MI.getOperand(MemOperand+X86::AddrIndexReg).getReg()))
+ VEX_X = 0x0;
+ break;
+ }
+ case X86II::MRMSrcReg:
+ // MRMSrcReg instructions forms:
+ // dst(ModR/M), src1(VEX_4V), src2(ModR/M), src3(VEX_I8IMM)
+ // dst(ModR/M), src1(ModR/M)
+ // dst(ModR/M), src1(ModR/M), imm8
+ //
+ // FMA4:
+ // dst(ModR/M.reg), src1(VEX_4V), src2(ModR/M), src3(VEX_I8IMM)
+ // dst(ModR/M.reg), src1(VEX_4V), src2(VEX_I8IMM), src3(ModR/M),
+ if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
+ VEX_R = 0x0;
+ CurOp++;
+
+ if (HasVEX_4V)
+ VEX_4V = getVEXRegisterEncoding(MI, CurOp++);
+
+ if (HasMemOp4) // Skip second register source (encoded in I8IMM)
+ CurOp++;
+
+ if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
+ VEX_B = 0x0;
+ CurOp++;
+ if (HasVEX_4VOp3)
+ VEX_4V = getVEXRegisterEncoding(MI, CurOp);
+ break;
+ case X86II::MRMDestReg:
+ // MRMDestReg instructions forms:
+ // dst(ModR/M), src(ModR/M)
+ // dst(ModR/M), src(ModR/M), imm8
+ // dst(ModR/M), src1(VEX_4V), src2(ModR/M)
+ if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
+ VEX_B = 0x0;
+ CurOp++;
+
+ if (HasVEX_4V)
+ VEX_4V = getVEXRegisterEncoding(MI, CurOp++);
+
+ if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
+ VEX_R = 0x0;
+ break;
+ case X86II::MRM0r: case X86II::MRM1r:
+ case X86II::MRM2r: case X86II::MRM3r:
+ case X86II::MRM4r: case X86II::MRM5r:
+ case X86II::MRM6r: case X86II::MRM7r:
+ // MRM0r-MRM7r instructions forms:
+ // dst(VEX_4V), src(ModR/M), imm8
+ VEX_4V = getVEXRegisterEncoding(MI, 0);
+ if (X86II::isX86_64ExtendedReg(MI.getOperand(1).getReg()))
+ VEX_B = 0x0;
+ break;
+ default: // RawFrm
+ break;
+ }
+
+ // Emit segment override opcode prefix as needed.
+ EmitSegmentOverridePrefix(TSFlags, CurByte, MemOperand, MI, OS);
+
+ // VEX opcode prefix can have 2 or 3 bytes
+ //
+ // 3 bytes:
+ // +-----+ +--------------+ +-------------------+
+ // | C4h | | RXB | m-mmmm | | W | vvvv | L | pp |
+ // +-----+ +--------------+ +-------------------+
+ // 2 bytes:
+ // +-----+ +-------------------+
+ // | C5h | | R | vvvv | L | pp |
+ // +-----+ +-------------------+
+ //
+ unsigned char LastByte = VEX_PP | (VEX_L << 2) | (VEX_4V << 3);
+
+ if (VEX_B && VEX_X && !VEX_W && !XOP && (VEX_5M == 1)) { // 2 byte VEX prefix
+ EmitByte(0xC5, CurByte, OS);
+ EmitByte(LastByte | (VEX_R << 7), CurByte, OS);
+ return;
+ }
+
+ // 3 byte VEX prefix
+ EmitByte(XOP ? 0x8F : 0xC4, CurByte, OS);
+ EmitByte(VEX_R << 7 | VEX_X << 6 | VEX_B << 5 | VEX_5M, CurByte, OS);
+ EmitByte(LastByte | (VEX_W << 7), CurByte, OS);
+}
+
+/// DetermineREXPrefix - Determine if the MCInst has to be encoded with a X86-64
+/// REX prefix which specifies 1) 64-bit instructions, 2) non-default operand
+/// size, and 3) use of X86-64 extended registers.
+static unsigned DetermineREXPrefix(const MCInst &MI, uint64_t TSFlags,
+ const MCInstrDesc &Desc) {
+ unsigned REX = 0;
+ if (TSFlags & X86II::REX_W)
+ REX |= 1 << 3; // set REX.W
+
+ if (MI.getNumOperands() == 0) return REX;
+
+ unsigned NumOps = MI.getNumOperands();
+ // FIXME: MCInst should explicitize the two-addrness.
+ bool isTwoAddr = NumOps > 1 &&
+ Desc.getOperandConstraint(1, MCOI::TIED_TO) != -1;
+
+ // If it accesses SPL, BPL, SIL, or DIL, then it requires a 0x40 REX prefix.
+ unsigned i = isTwoAddr ? 1 : 0;
+ for (; i != NumOps; ++i) {
+ const MCOperand &MO = MI.getOperand(i);
+ if (!MO.isReg()) continue;
+ unsigned Reg = MO.getReg();
+ if (!X86II::isX86_64NonExtLowByteReg(Reg)) continue;
+ // FIXME: The caller of DetermineREXPrefix slaps this prefix onto anything
+ // that returns non-zero.
+ REX |= 0x40; // REX fixed encoding prefix
+ break;
+ }
+
+ switch (TSFlags & X86II::FormMask) {
+ case X86II::MRMInitReg: llvm_unreachable("FIXME: Remove this!");
+ case X86II::MRMSrcReg:
+ if (MI.getOperand(0).isReg() &&
+ X86II::isX86_64ExtendedReg(MI.getOperand(0).getReg()))
+ REX |= 1 << 2; // set REX.R
+ i = isTwoAddr ? 2 : 1;
+ for (; i != NumOps; ++i) {
+ const MCOperand &MO = MI.getOperand(i);
+ if (MO.isReg() && X86II::isX86_64ExtendedReg(MO.getReg()))
+ REX |= 1 << 0; // set REX.B
+ }
+ break;
+ case X86II::MRMSrcMem: {
+ if (MI.getOperand(0).isReg() &&
+ X86II::isX86_64ExtendedReg(MI.getOperand(0).getReg()))
+ REX |= 1 << 2; // set REX.R
+ unsigned Bit = 0;
+ i = isTwoAddr ? 2 : 1;
+ for (; i != NumOps; ++i) {
+ const MCOperand &MO = MI.getOperand(i);
+ if (MO.isReg()) {
+ if (X86II::isX86_64ExtendedReg(MO.getReg()))
+ REX |= 1 << Bit; // set REX.B (Bit=0) and REX.X (Bit=1)
+ Bit++;
+ }
+ }
+ break;
+ }
+ case X86II::MRM0m: case X86II::MRM1m:
+ case X86II::MRM2m: case X86II::MRM3m:
+ case X86II::MRM4m: case X86II::MRM5m:
+ case X86II::MRM6m: case X86II::MRM7m:
+ case X86II::MRMDestMem: {
+ unsigned e = (isTwoAddr ? X86::AddrNumOperands+1 : X86::AddrNumOperands);
+ i = isTwoAddr ? 1 : 0;
+ if (NumOps > e && MI.getOperand(e).isReg() &&
+ X86II::isX86_64ExtendedReg(MI.getOperand(e).getReg()))
+ REX |= 1 << 2; // set REX.R
+ unsigned Bit = 0;
+ for (; i != e; ++i) {
+ const MCOperand &MO = MI.getOperand(i);
+ if (MO.isReg()) {
+ if (X86II::isX86_64ExtendedReg(MO.getReg()))
+ REX |= 1 << Bit; // REX.B (Bit=0) and REX.X (Bit=1)
+ Bit++;
+ }
+ }
+ break;
+ }
+ default:
+ if (MI.getOperand(0).isReg() &&
+ X86II::isX86_64ExtendedReg(MI.getOperand(0).getReg()))
+ REX |= 1 << 0; // set REX.B
+ i = isTwoAddr ? 2 : 1;
+ for (unsigned e = NumOps; i != e; ++i) {
+ const MCOperand &MO = MI.getOperand(i);
+ if (MO.isReg() && X86II::isX86_64ExtendedReg(MO.getReg()))
+ REX |= 1 << 2; // set REX.R
+ }
+ break;
+ }
+ return REX;
+}
+
+/// EmitSegmentOverridePrefix - Emit segment override opcode prefix as needed
+void X86MCCodeEmitter::EmitSegmentOverridePrefix(uint64_t TSFlags,
+ unsigned &CurByte, int MemOperand,
+ const MCInst &MI,
+ raw_ostream &OS) const {
+ switch (TSFlags & X86II::SegOvrMask) {
+ default: llvm_unreachable("Invalid segment!");
+ case 0:
+ // No segment override, check for explicit one on memory operand.
+ if (MemOperand != -1) { // If the instruction has a memory operand.
+ switch (MI.getOperand(MemOperand+X86::AddrSegmentReg).getReg()) {
+ default: llvm_unreachable("Unknown segment register!");
+ case 0: break;
+ case X86::CS: EmitByte(0x2E, CurByte, OS); break;
+ case X86::SS: EmitByte(0x36, CurByte, OS); break;
+ case X86::DS: EmitByte(0x3E, CurByte, OS); break;
+ case X86::ES: EmitByte(0x26, CurByte, OS); break;
+ case X86::FS: EmitByte(0x64, CurByte, OS); break;
+ case X86::GS: EmitByte(0x65, CurByte, OS); break;
+ }
+ }
+ break;
+ case X86II::FS:
+ EmitByte(0x64, CurByte, OS);
+ break;
+ case X86II::GS:
+ EmitByte(0x65, CurByte, OS);
+ break;
+ }
+}
+
+/// EmitOpcodePrefix - Emit all instruction prefixes prior to the opcode.
+///
+/// MemOperand is the operand # of the start of a memory operand if present. If
+/// Not present, it is -1.
+void X86MCCodeEmitter::EmitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
+ int MemOperand, const MCInst &MI,
+ const MCInstrDesc &Desc,
+ raw_ostream &OS) const {
+
+ // Emit the lock opcode prefix as needed.
+ if (TSFlags & X86II::LOCK)
+ EmitByte(0xF0, CurByte, OS);
+
+ // Emit segment override opcode prefix as needed.
+ EmitSegmentOverridePrefix(TSFlags, CurByte, MemOperand, MI, OS);
+
+ // Emit the repeat opcode prefix as needed.
+ if ((TSFlags & X86II::Op0Mask) == X86II::REP)
+ EmitByte(0xF3, CurByte, OS);
+
+ // Emit the address size opcode prefix as needed.
+ bool need_address_override;
+ if (TSFlags & X86II::AdSize) {
+ need_address_override = true;
+ } else if (MemOperand == -1) {
+ need_address_override = false;
+ } else if (is64BitMode()) {
+ assert(!Is16BitMemOperand(MI, MemOperand));
+ need_address_override = Is32BitMemOperand(MI, MemOperand);
+ } else if (is32BitMode()) {
+ assert(!Is64BitMemOperand(MI, MemOperand));
+ need_address_override = Is16BitMemOperand(MI, MemOperand);
+ } else {
+ need_address_override = false;
+ }
+
+ if (need_address_override)
+ EmitByte(0x67, CurByte, OS);
+
+ // Emit the operand size opcode prefix as needed.
+ if (TSFlags & X86II::OpSize)
+ EmitByte(0x66, CurByte, OS);
+
+ bool Need0FPrefix = false;
+ switch (TSFlags & X86II::Op0Mask) {
+ default: llvm_unreachable("Invalid prefix!");
+ case 0: break; // No prefix!
+ case X86II::REP: break; // already handled.
+ case X86II::TB: // Two-byte opcode prefix
+ case X86II::T8: // 0F 38
+ case X86II::TA: // 0F 3A
+ case X86II::A6: // 0F A6
+ case X86II::A7: // 0F A7
+ Need0FPrefix = true;
+ break;
+ case X86II::T8XS: // F3 0F 38
+ EmitByte(0xF3, CurByte, OS);
+ Need0FPrefix = true;
+ break;
+ case X86II::T8XD: // F2 0F 38
+ EmitByte(0xF2, CurByte, OS);
+ Need0FPrefix = true;
+ break;
+ case X86II::TAXD: // F2 0F 3A
+ EmitByte(0xF2, CurByte, OS);
+ Need0FPrefix = true;
+ break;
+ case X86II::XS: // F3 0F
+ EmitByte(0xF3, CurByte, OS);
+ Need0FPrefix = true;
+ break;
+ case X86II::XD: // F2 0F
+ EmitByte(0xF2, CurByte, OS);
+ Need0FPrefix = true;
+ break;
+ case X86II::D8: EmitByte(0xD8, CurByte, OS); break;
+ case X86II::D9: EmitByte(0xD9, CurByte, OS); break;
+ case X86II::DA: EmitByte(0xDA, CurByte, OS); break;
+ case X86II::DB: EmitByte(0xDB, CurByte, OS); break;
+ case X86II::DC: EmitByte(0xDC, CurByte, OS); break;
+ case X86II::DD: EmitByte(0xDD, CurByte, OS); break;
+ case X86II::DE: EmitByte(0xDE, CurByte, OS); break;
+ case X86II::DF: EmitByte(0xDF, CurByte, OS); break;
+ }
+
+ // Handle REX prefix.
+ // FIXME: Can this come before F2 etc to simplify emission?
+ if (is64BitMode()) {
+ if (unsigned REX = DetermineREXPrefix(MI, TSFlags, Desc))
+ EmitByte(0x40 | REX, CurByte, OS);
+ }
+
+ // 0x0F escape code must be emitted just before the opcode.
+ if (Need0FPrefix)
+ EmitByte(0x0F, CurByte, OS);
+
+ // FIXME: Pull this up into previous switch if REX can be moved earlier.
+ switch (TSFlags & X86II::Op0Mask) {
+ case X86II::T8XS: // F3 0F 38
+ case X86II::T8XD: // F2 0F 38
+ case X86II::T8: // 0F 38
+ EmitByte(0x38, CurByte, OS);
+ break;
+ case X86II::TAXD: // F2 0F 3A
+ case X86II::TA: // 0F 3A
+ EmitByte(0x3A, CurByte, OS);
+ break;
+ case X86II::A6: // 0F A6
+ EmitByte(0xA6, CurByte, OS);
+ break;
+ case X86II::A7: // 0F A7
+ EmitByte(0xA7, CurByte, OS);
+ break;
+ }
+}
+
+void X86MCCodeEmitter::
+EncodeInstruction(const MCInst &MI, raw_ostream &OS,
+ SmallVectorImpl<MCFixup> &Fixups) const {
+ unsigned Opcode = MI.getOpcode();
+ const MCInstrDesc &Desc = MCII.get(Opcode);
+ uint64_t TSFlags = Desc.TSFlags;
+
+ // Pseudo instructions don't get encoded.
+ if ((TSFlags & X86II::FormMask) == X86II::Pseudo)
+ return;
+
+ unsigned NumOps = Desc.getNumOperands();
+ unsigned CurOp = X86II::getOperandBias(Desc);
+
+ // Keep track of the current byte being emitted.
+ unsigned CurByte = 0;
+
+ // Is this instruction encoded using the AVX VEX prefix?
+ bool HasVEXPrefix = (TSFlags >> X86II::VEXShift) & X86II::VEX;
+
+ // It uses the VEX.VVVV field?
+ bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V;
+ bool HasVEX_4VOp3 = (TSFlags >> X86II::VEXShift) & X86II::VEX_4VOp3;
+ bool HasMemOp4 = (TSFlags >> X86II::VEXShift) & X86II::MemOp4;
+ const unsigned MemOp4_I8IMMOperand = 2;
+
+ // Determine where the memory operand starts, if present.
+ int MemoryOperand = X86II::getMemoryOperandNo(TSFlags, Opcode);
+ if (MemoryOperand != -1) MemoryOperand += CurOp;
+
+ if (!HasVEXPrefix)
+ EmitOpcodePrefix(TSFlags, CurByte, MemoryOperand, MI, Desc, OS);
+ else
+ EmitVEXOpcodePrefix(TSFlags, CurByte, MemoryOperand, MI, Desc, OS);
+
+ unsigned char BaseOpcode = X86II::getBaseOpcodeFor(TSFlags);
+
+ if ((TSFlags >> X86II::VEXShift) & X86II::Has3DNow0F0FOpcode)
+ BaseOpcode = 0x0F; // Weird 3DNow! encoding.
+
+ unsigned SrcRegNum = 0;
+ switch (TSFlags & X86II::FormMask) {
+ case X86II::MRMInitReg:
+ llvm_unreachable("FIXME: Remove this form when the JIT moves to MCCodeEmitter!");
+ default: errs() << "FORM: " << (TSFlags & X86II::FormMask) << "\n";
+ llvm_unreachable("Unknown FormMask value in X86MCCodeEmitter!");
+ case X86II::Pseudo:
+ llvm_unreachable("Pseudo instruction shouldn't be emitted");
+ case X86II::RawFrm:
+ EmitByte(BaseOpcode, CurByte, OS);
+ break;
+ case X86II::RawFrmImm8:
+ EmitByte(BaseOpcode, CurByte, OS);
+ EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(),
+ X86II::getSizeOfImm(TSFlags), getImmFixupKind(TSFlags),
+ CurByte, OS, Fixups);
+ EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(), 1, FK_Data_1, CurByte,
+ OS, Fixups);
+ break;
+ case X86II::RawFrmImm16:
+ EmitByte(BaseOpcode, CurByte, OS);
+ EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(),
+ X86II::getSizeOfImm(TSFlags), getImmFixupKind(TSFlags),
+ CurByte, OS, Fixups);
+ EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(), 2, FK_Data_2, CurByte,
+ OS, Fixups);
+ break;
+
+ case X86II::AddRegFrm:
+ EmitByte(BaseOpcode + GetX86RegNum(MI.getOperand(CurOp++)), CurByte, OS);
+ break;
+
+ case X86II::MRMDestReg:
+ EmitByte(BaseOpcode, CurByte, OS);
+ SrcRegNum = CurOp + 1;
+
+ if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV)
+ ++SrcRegNum;
+
+ EmitRegModRMByte(MI.getOperand(CurOp),
+ GetX86RegNum(MI.getOperand(SrcRegNum)), CurByte, OS);
+ CurOp = SrcRegNum + 1;
+ break;
+
+ case X86II::MRMDestMem:
+ EmitByte(BaseOpcode, CurByte, OS);
+ SrcRegNum = CurOp + X86::AddrNumOperands;
+
+ if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV)
+ ++SrcRegNum;
+
+ EmitMemModRMByte(MI, CurOp,
+ GetX86RegNum(MI.getOperand(SrcRegNum)),
+ TSFlags, CurByte, OS, Fixups);
+ CurOp = SrcRegNum + 1;
+ break;
+
+ case X86II::MRMSrcReg:
+ EmitByte(BaseOpcode, CurByte, OS);
+ SrcRegNum = CurOp + 1;
+
+ if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV)
+ ++SrcRegNum;
+
+ if (HasMemOp4) // Skip 2nd src (which is encoded in I8IMM)
+ ++SrcRegNum;
+
+ EmitRegModRMByte(MI.getOperand(SrcRegNum),
+ GetX86RegNum(MI.getOperand(CurOp)), CurByte, OS);
+
+ // 2 operands skipped with HasMemOp4, compensate accordingly
+ CurOp = HasMemOp4 ? SrcRegNum : SrcRegNum + 1;
+ if (HasVEX_4VOp3)
+ ++CurOp;
+ break;
+
+ case X86II::MRMSrcMem: {
+ int AddrOperands = X86::AddrNumOperands;
+ unsigned FirstMemOp = CurOp+1;
+ if (HasVEX_4V) {
+ ++AddrOperands;
+ ++FirstMemOp; // Skip the register source (which is encoded in VEX_VVVV).
+ }
+ if (HasMemOp4) // Skip second register source (encoded in I8IMM)
+ ++FirstMemOp;
+
+ EmitByte(BaseOpcode, CurByte, OS);
+
+ EmitMemModRMByte(MI, FirstMemOp, GetX86RegNum(MI.getOperand(CurOp)),
+ TSFlags, CurByte, OS, Fixups);
+ CurOp += AddrOperands + 1;
+ if (HasVEX_4VOp3)
+ ++CurOp;
+ break;
+ }
+
+ case X86II::MRM0r: case X86II::MRM1r:
+ case X86II::MRM2r: case X86II::MRM3r:
+ case X86II::MRM4r: case X86II::MRM5r:
+ case X86II::MRM6r: case X86II::MRM7r:
+ if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV).
+ ++CurOp;
+ EmitByte(BaseOpcode, CurByte, OS);
+ EmitRegModRMByte(MI.getOperand(CurOp++),
+ (TSFlags & X86II::FormMask)-X86II::MRM0r,
+ CurByte, OS);
+ break;
+ case X86II::MRM0m: case X86II::MRM1m:
+ case X86II::MRM2m: case X86II::MRM3m:
+ case X86II::MRM4m: case X86II::MRM5m:
+ case X86II::MRM6m: case X86II::MRM7m:
+ if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV).
+ ++CurOp;
+ EmitByte(BaseOpcode, CurByte, OS);
+ EmitMemModRMByte(MI, CurOp, (TSFlags & X86II::FormMask)-X86II::MRM0m,
+ TSFlags, CurByte, OS, Fixups);
+ CurOp += X86::AddrNumOperands;
+ break;
+ case X86II::MRM_C1: case X86II::MRM_C2: case X86II::MRM_C3:
+ case X86II::MRM_C4: case X86II::MRM_C8: case X86II::MRM_C9:
+ case X86II::MRM_CA: case X86II::MRM_CB: case X86II::MRM_D0:
+ case X86II::MRM_D1: case X86II::MRM_D4: case X86II::MRM_D5:
+ case X86II::MRM_D6: case X86II::MRM_D8: case X86II::MRM_D9:
+ case X86II::MRM_DA: case X86II::MRM_DB: case X86II::MRM_DC:
+ case X86II::MRM_DD: case X86II::MRM_DE: case X86II::MRM_DF:
+ case X86II::MRM_E8: case X86II::MRM_F0: case X86II::MRM_F8:
+ case X86II::MRM_F9:
+ EmitByte(BaseOpcode, CurByte, OS);
+
+ unsigned char MRM;
+ switch (TSFlags & X86II::FormMask) {
+ default: llvm_unreachable("Invalid Form");
+ case X86II::MRM_C1: MRM = 0xC1; break;
+ case X86II::MRM_C2: MRM = 0xC2; break;
+ case X86II::MRM_C3: MRM = 0xC3; break;
+ case X86II::MRM_C4: MRM = 0xC4; break;
+ case X86II::MRM_C8: MRM = 0xC8; break;
+ case X86II::MRM_C9: MRM = 0xC9; break;
+ case X86II::MRM_CA: MRM = 0xCA; break;
+ case X86II::MRM_CB: MRM = 0xCB; break;
+ case X86II::MRM_D0: MRM = 0xD0; break;
+ case X86II::MRM_D1: MRM = 0xD1; break;
+ case X86II::MRM_D4: MRM = 0xD4; break;
+ case X86II::MRM_D5: MRM = 0xD5; break;
+ case X86II::MRM_D6: MRM = 0xD6; break;
+ case X86II::MRM_D8: MRM = 0xD8; break;
+ case X86II::MRM_D9: MRM = 0xD9; break;
+ case X86II::MRM_DA: MRM = 0xDA; break;
+ case X86II::MRM_DB: MRM = 0xDB; break;
+ case X86II::MRM_DC: MRM = 0xDC; break;
+ case X86II::MRM_DD: MRM = 0xDD; break;
+ case X86II::MRM_DE: MRM = 0xDE; break;
+ case X86II::MRM_DF: MRM = 0xDF; break;
+ case X86II::MRM_E8: MRM = 0xE8; break;
+ case X86II::MRM_F0: MRM = 0xF0; break;
+ case X86II::MRM_F8: MRM = 0xF8; break;
+ case X86II::MRM_F9: MRM = 0xF9; break;
+ }
+ EmitByte(MRM, CurByte, OS);
+ break;
+ }
+
+ // If there is a remaining operand, it must be a trailing immediate. Emit it
+ // according to the right size for the instruction. Some instructions
+ // (SSE4a extrq and insertq) have two trailing immediates.
+ while (CurOp != NumOps && NumOps - CurOp <= 2) {
+ // The last source register of a 4 operand instruction in AVX is encoded
+ // in bits[7:4] of a immediate byte.
+ if ((TSFlags >> X86II::VEXShift) & X86II::VEX_I8IMM) {
+ const MCOperand &MO = MI.getOperand(HasMemOp4 ? MemOp4_I8IMMOperand
+ : CurOp);
+ ++CurOp;
+ unsigned RegNum = GetX86RegNum(MO) << 4;
+ if (X86II::isX86_64ExtendedReg(MO.getReg()))
+ RegNum |= 1 << 7;
+ // If there is an additional 5th operand it must be an immediate, which
+ // is encoded in bits[3:0]
+ if (CurOp != NumOps) {
+ const MCOperand &MIMM = MI.getOperand(CurOp++);
+ if (MIMM.isImm()) {
+ unsigned Val = MIMM.getImm();
+ assert(Val < 16 && "Immediate operand value out of range");
+ RegNum |= Val;
+ }
+ }
+ EmitImmediate(MCOperand::CreateImm(RegNum), MI.getLoc(), 1, FK_Data_1,
+ CurByte, OS, Fixups);
+ } else {
+ unsigned FixupKind;
+ // FIXME: Is there a better way to know that we need a signed relocation?
+ if (MI.getOpcode() == X86::ADD64ri32 ||
+ MI.getOpcode() == X86::MOV64ri32 ||
+ MI.getOpcode() == X86::MOV64mi32 ||
+ MI.getOpcode() == X86::PUSH64i32)
+ FixupKind = X86::reloc_signed_4byte;
+ else
+ FixupKind = getImmFixupKind(TSFlags);
+ EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(),
+ X86II::getSizeOfImm(TSFlags), MCFixupKind(FixupKind),
+ CurByte, OS, Fixups);
+ }
+ }
+
+ if ((TSFlags >> X86II::VEXShift) & X86II::Has3DNow0F0FOpcode)
+ EmitByte(X86II::getBaseOpcodeFor(TSFlags), CurByte, OS);
+
+#ifndef NDEBUG
+ // FIXME: Verify.
+ if (/*!Desc.isVariadic() &&*/ CurOp != NumOps) {
+ errs() << "Cannot encode all operands of: ";
+ MI.dump();
+ errs() << '\n';
+ abort();
+ }
+#endif
+}
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
new file mode 100644
index 0000000..5e84530
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
@@ -0,0 +1,442 @@
+//===-- X86MCTargetDesc.cpp - X86 Target Descriptions ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides X86 specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86MCTargetDesc.h"
+#include "InstPrinter/X86ATTInstPrinter.h"
+#include "InstPrinter/X86IntelInstPrinter.h"
+#include "X86MCAsmInfo.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/MC/MCCodeGenInfo.h"
+#include "llvm/MC/MCInstrAnalysis.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MachineLocation.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Host.h"
+#include "llvm/Support/TargetRegistry.h"
+
+#define GET_REGINFO_MC_DESC
+#include "X86GenRegisterInfo.inc"
+
+#define GET_INSTRINFO_MC_DESC
+#include "X86GenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_MC_DESC
+#include "X86GenSubtargetInfo.inc"
+
+#if _MSC_VER
+#include <intrin.h>
+#endif
+
+using namespace llvm;
+
+
+std::string X86_MC::ParseX86Triple(StringRef TT) {
+ Triple TheTriple(TT);
+ std::string FS;
+ if (TheTriple.getArch() == Triple::x86_64)
+ FS = "+64bit-mode";
+ else
+ FS = "-64bit-mode";
+ return FS;
+}
+
+/// GetCpuIDAndInfo - Execute the specified cpuid and return the 4 values in the
+/// specified arguments. If we can't run cpuid on the host, return true.
+bool X86_MC::GetCpuIDAndInfo(unsigned value, unsigned *rEAX,
+ unsigned *rEBX, unsigned *rECX, unsigned *rEDX) {
+#if defined(__x86_64__) || defined(_M_AMD64) || defined (_M_X64)
+ #if defined(__GNUC__)
+ // gcc doesn't know cpuid would clobber ebx/rbx. Preseve it manually.
+ asm ("movq\t%%rbx, %%rsi\n\t"
+ "cpuid\n\t"
+ "xchgq\t%%rbx, %%rsi\n\t"
+ : "=a" (*rEAX),
+ "=S" (*rEBX),
+ "=c" (*rECX),
+ "=d" (*rEDX)
+ : "a" (value));
+ return false;
+ #elif defined(_MSC_VER)
+ int registers[4];
+ __cpuid(registers, value);
+ *rEAX = registers[0];
+ *rEBX = registers[1];
+ *rECX = registers[2];
+ *rEDX = registers[3];
+ return false;
+ #else
+ return true;
+ #endif
+#elif defined(i386) || defined(__i386__) || defined(__x86__) || defined(_M_IX86)
+ #if defined(__GNUC__)
+ asm ("movl\t%%ebx, %%esi\n\t"
+ "cpuid\n\t"
+ "xchgl\t%%ebx, %%esi\n\t"
+ : "=a" (*rEAX),
+ "=S" (*rEBX),
+ "=c" (*rECX),
+ "=d" (*rEDX)
+ : "a" (value));
+ return false;
+ #elif defined(_MSC_VER)
+ __asm {
+ mov eax,value
+ cpuid
+ mov esi,rEAX
+ mov dword ptr [esi],eax
+ mov esi,rEBX
+ mov dword ptr [esi],ebx
+ mov esi,rECX
+ mov dword ptr [esi],ecx
+ mov esi,rEDX
+ mov dword ptr [esi],edx
+ }
+ return false;
+ #else
+ return true;
+ #endif
+#else
+ return true;
+#endif
+}
+
+/// GetCpuIDAndInfoEx - Execute the specified cpuid with subleaf and return the
+/// 4 values in the specified arguments. If we can't run cpuid on the host,
+/// return true.
+bool X86_MC::GetCpuIDAndInfoEx(unsigned value, unsigned subleaf, unsigned *rEAX,
+ unsigned *rEBX, unsigned *rECX, unsigned *rEDX) {
+#if defined(__x86_64__) || defined(_M_AMD64) || defined (_M_X64)
+ #if defined(__GNUC__)
+ // gcc desn't know cpuid would clobber ebx/rbx. Preseve it manually.
+ asm ("movq\t%%rbx, %%rsi\n\t"
+ "cpuid\n\t"
+ "xchgq\t%%rbx, %%rsi\n\t"
+ : "=a" (*rEAX),
+ "=S" (*rEBX),
+ "=c" (*rECX),
+ "=d" (*rEDX)
+ : "a" (value),
+ "c" (subleaf));
+ return false;
+ #elif defined(_MSC_VER)
+ // __cpuidex was added in MSVC++ 9.0 SP1
+ #if (_MSC_VER > 1500) || (_MSC_VER == 1500 && _MSC_FULL_VER >= 150030729)
+ int registers[4];
+ __cpuidex(registers, value, subleaf);
+ *rEAX = registers[0];
+ *rEBX = registers[1];
+ *rECX = registers[2];
+ *rEDX = registers[3];
+ return false;
+ #else
+ return true;
+ #endif
+ #else
+ return true;
+ #endif
+#elif defined(i386) || defined(__i386__) || defined(__x86__) || defined(_M_IX86)
+ #if defined(__GNUC__)
+ asm ("movl\t%%ebx, %%esi\n\t"
+ "cpuid\n\t"
+ "xchgl\t%%ebx, %%esi\n\t"
+ : "=a" (*rEAX),
+ "=S" (*rEBX),
+ "=c" (*rECX),
+ "=d" (*rEDX)
+ : "a" (value),
+ "c" (subleaf));
+ return false;
+ #elif defined(_MSC_VER)
+ __asm {
+ mov eax,value
+ mov ecx,subleaf
+ cpuid
+ mov esi,rEAX
+ mov dword ptr [esi],eax
+ mov esi,rEBX
+ mov dword ptr [esi],ebx
+ mov esi,rECX
+ mov dword ptr [esi],ecx
+ mov esi,rEDX
+ mov dword ptr [esi],edx
+ }
+ return false;
+ #else
+ return true;
+ #endif
+#else
+ return true;
+#endif
+}
+
+void X86_MC::DetectFamilyModel(unsigned EAX, unsigned &Family,
+ unsigned &Model) {
+ Family = (EAX >> 8) & 0xf; // Bits 8 - 11
+ Model = (EAX >> 4) & 0xf; // Bits 4 - 7
+ if (Family == 6 || Family == 0xf) {
+ if (Family == 0xf)
+ // Examine extended family ID if family ID is F.
+ Family += (EAX >> 20) & 0xff; // Bits 20 - 27
+ // Examine extended model ID if family ID is 6 or F.
+ Model += ((EAX >> 16) & 0xf) << 4; // Bits 16 - 19
+ }
+}
+
+unsigned X86_MC::getDwarfRegFlavour(StringRef TT, bool isEH) {
+ Triple TheTriple(TT);
+ if (TheTriple.getArch() == Triple::x86_64)
+ return DWARFFlavour::X86_64;
+
+ if (TheTriple.isOSDarwin())
+ return isEH ? DWARFFlavour::X86_32_DarwinEH : DWARFFlavour::X86_32_Generic;
+ if (TheTriple.getOS() == Triple::MinGW32 ||
+ TheTriple.getOS() == Triple::Cygwin)
+ // Unsupported by now, just quick fallback
+ return DWARFFlavour::X86_32_Generic;
+ return DWARFFlavour::X86_32_Generic;
+}
+
+void X86_MC::InitLLVM2SEHRegisterMapping(MCRegisterInfo *MRI) {
+ // FIXME: TableGen these.
+ for (unsigned Reg = X86::NoRegister+1; Reg < X86::NUM_TARGET_REGS; ++Reg) {
+ unsigned SEH = MRI->getEncodingValue(Reg);
+ MRI->mapLLVMRegToSEHReg(Reg, SEH);
+ }
+}
+
+MCSubtargetInfo *X86_MC::createX86MCSubtargetInfo(StringRef TT, StringRef CPU,
+ StringRef FS) {
+ std::string ArchFS = X86_MC::ParseX86Triple(TT);
+ if (!FS.empty()) {
+ if (!ArchFS.empty())
+ ArchFS = ArchFS + "," + FS.str();
+ else
+ ArchFS = FS;
+ }
+
+ std::string CPUName = CPU;
+ if (CPUName.empty()) {
+#if defined(i386) || defined(__i386__) || defined(__x86__) || defined(_M_IX86)\
+ || defined(__x86_64__) || defined(_M_AMD64) || defined (_M_X64)
+ CPUName = sys::getHostCPUName();
+#else
+ CPUName = "generic";
+#endif
+ }
+
+ MCSubtargetInfo *X = new MCSubtargetInfo();
+ InitX86MCSubtargetInfo(X, TT, CPUName, ArchFS);
+ return X;
+}
+
+static MCInstrInfo *createX86MCInstrInfo() {
+ MCInstrInfo *X = new MCInstrInfo();
+ InitX86MCInstrInfo(X);
+ return X;
+}
+
+static MCRegisterInfo *createX86MCRegisterInfo(StringRef TT) {
+ Triple TheTriple(TT);
+ unsigned RA = (TheTriple.getArch() == Triple::x86_64)
+ ? X86::RIP // Should have dwarf #16.
+ : X86::EIP; // Should have dwarf #8.
+
+ MCRegisterInfo *X = new MCRegisterInfo();
+ InitX86MCRegisterInfo(X, RA,
+ X86_MC::getDwarfRegFlavour(TT, false),
+ X86_MC::getDwarfRegFlavour(TT, true),
+ RA);
+ X86_MC::InitLLVM2SEHRegisterMapping(X);
+ return X;
+}
+
+static MCAsmInfo *createX86MCAsmInfo(const Target &T, StringRef TT) {
+ Triple TheTriple(TT);
+ bool is64Bit = TheTriple.getArch() == Triple::x86_64;
+
+ MCAsmInfo *MAI;
+ if (TheTriple.isOSDarwin() || TheTriple.getEnvironment() == Triple::MachO) {
+ if (is64Bit)
+ MAI = new X86_64MCAsmInfoDarwin(TheTriple);
+ else
+ MAI = new X86MCAsmInfoDarwin(TheTriple);
+ } else if (TheTriple.getEnvironment() == Triple::ELF) {
+ // Force the use of an ELF container.
+ MAI = new X86ELFMCAsmInfo(TheTriple);
+ } else if (TheTriple.getOS() == Triple::Win32) {
+ MAI = new X86MCAsmInfoMicrosoft(TheTriple);
+ } else if (TheTriple.getOS() == Triple::MinGW32 || TheTriple.getOS() == Triple::Cygwin) {
+ MAI = new X86MCAsmInfoGNUCOFF(TheTriple);
+ } else {
+ // The default is ELF.
+ MAI = new X86ELFMCAsmInfo(TheTriple);
+ }
+
+ // Initialize initial frame state.
+ // Calculate amount of bytes used for return address storing
+ int stackGrowth = is64Bit ? -8 : -4;
+
+ // Initial state of the frame pointer is esp+stackGrowth.
+ MachineLocation Dst(MachineLocation::VirtualFP);
+ MachineLocation Src(is64Bit ? X86::RSP : X86::ESP, stackGrowth);
+ MAI->addInitialFrameState(0, Dst, Src);
+
+ // Add return address to move list
+ MachineLocation CSDst(is64Bit ? X86::RSP : X86::ESP, stackGrowth);
+ MachineLocation CSSrc(is64Bit ? X86::RIP : X86::EIP);
+ MAI->addInitialFrameState(0, CSDst, CSSrc);
+
+ return MAI;
+}
+
+static MCCodeGenInfo *createX86MCCodeGenInfo(StringRef TT, Reloc::Model RM,
+ CodeModel::Model CM,
+ CodeGenOpt::Level OL) {
+ MCCodeGenInfo *X = new MCCodeGenInfo();
+
+ Triple T(TT);
+ bool is64Bit = T.getArch() == Triple::x86_64;
+
+ if (RM == Reloc::Default) {
+ // Darwin defaults to PIC in 64 bit mode and dynamic-no-pic in 32 bit mode.
+ // Win64 requires rip-rel addressing, thus we force it to PIC. Otherwise we
+ // use static relocation model by default.
+ if (T.isOSDarwin()) {
+ if (is64Bit)
+ RM = Reloc::PIC_;
+ else
+ RM = Reloc::DynamicNoPIC;
+ } else if (T.isOSWindows() && is64Bit)
+ RM = Reloc::PIC_;
+ else
+ RM = Reloc::Static;
+ }
+
+ // ELF and X86-64 don't have a distinct DynamicNoPIC model. DynamicNoPIC
+ // is defined as a model for code which may be used in static or dynamic
+ // executables but not necessarily a shared library. On X86-32 we just
+ // compile in -static mode, in x86-64 we use PIC.
+ if (RM == Reloc::DynamicNoPIC) {
+ if (is64Bit)
+ RM = Reloc::PIC_;
+ else if (!T.isOSDarwin())
+ RM = Reloc::Static;
+ }
+
+ // If we are on Darwin, disallow static relocation model in X86-64 mode, since
+ // the Mach-O file format doesn't support it.
+ if (RM == Reloc::Static && T.isOSDarwin() && is64Bit)
+ RM = Reloc::PIC_;
+
+ // For static codegen, if we're not already set, use Small codegen.
+ if (CM == CodeModel::Default)
+ CM = CodeModel::Small;
+ else if (CM == CodeModel::JITDefault)
+ // 64-bit JIT places everything in the same buffer except external funcs.
+ CM = is64Bit ? CodeModel::Large : CodeModel::Small;
+
+ X->InitMCCodeGenInfo(RM, CM, OL);
+ return X;
+}
+
+static MCStreamer *createMCStreamer(const Target &T, StringRef TT,
+ MCContext &Ctx, MCAsmBackend &MAB,
+ raw_ostream &_OS,
+ MCCodeEmitter *_Emitter,
+ bool RelaxAll,
+ bool NoExecStack) {
+ Triple TheTriple(TT);
+
+ if (TheTriple.isOSDarwin() || TheTriple.getEnvironment() == Triple::MachO)
+ return createMachOStreamer(Ctx, MAB, _OS, _Emitter, RelaxAll);
+
+ if (TheTriple.isOSWindows() && TheTriple.getEnvironment() != Triple::ELF)
+ return createWinCOFFStreamer(Ctx, MAB, *_Emitter, _OS, RelaxAll);
+
+ return createELFStreamer(Ctx, MAB, _OS, _Emitter, RelaxAll, NoExecStack);
+}
+
+static MCInstPrinter *createX86MCInstPrinter(const Target &T,
+ unsigned SyntaxVariant,
+ const MCAsmInfo &MAI,
+ const MCInstrInfo &MII,
+ const MCRegisterInfo &MRI,
+ const MCSubtargetInfo &STI) {
+ if (SyntaxVariant == 0)
+ return new X86ATTInstPrinter(MAI, MII, MRI);
+ if (SyntaxVariant == 1)
+ return new X86IntelInstPrinter(MAI, MII, MRI);
+ return 0;
+}
+
+static MCInstrAnalysis *createX86MCInstrAnalysis(const MCInstrInfo *Info) {
+ return new MCInstrAnalysis(Info);
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeX86TargetMC() {
+ // Register the MC asm info.
+ RegisterMCAsmInfoFn A(TheX86_32Target, createX86MCAsmInfo);
+ RegisterMCAsmInfoFn B(TheX86_64Target, createX86MCAsmInfo);
+
+ // Register the MC codegen info.
+ RegisterMCCodeGenInfoFn C(TheX86_32Target, createX86MCCodeGenInfo);
+ RegisterMCCodeGenInfoFn D(TheX86_64Target, createX86MCCodeGenInfo);
+
+ // Register the MC instruction info.
+ TargetRegistry::RegisterMCInstrInfo(TheX86_32Target, createX86MCInstrInfo);
+ TargetRegistry::RegisterMCInstrInfo(TheX86_64Target, createX86MCInstrInfo);
+
+ // Register the MC register info.
+ TargetRegistry::RegisterMCRegInfo(TheX86_32Target, createX86MCRegisterInfo);
+ TargetRegistry::RegisterMCRegInfo(TheX86_64Target, createX86MCRegisterInfo);
+
+ // Register the MC subtarget info.
+ TargetRegistry::RegisterMCSubtargetInfo(TheX86_32Target,
+ X86_MC::createX86MCSubtargetInfo);
+ TargetRegistry::RegisterMCSubtargetInfo(TheX86_64Target,
+ X86_MC::createX86MCSubtargetInfo);
+
+ // Register the MC instruction analyzer.
+ TargetRegistry::RegisterMCInstrAnalysis(TheX86_32Target,
+ createX86MCInstrAnalysis);
+ TargetRegistry::RegisterMCInstrAnalysis(TheX86_64Target,
+ createX86MCInstrAnalysis);
+
+ // Register the code emitter.
+ TargetRegistry::RegisterMCCodeEmitter(TheX86_32Target,
+ createX86MCCodeEmitter);
+ TargetRegistry::RegisterMCCodeEmitter(TheX86_64Target,
+ createX86MCCodeEmitter);
+
+ // Register the asm backend.
+ TargetRegistry::RegisterMCAsmBackend(TheX86_32Target,
+ createX86_32AsmBackend);
+ TargetRegistry::RegisterMCAsmBackend(TheX86_64Target,
+ createX86_64AsmBackend);
+
+ // Register the object streamer.
+ TargetRegistry::RegisterMCObjectStreamer(TheX86_32Target,
+ createMCStreamer);
+ TargetRegistry::RegisterMCObjectStreamer(TheX86_64Target,
+ createMCStreamer);
+
+ // Register the MCInstPrinter.
+ TargetRegistry::RegisterMCInstPrinter(TheX86_32Target,
+ createX86MCInstPrinter);
+ TargetRegistry::RegisterMCInstPrinter(TheX86_64Target,
+ createX86MCInstPrinter);
+}
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
new file mode 100644
index 0000000..981aa1a
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
@@ -0,0 +1,114 @@
+//===-- X86MCTargetDesc.h - X86 Target Descriptions -------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides X86 specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86MCTARGETDESC_H
+#define X86MCTARGETDESC_H
+
+#include "llvm/Support/DataTypes.h"
+#include <string>
+
+namespace llvm {
+class MCAsmBackend;
+class MCCodeEmitter;
+class MCContext;
+class MCInstrInfo;
+class MCObjectWriter;
+class MCRegisterInfo;
+class MCSubtargetInfo;
+class Target;
+class StringRef;
+class raw_ostream;
+
+extern Target TheX86_32Target, TheX86_64Target;
+
+/// DWARFFlavour - Flavour of dwarf regnumbers
+///
+namespace DWARFFlavour {
+ enum {
+ X86_64 = 0, X86_32_DarwinEH = 1, X86_32_Generic = 2
+ };
+}
+
+/// N86 namespace - Native X86 register numbers
+///
+namespace N86 {
+ enum {
+ EAX = 0, ECX = 1, EDX = 2, EBX = 3, ESP = 4, EBP = 5, ESI = 6, EDI = 7
+ };
+}
+
+namespace X86_MC {
+ std::string ParseX86Triple(StringRef TT);
+
+ /// GetCpuIDAndInfo - Execute the specified cpuid and return the 4 values in
+ /// the specified arguments. If we can't run cpuid on the host, return true.
+ bool GetCpuIDAndInfo(unsigned value, unsigned *rEAX,
+ unsigned *rEBX, unsigned *rECX, unsigned *rEDX);
+ /// GetCpuIDAndInfoEx - Execute the specified cpuid with subleaf and return
+ /// the 4 values in the specified arguments. If we can't run cpuid on the
+ /// host, return true.
+ bool GetCpuIDAndInfoEx(unsigned value, unsigned subleaf, unsigned *rEAX,
+ unsigned *rEBX, unsigned *rECX, unsigned *rEDX);
+
+ void DetectFamilyModel(unsigned EAX, unsigned &Family, unsigned &Model);
+
+ unsigned getDwarfRegFlavour(StringRef TT, bool isEH);
+
+ void InitLLVM2SEHRegisterMapping(MCRegisterInfo *MRI);
+
+ /// createX86MCSubtargetInfo - Create a X86 MCSubtargetInfo instance.
+ /// This is exposed so Asm parser, etc. do not need to go through
+ /// TargetRegistry.
+ MCSubtargetInfo *createX86MCSubtargetInfo(StringRef TT, StringRef CPU,
+ StringRef FS);
+}
+
+MCCodeEmitter *createX86MCCodeEmitter(const MCInstrInfo &MCII,
+ const MCRegisterInfo &MRI,
+ const MCSubtargetInfo &STI,
+ MCContext &Ctx);
+
+MCAsmBackend *createX86_32AsmBackend(const Target &T, StringRef TT, StringRef CPU);
+MCAsmBackend *createX86_64AsmBackend(const Target &T, StringRef TT, StringRef CPU);
+
+/// createX86MachObjectWriter - Construct an X86 Mach-O object writer.
+MCObjectWriter *createX86MachObjectWriter(raw_ostream &OS,
+ bool Is64Bit,
+ uint32_t CPUType,
+ uint32_t CPUSubtype);
+
+/// createX86ELFObjectWriter - Construct an X86 ELF object writer.
+MCObjectWriter *createX86ELFObjectWriter(raw_ostream &OS,
+ bool IsELF64,
+ uint8_t OSABI,
+ uint16_t EMachine);
+/// createX86WinCOFFObjectWriter - Construct an X86 Win COFF object writer.
+MCObjectWriter *createX86WinCOFFObjectWriter(raw_ostream &OS, bool Is64Bit);
+} // End llvm namespace
+
+
+// Defines symbolic names for X86 registers. This defines a mapping from
+// register name to register number.
+//
+#define GET_REGINFO_ENUM
+#include "X86GenRegisterInfo.inc"
+
+// Defines symbolic names for the X86 instructions.
+//
+#define GET_INSTRINFO_ENUM
+#include "X86GenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_ENUM
+#include "X86GenSubtargetInfo.inc"
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
new file mode 100644
index 0000000..64f005c
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
@@ -0,0 +1,586 @@
+//===-- X86MachObjectWriter.cpp - X86 Mach-O Writer -----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "MCTargetDesc/X86FixupKinds.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCAsmLayout.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCMachObjectWriter.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Object/MachOFormat.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+
+using namespace llvm;
+using namespace llvm::object;
+
+namespace {
+class X86MachObjectWriter : public MCMachObjectTargetWriter {
+ bool RecordScatteredRelocation(MachObjectWriter *Writer,
+ const MCAssembler &Asm,
+ const MCAsmLayout &Layout,
+ const MCFragment *Fragment,
+ const MCFixup &Fixup,
+ MCValue Target,
+ unsigned Log2Size,
+ uint64_t &FixedValue);
+ void RecordTLVPRelocation(MachObjectWriter *Writer,
+ const MCAssembler &Asm,
+ const MCAsmLayout &Layout,
+ const MCFragment *Fragment,
+ const MCFixup &Fixup,
+ MCValue Target,
+ uint64_t &FixedValue);
+
+ void RecordX86Relocation(MachObjectWriter *Writer,
+ const MCAssembler &Asm,
+ const MCAsmLayout &Layout,
+ const MCFragment *Fragment,
+ const MCFixup &Fixup,
+ MCValue Target,
+ uint64_t &FixedValue);
+ void RecordX86_64Relocation(MachObjectWriter *Writer,
+ const MCAssembler &Asm,
+ const MCAsmLayout &Layout,
+ const MCFragment *Fragment,
+ const MCFixup &Fixup,
+ MCValue Target,
+ uint64_t &FixedValue);
+public:
+ X86MachObjectWriter(bool Is64Bit, uint32_t CPUType,
+ uint32_t CPUSubtype)
+ : MCMachObjectTargetWriter(Is64Bit, CPUType, CPUSubtype,
+ /*UseAggressiveSymbolFolding=*/Is64Bit) {}
+
+ void RecordRelocation(MachObjectWriter *Writer,
+ const MCAssembler &Asm, const MCAsmLayout &Layout,
+ const MCFragment *Fragment, const MCFixup &Fixup,
+ MCValue Target, uint64_t &FixedValue) {
+ if (Writer->is64Bit())
+ RecordX86_64Relocation(Writer, Asm, Layout, Fragment, Fixup, Target,
+ FixedValue);
+ else
+ RecordX86Relocation(Writer, Asm, Layout, Fragment, Fixup, Target,
+ FixedValue);
+ }
+};
+}
+
+static bool isFixupKindRIPRel(unsigned Kind) {
+ return Kind == X86::reloc_riprel_4byte ||
+ Kind == X86::reloc_riprel_4byte_movq_load;
+}
+
+static unsigned getFixupKindLog2Size(unsigned Kind) {
+ switch (Kind) {
+ default:
+ llvm_unreachable("invalid fixup kind!");
+ case FK_PCRel_1:
+ case FK_Data_1: return 0;
+ case FK_PCRel_2:
+ case FK_Data_2: return 1;
+ case FK_PCRel_4:
+ // FIXME: Remove these!!!
+ case X86::reloc_riprel_4byte:
+ case X86::reloc_riprel_4byte_movq_load:
+ case X86::reloc_signed_4byte:
+ case FK_Data_4: return 2;
+ case FK_Data_8: return 3;
+ }
+}
+
+void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer,
+ const MCAssembler &Asm,
+ const MCAsmLayout &Layout,
+ const MCFragment *Fragment,
+ const MCFixup &Fixup,
+ MCValue Target,
+ uint64_t &FixedValue) {
+ unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind());
+ unsigned IsRIPRel = isFixupKindRIPRel(Fixup.getKind());
+ unsigned Log2Size = getFixupKindLog2Size(Fixup.getKind());
+
+ // See <reloc.h>.
+ uint32_t FixupOffset =
+ Layout.getFragmentOffset(Fragment) + Fixup.getOffset();
+ uint32_t FixupAddress =
+ Writer->getFragmentAddress(Fragment, Layout) + Fixup.getOffset();
+ int64_t Value = 0;
+ unsigned Index = 0;
+ unsigned IsExtern = 0;
+ unsigned Type = 0;
+
+ Value = Target.getConstant();
+
+ if (IsPCRel) {
+ // Compensate for the relocation offset, Darwin x86_64 relocations only have
+ // the addend and appear to have attempted to define it to be the actual
+ // expression addend without the PCrel bias. However, instructions with data
+ // following the relocation are not accommodated for (see comment below
+ // regarding SIGNED{1,2,4}), so it isn't exactly that either.
+ Value += 1LL << Log2Size;
+ }
+
+ if (Target.isAbsolute()) { // constant
+ // SymbolNum of 0 indicates the absolute section.
+ Type = macho::RIT_X86_64_Unsigned;
+ Index = 0;
+
+ // FIXME: I believe this is broken, I don't think the linker can understand
+ // it. I think it would require a local relocation, but I'm not sure if that
+ // would work either. The official way to get an absolute PCrel relocation
+ // is to use an absolute symbol (which we don't support yet).
+ if (IsPCRel) {
+ IsExtern = 1;
+ Type = macho::RIT_X86_64_Branch;
+ }
+ } else if (Target.getSymB()) { // A - B + constant
+ const MCSymbol *A = &Target.getSymA()->getSymbol();
+ MCSymbolData &A_SD = Asm.getSymbolData(*A);
+ const MCSymbolData *A_Base = Asm.getAtom(&A_SD);
+
+ const MCSymbol *B = &Target.getSymB()->getSymbol();
+ MCSymbolData &B_SD = Asm.getSymbolData(*B);
+ const MCSymbolData *B_Base = Asm.getAtom(&B_SD);
+
+ // Neither symbol can be modified.
+ if (Target.getSymA()->getKind() != MCSymbolRefExpr::VK_None ||
+ Target.getSymB()->getKind() != MCSymbolRefExpr::VK_None)
+ report_fatal_error("unsupported relocation of modified symbol");
+
+ // We don't support PCrel relocations of differences. Darwin 'as' doesn't
+ // implement most of these correctly.
+ if (IsPCRel)
+ report_fatal_error("unsupported pc-relative relocation of difference");
+
+ // The support for the situation where one or both of the symbols would
+ // require a local relocation is handled just like if the symbols were
+ // external. This is certainly used in the case of debug sections where the
+ // section has only temporary symbols and thus the symbols don't have base
+ // symbols. This is encoded using the section ordinal and non-extern
+ // relocation entries.
+
+ // Darwin 'as' doesn't emit correct relocations for this (it ends up with a
+ // single SIGNED relocation); reject it for now. Except the case where both
+ // symbols don't have a base, equal but both NULL.
+ if (A_Base == B_Base && A_Base)
+ report_fatal_error("unsupported relocation with identical base");
+
+ Value += Writer->getSymbolAddress(&A_SD, Layout) -
+ (A_Base == NULL ? 0 : Writer->getSymbolAddress(A_Base, Layout));
+ Value -= Writer->getSymbolAddress(&B_SD, Layout) -
+ (B_Base == NULL ? 0 : Writer->getSymbolAddress(B_Base, Layout));
+
+ if (A_Base) {
+ Index = A_Base->getIndex();
+ IsExtern = 1;
+ }
+ else {
+ Index = A_SD.getFragment()->getParent()->getOrdinal() + 1;
+ IsExtern = 0;
+ }
+ Type = macho::RIT_X86_64_Unsigned;
+
+ macho::RelocationEntry MRE;
+ MRE.Word0 = FixupOffset;
+ MRE.Word1 = ((Index << 0) |
+ (IsPCRel << 24) |
+ (Log2Size << 25) |
+ (IsExtern << 27) |
+ (Type << 28));
+ Writer->addRelocation(Fragment->getParent(), MRE);
+
+ if (B_Base) {
+ Index = B_Base->getIndex();
+ IsExtern = 1;
+ }
+ else {
+ Index = B_SD.getFragment()->getParent()->getOrdinal() + 1;
+ IsExtern = 0;
+ }
+ Type = macho::RIT_X86_64_Subtractor;
+ } else {
+ const MCSymbol *Symbol = &Target.getSymA()->getSymbol();
+ MCSymbolData &SD = Asm.getSymbolData(*Symbol);
+ const MCSymbolData *Base = Asm.getAtom(&SD);
+
+ // Relocations inside debug sections always use local relocations when
+ // possible. This seems to be done because the debugger doesn't fully
+ // understand x86_64 relocation entries, and expects to find values that
+ // have already been fixed up.
+ if (Symbol->isInSection()) {
+ const MCSectionMachO &Section = static_cast<const MCSectionMachO&>(
+ Fragment->getParent()->getSection());
+ if (Section.hasAttribute(MCSectionMachO::S_ATTR_DEBUG))
+ Base = 0;
+ }
+
+ // x86_64 almost always uses external relocations, except when there is no
+ // symbol to use as a base address (a local symbol with no preceding
+ // non-local symbol).
+ if (Base) {
+ Index = Base->getIndex();
+ IsExtern = 1;
+
+ // Add the local offset, if needed.
+ if (Base != &SD)
+ Value += Layout.getSymbolOffset(&SD) - Layout.getSymbolOffset(Base);
+ } else if (Symbol->isInSection() && !Symbol->isVariable()) {
+ // The index is the section ordinal (1-based).
+ Index = SD.getFragment()->getParent()->getOrdinal() + 1;
+ IsExtern = 0;
+ Value += Writer->getSymbolAddress(&SD, Layout);
+
+ if (IsPCRel)
+ Value -= FixupAddress + (1 << Log2Size);
+ } else if (Symbol->isVariable()) {
+ const MCExpr *Value = Symbol->getVariableValue();
+ int64_t Res;
+ bool isAbs = Value->EvaluateAsAbsolute(Res, Layout,
+ Writer->getSectionAddressMap());
+ if (isAbs) {
+ FixedValue = Res;
+ return;
+ } else {
+ report_fatal_error("unsupported relocation of variable '" +
+ Symbol->getName() + "'");
+ }
+ } else {
+ report_fatal_error("unsupported relocation of undefined symbol '" +
+ Symbol->getName() + "'");
+ }
+
+ MCSymbolRefExpr::VariantKind Modifier = Target.getSymA()->getKind();
+ if (IsPCRel) {
+ if (IsRIPRel) {
+ if (Modifier == MCSymbolRefExpr::VK_GOTPCREL) {
+ // x86_64 distinguishes movq foo@GOTPCREL so that the linker can
+ // rewrite the movq to an leaq at link time if the symbol ends up in
+ // the same linkage unit.
+ if (unsigned(Fixup.getKind()) == X86::reloc_riprel_4byte_movq_load)
+ Type = macho::RIT_X86_64_GOTLoad;
+ else
+ Type = macho::RIT_X86_64_GOT;
+ } else if (Modifier == MCSymbolRefExpr::VK_TLVP) {
+ Type = macho::RIT_X86_64_TLV;
+ } else if (Modifier != MCSymbolRefExpr::VK_None) {
+ report_fatal_error("unsupported symbol modifier in relocation");
+ } else {
+ Type = macho::RIT_X86_64_Signed;
+
+ // The Darwin x86_64 relocation format has a problem where it cannot
+ // encode an address (L<foo> + <constant>) which is outside the atom
+ // containing L<foo>. Generally, this shouldn't occur but it does
+ // happen when we have a RIPrel instruction with data following the
+ // relocation entry (e.g., movb $012, L0(%rip)). Even with the PCrel
+ // adjustment Darwin x86_64 uses, the offset is still negative and the
+ // linker has no way to recognize this.
+ //
+ // To work around this, Darwin uses several special relocation types
+ // to indicate the offsets. However, the specification or
+ // implementation of these seems to also be incomplete; they should
+ // adjust the addend as well based on the actual encoded instruction
+ // (the additional bias), but instead appear to just look at the final
+ // offset.
+ switch (-(Target.getConstant() + (1LL << Log2Size))) {
+ case 1: Type = macho::RIT_X86_64_Signed1; break;
+ case 2: Type = macho::RIT_X86_64_Signed2; break;
+ case 4: Type = macho::RIT_X86_64_Signed4; break;
+ }
+ }
+ } else {
+ if (Modifier != MCSymbolRefExpr::VK_None)
+ report_fatal_error("unsupported symbol modifier in branch "
+ "relocation");
+
+ Type = macho::RIT_X86_64_Branch;
+ }
+ } else {
+ if (Modifier == MCSymbolRefExpr::VK_GOT) {
+ Type = macho::RIT_X86_64_GOT;
+ } else if (Modifier == MCSymbolRefExpr::VK_GOTPCREL) {
+ // GOTPCREL is allowed as a modifier on non-PCrel instructions, in which
+ // case all we do is set the PCrel bit in the relocation entry; this is
+ // used with exception handling, for example. The source is required to
+ // include any necessary offset directly.
+ Type = macho::RIT_X86_64_GOT;
+ IsPCRel = 1;
+ } else if (Modifier == MCSymbolRefExpr::VK_TLVP) {
+ report_fatal_error("TLVP symbol modifier should have been rip-rel");
+ } else if (Modifier != MCSymbolRefExpr::VK_None)
+ report_fatal_error("unsupported symbol modifier in relocation");
+ else
+ Type = macho::RIT_X86_64_Unsigned;
+ }
+ }
+
+ // x86_64 always writes custom values into the fixups.
+ FixedValue = Value;
+
+ // struct relocation_info (8 bytes)
+ macho::RelocationEntry MRE;
+ MRE.Word0 = FixupOffset;
+ MRE.Word1 = ((Index << 0) |
+ (IsPCRel << 24) |
+ (Log2Size << 25) |
+ (IsExtern << 27) |
+ (Type << 28));
+ Writer->addRelocation(Fragment->getParent(), MRE);
+}
+
+bool X86MachObjectWriter::RecordScatteredRelocation(MachObjectWriter *Writer,
+ const MCAssembler &Asm,
+ const MCAsmLayout &Layout,
+ const MCFragment *Fragment,
+ const MCFixup &Fixup,
+ MCValue Target,
+ unsigned Log2Size,
+ uint64_t &FixedValue) {
+ uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset();
+ unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind());
+ unsigned Type = macho::RIT_Vanilla;
+
+ // See <reloc.h>.
+ const MCSymbol *A = &Target.getSymA()->getSymbol();
+ MCSymbolData *A_SD = &Asm.getSymbolData(*A);
+
+ if (!A_SD->getFragment())
+ report_fatal_error("symbol '" + A->getName() +
+ "' can not be undefined in a subtraction expression");
+
+ uint32_t Value = Writer->getSymbolAddress(A_SD, Layout);
+ uint64_t SecAddr = Writer->getSectionAddress(A_SD->getFragment()->getParent());
+ FixedValue += SecAddr;
+ uint32_t Value2 = 0;
+
+ if (const MCSymbolRefExpr *B = Target.getSymB()) {
+ MCSymbolData *B_SD = &Asm.getSymbolData(B->getSymbol());
+
+ if (!B_SD->getFragment())
+ report_fatal_error("symbol '" + B->getSymbol().getName() +
+ "' can not be undefined in a subtraction expression");
+
+ // Select the appropriate difference relocation type.
+ //
+ // Note that there is no longer any semantic difference between these two
+ // relocation types from the linkers point of view, this is done solely for
+ // pedantic compatibility with 'as'.
+ Type = A_SD->isExternal() ? (unsigned)macho::RIT_Difference :
+ (unsigned)macho::RIT_Generic_LocalDifference;
+ Value2 = Writer->getSymbolAddress(B_SD, Layout);
+ FixedValue -= Writer->getSectionAddress(B_SD->getFragment()->getParent());
+ }
+
+ // Relocations are written out in reverse order, so the PAIR comes first.
+ if (Type == macho::RIT_Difference ||
+ Type == macho::RIT_Generic_LocalDifference) {
+ // If the offset is too large to fit in a scattered relocation,
+ // we're hosed. It's an unfortunate limitation of the MachO format.
+ if (FixupOffset > 0xffffff) {
+ char Buffer[32];
+ format("0x%x", FixupOffset).print(Buffer, sizeof(Buffer));
+ Asm.getContext().FatalError(Fixup.getLoc(),
+ Twine("Section too large, can't encode "
+ "r_address (") + Buffer +
+ ") into 24 bits of scattered "
+ "relocation entry.");
+ llvm_unreachable("fatal error returned?!");
+ }
+
+ macho::RelocationEntry MRE;
+ MRE.Word0 = ((0 << 0) |
+ (macho::RIT_Pair << 24) |
+ (Log2Size << 28) |
+ (IsPCRel << 30) |
+ macho::RF_Scattered);
+ MRE.Word1 = Value2;
+ Writer->addRelocation(Fragment->getParent(), MRE);
+ } else {
+ // If the offset is more than 24-bits, it won't fit in a scattered
+ // relocation offset field, so we fall back to using a non-scattered
+ // relocation. This is a bit risky, as if the offset reaches out of
+ // the block and the linker is doing scattered loading on this
+ // symbol, things can go badly.
+ //
+ // Required for 'as' compatibility.
+ if (FixupOffset > 0xffffff)
+ return false;
+ }
+
+ macho::RelocationEntry MRE;
+ MRE.Word0 = ((FixupOffset << 0) |
+ (Type << 24) |
+ (Log2Size << 28) |
+ (IsPCRel << 30) |
+ macho::RF_Scattered);
+ MRE.Word1 = Value;
+ Writer->addRelocation(Fragment->getParent(), MRE);
+ return true;
+}
+
+void X86MachObjectWriter::RecordTLVPRelocation(MachObjectWriter *Writer,
+ const MCAssembler &Asm,
+ const MCAsmLayout &Layout,
+ const MCFragment *Fragment,
+ const MCFixup &Fixup,
+ MCValue Target,
+ uint64_t &FixedValue) {
+ assert(Target.getSymA()->getKind() == MCSymbolRefExpr::VK_TLVP &&
+ !is64Bit() &&
+ "Should only be called with a 32-bit TLVP relocation!");
+
+ unsigned Log2Size = getFixupKindLog2Size(Fixup.getKind());
+ uint32_t Value = Layout.getFragmentOffset(Fragment)+Fixup.getOffset();
+ unsigned IsPCRel = 0;
+
+ // Get the symbol data.
+ MCSymbolData *SD_A = &Asm.getSymbolData(Target.getSymA()->getSymbol());
+ unsigned Index = SD_A->getIndex();
+
+ // We're only going to have a second symbol in pic mode and it'll be a
+ // subtraction from the picbase. For 32-bit pic the addend is the difference
+ // between the picbase and the next address. For 32-bit static the addend is
+ // zero.
+ if (Target.getSymB()) {
+ // If this is a subtraction then we're pcrel.
+ uint32_t FixupAddress =
+ Writer->getFragmentAddress(Fragment, Layout) + Fixup.getOffset();
+ MCSymbolData *SD_B = &Asm.getSymbolData(Target.getSymB()->getSymbol());
+ IsPCRel = 1;
+ FixedValue = (FixupAddress - Writer->getSymbolAddress(SD_B, Layout) +
+ Target.getConstant());
+ FixedValue += 1ULL << Log2Size;
+ } else {
+ FixedValue = 0;
+ }
+
+ // struct relocation_info (8 bytes)
+ macho::RelocationEntry MRE;
+ MRE.Word0 = Value;
+ MRE.Word1 = ((Index << 0) |
+ (IsPCRel << 24) |
+ (Log2Size << 25) |
+ (1 << 27) | // Extern
+ (macho::RIT_Generic_TLV << 28)); // Type
+ Writer->addRelocation(Fragment->getParent(), MRE);
+}
+
+void X86MachObjectWriter::RecordX86Relocation(MachObjectWriter *Writer,
+ const MCAssembler &Asm,
+ const MCAsmLayout &Layout,
+ const MCFragment *Fragment,
+ const MCFixup &Fixup,
+ MCValue Target,
+ uint64_t &FixedValue) {
+ unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind());
+ unsigned Log2Size = getFixupKindLog2Size(Fixup.getKind());
+
+ // If this is a 32-bit TLVP reloc it's handled a bit differently.
+ if (Target.getSymA() &&
+ Target.getSymA()->getKind() == MCSymbolRefExpr::VK_TLVP) {
+ RecordTLVPRelocation(Writer, Asm, Layout, Fragment, Fixup, Target,
+ FixedValue);
+ return;
+ }
+
+ // If this is a difference or a defined symbol plus an offset, then we need a
+ // scattered relocation entry. Differences always require scattered
+ // relocations.
+ if (Target.getSymB()) {
+ RecordScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup,
+ Target, Log2Size, FixedValue);
+ return;
+ }
+
+ // Get the symbol data, if any.
+ MCSymbolData *SD = 0;
+ if (Target.getSymA())
+ SD = &Asm.getSymbolData(Target.getSymA()->getSymbol());
+
+ // If this is an internal relocation with an offset, it also needs a scattered
+ // relocation entry.
+ uint32_t Offset = Target.getConstant();
+ if (IsPCRel)
+ Offset += 1 << Log2Size;
+ // Try to record the scattered relocation if needed. Fall back to non
+ // scattered if necessary (see comments in RecordScatteredRelocation()
+ // for details).
+ if (Offset && SD && !Writer->doesSymbolRequireExternRelocation(SD) &&
+ RecordScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup,
+ Target, Log2Size, FixedValue))
+ return;
+
+ // See <reloc.h>.
+ uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset();
+ unsigned Index = 0;
+ unsigned IsExtern = 0;
+ unsigned Type = 0;
+
+ if (Target.isAbsolute()) { // constant
+ // SymbolNum of 0 indicates the absolute section.
+ //
+ // FIXME: Currently, these are never generated (see code below). I cannot
+ // find a case where they are actually emitted.
+ Type = macho::RIT_Vanilla;
+ } else {
+ // Resolve constant variables.
+ if (SD->getSymbol().isVariable()) {
+ int64_t Res;
+ if (SD->getSymbol().getVariableValue()->EvaluateAsAbsolute(
+ Res, Layout, Writer->getSectionAddressMap())) {
+ FixedValue = Res;
+ return;
+ }
+ }
+
+ // Check whether we need an external or internal relocation.
+ if (Writer->doesSymbolRequireExternRelocation(SD)) {
+ IsExtern = 1;
+ Index = SD->getIndex();
+ // For external relocations, make sure to offset the fixup value to
+ // compensate for the addend of the symbol address, if it was
+ // undefined. This occurs with weak definitions, for example.
+ if (!SD->Symbol->isUndefined())
+ FixedValue -= Layout.getSymbolOffset(SD);
+ } else {
+ // The index is the section ordinal (1-based).
+ const MCSectionData &SymSD = Asm.getSectionData(
+ SD->getSymbol().getSection());
+ Index = SymSD.getOrdinal() + 1;
+ FixedValue += Writer->getSectionAddress(&SymSD);
+ }
+ if (IsPCRel)
+ FixedValue -= Writer->getSectionAddress(Fragment->getParent());
+
+ Type = macho::RIT_Vanilla;
+ }
+
+ // struct relocation_info (8 bytes)
+ macho::RelocationEntry MRE;
+ MRE.Word0 = FixupOffset;
+ MRE.Word1 = ((Index << 0) |
+ (IsPCRel << 24) |
+ (Log2Size << 25) |
+ (IsExtern << 27) |
+ (Type << 28));
+ Writer->addRelocation(Fragment->getParent(), MRE);
+}
+
+MCObjectWriter *llvm::createX86MachObjectWriter(raw_ostream &OS,
+ bool Is64Bit,
+ uint32_t CPUType,
+ uint32_t CPUSubtype) {
+ return createMachObjectWriter(new X86MachObjectWriter(Is64Bit,
+ CPUType,
+ CPUSubtype),
+ OS, /*IsLittleEndian=*/true);
+}
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
new file mode 100644
index 0000000..ed64a32
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
@@ -0,0 +1,79 @@
+//===-- X86WinCOFFObjectWriter.cpp - X86 Win COFF Writer ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/X86FixupKinds.h"
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/MC/MCWinCOFFObjectWriter.h"
+#include "llvm/Support/COFF.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+
+namespace llvm {
+ class MCObjectWriter;
+}
+
+namespace {
+ class X86WinCOFFObjectWriter : public MCWinCOFFObjectTargetWriter {
+ const bool Is64Bit;
+
+ public:
+ X86WinCOFFObjectWriter(bool Is64Bit_);
+ ~X86WinCOFFObjectWriter();
+
+ virtual unsigned getRelocType(const MCValue &Target,
+ const MCFixup &Fixup,
+ bool IsCrossSection) const LLVM_OVERRIDE;
+ };
+}
+
+X86WinCOFFObjectWriter::X86WinCOFFObjectWriter(bool Is64Bit_)
+ : MCWinCOFFObjectTargetWriter(Is64Bit_ ? COFF::IMAGE_FILE_MACHINE_AMD64 :
+ COFF::IMAGE_FILE_MACHINE_I386),
+ Is64Bit(Is64Bit_) {}
+
+X86WinCOFFObjectWriter::~X86WinCOFFObjectWriter() {}
+
+unsigned X86WinCOFFObjectWriter::getRelocType(const MCValue &Target,
+ const MCFixup &Fixup,
+ bool IsCrossSection) const {
+ unsigned FixupKind = IsCrossSection ? FK_PCRel_4 : Fixup.getKind();
+
+ MCSymbolRefExpr::VariantKind Modifier = Target.isAbsolute() ?
+ MCSymbolRefExpr::VK_None : Target.getSymA()->getKind();
+
+ switch (FixupKind) {
+ case FK_PCRel_4:
+ case X86::reloc_riprel_4byte:
+ case X86::reloc_riprel_4byte_movq_load:
+ return Is64Bit ? COFF::IMAGE_REL_AMD64_REL32 : COFF::IMAGE_REL_I386_REL32;
+ case FK_Data_4:
+ case X86::reloc_signed_4byte:
+ if (Modifier == MCSymbolRefExpr::VK_COFF_IMGREL32)
+ return Is64Bit ? COFF::IMAGE_REL_AMD64_ADDR32NB :
+ COFF::IMAGE_REL_I386_DIR32NB;
+ return Is64Bit ? COFF::IMAGE_REL_AMD64_ADDR32 : COFF::IMAGE_REL_I386_DIR32;
+ case FK_Data_8:
+ if (Is64Bit)
+ return COFF::IMAGE_REL_AMD64_ADDR64;
+ llvm_unreachable("unsupported relocation type");
+ case FK_SecRel_4:
+ return Is64Bit ? COFF::IMAGE_REL_AMD64_SECREL : COFF::IMAGE_REL_I386_SECREL;
+ default:
+ llvm_unreachable("unsupported relocation type");
+ }
+}
+
+MCObjectWriter *llvm::createX86WinCOFFObjectWriter(raw_ostream &OS,
+ bool Is64Bit) {
+ MCWinCOFFObjectTargetWriter *MOTW = new X86WinCOFFObjectWriter(Is64Bit);
+ return createWinCOFFObjectWriter(MOTW, OS);
+}
diff --git a/contrib/llvm/lib/Target/X86/TargetInfo/X86TargetInfo.cpp b/contrib/llvm/lib/Target/X86/TargetInfo/X86TargetInfo.cpp
new file mode 100644
index 0000000..815d235
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/TargetInfo/X86TargetInfo.cpp
@@ -0,0 +1,23 @@
+//===-- X86TargetInfo.cpp - X86 Target Implementation ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+Target llvm::TheX86_32Target, llvm::TheX86_64Target;
+
+extern "C" void LLVMInitializeX86TargetInfo() {
+ RegisterTarget<Triple::x86, /*HasJIT=*/true>
+ X(TheX86_32Target, "x86", "32-bit X86: Pentium-Pro and above");
+
+ RegisterTarget<Triple::x86_64, /*HasJIT=*/true>
+ Y(TheX86_64Target, "x86-64", "64-bit X86: EM64T and AMD64");
+}
diff --git a/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp
new file mode 100644
index 0000000..bbd4904
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp
@@ -0,0 +1,217 @@
+//===-- X86ShuffleDecode.cpp - X86 shuffle decode logic -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Define several functions to decode x86 specific shuffle semantics into a
+// generic vector mask.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86ShuffleDecode.h"
+
+//===----------------------------------------------------------------------===//
+// Vector Mask Decoding
+//===----------------------------------------------------------------------===//
+
+namespace llvm {
+
+void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
+ // Defaults the copying the dest value.
+ ShuffleMask.push_back(0);
+ ShuffleMask.push_back(1);
+ ShuffleMask.push_back(2);
+ ShuffleMask.push_back(3);
+
+ // Decode the immediate.
+ unsigned ZMask = Imm & 15;
+ unsigned CountD = (Imm >> 4) & 3;
+ unsigned CountS = (Imm >> 6) & 3;
+
+ // CountS selects which input element to use.
+ unsigned InVal = 4+CountS;
+ // CountD specifies which element of destination to update.
+ ShuffleMask[CountD] = InVal;
+ // ZMask zaps values, potentially overriding the CountD elt.
+ if (ZMask & 1) ShuffleMask[0] = SM_SentinelZero;
+ if (ZMask & 2) ShuffleMask[1] = SM_SentinelZero;
+ if (ZMask & 4) ShuffleMask[2] = SM_SentinelZero;
+ if (ZMask & 8) ShuffleMask[3] = SM_SentinelZero;
+}
+
+// <3,1> or <6,7,2,3>
+void DecodeMOVHLPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask) {
+ for (unsigned i = NElts/2; i != NElts; ++i)
+ ShuffleMask.push_back(NElts+i);
+
+ for (unsigned i = NElts/2; i != NElts; ++i)
+ ShuffleMask.push_back(i);
+}
+
+// <0,2> or <0,1,4,5>
+void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask) {
+ for (unsigned i = 0; i != NElts/2; ++i)
+ ShuffleMask.push_back(i);
+
+ for (unsigned i = 0; i != NElts/2; ++i)
+ ShuffleMask.push_back(NElts+i);
+}
+
+void DecodePALIGNRMask(MVT VT, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask) {
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned Offset = Imm * (VT.getVectorElementType().getSizeInBits() / 8);
+
+ unsigned NumLanes = VT.getSizeInBits() / 128;
+ unsigned NumLaneElts = NumElts / NumLanes;
+
+ for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+ for (unsigned i = 0; i != NumLaneElts; ++i) {
+ unsigned Base = i + Offset;
+ // if i+offset is out of this lane then we actually need the other source
+ if (Base >= NumLaneElts) Base += NumElts - NumLaneElts;
+ ShuffleMask.push_back(Base + l);
+ }
+ }
+}
+
+/// DecodePSHUFMask - This decodes the shuffle masks for pshufd, and vpermilp*.
+/// VT indicates the type of the vector allowing it to handle different
+/// datatypes and vector widths.
+void DecodePSHUFMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
+ unsigned NumElts = VT.getVectorNumElements();
+
+ unsigned NumLanes = VT.getSizeInBits() / 128;
+ unsigned NumLaneElts = NumElts / NumLanes;
+
+ unsigned NewImm = Imm;
+ for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+ for (unsigned i = 0; i != NumLaneElts; ++i) {
+ ShuffleMask.push_back(NewImm % NumLaneElts + l);
+ NewImm /= NumLaneElts;
+ }
+ if (NumLaneElts == 4) NewImm = Imm; // reload imm
+ }
+}
+
+void DecodePSHUFHWMask(MVT VT, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask) {
+ unsigned NumElts = VT.getVectorNumElements();
+
+ for (unsigned l = 0; l != NumElts; l += 8) {
+ unsigned NewImm = Imm;
+ for (unsigned i = 0, e = 4; i != e; ++i) {
+ ShuffleMask.push_back(l + i);
+ }
+ for (unsigned i = 4, e = 8; i != e; ++i) {
+ ShuffleMask.push_back(l + 4 + (NewImm & 3));
+ NewImm >>= 2;
+ }
+ }
+}
+
+void DecodePSHUFLWMask(MVT VT, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask) {
+ unsigned NumElts = VT.getVectorNumElements();
+
+ for (unsigned l = 0; l != NumElts; l += 8) {
+ unsigned NewImm = Imm;
+ for (unsigned i = 0, e = 4; i != e; ++i) {
+ ShuffleMask.push_back(l + (NewImm & 3));
+ NewImm >>= 2;
+ }
+ for (unsigned i = 4, e = 8; i != e; ++i) {
+ ShuffleMask.push_back(l + i);
+ }
+ }
+}
+
+/// DecodeSHUFPMask - This decodes the shuffle masks for shufp*. VT indicates
+/// the type of the vector allowing it to handle different datatypes and vector
+/// widths.
+void DecodeSHUFPMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
+ unsigned NumElts = VT.getVectorNumElements();
+
+ unsigned NumLanes = VT.getSizeInBits() / 128;
+ unsigned NumLaneElts = NumElts / NumLanes;
+
+ unsigned NewImm = Imm;
+ for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+ // each half of a lane comes from different source
+ for (unsigned s = 0; s != NumElts*2; s += NumElts) {
+ for (unsigned i = 0; i != NumLaneElts/2; ++i) {
+ ShuffleMask.push_back(NewImm % NumLaneElts + s + l);
+ NewImm /= NumLaneElts;
+ }
+ }
+ if (NumLaneElts == 4) NewImm = Imm; // reload imm
+ }
+}
+
+/// DecodeUNPCKHMask - This decodes the shuffle masks for unpckhps/unpckhpd
+/// and punpckh*. VT indicates the type of the vector allowing it to handle
+/// different datatypes and vector widths.
+void DecodeUNPCKHMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
+ unsigned NumElts = VT.getVectorNumElements();
+
+ // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
+ // independently on 128-bit lanes.
+ unsigned NumLanes = VT.getSizeInBits() / 128;
+ if (NumLanes == 0 ) NumLanes = 1; // Handle MMX
+ unsigned NumLaneElts = NumElts / NumLanes;
+
+ for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+ for (unsigned i = l + NumLaneElts/2, e = l + NumLaneElts; i != e; ++i) {
+ ShuffleMask.push_back(i); // Reads from dest/src1
+ ShuffleMask.push_back(i+NumElts); // Reads from src/src2
+ }
+ }
+}
+
+/// DecodeUNPCKLMask - This decodes the shuffle masks for unpcklps/unpcklpd
+/// and punpckl*. VT indicates the type of the vector allowing it to handle
+/// different datatypes and vector widths.
+void DecodeUNPCKLMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
+ unsigned NumElts = VT.getVectorNumElements();
+
+ // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
+ // independently on 128-bit lanes.
+ unsigned NumLanes = VT.getSizeInBits() / 128;
+ if (NumLanes == 0 ) NumLanes = 1; // Handle MMX
+ unsigned NumLaneElts = NumElts / NumLanes;
+
+ for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+ for (unsigned i = l, e = l + NumLaneElts/2; i != e; ++i) {
+ ShuffleMask.push_back(i); // Reads from dest/src1
+ ShuffleMask.push_back(i+NumElts); // Reads from src/src2
+ }
+ }
+}
+
+void DecodeVPERM2X128Mask(MVT VT, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask) {
+ if (Imm & 0x88)
+ return; // Not a shuffle
+
+ unsigned HalfSize = VT.getVectorNumElements()/2;
+
+ for (unsigned l = 0; l != 2; ++l) {
+ unsigned HalfBegin = ((Imm >> (l*4)) & 0x3) * HalfSize;
+ for (unsigned i = HalfBegin, e = HalfBegin+HalfSize; i != e; ++i)
+ ShuffleMask.push_back(i);
+ }
+}
+
+/// DecodeVPERMMask - this decodes the shuffle masks for VPERMQ/VPERMPD.
+/// No VT provided since it only works on 256-bit, 4 element vectors.
+void DecodeVPERMMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
+ for (unsigned i = 0; i != 4; ++i) {
+ ShuffleMask.push_back((Imm >> (2*i)) & 3);
+ }
+}
+
+} // llvm namespace
diff --git a/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h
new file mode 100644
index 0000000..017ab32
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h
@@ -0,0 +1,71 @@
+//===-- X86ShuffleDecode.h - X86 shuffle decode logic -----------*-C++-*---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Define several functions to decode x86 specific shuffle semantics into a
+// generic vector mask.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86_SHUFFLE_DECODE_H
+#define X86_SHUFFLE_DECODE_H
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/ValueTypes.h"
+
+//===----------------------------------------------------------------------===//
+// Vector Mask Decoding
+//===----------------------------------------------------------------------===//
+
+namespace llvm {
+enum {
+ SM_SentinelZero = -1
+};
+
+void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+
+// <3,1> or <6,7,2,3>
+void DecodeMOVHLPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask);
+
+// <0,2> or <0,1,4,5>
+void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask);
+
+void DecodePALIGNRMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+
+void DecodePSHUFMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+
+void DecodePSHUFHWMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+
+void DecodePSHUFLWMask(MVT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+
+/// DecodeSHUFPMask - This decodes the shuffle masks for shufp*. VT indicates
+/// the type of the vector allowing it to handle different datatypes and vector
+/// widths.
+void DecodeSHUFPMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+
+/// DecodeUNPCKHMask - This decodes the shuffle masks for unpckhps/unpckhpd
+/// and punpckh*. VT indicates the type of the vector allowing it to handle
+/// different datatypes and vector widths.
+void DecodeUNPCKHMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
+
+/// DecodeUNPCKLMask - This decodes the shuffle masks for unpcklps/unpcklpd
+/// and punpckl*. VT indicates the type of the vector allowing it to handle
+/// different datatypes and vector widths.
+void DecodeUNPCKLMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
+
+
+void DecodeVPERM2X128Mask(MVT VT, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask);
+
+/// DecodeVPERMMask - this decodes the shuffle masks for VPERMQ/VPERMPD.
+/// No VT provided since it only works on 256-bit, 4 element vectors.
+void DecodeVPERMMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+
+} // llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86.h b/contrib/llvm/lib/Target/X86/X86.h
new file mode 100644
index 0000000..947002f
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86.h
@@ -0,0 +1,80 @@
+//===-- X86.h - Top-level interface for X86 representation ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in the x86
+// target library, as used by the LLVM JIT.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TARGET_X86_H
+#define TARGET_X86_H
+
+#include "MCTargetDesc/X86BaseInfo.h"
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "llvm/Support/DataTypes.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+class FunctionPass;
+class JITCodeEmitter;
+class X86TargetMachine;
+
+/// createX86ISelDag - This pass converts a legalized DAG into a
+/// X86-specific DAG, ready for instruction scheduling.
+///
+FunctionPass *createX86ISelDag(X86TargetMachine &TM,
+ CodeGenOpt::Level OptLevel);
+
+/// createGlobalBaseRegPass - This pass initializes a global base
+/// register for PIC on x86-32.
+FunctionPass* createGlobalBaseRegPass();
+
+/// createCleanupLocalDynamicTLSPass() - This pass combines multiple accesses
+/// to local-dynamic TLS variables so that the TLS base address for the module
+/// is only fetched once per execution path through the function.
+FunctionPass *createCleanupLocalDynamicTLSPass();
+
+/// createX86FloatingPointStackifierPass - This function returns a pass which
+/// converts floating point register references and pseudo instructions into
+/// floating point stack references and physical instructions.
+///
+FunctionPass *createX86FloatingPointStackifierPass();
+
+/// createX86IssueVZeroUpperPass - This pass inserts AVX vzeroupper instructions
+/// before each call to avoid transition penalty between functions encoded with
+/// AVX and SSE.
+FunctionPass *createX86IssueVZeroUpperPass();
+
+/// createX86CodeEmitterPass - Return a pass that emits the collected X86 code
+/// to the specified MCE object.
+FunctionPass *createX86JITCodeEmitterPass(X86TargetMachine &TM,
+ JITCodeEmitter &JCE);
+
+/// createX86EmitCodeToMemory - Returns a pass that converts a register
+/// allocated function into raw machine code in a dynamically
+/// allocated chunk of memory.
+///
+FunctionPass *createEmitX86CodeToMemory();
+
+/// \brief Creates an X86-specific Target Transformation Info pass.
+ImmutablePass *createX86TargetTransformInfoPass(const X86TargetMachine *TM);
+
+/// createX86PadShortFunctions - Return a pass that pads short functions
+/// with NOOPs. This will prevent a stall when returning on the Atom.
+FunctionPass *createX86PadShortFunctions();
+/// createX86FixupLEAs - Return a a pass that selectively replaces
+/// certain instructions (like add, sub, inc, dec, some shifts,
+/// and some multiplies) by equivalent LEA instructions, in order
+/// to eliminate execution delays in some Atom processors.
+FunctionPass *createX86FixupLEAs();
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86.td b/contrib/llvm/lib/Target/X86/X86.td
new file mode 100644
index 0000000..c865500
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86.td
@@ -0,0 +1,357 @@
+//===-- X86.td - Target definition file for the Intel X86 --*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a target description file for the Intel i386 architecture, referred
+// to here as the "X86" architecture.
+//
+//===----------------------------------------------------------------------===//
+
+// Get the target-independent interfaces which we are implementing...
+//
+include "llvm/Target/Target.td"
+
+//===----------------------------------------------------------------------===//
+// X86 Subtarget state
+//
+
+def Mode64Bit : SubtargetFeature<"64bit-mode", "In64BitMode", "true",
+ "64-bit mode (x86_64)">;
+
+//===----------------------------------------------------------------------===//
+// X86 Subtarget features
+//===----------------------------------------------------------------------===//
+
+def FeatureCMOV : SubtargetFeature<"cmov","HasCMov", "true",
+ "Enable conditional move instructions">;
+
+def FeaturePOPCNT : SubtargetFeature<"popcnt", "HasPOPCNT", "true",
+ "Support POPCNT instruction">;
+
+
+def FeatureMMX : SubtargetFeature<"mmx","X86SSELevel", "MMX",
+ "Enable MMX instructions">;
+def FeatureSSE1 : SubtargetFeature<"sse", "X86SSELevel", "SSE1",
+ "Enable SSE instructions",
+ // SSE codegen depends on cmovs, and all
+ // SSE1+ processors support them.
+ [FeatureMMX, FeatureCMOV]>;
+def FeatureSSE2 : SubtargetFeature<"sse2", "X86SSELevel", "SSE2",
+ "Enable SSE2 instructions",
+ [FeatureSSE1]>;
+def FeatureSSE3 : SubtargetFeature<"sse3", "X86SSELevel", "SSE3",
+ "Enable SSE3 instructions",
+ [FeatureSSE2]>;
+def FeatureSSSE3 : SubtargetFeature<"ssse3", "X86SSELevel", "SSSE3",
+ "Enable SSSE3 instructions",
+ [FeatureSSE3]>;
+def FeatureSSE41 : SubtargetFeature<"sse41", "X86SSELevel", "SSE41",
+ "Enable SSE 4.1 instructions",
+ [FeatureSSSE3]>;
+def FeatureSSE42 : SubtargetFeature<"sse42", "X86SSELevel", "SSE42",
+ "Enable SSE 4.2 instructions",
+ [FeatureSSE41]>;
+def Feature3DNow : SubtargetFeature<"3dnow", "X863DNowLevel", "ThreeDNow",
+ "Enable 3DNow! instructions",
+ [FeatureMMX]>;
+def Feature3DNowA : SubtargetFeature<"3dnowa", "X863DNowLevel", "ThreeDNowA",
+ "Enable 3DNow! Athlon instructions",
+ [Feature3DNow]>;
+// All x86-64 hardware has SSE2, but we don't mark SSE2 as an implied
+// feature, because SSE2 can be disabled (e.g. for compiling OS kernels)
+// without disabling 64-bit mode.
+def Feature64Bit : SubtargetFeature<"64bit", "HasX86_64", "true",
+ "Support 64-bit instructions",
+ [FeatureCMOV]>;
+def FeatureCMPXCHG16B : SubtargetFeature<"cmpxchg16b", "HasCmpxchg16b", "true",
+ "64-bit with cmpxchg16b",
+ [Feature64Bit]>;
+def FeatureSlowBTMem : SubtargetFeature<"slow-bt-mem", "IsBTMemSlow", "true",
+ "Bit testing of memory is slow">;
+def FeatureFastUAMem : SubtargetFeature<"fast-unaligned-mem",
+ "IsUAMemFast", "true",
+ "Fast unaligned memory access">;
+def FeatureSSE4A : SubtargetFeature<"sse4a", "HasSSE4A", "true",
+ "Support SSE 4a instructions",
+ [FeatureSSE3]>;
+
+def FeatureAVX : SubtargetFeature<"avx", "X86SSELevel", "AVX",
+ "Enable AVX instructions",
+ [FeatureSSE42]>;
+def FeatureAVX2 : SubtargetFeature<"avx2", "X86SSELevel", "AVX2",
+ "Enable AVX2 instructions",
+ [FeatureAVX]>;
+def FeaturePCLMUL : SubtargetFeature<"pclmul", "HasPCLMUL", "true",
+ "Enable packed carry-less multiplication instructions",
+ [FeatureSSE2]>;
+def FeatureFMA : SubtargetFeature<"fma", "HasFMA", "true",
+ "Enable three-operand fused multiple-add",
+ [FeatureAVX]>;
+def FeatureFMA4 : SubtargetFeature<"fma4", "HasFMA4", "true",
+ "Enable four-operand fused multiple-add",
+ [FeatureAVX, FeatureSSE4A]>;
+def FeatureXOP : SubtargetFeature<"xop", "HasXOP", "true",
+ "Enable XOP instructions",
+ [FeatureFMA4]>;
+def FeatureVectorUAMem : SubtargetFeature<"vector-unaligned-mem",
+ "HasVectorUAMem", "true",
+ "Allow unaligned memory operands on vector/SIMD instructions">;
+def FeatureAES : SubtargetFeature<"aes", "HasAES", "true",
+ "Enable AES instructions",
+ [FeatureSSE2]>;
+def FeatureMOVBE : SubtargetFeature<"movbe", "HasMOVBE", "true",
+ "Support MOVBE instruction">;
+def FeatureRDRAND : SubtargetFeature<"rdrand", "HasRDRAND", "true",
+ "Support RDRAND instruction">;
+def FeatureF16C : SubtargetFeature<"f16c", "HasF16C", "true",
+ "Support 16-bit floating point conversion instructions">;
+def FeatureFSGSBase : SubtargetFeature<"fsgsbase", "HasFSGSBase", "true",
+ "Support FS/GS Base instructions">;
+def FeatureLZCNT : SubtargetFeature<"lzcnt", "HasLZCNT", "true",
+ "Support LZCNT instruction">;
+def FeatureBMI : SubtargetFeature<"bmi", "HasBMI", "true",
+ "Support BMI instructions">;
+def FeatureBMI2 : SubtargetFeature<"bmi2", "HasBMI2", "true",
+ "Support BMI2 instructions">;
+def FeatureRTM : SubtargetFeature<"rtm", "HasRTM", "true",
+ "Support RTM instructions">;
+def FeatureHLE : SubtargetFeature<"hle", "HasHLE", "true",
+ "Support HLE">;
+def FeatureADX : SubtargetFeature<"adx", "HasADX", "true",
+ "Support ADX instructions">;
+def FeaturePRFCHW : SubtargetFeature<"prfchw", "HasPRFCHW", "true",
+ "Support PRFCHW instructions">;
+def FeatureRDSEED : SubtargetFeature<"rdseed", "HasRDSEED", "true",
+ "Support RDSEED instruction">;
+def FeatureLeaForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true",
+ "Use LEA for adjusting the stack pointer">;
+def FeatureSlowDivide : SubtargetFeature<"idiv-to-divb",
+ "HasSlowDivide", "true",
+ "Use small divide for positive values less than 256">;
+def FeaturePadShortFunctions : SubtargetFeature<"pad-short-functions",
+ "PadShortFunctions", "true",
+ "Pad short functions">;
+def FeatureCallRegIndirect : SubtargetFeature<"call-reg-indirect",
+ "CallRegIndirect", "true",
+ "Call register indirect">;
+def FeatureLEAUsesAG : SubtargetFeature<"lea-uses-ag", "LEAUsesAG", "true",
+ "LEA instruction needs inputs at AG stage">;
+
+//===----------------------------------------------------------------------===//
+// X86 processors supported.
+//===----------------------------------------------------------------------===//
+
+include "X86Schedule.td"
+
+def ProcIntelAtom : SubtargetFeature<"atom", "X86ProcFamily", "IntelAtom",
+ "Intel Atom processors">;
+
+class Proc<string Name, list<SubtargetFeature> Features>
+ : ProcessorModel<Name, GenericModel, Features>;
+
+def : Proc<"generic", []>;
+def : Proc<"i386", []>;
+def : Proc<"i486", []>;
+def : Proc<"i586", []>;
+def : Proc<"pentium", []>;
+def : Proc<"pentium-mmx", [FeatureMMX]>;
+def : Proc<"i686", []>;
+def : Proc<"pentiumpro", [FeatureCMOV]>;
+def : Proc<"pentium2", [FeatureMMX, FeatureCMOV]>;
+def : Proc<"pentium3", [FeatureSSE1]>;
+def : Proc<"pentium3m", [FeatureSSE1, FeatureSlowBTMem]>;
+def : Proc<"pentium-m", [FeatureSSE2, FeatureSlowBTMem]>;
+def : Proc<"pentium4", [FeatureSSE2]>;
+def : Proc<"pentium4m", [FeatureSSE2, FeatureSlowBTMem]>;
+def : Proc<"x86-64", [FeatureSSE2, Feature64Bit, FeatureSlowBTMem,
+ FeatureFastUAMem]>;
+// Intel Core Duo.
+def : ProcessorModel<"yonah", SandyBridgeModel,
+ [FeatureSSE3, FeatureSlowBTMem]>;
+
+// NetBurst.
+def : Proc<"prescott", [FeatureSSE3, FeatureSlowBTMem]>;
+def : Proc<"nocona", [FeatureSSE3, FeatureCMPXCHG16B, FeatureSlowBTMem]>;
+
+// Intel Core 2 Solo/Duo.
+def : ProcessorModel<"core2", SandyBridgeModel,
+ [FeatureSSSE3, FeatureCMPXCHG16B, FeatureSlowBTMem]>;
+def : ProcessorModel<"penryn", SandyBridgeModel,
+ [FeatureSSE41, FeatureCMPXCHG16B, FeatureSlowBTMem]>;
+
+// Atom.
+def : ProcessorModel<"atom", AtomModel,
+ [ProcIntelAtom, FeatureSSSE3, FeatureCMPXCHG16B,
+ FeatureMOVBE, FeatureSlowBTMem, FeatureLeaForSP,
+ FeatureSlowDivide,
+ FeatureCallRegIndirect,
+ FeatureLEAUsesAG,
+ FeaturePadShortFunctions]>;
+
+// "Arrandale" along with corei3 and corei5
+def : ProcessorModel<"corei7", SandyBridgeModel,
+ [FeatureSSE42, FeatureCMPXCHG16B, FeatureSlowBTMem,
+ FeatureFastUAMem, FeaturePOPCNT, FeatureAES]>;
+
+def : ProcessorModel<"nehalem", SandyBridgeModel,
+ [FeatureSSE42, FeatureCMPXCHG16B, FeatureSlowBTMem,
+ FeatureFastUAMem, FeaturePOPCNT]>;
+// Westmere is a similar machine to nehalem with some additional features.
+// Westmere is the corei3/i5/i7 path from nehalem to sandybridge
+def : ProcessorModel<"westmere", SandyBridgeModel,
+ [FeatureSSE42, FeatureCMPXCHG16B, FeatureSlowBTMem,
+ FeatureFastUAMem, FeaturePOPCNT, FeatureAES,
+ FeaturePCLMUL]>;
+// Sandy Bridge
+// SSE is not listed here since llvm treats AVX as a reimplementation of SSE,
+// rather than a superset.
+def : ProcessorModel<"corei7-avx", SandyBridgeModel,
+ [FeatureAVX, FeatureCMPXCHG16B, FeatureFastUAMem,
+ FeaturePOPCNT, FeatureAES, FeaturePCLMUL]>;
+// Ivy Bridge
+def : ProcessorModel<"core-avx-i", SandyBridgeModel,
+ [FeatureAVX, FeatureCMPXCHG16B, FeatureFastUAMem,
+ FeaturePOPCNT, FeatureAES, FeaturePCLMUL, FeatureRDRAND,
+ FeatureF16C, FeatureFSGSBase]>;
+
+// Haswell
+def : ProcessorModel<"core-avx2", HaswellModel,
+ [FeatureAVX2, FeatureCMPXCHG16B, FeatureFastUAMem,
+ FeaturePOPCNT, FeatureAES, FeaturePCLMUL, FeatureRDRAND,
+ FeatureF16C, FeatureFSGSBase, FeatureMOVBE, FeatureLZCNT,
+ FeatureBMI, FeatureBMI2, FeatureFMA, FeatureRTM,
+ FeatureHLE]>;
+
+def : Proc<"k6", [FeatureMMX]>;
+def : Proc<"k6-2", [Feature3DNow]>;
+def : Proc<"k6-3", [Feature3DNow]>;
+def : Proc<"athlon", [Feature3DNowA, FeatureSlowBTMem]>;
+def : Proc<"athlon-tbird", [Feature3DNowA, FeatureSlowBTMem]>;
+def : Proc<"athlon-4", [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem]>;
+def : Proc<"athlon-xp", [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem]>;
+def : Proc<"athlon-mp", [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem]>;
+def : Proc<"k8", [FeatureSSE2, Feature3DNowA, Feature64Bit,
+ FeatureSlowBTMem]>;
+def : Proc<"opteron", [FeatureSSE2, Feature3DNowA, Feature64Bit,
+ FeatureSlowBTMem]>;
+def : Proc<"athlon64", [FeatureSSE2, Feature3DNowA, Feature64Bit,
+ FeatureSlowBTMem]>;
+def : Proc<"athlon-fx", [FeatureSSE2, Feature3DNowA, Feature64Bit,
+ FeatureSlowBTMem]>;
+def : Proc<"k8-sse3", [FeatureSSE3, Feature3DNowA, FeatureCMPXCHG16B,
+ FeatureSlowBTMem]>;
+def : Proc<"opteron-sse3", [FeatureSSE3, Feature3DNowA, FeatureCMPXCHG16B,
+ FeatureSlowBTMem]>;
+def : Proc<"athlon64-sse3", [FeatureSSE3, Feature3DNowA, FeatureCMPXCHG16B,
+ FeatureSlowBTMem]>;
+def : Proc<"amdfam10", [FeatureSSE4A,
+ Feature3DNowA, FeatureCMPXCHG16B, FeatureLZCNT,
+ FeaturePOPCNT, FeatureSlowBTMem]>;
+// Bobcat
+def : Proc<"btver1", [FeatureSSSE3, FeatureSSE4A, FeatureCMPXCHG16B,
+ FeatureLZCNT, FeaturePOPCNT]>;
+// Jaguar
+def : Proc<"btver2", [FeatureAVX, FeatureSSE4A, FeatureCMPXCHG16B,
+ FeatureAES, FeaturePCLMUL, FeatureBMI,
+ FeatureF16C, FeatureMOVBE, FeatureLZCNT,
+ FeaturePOPCNT]>;
+// Bulldozer
+def : Proc<"bdver1", [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B,
+ FeatureAES, FeaturePCLMUL,
+ FeatureLZCNT, FeaturePOPCNT]>;
+// Piledriver
+def : Proc<"bdver2", [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B,
+ FeatureAES, FeaturePCLMUL,
+ FeatureF16C, FeatureLZCNT,
+ FeaturePOPCNT, FeatureBMI, FeatureFMA]>;
+def : Proc<"geode", [Feature3DNowA]>;
+
+def : Proc<"winchip-c6", [FeatureMMX]>;
+def : Proc<"winchip2", [Feature3DNow]>;
+def : Proc<"c3", [Feature3DNow]>;
+def : Proc<"c3-2", [FeatureSSE1]>;
+
+//===----------------------------------------------------------------------===//
+// Register File Description
+//===----------------------------------------------------------------------===//
+
+include "X86RegisterInfo.td"
+
+//===----------------------------------------------------------------------===//
+// Instruction Descriptions
+//===----------------------------------------------------------------------===//
+
+include "X86InstrInfo.td"
+
+def X86InstrInfo : InstrInfo;
+
+//===----------------------------------------------------------------------===//
+// Calling Conventions
+//===----------------------------------------------------------------------===//
+
+include "X86CallingConv.td"
+
+
+//===----------------------------------------------------------------------===//
+// Assembly Parser
+//===----------------------------------------------------------------------===//
+
+def ATTAsmParser : AsmParser {
+ string AsmParserClassName = "AsmParser";
+}
+
+def ATTAsmParserVariant : AsmParserVariant {
+ int Variant = 0;
+
+ // Variant name.
+ string Name = "att";
+
+ // Discard comments in assembly strings.
+ string CommentDelimiter = "#";
+
+ // Recognize hard coded registers.
+ string RegisterPrefix = "%";
+}
+
+def IntelAsmParserVariant : AsmParserVariant {
+ int Variant = 1;
+
+ // Variant name.
+ string Name = "intel";
+
+ // Discard comments in assembly strings.
+ string CommentDelimiter = ";";
+
+ // Recognize hard coded registers.
+ string RegisterPrefix = "";
+}
+
+//===----------------------------------------------------------------------===//
+// Assembly Printers
+//===----------------------------------------------------------------------===//
+
+// The X86 target supports two different syntaxes for emitting machine code.
+// This is controlled by the -x86-asm-syntax={att|intel}
+def ATTAsmWriter : AsmWriter {
+ string AsmWriterClassName = "ATTInstPrinter";
+ int Variant = 0;
+ bit isMCAsmWriter = 1;
+}
+def IntelAsmWriter : AsmWriter {
+ string AsmWriterClassName = "IntelInstPrinter";
+ int Variant = 1;
+ bit isMCAsmWriter = 1;
+}
+
+def X86 : Target {
+ // Information about the instructions...
+ let InstructionSet = X86InstrInfo;
+ let AssemblyParsers = [ATTAsmParser];
+ let AssemblyParserVariants = [ATTAsmParserVariant, IntelAsmParserVariant];
+ let AssemblyWriters = [ATTAsmWriter, IntelAsmWriter];
+}
diff --git a/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp b/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp
new file mode 100644
index 0000000..6b228b0
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp
@@ -0,0 +1,755 @@
+//===-- X86AsmPrinter.cpp - Convert X86 LLVM code to AT&T assembly --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to X86 machine code.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86AsmPrinter.h"
+#include "InstPrinter/X86ATTInstPrinter.h"
+#include "X86.h"
+#include "X86COFFMachineModuleInfo.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86TargetMachine.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/DebugInfo.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/COFF.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/Mangler.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// Primitive Helper Functions.
+//===----------------------------------------------------------------------===//
+
+/// runOnMachineFunction - Emit the function body.
+///
+bool X86AsmPrinter::runOnMachineFunction(MachineFunction &MF) {
+ SetupMachineFunction(MF);
+
+ if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho()) {
+ bool Intrn = MF.getFunction()->hasInternalLinkage();
+ OutStreamer.BeginCOFFSymbolDef(CurrentFnSym);
+ OutStreamer.EmitCOFFSymbolStorageClass(Intrn ? COFF::IMAGE_SYM_CLASS_STATIC
+ : COFF::IMAGE_SYM_CLASS_EXTERNAL);
+ OutStreamer.EmitCOFFSymbolType(COFF::IMAGE_SYM_DTYPE_FUNCTION
+ << COFF::SCT_COMPLEX_TYPE_SHIFT);
+ OutStreamer.EndCOFFSymbolDef();
+ }
+
+ // Have common code print out the function header with linkage info etc.
+ EmitFunctionHeader();
+
+ // Emit the rest of the function body.
+ EmitFunctionBody();
+
+ // We didn't modify anything.
+ return false;
+}
+
+/// printSymbolOperand - Print a raw symbol reference operand. This handles
+/// jump tables, constant pools, global address and external symbols, all of
+/// which print to a label with various suffixes for relocation types etc.
+void X86AsmPrinter::printSymbolOperand(const MachineOperand &MO,
+ raw_ostream &O) {
+ switch (MO.getType()) {
+ default: llvm_unreachable("unknown symbol type!");
+ case MachineOperand::MO_JumpTableIndex:
+ O << *GetJTISymbol(MO.getIndex());
+ break;
+ case MachineOperand::MO_ConstantPoolIndex:
+ O << *GetCPISymbol(MO.getIndex());
+ printOffset(MO.getOffset(), O);
+ break;
+ case MachineOperand::MO_GlobalAddress: {
+ const GlobalValue *GV = MO.getGlobal();
+
+ MCSymbol *GVSym;
+ if (MO.getTargetFlags() == X86II::MO_DARWIN_STUB)
+ GVSym = GetSymbolWithGlobalValueBase(GV, "$stub");
+ else if (MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY ||
+ MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY_PIC_BASE ||
+ MO.getTargetFlags() == X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE)
+ GVSym = GetSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
+ else
+ GVSym = Mang->getSymbol(GV);
+
+ // Handle dllimport linkage.
+ if (MO.getTargetFlags() == X86II::MO_DLLIMPORT)
+ GVSym = OutContext.GetOrCreateSymbol(Twine("__imp_") + GVSym->getName());
+
+ if (MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY ||
+ MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY_PIC_BASE) {
+ MCSymbol *Sym = GetSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
+ MachineModuleInfoImpl::StubValueTy &StubSym =
+ MMI->getObjFileInfo<MachineModuleInfoMachO>().getGVStubEntry(Sym);
+ if (StubSym.getPointer() == 0)
+ StubSym = MachineModuleInfoImpl::
+ StubValueTy(Mang->getSymbol(GV), !GV->hasInternalLinkage());
+ } else if (MO.getTargetFlags() == X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE){
+ MCSymbol *Sym = GetSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
+ MachineModuleInfoImpl::StubValueTy &StubSym =
+ MMI->getObjFileInfo<MachineModuleInfoMachO>().getHiddenGVStubEntry(Sym);
+ if (StubSym.getPointer() == 0)
+ StubSym = MachineModuleInfoImpl::
+ StubValueTy(Mang->getSymbol(GV), !GV->hasInternalLinkage());
+ } else if (MO.getTargetFlags() == X86II::MO_DARWIN_STUB) {
+ MCSymbol *Sym = GetSymbolWithGlobalValueBase(GV, "$stub");
+ MachineModuleInfoImpl::StubValueTy &StubSym =
+ MMI->getObjFileInfo<MachineModuleInfoMachO>().getFnStubEntry(Sym);
+ if (StubSym.getPointer() == 0)
+ StubSym = MachineModuleInfoImpl::
+ StubValueTy(Mang->getSymbol(GV), !GV->hasInternalLinkage());
+ }
+
+ // If the name begins with a dollar-sign, enclose it in parens. We do this
+ // to avoid having it look like an integer immediate to the assembler.
+ if (GVSym->getName()[0] != '$')
+ O << *GVSym;
+ else
+ O << '(' << *GVSym << ')';
+ printOffset(MO.getOffset(), O);
+ break;
+ }
+ case MachineOperand::MO_ExternalSymbol: {
+ const MCSymbol *SymToPrint;
+ if (MO.getTargetFlags() == X86II::MO_DARWIN_STUB) {
+ SmallString<128> TempNameStr;
+ TempNameStr += StringRef(MO.getSymbolName());
+ TempNameStr += StringRef("$stub");
+
+ MCSymbol *Sym = GetExternalSymbolSymbol(TempNameStr.str());
+ MachineModuleInfoImpl::StubValueTy &StubSym =
+ MMI->getObjFileInfo<MachineModuleInfoMachO>().getFnStubEntry(Sym);
+ if (StubSym.getPointer() == 0) {
+ TempNameStr.erase(TempNameStr.end()-5, TempNameStr.end());
+ StubSym = MachineModuleInfoImpl::
+ StubValueTy(OutContext.GetOrCreateSymbol(TempNameStr.str()),
+ true);
+ }
+ SymToPrint = StubSym.getPointer();
+ } else {
+ SymToPrint = GetExternalSymbolSymbol(MO.getSymbolName());
+ }
+
+ // If the name begins with a dollar-sign, enclose it in parens. We do this
+ // to avoid having it look like an integer immediate to the assembler.
+ if (SymToPrint->getName()[0] != '$')
+ O << *SymToPrint;
+ else
+ O << '(' << *SymToPrint << '(';
+ break;
+ }
+ }
+
+ switch (MO.getTargetFlags()) {
+ default:
+ llvm_unreachable("Unknown target flag on GV operand");
+ case X86II::MO_NO_FLAG: // No flag.
+ break;
+ case X86II::MO_DARWIN_NONLAZY:
+ case X86II::MO_DLLIMPORT:
+ case X86II::MO_DARWIN_STUB:
+ // These affect the name of the symbol, not any suffix.
+ break;
+ case X86II::MO_GOT_ABSOLUTE_ADDRESS:
+ O << " + [.-" << *MF->getPICBaseSymbol() << ']';
+ break;
+ case X86II::MO_PIC_BASE_OFFSET:
+ case X86II::MO_DARWIN_NONLAZY_PIC_BASE:
+ case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE:
+ O << '-' << *MF->getPICBaseSymbol();
+ break;
+ case X86II::MO_TLSGD: O << "@TLSGD"; break;
+ case X86II::MO_TLSLD: O << "@TLSLD"; break;
+ case X86II::MO_TLSLDM: O << "@TLSLDM"; break;
+ case X86II::MO_GOTTPOFF: O << "@GOTTPOFF"; break;
+ case X86II::MO_INDNTPOFF: O << "@INDNTPOFF"; break;
+ case X86II::MO_TPOFF: O << "@TPOFF"; break;
+ case X86II::MO_DTPOFF: O << "@DTPOFF"; break;
+ case X86II::MO_NTPOFF: O << "@NTPOFF"; break;
+ case X86II::MO_GOTNTPOFF: O << "@GOTNTPOFF"; break;
+ case X86II::MO_GOTPCREL: O << "@GOTPCREL"; break;
+ case X86II::MO_GOT: O << "@GOT"; break;
+ case X86II::MO_GOTOFF: O << "@GOTOFF"; break;
+ case X86II::MO_PLT: O << "@PLT"; break;
+ case X86II::MO_TLVP: O << "@TLVP"; break;
+ case X86II::MO_TLVP_PIC_BASE:
+ O << "@TLVP" << '-' << *MF->getPICBaseSymbol();
+ break;
+ case X86II::MO_SECREL: O << "@SECREL32"; break;
+ }
+}
+
+/// printPCRelImm - This is used to print an immediate value that ends up
+/// being encoded as a pc-relative value. These print slightly differently, for
+/// example, a $ is not emitted.
+void X86AsmPrinter::printPCRelImm(const MachineInstr *MI, unsigned OpNo,
+ raw_ostream &O) {
+ const MachineOperand &MO = MI->getOperand(OpNo);
+ switch (MO.getType()) {
+ default: llvm_unreachable("Unknown pcrel immediate operand");
+ case MachineOperand::MO_Register:
+ // pc-relativeness was handled when computing the value in the reg.
+ printOperand(MI, OpNo, O);
+ return;
+ case MachineOperand::MO_Immediate:
+ O << MO.getImm();
+ return;
+ case MachineOperand::MO_MachineBasicBlock:
+ O << *MO.getMBB()->getSymbol();
+ return;
+ case MachineOperand::MO_GlobalAddress:
+ case MachineOperand::MO_ExternalSymbol:
+ printSymbolOperand(MO, O);
+ return;
+ }
+}
+
+
+void X86AsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo,
+ raw_ostream &O, const char *Modifier,
+ unsigned AsmVariant) {
+ const MachineOperand &MO = MI->getOperand(OpNo);
+ switch (MO.getType()) {
+ default: llvm_unreachable("unknown operand type!");
+ case MachineOperand::MO_Register: {
+ // FIXME: Enumerating AsmVariant, so we can remove magic number.
+ if (AsmVariant == 0) O << '%';
+ unsigned Reg = MO.getReg();
+ if (Modifier && strncmp(Modifier, "subreg", strlen("subreg")) == 0) {
+ MVT::SimpleValueType VT = (strcmp(Modifier+6,"64") == 0) ?
+ MVT::i64 : ((strcmp(Modifier+6, "32") == 0) ? MVT::i32 :
+ ((strcmp(Modifier+6,"16") == 0) ? MVT::i16 : MVT::i8));
+ Reg = getX86SubSuperRegister(Reg, VT);
+ }
+ O << X86ATTInstPrinter::getRegisterName(Reg);
+ return;
+ }
+
+ case MachineOperand::MO_Immediate:
+ if (AsmVariant == 0) O << '$';
+ O << MO.getImm();
+ return;
+
+ case MachineOperand::MO_JumpTableIndex:
+ case MachineOperand::MO_ConstantPoolIndex:
+ case MachineOperand::MO_GlobalAddress:
+ case MachineOperand::MO_ExternalSymbol: {
+ if (AsmVariant == 0) O << '$';
+ printSymbolOperand(MO, O);
+ break;
+ }
+ }
+}
+
+void X86AsmPrinter::printLeaMemReference(const MachineInstr *MI, unsigned Op,
+ raw_ostream &O, const char *Modifier) {
+ const MachineOperand &BaseReg = MI->getOperand(Op);
+ const MachineOperand &IndexReg = MI->getOperand(Op+2);
+ const MachineOperand &DispSpec = MI->getOperand(Op+3);
+
+ // If we really don't want to print out (rip), don't.
+ bool HasBaseReg = BaseReg.getReg() != 0;
+ if (HasBaseReg && Modifier && !strcmp(Modifier, "no-rip") &&
+ BaseReg.getReg() == X86::RIP)
+ HasBaseReg = false;
+
+ // HasParenPart - True if we will print out the () part of the mem ref.
+ bool HasParenPart = IndexReg.getReg() || HasBaseReg;
+
+ if (DispSpec.isImm()) {
+ int DispVal = DispSpec.getImm();
+ if (DispVal || !HasParenPart)
+ O << DispVal;
+ } else {
+ assert(DispSpec.isGlobal() || DispSpec.isCPI() ||
+ DispSpec.isJTI() || DispSpec.isSymbol());
+ printSymbolOperand(MI->getOperand(Op+3), O);
+ }
+
+ if (Modifier && strcmp(Modifier, "H") == 0)
+ O << "+8";
+
+ if (HasParenPart) {
+ assert(IndexReg.getReg() != X86::ESP &&
+ "X86 doesn't allow scaling by ESP");
+
+ O << '(';
+ if (HasBaseReg)
+ printOperand(MI, Op, O, Modifier);
+
+ if (IndexReg.getReg()) {
+ O << ',';
+ printOperand(MI, Op+2, O, Modifier);
+ unsigned ScaleVal = MI->getOperand(Op+1).getImm();
+ if (ScaleVal != 1)
+ O << ',' << ScaleVal;
+ }
+ O << ')';
+ }
+}
+
+void X86AsmPrinter::printMemReference(const MachineInstr *MI, unsigned Op,
+ raw_ostream &O, const char *Modifier) {
+ assert(isMem(MI, Op) && "Invalid memory reference!");
+ const MachineOperand &Segment = MI->getOperand(Op+4);
+ if (Segment.getReg()) {
+ printOperand(MI, Op+4, O, Modifier);
+ O << ':';
+ }
+ printLeaMemReference(MI, Op, O, Modifier);
+}
+
+void X86AsmPrinter::printIntelMemReference(const MachineInstr *MI, unsigned Op,
+ raw_ostream &O, const char *Modifier,
+ unsigned AsmVariant){
+ const MachineOperand &BaseReg = MI->getOperand(Op);
+ unsigned ScaleVal = MI->getOperand(Op+1).getImm();
+ const MachineOperand &IndexReg = MI->getOperand(Op+2);
+ const MachineOperand &DispSpec = MI->getOperand(Op+3);
+ const MachineOperand &SegReg = MI->getOperand(Op+4);
+
+ // If this has a segment register, print it.
+ if (SegReg.getReg()) {
+ printOperand(MI, Op+4, O, Modifier, AsmVariant);
+ O << ':';
+ }
+
+ O << '[';
+
+ bool NeedPlus = false;
+ if (BaseReg.getReg()) {
+ printOperand(MI, Op, O, Modifier, AsmVariant);
+ NeedPlus = true;
+ }
+
+ if (IndexReg.getReg()) {
+ if (NeedPlus) O << " + ";
+ if (ScaleVal != 1)
+ O << ScaleVal << '*';
+ printOperand(MI, Op+2, O, Modifier, AsmVariant);
+ NeedPlus = true;
+ }
+
+ if (!DispSpec.isImm()) {
+ if (NeedPlus) O << " + ";
+ printOperand(MI, Op+3, O, Modifier, AsmVariant);
+ } else {
+ int64_t DispVal = DispSpec.getImm();
+ if (DispVal || (!IndexReg.getReg() && !BaseReg.getReg())) {
+ if (NeedPlus) {
+ if (DispVal > 0)
+ O << " + ";
+ else {
+ O << " - ";
+ DispVal = -DispVal;
+ }
+ }
+ O << DispVal;
+ }
+ }
+ O << ']';
+}
+
+bool X86AsmPrinter::printAsmMRegister(const MachineOperand &MO, char Mode,
+ raw_ostream &O) {
+ unsigned Reg = MO.getReg();
+ switch (Mode) {
+ default: return true; // Unknown mode.
+ case 'b': // Print QImode register
+ Reg = getX86SubSuperRegister(Reg, MVT::i8);
+ break;
+ case 'h': // Print QImode high register
+ Reg = getX86SubSuperRegister(Reg, MVT::i8, true);
+ break;
+ case 'w': // Print HImode register
+ Reg = getX86SubSuperRegister(Reg, MVT::i16);
+ break;
+ case 'k': // Print SImode register
+ Reg = getX86SubSuperRegister(Reg, MVT::i32);
+ break;
+ case 'q': // Print DImode register
+ Reg = getX86SubSuperRegister(Reg, MVT::i64);
+ break;
+ }
+
+ O << '%' << X86ATTInstPrinter::getRegisterName(Reg);
+ return false;
+}
+
+/// PrintAsmOperand - Print out an operand for an inline asm expression.
+///
+bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant,
+ const char *ExtraCode, raw_ostream &O) {
+ // Does this asm operand have a single letter operand modifier?
+ if (ExtraCode && ExtraCode[0]) {
+ if (ExtraCode[1] != 0) return true; // Unknown modifier.
+
+ const MachineOperand &MO = MI->getOperand(OpNo);
+
+ switch (ExtraCode[0]) {
+ default:
+ // See if this is a generic print operand
+ return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O);
+ case 'a': // This is an address. Currently only 'i' and 'r' are expected.
+ if (MO.isImm()) {
+ O << MO.getImm();
+ return false;
+ }
+ if (MO.isGlobal() || MO.isCPI() || MO.isJTI() || MO.isSymbol()) {
+ printSymbolOperand(MO, O);
+ if (Subtarget->isPICStyleRIPRel())
+ O << "(%rip)";
+ return false;
+ }
+ if (MO.isReg()) {
+ O << '(';
+ printOperand(MI, OpNo, O);
+ O << ')';
+ return false;
+ }
+ return true;
+
+ case 'c': // Don't print "$" before a global var name or constant.
+ if (MO.isImm())
+ O << MO.getImm();
+ else if (MO.isGlobal() || MO.isCPI() || MO.isJTI() || MO.isSymbol())
+ printSymbolOperand(MO, O);
+ else
+ printOperand(MI, OpNo, O);
+ return false;
+
+ case 'A': // Print '*' before a register (it must be a register)
+ if (MO.isReg()) {
+ O << '*';
+ printOperand(MI, OpNo, O);
+ return false;
+ }
+ return true;
+
+ case 'b': // Print QImode register
+ case 'h': // Print QImode high register
+ case 'w': // Print HImode register
+ case 'k': // Print SImode register
+ case 'q': // Print DImode register
+ if (MO.isReg())
+ return printAsmMRegister(MO, ExtraCode[0], O);
+ printOperand(MI, OpNo, O);
+ return false;
+
+ case 'P': // This is the operand of a call, treat specially.
+ printPCRelImm(MI, OpNo, O);
+ return false;
+
+ case 'n': // Negate the immediate or print a '-' before the operand.
+ // Note: this is a temporary solution. It should be handled target
+ // independently as part of the 'MC' work.
+ if (MO.isImm()) {
+ O << -MO.getImm();
+ return false;
+ }
+ O << '-';
+ }
+ }
+
+ printOperand(MI, OpNo, O, /*Modifier*/ 0, AsmVariant);
+ return false;
+}
+
+bool X86AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
+ unsigned OpNo, unsigned AsmVariant,
+ const char *ExtraCode,
+ raw_ostream &O) {
+ if (AsmVariant) {
+ printIntelMemReference(MI, OpNo, O);
+ return false;
+ }
+
+ if (ExtraCode && ExtraCode[0]) {
+ if (ExtraCode[1] != 0) return true; // Unknown modifier.
+
+ switch (ExtraCode[0]) {
+ default: return true; // Unknown modifier.
+ case 'b': // Print QImode register
+ case 'h': // Print QImode high register
+ case 'w': // Print HImode register
+ case 'k': // Print SImode register
+ case 'q': // Print SImode register
+ // These only apply to registers, ignore on mem.
+ break;
+ case 'H':
+ printMemReference(MI, OpNo, O, "H");
+ return false;
+ case 'P': // Don't print @PLT, but do print as memory.
+ printMemReference(MI, OpNo, O, "no-rip");
+ return false;
+ }
+ }
+ printMemReference(MI, OpNo, O);
+ return false;
+}
+
+void X86AsmPrinter::EmitStartOfAsmFile(Module &M) {
+ if (Subtarget->isTargetEnvMacho())
+ OutStreamer.SwitchSection(getObjFileLowering().getTextSection());
+}
+
+
+void X86AsmPrinter::EmitEndOfAsmFile(Module &M) {
+ if (Subtarget->isTargetEnvMacho()) {
+ // All darwin targets use mach-o.
+ MachineModuleInfoMachO &MMIMacho =
+ MMI->getObjFileInfo<MachineModuleInfoMachO>();
+
+ // Output stubs for dynamically-linked functions.
+ MachineModuleInfoMachO::SymbolListTy Stubs;
+
+ Stubs = MMIMacho.GetFnStubList();
+ if (!Stubs.empty()) {
+ const MCSection *TheSection =
+ OutContext.getMachOSection("__IMPORT", "__jump_table",
+ MCSectionMachO::S_SYMBOL_STUBS |
+ MCSectionMachO::S_ATTR_SELF_MODIFYING_CODE |
+ MCSectionMachO::S_ATTR_PURE_INSTRUCTIONS,
+ 5, SectionKind::getMetadata());
+ OutStreamer.SwitchSection(TheSection);
+
+ for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
+ // L_foo$stub:
+ OutStreamer.EmitLabel(Stubs[i].first);
+ // .indirect_symbol _foo
+ OutStreamer.EmitSymbolAttribute(Stubs[i].second.getPointer(),
+ MCSA_IndirectSymbol);
+ // hlt; hlt; hlt; hlt; hlt hlt = 0xf4.
+ const char HltInsts[] = "\xf4\xf4\xf4\xf4\xf4";
+ OutStreamer.EmitBytes(StringRef(HltInsts, 5));
+ }
+
+ Stubs.clear();
+ OutStreamer.AddBlankLine();
+ }
+
+ // Output stubs for external and common global variables.
+ Stubs = MMIMacho.GetGVStubList();
+ if (!Stubs.empty()) {
+ const MCSection *TheSection =
+ OutContext.getMachOSection("__IMPORT", "__pointers",
+ MCSectionMachO::S_NON_LAZY_SYMBOL_POINTERS,
+ SectionKind::getMetadata());
+ OutStreamer.SwitchSection(TheSection);
+
+ for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
+ // L_foo$non_lazy_ptr:
+ OutStreamer.EmitLabel(Stubs[i].first);
+ // .indirect_symbol _foo
+ MachineModuleInfoImpl::StubValueTy &MCSym = Stubs[i].second;
+ OutStreamer.EmitSymbolAttribute(MCSym.getPointer(),
+ MCSA_IndirectSymbol);
+ // .long 0
+ if (MCSym.getInt())
+ // External to current translation unit.
+ OutStreamer.EmitIntValue(0, 4/*size*/);
+ else
+ // Internal to current translation unit.
+ //
+ // When we place the LSDA into the TEXT section, the type info
+ // pointers need to be indirect and pc-rel. We accomplish this by
+ // using NLPs. However, sometimes the types are local to the file. So
+ // we need to fill in the value for the NLP in those cases.
+ OutStreamer.EmitValue(MCSymbolRefExpr::Create(MCSym.getPointer(),
+ OutContext), 4/*size*/);
+ }
+ Stubs.clear();
+ OutStreamer.AddBlankLine();
+ }
+
+ Stubs = MMIMacho.GetHiddenGVStubList();
+ if (!Stubs.empty()) {
+ OutStreamer.SwitchSection(getObjFileLowering().getDataSection());
+ EmitAlignment(2);
+
+ for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
+ // L_foo$non_lazy_ptr:
+ OutStreamer.EmitLabel(Stubs[i].first);
+ // .long _foo
+ OutStreamer.EmitValue(MCSymbolRefExpr::
+ Create(Stubs[i].second.getPointer(),
+ OutContext), 4/*size*/);
+ }
+ Stubs.clear();
+ OutStreamer.AddBlankLine();
+ }
+
+ // Funny Darwin hack: This flag tells the linker that no global symbols
+ // contain code that falls through to other global symbols (e.g. the obvious
+ // implementation of multiple entry points). If this doesn't occur, the
+ // linker can safely perform dead code stripping. Since LLVM never
+ // generates code that does this, it is always safe to set.
+ OutStreamer.EmitAssemblerFlag(MCAF_SubsectionsViaSymbols);
+ }
+
+ if (Subtarget->isTargetWindows() && !Subtarget->isTargetCygMing() &&
+ MMI->usesVAFloatArgument()) {
+ StringRef SymbolName = Subtarget->is64Bit() ? "_fltused" : "__fltused";
+ MCSymbol *S = MMI->getContext().GetOrCreateSymbol(SymbolName);
+ OutStreamer.EmitSymbolAttribute(S, MCSA_Global);
+ }
+
+ if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho()) {
+ X86COFFMachineModuleInfo &COFFMMI =
+ MMI->getObjFileInfo<X86COFFMachineModuleInfo>();
+
+ // Emit type information for external functions
+ typedef X86COFFMachineModuleInfo::externals_iterator externals_iterator;
+ for (externals_iterator I = COFFMMI.externals_begin(),
+ E = COFFMMI.externals_end();
+ I != E; ++I) {
+ OutStreamer.BeginCOFFSymbolDef(CurrentFnSym);
+ OutStreamer.EmitCOFFSymbolStorageClass(COFF::IMAGE_SYM_CLASS_EXTERNAL);
+ OutStreamer.EmitCOFFSymbolType(COFF::IMAGE_SYM_DTYPE_FUNCTION
+ << COFF::SCT_COMPLEX_TYPE_SHIFT);
+ OutStreamer.EndCOFFSymbolDef();
+ }
+
+ // Necessary for dllexport support
+ std::vector<const MCSymbol*> DLLExportedFns, DLLExportedGlobals;
+
+ const TargetLoweringObjectFileCOFF &TLOFCOFF =
+ static_cast<const TargetLoweringObjectFileCOFF&>(getObjFileLowering());
+
+ for (Module::const_iterator I = M.begin(), E = M.end(); I != E; ++I)
+ if (I->hasDLLExportLinkage())
+ DLLExportedFns.push_back(Mang->getSymbol(I));
+
+ for (Module::const_global_iterator I = M.global_begin(),
+ E = M.global_end(); I != E; ++I)
+ if (I->hasDLLExportLinkage())
+ DLLExportedGlobals.push_back(Mang->getSymbol(I));
+
+ // Output linker support code for dllexported globals on windows.
+ if (!DLLExportedGlobals.empty() || !DLLExportedFns.empty()) {
+ OutStreamer.SwitchSection(TLOFCOFF.getDrectveSection());
+ SmallString<128> name;
+ for (unsigned i = 0, e = DLLExportedGlobals.size(); i != e; ++i) {
+ if (Subtarget->isTargetWindows())
+ name = " /EXPORT:";
+ else
+ name = " -export:";
+ name += DLLExportedGlobals[i]->getName();
+ if (Subtarget->isTargetWindows())
+ name += ",DATA";
+ else
+ name += ",data";
+ OutStreamer.EmitBytes(name);
+ }
+
+ for (unsigned i = 0, e = DLLExportedFns.size(); i != e; ++i) {
+ if (Subtarget->isTargetWindows())
+ name = " /EXPORT:";
+ else
+ name = " -export:";
+ name += DLLExportedFns[i]->getName();
+ OutStreamer.EmitBytes(name);
+ }
+ }
+ }
+
+ if (Subtarget->isTargetELF()) {
+ const TargetLoweringObjectFileELF &TLOFELF =
+ static_cast<const TargetLoweringObjectFileELF &>(getObjFileLowering());
+
+ MachineModuleInfoELF &MMIELF = MMI->getObjFileInfo<MachineModuleInfoELF>();
+
+ // Output stubs for external and common global variables.
+ MachineModuleInfoELF::SymbolListTy Stubs = MMIELF.GetGVStubList();
+ if (!Stubs.empty()) {
+ OutStreamer.SwitchSection(TLOFELF.getDataRelSection());
+ const DataLayout *TD = TM.getDataLayout();
+
+ for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
+ OutStreamer.EmitLabel(Stubs[i].first);
+ OutStreamer.EmitSymbolValue(Stubs[i].second.getPointer(),
+ TD->getPointerSize());
+ }
+ Stubs.clear();
+ }
+ }
+}
+
+MachineLocation
+X86AsmPrinter::getDebugValueLocation(const MachineInstr *MI) const {
+ MachineLocation Location;
+ assert (MI->getNumOperands() == 7 && "Invalid no. of machine operands!");
+ // Frame address. Currently handles register +- offset only.
+
+ if (MI->getOperand(0).isReg() && MI->getOperand(3).isImm())
+ Location.set(MI->getOperand(0).getReg(), MI->getOperand(3).getImm());
+ else {
+ DEBUG(dbgs() << "DBG_VALUE instruction ignored! " << *MI << "\n");
+ }
+ return Location;
+}
+
+void X86AsmPrinter::PrintDebugValueComment(const MachineInstr *MI,
+ raw_ostream &O) {
+ // Only the target-dependent form of DBG_VALUE should get here.
+ // Referencing the offset and metadata as NOps-2 and NOps-1 is
+ // probably portable to other targets; frame pointer location is not.
+ unsigned NOps = MI->getNumOperands();
+ assert(NOps==7);
+ O << '\t' << MAI->getCommentString() << "DEBUG_VALUE: ";
+ // cast away const; DIetc do not take const operands for some reason.
+ DIVariable V(const_cast<MDNode *>(MI->getOperand(NOps-1).getMetadata()));
+ if (V.getContext().isSubprogram())
+ O << DISubprogram(V.getContext()).getDisplayName() << ":";
+ O << V.getName();
+ O << " <- ";
+ // Frame address. Currently handles register +- offset only.
+ O << '[';
+ if (MI->getOperand(0).isReg() && MI->getOperand(0).getReg())
+ printOperand(MI, 0, O);
+ else
+ O << "undef";
+ O << '+'; printOperand(MI, 3, O);
+ O << ']';
+ O << "+";
+ printOperand(MI, NOps-2, O);
+}
+
+
+
+//===----------------------------------------------------------------------===//
+// Target Registry Stuff
+//===----------------------------------------------------------------------===//
+
+// Force static initialization.
+extern "C" void LLVMInitializeX86AsmPrinter() {
+ RegisterAsmPrinter<X86AsmPrinter> X(TheX86_32Target);
+ RegisterAsmPrinter<X86AsmPrinter> Y(TheX86_64Target);
+}
diff --git a/contrib/llvm/lib/Target/X86/X86AsmPrinter.h b/contrib/llvm/lib/Target/X86/X86AsmPrinter.h
new file mode 100644
index 0000000..bc7496b
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86AsmPrinter.h
@@ -0,0 +1,79 @@
+//===-- X86AsmPrinter.h - X86 implementation of AsmPrinter ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86ASMPRINTER_H
+#define X86ASMPRINTER_H
+
+#include "X86.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86TargetMachine.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+
+class MCStreamer;
+
+class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter {
+ const X86Subtarget *Subtarget;
+ public:
+ explicit X86AsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
+ : AsmPrinter(TM, Streamer) {
+ Subtarget = &TM.getSubtarget<X86Subtarget>();
+ }
+
+ virtual const char *getPassName() const LLVM_OVERRIDE {
+ return "X86 Assembly / Object Emitter";
+ }
+
+ const X86Subtarget &getSubtarget() const { return *Subtarget; }
+
+ virtual void EmitStartOfAsmFile(Module &M) LLVM_OVERRIDE;
+
+ virtual void EmitEndOfAsmFile(Module &M) LLVM_OVERRIDE;
+
+ virtual void EmitInstruction(const MachineInstr *MI) LLVM_OVERRIDE;
+
+ void printSymbolOperand(const MachineOperand &MO, raw_ostream &O);
+
+ // These methods are used by the tablegen'erated instruction printer.
+ void printOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O,
+ const char *Modifier = 0, unsigned AsmVariant = 0);
+ void printPCRelImm(const MachineInstr *MI, unsigned OpNo, raw_ostream &O);
+
+ bool printAsmMRegister(const MachineOperand &MO, char Mode, raw_ostream &O);
+ virtual bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant, const char *ExtraCode,
+ raw_ostream &OS) LLVM_OVERRIDE;
+ virtual bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant, const char *ExtraCode,
+ raw_ostream &OS) LLVM_OVERRIDE;
+
+ void printMemReference(const MachineInstr *MI, unsigned Op, raw_ostream &O,
+ const char *Modifier=NULL);
+ void printLeaMemReference(const MachineInstr *MI, unsigned Op, raw_ostream &O,
+ const char *Modifier=NULL);
+
+ void printIntelMemReference(const MachineInstr *MI, unsigned Op,
+ raw_ostream &O, const char *Modifier=NULL,
+ unsigned AsmVariant = 1);
+
+ virtual bool runOnMachineFunction(MachineFunction &F) LLVM_OVERRIDE;
+
+ void PrintDebugValueComment(const MachineInstr *MI, raw_ostream &OS);
+
+ virtual MachineLocation
+ getDebugValueLocation(const MachineInstr *MI) const LLVM_OVERRIDE;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86COFFMachineModuleInfo.cpp b/contrib/llvm/lib/Target/X86/X86COFFMachineModuleInfo.cpp
new file mode 100644
index 0000000..6a6125b
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86COFFMachineModuleInfo.cpp
@@ -0,0 +1,19 @@
+//===-- X86COFFMachineModuleInfo.cpp - X86 COFF MMI Impl ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is an MMI implementation for X86 COFF (windows) targets.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86COFFMachineModuleInfo.h"
+using namespace llvm;
+
+
+X86COFFMachineModuleInfo::~X86COFFMachineModuleInfo() {
+}
diff --git a/contrib/llvm/lib/Target/X86/X86COFFMachineModuleInfo.h b/contrib/llvm/lib/Target/X86/X86COFFMachineModuleInfo.h
new file mode 100644
index 0000000..0dfeb42
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86COFFMachineModuleInfo.h
@@ -0,0 +1,46 @@
+//===-- X86coffmachinemoduleinfo.h - X86 COFF MMI Impl ----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is an MMI implementation for X86 COFF (windows) targets.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86COFF_MACHINEMODULEINFO_H
+#define X86COFF_MACHINEMODULEINFO_H
+
+#include "X86MachineFunctionInfo.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+
+namespace llvm {
+ class X86MachineFunctionInfo;
+ class DataLayout;
+
+/// X86COFFMachineModuleInfo - This is a MachineModuleInfoImpl implementation
+/// for X86 COFF targets.
+class X86COFFMachineModuleInfo : public MachineModuleInfoImpl {
+ DenseSet<MCSymbol const *> Externals;
+public:
+ X86COFFMachineModuleInfo(const MachineModuleInfo &) {}
+ virtual ~X86COFFMachineModuleInfo();
+
+ void addExternalFunction(MCSymbol* Symbol) {
+ Externals.insert(Symbol);
+ }
+
+ typedef DenseSet<MCSymbol const *>::const_iterator externals_iterator;
+ externals_iterator externals_begin() const { return Externals.begin(); }
+ externals_iterator externals_end() const { return Externals.end(); }
+};
+
+
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86CallingConv.td b/contrib/llvm/lib/Target/X86/X86CallingConv.td
new file mode 100644
index 0000000..9eafbd5
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86CallingConv.td
@@ -0,0 +1,537 @@
+//===-- X86CallingConv.td - Calling Conventions X86 32/64 --*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This describes the calling conventions for the X86-32 and X86-64
+// architectures.
+//
+//===----------------------------------------------------------------------===//
+
+/// CCIfSubtarget - Match if the current subtarget has a feature F.
+class CCIfSubtarget<string F, CCAction A>
+ : CCIf<!strconcat("State.getTarget().getSubtarget<X86Subtarget>().", F), A>;
+
+//===----------------------------------------------------------------------===//
+// Return Value Calling Conventions
+//===----------------------------------------------------------------------===//
+
+// Return-value conventions common to all X86 CC's.
+def RetCC_X86Common : CallingConv<[
+ // Scalar values are returned in AX first, then DX. For i8, the ABI
+ // requires the values to be in AL and AH, however this code uses AL and DL
+ // instead. This is because using AH for the second register conflicts with
+ // the way LLVM does multiple return values -- a return of {i16,i8} would end
+ // up in AX and AH, which overlap. Front-ends wishing to conform to the ABI
+ // for functions that return two i8 values are currently expected to pack the
+ // values into an i16 (which uses AX, and thus AL:AH).
+ //
+ // For code that doesn't care about the ABI, we allow returning more than two
+ // integer values in registers.
+ CCIfType<[i8] , CCAssignToReg<[AL, DL, CL]>>,
+ CCIfType<[i16], CCAssignToReg<[AX, DX, CX]>>,
+ CCIfType<[i32], CCAssignToReg<[EAX, EDX, ECX]>>,
+ CCIfType<[i64], CCAssignToReg<[RAX, RDX, RCX]>>,
+
+ // Vector types are returned in XMM0 and XMM1, when they fit. XMM2 and XMM3
+ // can only be used by ABI non-compliant code. If the target doesn't have XMM
+ // registers, it won't have vector types.
+ CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ CCAssignToReg<[XMM0,XMM1,XMM2,XMM3]>>,
+
+ // 256-bit vectors are returned in YMM0 and XMM1, when they fit. YMM2 and YMM3
+ // can only be used by ABI non-compliant code. This vector type is only
+ // supported while using the AVX target feature.
+ CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+ CCAssignToReg<[YMM0,YMM1,YMM2,YMM3]>>,
+
+ // MMX vector types are always returned in MM0. If the target doesn't have
+ // MM0, it doesn't support these vector types.
+ CCIfType<[x86mmx], CCAssignToReg<[MM0]>>,
+
+ // Long double types are always returned in ST0 (even with SSE).
+ CCIfType<[f80], CCAssignToReg<[ST0, ST1]>>
+]>;
+
+// X86-32 C return-value convention.
+def RetCC_X86_32_C : CallingConv<[
+ // The X86-32 calling convention returns FP values in ST0, unless marked
+ // with "inreg" (used here to distinguish one kind of reg from another,
+ // weirdly; this is really the sse-regparm calling convention) in which
+ // case they use XMM0, otherwise it is the same as the common X86 calling
+ // conv.
+ CCIfInReg<CCIfSubtarget<"hasSSE2()",
+ CCIfType<[f32, f64], CCAssignToReg<[XMM0,XMM1,XMM2]>>>>,
+ CCIfType<[f32,f64], CCAssignToReg<[ST0, ST1]>>,
+ CCDelegateTo<RetCC_X86Common>
+]>;
+
+// X86-32 FastCC return-value convention.
+def RetCC_X86_32_Fast : CallingConv<[
+ // The X86-32 fastcc returns 1, 2, or 3 FP values in XMM0-2 if the target has
+ // SSE2.
+ // This can happen when a float, 2 x float, or 3 x float vector is split by
+ // target lowering, and is returned in 1-3 sse regs.
+ CCIfType<[f32], CCIfSubtarget<"hasSSE2()", CCAssignToReg<[XMM0,XMM1,XMM2]>>>,
+ CCIfType<[f64], CCIfSubtarget<"hasSSE2()", CCAssignToReg<[XMM0,XMM1,XMM2]>>>,
+
+ // For integers, ECX can be used as an extra return register
+ CCIfType<[i8], CCAssignToReg<[AL, DL, CL]>>,
+ CCIfType<[i16], CCAssignToReg<[AX, DX, CX]>>,
+ CCIfType<[i32], CCAssignToReg<[EAX, EDX, ECX]>>,
+
+ // Otherwise, it is the same as the common X86 calling convention.
+ CCDelegateTo<RetCC_X86Common>
+]>;
+
+// Intel_OCL_BI return-value convention.
+def RetCC_Intel_OCL_BI : CallingConv<[
+ // Vector types are returned in XMM0,XMM1,XMMM2 and XMM3.
+ CCIfType<[f32, f64, v4i32, v2i64, v4f32, v2f64],
+ CCAssignToReg<[XMM0,XMM1,XMM2,XMM3]>>,
+
+ // 256-bit FP vectors
+ // No more than 4 registers
+ CCIfType<[v8f32, v4f64, v8i32, v4i64],
+ CCAssignToReg<[YMM0,YMM1,YMM2,YMM3]>>,
+
+ // i32, i64 in the standard way
+ CCDelegateTo<RetCC_X86Common>
+]>;
+
+// X86-32 HiPE return-value convention.
+def RetCC_X86_32_HiPE : CallingConv<[
+ // Promote all types to i32
+ CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+ // Return: HP, P, VAL1, VAL2
+ CCIfType<[i32], CCAssignToReg<[ESI, EBP, EAX, EDX]>>
+]>;
+
+// X86-64 C return-value convention.
+def RetCC_X86_64_C : CallingConv<[
+ // The X86-64 calling convention always returns FP values in XMM0.
+ CCIfType<[f32], CCAssignToReg<[XMM0, XMM1]>>,
+ CCIfType<[f64], CCAssignToReg<[XMM0, XMM1]>>,
+
+ // MMX vector types are always returned in XMM0.
+ CCIfType<[x86mmx], CCAssignToReg<[XMM0, XMM1]>>,
+ CCDelegateTo<RetCC_X86Common>
+]>;
+
+// X86-Win64 C return-value convention.
+def RetCC_X86_Win64_C : CallingConv<[
+ // The X86-Win64 calling convention always returns __m64 values in RAX.
+ CCIfType<[x86mmx], CCBitConvertToType<i64>>,
+
+ // Otherwise, everything is the same as 'normal' X86-64 C CC.
+ CCDelegateTo<RetCC_X86_64_C>
+]>;
+
+// X86-64 HiPE return-value convention.
+def RetCC_X86_64_HiPE : CallingConv<[
+ // Promote all types to i64
+ CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,
+
+ // Return: HP, P, VAL1, VAL2
+ CCIfType<[i64], CCAssignToReg<[R15, RBP, RAX, RDX]>>
+]>;
+
+// This is the root return-value convention for the X86-32 backend.
+def RetCC_X86_32 : CallingConv<[
+ // If FastCC, use RetCC_X86_32_Fast.
+ CCIfCC<"CallingConv::Fast", CCDelegateTo<RetCC_X86_32_Fast>>,
+ // If HiPE, use RetCC_X86_32_HiPE.
+ CCIfCC<"CallingConv::HiPE", CCDelegateTo<RetCC_X86_32_HiPE>>,
+
+ // Otherwise, use RetCC_X86_32_C.
+ CCDelegateTo<RetCC_X86_32_C>
+]>;
+
+// This is the root return-value convention for the X86-64 backend.
+def RetCC_X86_64 : CallingConv<[
+ // HiPE uses RetCC_X86_64_HiPE
+ CCIfCC<"CallingConv::HiPE", CCDelegateTo<RetCC_X86_64_HiPE>>,
+ // Mingw64 and native Win64 use Win64 CC
+ CCIfSubtarget<"isTargetWin64()", CCDelegateTo<RetCC_X86_Win64_C>>,
+
+ // Otherwise, drop to normal X86-64 CC
+ CCDelegateTo<RetCC_X86_64_C>
+]>;
+
+// This is the return-value convention used for the entire X86 backend.
+def RetCC_X86 : CallingConv<[
+
+ // Check if this is the Intel OpenCL built-ins calling convention
+ CCIfCC<"CallingConv::Intel_OCL_BI", CCDelegateTo<RetCC_Intel_OCL_BI>>,
+
+ CCIfSubtarget<"is64Bit()", CCDelegateTo<RetCC_X86_64>>,
+ CCDelegateTo<RetCC_X86_32>
+]>;
+
+//===----------------------------------------------------------------------===//
+// X86-64 Argument Calling Conventions
+//===----------------------------------------------------------------------===//
+
+def CC_X86_64_C : CallingConv<[
+ // Handles byval parameters.
+ CCIfByVal<CCPassByVal<8, 8>>,
+
+ // Promote i8/i16 arguments to i32.
+ CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+ // The 'nest' parameter, if any, is passed in R10.
+ CCIfNest<CCAssignToReg<[R10]>>,
+
+ // The first 6 integer arguments are passed in integer registers.
+ CCIfType<[i32], CCAssignToReg<[EDI, ESI, EDX, ECX, R8D, R9D]>>,
+ CCIfType<[i64], CCAssignToReg<[RDI, RSI, RDX, RCX, R8 , R9 ]>>,
+
+ // The first 8 MMX vector arguments are passed in XMM registers on Darwin.
+ CCIfType<[x86mmx],
+ CCIfSubtarget<"isTargetDarwin()",
+ CCIfSubtarget<"hasSSE2()",
+ CCPromoteToType<v2i64>>>>,
+
+ // The first 8 FP/Vector arguments are passed in XMM registers.
+ CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ CCIfSubtarget<"hasSSE1()",
+ CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7]>>>,
+
+ // The first 8 256-bit vector arguments are passed in YMM registers, unless
+ // this is a vararg function.
+ // FIXME: This isn't precisely correct; the x86-64 ABI document says that
+ // fixed arguments to vararg functions are supposed to be passed in
+ // registers. Actually modeling that would be a lot of work, though.
+ CCIfNotVarArg<CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+ CCIfSubtarget<"hasAVX()",
+ CCAssignToReg<[YMM0, YMM1, YMM2, YMM3,
+ YMM4, YMM5, YMM6, YMM7]>>>>,
+
+ // Integer/FP values get stored in stack slots that are 8 bytes in size and
+ // 8-byte aligned if there are no more registers to hold them.
+ CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>,
+
+ // Long doubles get stack slots whose size and alignment depends on the
+ // subtarget.
+ CCIfType<[f80], CCAssignToStack<0, 0>>,
+
+ // Vectors get 16-byte stack slots that are 16-byte aligned.
+ CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>,
+
+ // 256-bit vectors get 32-byte stack slots that are 32-byte aligned.
+ CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+ CCAssignToStack<32, 32>>
+]>;
+
+// Calling convention used on Win64
+def CC_X86_Win64_C : CallingConv<[
+ // FIXME: Handle byval stuff.
+ // FIXME: Handle varargs.
+
+ // Promote i8/i16 arguments to i32.
+ CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+ // The 'nest' parameter, if any, is passed in R10.
+ CCIfNest<CCAssignToReg<[R10]>>,
+
+ // 128 bit vectors are passed by pointer
+ CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCPassIndirect<i64>>,
+
+
+ // 256 bit vectors are passed by pointer
+ CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], CCPassIndirect<i64>>,
+
+ // The first 4 MMX vector arguments are passed in GPRs.
+ CCIfType<[x86mmx], CCBitConvertToType<i64>>,
+
+ // The first 4 integer arguments are passed in integer registers.
+ CCIfType<[i32], CCAssignToRegWithShadow<[ECX , EDX , R8D , R9D ],
+ [XMM0, XMM1, XMM2, XMM3]>>,
+
+ // Do not pass the sret argument in RCX, the Win64 thiscall calling
+ // convention requires "this" to be passed in RCX.
+ CCIfCC<"CallingConv::X86_ThisCall",
+ CCIfSRet<CCIfType<[i64], CCAssignToRegWithShadow<[RDX , R8 , R9 ],
+ [XMM1, XMM2, XMM3]>>>>,
+
+ CCIfType<[i64], CCAssignToRegWithShadow<[RCX , RDX , R8 , R9 ],
+ [XMM0, XMM1, XMM2, XMM3]>>,
+
+ // The first 4 FP/Vector arguments are passed in XMM registers.
+ CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ CCAssignToRegWithShadow<[XMM0, XMM1, XMM2, XMM3],
+ [RCX , RDX , R8 , R9 ]>>,
+
+ // Integer/FP values get stored in stack slots that are 8 bytes in size and
+ // 8-byte aligned if there are no more registers to hold them.
+ CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>,
+
+ // Long doubles get stack slots whose size and alignment depends on the
+ // subtarget.
+ CCIfType<[f80], CCAssignToStack<0, 0>>
+]>;
+
+def CC_X86_64_GHC : CallingConv<[
+ // Promote i8/i16/i32 arguments to i64.
+ CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,
+
+ // Pass in STG registers: Base, Sp, Hp, R1, R2, R3, R4, R5, R6, SpLim
+ CCIfType<[i64],
+ CCAssignToReg<[R13, RBP, R12, RBX, R14, RSI, RDI, R8, R9, R15]>>,
+
+ // Pass in STG registers: F1, F2, F3, F4, D1, D2
+ CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ CCIfSubtarget<"hasSSE1()",
+ CCAssignToReg<[XMM1, XMM2, XMM3, XMM4, XMM5, XMM6]>>>
+]>;
+
+def CC_X86_64_HiPE : CallingConv<[
+ // Promote i8/i16/i32 arguments to i64.
+ CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,
+
+ // Pass in VM's registers: HP, P, ARG0, ARG1, ARG2, ARG3
+ CCIfType<[i64], CCAssignToReg<[R15, RBP, RSI, RDX, RCX, R8]>>,
+
+ // Integer/FP values get stored in stack slots that are 8 bytes in size and
+ // 8-byte aligned if there are no more registers to hold them.
+ CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>
+]>;
+
+//===----------------------------------------------------------------------===//
+// X86 C Calling Convention
+//===----------------------------------------------------------------------===//
+
+/// CC_X86_32_Common - In all X86-32 calling conventions, extra integers and FP
+/// values are spilled on the stack, and the first 4 vector values go in XMM
+/// regs.
+def CC_X86_32_Common : CallingConv<[
+ // Handles byval parameters.
+ CCIfByVal<CCPassByVal<4, 4>>,
+
+ // The first 3 float or double arguments, if marked 'inreg' and if the call
+ // is not a vararg call and if SSE2 is available, are passed in SSE registers.
+ CCIfNotVarArg<CCIfInReg<CCIfType<[f32,f64],
+ CCIfSubtarget<"hasSSE2()",
+ CCAssignToReg<[XMM0,XMM1,XMM2]>>>>>,
+
+ // The first 3 __m64 vector arguments are passed in mmx registers if the
+ // call is not a vararg call.
+ CCIfNotVarArg<CCIfType<[x86mmx],
+ CCAssignToReg<[MM0, MM1, MM2]>>>,
+
+ // Integer/Float values get stored in stack slots that are 4 bytes in
+ // size and 4-byte aligned.
+ CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
+
+ // Doubles get 8-byte slots that are 4-byte aligned.
+ CCIfType<[f64], CCAssignToStack<8, 4>>,
+
+ // Long doubles get slots whose size depends on the subtarget.
+ CCIfType<[f80], CCAssignToStack<0, 4>>,
+
+ // The first 4 SSE vector arguments are passed in XMM registers.
+ CCIfNotVarArg<CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>>,
+
+ // The first 4 AVX 256-bit vector arguments are passed in YMM registers.
+ CCIfNotVarArg<CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+ CCIfSubtarget<"hasAVX()",
+ CCAssignToReg<[YMM0, YMM1, YMM2, YMM3]>>>>,
+
+ // Other SSE vectors get 16-byte stack slots that are 16-byte aligned.
+ CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>,
+
+ // 256-bit AVX vectors get 32-byte stack slots that are 32-byte aligned.
+ CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+ CCAssignToStack<32, 32>>,
+
+ // __m64 vectors get 8-byte stack slots that are 4-byte aligned. They are
+ // passed in the parameter area.
+ CCIfType<[x86mmx], CCAssignToStack<8, 4>>]>;
+
+def CC_X86_32_C : CallingConv<[
+ // Promote i8/i16 arguments to i32.
+ CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+ // The 'nest' parameter, if any, is passed in ECX.
+ CCIfNest<CCAssignToReg<[ECX]>>,
+
+ // The first 3 integer arguments, if marked 'inreg' and if the call is not
+ // a vararg call, are passed in integer registers.
+ CCIfNotVarArg<CCIfInReg<CCIfType<[i32], CCAssignToReg<[EAX, EDX, ECX]>>>>,
+
+ // Otherwise, same as everything else.
+ CCDelegateTo<CC_X86_32_Common>
+]>;
+
+def CC_X86_32_FastCall : CallingConv<[
+ // Promote i8/i16 arguments to i32.
+ CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+ // The 'nest' parameter, if any, is passed in EAX.
+ CCIfNest<CCAssignToReg<[EAX]>>,
+
+ // The first 2 integer arguments are passed in ECX/EDX
+ CCIfInReg<CCIfType<[i32], CCAssignToReg<[ECX, EDX]>>>,
+
+ // Otherwise, same as everything else.
+ CCDelegateTo<CC_X86_32_Common>
+]>;
+
+def CC_X86_32_ThisCall : CallingConv<[
+ // Promote i8/i16 arguments to i32.
+ CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+ // Pass sret arguments indirectly through stack.
+ CCIfSRet<CCAssignToStack<4, 4>>,
+
+ // The first integer argument is passed in ECX
+ CCIfType<[i32], CCAssignToReg<[ECX]>>,
+
+ // Otherwise, same as everything else.
+ CCDelegateTo<CC_X86_32_Common>
+]>;
+
+def CC_X86_32_FastCC : CallingConv<[
+ // Handles byval parameters. Note that we can't rely on the delegation
+ // to CC_X86_32_Common for this because that happens after code that
+ // puts arguments in registers.
+ CCIfByVal<CCPassByVal<4, 4>>,
+
+ // Promote i8/i16 arguments to i32.
+ CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+ // The 'nest' parameter, if any, is passed in EAX.
+ CCIfNest<CCAssignToReg<[EAX]>>,
+
+ // The first 2 integer arguments are passed in ECX/EDX
+ CCIfType<[i32], CCAssignToReg<[ECX, EDX]>>,
+
+ // The first 3 float or double arguments, if the call is not a vararg
+ // call and if SSE2 is available, are passed in SSE registers.
+ CCIfNotVarArg<CCIfType<[f32,f64],
+ CCIfSubtarget<"hasSSE2()",
+ CCAssignToReg<[XMM0,XMM1,XMM2]>>>>,
+
+ // Doubles get 8-byte slots that are 8-byte aligned.
+ CCIfType<[f64], CCAssignToStack<8, 8>>,
+
+ // Otherwise, same as everything else.
+ CCDelegateTo<CC_X86_32_Common>
+]>;
+
+def CC_X86_32_GHC : CallingConv<[
+ // Promote i8/i16 arguments to i32.
+ CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+ // Pass in STG registers: Base, Sp, Hp, R1
+ CCIfType<[i32], CCAssignToReg<[EBX, EBP, EDI, ESI]>>
+]>;
+
+def CC_X86_32_HiPE : CallingConv<[
+ // Promote i8/i16 arguments to i32.
+ CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+ // Pass in VM's registers: HP, P, ARG0, ARG1, ARG2
+ CCIfType<[i32], CCAssignToReg<[ESI, EBP, EAX, EDX, ECX]>>,
+
+ // Integer/Float values get stored in stack slots that are 4 bytes in
+ // size and 4-byte aligned.
+ CCIfType<[i32, f32], CCAssignToStack<4, 4>>
+]>;
+
+// X86-64 Intel OpenCL built-ins calling convention.
+def CC_Intel_OCL_BI : CallingConv<[
+
+ CCIfType<[i32], CCIfSubtarget<"isTargetWin64()", CCAssignToReg<[ECX, EDX, R8D, R9D]>>>,
+ CCIfType<[i64], CCIfSubtarget<"isTargetWin64()", CCAssignToReg<[RCX, RDX, R8, R9 ]>>>,
+
+ CCIfType<[i32], CCIfSubtarget<"is64Bit()", CCAssignToReg<[EDI, ESI, EDX, ECX]>>>,
+ CCIfType<[i64], CCIfSubtarget<"is64Bit()", CCAssignToReg<[RDI, RSI, RDX, RCX]>>>,
+
+ CCIfType<[i32], CCAssignToStack<4, 4>>,
+
+ // The SSE vector arguments are passed in XMM registers.
+ CCIfType<[f32, f64, v4i32, v2i64, v4f32, v2f64],
+ CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>,
+
+ // The 256-bit vector arguments are passed in YMM registers.
+ CCIfType<[v8f32, v4f64, v8i32, v4i64],
+ CCAssignToReg<[YMM0, YMM1, YMM2, YMM3]>>,
+
+ CCIfSubtarget<"isTargetWin64()", CCDelegateTo<CC_X86_Win64_C>>,
+ CCIfSubtarget<"is64Bit()", CCDelegateTo<CC_X86_64_C>>,
+ CCDelegateTo<CC_X86_32_C>
+]>;
+
+//===----------------------------------------------------------------------===//
+// X86 Root Argument Calling Conventions
+//===----------------------------------------------------------------------===//
+
+// This is the root argument convention for the X86-32 backend.
+def CC_X86_32 : CallingConv<[
+ CCIfCC<"CallingConv::X86_FastCall", CCDelegateTo<CC_X86_32_FastCall>>,
+ CCIfCC<"CallingConv::X86_ThisCall", CCDelegateTo<CC_X86_32_ThisCall>>,
+ CCIfCC<"CallingConv::Fast", CCDelegateTo<CC_X86_32_FastCC>>,
+ CCIfCC<"CallingConv::GHC", CCDelegateTo<CC_X86_32_GHC>>,
+ CCIfCC<"CallingConv::HiPE", CCDelegateTo<CC_X86_32_HiPE>>,
+
+ // Otherwise, drop to normal X86-32 CC
+ CCDelegateTo<CC_X86_32_C>
+]>;
+
+// This is the root argument convention for the X86-64 backend.
+def CC_X86_64 : CallingConv<[
+ CCIfCC<"CallingConv::GHC", CCDelegateTo<CC_X86_64_GHC>>,
+ CCIfCC<"CallingConv::HiPE", CCDelegateTo<CC_X86_64_HiPE>>,
+
+ // Mingw64 and native Win64 use Win64 CC
+ CCIfSubtarget<"isTargetWin64()", CCDelegateTo<CC_X86_Win64_C>>,
+
+ // Otherwise, drop to normal X86-64 CC
+ CCDelegateTo<CC_X86_64_C>
+]>;
+
+// This is the argument convention used for the entire X86 backend.
+def CC_X86 : CallingConv<[
+ CCIfCC<"CallingConv::Intel_OCL_BI", CCDelegateTo<CC_Intel_OCL_BI>>,
+ CCIfSubtarget<"is64Bit()", CCDelegateTo<CC_X86_64>>,
+ CCDelegateTo<CC_X86_32>
+]>;
+
+//===----------------------------------------------------------------------===//
+// Callee-saved Registers.
+//===----------------------------------------------------------------------===//
+
+def CSR_NoRegs : CalleeSavedRegs<(add)>;
+
+def CSR_32 : CalleeSavedRegs<(add ESI, EDI, EBX, EBP)>;
+def CSR_64 : CalleeSavedRegs<(add RBX, R12, R13, R14, R15, RBP)>;
+
+def CSR_32EHRet : CalleeSavedRegs<(add EAX, EDX, CSR_32)>;
+def CSR_64EHRet : CalleeSavedRegs<(add RAX, RDX, CSR_64)>;
+
+def CSR_Win64 : CalleeSavedRegs<(add RBX, RBP, RDI, RSI, R12, R13, R14, R15,
+ (sequence "XMM%u", 6, 15))>;
+
+def CSR_MostRegs_64 : CalleeSavedRegs<(add RBX, RCX, RDX, RSI, RDI, R8, R9, R10,
+ R11, R12, R13, R14, R15, RBP,
+ (sequence "XMM%u", 0, 15))>;
+
+// Standard C + YMM6-15
+def CSR_Win64_Intel_OCL_BI_AVX : CalleeSavedRegs<(add RBX, RBP, RDI, RSI, R12,
+ R13, R14, R15,
+ (sequence "YMM%u", 6, 15))>;
+
+//Standard C + XMM 8-15
+def CSR_64_Intel_OCL_BI : CalleeSavedRegs<(add CSR_64,
+ (sequence "XMM%u", 8, 15))>;
+
+//Standard C + YMM 8-15
+def CSR_64_Intel_OCL_BI_AVX : CalleeSavedRegs<(add CSR_64,
+ (sequence "YMM%u", 8, 15))>;
diff --git a/contrib/llvm/lib/Target/X86/X86CodeEmitter.cpp b/contrib/llvm/lib/Target/X86/X86CodeEmitter.cpp
new file mode 100644
index 0000000..8fea6ed
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86CodeEmitter.cpp
@@ -0,0 +1,1507 @@
+//===-- X86CodeEmitter.cpp - Convert X86 code to machine code -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the pass that transforms the X86 machine instructions into
+// relocatable machine code.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "x86-emitter"
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "X86JITInfo.h"
+#include "X86Relocations.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/JITCodeEmitter.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/PassManager.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+STATISTIC(NumEmitted, "Number of machine instructions emitted");
+
+namespace {
+ template<class CodeEmitter>
+ class Emitter : public MachineFunctionPass {
+ const X86InstrInfo *II;
+ const DataLayout *TD;
+ X86TargetMachine &TM;
+ CodeEmitter &MCE;
+ MachineModuleInfo *MMI;
+ intptr_t PICBaseOffset;
+ bool Is64BitMode;
+ bool IsPIC;
+ public:
+ static char ID;
+ explicit Emitter(X86TargetMachine &tm, CodeEmitter &mce)
+ : MachineFunctionPass(ID), II(0), TD(0), TM(tm),
+ MCE(mce), PICBaseOffset(0), Is64BitMode(false),
+ IsPIC(TM.getRelocationModel() == Reloc::PIC_) {}
+ Emitter(X86TargetMachine &tm, CodeEmitter &mce,
+ const X86InstrInfo &ii, const DataLayout &td, bool is64)
+ : MachineFunctionPass(ID), II(&ii), TD(&td), TM(tm),
+ MCE(mce), PICBaseOffset(0), Is64BitMode(is64),
+ IsPIC(TM.getRelocationModel() == Reloc::PIC_) {}
+
+ bool runOnMachineFunction(MachineFunction &MF);
+
+ virtual const char *getPassName() const {
+ return "X86 Machine Code Emitter";
+ }
+
+ void emitOpcodePrefix(uint64_t TSFlags, int MemOperand,
+ const MachineInstr &MI,
+ const MCInstrDesc *Desc) const;
+
+ void emitVEXOpcodePrefix(uint64_t TSFlags, int MemOperand,
+ const MachineInstr &MI,
+ const MCInstrDesc *Desc) const;
+
+ void emitSegmentOverridePrefix(uint64_t TSFlags,
+ int MemOperand,
+ const MachineInstr &MI) const;
+
+ void emitInstruction(MachineInstr &MI, const MCInstrDesc *Desc);
+
+ void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ AU.addRequired<MachineModuleInfo>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ private:
+ void emitPCRelativeBlockAddress(MachineBasicBlock *MBB);
+ void emitGlobalAddress(const GlobalValue *GV, unsigned Reloc,
+ intptr_t Disp = 0, intptr_t PCAdj = 0,
+ bool Indirect = false);
+ void emitExternalSymbolAddress(const char *ES, unsigned Reloc);
+ void emitConstPoolAddress(unsigned CPI, unsigned Reloc, intptr_t Disp = 0,
+ intptr_t PCAdj = 0);
+ void emitJumpTableAddress(unsigned JTI, unsigned Reloc,
+ intptr_t PCAdj = 0);
+
+ void emitDisplacementField(const MachineOperand *RelocOp, int DispVal,
+ intptr_t Adj = 0, bool IsPCRel = true);
+
+ void emitRegModRMByte(unsigned ModRMReg, unsigned RegOpcodeField);
+ void emitRegModRMByte(unsigned RegOpcodeField);
+ void emitSIBByte(unsigned SS, unsigned Index, unsigned Base);
+ void emitConstant(uint64_t Val, unsigned Size);
+
+ void emitMemModRMByte(const MachineInstr &MI,
+ unsigned Op, unsigned RegOpcodeField,
+ intptr_t PCAdj = 0);
+
+ unsigned getX86RegNum(unsigned RegNo) const {
+ const TargetRegisterInfo *TRI = TM.getRegisterInfo();
+ return TRI->getEncodingValue(RegNo) & 0x7;
+ }
+
+ unsigned char getVEXRegisterEncoding(const MachineInstr &MI,
+ unsigned OpNum) const;
+ };
+
+template<class CodeEmitter>
+ char Emitter<CodeEmitter>::ID = 0;
+} // end anonymous namespace.
+
+/// createX86CodeEmitterPass - Return a pass that emits the collected X86 code
+/// to the specified JITCodeEmitter object.
+FunctionPass *llvm::createX86JITCodeEmitterPass(X86TargetMachine &TM,
+ JITCodeEmitter &JCE) {
+ return new Emitter<JITCodeEmitter>(TM, JCE);
+}
+
+template<class CodeEmitter>
+bool Emitter<CodeEmitter>::runOnMachineFunction(MachineFunction &MF) {
+ MMI = &getAnalysis<MachineModuleInfo>();
+ MCE.setModuleInfo(MMI);
+
+ II = TM.getInstrInfo();
+ TD = TM.getDataLayout();
+ Is64BitMode = TM.getSubtarget<X86Subtarget>().is64Bit();
+ IsPIC = TM.getRelocationModel() == Reloc::PIC_;
+
+ do {
+ DEBUG(dbgs() << "JITTing function '" << MF.getName() << "'\n");
+ MCE.startFunction(MF);
+ for (MachineFunction::iterator MBB = MF.begin(), E = MF.end();
+ MBB != E; ++MBB) {
+ MCE.StartMachineBasicBlock(MBB);
+ for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
+ I != E; ++I) {
+ const MCInstrDesc &Desc = I->getDesc();
+ emitInstruction(*I, &Desc);
+ // MOVPC32r is basically a call plus a pop instruction.
+ if (Desc.getOpcode() == X86::MOVPC32r)
+ emitInstruction(*I, &II->get(X86::POP32r));
+ ++NumEmitted; // Keep track of the # of mi's emitted
+ }
+ }
+ } while (MCE.finishFunction(MF));
+
+ return false;
+}
+
+/// determineREX - Determine if the MachineInstr has to be encoded with a X86-64
+/// REX prefix which specifies 1) 64-bit instructions, 2) non-default operand
+/// size, and 3) use of X86-64 extended registers.
+static unsigned determineREX(const MachineInstr &MI) {
+ unsigned REX = 0;
+ const MCInstrDesc &Desc = MI.getDesc();
+
+ // Pseudo instructions do not need REX prefix byte.
+ if ((Desc.TSFlags & X86II::FormMask) == X86II::Pseudo)
+ return 0;
+ if (Desc.TSFlags & X86II::REX_W)
+ REX |= 1 << 3;
+
+ unsigned NumOps = Desc.getNumOperands();
+ if (NumOps) {
+ bool isTwoAddr = NumOps > 1 &&
+ Desc.getOperandConstraint(1, MCOI::TIED_TO) != -1;
+
+ // If it accesses SPL, BPL, SIL, or DIL, then it requires a 0x40 REX prefix.
+ unsigned i = isTwoAddr ? 1 : 0;
+ for (unsigned e = NumOps; i != e; ++i) {
+ const MachineOperand& MO = MI.getOperand(i);
+ if (MO.isReg()) {
+ unsigned Reg = MO.getReg();
+ if (X86II::isX86_64NonExtLowByteReg(Reg))
+ REX |= 0x40;
+ }
+ }
+
+ switch (Desc.TSFlags & X86II::FormMask) {
+ case X86II::MRMInitReg:
+ if (X86InstrInfo::isX86_64ExtendedReg(MI.getOperand(0)))
+ REX |= (1 << 0) | (1 << 2);
+ break;
+ case X86II::MRMSrcReg: {
+ if (X86InstrInfo::isX86_64ExtendedReg(MI.getOperand(0)))
+ REX |= 1 << 2;
+ i = isTwoAddr ? 2 : 1;
+ for (unsigned e = NumOps; i != e; ++i) {
+ const MachineOperand& MO = MI.getOperand(i);
+ if (X86InstrInfo::isX86_64ExtendedReg(MO))
+ REX |= 1 << 0;
+ }
+ break;
+ }
+ case X86II::MRMSrcMem: {
+ if (X86InstrInfo::isX86_64ExtendedReg(MI.getOperand(0)))
+ REX |= 1 << 2;
+ unsigned Bit = 0;
+ i = isTwoAddr ? 2 : 1;
+ for (; i != NumOps; ++i) {
+ const MachineOperand& MO = MI.getOperand(i);
+ if (MO.isReg()) {
+ if (X86InstrInfo::isX86_64ExtendedReg(MO))
+ REX |= 1 << Bit;
+ Bit++;
+ }
+ }
+ break;
+ }
+ case X86II::MRM0m: case X86II::MRM1m:
+ case X86II::MRM2m: case X86II::MRM3m:
+ case X86II::MRM4m: case X86II::MRM5m:
+ case X86II::MRM6m: case X86II::MRM7m:
+ case X86II::MRMDestMem: {
+ unsigned e = (isTwoAddr ? X86::AddrNumOperands+1 : X86::AddrNumOperands);
+ i = isTwoAddr ? 1 : 0;
+ if (NumOps > e && X86InstrInfo::isX86_64ExtendedReg(MI.getOperand(e)))
+ REX |= 1 << 2;
+ unsigned Bit = 0;
+ for (; i != e; ++i) {
+ const MachineOperand& MO = MI.getOperand(i);
+ if (MO.isReg()) {
+ if (X86InstrInfo::isX86_64ExtendedReg(MO))
+ REX |= 1 << Bit;
+ Bit++;
+ }
+ }
+ break;
+ }
+ default: {
+ if (X86InstrInfo::isX86_64ExtendedReg(MI.getOperand(0)))
+ REX |= 1 << 0;
+ i = isTwoAddr ? 2 : 1;
+ for (unsigned e = NumOps; i != e; ++i) {
+ const MachineOperand& MO = MI.getOperand(i);
+ if (X86InstrInfo::isX86_64ExtendedReg(MO))
+ REX |= 1 << 2;
+ }
+ break;
+ }
+ }
+ }
+ return REX;
+}
+
+
+/// emitPCRelativeBlockAddress - This method keeps track of the information
+/// necessary to resolve the address of this block later and emits a dummy
+/// value.
+///
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitPCRelativeBlockAddress(MachineBasicBlock *MBB) {
+ // Remember where this reference was and where it is to so we can
+ // deal with it later.
+ MCE.addRelocation(MachineRelocation::getBB(MCE.getCurrentPCOffset(),
+ X86::reloc_pcrel_word, MBB));
+ MCE.emitWordLE(0);
+}
+
+/// emitGlobalAddress - Emit the specified address to the code stream assuming
+/// this is part of a "take the address of a global" instruction.
+///
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitGlobalAddress(const GlobalValue *GV,
+ unsigned Reloc,
+ intptr_t Disp /* = 0 */,
+ intptr_t PCAdj /* = 0 */,
+ bool Indirect /* = false */) {
+ intptr_t RelocCST = Disp;
+ if (Reloc == X86::reloc_picrel_word)
+ RelocCST = PICBaseOffset;
+ else if (Reloc == X86::reloc_pcrel_word)
+ RelocCST = PCAdj;
+ MachineRelocation MR = Indirect
+ ? MachineRelocation::getIndirectSymbol(MCE.getCurrentPCOffset(), Reloc,
+ const_cast<GlobalValue *>(GV),
+ RelocCST, false)
+ : MachineRelocation::getGV(MCE.getCurrentPCOffset(), Reloc,
+ const_cast<GlobalValue *>(GV), RelocCST, false);
+ MCE.addRelocation(MR);
+ // The relocated value will be added to the displacement
+ if (Reloc == X86::reloc_absolute_dword)
+ MCE.emitDWordLE(Disp);
+ else
+ MCE.emitWordLE((int32_t)Disp);
+}
+
+/// emitExternalSymbolAddress - Arrange for the address of an external symbol to
+/// be emitted to the current location in the function, and allow it to be PC
+/// relative.
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitExternalSymbolAddress(const char *ES,
+ unsigned Reloc) {
+ intptr_t RelocCST = (Reloc == X86::reloc_picrel_word) ? PICBaseOffset : 0;
+
+ // X86 never needs stubs because instruction selection will always pick
+ // an instruction sequence that is large enough to hold any address
+ // to a symbol.
+ // (see X86ISelLowering.cpp, near 2039: X86TargetLowering::LowerCall)
+ bool NeedStub = false;
+ MCE.addRelocation(MachineRelocation::getExtSym(MCE.getCurrentPCOffset(),
+ Reloc, ES, RelocCST,
+ 0, NeedStub));
+ if (Reloc == X86::reloc_absolute_dword)
+ MCE.emitDWordLE(0);
+ else
+ MCE.emitWordLE(0);
+}
+
+/// emitConstPoolAddress - Arrange for the address of an constant pool
+/// to be emitted to the current location in the function, and allow it to be PC
+/// relative.
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitConstPoolAddress(unsigned CPI, unsigned Reloc,
+ intptr_t Disp /* = 0 */,
+ intptr_t PCAdj /* = 0 */) {
+ intptr_t RelocCST = 0;
+ if (Reloc == X86::reloc_picrel_word)
+ RelocCST = PICBaseOffset;
+ else if (Reloc == X86::reloc_pcrel_word)
+ RelocCST = PCAdj;
+ MCE.addRelocation(MachineRelocation::getConstPool(MCE.getCurrentPCOffset(),
+ Reloc, CPI, RelocCST));
+ // The relocated value will be added to the displacement
+ if (Reloc == X86::reloc_absolute_dword)
+ MCE.emitDWordLE(Disp);
+ else
+ MCE.emitWordLE((int32_t)Disp);
+}
+
+/// emitJumpTableAddress - Arrange for the address of a jump table to
+/// be emitted to the current location in the function, and allow it to be PC
+/// relative.
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitJumpTableAddress(unsigned JTI, unsigned Reloc,
+ intptr_t PCAdj /* = 0 */) {
+ intptr_t RelocCST = 0;
+ if (Reloc == X86::reloc_picrel_word)
+ RelocCST = PICBaseOffset;
+ else if (Reloc == X86::reloc_pcrel_word)
+ RelocCST = PCAdj;
+ MCE.addRelocation(MachineRelocation::getJumpTable(MCE.getCurrentPCOffset(),
+ Reloc, JTI, RelocCST));
+ // The relocated value will be added to the displacement
+ if (Reloc == X86::reloc_absolute_dword)
+ MCE.emitDWordLE(0);
+ else
+ MCE.emitWordLE(0);
+}
+
+inline static unsigned char ModRMByte(unsigned Mod, unsigned RegOpcode,
+ unsigned RM) {
+ assert(Mod < 4 && RegOpcode < 8 && RM < 8 && "ModRM Fields out of range!");
+ return RM | (RegOpcode << 3) | (Mod << 6);
+}
+
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitRegModRMByte(unsigned ModRMReg,
+ unsigned RegOpcodeFld){
+ MCE.emitByte(ModRMByte(3, RegOpcodeFld, getX86RegNum(ModRMReg)));
+}
+
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitRegModRMByte(unsigned RegOpcodeFld) {
+ MCE.emitByte(ModRMByte(3, RegOpcodeFld, 0));
+}
+
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitSIBByte(unsigned SS,
+ unsigned Index,
+ unsigned Base) {
+ // SIB byte is in the same format as the ModRMByte...
+ MCE.emitByte(ModRMByte(SS, Index, Base));
+}
+
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitConstant(uint64_t Val, unsigned Size) {
+ // Output the constant in little endian byte order...
+ for (unsigned i = 0; i != Size; ++i) {
+ MCE.emitByte(Val & 255);
+ Val >>= 8;
+ }
+}
+
+/// isDisp8 - Return true if this signed displacement fits in a 8-bit
+/// sign-extended field.
+static bool isDisp8(int Value) {
+ return Value == (signed char)Value;
+}
+
+static bool gvNeedsNonLazyPtr(const MachineOperand &GVOp,
+ const TargetMachine &TM) {
+ // For Darwin-64, simulate the linktime GOT by using the same non-lazy-pointer
+ // mechanism as 32-bit mode.
+ if (TM.getSubtarget<X86Subtarget>().is64Bit() &&
+ !TM.getSubtarget<X86Subtarget>().isTargetDarwin())
+ return false;
+
+ // Return true if this is a reference to a stub containing the address of the
+ // global, not the global itself.
+ return isGlobalStubReference(GVOp.getTargetFlags());
+}
+
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitDisplacementField(const MachineOperand *RelocOp,
+ int DispVal,
+ intptr_t Adj /* = 0 */,
+ bool IsPCRel /* = true */) {
+ // If this is a simple integer displacement that doesn't require a relocation,
+ // emit it now.
+ if (!RelocOp) {
+ emitConstant(DispVal, 4);
+ return;
+ }
+
+ // Otherwise, this is something that requires a relocation. Emit it as such
+ // now.
+ unsigned RelocType = Is64BitMode ?
+ (IsPCRel ? X86::reloc_pcrel_word : X86::reloc_absolute_word_sext)
+ : (IsPIC ? X86::reloc_picrel_word : X86::reloc_absolute_word);
+ if (RelocOp->isGlobal()) {
+ // In 64-bit static small code model, we could potentially emit absolute.
+ // But it's probably not beneficial. If the MCE supports using RIP directly
+ // do it, otherwise fallback to absolute (this is determined by IsPCRel).
+ // 89 05 00 00 00 00 mov %eax,0(%rip) # PC-relative
+ // 89 04 25 00 00 00 00 mov %eax,0x0 # Absolute
+ bool Indirect = gvNeedsNonLazyPtr(*RelocOp, TM);
+ emitGlobalAddress(RelocOp->getGlobal(), RelocType, RelocOp->getOffset(),
+ Adj, Indirect);
+ } else if (RelocOp->isSymbol()) {
+ emitExternalSymbolAddress(RelocOp->getSymbolName(), RelocType);
+ } else if (RelocOp->isCPI()) {
+ emitConstPoolAddress(RelocOp->getIndex(), RelocType,
+ RelocOp->getOffset(), Adj);
+ } else {
+ assert(RelocOp->isJTI() && "Unexpected machine operand!");
+ emitJumpTableAddress(RelocOp->getIndex(), RelocType, Adj);
+ }
+}
+
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitMemModRMByte(const MachineInstr &MI,
+ unsigned Op,unsigned RegOpcodeField,
+ intptr_t PCAdj) {
+ const MachineOperand &Op3 = MI.getOperand(Op+3);
+ int DispVal = 0;
+ const MachineOperand *DispForReloc = 0;
+
+ // Figure out what sort of displacement we have to handle here.
+ if (Op3.isGlobal()) {
+ DispForReloc = &Op3;
+ } else if (Op3.isSymbol()) {
+ DispForReloc = &Op3;
+ } else if (Op3.isCPI()) {
+ if (!MCE.earlyResolveAddresses() || Is64BitMode || IsPIC) {
+ DispForReloc = &Op3;
+ } else {
+ DispVal += MCE.getConstantPoolEntryAddress(Op3.getIndex());
+ DispVal += Op3.getOffset();
+ }
+ } else if (Op3.isJTI()) {
+ if (!MCE.earlyResolveAddresses() || Is64BitMode || IsPIC) {
+ DispForReloc = &Op3;
+ } else {
+ DispVal += MCE.getJumpTableEntryAddress(Op3.getIndex());
+ }
+ } else {
+ DispVal = Op3.getImm();
+ }
+
+ const MachineOperand &Base = MI.getOperand(Op);
+ const MachineOperand &Scale = MI.getOperand(Op+1);
+ const MachineOperand &IndexReg = MI.getOperand(Op+2);
+
+ unsigned BaseReg = Base.getReg();
+
+ // Handle %rip relative addressing.
+ if (BaseReg == X86::RIP ||
+ (Is64BitMode && DispForReloc)) { // [disp32+RIP] in X86-64 mode
+ assert(IndexReg.getReg() == 0 && Is64BitMode &&
+ "Invalid rip-relative address");
+ MCE.emitByte(ModRMByte(0, RegOpcodeField, 5));
+ emitDisplacementField(DispForReloc, DispVal, PCAdj, true);
+ return;
+ }
+
+ // Indicate that the displacement will use an pcrel or absolute reference
+ // by default. MCEs able to resolve addresses on-the-fly use pcrel by default
+ // while others, unless explicit asked to use RIP, use absolute references.
+ bool IsPCRel = MCE.earlyResolveAddresses() ? true : false;
+
+ // Is a SIB byte needed?
+ // If no BaseReg, issue a RIP relative instruction only if the MCE can
+ // resolve addresses on-the-fly, otherwise use SIB (Intel Manual 2A, table
+ // 2-7) and absolute references.
+ unsigned BaseRegNo = -1U;
+ if (BaseReg != 0 && BaseReg != X86::RIP)
+ BaseRegNo = getX86RegNum(BaseReg);
+
+ if (// The SIB byte must be used if there is an index register.
+ IndexReg.getReg() == 0 &&
+ // The SIB byte must be used if the base is ESP/RSP/R12, all of which
+ // encode to an R/M value of 4, which indicates that a SIB byte is
+ // present.
+ BaseRegNo != N86::ESP &&
+ // If there is no base register and we're in 64-bit mode, we need a SIB
+ // byte to emit an addr that is just 'disp32' (the non-RIP relative form).
+ (!Is64BitMode || BaseReg != 0)) {
+ if (BaseReg == 0 || // [disp32] in X86-32 mode
+ BaseReg == X86::RIP) { // [disp32+RIP] in X86-64 mode
+ MCE.emitByte(ModRMByte(0, RegOpcodeField, 5));
+ emitDisplacementField(DispForReloc, DispVal, PCAdj, true);
+ return;
+ }
+
+ // If the base is not EBP/ESP and there is no displacement, use simple
+ // indirect register encoding, this handles addresses like [EAX]. The
+ // encoding for [EBP] with no displacement means [disp32] so we handle it
+ // by emitting a displacement of 0 below.
+ if (!DispForReloc && DispVal == 0 && BaseRegNo != N86::EBP) {
+ MCE.emitByte(ModRMByte(0, RegOpcodeField, BaseRegNo));
+ return;
+ }
+
+ // Otherwise, if the displacement fits in a byte, encode as [REG+disp8].
+ if (!DispForReloc && isDisp8(DispVal)) {
+ MCE.emitByte(ModRMByte(1, RegOpcodeField, BaseRegNo));
+ emitConstant(DispVal, 1);
+ return;
+ }
+
+ // Otherwise, emit the most general non-SIB encoding: [REG+disp32]
+ MCE.emitByte(ModRMByte(2, RegOpcodeField, BaseRegNo));
+ emitDisplacementField(DispForReloc, DispVal, PCAdj, IsPCRel);
+ return;
+ }
+
+ // Otherwise we need a SIB byte, so start by outputting the ModR/M byte first.
+ assert(IndexReg.getReg() != X86::ESP &&
+ IndexReg.getReg() != X86::RSP && "Cannot use ESP as index reg!");
+
+ bool ForceDisp32 = false;
+ bool ForceDisp8 = false;
+ if (BaseReg == 0) {
+ // If there is no base register, we emit the special case SIB byte with
+ // MOD=0, BASE=4, to JUST get the index, scale, and displacement.
+ MCE.emitByte(ModRMByte(0, RegOpcodeField, 4));
+ ForceDisp32 = true;
+ } else if (DispForReloc) {
+ // Emit the normal disp32 encoding.
+ MCE.emitByte(ModRMByte(2, RegOpcodeField, 4));
+ ForceDisp32 = true;
+ } else if (DispVal == 0 && BaseRegNo != N86::EBP) {
+ // Emit no displacement ModR/M byte
+ MCE.emitByte(ModRMByte(0, RegOpcodeField, 4));
+ } else if (isDisp8(DispVal)) {
+ // Emit the disp8 encoding...
+ MCE.emitByte(ModRMByte(1, RegOpcodeField, 4));
+ ForceDisp8 = true; // Make sure to force 8 bit disp if Base=EBP
+ } else {
+ // Emit the normal disp32 encoding...
+ MCE.emitByte(ModRMByte(2, RegOpcodeField, 4));
+ }
+
+ // Calculate what the SS field value should be...
+ static const unsigned SSTable[] = { ~0U, 0, 1, ~0U, 2, ~0U, ~0U, ~0U, 3 };
+ unsigned SS = SSTable[Scale.getImm()];
+
+ if (BaseReg == 0) {
+ // Handle the SIB byte for the case where there is no base, see Intel
+ // Manual 2A, table 2-7. The displacement has already been output.
+ unsigned IndexRegNo;
+ if (IndexReg.getReg())
+ IndexRegNo = getX86RegNum(IndexReg.getReg());
+ else // Examples: [ESP+1*<noreg>+4] or [scaled idx]+disp32 (MOD=0,BASE=5)
+ IndexRegNo = 4;
+ emitSIBByte(SS, IndexRegNo, 5);
+ } else {
+ unsigned BaseRegNo = getX86RegNum(BaseReg);
+ unsigned IndexRegNo;
+ if (IndexReg.getReg())
+ IndexRegNo = getX86RegNum(IndexReg.getReg());
+ else
+ IndexRegNo = 4; // For example [ESP+1*<noreg>+4]
+ emitSIBByte(SS, IndexRegNo, BaseRegNo);
+ }
+
+ // Do we need to output a displacement?
+ if (ForceDisp8) {
+ emitConstant(DispVal, 1);
+ } else if (DispVal != 0 || ForceDisp32) {
+ emitDisplacementField(DispForReloc, DispVal, PCAdj, IsPCRel);
+ }
+}
+
+static const MCInstrDesc *UpdateOp(MachineInstr &MI, const X86InstrInfo *II,
+ unsigned Opcode) {
+ const MCInstrDesc *Desc = &II->get(Opcode);
+ MI.setDesc(*Desc);
+ return Desc;
+}
+
+/// Is16BitMemOperand - Return true if the specified instruction has
+/// a 16-bit memory operand. Op specifies the operand # of the memoperand.
+static bool Is16BitMemOperand(const MachineInstr &MI, unsigned Op) {
+ const MachineOperand &BaseReg = MI.getOperand(Op+X86::AddrBaseReg);
+ const MachineOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg);
+
+ if ((BaseReg.getReg() != 0 &&
+ X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg.getReg())) ||
+ (IndexReg.getReg() != 0 &&
+ X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg.getReg())))
+ return true;
+ return false;
+}
+
+/// Is32BitMemOperand - Return true if the specified instruction has
+/// a 32-bit memory operand. Op specifies the operand # of the memoperand.
+static bool Is32BitMemOperand(const MachineInstr &MI, unsigned Op) {
+ const MachineOperand &BaseReg = MI.getOperand(Op+X86::AddrBaseReg);
+ const MachineOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg);
+
+ if ((BaseReg.getReg() != 0 &&
+ X86MCRegisterClasses[X86::GR32RegClassID].contains(BaseReg.getReg())) ||
+ (IndexReg.getReg() != 0 &&
+ X86MCRegisterClasses[X86::GR32RegClassID].contains(IndexReg.getReg())))
+ return true;
+ return false;
+}
+
+/// Is64BitMemOperand - Return true if the specified instruction has
+/// a 64-bit memory operand. Op specifies the operand # of the memoperand.
+#ifndef NDEBUG
+static bool Is64BitMemOperand(const MachineInstr &MI, unsigned Op) {
+ const MachineOperand &BaseReg = MI.getOperand(Op+X86::AddrBaseReg);
+ const MachineOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg);
+
+ if ((BaseReg.getReg() != 0 &&
+ X86MCRegisterClasses[X86::GR64RegClassID].contains(BaseReg.getReg())) ||
+ (IndexReg.getReg() != 0 &&
+ X86MCRegisterClasses[X86::GR64RegClassID].contains(IndexReg.getReg())))
+ return true;
+ return false;
+}
+#endif
+
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitOpcodePrefix(uint64_t TSFlags,
+ int MemOperand,
+ const MachineInstr &MI,
+ const MCInstrDesc *Desc) const {
+ // Emit the lock opcode prefix as needed.
+ if (Desc->TSFlags & X86II::LOCK)
+ MCE.emitByte(0xF0);
+
+ // Emit segment override opcode prefix as needed.
+ emitSegmentOverridePrefix(TSFlags, MemOperand, MI);
+
+ // Emit the repeat opcode prefix as needed.
+ if ((Desc->TSFlags & X86II::Op0Mask) == X86II::REP)
+ MCE.emitByte(0xF3);
+
+ // Emit the address size opcode prefix as needed.
+ bool need_address_override;
+ if (TSFlags & X86II::AdSize) {
+ need_address_override = true;
+ } else if (MemOperand == -1) {
+ need_address_override = false;
+ } else if (Is64BitMode) {
+ assert(!Is16BitMemOperand(MI, MemOperand));
+ need_address_override = Is32BitMemOperand(MI, MemOperand);
+ } else {
+ assert(!Is64BitMemOperand(MI, MemOperand));
+ need_address_override = Is16BitMemOperand(MI, MemOperand);
+ }
+
+ if (need_address_override)
+ MCE.emitByte(0x67);
+
+ // Emit the operand size opcode prefix as needed.
+ if (TSFlags & X86II::OpSize)
+ MCE.emitByte(0x66);
+
+ bool Need0FPrefix = false;
+ switch (Desc->TSFlags & X86II::Op0Mask) {
+ case X86II::TB: // Two-byte opcode prefix
+ case X86II::T8: // 0F 38
+ case X86II::TA: // 0F 3A
+ case X86II::A6: // 0F A6
+ case X86II::A7: // 0F A7
+ Need0FPrefix = true;
+ break;
+ case X86II::REP: break; // already handled.
+ case X86II::T8XS: // F3 0F 38
+ case X86II::XS: // F3 0F
+ MCE.emitByte(0xF3);
+ Need0FPrefix = true;
+ break;
+ case X86II::T8XD: // F2 0F 38
+ case X86II::TAXD: // F2 0F 3A
+ case X86II::XD: // F2 0F
+ MCE.emitByte(0xF2);
+ Need0FPrefix = true;
+ break;
+ case X86II::D8: case X86II::D9: case X86II::DA: case X86II::DB:
+ case X86II::DC: case X86II::DD: case X86II::DE: case X86II::DF:
+ MCE.emitByte(0xD8+
+ (((Desc->TSFlags & X86II::Op0Mask)-X86II::D8)
+ >> X86II::Op0Shift));
+ break; // Two-byte opcode prefix
+ default: llvm_unreachable("Invalid prefix!");
+ case 0: break; // No prefix!
+ }
+
+ // Handle REX prefix.
+ if (Is64BitMode) {
+ if (unsigned REX = determineREX(MI))
+ MCE.emitByte(0x40 | REX);
+ }
+
+ // 0x0F escape code must be emitted just before the opcode.
+ if (Need0FPrefix)
+ MCE.emitByte(0x0F);
+
+ switch (Desc->TSFlags & X86II::Op0Mask) {
+ case X86II::T8XD: // F2 0F 38
+ case X86II::T8XS: // F3 0F 38
+ case X86II::T8: // 0F 38
+ MCE.emitByte(0x38);
+ break;
+ case X86II::TAXD: // F2 0F 38
+ case X86II::TA: // 0F 3A
+ MCE.emitByte(0x3A);
+ break;
+ case X86II::A6: // 0F A6
+ MCE.emitByte(0xA6);
+ break;
+ case X86II::A7: // 0F A7
+ MCE.emitByte(0xA7);
+ break;
+ }
+}
+
+// On regular x86, both XMM0-XMM7 and XMM8-XMM15 are encoded in the range
+// 0-7 and the difference between the 2 groups is given by the REX prefix.
+// In the VEX prefix, registers are seen sequencially from 0-15 and encoded
+// in 1's complement form, example:
+//
+// ModRM field => XMM9 => 1
+// VEX.VVVV => XMM9 => ~9
+//
+// See table 4-35 of Intel AVX Programming Reference for details.
+template<class CodeEmitter>
+unsigned char
+Emitter<CodeEmitter>::getVEXRegisterEncoding(const MachineInstr &MI,
+ unsigned OpNum) const {
+ unsigned SrcReg = MI.getOperand(OpNum).getReg();
+ unsigned SrcRegNum = getX86RegNum(MI.getOperand(OpNum).getReg());
+ if (X86II::isX86_64ExtendedReg(SrcReg))
+ SrcRegNum |= 8;
+
+ // The registers represented through VEX_VVVV should
+ // be encoded in 1's complement form.
+ return (~SrcRegNum) & 0xf;
+}
+
+/// EmitSegmentOverridePrefix - Emit segment override opcode prefix as needed
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitSegmentOverridePrefix(uint64_t TSFlags,
+ int MemOperand,
+ const MachineInstr &MI) const {
+ switch (TSFlags & X86II::SegOvrMask) {
+ default: llvm_unreachable("Invalid segment!");
+ case 0:
+ // No segment override, check for explicit one on memory operand.
+ if (MemOperand != -1) { // If the instruction has a memory operand.
+ switch (MI.getOperand(MemOperand+X86::AddrSegmentReg).getReg()) {
+ default: llvm_unreachable("Unknown segment register!");
+ case 0: break;
+ case X86::CS: MCE.emitByte(0x2E); break;
+ case X86::SS: MCE.emitByte(0x36); break;
+ case X86::DS: MCE.emitByte(0x3E); break;
+ case X86::ES: MCE.emitByte(0x26); break;
+ case X86::FS: MCE.emitByte(0x64); break;
+ case X86::GS: MCE.emitByte(0x65); break;
+ }
+ }
+ break;
+ case X86II::FS:
+ MCE.emitByte(0x64);
+ break;
+ case X86II::GS:
+ MCE.emitByte(0x65);
+ break;
+ }
+}
+
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitVEXOpcodePrefix(uint64_t TSFlags,
+ int MemOperand,
+ const MachineInstr &MI,
+ const MCInstrDesc *Desc) const {
+ bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V;
+ bool HasVEX_4VOp3 = (TSFlags >> X86II::VEXShift) & X86II::VEX_4VOp3;
+ bool HasMemOp4 = (TSFlags >> X86II::VEXShift) & X86II::MemOp4;
+
+ // VEX_R: opcode externsion equivalent to REX.R in
+ // 1's complement (inverted) form
+ //
+ // 1: Same as REX_R=0 (must be 1 in 32-bit mode)
+ // 0: Same as REX_R=1 (64 bit mode only)
+ //
+ unsigned char VEX_R = 0x1;
+
+ // VEX_X: equivalent to REX.X, only used when a
+ // register is used for index in SIB Byte.
+ //
+ // 1: Same as REX.X=0 (must be 1 in 32-bit mode)
+ // 0: Same as REX.X=1 (64-bit mode only)
+ unsigned char VEX_X = 0x1;
+
+ // VEX_B:
+ //
+ // 1: Same as REX_B=0 (ignored in 32-bit mode)
+ // 0: Same as REX_B=1 (64 bit mode only)
+ //
+ unsigned char VEX_B = 0x1;
+
+ // VEX_W: opcode specific (use like REX.W, or used for
+ // opcode extension, or ignored, depending on the opcode byte)
+ unsigned char VEX_W = 0;
+
+ // XOP: Use XOP prefix byte 0x8f instead of VEX.
+ unsigned char XOP = 0;
+
+ // VEX_5M (VEX m-mmmmm field):
+ //
+ // 0b00000: Reserved for future use
+ // 0b00001: implied 0F leading opcode
+ // 0b00010: implied 0F 38 leading opcode bytes
+ // 0b00011: implied 0F 3A leading opcode bytes
+ // 0b00100-0b11111: Reserved for future use
+ // 0b01000: XOP map select - 08h instructions with imm byte
+ // 0b10001: XOP map select - 09h instructions with no imm byte
+ unsigned char VEX_5M = 0x1;
+
+ // VEX_4V (VEX vvvv field): a register specifier
+ // (in 1's complement form) or 1111 if unused.
+ unsigned char VEX_4V = 0xf;
+
+ // VEX_L (Vector Length):
+ //
+ // 0: scalar or 128-bit vector
+ // 1: 256-bit vector
+ //
+ unsigned char VEX_L = 0;
+
+ // VEX_PP: opcode extension providing equivalent
+ // functionality of a SIMD prefix
+ //
+ // 0b00: None
+ // 0b01: 66
+ // 0b10: F3
+ // 0b11: F2
+ //
+ unsigned char VEX_PP = 0;
+
+ // Encode the operand size opcode prefix as needed.
+ if (TSFlags & X86II::OpSize)
+ VEX_PP = 0x01;
+
+ if ((TSFlags >> X86II::VEXShift) & X86II::VEX_W)
+ VEX_W = 1;
+
+ if ((TSFlags >> X86II::VEXShift) & X86II::XOP)
+ XOP = 1;
+
+ if ((TSFlags >> X86II::VEXShift) & X86II::VEX_L)
+ VEX_L = 1;
+
+ switch (TSFlags & X86II::Op0Mask) {
+ default: llvm_unreachable("Invalid prefix!");
+ case X86II::T8: // 0F 38
+ VEX_5M = 0x2;
+ break;
+ case X86II::TA: // 0F 3A
+ VEX_5M = 0x3;
+ break;
+ case X86II::T8XS: // F3 0F 38
+ VEX_PP = 0x2;
+ VEX_5M = 0x2;
+ break;
+ case X86II::T8XD: // F2 0F 38
+ VEX_PP = 0x3;
+ VEX_5M = 0x2;
+ break;
+ case X86II::TAXD: // F2 0F 3A
+ VEX_PP = 0x3;
+ VEX_5M = 0x3;
+ break;
+ case X86II::XS: // F3 0F
+ VEX_PP = 0x2;
+ break;
+ case X86II::XD: // F2 0F
+ VEX_PP = 0x3;
+ break;
+ case X86II::XOP8:
+ VEX_5M = 0x8;
+ break;
+ case X86II::XOP9:
+ VEX_5M = 0x9;
+ break;
+ case X86II::A6: // Bypass: Not used by VEX
+ case X86II::A7: // Bypass: Not used by VEX
+ case X86II::TB: // Bypass: Not used by VEX
+ case 0:
+ break; // No prefix!
+ }
+
+
+ // Classify VEX_B, VEX_4V, VEX_R, VEX_X
+ unsigned NumOps = Desc->getNumOperands();
+ unsigned CurOp = 0;
+ if (NumOps > 1 && Desc->getOperandConstraint(1, MCOI::TIED_TO) == 0)
+ ++CurOp;
+ else if (NumOps > 3 && Desc->getOperandConstraint(2, MCOI::TIED_TO) == 0) {
+ assert(Desc->getOperandConstraint(NumOps - 1, MCOI::TIED_TO) == 1);
+ // Special case for GATHER with 2 TIED_TO operands
+ // Skip the first 2 operands: dst, mask_wb
+ CurOp += 2;
+ }
+
+ switch (TSFlags & X86II::FormMask) {
+ case X86II::MRMInitReg:
+ // Duplicate register.
+ if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
+ VEX_R = 0x0;
+
+ if (HasVEX_4V)
+ VEX_4V = getVEXRegisterEncoding(MI, CurOp);
+ if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
+ VEX_B = 0x0;
+ if (HasVEX_4VOp3)
+ VEX_4V = getVEXRegisterEncoding(MI, CurOp);
+ break;
+ case X86II::MRMDestMem: {
+ // MRMDestMem instructions forms:
+ // MemAddr, src1(ModR/M)
+ // MemAddr, src1(VEX_4V), src2(ModR/M)
+ // MemAddr, src1(ModR/M), imm8
+ //
+ if (X86II::isX86_64ExtendedReg(MI.getOperand(X86::AddrBaseReg).getReg()))
+ VEX_B = 0x0;
+ if (X86II::isX86_64ExtendedReg(MI.getOperand(X86::AddrIndexReg).getReg()))
+ VEX_X = 0x0;
+
+ CurOp = X86::AddrNumOperands;
+ if (HasVEX_4V)
+ VEX_4V = getVEXRegisterEncoding(MI, CurOp++);
+
+ const MachineOperand &MO = MI.getOperand(CurOp);
+ if (MO.isReg() && X86II::isX86_64ExtendedReg(MO.getReg()))
+ VEX_R = 0x0;
+ break;
+ }
+ case X86II::MRMSrcMem:
+ // MRMSrcMem instructions forms:
+ // src1(ModR/M), MemAddr
+ // src1(ModR/M), src2(VEX_4V), MemAddr
+ // src1(ModR/M), MemAddr, imm8
+ // src1(ModR/M), MemAddr, src2(VEX_I8IMM)
+ //
+ // FMA4:
+ // dst(ModR/M.reg), src1(VEX_4V), src2(ModR/M), src3(VEX_I8IMM)
+ // dst(ModR/M.reg), src1(VEX_4V), src2(VEX_I8IMM), src3(ModR/M),
+ if (X86II::isX86_64ExtendedReg(MI.getOperand(0).getReg()))
+ VEX_R = 0x0;
+
+ if (HasVEX_4V)
+ VEX_4V = getVEXRegisterEncoding(MI, 1);
+
+ if (X86II::isX86_64ExtendedReg(
+ MI.getOperand(MemOperand+X86::AddrBaseReg).getReg()))
+ VEX_B = 0x0;
+ if (X86II::isX86_64ExtendedReg(
+ MI.getOperand(MemOperand+X86::AddrIndexReg).getReg()))
+ VEX_X = 0x0;
+
+ if (HasVEX_4VOp3)
+ VEX_4V = getVEXRegisterEncoding(MI, X86::AddrNumOperands+1);
+ break;
+ case X86II::MRM0m: case X86II::MRM1m:
+ case X86II::MRM2m: case X86II::MRM3m:
+ case X86II::MRM4m: case X86II::MRM5m:
+ case X86II::MRM6m: case X86II::MRM7m: {
+ // MRM[0-9]m instructions forms:
+ // MemAddr
+ // src1(VEX_4V), MemAddr
+ if (HasVEX_4V)
+ VEX_4V = getVEXRegisterEncoding(MI, 0);
+
+ if (X86II::isX86_64ExtendedReg(
+ MI.getOperand(MemOperand+X86::AddrBaseReg).getReg()))
+ VEX_B = 0x0;
+ if (X86II::isX86_64ExtendedReg(
+ MI.getOperand(MemOperand+X86::AddrIndexReg).getReg()))
+ VEX_X = 0x0;
+ break;
+ }
+ case X86II::MRMSrcReg:
+ // MRMSrcReg instructions forms:
+ // dst(ModR/M), src1(VEX_4V), src2(ModR/M), src3(VEX_I8IMM)
+ // dst(ModR/M), src1(ModR/M)
+ // dst(ModR/M), src1(ModR/M), imm8
+ //
+ if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
+ VEX_R = 0x0;
+ CurOp++;
+
+ if (HasVEX_4V)
+ VEX_4V = getVEXRegisterEncoding(MI, CurOp++);
+
+ if (HasMemOp4) // Skip second register source (encoded in I8IMM)
+ CurOp++;
+
+ if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
+ VEX_B = 0x0;
+ CurOp++;
+ if (HasVEX_4VOp3)
+ VEX_4V = getVEXRegisterEncoding(MI, CurOp);
+ break;
+ case X86II::MRMDestReg:
+ // MRMDestReg instructions forms:
+ // dst(ModR/M), src(ModR/M)
+ // dst(ModR/M), src(ModR/M), imm8
+ // dst(ModR/M), src1(VEX_4V), src2(ModR/M)
+ if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
+ VEX_B = 0x0;
+ CurOp++;
+
+ if (HasVEX_4V)
+ VEX_4V = getVEXRegisterEncoding(MI, CurOp++);
+
+ if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
+ VEX_R = 0x0;
+ break;
+ case X86II::MRM0r: case X86II::MRM1r:
+ case X86II::MRM2r: case X86II::MRM3r:
+ case X86II::MRM4r: case X86II::MRM5r:
+ case X86II::MRM6r: case X86II::MRM7r:
+ // MRM0r-MRM7r instructions forms:
+ // dst(VEX_4V), src(ModR/M), imm8
+ VEX_4V = getVEXRegisterEncoding(MI, 0);
+ if (X86II::isX86_64ExtendedReg(MI.getOperand(1).getReg()))
+ VEX_B = 0x0;
+ break;
+ default: // RawFrm
+ break;
+ }
+
+ // Emit segment override opcode prefix as needed.
+ emitSegmentOverridePrefix(TSFlags, MemOperand, MI);
+
+ // VEX opcode prefix can have 2 or 3 bytes
+ //
+ // 3 bytes:
+ // +-----+ +--------------+ +-------------------+
+ // | C4h | | RXB | m-mmmm | | W | vvvv | L | pp |
+ // +-----+ +--------------+ +-------------------+
+ // 2 bytes:
+ // +-----+ +-------------------+
+ // | C5h | | R | vvvv | L | pp |
+ // +-----+ +-------------------+
+ //
+ unsigned char LastByte = VEX_PP | (VEX_L << 2) | (VEX_4V << 3);
+
+ if (VEX_B && VEX_X && !VEX_W && !XOP && (VEX_5M == 1)) { // 2 byte VEX prefix
+ MCE.emitByte(0xC5);
+ MCE.emitByte(LastByte | (VEX_R << 7));
+ return;
+ }
+
+ // 3 byte VEX prefix
+ MCE.emitByte(XOP ? 0x8F : 0xC4);
+ MCE.emitByte(VEX_R << 7 | VEX_X << 6 | VEX_B << 5 | VEX_5M);
+ MCE.emitByte(LastByte | (VEX_W << 7));
+}
+
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI,
+ const MCInstrDesc *Desc) {
+ DEBUG(dbgs() << MI);
+
+ // If this is a pseudo instruction, lower it.
+ switch (Desc->getOpcode()) {
+ case X86::ADD16rr_DB: Desc = UpdateOp(MI, II, X86::OR16rr); break;
+ case X86::ADD32rr_DB: Desc = UpdateOp(MI, II, X86::OR32rr); break;
+ case X86::ADD64rr_DB: Desc = UpdateOp(MI, II, X86::OR64rr); break;
+ case X86::ADD16ri_DB: Desc = UpdateOp(MI, II, X86::OR16ri); break;
+ case X86::ADD32ri_DB: Desc = UpdateOp(MI, II, X86::OR32ri); break;
+ case X86::ADD64ri32_DB: Desc = UpdateOp(MI, II, X86::OR64ri32); break;
+ case X86::ADD16ri8_DB: Desc = UpdateOp(MI, II, X86::OR16ri8); break;
+ case X86::ADD32ri8_DB: Desc = UpdateOp(MI, II, X86::OR32ri8); break;
+ case X86::ADD64ri8_DB: Desc = UpdateOp(MI, II, X86::OR64ri8); break;
+ case X86::ACQUIRE_MOV8rm: Desc = UpdateOp(MI, II, X86::MOV8rm); break;
+ case X86::ACQUIRE_MOV16rm: Desc = UpdateOp(MI, II, X86::MOV16rm); break;
+ case X86::ACQUIRE_MOV32rm: Desc = UpdateOp(MI, II, X86::MOV32rm); break;
+ case X86::ACQUIRE_MOV64rm: Desc = UpdateOp(MI, II, X86::MOV64rm); break;
+ case X86::RELEASE_MOV8mr: Desc = UpdateOp(MI, II, X86::MOV8mr); break;
+ case X86::RELEASE_MOV16mr: Desc = UpdateOp(MI, II, X86::MOV16mr); break;
+ case X86::RELEASE_MOV32mr: Desc = UpdateOp(MI, II, X86::MOV32mr); break;
+ case X86::RELEASE_MOV64mr: Desc = UpdateOp(MI, II, X86::MOV64mr); break;
+ }
+
+
+ MCE.processDebugLoc(MI.getDebugLoc(), true);
+
+ unsigned Opcode = Desc->Opcode;
+
+ // If this is a two-address instruction, skip one of the register operands.
+ unsigned NumOps = Desc->getNumOperands();
+ unsigned CurOp = 0;
+ if (NumOps > 1 && Desc->getOperandConstraint(1, MCOI::TIED_TO) == 0)
+ ++CurOp;
+ else if (NumOps > 3 && Desc->getOperandConstraint(2, MCOI::TIED_TO) == 0) {
+ assert(Desc->getOperandConstraint(NumOps - 1, MCOI::TIED_TO) == 1);
+ // Special case for GATHER with 2 TIED_TO operands
+ // Skip the first 2 operands: dst, mask_wb
+ CurOp += 2;
+ }
+
+ uint64_t TSFlags = Desc->TSFlags;
+
+ // Is this instruction encoded using the AVX VEX prefix?
+ bool HasVEXPrefix = (TSFlags >> X86II::VEXShift) & X86II::VEX;
+ // It uses the VEX.VVVV field?
+ bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V;
+ bool HasVEX_4VOp3 = (TSFlags >> X86II::VEXShift) & X86II::VEX_4VOp3;
+ bool HasMemOp4 = (TSFlags >> X86II::VEXShift) & X86II::MemOp4;
+ const unsigned MemOp4_I8IMMOperand = 2;
+
+ // Determine where the memory operand starts, if present.
+ int MemoryOperand = X86II::getMemoryOperandNo(TSFlags, Opcode);
+ if (MemoryOperand != -1) MemoryOperand += CurOp;
+
+ if (!HasVEXPrefix)
+ emitOpcodePrefix(TSFlags, MemoryOperand, MI, Desc);
+ else
+ emitVEXOpcodePrefix(TSFlags, MemoryOperand, MI, Desc);
+
+ unsigned char BaseOpcode = X86II::getBaseOpcodeFor(Desc->TSFlags);
+ switch (TSFlags & X86II::FormMask) {
+ default:
+ llvm_unreachable("Unknown FormMask value in X86 MachineCodeEmitter!");
+ case X86II::Pseudo:
+ // Remember the current PC offset, this is the PIC relocation
+ // base address.
+ switch (Opcode) {
+ default:
+ llvm_unreachable("pseudo instructions should be removed before code"
+ " emission");
+ // Do nothing for Int_MemBarrier - it's just a comment. Add a debug
+ // to make it slightly easier to see.
+ case X86::Int_MemBarrier:
+ DEBUG(dbgs() << "#MEMBARRIER\n");
+ break;
+
+ case TargetOpcode::INLINEASM:
+ // We allow inline assembler nodes with empty bodies - they can
+ // implicitly define registers, which is ok for JIT.
+ if (MI.getOperand(0).getSymbolName()[0])
+ report_fatal_error("JIT does not support inline asm!");
+ break;
+ case TargetOpcode::PROLOG_LABEL:
+ case TargetOpcode::GC_LABEL:
+ case TargetOpcode::EH_LABEL:
+ MCE.emitLabel(MI.getOperand(0).getMCSymbol());
+ break;
+
+ case TargetOpcode::IMPLICIT_DEF:
+ case TargetOpcode::KILL:
+ break;
+ case X86::MOVPC32r: {
+ // This emits the "call" portion of this pseudo instruction.
+ MCE.emitByte(BaseOpcode);
+ emitConstant(0, X86II::getSizeOfImm(Desc->TSFlags));
+ // Remember PIC base.
+ PICBaseOffset = (intptr_t) MCE.getCurrentPCOffset();
+ X86JITInfo *JTI = TM.getJITInfo();
+ JTI->setPICBase(MCE.getCurrentPCValue());
+ break;
+ }
+ }
+ CurOp = NumOps;
+ break;
+ case X86II::RawFrm: {
+ MCE.emitByte(BaseOpcode);
+
+ if (CurOp == NumOps)
+ break;
+
+ const MachineOperand &MO = MI.getOperand(CurOp++);
+
+ DEBUG(dbgs() << "RawFrm CurOp " << CurOp << "\n");
+ DEBUG(dbgs() << "isMBB " << MO.isMBB() << "\n");
+ DEBUG(dbgs() << "isGlobal " << MO.isGlobal() << "\n");
+ DEBUG(dbgs() << "isSymbol " << MO.isSymbol() << "\n");
+ DEBUG(dbgs() << "isImm " << MO.isImm() << "\n");
+
+ if (MO.isMBB()) {
+ emitPCRelativeBlockAddress(MO.getMBB());
+ break;
+ }
+
+ if (MO.isGlobal()) {
+ emitGlobalAddress(MO.getGlobal(), X86::reloc_pcrel_word,
+ MO.getOffset(), 0);
+ break;
+ }
+
+ if (MO.isSymbol()) {
+ emitExternalSymbolAddress(MO.getSymbolName(), X86::reloc_pcrel_word);
+ break;
+ }
+
+ // FIXME: Only used by hackish MCCodeEmitter, remove when dead.
+ if (MO.isJTI()) {
+ emitJumpTableAddress(MO.getIndex(), X86::reloc_pcrel_word);
+ break;
+ }
+
+ assert(MO.isImm() && "Unknown RawFrm operand!");
+ if (Opcode == X86::CALLpcrel32 || Opcode == X86::CALL64pcrel32) {
+ // Fix up immediate operand for pc relative calls.
+ intptr_t Imm = (intptr_t)MO.getImm();
+ Imm = Imm - MCE.getCurrentPCValue() - 4;
+ emitConstant(Imm, X86II::getSizeOfImm(Desc->TSFlags));
+ } else
+ emitConstant(MO.getImm(), X86II::getSizeOfImm(Desc->TSFlags));
+ break;
+ }
+
+ case X86II::AddRegFrm: {
+ MCE.emitByte(BaseOpcode +
+ getX86RegNum(MI.getOperand(CurOp++).getReg()));
+
+ if (CurOp == NumOps)
+ break;
+
+ const MachineOperand &MO1 = MI.getOperand(CurOp++);
+ unsigned Size = X86II::getSizeOfImm(Desc->TSFlags);
+ if (MO1.isImm()) {
+ emitConstant(MO1.getImm(), Size);
+ break;
+ }
+
+ unsigned rt = Is64BitMode ? X86::reloc_pcrel_word
+ : (IsPIC ? X86::reloc_picrel_word : X86::reloc_absolute_word);
+ if (Opcode == X86::MOV64ri64i32)
+ rt = X86::reloc_absolute_word; // FIXME: add X86II flag?
+ // This should not occur on Darwin for relocatable objects.
+ if (Opcode == X86::MOV64ri)
+ rt = X86::reloc_absolute_dword; // FIXME: add X86II flag?
+ if (MO1.isGlobal()) {
+ bool Indirect = gvNeedsNonLazyPtr(MO1, TM);
+ emitGlobalAddress(MO1.getGlobal(), rt, MO1.getOffset(), 0,
+ Indirect);
+ } else if (MO1.isSymbol())
+ emitExternalSymbolAddress(MO1.getSymbolName(), rt);
+ else if (MO1.isCPI())
+ emitConstPoolAddress(MO1.getIndex(), rt);
+ else if (MO1.isJTI())
+ emitJumpTableAddress(MO1.getIndex(), rt);
+ break;
+ }
+
+ case X86II::MRMDestReg: {
+ MCE.emitByte(BaseOpcode);
+
+ unsigned SrcRegNum = CurOp+1;
+ if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV)
+ SrcRegNum++;
+
+ emitRegModRMByte(MI.getOperand(CurOp).getReg(),
+ getX86RegNum(MI.getOperand(SrcRegNum).getReg()));
+ CurOp = SrcRegNum + 1;
+ break;
+ }
+ case X86II::MRMDestMem: {
+ MCE.emitByte(BaseOpcode);
+
+ unsigned SrcRegNum = CurOp + X86::AddrNumOperands;
+ if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV)
+ SrcRegNum++;
+ emitMemModRMByte(MI, CurOp,
+ getX86RegNum(MI.getOperand(SrcRegNum).getReg()));
+ CurOp = SrcRegNum + 1;
+ break;
+ }
+
+ case X86II::MRMSrcReg: {
+ MCE.emitByte(BaseOpcode);
+
+ unsigned SrcRegNum = CurOp+1;
+ if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV)
+ ++SrcRegNum;
+
+ if (HasMemOp4) // Skip 2nd src (which is encoded in I8IMM)
+ ++SrcRegNum;
+
+ emitRegModRMByte(MI.getOperand(SrcRegNum).getReg(),
+ getX86RegNum(MI.getOperand(CurOp).getReg()));
+ // 2 operands skipped with HasMemOp4, compensate accordingly
+ CurOp = HasMemOp4 ? SrcRegNum : SrcRegNum + 1;
+ if (HasVEX_4VOp3)
+ ++CurOp;
+ break;
+ }
+ case X86II::MRMSrcMem: {
+ int AddrOperands = X86::AddrNumOperands;
+ unsigned FirstMemOp = CurOp+1;
+ if (HasVEX_4V) {
+ ++AddrOperands;
+ ++FirstMemOp; // Skip the register source (which is encoded in VEX_VVVV).
+ }
+ if (HasMemOp4) // Skip second register source (encoded in I8IMM)
+ ++FirstMemOp;
+
+ MCE.emitByte(BaseOpcode);
+
+ intptr_t PCAdj = (CurOp + AddrOperands + 1 != NumOps) ?
+ X86II::getSizeOfImm(Desc->TSFlags) : 0;
+ emitMemModRMByte(MI, FirstMemOp,
+ getX86RegNum(MI.getOperand(CurOp).getReg()),PCAdj);
+ CurOp += AddrOperands + 1;
+ if (HasVEX_4VOp3)
+ ++CurOp;
+ break;
+ }
+
+ case X86II::MRM0r: case X86II::MRM1r:
+ case X86II::MRM2r: case X86II::MRM3r:
+ case X86II::MRM4r: case X86II::MRM5r:
+ case X86II::MRM6r: case X86II::MRM7r: {
+ if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV).
+ ++CurOp;
+ MCE.emitByte(BaseOpcode);
+ emitRegModRMByte(MI.getOperand(CurOp++).getReg(),
+ (Desc->TSFlags & X86II::FormMask)-X86II::MRM0r);
+
+ if (CurOp == NumOps)
+ break;
+
+ const MachineOperand &MO1 = MI.getOperand(CurOp++);
+ unsigned Size = X86II::getSizeOfImm(Desc->TSFlags);
+ if (MO1.isImm()) {
+ emitConstant(MO1.getImm(), Size);
+ break;
+ }
+
+ unsigned rt = Is64BitMode ? X86::reloc_pcrel_word
+ : (IsPIC ? X86::reloc_picrel_word : X86::reloc_absolute_word);
+ if (Opcode == X86::MOV64ri32)
+ rt = X86::reloc_absolute_word_sext; // FIXME: add X86II flag?
+ if (MO1.isGlobal()) {
+ bool Indirect = gvNeedsNonLazyPtr(MO1, TM);
+ emitGlobalAddress(MO1.getGlobal(), rt, MO1.getOffset(), 0,
+ Indirect);
+ } else if (MO1.isSymbol())
+ emitExternalSymbolAddress(MO1.getSymbolName(), rt);
+ else if (MO1.isCPI())
+ emitConstPoolAddress(MO1.getIndex(), rt);
+ else if (MO1.isJTI())
+ emitJumpTableAddress(MO1.getIndex(), rt);
+ break;
+ }
+
+ case X86II::MRM0m: case X86II::MRM1m:
+ case X86II::MRM2m: case X86II::MRM3m:
+ case X86II::MRM4m: case X86II::MRM5m:
+ case X86II::MRM6m: case X86II::MRM7m: {
+ if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV).
+ ++CurOp;
+ intptr_t PCAdj = (CurOp + X86::AddrNumOperands != NumOps) ?
+ (MI.getOperand(CurOp+X86::AddrNumOperands).isImm() ?
+ X86II::getSizeOfImm(Desc->TSFlags) : 4) : 0;
+
+ MCE.emitByte(BaseOpcode);
+ emitMemModRMByte(MI, CurOp, (Desc->TSFlags & X86II::FormMask)-X86II::MRM0m,
+ PCAdj);
+ CurOp += X86::AddrNumOperands;
+
+ if (CurOp == NumOps)
+ break;
+
+ const MachineOperand &MO = MI.getOperand(CurOp++);
+ unsigned Size = X86II::getSizeOfImm(Desc->TSFlags);
+ if (MO.isImm()) {
+ emitConstant(MO.getImm(), Size);
+ break;
+ }
+
+ unsigned rt = Is64BitMode ? X86::reloc_pcrel_word
+ : (IsPIC ? X86::reloc_picrel_word : X86::reloc_absolute_word);
+ if (Opcode == X86::MOV64mi32)
+ rt = X86::reloc_absolute_word_sext; // FIXME: add X86II flag?
+ if (MO.isGlobal()) {
+ bool Indirect = gvNeedsNonLazyPtr(MO, TM);
+ emitGlobalAddress(MO.getGlobal(), rt, MO.getOffset(), 0,
+ Indirect);
+ } else if (MO.isSymbol())
+ emitExternalSymbolAddress(MO.getSymbolName(), rt);
+ else if (MO.isCPI())
+ emitConstPoolAddress(MO.getIndex(), rt);
+ else if (MO.isJTI())
+ emitJumpTableAddress(MO.getIndex(), rt);
+ break;
+ }
+
+ case X86II::MRMInitReg:
+ MCE.emitByte(BaseOpcode);
+ // Duplicate register, used by things like MOV8r0 (aka xor reg,reg).
+ emitRegModRMByte(MI.getOperand(CurOp).getReg(),
+ getX86RegNum(MI.getOperand(CurOp).getReg()));
+ ++CurOp;
+ break;
+
+ case X86II::MRM_C1:
+ MCE.emitByte(BaseOpcode);
+ MCE.emitByte(0xC1);
+ break;
+ case X86II::MRM_C8:
+ MCE.emitByte(BaseOpcode);
+ MCE.emitByte(0xC8);
+ break;
+ case X86II::MRM_C9:
+ MCE.emitByte(BaseOpcode);
+ MCE.emitByte(0xC9);
+ break;
+ case X86II::MRM_CA:
+ MCE.emitByte(BaseOpcode);
+ MCE.emitByte(0xCA);
+ break;
+ case X86II::MRM_CB:
+ MCE.emitByte(BaseOpcode);
+ MCE.emitByte(0xCB);
+ break;
+ case X86II::MRM_E8:
+ MCE.emitByte(BaseOpcode);
+ MCE.emitByte(0xE8);
+ break;
+ case X86II::MRM_F0:
+ MCE.emitByte(BaseOpcode);
+ MCE.emitByte(0xF0);
+ break;
+ }
+
+ while (CurOp != NumOps && NumOps - CurOp <= 2) {
+ // The last source register of a 4 operand instruction in AVX is encoded
+ // in bits[7:4] of a immediate byte.
+ if ((TSFlags >> X86II::VEXShift) & X86II::VEX_I8IMM) {
+ const MachineOperand &MO = MI.getOperand(HasMemOp4 ? MemOp4_I8IMMOperand
+ : CurOp);
+ ++CurOp;
+ unsigned RegNum = getX86RegNum(MO.getReg()) << 4;
+ if (X86II::isX86_64ExtendedReg(MO.getReg()))
+ RegNum |= 1 << 7;
+ // If there is an additional 5th operand it must be an immediate, which
+ // is encoded in bits[3:0]
+ if (CurOp != NumOps) {
+ const MachineOperand &MIMM = MI.getOperand(CurOp++);
+ if (MIMM.isImm()) {
+ unsigned Val = MIMM.getImm();
+ assert(Val < 16 && "Immediate operand value out of range");
+ RegNum |= Val;
+ }
+ }
+ emitConstant(RegNum, 1);
+ } else {
+ emitConstant(MI.getOperand(CurOp++).getImm(),
+ X86II::getSizeOfImm(Desc->TSFlags));
+ }
+ }
+
+ if (!MI.isVariadic() && CurOp != NumOps) {
+#ifndef NDEBUG
+ dbgs() << "Cannot encode all operands of: " << MI << "\n";
+#endif
+ llvm_unreachable(0);
+ }
+
+ MCE.processDebugLoc(MI.getDebugLoc(), false);
+}
diff --git a/contrib/llvm/lib/Target/X86/X86FastISel.cpp b/contrib/llvm/lib/Target/X86/X86FastISel.cpp
new file mode 100644
index 0000000..cf44bd0
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86FastISel.cpp
@@ -0,0 +1,2427 @@
+//===-- X86FastISel.cpp - X86 FastISel implementation ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the X86-specific support for the FastISel class. Much
+// of the target-specific code is generated by tablegen in the file
+// X86GenFastISel.inc, which is #included here.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86ISelLowering.h"
+#include "X86InstrBuilder.h"
+#include "X86RegisterInfo.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/FastISel.h"
+#include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/GetElementPtrTypeIterator.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+namespace {
+
+class X86FastISel : public FastISel {
+ /// Subtarget - Keep a pointer to the X86Subtarget around so that we can
+ /// make the right decision when generating code for different targets.
+ const X86Subtarget *Subtarget;
+
+ /// RegInfo - X86 register info.
+ ///
+ const X86RegisterInfo *RegInfo;
+
+ /// X86ScalarSSEf32, X86ScalarSSEf64 - Select between SSE or x87
+ /// floating point ops.
+ /// When SSE is available, use it for f32 operations.
+ /// When SSE2 is available, use it for f64 operations.
+ bool X86ScalarSSEf64;
+ bool X86ScalarSSEf32;
+
+public:
+ explicit X86FastISel(FunctionLoweringInfo &funcInfo,
+ const TargetLibraryInfo *libInfo)
+ : FastISel(funcInfo, libInfo) {
+ Subtarget = &TM.getSubtarget<X86Subtarget>();
+ X86ScalarSSEf64 = Subtarget->hasSSE2();
+ X86ScalarSSEf32 = Subtarget->hasSSE1();
+ RegInfo = static_cast<const X86RegisterInfo*>(TM.getRegisterInfo());
+ }
+
+ virtual bool TargetSelectInstruction(const Instruction *I);
+
+ /// \brief The specified machine instr operand is a vreg, and that
+ /// vreg is being provided by the specified load instruction. If possible,
+ /// try to fold the load as an operand to the instruction, returning true if
+ /// possible.
+ virtual bool tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
+ const LoadInst *LI);
+
+ virtual bool FastLowerArguments();
+
+#include "X86GenFastISel.inc"
+
+private:
+ bool X86FastEmitCompare(const Value *LHS, const Value *RHS, EVT VT);
+
+ bool X86FastEmitLoad(EVT VT, const X86AddressMode &AM, unsigned &RR);
+
+ bool X86FastEmitStore(EVT VT, const Value *Val, const X86AddressMode &AM);
+ bool X86FastEmitStore(EVT VT, unsigned Val, const X86AddressMode &AM);
+
+ bool X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT, unsigned Src, EVT SrcVT,
+ unsigned &ResultReg);
+
+ bool X86SelectAddress(const Value *V, X86AddressMode &AM);
+ bool X86SelectCallAddress(const Value *V, X86AddressMode &AM);
+
+ bool X86SelectLoad(const Instruction *I);
+
+ bool X86SelectStore(const Instruction *I);
+
+ bool X86SelectRet(const Instruction *I);
+
+ bool X86SelectCmp(const Instruction *I);
+
+ bool X86SelectZExt(const Instruction *I);
+
+ bool X86SelectBranch(const Instruction *I);
+
+ bool X86SelectShift(const Instruction *I);
+
+ bool X86SelectDivRem(const Instruction *I);
+
+ bool X86SelectSelect(const Instruction *I);
+
+ bool X86SelectTrunc(const Instruction *I);
+
+ bool X86SelectFPExt(const Instruction *I);
+ bool X86SelectFPTrunc(const Instruction *I);
+
+ bool X86VisitIntrinsicCall(const IntrinsicInst &I);
+ bool X86SelectCall(const Instruction *I);
+
+ bool DoSelectCall(const Instruction *I, const char *MemIntName);
+
+ const X86InstrInfo *getInstrInfo() const {
+ return getTargetMachine()->getInstrInfo();
+ }
+ const X86TargetMachine *getTargetMachine() const {
+ return static_cast<const X86TargetMachine *>(&TM);
+ }
+
+ unsigned TargetMaterializeConstant(const Constant *C);
+
+ unsigned TargetMaterializeAlloca(const AllocaInst *C);
+
+ unsigned TargetMaterializeFloatZero(const ConstantFP *CF);
+
+ /// isScalarFPTypeInSSEReg - Return true if the specified scalar FP type is
+ /// computed in an SSE register, not on the X87 floating point stack.
+ bool isScalarFPTypeInSSEReg(EVT VT) const {
+ return (VT == MVT::f64 && X86ScalarSSEf64) || // f64 is when SSE2
+ (VT == MVT::f32 && X86ScalarSSEf32); // f32 is when SSE1
+ }
+
+ bool isTypeLegal(Type *Ty, MVT &VT, bool AllowI1 = false);
+
+ bool IsMemcpySmall(uint64_t Len);
+
+ bool TryEmitSmallMemcpy(X86AddressMode DestAM,
+ X86AddressMode SrcAM, uint64_t Len);
+};
+
+} // end anonymous namespace.
+
+bool X86FastISel::isTypeLegal(Type *Ty, MVT &VT, bool AllowI1) {
+ EVT evt = TLI.getValueType(Ty, /*HandleUnknown=*/true);
+ if (evt == MVT::Other || !evt.isSimple())
+ // Unhandled type. Halt "fast" selection and bail.
+ return false;
+
+ VT = evt.getSimpleVT();
+ // For now, require SSE/SSE2 for performing floating-point operations,
+ // since x87 requires additional work.
+ if (VT == MVT::f64 && !X86ScalarSSEf64)
+ return false;
+ if (VT == MVT::f32 && !X86ScalarSSEf32)
+ return false;
+ // Similarly, no f80 support yet.
+ if (VT == MVT::f80)
+ return false;
+ // We only handle legal types. For example, on x86-32 the instruction
+ // selector contains all of the 64-bit instructions from x86-64,
+ // under the assumption that i64 won't be used if the target doesn't
+ // support it.
+ return (AllowI1 && VT == MVT::i1) || TLI.isTypeLegal(VT);
+}
+
+#include "X86GenCallingConv.inc"
+
+/// X86FastEmitLoad - Emit a machine instruction to load a value of type VT.
+/// The address is either pre-computed, i.e. Ptr, or a GlobalAddress, i.e. GV.
+/// Return true and the result register by reference if it is possible.
+bool X86FastISel::X86FastEmitLoad(EVT VT, const X86AddressMode &AM,
+ unsigned &ResultReg) {
+ // Get opcode and regclass of the output for the given load instruction.
+ unsigned Opc = 0;
+ const TargetRegisterClass *RC = NULL;
+ switch (VT.getSimpleVT().SimpleTy) {
+ default: return false;
+ case MVT::i1:
+ case MVT::i8:
+ Opc = X86::MOV8rm;
+ RC = &X86::GR8RegClass;
+ break;
+ case MVT::i16:
+ Opc = X86::MOV16rm;
+ RC = &X86::GR16RegClass;
+ break;
+ case MVT::i32:
+ Opc = X86::MOV32rm;
+ RC = &X86::GR32RegClass;
+ break;
+ case MVT::i64:
+ // Must be in x86-64 mode.
+ Opc = X86::MOV64rm;
+ RC = &X86::GR64RegClass;
+ break;
+ case MVT::f32:
+ if (X86ScalarSSEf32) {
+ Opc = Subtarget->hasAVX() ? X86::VMOVSSrm : X86::MOVSSrm;
+ RC = &X86::FR32RegClass;
+ } else {
+ Opc = X86::LD_Fp32m;
+ RC = &X86::RFP32RegClass;
+ }
+ break;
+ case MVT::f64:
+ if (X86ScalarSSEf64) {
+ Opc = Subtarget->hasAVX() ? X86::VMOVSDrm : X86::MOVSDrm;
+ RC = &X86::FR64RegClass;
+ } else {
+ Opc = X86::LD_Fp64m;
+ RC = &X86::RFP64RegClass;
+ }
+ break;
+ case MVT::f80:
+ // No f80 support yet.
+ return false;
+ }
+
+ ResultReg = createResultReg(RC);
+ addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt,
+ DL, TII.get(Opc), ResultReg), AM);
+ return true;
+}
+
+/// X86FastEmitStore - Emit a machine instruction to store a value Val of
+/// type VT. The address is either pre-computed, consisted of a base ptr, Ptr
+/// and a displacement offset, or a GlobalAddress,
+/// i.e. V. Return true if it is possible.
+bool
+X86FastISel::X86FastEmitStore(EVT VT, unsigned Val, const X86AddressMode &AM) {
+ // Get opcode and regclass of the output for the given store instruction.
+ unsigned Opc = 0;
+ switch (VT.getSimpleVT().SimpleTy) {
+ case MVT::f80: // No f80 support yet.
+ default: return false;
+ case MVT::i1: {
+ // Mask out all but lowest bit.
+ unsigned AndResult = createResultReg(&X86::GR8RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ TII.get(X86::AND8ri), AndResult).addReg(Val).addImm(1);
+ Val = AndResult;
+ }
+ // FALLTHROUGH, handling i1 as i8.
+ case MVT::i8: Opc = X86::MOV8mr; break;
+ case MVT::i16: Opc = X86::MOV16mr; break;
+ case MVT::i32: Opc = X86::MOV32mr; break;
+ case MVT::i64: Opc = X86::MOV64mr; break; // Must be in x86-64 mode.
+ case MVT::f32:
+ Opc = X86ScalarSSEf32 ?
+ (Subtarget->hasAVX() ? X86::VMOVSSmr : X86::MOVSSmr) : X86::ST_Fp32m;
+ break;
+ case MVT::f64:
+ Opc = X86ScalarSSEf64 ?
+ (Subtarget->hasAVX() ? X86::VMOVSDmr : X86::MOVSDmr) : X86::ST_Fp64m;
+ break;
+ case MVT::v4f32:
+ Opc = X86::MOVAPSmr;
+ break;
+ case MVT::v2f64:
+ Opc = X86::MOVAPDmr;
+ break;
+ case MVT::v4i32:
+ case MVT::v2i64:
+ case MVT::v8i16:
+ case MVT::v16i8:
+ Opc = X86::MOVDQAmr;
+ break;
+ }
+
+ addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt,
+ DL, TII.get(Opc)), AM).addReg(Val);
+ return true;
+}
+
+bool X86FastISel::X86FastEmitStore(EVT VT, const Value *Val,
+ const X86AddressMode &AM) {
+ // Handle 'null' like i32/i64 0.
+ if (isa<ConstantPointerNull>(Val))
+ Val = Constant::getNullValue(TD.getIntPtrType(Val->getContext()));
+
+ // If this is a store of a simple constant, fold the constant into the store.
+ if (const ConstantInt *CI = dyn_cast<ConstantInt>(Val)) {
+ unsigned Opc = 0;
+ bool Signed = true;
+ switch (VT.getSimpleVT().SimpleTy) {
+ default: break;
+ case MVT::i1: Signed = false; // FALLTHROUGH to handle as i8.
+ case MVT::i8: Opc = X86::MOV8mi; break;
+ case MVT::i16: Opc = X86::MOV16mi; break;
+ case MVT::i32: Opc = X86::MOV32mi; break;
+ case MVT::i64:
+ // Must be a 32-bit sign extended value.
+ if (isInt<32>(CI->getSExtValue()))
+ Opc = X86::MOV64mi32;
+ break;
+ }
+
+ if (Opc) {
+ addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt,
+ DL, TII.get(Opc)), AM)
+ .addImm(Signed ? (uint64_t) CI->getSExtValue() :
+ CI->getZExtValue());
+ return true;
+ }
+ }
+
+ unsigned ValReg = getRegForValue(Val);
+ if (ValReg == 0)
+ return false;
+
+ return X86FastEmitStore(VT, ValReg, AM);
+}
+
+/// X86FastEmitExtend - Emit a machine instruction to extend a value Src of
+/// type SrcVT to type DstVT using the specified extension opcode Opc (e.g.
+/// ISD::SIGN_EXTEND).
+bool X86FastISel::X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT,
+ unsigned Src, EVT SrcVT,
+ unsigned &ResultReg) {
+ unsigned RR = FastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Opc,
+ Src, /*TODO: Kill=*/false);
+ if (RR == 0)
+ return false;
+
+ ResultReg = RR;
+ return true;
+}
+
+/// X86SelectAddress - Attempt to fill in an address from the given value.
+///
+bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) {
+ const User *U = NULL;
+ unsigned Opcode = Instruction::UserOp1;
+ if (const Instruction *I = dyn_cast<Instruction>(V)) {
+ // Don't walk into other basic blocks; it's possible we haven't
+ // visited them yet, so the instructions may not yet be assigned
+ // virtual registers.
+ if (FuncInfo.StaticAllocaMap.count(static_cast<const AllocaInst *>(V)) ||
+ FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) {
+ Opcode = I->getOpcode();
+ U = I;
+ }
+ } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(V)) {
+ Opcode = C->getOpcode();
+ U = C;
+ }
+
+ if (PointerType *Ty = dyn_cast<PointerType>(V->getType()))
+ if (Ty->getAddressSpace() > 255)
+ // Fast instruction selection doesn't support the special
+ // address spaces.
+ return false;
+
+ switch (Opcode) {
+ default: break;
+ case Instruction::BitCast:
+ // Look past bitcasts.
+ return X86SelectAddress(U->getOperand(0), AM);
+
+ case Instruction::IntToPtr:
+ // Look past no-op inttoptrs.
+ if (TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy())
+ return X86SelectAddress(U->getOperand(0), AM);
+ break;
+
+ case Instruction::PtrToInt:
+ // Look past no-op ptrtoints.
+ if (TLI.getValueType(U->getType()) == TLI.getPointerTy())
+ return X86SelectAddress(U->getOperand(0), AM);
+ break;
+
+ case Instruction::Alloca: {
+ // Do static allocas.
+ const AllocaInst *A = cast<AllocaInst>(V);
+ DenseMap<const AllocaInst*, int>::iterator SI =
+ FuncInfo.StaticAllocaMap.find(A);
+ if (SI != FuncInfo.StaticAllocaMap.end()) {
+ AM.BaseType = X86AddressMode::FrameIndexBase;
+ AM.Base.FrameIndex = SI->second;
+ return true;
+ }
+ break;
+ }
+
+ case Instruction::Add: {
+ // Adds of constants are common and easy enough.
+ if (const ConstantInt *CI = dyn_cast<ConstantInt>(U->getOperand(1))) {
+ uint64_t Disp = (int32_t)AM.Disp + (uint64_t)CI->getSExtValue();
+ // They have to fit in the 32-bit signed displacement field though.
+ if (isInt<32>(Disp)) {
+ AM.Disp = (uint32_t)Disp;
+ return X86SelectAddress(U->getOperand(0), AM);
+ }
+ }
+ break;
+ }
+
+ case Instruction::GetElementPtr: {
+ X86AddressMode SavedAM = AM;
+
+ // Pattern-match simple GEPs.
+ uint64_t Disp = (int32_t)AM.Disp;
+ unsigned IndexReg = AM.IndexReg;
+ unsigned Scale = AM.Scale;
+ gep_type_iterator GTI = gep_type_begin(U);
+ // Iterate through the indices, folding what we can. Constants can be
+ // folded, and one dynamic index can be handled, if the scale is supported.
+ for (User::const_op_iterator i = U->op_begin() + 1, e = U->op_end();
+ i != e; ++i, ++GTI) {
+ const Value *Op = *i;
+ if (StructType *STy = dyn_cast<StructType>(*GTI)) {
+ const StructLayout *SL = TD.getStructLayout(STy);
+ Disp += SL->getElementOffset(cast<ConstantInt>(Op)->getZExtValue());
+ continue;
+ }
+
+ // A array/variable index is always of the form i*S where S is the
+ // constant scale size. See if we can push the scale into immediates.
+ uint64_t S = TD.getTypeAllocSize(GTI.getIndexedType());
+ for (;;) {
+ if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
+ // Constant-offset addressing.
+ Disp += CI->getSExtValue() * S;
+ break;
+ }
+ if (isa<AddOperator>(Op) &&
+ (!isa<Instruction>(Op) ||
+ FuncInfo.MBBMap[cast<Instruction>(Op)->getParent()]
+ == FuncInfo.MBB) &&
+ isa<ConstantInt>(cast<AddOperator>(Op)->getOperand(1))) {
+ // An add (in the same block) with a constant operand. Fold the
+ // constant.
+ ConstantInt *CI =
+ cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1));
+ Disp += CI->getSExtValue() * S;
+ // Iterate on the other operand.
+ Op = cast<AddOperator>(Op)->getOperand(0);
+ continue;
+ }
+ if (IndexReg == 0 &&
+ (!AM.GV || !Subtarget->isPICStyleRIPRel()) &&
+ (S == 1 || S == 2 || S == 4 || S == 8)) {
+ // Scaled-index addressing.
+ Scale = S;
+ IndexReg = getRegForGEPIndex(Op).first;
+ if (IndexReg == 0)
+ return false;
+ break;
+ }
+ // Unsupported.
+ goto unsupported_gep;
+ }
+ }
+ // Check for displacement overflow.
+ if (!isInt<32>(Disp))
+ break;
+ // Ok, the GEP indices were covered by constant-offset and scaled-index
+ // addressing. Update the address state and move on to examining the base.
+ AM.IndexReg = IndexReg;
+ AM.Scale = Scale;
+ AM.Disp = (uint32_t)Disp;
+ if (X86SelectAddress(U->getOperand(0), AM))
+ return true;
+
+ // If we couldn't merge the gep value into this addr mode, revert back to
+ // our address and just match the value instead of completely failing.
+ AM = SavedAM;
+ break;
+ unsupported_gep:
+ // Ok, the GEP indices weren't all covered.
+ break;
+ }
+ }
+
+ // Handle constant address.
+ if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
+ // Can't handle alternate code models yet.
+ if (TM.getCodeModel() != CodeModel::Small)
+ return false;
+
+ // Can't handle TLS yet.
+ if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV))
+ if (GVar->isThreadLocal())
+ return false;
+
+ // Can't handle TLS yet, part 2 (this is slightly crazy, but this is how
+ // it works...).
+ if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
+ if (const GlobalVariable *GVar =
+ dyn_cast_or_null<GlobalVariable>(GA->resolveAliasedGlobal(false)))
+ if (GVar->isThreadLocal())
+ return false;
+
+ // RIP-relative addresses can't have additional register operands, so if
+ // we've already folded stuff into the addressing mode, just force the
+ // global value into its own register, which we can use as the basereg.
+ if (!Subtarget->isPICStyleRIPRel() ||
+ (AM.Base.Reg == 0 && AM.IndexReg == 0)) {
+ // Okay, we've committed to selecting this global. Set up the address.
+ AM.GV = GV;
+
+ // Allow the subtarget to classify the global.
+ unsigned char GVFlags = Subtarget->ClassifyGlobalReference(GV, TM);
+
+ // If this reference is relative to the pic base, set it now.
+ if (isGlobalRelativeToPICBase(GVFlags)) {
+ // FIXME: How do we know Base.Reg is free??
+ AM.Base.Reg = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
+ }
+
+ // Unless the ABI requires an extra load, return a direct reference to
+ // the global.
+ if (!isGlobalStubReference(GVFlags)) {
+ if (Subtarget->isPICStyleRIPRel()) {
+ // Use rip-relative addressing if we can. Above we verified that the
+ // base and index registers are unused.
+ assert(AM.Base.Reg == 0 && AM.IndexReg == 0);
+ AM.Base.Reg = X86::RIP;
+ }
+ AM.GVOpFlags = GVFlags;
+ return true;
+ }
+
+ // Ok, we need to do a load from a stub. If we've already loaded from
+ // this stub, reuse the loaded pointer, otherwise emit the load now.
+ DenseMap<const Value*, unsigned>::iterator I = LocalValueMap.find(V);
+ unsigned LoadReg;
+ if (I != LocalValueMap.end() && I->second != 0) {
+ LoadReg = I->second;
+ } else {
+ // Issue load from stub.
+ unsigned Opc = 0;
+ const TargetRegisterClass *RC = NULL;
+ X86AddressMode StubAM;
+ StubAM.Base.Reg = AM.Base.Reg;
+ StubAM.GV = GV;
+ StubAM.GVOpFlags = GVFlags;
+
+ // Prepare for inserting code in the local-value area.
+ SavePoint SaveInsertPt = enterLocalValueArea();
+
+ if (TLI.getPointerTy() == MVT::i64) {
+ Opc = X86::MOV64rm;
+ RC = &X86::GR64RegClass;
+
+ if (Subtarget->isPICStyleRIPRel())
+ StubAM.Base.Reg = X86::RIP;
+ } else {
+ Opc = X86::MOV32rm;
+ RC = &X86::GR32RegClass;
+ }
+
+ LoadReg = createResultReg(RC);
+ MachineInstrBuilder LoadMI =
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), LoadReg);
+ addFullAddress(LoadMI, StubAM);
+
+ // Ok, back to normal mode.
+ leaveLocalValueArea(SaveInsertPt);
+
+ // Prevent loading GV stub multiple times in same MBB.
+ LocalValueMap[V] = LoadReg;
+ }
+
+ // Now construct the final address. Note that the Disp, Scale,
+ // and Index values may already be set here.
+ AM.Base.Reg = LoadReg;
+ AM.GV = 0;
+ return true;
+ }
+ }
+
+ // If all else fails, try to materialize the value in a register.
+ if (!AM.GV || !Subtarget->isPICStyleRIPRel()) {
+ if (AM.Base.Reg == 0) {
+ AM.Base.Reg = getRegForValue(V);
+ return AM.Base.Reg != 0;
+ }
+ if (AM.IndexReg == 0) {
+ assert(AM.Scale == 1 && "Scale with no index!");
+ AM.IndexReg = getRegForValue(V);
+ return AM.IndexReg != 0;
+ }
+ }
+
+ return false;
+}
+
+/// X86SelectCallAddress - Attempt to fill in an address from the given value.
+///
+bool X86FastISel::X86SelectCallAddress(const Value *V, X86AddressMode &AM) {
+ const User *U = NULL;
+ unsigned Opcode = Instruction::UserOp1;
+ if (const Instruction *I = dyn_cast<Instruction>(V)) {
+ Opcode = I->getOpcode();
+ U = I;
+ } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(V)) {
+ Opcode = C->getOpcode();
+ U = C;
+ }
+
+ switch (Opcode) {
+ default: break;
+ case Instruction::BitCast:
+ // Look past bitcasts.
+ return X86SelectCallAddress(U->getOperand(0), AM);
+
+ case Instruction::IntToPtr:
+ // Look past no-op inttoptrs.
+ if (TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy())
+ return X86SelectCallAddress(U->getOperand(0), AM);
+ break;
+
+ case Instruction::PtrToInt:
+ // Look past no-op ptrtoints.
+ if (TLI.getValueType(U->getType()) == TLI.getPointerTy())
+ return X86SelectCallAddress(U->getOperand(0), AM);
+ break;
+ }
+
+ // Handle constant address.
+ if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
+ // Can't handle alternate code models yet.
+ if (TM.getCodeModel() != CodeModel::Small)
+ return false;
+
+ // RIP-relative addresses can't have additional register operands.
+ if (Subtarget->isPICStyleRIPRel() &&
+ (AM.Base.Reg != 0 || AM.IndexReg != 0))
+ return false;
+
+ // Can't handle DLLImport.
+ if (GV->hasDLLImportLinkage())
+ return false;
+
+ // Can't handle TLS.
+ if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV))
+ if (GVar->isThreadLocal())
+ return false;
+
+ // Okay, we've committed to selecting this global. Set up the basic address.
+ AM.GV = GV;
+
+ // No ABI requires an extra load for anything other than DLLImport, which
+ // we rejected above. Return a direct reference to the global.
+ if (Subtarget->isPICStyleRIPRel()) {
+ // Use rip-relative addressing if we can. Above we verified that the
+ // base and index registers are unused.
+ assert(AM.Base.Reg == 0 && AM.IndexReg == 0);
+ AM.Base.Reg = X86::RIP;
+ } else if (Subtarget->isPICStyleStubPIC()) {
+ AM.GVOpFlags = X86II::MO_PIC_BASE_OFFSET;
+ } else if (Subtarget->isPICStyleGOT()) {
+ AM.GVOpFlags = X86II::MO_GOTOFF;
+ }
+
+ return true;
+ }
+
+ // If all else fails, try to materialize the value in a register.
+ if (!AM.GV || !Subtarget->isPICStyleRIPRel()) {
+ if (AM.Base.Reg == 0) {
+ AM.Base.Reg = getRegForValue(V);
+ return AM.Base.Reg != 0;
+ }
+ if (AM.IndexReg == 0) {
+ assert(AM.Scale == 1 && "Scale with no index!");
+ AM.IndexReg = getRegForValue(V);
+ return AM.IndexReg != 0;
+ }
+ }
+
+ return false;
+}
+
+
+/// X86SelectStore - Select and emit code to implement store instructions.
+bool X86FastISel::X86SelectStore(const Instruction *I) {
+ // Atomic stores need special handling.
+ const StoreInst *S = cast<StoreInst>(I);
+
+ if (S->isAtomic())
+ return false;
+
+ MVT VT;
+ if (!isTypeLegal(I->getOperand(0)->getType(), VT, /*AllowI1=*/true))
+ return false;
+
+ X86AddressMode AM;
+ if (!X86SelectAddress(I->getOperand(1), AM))
+ return false;
+
+ return X86FastEmitStore(VT, I->getOperand(0), AM);
+}
+
+/// X86SelectRet - Select and emit code to implement ret instructions.
+bool X86FastISel::X86SelectRet(const Instruction *I) {
+ const ReturnInst *Ret = cast<ReturnInst>(I);
+ const Function &F = *I->getParent()->getParent();
+ const X86MachineFunctionInfo *X86MFInfo =
+ FuncInfo.MF->getInfo<X86MachineFunctionInfo>();
+
+ if (!FuncInfo.CanLowerReturn)
+ return false;
+
+ CallingConv::ID CC = F.getCallingConv();
+ if (CC != CallingConv::C &&
+ CC != CallingConv::Fast &&
+ CC != CallingConv::X86_FastCall)
+ return false;
+
+ if (Subtarget->isTargetWin64())
+ return false;
+
+ // Don't handle popping bytes on return for now.
+ if (X86MFInfo->getBytesToPopOnReturn() != 0)
+ return false;
+
+ // fastcc with -tailcallopt is intended to provide a guaranteed
+ // tail call optimization. Fastisel doesn't know how to do that.
+ if (CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt)
+ return false;
+
+ // Let SDISel handle vararg functions.
+ if (F.isVarArg())
+ return false;
+
+ // Build a list of return value registers.
+ SmallVector<unsigned, 4> RetRegs;
+
+ if (Ret->getNumOperands() > 0) {
+ SmallVector<ISD::OutputArg, 4> Outs;
+ GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI);
+
+ // Analyze operands of the call, assigning locations to each operand.
+ SmallVector<CCValAssign, 16> ValLocs;
+ CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, TM, ValLocs,
+ I->getContext());
+ CCInfo.AnalyzeReturn(Outs, RetCC_X86);
+
+ const Value *RV = Ret->getOperand(0);
+ unsigned Reg = getRegForValue(RV);
+ if (Reg == 0)
+ return false;
+
+ // Only handle a single return value for now.
+ if (ValLocs.size() != 1)
+ return false;
+
+ CCValAssign &VA = ValLocs[0];
+
+ // Don't bother handling odd stuff for now.
+ if (VA.getLocInfo() != CCValAssign::Full)
+ return false;
+ // Only handle register returns for now.
+ if (!VA.isRegLoc())
+ return false;
+
+ // The calling-convention tables for x87 returns don't tell
+ // the whole story.
+ if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1)
+ return false;
+
+ unsigned SrcReg = Reg + VA.getValNo();
+ EVT SrcVT = TLI.getValueType(RV->getType());
+ EVT DstVT = VA.getValVT();
+ // Special handling for extended integers.
+ if (SrcVT != DstVT) {
+ if (SrcVT != MVT::i1 && SrcVT != MVT::i8 && SrcVT != MVT::i16)
+ return false;
+
+ if (!Outs[0].Flags.isZExt() && !Outs[0].Flags.isSExt())
+ return false;
+
+ assert(DstVT == MVT::i32 && "X86 should always ext to i32");
+
+ if (SrcVT == MVT::i1) {
+ if (Outs[0].Flags.isSExt())
+ return false;
+ SrcReg = FastEmitZExtFromI1(MVT::i8, SrcReg, /*TODO: Kill=*/false);
+ SrcVT = MVT::i8;
+ }
+ unsigned Op = Outs[0].Flags.isZExt() ? ISD::ZERO_EXTEND :
+ ISD::SIGN_EXTEND;
+ SrcReg = FastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Op,
+ SrcReg, /*TODO: Kill=*/false);
+ }
+
+ // Make the copy.
+ unsigned DstReg = VA.getLocReg();
+ const TargetRegisterClass* SrcRC = MRI.getRegClass(SrcReg);
+ // Avoid a cross-class copy. This is very unlikely.
+ if (!SrcRC->contains(DstReg))
+ return false;
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
+ DstReg).addReg(SrcReg);
+
+ // Add register to return instruction.
+ RetRegs.push_back(VA.getLocReg());
+ }
+
+ // The x86-64 ABI for returning structs by value requires that we copy
+ // the sret argument into %rax for the return. We saved the argument into
+ // a virtual register in the entry block, so now we copy the value out
+ // and into %rax. We also do the same with %eax for Win32.
+ if (F.hasStructRetAttr() &&
+ (Subtarget->is64Bit() || Subtarget->isTargetWindows())) {
+ unsigned Reg = X86MFInfo->getSRetReturnReg();
+ assert(Reg &&
+ "SRetReturnReg should have been set in LowerFormalArguments()!");
+ unsigned RetReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
+ RetReg).addReg(Reg);
+ RetRegs.push_back(RetReg);
+ }
+
+ // Now emit the RET.
+ MachineInstrBuilder MIB =
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::RET));
+ for (unsigned i = 0, e = RetRegs.size(); i != e; ++i)
+ MIB.addReg(RetRegs[i], RegState::Implicit);
+ return true;
+}
+
+/// X86SelectLoad - Select and emit code to implement load instructions.
+///
+bool X86FastISel::X86SelectLoad(const Instruction *I) {
+ // Atomic loads need special handling.
+ if (cast<LoadInst>(I)->isAtomic())
+ return false;
+
+ MVT VT;
+ if (!isTypeLegal(I->getType(), VT, /*AllowI1=*/true))
+ return false;
+
+ X86AddressMode AM;
+ if (!X86SelectAddress(I->getOperand(0), AM))
+ return false;
+
+ unsigned ResultReg = 0;
+ if (X86FastEmitLoad(VT, AM, ResultReg)) {
+ UpdateValueMap(I, ResultReg);
+ return true;
+ }
+ return false;
+}
+
+static unsigned X86ChooseCmpOpcode(EVT VT, const X86Subtarget *Subtarget) {
+ bool HasAVX = Subtarget->hasAVX();
+ bool X86ScalarSSEf32 = Subtarget->hasSSE1();
+ bool X86ScalarSSEf64 = Subtarget->hasSSE2();
+
+ switch (VT.getSimpleVT().SimpleTy) {
+ default: return 0;
+ case MVT::i8: return X86::CMP8rr;
+ case MVT::i16: return X86::CMP16rr;
+ case MVT::i32: return X86::CMP32rr;
+ case MVT::i64: return X86::CMP64rr;
+ case MVT::f32:
+ return X86ScalarSSEf32 ? (HasAVX ? X86::VUCOMISSrr : X86::UCOMISSrr) : 0;
+ case MVT::f64:
+ return X86ScalarSSEf64 ? (HasAVX ? X86::VUCOMISDrr : X86::UCOMISDrr) : 0;
+ }
+}
+
+/// X86ChooseCmpImmediateOpcode - If we have a comparison with RHS as the RHS
+/// of the comparison, return an opcode that works for the compare (e.g.
+/// CMP32ri) otherwise return 0.
+static unsigned X86ChooseCmpImmediateOpcode(EVT VT, const ConstantInt *RHSC) {
+ switch (VT.getSimpleVT().SimpleTy) {
+ // Otherwise, we can't fold the immediate into this comparison.
+ default: return 0;
+ case MVT::i8: return X86::CMP8ri;
+ case MVT::i16: return X86::CMP16ri;
+ case MVT::i32: return X86::CMP32ri;
+ case MVT::i64:
+ // 64-bit comparisons are only valid if the immediate fits in a 32-bit sext
+ // field.
+ if ((int)RHSC->getSExtValue() == RHSC->getSExtValue())
+ return X86::CMP64ri32;
+ return 0;
+ }
+}
+
+bool X86FastISel::X86FastEmitCompare(const Value *Op0, const Value *Op1,
+ EVT VT) {
+ unsigned Op0Reg = getRegForValue(Op0);
+ if (Op0Reg == 0) return false;
+
+ // Handle 'null' like i32/i64 0.
+ if (isa<ConstantPointerNull>(Op1))
+ Op1 = Constant::getNullValue(TD.getIntPtrType(Op0->getContext()));
+
+ // We have two options: compare with register or immediate. If the RHS of
+ // the compare is an immediate that we can fold into this compare, use
+ // CMPri, otherwise use CMPrr.
+ if (const ConstantInt *Op1C = dyn_cast<ConstantInt>(Op1)) {
+ if (unsigned CompareImmOpc = X86ChooseCmpImmediateOpcode(VT, Op1C)) {
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(CompareImmOpc))
+ .addReg(Op0Reg)
+ .addImm(Op1C->getSExtValue());
+ return true;
+ }
+ }
+
+ unsigned CompareOpc = X86ChooseCmpOpcode(VT, Subtarget);
+ if (CompareOpc == 0) return false;
+
+ unsigned Op1Reg = getRegForValue(Op1);
+ if (Op1Reg == 0) return false;
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(CompareOpc))
+ .addReg(Op0Reg)
+ .addReg(Op1Reg);
+
+ return true;
+}
+
+bool X86FastISel::X86SelectCmp(const Instruction *I) {
+ const CmpInst *CI = cast<CmpInst>(I);
+
+ MVT VT;
+ if (!isTypeLegal(I->getOperand(0)->getType(), VT))
+ return false;
+
+ unsigned ResultReg = createResultReg(&X86::GR8RegClass);
+ unsigned SetCCOpc;
+ bool SwapArgs; // false -> compare Op0, Op1. true -> compare Op1, Op0.
+ switch (CI->getPredicate()) {
+ case CmpInst::FCMP_OEQ: {
+ if (!X86FastEmitCompare(CI->getOperand(0), CI->getOperand(1), VT))
+ return false;
+
+ unsigned EReg = createResultReg(&X86::GR8RegClass);
+ unsigned NPReg = createResultReg(&X86::GR8RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::SETEr), EReg);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ TII.get(X86::SETNPr), NPReg);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ TII.get(X86::AND8rr), ResultReg).addReg(NPReg).addReg(EReg);
+ UpdateValueMap(I, ResultReg);
+ return true;
+ }
+ case CmpInst::FCMP_UNE: {
+ if (!X86FastEmitCompare(CI->getOperand(0), CI->getOperand(1), VT))
+ return false;
+
+ unsigned NEReg = createResultReg(&X86::GR8RegClass);
+ unsigned PReg = createResultReg(&X86::GR8RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::SETNEr), NEReg);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::SETPr), PReg);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::OR8rr),ResultReg)
+ .addReg(PReg).addReg(NEReg);
+ UpdateValueMap(I, ResultReg);
+ return true;
+ }
+ case CmpInst::FCMP_OGT: SwapArgs = false; SetCCOpc = X86::SETAr; break;
+ case CmpInst::FCMP_OGE: SwapArgs = false; SetCCOpc = X86::SETAEr; break;
+ case CmpInst::FCMP_OLT: SwapArgs = true; SetCCOpc = X86::SETAr; break;
+ case CmpInst::FCMP_OLE: SwapArgs = true; SetCCOpc = X86::SETAEr; break;
+ case CmpInst::FCMP_ONE: SwapArgs = false; SetCCOpc = X86::SETNEr; break;
+ case CmpInst::FCMP_ORD: SwapArgs = false; SetCCOpc = X86::SETNPr; break;
+ case CmpInst::FCMP_UNO: SwapArgs = false; SetCCOpc = X86::SETPr; break;
+ case CmpInst::FCMP_UEQ: SwapArgs = false; SetCCOpc = X86::SETEr; break;
+ case CmpInst::FCMP_UGT: SwapArgs = true; SetCCOpc = X86::SETBr; break;
+ case CmpInst::FCMP_UGE: SwapArgs = true; SetCCOpc = X86::SETBEr; break;
+ case CmpInst::FCMP_ULT: SwapArgs = false; SetCCOpc = X86::SETBr; break;
+ case CmpInst::FCMP_ULE: SwapArgs = false; SetCCOpc = X86::SETBEr; break;
+
+ case CmpInst::ICMP_EQ: SwapArgs = false; SetCCOpc = X86::SETEr; break;
+ case CmpInst::ICMP_NE: SwapArgs = false; SetCCOpc = X86::SETNEr; break;
+ case CmpInst::ICMP_UGT: SwapArgs = false; SetCCOpc = X86::SETAr; break;
+ case CmpInst::ICMP_UGE: SwapArgs = false; SetCCOpc = X86::SETAEr; break;
+ case CmpInst::ICMP_ULT: SwapArgs = false; SetCCOpc = X86::SETBr; break;
+ case CmpInst::ICMP_ULE: SwapArgs = false; SetCCOpc = X86::SETBEr; break;
+ case CmpInst::ICMP_SGT: SwapArgs = false; SetCCOpc = X86::SETGr; break;
+ case CmpInst::ICMP_SGE: SwapArgs = false; SetCCOpc = X86::SETGEr; break;
+ case CmpInst::ICMP_SLT: SwapArgs = false; SetCCOpc = X86::SETLr; break;
+ case CmpInst::ICMP_SLE: SwapArgs = false; SetCCOpc = X86::SETLEr; break;
+ default:
+ return false;
+ }
+
+ const Value *Op0 = CI->getOperand(0), *Op1 = CI->getOperand(1);
+ if (SwapArgs)
+ std::swap(Op0, Op1);
+
+ // Emit a compare of Op0/Op1.
+ if (!X86FastEmitCompare(Op0, Op1, VT))
+ return false;
+
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(SetCCOpc), ResultReg);
+ UpdateValueMap(I, ResultReg);
+ return true;
+}
+
+bool X86FastISel::X86SelectZExt(const Instruction *I) {
+ // Handle zero-extension from i1 to i8, which is common.
+ if (!I->getOperand(0)->getType()->isIntegerTy(1))
+ return false;
+
+ EVT DstVT = TLI.getValueType(I->getType());
+ if (!TLI.isTypeLegal(DstVT))
+ return false;
+
+ unsigned ResultReg = getRegForValue(I->getOperand(0));
+ if (ResultReg == 0)
+ return false;
+
+ // Set the high bits to zero.
+ ResultReg = FastEmitZExtFromI1(MVT::i8, ResultReg, /*TODO: Kill=*/false);
+ if (ResultReg == 0)
+ return false;
+
+ if (DstVT != MVT::i8) {
+ ResultReg = FastEmit_r(MVT::i8, DstVT.getSimpleVT(), ISD::ZERO_EXTEND,
+ ResultReg, /*Kill=*/true);
+ if (ResultReg == 0)
+ return false;
+ }
+
+ UpdateValueMap(I, ResultReg);
+ return true;
+}
+
+
+bool X86FastISel::X86SelectBranch(const Instruction *I) {
+ // Unconditional branches are selected by tablegen-generated code.
+ // Handle a conditional branch.
+ const BranchInst *BI = cast<BranchInst>(I);
+ MachineBasicBlock *TrueMBB = FuncInfo.MBBMap[BI->getSuccessor(0)];
+ MachineBasicBlock *FalseMBB = FuncInfo.MBBMap[BI->getSuccessor(1)];
+
+ // Fold the common case of a conditional branch with a comparison
+ // in the same block (values defined on other blocks may not have
+ // initialized registers).
+ if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) {
+ if (CI->hasOneUse() && CI->getParent() == I->getParent()) {
+ EVT VT = TLI.getValueType(CI->getOperand(0)->getType());
+
+ // Try to take advantage of fallthrough opportunities.
+ CmpInst::Predicate Predicate = CI->getPredicate();
+ if (FuncInfo.MBB->isLayoutSuccessor(TrueMBB)) {
+ std::swap(TrueMBB, FalseMBB);
+ Predicate = CmpInst::getInversePredicate(Predicate);
+ }
+
+ bool SwapArgs; // false -> compare Op0, Op1. true -> compare Op1, Op0.
+ unsigned BranchOpc; // Opcode to jump on, e.g. "X86::JA"
+
+ switch (Predicate) {
+ case CmpInst::FCMP_OEQ:
+ std::swap(TrueMBB, FalseMBB);
+ Predicate = CmpInst::FCMP_UNE;
+ // FALL THROUGH
+ case CmpInst::FCMP_UNE: SwapArgs = false; BranchOpc = X86::JNE_4; break;
+ case CmpInst::FCMP_OGT: SwapArgs = false; BranchOpc = X86::JA_4; break;
+ case CmpInst::FCMP_OGE: SwapArgs = false; BranchOpc = X86::JAE_4; break;
+ case CmpInst::FCMP_OLT: SwapArgs = true; BranchOpc = X86::JA_4; break;
+ case CmpInst::FCMP_OLE: SwapArgs = true; BranchOpc = X86::JAE_4; break;
+ case CmpInst::FCMP_ONE: SwapArgs = false; BranchOpc = X86::JNE_4; break;
+ case CmpInst::FCMP_ORD: SwapArgs = false; BranchOpc = X86::JNP_4; break;
+ case CmpInst::FCMP_UNO: SwapArgs = false; BranchOpc = X86::JP_4; break;
+ case CmpInst::FCMP_UEQ: SwapArgs = false; BranchOpc = X86::JE_4; break;
+ case CmpInst::FCMP_UGT: SwapArgs = true; BranchOpc = X86::JB_4; break;
+ case CmpInst::FCMP_UGE: SwapArgs = true; BranchOpc = X86::JBE_4; break;
+ case CmpInst::FCMP_ULT: SwapArgs = false; BranchOpc = X86::JB_4; break;
+ case CmpInst::FCMP_ULE: SwapArgs = false; BranchOpc = X86::JBE_4; break;
+
+ case CmpInst::ICMP_EQ: SwapArgs = false; BranchOpc = X86::JE_4; break;
+ case CmpInst::ICMP_NE: SwapArgs = false; BranchOpc = X86::JNE_4; break;
+ case CmpInst::ICMP_UGT: SwapArgs = false; BranchOpc = X86::JA_4; break;
+ case CmpInst::ICMP_UGE: SwapArgs = false; BranchOpc = X86::JAE_4; break;
+ case CmpInst::ICMP_ULT: SwapArgs = false; BranchOpc = X86::JB_4; break;
+ case CmpInst::ICMP_ULE: SwapArgs = false; BranchOpc = X86::JBE_4; break;
+ case CmpInst::ICMP_SGT: SwapArgs = false; BranchOpc = X86::JG_4; break;
+ case CmpInst::ICMP_SGE: SwapArgs = false; BranchOpc = X86::JGE_4; break;
+ case CmpInst::ICMP_SLT: SwapArgs = false; BranchOpc = X86::JL_4; break;
+ case CmpInst::ICMP_SLE: SwapArgs = false; BranchOpc = X86::JLE_4; break;
+ default:
+ return false;
+ }
+
+ const Value *Op0 = CI->getOperand(0), *Op1 = CI->getOperand(1);
+ if (SwapArgs)
+ std::swap(Op0, Op1);
+
+ // Emit a compare of the LHS and RHS, setting the flags.
+ if (!X86FastEmitCompare(Op0, Op1, VT))
+ return false;
+
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(BranchOpc))
+ .addMBB(TrueMBB);
+
+ if (Predicate == CmpInst::FCMP_UNE) {
+ // X86 requires a second branch to handle UNE (and OEQ,
+ // which is mapped to UNE above).
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::JP_4))
+ .addMBB(TrueMBB);
+ }
+
+ FastEmitBranch(FalseMBB, DL);
+ FuncInfo.MBB->addSuccessor(TrueMBB);
+ return true;
+ }
+ } else if (TruncInst *TI = dyn_cast<TruncInst>(BI->getCondition())) {
+ // Handle things like "%cond = trunc i32 %X to i1 / br i1 %cond", which
+ // typically happen for _Bool and C++ bools.
+ MVT SourceVT;
+ if (TI->hasOneUse() && TI->getParent() == I->getParent() &&
+ isTypeLegal(TI->getOperand(0)->getType(), SourceVT)) {
+ unsigned TestOpc = 0;
+ switch (SourceVT.SimpleTy) {
+ default: break;
+ case MVT::i8: TestOpc = X86::TEST8ri; break;
+ case MVT::i16: TestOpc = X86::TEST16ri; break;
+ case MVT::i32: TestOpc = X86::TEST32ri; break;
+ case MVT::i64: TestOpc = X86::TEST64ri32; break;
+ }
+ if (TestOpc) {
+ unsigned OpReg = getRegForValue(TI->getOperand(0));
+ if (OpReg == 0) return false;
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TestOpc))
+ .addReg(OpReg).addImm(1);
+
+ unsigned JmpOpc = X86::JNE_4;
+ if (FuncInfo.MBB->isLayoutSuccessor(TrueMBB)) {
+ std::swap(TrueMBB, FalseMBB);
+ JmpOpc = X86::JE_4;
+ }
+
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(JmpOpc))
+ .addMBB(TrueMBB);
+ FastEmitBranch(FalseMBB, DL);
+ FuncInfo.MBB->addSuccessor(TrueMBB);
+ return true;
+ }
+ }
+ }
+
+ // Otherwise do a clumsy setcc and re-test it.
+ // Note that i1 essentially gets ANY_EXTEND'ed to i8 where it isn't used
+ // in an explicit cast, so make sure to handle that correctly.
+ unsigned OpReg = getRegForValue(BI->getCondition());
+ if (OpReg == 0) return false;
+
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::TEST8ri))
+ .addReg(OpReg).addImm(1);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::JNE_4))
+ .addMBB(TrueMBB);
+ FastEmitBranch(FalseMBB, DL);
+ FuncInfo.MBB->addSuccessor(TrueMBB);
+ return true;
+}
+
+bool X86FastISel::X86SelectShift(const Instruction *I) {
+ unsigned CReg = 0, OpReg = 0;
+ const TargetRegisterClass *RC = NULL;
+ if (I->getType()->isIntegerTy(8)) {
+ CReg = X86::CL;
+ RC = &X86::GR8RegClass;
+ switch (I->getOpcode()) {
+ case Instruction::LShr: OpReg = X86::SHR8rCL; break;
+ case Instruction::AShr: OpReg = X86::SAR8rCL; break;
+ case Instruction::Shl: OpReg = X86::SHL8rCL; break;
+ default: return false;
+ }
+ } else if (I->getType()->isIntegerTy(16)) {
+ CReg = X86::CX;
+ RC = &X86::GR16RegClass;
+ switch (I->getOpcode()) {
+ case Instruction::LShr: OpReg = X86::SHR16rCL; break;
+ case Instruction::AShr: OpReg = X86::SAR16rCL; break;
+ case Instruction::Shl: OpReg = X86::SHL16rCL; break;
+ default: return false;
+ }
+ } else if (I->getType()->isIntegerTy(32)) {
+ CReg = X86::ECX;
+ RC = &X86::GR32RegClass;
+ switch (I->getOpcode()) {
+ case Instruction::LShr: OpReg = X86::SHR32rCL; break;
+ case Instruction::AShr: OpReg = X86::SAR32rCL; break;
+ case Instruction::Shl: OpReg = X86::SHL32rCL; break;
+ default: return false;
+ }
+ } else if (I->getType()->isIntegerTy(64)) {
+ CReg = X86::RCX;
+ RC = &X86::GR64RegClass;
+ switch (I->getOpcode()) {
+ case Instruction::LShr: OpReg = X86::SHR64rCL; break;
+ case Instruction::AShr: OpReg = X86::SAR64rCL; break;
+ case Instruction::Shl: OpReg = X86::SHL64rCL; break;
+ default: return false;
+ }
+ } else {
+ return false;
+ }
+
+ MVT VT;
+ if (!isTypeLegal(I->getType(), VT))
+ return false;
+
+ unsigned Op0Reg = getRegForValue(I->getOperand(0));
+ if (Op0Reg == 0) return false;
+
+ unsigned Op1Reg = getRegForValue(I->getOperand(1));
+ if (Op1Reg == 0) return false;
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
+ CReg).addReg(Op1Reg);
+
+ // The shift instruction uses X86::CL. If we defined a super-register
+ // of X86::CL, emit a subreg KILL to precisely describe what we're doing here.
+ if (CReg != X86::CL)
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ TII.get(TargetOpcode::KILL), X86::CL)
+ .addReg(CReg, RegState::Kill);
+
+ unsigned ResultReg = createResultReg(RC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(OpReg), ResultReg)
+ .addReg(Op0Reg);
+ UpdateValueMap(I, ResultReg);
+ return true;
+}
+
+bool X86FastISel::X86SelectDivRem(const Instruction *I) {
+ const static unsigned NumTypes = 4; // i8, i16, i32, i64
+ const static unsigned NumOps = 4; // SDiv, SRem, UDiv, URem
+ const static bool S = true; // IsSigned
+ const static bool U = false; // !IsSigned
+ const static unsigned Copy = TargetOpcode::COPY;
+ // For the X86 DIV/IDIV instruction, in most cases the dividend
+ // (numerator) must be in a specific register pair highreg:lowreg,
+ // producing the quotient in lowreg and the remainder in highreg.
+ // For most data types, to set up the instruction, the dividend is
+ // copied into lowreg, and lowreg is sign-extended or zero-extended
+ // into highreg. The exception is i8, where the dividend is defined
+ // as a single register rather than a register pair, and we
+ // therefore directly sign-extend or zero-extend the dividend into
+ // lowreg, instead of copying, and ignore the highreg.
+ const static struct DivRemEntry {
+ // The following portion depends only on the data type.
+ const TargetRegisterClass *RC;
+ unsigned LowInReg; // low part of the register pair
+ unsigned HighInReg; // high part of the register pair
+ // The following portion depends on both the data type and the operation.
+ struct DivRemResult {
+ unsigned OpDivRem; // The specific DIV/IDIV opcode to use.
+ unsigned OpSignExtend; // Opcode for sign-extending lowreg into
+ // highreg, or copying a zero into highreg.
+ unsigned OpCopy; // Opcode for copying dividend into lowreg, or
+ // zero/sign-extending into lowreg for i8.
+ unsigned DivRemResultReg; // Register containing the desired result.
+ bool IsOpSigned; // Whether to use signed or unsigned form.
+ } ResultTable[NumOps];
+ } OpTable[NumTypes] = {
+ { &X86::GR8RegClass, X86::AX, 0, {
+ { X86::IDIV8r, 0, X86::MOVSX16rr8, X86::AL, S }, // SDiv
+ { X86::IDIV8r, 0, X86::MOVSX16rr8, X86::AH, S }, // SRem
+ { X86::DIV8r, 0, X86::MOVZX16rr8, X86::AL, U }, // UDiv
+ { X86::DIV8r, 0, X86::MOVZX16rr8, X86::AH, U }, // URem
+ }
+ }, // i8
+ { &X86::GR16RegClass, X86::AX, X86::DX, {
+ { X86::IDIV16r, X86::CWD, Copy, X86::AX, S }, // SDiv
+ { X86::IDIV16r, X86::CWD, Copy, X86::DX, S }, // SRem
+ { X86::DIV16r, X86::MOV16r0, Copy, X86::AX, U }, // UDiv
+ { X86::DIV16r, X86::MOV16r0, Copy, X86::DX, U }, // URem
+ }
+ }, // i16
+ { &X86::GR32RegClass, X86::EAX, X86::EDX, {
+ { X86::IDIV32r, X86::CDQ, Copy, X86::EAX, S }, // SDiv
+ { X86::IDIV32r, X86::CDQ, Copy, X86::EDX, S }, // SRem
+ { X86::DIV32r, X86::MOV32r0, Copy, X86::EAX, U }, // UDiv
+ { X86::DIV32r, X86::MOV32r0, Copy, X86::EDX, U }, // URem
+ }
+ }, // i32
+ { &X86::GR64RegClass, X86::RAX, X86::RDX, {
+ { X86::IDIV64r, X86::CQO, Copy, X86::RAX, S }, // SDiv
+ { X86::IDIV64r, X86::CQO, Copy, X86::RDX, S }, // SRem
+ { X86::DIV64r, X86::MOV64r0, Copy, X86::RAX, U }, // UDiv
+ { X86::DIV64r, X86::MOV64r0, Copy, X86::RDX, U }, // URem
+ }
+ }, // i64
+ };
+
+ MVT VT;
+ if (!isTypeLegal(I->getType(), VT))
+ return false;
+
+ unsigned TypeIndex, OpIndex;
+ switch (VT.SimpleTy) {
+ default: return false;
+ case MVT::i8: TypeIndex = 0; break;
+ case MVT::i16: TypeIndex = 1; break;
+ case MVT::i32: TypeIndex = 2; break;
+ case MVT::i64: TypeIndex = 3;
+ if (!Subtarget->is64Bit())
+ return false;
+ break;
+ }
+
+ switch (I->getOpcode()) {
+ default: llvm_unreachable("Unexpected div/rem opcode");
+ case Instruction::SDiv: OpIndex = 0; break;
+ case Instruction::SRem: OpIndex = 1; break;
+ case Instruction::UDiv: OpIndex = 2; break;
+ case Instruction::URem: OpIndex = 3; break;
+ }
+
+ const DivRemEntry &TypeEntry = OpTable[TypeIndex];
+ const DivRemEntry::DivRemResult &OpEntry = TypeEntry.ResultTable[OpIndex];
+ unsigned Op0Reg = getRegForValue(I->getOperand(0));
+ if (Op0Reg == 0)
+ return false;
+ unsigned Op1Reg = getRegForValue(I->getOperand(1));
+ if (Op1Reg == 0)
+ return false;
+
+ // Move op0 into low-order input register.
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ TII.get(OpEntry.OpCopy), TypeEntry.LowInReg).addReg(Op0Reg);
+ // Zero-extend or sign-extend into high-order input register.
+ if (OpEntry.OpSignExtend) {
+ if (OpEntry.IsOpSigned)
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ TII.get(OpEntry.OpSignExtend));
+ else
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ TII.get(OpEntry.OpSignExtend), TypeEntry.HighInReg);
+ }
+ // Generate the DIV/IDIV instruction.
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ TII.get(OpEntry.OpDivRem)).addReg(Op1Reg);
+ // Copy output register into result register.
+ unsigned ResultReg = createResultReg(TypeEntry.RC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ TII.get(Copy), ResultReg).addReg(OpEntry.DivRemResultReg);
+ UpdateValueMap(I, ResultReg);
+
+ return true;
+}
+
+bool X86FastISel::X86SelectSelect(const Instruction *I) {
+ MVT VT;
+ if (!isTypeLegal(I->getType(), VT))
+ return false;
+
+ // We only use cmov here, if we don't have a cmov instruction bail.
+ if (!Subtarget->hasCMov()) return false;
+
+ unsigned Opc = 0;
+ const TargetRegisterClass *RC = NULL;
+ if (VT == MVT::i16) {
+ Opc = X86::CMOVE16rr;
+ RC = &X86::GR16RegClass;
+ } else if (VT == MVT::i32) {
+ Opc = X86::CMOVE32rr;
+ RC = &X86::GR32RegClass;
+ } else if (VT == MVT::i64) {
+ Opc = X86::CMOVE64rr;
+ RC = &X86::GR64RegClass;
+ } else {
+ return false;
+ }
+
+ unsigned Op0Reg = getRegForValue(I->getOperand(0));
+ if (Op0Reg == 0) return false;
+ unsigned Op1Reg = getRegForValue(I->getOperand(1));
+ if (Op1Reg == 0) return false;
+ unsigned Op2Reg = getRegForValue(I->getOperand(2));
+ if (Op2Reg == 0) return false;
+
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::TEST8rr))
+ .addReg(Op0Reg).addReg(Op0Reg);
+ unsigned ResultReg = createResultReg(RC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), ResultReg)
+ .addReg(Op1Reg).addReg(Op2Reg);
+ UpdateValueMap(I, ResultReg);
+ return true;
+}
+
+bool X86FastISel::X86SelectFPExt(const Instruction *I) {
+ // fpext from float to double.
+ if (X86ScalarSSEf64 &&
+ I->getType()->isDoubleTy()) {
+ const Value *V = I->getOperand(0);
+ if (V->getType()->isFloatTy()) {
+ unsigned OpReg = getRegForValue(V);
+ if (OpReg == 0) return false;
+ unsigned ResultReg = createResultReg(&X86::FR64RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ TII.get(X86::CVTSS2SDrr), ResultReg)
+ .addReg(OpReg);
+ UpdateValueMap(I, ResultReg);
+ return true;
+ }
+ }
+
+ return false;
+}
+
+bool X86FastISel::X86SelectFPTrunc(const Instruction *I) {
+ if (X86ScalarSSEf64) {
+ if (I->getType()->isFloatTy()) {
+ const Value *V = I->getOperand(0);
+ if (V->getType()->isDoubleTy()) {
+ unsigned OpReg = getRegForValue(V);
+ if (OpReg == 0) return false;
+ unsigned ResultReg = createResultReg(&X86::FR32RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ TII.get(X86::CVTSD2SSrr), ResultReg)
+ .addReg(OpReg);
+ UpdateValueMap(I, ResultReg);
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
+
+bool X86FastISel::X86SelectTrunc(const Instruction *I) {
+ EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType());
+ EVT DstVT = TLI.getValueType(I->getType());
+
+ // This code only handles truncation to byte.
+ if (DstVT != MVT::i8 && DstVT != MVT::i1)
+ return false;
+ if (!TLI.isTypeLegal(SrcVT))
+ return false;
+
+ unsigned InputReg = getRegForValue(I->getOperand(0));
+ if (!InputReg)
+ // Unhandled operand. Halt "fast" selection and bail.
+ return false;
+
+ if (SrcVT == MVT::i8) {
+ // Truncate from i8 to i1; no code needed.
+ UpdateValueMap(I, InputReg);
+ return true;
+ }
+
+ if (!Subtarget->is64Bit()) {
+ // If we're on x86-32; we can't extract an i8 from a general register.
+ // First issue a copy to GR16_ABCD or GR32_ABCD.
+ const TargetRegisterClass *CopyRC = (SrcVT == MVT::i16) ?
+ (const TargetRegisterClass*)&X86::GR16_ABCDRegClass :
+ (const TargetRegisterClass*)&X86::GR32_ABCDRegClass;
+ unsigned CopyReg = createResultReg(CopyRC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
+ CopyReg).addReg(InputReg);
+ InputReg = CopyReg;
+ }
+
+ // Issue an extract_subreg.
+ unsigned ResultReg = FastEmitInst_extractsubreg(MVT::i8,
+ InputReg, /*Kill=*/true,
+ X86::sub_8bit);
+ if (!ResultReg)
+ return false;
+
+ UpdateValueMap(I, ResultReg);
+ return true;
+}
+
+bool X86FastISel::IsMemcpySmall(uint64_t Len) {
+ return Len <= (Subtarget->is64Bit() ? 32 : 16);
+}
+
+bool X86FastISel::TryEmitSmallMemcpy(X86AddressMode DestAM,
+ X86AddressMode SrcAM, uint64_t Len) {
+
+ // Make sure we don't bloat code by inlining very large memcpy's.
+ if (!IsMemcpySmall(Len))
+ return false;
+
+ bool i64Legal = Subtarget->is64Bit();
+
+ // We don't care about alignment here since we just emit integer accesses.
+ while (Len) {
+ MVT VT;
+ if (Len >= 8 && i64Legal)
+ VT = MVT::i64;
+ else if (Len >= 4)
+ VT = MVT::i32;
+ else if (Len >= 2)
+ VT = MVT::i16;
+ else {
+ VT = MVT::i8;
+ }
+
+ unsigned Reg;
+ bool RV = X86FastEmitLoad(VT, SrcAM, Reg);
+ RV &= X86FastEmitStore(VT, Reg, DestAM);
+ assert(RV && "Failed to emit load or store??");
+
+ unsigned Size = VT.getSizeInBits()/8;
+ Len -= Size;
+ DestAM.Disp += Size;
+ SrcAM.Disp += Size;
+ }
+
+ return true;
+}
+
+bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) {
+ // FIXME: Handle more intrinsics.
+ switch (I.getIntrinsicID()) {
+ default: return false;
+ case Intrinsic::memcpy: {
+ const MemCpyInst &MCI = cast<MemCpyInst>(I);
+ // Don't handle volatile or variable length memcpys.
+ if (MCI.isVolatile())
+ return false;
+
+ if (isa<ConstantInt>(MCI.getLength())) {
+ // Small memcpy's are common enough that we want to do them
+ // without a call if possible.
+ uint64_t Len = cast<ConstantInt>(MCI.getLength())->getZExtValue();
+ if (IsMemcpySmall(Len)) {
+ X86AddressMode DestAM, SrcAM;
+ if (!X86SelectAddress(MCI.getRawDest(), DestAM) ||
+ !X86SelectAddress(MCI.getRawSource(), SrcAM))
+ return false;
+ TryEmitSmallMemcpy(DestAM, SrcAM, Len);
+ return true;
+ }
+ }
+
+ unsigned SizeWidth = Subtarget->is64Bit() ? 64 : 32;
+ if (!MCI.getLength()->getType()->isIntegerTy(SizeWidth))
+ return false;
+
+ if (MCI.getSourceAddressSpace() > 255 || MCI.getDestAddressSpace() > 255)
+ return false;
+
+ return DoSelectCall(&I, "memcpy");
+ }
+ case Intrinsic::memset: {
+ const MemSetInst &MSI = cast<MemSetInst>(I);
+
+ if (MSI.isVolatile())
+ return false;
+
+ unsigned SizeWidth = Subtarget->is64Bit() ? 64 : 32;
+ if (!MSI.getLength()->getType()->isIntegerTy(SizeWidth))
+ return false;
+
+ if (MSI.getDestAddressSpace() > 255)
+ return false;
+
+ return DoSelectCall(&I, "memset");
+ }
+ case Intrinsic::stackprotector: {
+ // Emit code to store the stack guard onto the stack.
+ EVT PtrTy = TLI.getPointerTy();
+
+ const Value *Op1 = I.getArgOperand(0); // The guard's value.
+ const AllocaInst *Slot = cast<AllocaInst>(I.getArgOperand(1));
+
+ // Grab the frame index.
+ X86AddressMode AM;
+ if (!X86SelectAddress(Slot, AM)) return false;
+ if (!X86FastEmitStore(PtrTy, Op1, AM)) return false;
+ return true;
+ }
+ case Intrinsic::dbg_declare: {
+ const DbgDeclareInst *DI = cast<DbgDeclareInst>(&I);
+ X86AddressMode AM;
+ assert(DI->getAddress() && "Null address should be checked earlier!");
+ if (!X86SelectAddress(DI->getAddress(), AM))
+ return false;
+ const MCInstrDesc &II = TII.get(TargetOpcode::DBG_VALUE);
+ // FIXME may need to add RegState::Debug to any registers produced,
+ // although ESP/EBP should be the only ones at the moment.
+ addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II), AM).
+ addImm(0).addMetadata(DI->getVariable());
+ return true;
+ }
+ case Intrinsic::trap: {
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::TRAP));
+ return true;
+ }
+ case Intrinsic::sadd_with_overflow:
+ case Intrinsic::uadd_with_overflow: {
+ // FIXME: Should fold immediates.
+
+ // Replace "add with overflow" intrinsics with an "add" instruction followed
+ // by a seto/setc instruction.
+ const Function *Callee = I.getCalledFunction();
+ Type *RetTy =
+ cast<StructType>(Callee->getReturnType())->getTypeAtIndex(unsigned(0));
+
+ MVT VT;
+ if (!isTypeLegal(RetTy, VT))
+ return false;
+
+ const Value *Op1 = I.getArgOperand(0);
+ const Value *Op2 = I.getArgOperand(1);
+ unsigned Reg1 = getRegForValue(Op1);
+ unsigned Reg2 = getRegForValue(Op2);
+
+ if (Reg1 == 0 || Reg2 == 0)
+ // FIXME: Handle values *not* in registers.
+ return false;
+
+ unsigned OpC = 0;
+ if (VT == MVT::i32)
+ OpC = X86::ADD32rr;
+ else if (VT == MVT::i64)
+ OpC = X86::ADD64rr;
+ else
+ return false;
+
+ // The call to CreateRegs builds two sequential registers, to store the
+ // both the returned values.
+ unsigned ResultReg = FuncInfo.CreateRegs(I.getType());
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(OpC), ResultReg)
+ .addReg(Reg1).addReg(Reg2);
+
+ unsigned Opc = X86::SETBr;
+ if (I.getIntrinsicID() == Intrinsic::sadd_with_overflow)
+ Opc = X86::SETOr;
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), ResultReg+1);
+
+ UpdateValueMap(&I, ResultReg, 2);
+ return true;
+ }
+ }
+}
+
+bool X86FastISel::FastLowerArguments() {
+ if (!FuncInfo.CanLowerReturn)
+ return false;
+
+ if (Subtarget->isTargetWin64())
+ return false;
+
+ const Function *F = FuncInfo.Fn;
+ if (F->isVarArg())
+ return false;
+
+ CallingConv::ID CC = F->getCallingConv();
+ if (CC != CallingConv::C)
+ return false;
+
+ if (!Subtarget->is64Bit())
+ return false;
+
+ // Only handle simple cases. i.e. Up to 6 i32/i64 scalar arguments.
+ unsigned Idx = 1;
+ for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
+ I != E; ++I, ++Idx) {
+ if (Idx > 6)
+ return false;
+
+ if (F->getAttributes().hasAttribute(Idx, Attribute::ByVal) ||
+ F->getAttributes().hasAttribute(Idx, Attribute::InReg) ||
+ F->getAttributes().hasAttribute(Idx, Attribute::StructRet) ||
+ F->getAttributes().hasAttribute(Idx, Attribute::Nest))
+ return false;
+
+ Type *ArgTy = I->getType();
+ if (ArgTy->isStructTy() || ArgTy->isArrayTy() || ArgTy->isVectorTy())
+ return false;
+
+ EVT ArgVT = TLI.getValueType(ArgTy);
+ if (!ArgVT.isSimple()) return false;
+ switch (ArgVT.getSimpleVT().SimpleTy) {
+ case MVT::i32:
+ case MVT::i64:
+ break;
+ default:
+ return false;
+ }
+ }
+
+ static const uint16_t GPR32ArgRegs[] = {
+ X86::EDI, X86::ESI, X86::EDX, X86::ECX, X86::R8D, X86::R9D
+ };
+ static const uint16_t GPR64ArgRegs[] = {
+ X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8 , X86::R9
+ };
+
+ Idx = 0;
+ const TargetRegisterClass *RC32 = TLI.getRegClassFor(MVT::i32);
+ const TargetRegisterClass *RC64 = TLI.getRegClassFor(MVT::i64);
+ for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
+ I != E; ++I, ++Idx) {
+ if (I->use_empty())
+ continue;
+ bool is32Bit = TLI.getValueType(I->getType()) == MVT::i32;
+ const TargetRegisterClass *RC = is32Bit ? RC32 : RC64;
+ unsigned SrcReg = is32Bit ? GPR32ArgRegs[Idx] : GPR64ArgRegs[Idx];
+ unsigned DstReg = FuncInfo.MF->addLiveIn(SrcReg, RC);
+ // FIXME: Unfortunately it's necessary to emit a copy from the livein copy.
+ // Without this, EmitLiveInCopies may eliminate the livein if its only
+ // use is a bitcast (which isn't turned into an instruction).
+ unsigned ResultReg = createResultReg(RC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
+ ResultReg).addReg(DstReg, getKillRegState(true));
+ UpdateValueMap(I, ResultReg);
+ }
+ return true;
+}
+
+bool X86FastISel::X86SelectCall(const Instruction *I) {
+ const CallInst *CI = cast<CallInst>(I);
+ const Value *Callee = CI->getCalledValue();
+
+ // Can't handle inline asm yet.
+ if (isa<InlineAsm>(Callee))
+ return false;
+
+ // Handle intrinsic calls.
+ if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI))
+ return X86VisitIntrinsicCall(*II);
+
+ // Allow SelectionDAG isel to handle tail calls.
+ if (cast<CallInst>(I)->isTailCall())
+ return false;
+
+ return DoSelectCall(I, 0);
+}
+
+static unsigned computeBytesPoppedByCallee(const X86Subtarget &Subtarget,
+ const ImmutableCallSite &CS) {
+ if (Subtarget.is64Bit())
+ return 0;
+ if (Subtarget.isTargetWindows())
+ return 0;
+ CallingConv::ID CC = CS.getCallingConv();
+ if (CC == CallingConv::Fast || CC == CallingConv::GHC)
+ return 0;
+ if (!CS.paramHasAttr(1, Attribute::StructRet))
+ return 0;
+ if (CS.paramHasAttr(1, Attribute::InReg))
+ return 0;
+ return 4;
+}
+
+// Select either a call, or an llvm.memcpy/memmove/memset intrinsic
+bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) {
+ const CallInst *CI = cast<CallInst>(I);
+ const Value *Callee = CI->getCalledValue();
+
+ // Handle only C and fastcc calling conventions for now.
+ ImmutableCallSite CS(CI);
+ CallingConv::ID CC = CS.getCallingConv();
+ if (CC != CallingConv::C && CC != CallingConv::Fast &&
+ CC != CallingConv::X86_FastCall)
+ return false;
+
+ // fastcc with -tailcallopt is intended to provide a guaranteed
+ // tail call optimization. Fastisel doesn't know how to do that.
+ if (CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt)
+ return false;
+
+ PointerType *PT = cast<PointerType>(CS.getCalledValue()->getType());
+ FunctionType *FTy = cast<FunctionType>(PT->getElementType());
+ bool isVarArg = FTy->isVarArg();
+
+ // Don't know how to handle Win64 varargs yet. Nothing special needed for
+ // x86-32. Special handling for x86-64 is implemented.
+ if (isVarArg && Subtarget->isTargetWin64())
+ return false;
+
+ // Fast-isel doesn't know about callee-pop yet.
+ if (X86::isCalleePop(CC, Subtarget->is64Bit(), isVarArg,
+ TM.Options.GuaranteedTailCallOpt))
+ return false;
+
+ // Check whether the function can return without sret-demotion.
+ SmallVector<ISD::OutputArg, 4> Outs;
+ GetReturnInfo(I->getType(), CS.getAttributes(), Outs, TLI);
+ bool CanLowerReturn = TLI.CanLowerReturn(CS.getCallingConv(),
+ *FuncInfo.MF, FTy->isVarArg(),
+ Outs, FTy->getContext());
+ if (!CanLowerReturn)
+ return false;
+
+ // Materialize callee address in a register. FIXME: GV address can be
+ // handled with a CALLpcrel32 instead.
+ X86AddressMode CalleeAM;
+ if (!X86SelectCallAddress(Callee, CalleeAM))
+ return false;
+ unsigned CalleeOp = 0;
+ const GlobalValue *GV = 0;
+ if (CalleeAM.GV != 0) {
+ GV = CalleeAM.GV;
+ } else if (CalleeAM.Base.Reg != 0) {
+ CalleeOp = CalleeAM.Base.Reg;
+ } else
+ return false;
+
+ // Deal with call operands first.
+ SmallVector<const Value *, 8> ArgVals;
+ SmallVector<unsigned, 8> Args;
+ SmallVector<MVT, 8> ArgVTs;
+ SmallVector<ISD::ArgFlagsTy, 8> ArgFlags;
+ unsigned arg_size = CS.arg_size();
+ Args.reserve(arg_size);
+ ArgVals.reserve(arg_size);
+ ArgVTs.reserve(arg_size);
+ ArgFlags.reserve(arg_size);
+ for (ImmutableCallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end();
+ i != e; ++i) {
+ // If we're lowering a mem intrinsic instead of a regular call, skip the
+ // last two arguments, which should not passed to the underlying functions.
+ if (MemIntName && e-i <= 2)
+ break;
+ Value *ArgVal = *i;
+ ISD::ArgFlagsTy Flags;
+ unsigned AttrInd = i - CS.arg_begin() + 1;
+ if (CS.paramHasAttr(AttrInd, Attribute::SExt))
+ Flags.setSExt();
+ if (CS.paramHasAttr(AttrInd, Attribute::ZExt))
+ Flags.setZExt();
+
+ if (CS.paramHasAttr(AttrInd, Attribute::ByVal)) {
+ PointerType *Ty = cast<PointerType>(ArgVal->getType());
+ Type *ElementTy = Ty->getElementType();
+ unsigned FrameSize = TD.getTypeAllocSize(ElementTy);
+ unsigned FrameAlign = CS.getParamAlignment(AttrInd);
+ if (!FrameAlign)
+ FrameAlign = TLI.getByValTypeAlignment(ElementTy);
+ Flags.setByVal();
+ Flags.setByValSize(FrameSize);
+ Flags.setByValAlign(FrameAlign);
+ if (!IsMemcpySmall(FrameSize))
+ return false;
+ }
+
+ if (CS.paramHasAttr(AttrInd, Attribute::InReg))
+ Flags.setInReg();
+ if (CS.paramHasAttr(AttrInd, Attribute::Nest))
+ Flags.setNest();
+
+ // If this is an i1/i8/i16 argument, promote to i32 to avoid an extra
+ // instruction. This is safe because it is common to all fastisel supported
+ // calling conventions on x86.
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(ArgVal)) {
+ if (CI->getBitWidth() == 1 || CI->getBitWidth() == 8 ||
+ CI->getBitWidth() == 16) {
+ if (Flags.isSExt())
+ ArgVal = ConstantExpr::getSExt(CI,Type::getInt32Ty(CI->getContext()));
+ else
+ ArgVal = ConstantExpr::getZExt(CI,Type::getInt32Ty(CI->getContext()));
+ }
+ }
+
+ unsigned ArgReg;
+
+ // Passing bools around ends up doing a trunc to i1 and passing it.
+ // Codegen this as an argument + "and 1".
+ if (ArgVal->getType()->isIntegerTy(1) && isa<TruncInst>(ArgVal) &&
+ cast<TruncInst>(ArgVal)->getParent() == I->getParent() &&
+ ArgVal->hasOneUse()) {
+ ArgVal = cast<TruncInst>(ArgVal)->getOperand(0);
+ ArgReg = getRegForValue(ArgVal);
+ if (ArgReg == 0) return false;
+
+ MVT ArgVT;
+ if (!isTypeLegal(ArgVal->getType(), ArgVT)) return false;
+
+ ArgReg = FastEmit_ri(ArgVT, ArgVT, ISD::AND, ArgReg,
+ ArgVal->hasOneUse(), 1);
+ } else {
+ ArgReg = getRegForValue(ArgVal);
+ }
+
+ if (ArgReg == 0) return false;
+
+ Type *ArgTy = ArgVal->getType();
+ MVT ArgVT;
+ if (!isTypeLegal(ArgTy, ArgVT))
+ return false;
+ if (ArgVT == MVT::x86mmx)
+ return false;
+ unsigned OriginalAlignment = TD.getABITypeAlignment(ArgTy);
+ Flags.setOrigAlign(OriginalAlignment);
+
+ Args.push_back(ArgReg);
+ ArgVals.push_back(ArgVal);
+ ArgVTs.push_back(ArgVT);
+ ArgFlags.push_back(Flags);
+ }
+
+ // Analyze operands of the call, assigning locations to each operand.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CC, isVarArg, *FuncInfo.MF, TM, ArgLocs,
+ I->getParent()->getContext());
+
+ // Allocate shadow area for Win64
+ if (Subtarget->isTargetWin64())
+ CCInfo.AllocateStack(32, 8);
+
+ CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags, CC_X86);
+
+ // Get a count of how many bytes are to be pushed on the stack.
+ unsigned NumBytes = CCInfo.getNextStackOffset();
+
+ // Issue CALLSEQ_START
+ unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(AdjStackDown))
+ .addImm(NumBytes);
+
+ // Process argument: walk the register/memloc assignments, inserting
+ // copies / loads.
+ SmallVector<unsigned, 4> RegArgs;
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+ unsigned Arg = Args[VA.getValNo()];
+ EVT ArgVT = ArgVTs[VA.getValNo()];
+
+ // Promote the value if needed.
+ switch (VA.getLocInfo()) {
+ case CCValAssign::Full: break;
+ case CCValAssign::SExt: {
+ assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() &&
+ "Unexpected extend");
+ bool Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(),
+ Arg, ArgVT, Arg);
+ assert(Emitted && "Failed to emit a sext!"); (void)Emitted;
+ ArgVT = VA.getLocVT();
+ break;
+ }
+ case CCValAssign::ZExt: {
+ assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() &&
+ "Unexpected extend");
+ bool Emitted = X86FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(),
+ Arg, ArgVT, Arg);
+ assert(Emitted && "Failed to emit a zext!"); (void)Emitted;
+ ArgVT = VA.getLocVT();
+ break;
+ }
+ case CCValAssign::AExt: {
+ assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() &&
+ "Unexpected extend");
+ bool Emitted = X86FastEmitExtend(ISD::ANY_EXTEND, VA.getLocVT(),
+ Arg, ArgVT, Arg);
+ if (!Emitted)
+ Emitted = X86FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(),
+ Arg, ArgVT, Arg);
+ if (!Emitted)
+ Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(),
+ Arg, ArgVT, Arg);
+
+ assert(Emitted && "Failed to emit a aext!"); (void)Emitted;
+ ArgVT = VA.getLocVT();
+ break;
+ }
+ case CCValAssign::BCvt: {
+ unsigned BC = FastEmit_r(ArgVT.getSimpleVT(), VA.getLocVT(),
+ ISD::BITCAST, Arg, /*TODO: Kill=*/false);
+ assert(BC != 0 && "Failed to emit a bitcast!");
+ Arg = BC;
+ ArgVT = VA.getLocVT();
+ break;
+ }
+ case CCValAssign::VExt:
+ // VExt has not been implemented, so this should be impossible to reach
+ // for now. However, fallback to Selection DAG isel once implemented.
+ return false;
+ case CCValAssign::Indirect:
+ // FIXME: Indirect doesn't need extending, but fast-isel doesn't fully
+ // support this.
+ return false;
+ }
+
+ if (VA.isRegLoc()) {
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
+ VA.getLocReg()).addReg(Arg);
+ RegArgs.push_back(VA.getLocReg());
+ } else {
+ unsigned LocMemOffset = VA.getLocMemOffset();
+ X86AddressMode AM;
+ AM.Base.Reg = RegInfo->getStackRegister();
+ AM.Disp = LocMemOffset;
+ const Value *ArgVal = ArgVals[VA.getValNo()];
+ ISD::ArgFlagsTy Flags = ArgFlags[VA.getValNo()];
+
+ if (Flags.isByVal()) {
+ X86AddressMode SrcAM;
+ SrcAM.Base.Reg = Arg;
+ bool Res = TryEmitSmallMemcpy(AM, SrcAM, Flags.getByValSize());
+ assert(Res && "memcpy length already checked!"); (void)Res;
+ } else if (isa<ConstantInt>(ArgVal) || isa<ConstantPointerNull>(ArgVal)) {
+ // If this is a really simple value, emit this with the Value* version
+ // of X86FastEmitStore. If it isn't simple, we don't want to do this,
+ // as it can cause us to reevaluate the argument.
+ if (!X86FastEmitStore(ArgVT, ArgVal, AM))
+ return false;
+ } else {
+ if (!X86FastEmitStore(ArgVT, Arg, AM))
+ return false;
+ }
+ }
+ }
+
+ // ELF / PIC requires GOT in the EBX register before function calls via PLT
+ // GOT pointer.
+ if (Subtarget->isPICStyleGOT()) {
+ unsigned Base = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
+ X86::EBX).addReg(Base);
+ }
+
+ if (Subtarget->is64Bit() && isVarArg && !Subtarget->isTargetWin64()) {
+ // Count the number of XMM registers allocated.
+ static const uint16_t XMMArgRegs[] = {
+ X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
+ X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
+ };
+ unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::MOV8ri),
+ X86::AL).addImm(NumXMMRegs);
+ }
+
+ // Issue the call.
+ MachineInstrBuilder MIB;
+ if (CalleeOp) {
+ // Register-indirect call.
+ unsigned CallOpc;
+ if (Subtarget->is64Bit())
+ CallOpc = X86::CALL64r;
+ else
+ CallOpc = X86::CALL32r;
+ MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(CallOpc))
+ .addReg(CalleeOp);
+
+ } else {
+ // Direct call.
+ assert(GV && "Not a direct call");
+ unsigned CallOpc;
+ if (Subtarget->is64Bit())
+ CallOpc = X86::CALL64pcrel32;
+ else
+ CallOpc = X86::CALLpcrel32;
+
+ // See if we need any target-specific flags on the GV operand.
+ unsigned char OpFlags = 0;
+
+ // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
+ // external symbols most go through the PLT in PIC mode. If the symbol
+ // has hidden or protected visibility, or if it is static or local, then
+ // we don't need to use the PLT - we can directly call it.
+ if (Subtarget->isTargetELF() &&
+ TM.getRelocationModel() == Reloc::PIC_ &&
+ GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
+ OpFlags = X86II::MO_PLT;
+ } else if (Subtarget->isPICStyleStubAny() &&
+ (GV->isDeclaration() || GV->isWeakForLinker()) &&
+ (!Subtarget->getTargetTriple().isMacOSX() ||
+ Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
+ // PC-relative references to external symbols should go through $stub,
+ // unless we're building with the leopard linker or later, which
+ // automatically synthesizes these stubs.
+ OpFlags = X86II::MO_DARWIN_STUB;
+ }
+
+
+ MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(CallOpc));
+ if (MemIntName)
+ MIB.addExternalSymbol(MemIntName, OpFlags);
+ else
+ MIB.addGlobalAddress(GV, 0, OpFlags);
+ }
+
+ // Add a register mask with the call-preserved registers.
+ // Proper defs for return values will be added by setPhysRegsDeadExcept().
+ MIB.addRegMask(TRI.getCallPreservedMask(CS.getCallingConv()));
+
+ // Add an implicit use GOT pointer in EBX.
+ if (Subtarget->isPICStyleGOT())
+ MIB.addReg(X86::EBX, RegState::Implicit);
+
+ if (Subtarget->is64Bit() && isVarArg && !Subtarget->isTargetWin64())
+ MIB.addReg(X86::AL, RegState::Implicit);
+
+ // Add implicit physical register uses to the call.
+ for (unsigned i = 0, e = RegArgs.size(); i != e; ++i)
+ MIB.addReg(RegArgs[i], RegState::Implicit);
+
+ // Issue CALLSEQ_END
+ unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
+ const unsigned NumBytesCallee = computeBytesPoppedByCallee(*Subtarget, CS);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(AdjStackUp))
+ .addImm(NumBytes).addImm(NumBytesCallee);
+
+ // Build info for return calling conv lowering code.
+ // FIXME: This is practically a copy-paste from TargetLowering::LowerCallTo.
+ SmallVector<ISD::InputArg, 32> Ins;
+ SmallVector<EVT, 4> RetTys;
+ ComputeValueVTs(TLI, I->getType(), RetTys);
+ for (unsigned i = 0, e = RetTys.size(); i != e; ++i) {
+ EVT VT = RetTys[i];
+ MVT RegisterVT = TLI.getRegisterType(I->getParent()->getContext(), VT);
+ unsigned NumRegs = TLI.getNumRegisters(I->getParent()->getContext(), VT);
+ for (unsigned j = 0; j != NumRegs; ++j) {
+ ISD::InputArg MyFlags;
+ MyFlags.VT = RegisterVT;
+ MyFlags.Used = !CS.getInstruction()->use_empty();
+ if (CS.paramHasAttr(0, Attribute::SExt))
+ MyFlags.Flags.setSExt();
+ if (CS.paramHasAttr(0, Attribute::ZExt))
+ MyFlags.Flags.setZExt();
+ if (CS.paramHasAttr(0, Attribute::InReg))
+ MyFlags.Flags.setInReg();
+ Ins.push_back(MyFlags);
+ }
+ }
+
+ // Now handle call return values.
+ SmallVector<unsigned, 4> UsedRegs;
+ SmallVector<CCValAssign, 16> RVLocs;
+ CCState CCRetInfo(CC, false, *FuncInfo.MF, TM, RVLocs,
+ I->getParent()->getContext());
+ unsigned ResultReg = FuncInfo.CreateRegs(I->getType());
+ CCRetInfo.AnalyzeCallResult(Ins, RetCC_X86);
+ for (unsigned i = 0; i != RVLocs.size(); ++i) {
+ EVT CopyVT = RVLocs[i].getValVT();
+ unsigned CopyReg = ResultReg + i;
+
+ // If this is a call to a function that returns an fp value on the x87 fp
+ // stack, but where we prefer to use the value in xmm registers, copy it
+ // out as F80 and use a truncate to move it from fp stack reg to xmm reg.
+ if ((RVLocs[i].getLocReg() == X86::ST0 ||
+ RVLocs[i].getLocReg() == X86::ST1)) {
+ if (isScalarFPTypeInSSEReg(RVLocs[i].getValVT())) {
+ CopyVT = MVT::f80;
+ CopyReg = createResultReg(&X86::RFP80RegClass);
+ }
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::FpPOP_RETVAL),
+ CopyReg);
+ } else {
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
+ CopyReg).addReg(RVLocs[i].getLocReg());
+ UsedRegs.push_back(RVLocs[i].getLocReg());
+ }
+
+ if (CopyVT != RVLocs[i].getValVT()) {
+ // Round the F80 the right size, which also moves to the appropriate xmm
+ // register. This is accomplished by storing the F80 value in memory and
+ // then loading it back. Ewww...
+ EVT ResVT = RVLocs[i].getValVT();
+ unsigned Opc = ResVT == MVT::f32 ? X86::ST_Fp80m32 : X86::ST_Fp80m64;
+ unsigned MemSize = ResVT.getSizeInBits()/8;
+ int FI = MFI.CreateStackObject(MemSize, MemSize, false);
+ addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ TII.get(Opc)), FI)
+ .addReg(CopyReg);
+ Opc = ResVT == MVT::f32 ? X86::MOVSSrm : X86::MOVSDrm;
+ addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ TII.get(Opc), ResultReg + i), FI);
+ }
+ }
+
+ if (RVLocs.size())
+ UpdateValueMap(I, ResultReg, RVLocs.size());
+
+ // Set all unused physreg defs as dead.
+ static_cast<MachineInstr *>(MIB)->setPhysRegsDeadExcept(UsedRegs, TRI);
+
+ return true;
+}
+
+
+bool
+X86FastISel::TargetSelectInstruction(const Instruction *I) {
+ switch (I->getOpcode()) {
+ default: break;
+ case Instruction::Load:
+ return X86SelectLoad(I);
+ case Instruction::Store:
+ return X86SelectStore(I);
+ case Instruction::Ret:
+ return X86SelectRet(I);
+ case Instruction::ICmp:
+ case Instruction::FCmp:
+ return X86SelectCmp(I);
+ case Instruction::ZExt:
+ return X86SelectZExt(I);
+ case Instruction::Br:
+ return X86SelectBranch(I);
+ case Instruction::Call:
+ return X86SelectCall(I);
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::Shl:
+ return X86SelectShift(I);
+ case Instruction::SDiv:
+ case Instruction::UDiv:
+ case Instruction::SRem:
+ case Instruction::URem:
+ return X86SelectDivRem(I);
+ case Instruction::Select:
+ return X86SelectSelect(I);
+ case Instruction::Trunc:
+ return X86SelectTrunc(I);
+ case Instruction::FPExt:
+ return X86SelectFPExt(I);
+ case Instruction::FPTrunc:
+ return X86SelectFPTrunc(I);
+ case Instruction::IntToPtr: // Deliberate fall-through.
+ case Instruction::PtrToInt: {
+ EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType());
+ EVT DstVT = TLI.getValueType(I->getType());
+ if (DstVT.bitsGT(SrcVT))
+ return X86SelectZExt(I);
+ if (DstVT.bitsLT(SrcVT))
+ return X86SelectTrunc(I);
+ unsigned Reg = getRegForValue(I->getOperand(0));
+ if (Reg == 0) return false;
+ UpdateValueMap(I, Reg);
+ return true;
+ }
+ }
+
+ return false;
+}
+
+unsigned X86FastISel::TargetMaterializeConstant(const Constant *C) {
+ MVT VT;
+ if (!isTypeLegal(C->getType(), VT))
+ return 0;
+
+ // Can't handle alternate code models yet.
+ if (TM.getCodeModel() != CodeModel::Small)
+ return 0;
+
+ // Get opcode and regclass of the output for the given load instruction.
+ unsigned Opc = 0;
+ const TargetRegisterClass *RC = NULL;
+ switch (VT.SimpleTy) {
+ default: return 0;
+ case MVT::i8:
+ Opc = X86::MOV8rm;
+ RC = &X86::GR8RegClass;
+ break;
+ case MVT::i16:
+ Opc = X86::MOV16rm;
+ RC = &X86::GR16RegClass;
+ break;
+ case MVT::i32:
+ Opc = X86::MOV32rm;
+ RC = &X86::GR32RegClass;
+ break;
+ case MVT::i64:
+ // Must be in x86-64 mode.
+ Opc = X86::MOV64rm;
+ RC = &X86::GR64RegClass;
+ break;
+ case MVT::f32:
+ if (X86ScalarSSEf32) {
+ Opc = Subtarget->hasAVX() ? X86::VMOVSSrm : X86::MOVSSrm;
+ RC = &X86::FR32RegClass;
+ } else {
+ Opc = X86::LD_Fp32m;
+ RC = &X86::RFP32RegClass;
+ }
+ break;
+ case MVT::f64:
+ if (X86ScalarSSEf64) {
+ Opc = Subtarget->hasAVX() ? X86::VMOVSDrm : X86::MOVSDrm;
+ RC = &X86::FR64RegClass;
+ } else {
+ Opc = X86::LD_Fp64m;
+ RC = &X86::RFP64RegClass;
+ }
+ break;
+ case MVT::f80:
+ // No f80 support yet.
+ return 0;
+ }
+
+ // Materialize addresses with LEA instructions.
+ if (isa<GlobalValue>(C)) {
+ X86AddressMode AM;
+ if (X86SelectAddress(C, AM)) {
+ // If the expression is just a basereg, then we're done, otherwise we need
+ // to emit an LEA.
+ if (AM.BaseType == X86AddressMode::RegBase &&
+ AM.IndexReg == 0 && AM.Disp == 0 && AM.GV == 0)
+ return AM.Base.Reg;
+
+ Opc = TLI.getPointerTy() == MVT::i32 ? X86::LEA32r : X86::LEA64r;
+ unsigned ResultReg = createResultReg(RC);
+ addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ TII.get(Opc), ResultReg), AM);
+ return ResultReg;
+ }
+ return 0;
+ }
+
+ // MachineConstantPool wants an explicit alignment.
+ unsigned Align = TD.getPrefTypeAlignment(C->getType());
+ if (Align == 0) {
+ // Alignment of vector types. FIXME!
+ Align = TD.getTypeAllocSize(C->getType());
+ }
+
+ // x86-32 PIC requires a PIC base register for constant pools.
+ unsigned PICBase = 0;
+ unsigned char OpFlag = 0;
+ if (Subtarget->isPICStyleStubPIC()) { // Not dynamic-no-pic
+ OpFlag = X86II::MO_PIC_BASE_OFFSET;
+ PICBase = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
+ } else if (Subtarget->isPICStyleGOT()) {
+ OpFlag = X86II::MO_GOTOFF;
+ PICBase = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
+ } else if (Subtarget->isPICStyleRIPRel() &&
+ TM.getCodeModel() == CodeModel::Small) {
+ PICBase = X86::RIP;
+ }
+
+ // Create the load from the constant pool.
+ unsigned MCPOffset = MCP.getConstantPoolIndex(C, Align);
+ unsigned ResultReg = createResultReg(RC);
+ addConstantPoolReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ TII.get(Opc), ResultReg),
+ MCPOffset, PICBase, OpFlag);
+
+ return ResultReg;
+}
+
+unsigned X86FastISel::TargetMaterializeAlloca(const AllocaInst *C) {
+ // Fail on dynamic allocas. At this point, getRegForValue has already
+ // checked its CSE maps, so if we're here trying to handle a dynamic
+ // alloca, we're not going to succeed. X86SelectAddress has a
+ // check for dynamic allocas, because it's called directly from
+ // various places, but TargetMaterializeAlloca also needs a check
+ // in order to avoid recursion between getRegForValue,
+ // X86SelectAddrss, and TargetMaterializeAlloca.
+ if (!FuncInfo.StaticAllocaMap.count(C))
+ return 0;
+
+ X86AddressMode AM;
+ if (!X86SelectAddress(C, AM))
+ return 0;
+ unsigned Opc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r;
+ const TargetRegisterClass* RC = TLI.getRegClassFor(TLI.getPointerTy());
+ unsigned ResultReg = createResultReg(RC);
+ addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ TII.get(Opc), ResultReg), AM);
+ return ResultReg;
+}
+
+unsigned X86FastISel::TargetMaterializeFloatZero(const ConstantFP *CF) {
+ MVT VT;
+ if (!isTypeLegal(CF->getType(), VT))
+ return 0;
+
+ // Get opcode and regclass for the given zero.
+ unsigned Opc = 0;
+ const TargetRegisterClass *RC = NULL;
+ switch (VT.SimpleTy) {
+ default: return 0;
+ case MVT::f32:
+ if (X86ScalarSSEf32) {
+ Opc = X86::FsFLD0SS;
+ RC = &X86::FR32RegClass;
+ } else {
+ Opc = X86::LD_Fp032;
+ RC = &X86::RFP32RegClass;
+ }
+ break;
+ case MVT::f64:
+ if (X86ScalarSSEf64) {
+ Opc = X86::FsFLD0SD;
+ RC = &X86::FR64RegClass;
+ } else {
+ Opc = X86::LD_Fp064;
+ RC = &X86::RFP64RegClass;
+ }
+ break;
+ case MVT::f80:
+ // No f80 support yet.
+ return 0;
+ }
+
+ unsigned ResultReg = createResultReg(RC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), ResultReg);
+ return ResultReg;
+}
+
+
+bool X86FastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
+ const LoadInst *LI) {
+ X86AddressMode AM;
+ if (!X86SelectAddress(LI->getOperand(0), AM))
+ return false;
+
+ const X86InstrInfo &XII = (const X86InstrInfo&)TII;
+
+ unsigned Size = TD.getTypeAllocSize(LI->getType());
+ unsigned Alignment = LI->getAlignment();
+
+ SmallVector<MachineOperand, 8> AddrOps;
+ AM.getFullAddress(AddrOps);
+
+ MachineInstr *Result =
+ XII.foldMemoryOperandImpl(*FuncInfo.MF, MI, OpNo, AddrOps, Size, Alignment);
+ if (Result == 0) return false;
+
+ FuncInfo.MBB->insert(FuncInfo.InsertPt, Result);
+ MI->eraseFromParent();
+ return true;
+}
+
+
+namespace llvm {
+ FastISel *X86::createFastISel(FunctionLoweringInfo &funcInfo,
+ const TargetLibraryInfo *libInfo) {
+ return new X86FastISel(funcInfo, libInfo);
+ }
+}
diff --git a/contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp b/contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp
new file mode 100644
index 0000000..0dd034c
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp
@@ -0,0 +1,253 @@
+//===-- X86FixupLEAs.cpp - use or replace LEA instructions -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the pass which will find instructions which
+// can be re-written as LEA instructions in order to reduce pipeline
+// delays for some models of the Intel Atom family.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "x86-fixup-LEAs"
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+using namespace llvm;
+
+STATISTIC(NumLEAs, "Number of LEA instructions created");
+
+namespace {
+ class FixupLEAPass : public MachineFunctionPass {
+ enum RegUsageState { RU_NotUsed, RU_Write, RU_Read };
+ static char ID;
+ /// \brief Loop over all of the instructions in the basic block
+ /// replacing applicable instructions with LEA instructions,
+ /// where appropriate.
+ bool processBasicBlock(MachineFunction &MF, MachineFunction::iterator MFI);
+
+ virtual const char *getPassName() const { return "X86 Atom LEA Fixup";}
+
+ /// \brief Given a machine register, look for the instruction
+ /// which writes it in the current basic block. If found,
+ /// try to replace it with an equivalent LEA instruction.
+ /// If replacement succeeds, then also process the the newly created
+ /// instruction.
+ void seekLEAFixup(MachineOperand& p, MachineBasicBlock::iterator& I,
+ MachineFunction::iterator MFI);
+
+ /// \brief Given a memory access or LEA instruction
+ /// whose address mode uses a base and/or index register, look for
+ /// an opportunity to replace the instruction which sets the base or index
+ /// register with an equivalent LEA instruction.
+ void processInstruction(MachineBasicBlock::iterator& I,
+ MachineFunction::iterator MFI);
+
+ /// \brief Determine if an instruction references a machine register
+ /// and, if so, whether it reads or writes the register.
+ RegUsageState usesRegister(MachineOperand& p,
+ MachineBasicBlock::iterator I);
+
+ /// \brief Step backwards through a basic block, looking
+ /// for an instruction which writes a register within
+ /// a maximum of INSTR_DISTANCE_THRESHOLD instruction latency cycles.
+ MachineBasicBlock::iterator searchBackwards(MachineOperand& p,
+ MachineBasicBlock::iterator& I,
+ MachineFunction::iterator MFI);
+
+ /// \brief if an instruction can be converted to an
+ /// equivalent LEA, insert the new instruction into the basic block
+ /// and return a pointer to it. Otherwise, return zero.
+ MachineInstr* postRAConvertToLEA(MachineFunction::iterator &MFI,
+ MachineBasicBlock::iterator &MBBI) const;
+
+ public:
+ FixupLEAPass() : MachineFunctionPass(ID) {}
+
+ /// \brief Loop over all of the basic blocks,
+ /// replacing instructions by equivalent LEA instructions
+ /// if needed and when possible.
+ virtual bool runOnMachineFunction(MachineFunction &MF);
+
+ private:
+ MachineFunction *MF;
+ const TargetMachine *TM;
+ const TargetInstrInfo *TII; // Machine instruction info.
+
+ };
+ char FixupLEAPass::ID = 0;
+}
+
+MachineInstr *
+FixupLEAPass::postRAConvertToLEA(MachineFunction::iterator &MFI,
+ MachineBasicBlock::iterator &MBBI) const {
+ MachineInstr* MI = MBBI;
+ MachineInstr* NewMI;
+ switch (MI->getOpcode()) {
+ case X86::MOV32rr:
+ case X86::MOV64rr: {
+ const MachineOperand& Src = MI->getOperand(1);
+ const MachineOperand& Dest = MI->getOperand(0);
+ NewMI = BuildMI(*MF, MI->getDebugLoc(),
+ TII->get( MI->getOpcode() == X86::MOV32rr ? X86::LEA32r : X86::LEA64r))
+ .addOperand(Dest)
+ .addOperand(Src).addImm(1).addReg(0).addImm(0).addReg(0);
+ MFI->insert(MBBI, NewMI); // Insert the new inst
+ return NewMI;
+ }
+ case X86::ADD64ri32:
+ case X86::ADD64ri8:
+ case X86::ADD64ri32_DB:
+ case X86::ADD64ri8_DB:
+ case X86::ADD32ri:
+ case X86::ADD32ri8:
+ case X86::ADD32ri_DB:
+ case X86::ADD32ri8_DB:
+ case X86::ADD16ri:
+ case X86::ADD16ri8:
+ case X86::ADD16ri_DB:
+ case X86::ADD16ri8_DB:
+ if (!MI->getOperand(2).isImm()) {
+ // convertToThreeAddress will call getImm()
+ // which requires isImm() to be true
+ return 0;
+ }
+ }
+ return TII->convertToThreeAddress(MFI, MBBI, 0);
+}
+
+FunctionPass *llvm::createX86FixupLEAs() {
+ return new FixupLEAPass();
+}
+
+bool FixupLEAPass::runOnMachineFunction(MachineFunction &Func) {
+ MF = &Func;
+ TII = Func.getTarget().getInstrInfo();
+ TM = &MF->getTarget();
+
+ DEBUG(dbgs() << "Start X86FixupLEAs\n";);
+ // Process all basic blocks.
+ for (MachineFunction::iterator I = Func.begin(), E = Func.end(); I != E; ++I)
+ processBasicBlock(Func, I);
+ DEBUG(dbgs() << "End X86FixupLEAs\n";);
+
+ return true;
+}
+
+FixupLEAPass::RegUsageState FixupLEAPass::usesRegister(MachineOperand& p,
+ MachineBasicBlock::iterator I) {
+ RegUsageState RegUsage = RU_NotUsed;
+ MachineInstr* MI = I;
+
+ for (unsigned int i = 0; i < MI->getNumOperands(); ++i) {
+ MachineOperand& opnd = MI->getOperand(i);
+ if (opnd.isReg() && opnd.getReg() == p.getReg()){
+ if (opnd.isDef())
+ return RU_Write;
+ RegUsage = RU_Read;
+ }
+ }
+ return RegUsage;
+}
+
+/// getPreviousInstr - Given a reference to an instruction in a basic
+/// block, return a reference to the previous instruction in the block,
+/// wrapping around to the last instruction of the block if the block
+/// branches to itself.
+static inline bool getPreviousInstr(MachineBasicBlock::iterator& I,
+ MachineFunction::iterator MFI) {
+ if (I == MFI->begin()) {
+ if (MFI->isPredecessor(MFI)) {
+ I = --MFI->end();
+ return true;
+ }
+ else
+ return false;
+ }
+ --I;
+ return true;
+}
+
+MachineBasicBlock::iterator FixupLEAPass::searchBackwards(MachineOperand& p,
+ MachineBasicBlock::iterator& I,
+ MachineFunction::iterator MFI) {
+ int InstrDistance = 1;
+ MachineBasicBlock::iterator CurInst;
+ static const int INSTR_DISTANCE_THRESHOLD = 5;
+
+ CurInst = I;
+ bool Found;
+ Found = getPreviousInstr(CurInst, MFI);
+ while( Found && I != CurInst) {
+ if (CurInst->isCall() || CurInst->isInlineAsm())
+ break;
+ if (InstrDistance > INSTR_DISTANCE_THRESHOLD)
+ break; // too far back to make a difference
+ if (usesRegister(p, CurInst) == RU_Write){
+ return CurInst;
+ }
+ InstrDistance += TII->getInstrLatency(TM->getInstrItineraryData(), CurInst);
+ Found = getPreviousInstr(CurInst, MFI);
+ }
+ return 0;
+}
+
+void FixupLEAPass::processInstruction(MachineBasicBlock::iterator& I,
+ MachineFunction::iterator MFI) {
+ // Process a load, store, or LEA instruction.
+ MachineInstr *MI = I;
+ int opcode = MI->getOpcode();
+ const MCInstrDesc& Desc = MI->getDesc();
+ int AddrOffset = X86II::getMemoryOperandNo(Desc.TSFlags, opcode);
+ if (AddrOffset >= 0) {
+ AddrOffset += X86II::getOperandBias(Desc);
+ MachineOperand& p = MI->getOperand(AddrOffset + X86::AddrBaseReg);
+ if (p.isReg() && p.getReg() != X86::ESP) {
+ seekLEAFixup(p, I, MFI);
+ }
+ MachineOperand& q = MI->getOperand(AddrOffset + X86::AddrIndexReg);
+ if (q.isReg() && q.getReg() != X86::ESP) {
+ seekLEAFixup(q, I, MFI);
+ }
+ }
+}
+
+void FixupLEAPass::seekLEAFixup(MachineOperand& p,
+ MachineBasicBlock::iterator& I,
+ MachineFunction::iterator MFI) {
+ MachineBasicBlock::iterator MBI = searchBackwards(p, I, MFI);
+ if (MBI) {
+ MachineInstr* NewMI = postRAConvertToLEA(MFI, MBI);
+ if (NewMI) {
+ ++NumLEAs;
+ DEBUG(dbgs() << "Candidate to replace:"; MBI->dump(););
+ // now to replace with an equivalent LEA...
+ DEBUG(dbgs() << "Replaced by: "; NewMI->dump(););
+ MFI->erase(MBI);
+ MachineBasicBlock::iterator J =
+ static_cast<MachineBasicBlock::iterator> (NewMI);
+ processInstruction(J, MFI);
+ }
+ }
+}
+
+bool FixupLEAPass::processBasicBlock(MachineFunction &MF,
+ MachineFunction::iterator MFI) {
+
+ for (MachineBasicBlock::iterator I = MFI->begin(); I != MFI->end(); ++I)
+ processInstruction(I, MFI);
+ return false;
+}
diff --git a/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp b/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp
new file mode 100644
index 0000000..0585b43
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp
@@ -0,0 +1,1768 @@
+//===-- X86FloatingPoint.cpp - Floating point Reg -> Stack converter ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the pass which converts floating point instructions from
+// pseudo registers into register stack instructions. This pass uses live
+// variable information to indicate where the FPn registers are used and their
+// lifetimes.
+//
+// The x87 hardware tracks liveness of the stack registers, so it is necessary
+// to implement exact liveness tracking between basic blocks. The CFG edges are
+// partitioned into bundles where the same FP registers must be live in
+// identical stack positions. Instructions are inserted at the end of each basic
+// block to rearrange the live registers to match the outgoing bundle.
+//
+// This approach avoids splitting critical edges at the potential cost of more
+// live register shuffling instructions when critical edges are present.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "x86-codegen"
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/EdgeBundles.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include <algorithm>
+using namespace llvm;
+
+STATISTIC(NumFXCH, "Number of fxch instructions inserted");
+STATISTIC(NumFP , "Number of floating point instructions");
+
+namespace {
+ struct FPS : public MachineFunctionPass {
+ static char ID;
+ FPS() : MachineFunctionPass(ID) {
+ initializeEdgeBundlesPass(*PassRegistry::getPassRegistry());
+ // This is really only to keep valgrind quiet.
+ // The logic in isLive() is too much for it.
+ memset(Stack, 0, sizeof(Stack));
+ memset(RegMap, 0, sizeof(RegMap));
+ }
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+ AU.addRequired<EdgeBundles>();
+ AU.addPreservedID(MachineLoopInfoID);
+ AU.addPreservedID(MachineDominatorsID);
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ virtual bool runOnMachineFunction(MachineFunction &MF);
+
+ virtual const char *getPassName() const { return "X86 FP Stackifier"; }
+
+ private:
+ const TargetInstrInfo *TII; // Machine instruction info.
+
+ // Two CFG edges are related if they leave the same block, or enter the same
+ // block. The transitive closure of an edge under this relation is a
+ // LiveBundle. It represents a set of CFG edges where the live FP stack
+ // registers must be allocated identically in the x87 stack.
+ //
+ // A LiveBundle is usually all the edges leaving a block, or all the edges
+ // entering a block, but it can contain more edges if critical edges are
+ // present.
+ //
+ // The set of live FP registers in a LiveBundle is calculated by bundleCFG,
+ // but the exact mapping of FP registers to stack slots is fixed later.
+ struct LiveBundle {
+ // Bit mask of live FP registers. Bit 0 = FP0, bit 1 = FP1, &c.
+ unsigned Mask;
+
+ // Number of pre-assigned live registers in FixStack. This is 0 when the
+ // stack order has not yet been fixed.
+ unsigned FixCount;
+
+ // Assigned stack order for live-in registers.
+ // FixStack[i] == getStackEntry(i) for all i < FixCount.
+ unsigned char FixStack[8];
+
+ LiveBundle() : Mask(0), FixCount(0) {}
+
+ // Have the live registers been assigned a stack order yet?
+ bool isFixed() const { return !Mask || FixCount; }
+ };
+
+ // Numbered LiveBundle structs. LiveBundles[0] is used for all CFG edges
+ // with no live FP registers.
+ SmallVector<LiveBundle, 8> LiveBundles;
+
+ // The edge bundle analysis provides indices into the LiveBundles vector.
+ EdgeBundles *Bundles;
+
+ // Return a bitmask of FP registers in block's live-in list.
+ static unsigned calcLiveInMask(MachineBasicBlock *MBB) {
+ unsigned Mask = 0;
+ for (MachineBasicBlock::livein_iterator I = MBB->livein_begin(),
+ E = MBB->livein_end(); I != E; ++I) {
+ unsigned Reg = *I - X86::FP0;
+ if (Reg < 8)
+ Mask |= 1 << Reg;
+ }
+ return Mask;
+ }
+
+ // Partition all the CFG edges into LiveBundles.
+ void bundleCFG(MachineFunction &MF);
+
+ MachineBasicBlock *MBB; // Current basic block
+
+ // The hardware keeps track of how many FP registers are live, so we have
+ // to model that exactly. Usually, each live register corresponds to an
+ // FP<n> register, but when dealing with calls, returns, and inline
+ // assembly, it is sometimes necessary to have live scratch registers.
+ unsigned Stack[8]; // FP<n> Registers in each stack slot...
+ unsigned StackTop; // The current top of the FP stack.
+
+ enum {
+ NumFPRegs = 16 // Including scratch pseudo-registers.
+ };
+
+ // For each live FP<n> register, point to its Stack[] entry.
+ // The first entries correspond to FP0-FP6, the rest are scratch registers
+ // used when we need slightly different live registers than what the
+ // register allocator thinks.
+ unsigned RegMap[NumFPRegs];
+
+ // Pending fixed registers - Inline assembly needs FP registers to appear
+ // in fixed stack slot positions. This is handled by copying FP registers
+ // to ST registers before the instruction, and copying back after the
+ // instruction.
+ //
+ // This is modeled with pending ST registers. NumPendingSTs is the number
+ // of ST registers (ST0-STn) we are tracking. PendingST[n] points to an FP
+ // register that holds the ST value. The ST registers are not moved into
+ // place until immediately before the instruction that needs them.
+ //
+ // It can happen that we need an ST register to be live when no FP register
+ // holds the value:
+ //
+ // %ST0 = COPY %FP4<kill>
+ //
+ // When that happens, we allocate a scratch FP register to hold the ST
+ // value. That means every register in PendingST must be live.
+
+ unsigned NumPendingSTs;
+ unsigned char PendingST[8];
+
+ // Set up our stack model to match the incoming registers to MBB.
+ void setupBlockStack();
+
+ // Shuffle live registers to match the expectations of successor blocks.
+ void finishBlockStack();
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ void dumpStack() const {
+ dbgs() << "Stack contents:";
+ for (unsigned i = 0; i != StackTop; ++i) {
+ dbgs() << " FP" << Stack[i];
+ assert(RegMap[Stack[i]] == i && "Stack[] doesn't match RegMap[]!");
+ }
+ for (unsigned i = 0; i != NumPendingSTs; ++i)
+ dbgs() << ", ST" << i << " in FP" << unsigned(PendingST[i]);
+ dbgs() << "\n";
+ }
+#endif
+
+ /// getSlot - Return the stack slot number a particular register number is
+ /// in.
+ unsigned getSlot(unsigned RegNo) const {
+ assert(RegNo < NumFPRegs && "Regno out of range!");
+ return RegMap[RegNo];
+ }
+
+ /// isLive - Is RegNo currently live in the stack?
+ bool isLive(unsigned RegNo) const {
+ unsigned Slot = getSlot(RegNo);
+ return Slot < StackTop && Stack[Slot] == RegNo;
+ }
+
+ /// getScratchReg - Return an FP register that is not currently in use.
+ unsigned getScratchReg() const {
+ for (int i = NumFPRegs - 1; i >= 8; --i)
+ if (!isLive(i))
+ return i;
+ llvm_unreachable("Ran out of scratch FP registers");
+ }
+
+ /// isScratchReg - Returns trus if RegNo is a scratch FP register.
+ static bool isScratchReg(unsigned RegNo) {
+ return RegNo > 8 && RegNo < NumFPRegs;
+ }
+
+ /// getStackEntry - Return the X86::FP<n> register in register ST(i).
+ unsigned getStackEntry(unsigned STi) const {
+ if (STi >= StackTop)
+ report_fatal_error("Access past stack top!");
+ return Stack[StackTop-1-STi];
+ }
+
+ /// getSTReg - Return the X86::ST(i) register which contains the specified
+ /// FP<RegNo> register.
+ unsigned getSTReg(unsigned RegNo) const {
+ return StackTop - 1 - getSlot(RegNo) + X86::ST0;
+ }
+
+ // pushReg - Push the specified FP<n> register onto the stack.
+ void pushReg(unsigned Reg) {
+ assert(Reg < NumFPRegs && "Register number out of range!");
+ if (StackTop >= 8)
+ report_fatal_error("Stack overflow!");
+ Stack[StackTop] = Reg;
+ RegMap[Reg] = StackTop++;
+ }
+
+ bool isAtTop(unsigned RegNo) const { return getSlot(RegNo) == StackTop-1; }
+ void moveToTop(unsigned RegNo, MachineBasicBlock::iterator I) {
+ DebugLoc dl = I == MBB->end() ? DebugLoc() : I->getDebugLoc();
+ if (isAtTop(RegNo)) return;
+
+ unsigned STReg = getSTReg(RegNo);
+ unsigned RegOnTop = getStackEntry(0);
+
+ // Swap the slots the regs are in.
+ std::swap(RegMap[RegNo], RegMap[RegOnTop]);
+
+ // Swap stack slot contents.
+ if (RegMap[RegOnTop] >= StackTop)
+ report_fatal_error("Access past stack top!");
+ std::swap(Stack[RegMap[RegOnTop]], Stack[StackTop-1]);
+
+ // Emit an fxch to update the runtime processors version of the state.
+ BuildMI(*MBB, I, dl, TII->get(X86::XCH_F)).addReg(STReg);
+ ++NumFXCH;
+ }
+
+ void duplicateToTop(unsigned RegNo, unsigned AsReg, MachineInstr *I) {
+ DebugLoc dl = I == MBB->end() ? DebugLoc() : I->getDebugLoc();
+ unsigned STReg = getSTReg(RegNo);
+ pushReg(AsReg); // New register on top of stack
+
+ BuildMI(*MBB, I, dl, TII->get(X86::LD_Frr)).addReg(STReg);
+ }
+
+ /// duplicatePendingSTBeforeKill - The instruction at I is about to kill
+ /// RegNo. If any PendingST registers still need the RegNo value, duplicate
+ /// them to new scratch registers.
+ void duplicatePendingSTBeforeKill(unsigned RegNo, MachineInstr *I) {
+ for (unsigned i = 0; i != NumPendingSTs; ++i) {
+ if (PendingST[i] != RegNo)
+ continue;
+ unsigned SR = getScratchReg();
+ DEBUG(dbgs() << "Duplicating pending ST" << i
+ << " in FP" << RegNo << " to FP" << SR << '\n');
+ duplicateToTop(RegNo, SR, I);
+ PendingST[i] = SR;
+ }
+ }
+
+ /// popStackAfter - Pop the current value off of the top of the FP stack
+ /// after the specified instruction.
+ void popStackAfter(MachineBasicBlock::iterator &I);
+
+ /// freeStackSlotAfter - Free the specified register from the register
+ /// stack, so that it is no longer in a register. If the register is
+ /// currently at the top of the stack, we just pop the current instruction,
+ /// otherwise we store the current top-of-stack into the specified slot,
+ /// then pop the top of stack.
+ void freeStackSlotAfter(MachineBasicBlock::iterator &I, unsigned Reg);
+
+ /// freeStackSlotBefore - Just the pop, no folding. Return the inserted
+ /// instruction.
+ MachineBasicBlock::iterator
+ freeStackSlotBefore(MachineBasicBlock::iterator I, unsigned FPRegNo);
+
+ /// Adjust the live registers to be the set in Mask.
+ void adjustLiveRegs(unsigned Mask, MachineBasicBlock::iterator I);
+
+ /// Shuffle the top FixCount stack entries such that FP reg FixStack[0] is
+ /// st(0), FP reg FixStack[1] is st(1) etc.
+ void shuffleStackTop(const unsigned char *FixStack, unsigned FixCount,
+ MachineBasicBlock::iterator I);
+
+ bool processBasicBlock(MachineFunction &MF, MachineBasicBlock &MBB);
+
+ void handleZeroArgFP(MachineBasicBlock::iterator &I);
+ void handleOneArgFP(MachineBasicBlock::iterator &I);
+ void handleOneArgFPRW(MachineBasicBlock::iterator &I);
+ void handleTwoArgFP(MachineBasicBlock::iterator &I);
+ void handleCompareFP(MachineBasicBlock::iterator &I);
+ void handleCondMovFP(MachineBasicBlock::iterator &I);
+ void handleSpecialFP(MachineBasicBlock::iterator &I);
+
+ // Check if a COPY instruction is using FP registers.
+ static bool isFPCopy(MachineInstr *MI) {
+ unsigned DstReg = MI->getOperand(0).getReg();
+ unsigned SrcReg = MI->getOperand(1).getReg();
+
+ return X86::RFP80RegClass.contains(DstReg) ||
+ X86::RFP80RegClass.contains(SrcReg);
+ }
+ };
+ char FPS::ID = 0;
+}
+
+FunctionPass *llvm::createX86FloatingPointStackifierPass() { return new FPS(); }
+
+/// getFPReg - Return the X86::FPx register number for the specified operand.
+/// For example, this returns 3 for X86::FP3.
+static unsigned getFPReg(const MachineOperand &MO) {
+ assert(MO.isReg() && "Expected an FP register!");
+ unsigned Reg = MO.getReg();
+ assert(Reg >= X86::FP0 && Reg <= X86::FP6 && "Expected FP register!");
+ return Reg - X86::FP0;
+}
+
+/// runOnMachineFunction - Loop over all of the basic blocks, transforming FP
+/// register references into FP stack references.
+///
+bool FPS::runOnMachineFunction(MachineFunction &MF) {
+ // We only need to run this pass if there are any FP registers used in this
+ // function. If it is all integer, there is nothing for us to do!
+ bool FPIsUsed = false;
+
+ assert(X86::FP6 == X86::FP0+6 && "Register enums aren't sorted right!");
+ for (unsigned i = 0; i <= 6; ++i)
+ if (MF.getRegInfo().isPhysRegUsed(X86::FP0+i)) {
+ FPIsUsed = true;
+ break;
+ }
+
+ // Early exit.
+ if (!FPIsUsed) return false;
+
+ Bundles = &getAnalysis<EdgeBundles>();
+ TII = MF.getTarget().getInstrInfo();
+
+ // Prepare cross-MBB liveness.
+ bundleCFG(MF);
+
+ StackTop = 0;
+
+ // Process the function in depth first order so that we process at least one
+ // of the predecessors for every reachable block in the function.
+ SmallPtrSet<MachineBasicBlock*, 8> Processed;
+ MachineBasicBlock *Entry = MF.begin();
+
+ bool Changed = false;
+ for (df_ext_iterator<MachineBasicBlock*, SmallPtrSet<MachineBasicBlock*, 8> >
+ I = df_ext_begin(Entry, Processed), E = df_ext_end(Entry, Processed);
+ I != E; ++I)
+ Changed |= processBasicBlock(MF, **I);
+
+ // Process any unreachable blocks in arbitrary order now.
+ if (MF.size() != Processed.size())
+ for (MachineFunction::iterator BB = MF.begin(), E = MF.end(); BB != E; ++BB)
+ if (Processed.insert(BB))
+ Changed |= processBasicBlock(MF, *BB);
+
+ LiveBundles.clear();
+
+ return Changed;
+}
+
+/// bundleCFG - Scan all the basic blocks to determine consistent live-in and
+/// live-out sets for the FP registers. Consistent means that the set of
+/// registers live-out from a block is identical to the live-in set of all
+/// successors. This is not enforced by the normal live-in lists since
+/// registers may be implicitly defined, or not used by all successors.
+void FPS::bundleCFG(MachineFunction &MF) {
+ assert(LiveBundles.empty() && "Stale data in LiveBundles");
+ LiveBundles.resize(Bundles->getNumBundles());
+
+ // Gather the actual live-in masks for all MBBs.
+ for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) {
+ MachineBasicBlock *MBB = I;
+ const unsigned Mask = calcLiveInMask(MBB);
+ if (!Mask)
+ continue;
+ // Update MBB ingoing bundle mask.
+ LiveBundles[Bundles->getBundle(MBB->getNumber(), false)].Mask |= Mask;
+ }
+}
+
+/// processBasicBlock - Loop over all of the instructions in the basic block,
+/// transforming FP instructions into their stack form.
+///
+bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) {
+ bool Changed = false;
+ MBB = &BB;
+ NumPendingSTs = 0;
+
+ setupBlockStack();
+
+ for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I) {
+ MachineInstr *MI = I;
+ uint64_t Flags = MI->getDesc().TSFlags;
+
+ unsigned FPInstClass = Flags & X86II::FPTypeMask;
+ if (MI->isInlineAsm())
+ FPInstClass = X86II::SpecialFP;
+
+ if (MI->isCopy() && isFPCopy(MI))
+ FPInstClass = X86II::SpecialFP;
+
+ if (MI->isImplicitDef() &&
+ X86::RFP80RegClass.contains(MI->getOperand(0).getReg()))
+ FPInstClass = X86II::SpecialFP;
+
+ if (FPInstClass == X86II::NotFP)
+ continue; // Efficiently ignore non-fp insts!
+
+ MachineInstr *PrevMI = 0;
+ if (I != BB.begin())
+ PrevMI = prior(I);
+
+ ++NumFP; // Keep track of # of pseudo instrs
+ DEBUG(dbgs() << "\nFPInst:\t" << *MI);
+
+ // Get dead variables list now because the MI pointer may be deleted as part
+ // of processing!
+ SmallVector<unsigned, 8> DeadRegs;
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+ if (MO.isReg() && MO.isDead())
+ DeadRegs.push_back(MO.getReg());
+ }
+
+ switch (FPInstClass) {
+ case X86II::ZeroArgFP: handleZeroArgFP(I); break;
+ case X86II::OneArgFP: handleOneArgFP(I); break; // fstp ST(0)
+ case X86II::OneArgFPRW: handleOneArgFPRW(I); break; // ST(0) = fsqrt(ST(0))
+ case X86II::TwoArgFP: handleTwoArgFP(I); break;
+ case X86II::CompareFP: handleCompareFP(I); break;
+ case X86II::CondMovFP: handleCondMovFP(I); break;
+ case X86II::SpecialFP: handleSpecialFP(I); break;
+ default: llvm_unreachable("Unknown FP Type!");
+ }
+
+ // Check to see if any of the values defined by this instruction are dead
+ // after definition. If so, pop them.
+ for (unsigned i = 0, e = DeadRegs.size(); i != e; ++i) {
+ unsigned Reg = DeadRegs[i];
+ if (Reg >= X86::FP0 && Reg <= X86::FP6) {
+ DEBUG(dbgs() << "Register FP#" << Reg-X86::FP0 << " is dead!\n");
+ freeStackSlotAfter(I, Reg-X86::FP0);
+ }
+ }
+
+ // Print out all of the instructions expanded to if -debug
+ DEBUG(
+ MachineBasicBlock::iterator PrevI(PrevMI);
+ if (I == PrevI) {
+ dbgs() << "Just deleted pseudo instruction\n";
+ } else {
+ MachineBasicBlock::iterator Start = I;
+ // Rewind to first instruction newly inserted.
+ while (Start != BB.begin() && prior(Start) != PrevI) --Start;
+ dbgs() << "Inserted instructions:\n\t";
+ Start->print(dbgs(), &MF.getTarget());
+ while (++Start != llvm::next(I)) {}
+ }
+ dumpStack();
+ );
+ (void)PrevMI;
+
+ Changed = true;
+ }
+
+ finishBlockStack();
+
+ return Changed;
+}
+
+/// setupBlockStack - Use the live bundles to set up our model of the stack
+/// to match predecessors' live out stack.
+void FPS::setupBlockStack() {
+ DEBUG(dbgs() << "\nSetting up live-ins for BB#" << MBB->getNumber()
+ << " derived from " << MBB->getName() << ".\n");
+ StackTop = 0;
+ // Get the live-in bundle for MBB.
+ const LiveBundle &Bundle =
+ LiveBundles[Bundles->getBundle(MBB->getNumber(), false)];
+
+ if (!Bundle.Mask) {
+ DEBUG(dbgs() << "Block has no FP live-ins.\n");
+ return;
+ }
+
+ // Depth-first iteration should ensure that we always have an assigned stack.
+ assert(Bundle.isFixed() && "Reached block before any predecessors");
+
+ // Push the fixed live-in registers.
+ for (unsigned i = Bundle.FixCount; i > 0; --i) {
+ MBB->addLiveIn(X86::ST0+i-1);
+ DEBUG(dbgs() << "Live-in st(" << (i-1) << "): %FP"
+ << unsigned(Bundle.FixStack[i-1]) << '\n');
+ pushReg(Bundle.FixStack[i-1]);
+ }
+
+ // Kill off unwanted live-ins. This can happen with a critical edge.
+ // FIXME: We could keep these live registers around as zombies. They may need
+ // to be revived at the end of a short block. It might save a few instrs.
+ adjustLiveRegs(calcLiveInMask(MBB), MBB->begin());
+ DEBUG(MBB->dump());
+}
+
+/// finishBlockStack - Revive live-outs that are implicitly defined out of
+/// MBB. Shuffle live registers to match the expected fixed stack of any
+/// predecessors, and ensure that all predecessors are expecting the same
+/// stack.
+void FPS::finishBlockStack() {
+ // The RET handling below takes care of return blocks for us.
+ if (MBB->succ_empty())
+ return;
+
+ DEBUG(dbgs() << "Setting up live-outs for BB#" << MBB->getNumber()
+ << " derived from " << MBB->getName() << ".\n");
+
+ // Get MBB's live-out bundle.
+ unsigned BundleIdx = Bundles->getBundle(MBB->getNumber(), true);
+ LiveBundle &Bundle = LiveBundles[BundleIdx];
+
+ // We may need to kill and define some registers to match successors.
+ // FIXME: This can probably be combined with the shuffle below.
+ MachineBasicBlock::iterator Term = MBB->getFirstTerminator();
+ adjustLiveRegs(Bundle.Mask, Term);
+
+ if (!Bundle.Mask) {
+ DEBUG(dbgs() << "No live-outs.\n");
+ return;
+ }
+
+ // Has the stack order been fixed yet?
+ DEBUG(dbgs() << "LB#" << BundleIdx << ": ");
+ if (Bundle.isFixed()) {
+ DEBUG(dbgs() << "Shuffling stack to match.\n");
+ shuffleStackTop(Bundle.FixStack, Bundle.FixCount, Term);
+ } else {
+ // Not fixed yet, we get to choose.
+ DEBUG(dbgs() << "Fixing stack order now.\n");
+ Bundle.FixCount = StackTop;
+ for (unsigned i = 0; i < StackTop; ++i)
+ Bundle.FixStack[i] = getStackEntry(i);
+ }
+}
+
+
+//===----------------------------------------------------------------------===//
+// Efficient Lookup Table Support
+//===----------------------------------------------------------------------===//
+
+namespace {
+ struct TableEntry {
+ uint16_t from;
+ uint16_t to;
+ bool operator<(const TableEntry &TE) const { return from < TE.from; }
+ friend bool operator<(const TableEntry &TE, unsigned V) {
+ return TE.from < V;
+ }
+ friend bool LLVM_ATTRIBUTE_UNUSED operator<(unsigned V,
+ const TableEntry &TE) {
+ return V < TE.from;
+ }
+ };
+}
+
+#ifndef NDEBUG
+static bool TableIsSorted(const TableEntry *Table, unsigned NumEntries) {
+ for (unsigned i = 0; i != NumEntries-1; ++i)
+ if (!(Table[i] < Table[i+1])) return false;
+ return true;
+}
+#endif
+
+static int Lookup(const TableEntry *Table, unsigned N, unsigned Opcode) {
+ const TableEntry *I = std::lower_bound(Table, Table+N, Opcode);
+ if (I != Table+N && I->from == Opcode)
+ return I->to;
+ return -1;
+}
+
+#ifdef NDEBUG
+#define ASSERT_SORTED(TABLE)
+#else
+#define ASSERT_SORTED(TABLE) \
+ { static bool TABLE##Checked = false; \
+ if (!TABLE##Checked) { \
+ assert(TableIsSorted(TABLE, array_lengthof(TABLE)) && \
+ "All lookup tables must be sorted for efficient access!"); \
+ TABLE##Checked = true; \
+ } \
+ }
+#endif
+
+//===----------------------------------------------------------------------===//
+// Register File -> Register Stack Mapping Methods
+//===----------------------------------------------------------------------===//
+
+// OpcodeTable - Sorted map of register instructions to their stack version.
+// The first element is an register file pseudo instruction, the second is the
+// concrete X86 instruction which uses the register stack.
+//
+static const TableEntry OpcodeTable[] = {
+ { X86::ABS_Fp32 , X86::ABS_F },
+ { X86::ABS_Fp64 , X86::ABS_F },
+ { X86::ABS_Fp80 , X86::ABS_F },
+ { X86::ADD_Fp32m , X86::ADD_F32m },
+ { X86::ADD_Fp64m , X86::ADD_F64m },
+ { X86::ADD_Fp64m32 , X86::ADD_F32m },
+ { X86::ADD_Fp80m32 , X86::ADD_F32m },
+ { X86::ADD_Fp80m64 , X86::ADD_F64m },
+ { X86::ADD_FpI16m32 , X86::ADD_FI16m },
+ { X86::ADD_FpI16m64 , X86::ADD_FI16m },
+ { X86::ADD_FpI16m80 , X86::ADD_FI16m },
+ { X86::ADD_FpI32m32 , X86::ADD_FI32m },
+ { X86::ADD_FpI32m64 , X86::ADD_FI32m },
+ { X86::ADD_FpI32m80 , X86::ADD_FI32m },
+ { X86::CHS_Fp32 , X86::CHS_F },
+ { X86::CHS_Fp64 , X86::CHS_F },
+ { X86::CHS_Fp80 , X86::CHS_F },
+ { X86::CMOVBE_Fp32 , X86::CMOVBE_F },
+ { X86::CMOVBE_Fp64 , X86::CMOVBE_F },
+ { X86::CMOVBE_Fp80 , X86::CMOVBE_F },
+ { X86::CMOVB_Fp32 , X86::CMOVB_F },
+ { X86::CMOVB_Fp64 , X86::CMOVB_F },
+ { X86::CMOVB_Fp80 , X86::CMOVB_F },
+ { X86::CMOVE_Fp32 , X86::CMOVE_F },
+ { X86::CMOVE_Fp64 , X86::CMOVE_F },
+ { X86::CMOVE_Fp80 , X86::CMOVE_F },
+ { X86::CMOVNBE_Fp32 , X86::CMOVNBE_F },
+ { X86::CMOVNBE_Fp64 , X86::CMOVNBE_F },
+ { X86::CMOVNBE_Fp80 , X86::CMOVNBE_F },
+ { X86::CMOVNB_Fp32 , X86::CMOVNB_F },
+ { X86::CMOVNB_Fp64 , X86::CMOVNB_F },
+ { X86::CMOVNB_Fp80 , X86::CMOVNB_F },
+ { X86::CMOVNE_Fp32 , X86::CMOVNE_F },
+ { X86::CMOVNE_Fp64 , X86::CMOVNE_F },
+ { X86::CMOVNE_Fp80 , X86::CMOVNE_F },
+ { X86::CMOVNP_Fp32 , X86::CMOVNP_F },
+ { X86::CMOVNP_Fp64 , X86::CMOVNP_F },
+ { X86::CMOVNP_Fp80 , X86::CMOVNP_F },
+ { X86::CMOVP_Fp32 , X86::CMOVP_F },
+ { X86::CMOVP_Fp64 , X86::CMOVP_F },
+ { X86::CMOVP_Fp80 , X86::CMOVP_F },
+ { X86::COS_Fp32 , X86::COS_F },
+ { X86::COS_Fp64 , X86::COS_F },
+ { X86::COS_Fp80 , X86::COS_F },
+ { X86::DIVR_Fp32m , X86::DIVR_F32m },
+ { X86::DIVR_Fp64m , X86::DIVR_F64m },
+ { X86::DIVR_Fp64m32 , X86::DIVR_F32m },
+ { X86::DIVR_Fp80m32 , X86::DIVR_F32m },
+ { X86::DIVR_Fp80m64 , X86::DIVR_F64m },
+ { X86::DIVR_FpI16m32, X86::DIVR_FI16m},
+ { X86::DIVR_FpI16m64, X86::DIVR_FI16m},
+ { X86::DIVR_FpI16m80, X86::DIVR_FI16m},
+ { X86::DIVR_FpI32m32, X86::DIVR_FI32m},
+ { X86::DIVR_FpI32m64, X86::DIVR_FI32m},
+ { X86::DIVR_FpI32m80, X86::DIVR_FI32m},
+ { X86::DIV_Fp32m , X86::DIV_F32m },
+ { X86::DIV_Fp64m , X86::DIV_F64m },
+ { X86::DIV_Fp64m32 , X86::DIV_F32m },
+ { X86::DIV_Fp80m32 , X86::DIV_F32m },
+ { X86::DIV_Fp80m64 , X86::DIV_F64m },
+ { X86::DIV_FpI16m32 , X86::DIV_FI16m },
+ { X86::DIV_FpI16m64 , X86::DIV_FI16m },
+ { X86::DIV_FpI16m80 , X86::DIV_FI16m },
+ { X86::DIV_FpI32m32 , X86::DIV_FI32m },
+ { X86::DIV_FpI32m64 , X86::DIV_FI32m },
+ { X86::DIV_FpI32m80 , X86::DIV_FI32m },
+ { X86::ILD_Fp16m32 , X86::ILD_F16m },
+ { X86::ILD_Fp16m64 , X86::ILD_F16m },
+ { X86::ILD_Fp16m80 , X86::ILD_F16m },
+ { X86::ILD_Fp32m32 , X86::ILD_F32m },
+ { X86::ILD_Fp32m64 , X86::ILD_F32m },
+ { X86::ILD_Fp32m80 , X86::ILD_F32m },
+ { X86::ILD_Fp64m32 , X86::ILD_F64m },
+ { X86::ILD_Fp64m64 , X86::ILD_F64m },
+ { X86::ILD_Fp64m80 , X86::ILD_F64m },
+ { X86::ISTT_Fp16m32 , X86::ISTT_FP16m},
+ { X86::ISTT_Fp16m64 , X86::ISTT_FP16m},
+ { X86::ISTT_Fp16m80 , X86::ISTT_FP16m},
+ { X86::ISTT_Fp32m32 , X86::ISTT_FP32m},
+ { X86::ISTT_Fp32m64 , X86::ISTT_FP32m},
+ { X86::ISTT_Fp32m80 , X86::ISTT_FP32m},
+ { X86::ISTT_Fp64m32 , X86::ISTT_FP64m},
+ { X86::ISTT_Fp64m64 , X86::ISTT_FP64m},
+ { X86::ISTT_Fp64m80 , X86::ISTT_FP64m},
+ { X86::IST_Fp16m32 , X86::IST_F16m },
+ { X86::IST_Fp16m64 , X86::IST_F16m },
+ { X86::IST_Fp16m80 , X86::IST_F16m },
+ { X86::IST_Fp32m32 , X86::IST_F32m },
+ { X86::IST_Fp32m64 , X86::IST_F32m },
+ { X86::IST_Fp32m80 , X86::IST_F32m },
+ { X86::IST_Fp64m32 , X86::IST_FP64m },
+ { X86::IST_Fp64m64 , X86::IST_FP64m },
+ { X86::IST_Fp64m80 , X86::IST_FP64m },
+ { X86::LD_Fp032 , X86::LD_F0 },
+ { X86::LD_Fp064 , X86::LD_F0 },
+ { X86::LD_Fp080 , X86::LD_F0 },
+ { X86::LD_Fp132 , X86::LD_F1 },
+ { X86::LD_Fp164 , X86::LD_F1 },
+ { X86::LD_Fp180 , X86::LD_F1 },
+ { X86::LD_Fp32m , X86::LD_F32m },
+ { X86::LD_Fp32m64 , X86::LD_F32m },
+ { X86::LD_Fp32m80 , X86::LD_F32m },
+ { X86::LD_Fp64m , X86::LD_F64m },
+ { X86::LD_Fp64m80 , X86::LD_F64m },
+ { X86::LD_Fp80m , X86::LD_F80m },
+ { X86::MUL_Fp32m , X86::MUL_F32m },
+ { X86::MUL_Fp64m , X86::MUL_F64m },
+ { X86::MUL_Fp64m32 , X86::MUL_F32m },
+ { X86::MUL_Fp80m32 , X86::MUL_F32m },
+ { X86::MUL_Fp80m64 , X86::MUL_F64m },
+ { X86::MUL_FpI16m32 , X86::MUL_FI16m },
+ { X86::MUL_FpI16m64 , X86::MUL_FI16m },
+ { X86::MUL_FpI16m80 , X86::MUL_FI16m },
+ { X86::MUL_FpI32m32 , X86::MUL_FI32m },
+ { X86::MUL_FpI32m64 , X86::MUL_FI32m },
+ { X86::MUL_FpI32m80 , X86::MUL_FI32m },
+ { X86::SIN_Fp32 , X86::SIN_F },
+ { X86::SIN_Fp64 , X86::SIN_F },
+ { X86::SIN_Fp80 , X86::SIN_F },
+ { X86::SQRT_Fp32 , X86::SQRT_F },
+ { X86::SQRT_Fp64 , X86::SQRT_F },
+ { X86::SQRT_Fp80 , X86::SQRT_F },
+ { X86::ST_Fp32m , X86::ST_F32m },
+ { X86::ST_Fp64m , X86::ST_F64m },
+ { X86::ST_Fp64m32 , X86::ST_F32m },
+ { X86::ST_Fp80m32 , X86::ST_F32m },
+ { X86::ST_Fp80m64 , X86::ST_F64m },
+ { X86::ST_FpP80m , X86::ST_FP80m },
+ { X86::SUBR_Fp32m , X86::SUBR_F32m },
+ { X86::SUBR_Fp64m , X86::SUBR_F64m },
+ { X86::SUBR_Fp64m32 , X86::SUBR_F32m },
+ { X86::SUBR_Fp80m32 , X86::SUBR_F32m },
+ { X86::SUBR_Fp80m64 , X86::SUBR_F64m },
+ { X86::SUBR_FpI16m32, X86::SUBR_FI16m},
+ { X86::SUBR_FpI16m64, X86::SUBR_FI16m},
+ { X86::SUBR_FpI16m80, X86::SUBR_FI16m},
+ { X86::SUBR_FpI32m32, X86::SUBR_FI32m},
+ { X86::SUBR_FpI32m64, X86::SUBR_FI32m},
+ { X86::SUBR_FpI32m80, X86::SUBR_FI32m},
+ { X86::SUB_Fp32m , X86::SUB_F32m },
+ { X86::SUB_Fp64m , X86::SUB_F64m },
+ { X86::SUB_Fp64m32 , X86::SUB_F32m },
+ { X86::SUB_Fp80m32 , X86::SUB_F32m },
+ { X86::SUB_Fp80m64 , X86::SUB_F64m },
+ { X86::SUB_FpI16m32 , X86::SUB_FI16m },
+ { X86::SUB_FpI16m64 , X86::SUB_FI16m },
+ { X86::SUB_FpI16m80 , X86::SUB_FI16m },
+ { X86::SUB_FpI32m32 , X86::SUB_FI32m },
+ { X86::SUB_FpI32m64 , X86::SUB_FI32m },
+ { X86::SUB_FpI32m80 , X86::SUB_FI32m },
+ { X86::TST_Fp32 , X86::TST_F },
+ { X86::TST_Fp64 , X86::TST_F },
+ { X86::TST_Fp80 , X86::TST_F },
+ { X86::UCOM_FpIr32 , X86::UCOM_FIr },
+ { X86::UCOM_FpIr64 , X86::UCOM_FIr },
+ { X86::UCOM_FpIr80 , X86::UCOM_FIr },
+ { X86::UCOM_Fpr32 , X86::UCOM_Fr },
+ { X86::UCOM_Fpr64 , X86::UCOM_Fr },
+ { X86::UCOM_Fpr80 , X86::UCOM_Fr },
+};
+
+static unsigned getConcreteOpcode(unsigned Opcode) {
+ ASSERT_SORTED(OpcodeTable);
+ int Opc = Lookup(OpcodeTable, array_lengthof(OpcodeTable), Opcode);
+ assert(Opc != -1 && "FP Stack instruction not in OpcodeTable!");
+ return Opc;
+}
+
+//===----------------------------------------------------------------------===//
+// Helper Methods
+//===----------------------------------------------------------------------===//
+
+// PopTable - Sorted map of instructions to their popping version. The first
+// element is an instruction, the second is the version which pops.
+//
+static const TableEntry PopTable[] = {
+ { X86::ADD_FrST0 , X86::ADD_FPrST0 },
+
+ { X86::DIVR_FrST0, X86::DIVR_FPrST0 },
+ { X86::DIV_FrST0 , X86::DIV_FPrST0 },
+
+ { X86::IST_F16m , X86::IST_FP16m },
+ { X86::IST_F32m , X86::IST_FP32m },
+
+ { X86::MUL_FrST0 , X86::MUL_FPrST0 },
+
+ { X86::ST_F32m , X86::ST_FP32m },
+ { X86::ST_F64m , X86::ST_FP64m },
+ { X86::ST_Frr , X86::ST_FPrr },
+
+ { X86::SUBR_FrST0, X86::SUBR_FPrST0 },
+ { X86::SUB_FrST0 , X86::SUB_FPrST0 },
+
+ { X86::UCOM_FIr , X86::UCOM_FIPr },
+
+ { X86::UCOM_FPr , X86::UCOM_FPPr },
+ { X86::UCOM_Fr , X86::UCOM_FPr },
+};
+
+/// popStackAfter - Pop the current value off of the top of the FP stack after
+/// the specified instruction. This attempts to be sneaky and combine the pop
+/// into the instruction itself if possible. The iterator is left pointing to
+/// the last instruction, be it a new pop instruction inserted, or the old
+/// instruction if it was modified in place.
+///
+void FPS::popStackAfter(MachineBasicBlock::iterator &I) {
+ MachineInstr* MI = I;
+ DebugLoc dl = MI->getDebugLoc();
+ ASSERT_SORTED(PopTable);
+ if (StackTop == 0)
+ report_fatal_error("Cannot pop empty stack!");
+ RegMap[Stack[--StackTop]] = ~0; // Update state
+
+ // Check to see if there is a popping version of this instruction...
+ int Opcode = Lookup(PopTable, array_lengthof(PopTable), I->getOpcode());
+ if (Opcode != -1) {
+ I->setDesc(TII->get(Opcode));
+ if (Opcode == X86::UCOM_FPPr)
+ I->RemoveOperand(0);
+ } else { // Insert an explicit pop
+ I = BuildMI(*MBB, ++I, dl, TII->get(X86::ST_FPrr)).addReg(X86::ST0);
+ }
+}
+
+/// freeStackSlotAfter - Free the specified register from the register stack, so
+/// that it is no longer in a register. If the register is currently at the top
+/// of the stack, we just pop the current instruction, otherwise we store the
+/// current top-of-stack into the specified slot, then pop the top of stack.
+void FPS::freeStackSlotAfter(MachineBasicBlock::iterator &I, unsigned FPRegNo) {
+ if (getStackEntry(0) == FPRegNo) { // already at the top of stack? easy.
+ popStackAfter(I);
+ return;
+ }
+
+ // Otherwise, store the top of stack into the dead slot, killing the operand
+ // without having to add in an explicit xchg then pop.
+ //
+ I = freeStackSlotBefore(++I, FPRegNo);
+}
+
+/// freeStackSlotBefore - Free the specified register without trying any
+/// folding.
+MachineBasicBlock::iterator
+FPS::freeStackSlotBefore(MachineBasicBlock::iterator I, unsigned FPRegNo) {
+ unsigned STReg = getSTReg(FPRegNo);
+ unsigned OldSlot = getSlot(FPRegNo);
+ unsigned TopReg = Stack[StackTop-1];
+ Stack[OldSlot] = TopReg;
+ RegMap[TopReg] = OldSlot;
+ RegMap[FPRegNo] = ~0;
+ Stack[--StackTop] = ~0;
+ return BuildMI(*MBB, I, DebugLoc(), TII->get(X86::ST_FPrr)).addReg(STReg);
+}
+
+/// adjustLiveRegs - Kill and revive registers such that exactly the FP
+/// registers with a bit in Mask are live.
+void FPS::adjustLiveRegs(unsigned Mask, MachineBasicBlock::iterator I) {
+ unsigned Defs = Mask;
+ unsigned Kills = 0;
+ for (unsigned i = 0; i < StackTop; ++i) {
+ unsigned RegNo = Stack[i];
+ if (!(Defs & (1 << RegNo)))
+ // This register is live, but we don't want it.
+ Kills |= (1 << RegNo);
+ else
+ // We don't need to imp-def this live register.
+ Defs &= ~(1 << RegNo);
+ }
+ assert((Kills & Defs) == 0 && "Register needs killing and def'ing?");
+
+ // Produce implicit-defs for free by using killed registers.
+ while (Kills && Defs) {
+ unsigned KReg = CountTrailingZeros_32(Kills);
+ unsigned DReg = CountTrailingZeros_32(Defs);
+ DEBUG(dbgs() << "Renaming %FP" << KReg << " as imp %FP" << DReg << "\n");
+ std::swap(Stack[getSlot(KReg)], Stack[getSlot(DReg)]);
+ std::swap(RegMap[KReg], RegMap[DReg]);
+ Kills &= ~(1 << KReg);
+ Defs &= ~(1 << DReg);
+ }
+
+ // Kill registers by popping.
+ if (Kills && I != MBB->begin()) {
+ MachineBasicBlock::iterator I2 = llvm::prior(I);
+ while (StackTop) {
+ unsigned KReg = getStackEntry(0);
+ if (!(Kills & (1 << KReg)))
+ break;
+ DEBUG(dbgs() << "Popping %FP" << KReg << "\n");
+ popStackAfter(I2);
+ Kills &= ~(1 << KReg);
+ }
+ }
+
+ // Manually kill the rest.
+ while (Kills) {
+ unsigned KReg = CountTrailingZeros_32(Kills);
+ DEBUG(dbgs() << "Killing %FP" << KReg << "\n");
+ freeStackSlotBefore(I, KReg);
+ Kills &= ~(1 << KReg);
+ }
+
+ // Load zeros for all the imp-defs.
+ while(Defs) {
+ unsigned DReg = CountTrailingZeros_32(Defs);
+ DEBUG(dbgs() << "Defining %FP" << DReg << " as 0\n");
+ BuildMI(*MBB, I, DebugLoc(), TII->get(X86::LD_F0));
+ pushReg(DReg);
+ Defs &= ~(1 << DReg);
+ }
+
+ // Now we should have the correct registers live.
+ DEBUG(dumpStack());
+ assert(StackTop == CountPopulation_32(Mask) && "Live count mismatch");
+}
+
+/// shuffleStackTop - emit fxch instructions before I to shuffle the top
+/// FixCount entries into the order given by FixStack.
+/// FIXME: Is there a better algorithm than insertion sort?
+void FPS::shuffleStackTop(const unsigned char *FixStack,
+ unsigned FixCount,
+ MachineBasicBlock::iterator I) {
+ // Move items into place, starting from the desired stack bottom.
+ while (FixCount--) {
+ // Old register at position FixCount.
+ unsigned OldReg = getStackEntry(FixCount);
+ // Desired register at position FixCount.
+ unsigned Reg = FixStack[FixCount];
+ if (Reg == OldReg)
+ continue;
+ // (Reg st0) (OldReg st0) = (Reg OldReg st0)
+ moveToTop(Reg, I);
+ if (FixCount > 0)
+ moveToTop(OldReg, I);
+ }
+ DEBUG(dumpStack());
+}
+
+
+//===----------------------------------------------------------------------===//
+// Instruction transformation implementation
+//===----------------------------------------------------------------------===//
+
+/// handleZeroArgFP - ST(0) = fld0 ST(0) = flds <mem>
+///
+void FPS::handleZeroArgFP(MachineBasicBlock::iterator &I) {
+ MachineInstr *MI = I;
+ unsigned DestReg = getFPReg(MI->getOperand(0));
+
+ // Change from the pseudo instruction to the concrete instruction.
+ MI->RemoveOperand(0); // Remove the explicit ST(0) operand
+ MI->setDesc(TII->get(getConcreteOpcode(MI->getOpcode())));
+
+ // Result gets pushed on the stack.
+ pushReg(DestReg);
+}
+
+/// handleOneArgFP - fst <mem>, ST(0)
+///
+void FPS::handleOneArgFP(MachineBasicBlock::iterator &I) {
+ MachineInstr *MI = I;
+ unsigned NumOps = MI->getDesc().getNumOperands();
+ assert((NumOps == X86::AddrNumOperands + 1 || NumOps == 1) &&
+ "Can only handle fst* & ftst instructions!");
+
+ // Is this the last use of the source register?
+ unsigned Reg = getFPReg(MI->getOperand(NumOps-1));
+ bool KillsSrc = MI->killsRegister(X86::FP0+Reg);
+
+ if (KillsSrc)
+ duplicatePendingSTBeforeKill(Reg, I);
+
+ // FISTP64m is strange because there isn't a non-popping versions.
+ // If we have one _and_ we don't want to pop the operand, duplicate the value
+ // on the stack instead of moving it. This ensure that popping the value is
+ // always ok.
+ // Ditto FISTTP16m, FISTTP32m, FISTTP64m, ST_FpP80m.
+ //
+ if (!KillsSrc &&
+ (MI->getOpcode() == X86::IST_Fp64m32 ||
+ MI->getOpcode() == X86::ISTT_Fp16m32 ||
+ MI->getOpcode() == X86::ISTT_Fp32m32 ||
+ MI->getOpcode() == X86::ISTT_Fp64m32 ||
+ MI->getOpcode() == X86::IST_Fp64m64 ||
+ MI->getOpcode() == X86::ISTT_Fp16m64 ||
+ MI->getOpcode() == X86::ISTT_Fp32m64 ||
+ MI->getOpcode() == X86::ISTT_Fp64m64 ||
+ MI->getOpcode() == X86::IST_Fp64m80 ||
+ MI->getOpcode() == X86::ISTT_Fp16m80 ||
+ MI->getOpcode() == X86::ISTT_Fp32m80 ||
+ MI->getOpcode() == X86::ISTT_Fp64m80 ||
+ MI->getOpcode() == X86::ST_FpP80m)) {
+ duplicateToTop(Reg, getScratchReg(), I);
+ } else {
+ moveToTop(Reg, I); // Move to the top of the stack...
+ }
+
+ // Convert from the pseudo instruction to the concrete instruction.
+ MI->RemoveOperand(NumOps-1); // Remove explicit ST(0) operand
+ MI->setDesc(TII->get(getConcreteOpcode(MI->getOpcode())));
+
+ if (MI->getOpcode() == X86::IST_FP64m ||
+ MI->getOpcode() == X86::ISTT_FP16m ||
+ MI->getOpcode() == X86::ISTT_FP32m ||
+ MI->getOpcode() == X86::ISTT_FP64m ||
+ MI->getOpcode() == X86::ST_FP80m) {
+ if (StackTop == 0)
+ report_fatal_error("Stack empty??");
+ --StackTop;
+ } else if (KillsSrc) { // Last use of operand?
+ popStackAfter(I);
+ }
+}
+
+
+/// handleOneArgFPRW: Handle instructions that read from the top of stack and
+/// replace the value with a newly computed value. These instructions may have
+/// non-fp operands after their FP operands.
+///
+/// Examples:
+/// R1 = fchs R2
+/// R1 = fadd R2, [mem]
+///
+void FPS::handleOneArgFPRW(MachineBasicBlock::iterator &I) {
+ MachineInstr *MI = I;
+#ifndef NDEBUG
+ unsigned NumOps = MI->getDesc().getNumOperands();
+ assert(NumOps >= 2 && "FPRW instructions must have 2 ops!!");
+#endif
+
+ // Is this the last use of the source register?
+ unsigned Reg = getFPReg(MI->getOperand(1));
+ bool KillsSrc = MI->killsRegister(X86::FP0+Reg);
+
+ if (KillsSrc) {
+ duplicatePendingSTBeforeKill(Reg, I);
+ // If this is the last use of the source register, just make sure it's on
+ // the top of the stack.
+ moveToTop(Reg, I);
+ if (StackTop == 0)
+ report_fatal_error("Stack cannot be empty!");
+ --StackTop;
+ pushReg(getFPReg(MI->getOperand(0)));
+ } else {
+ // If this is not the last use of the source register, _copy_ it to the top
+ // of the stack.
+ duplicateToTop(Reg, getFPReg(MI->getOperand(0)), I);
+ }
+
+ // Change from the pseudo instruction to the concrete instruction.
+ MI->RemoveOperand(1); // Drop the source operand.
+ MI->RemoveOperand(0); // Drop the destination operand.
+ MI->setDesc(TII->get(getConcreteOpcode(MI->getOpcode())));
+}
+
+
+//===----------------------------------------------------------------------===//
+// Define tables of various ways to map pseudo instructions
+//
+
+// ForwardST0Table - Map: A = B op C into: ST(0) = ST(0) op ST(i)
+static const TableEntry ForwardST0Table[] = {
+ { X86::ADD_Fp32 , X86::ADD_FST0r },
+ { X86::ADD_Fp64 , X86::ADD_FST0r },
+ { X86::ADD_Fp80 , X86::ADD_FST0r },
+ { X86::DIV_Fp32 , X86::DIV_FST0r },
+ { X86::DIV_Fp64 , X86::DIV_FST0r },
+ { X86::DIV_Fp80 , X86::DIV_FST0r },
+ { X86::MUL_Fp32 , X86::MUL_FST0r },
+ { X86::MUL_Fp64 , X86::MUL_FST0r },
+ { X86::MUL_Fp80 , X86::MUL_FST0r },
+ { X86::SUB_Fp32 , X86::SUB_FST0r },
+ { X86::SUB_Fp64 , X86::SUB_FST0r },
+ { X86::SUB_Fp80 , X86::SUB_FST0r },
+};
+
+// ReverseST0Table - Map: A = B op C into: ST(0) = ST(i) op ST(0)
+static const TableEntry ReverseST0Table[] = {
+ { X86::ADD_Fp32 , X86::ADD_FST0r }, // commutative
+ { X86::ADD_Fp64 , X86::ADD_FST0r }, // commutative
+ { X86::ADD_Fp80 , X86::ADD_FST0r }, // commutative
+ { X86::DIV_Fp32 , X86::DIVR_FST0r },
+ { X86::DIV_Fp64 , X86::DIVR_FST0r },
+ { X86::DIV_Fp80 , X86::DIVR_FST0r },
+ { X86::MUL_Fp32 , X86::MUL_FST0r }, // commutative
+ { X86::MUL_Fp64 , X86::MUL_FST0r }, // commutative
+ { X86::MUL_Fp80 , X86::MUL_FST0r }, // commutative
+ { X86::SUB_Fp32 , X86::SUBR_FST0r },
+ { X86::SUB_Fp64 , X86::SUBR_FST0r },
+ { X86::SUB_Fp80 , X86::SUBR_FST0r },
+};
+
+// ForwardSTiTable - Map: A = B op C into: ST(i) = ST(0) op ST(i)
+static const TableEntry ForwardSTiTable[] = {
+ { X86::ADD_Fp32 , X86::ADD_FrST0 }, // commutative
+ { X86::ADD_Fp64 , X86::ADD_FrST0 }, // commutative
+ { X86::ADD_Fp80 , X86::ADD_FrST0 }, // commutative
+ { X86::DIV_Fp32 , X86::DIVR_FrST0 },
+ { X86::DIV_Fp64 , X86::DIVR_FrST0 },
+ { X86::DIV_Fp80 , X86::DIVR_FrST0 },
+ { X86::MUL_Fp32 , X86::MUL_FrST0 }, // commutative
+ { X86::MUL_Fp64 , X86::MUL_FrST0 }, // commutative
+ { X86::MUL_Fp80 , X86::MUL_FrST0 }, // commutative
+ { X86::SUB_Fp32 , X86::SUBR_FrST0 },
+ { X86::SUB_Fp64 , X86::SUBR_FrST0 },
+ { X86::SUB_Fp80 , X86::SUBR_FrST0 },
+};
+
+// ReverseSTiTable - Map: A = B op C into: ST(i) = ST(i) op ST(0)
+static const TableEntry ReverseSTiTable[] = {
+ { X86::ADD_Fp32 , X86::ADD_FrST0 },
+ { X86::ADD_Fp64 , X86::ADD_FrST0 },
+ { X86::ADD_Fp80 , X86::ADD_FrST0 },
+ { X86::DIV_Fp32 , X86::DIV_FrST0 },
+ { X86::DIV_Fp64 , X86::DIV_FrST0 },
+ { X86::DIV_Fp80 , X86::DIV_FrST0 },
+ { X86::MUL_Fp32 , X86::MUL_FrST0 },
+ { X86::MUL_Fp64 , X86::MUL_FrST0 },
+ { X86::MUL_Fp80 , X86::MUL_FrST0 },
+ { X86::SUB_Fp32 , X86::SUB_FrST0 },
+ { X86::SUB_Fp64 , X86::SUB_FrST0 },
+ { X86::SUB_Fp80 , X86::SUB_FrST0 },
+};
+
+
+/// handleTwoArgFP - Handle instructions like FADD and friends which are virtual
+/// instructions which need to be simplified and possibly transformed.
+///
+/// Result: ST(0) = fsub ST(0), ST(i)
+/// ST(i) = fsub ST(0), ST(i)
+/// ST(0) = fsubr ST(0), ST(i)
+/// ST(i) = fsubr ST(0), ST(i)
+///
+void FPS::handleTwoArgFP(MachineBasicBlock::iterator &I) {
+ ASSERT_SORTED(ForwardST0Table); ASSERT_SORTED(ReverseST0Table);
+ ASSERT_SORTED(ForwardSTiTable); ASSERT_SORTED(ReverseSTiTable);
+ MachineInstr *MI = I;
+
+ unsigned NumOperands = MI->getDesc().getNumOperands();
+ assert(NumOperands == 3 && "Illegal TwoArgFP instruction!");
+ unsigned Dest = getFPReg(MI->getOperand(0));
+ unsigned Op0 = getFPReg(MI->getOperand(NumOperands-2));
+ unsigned Op1 = getFPReg(MI->getOperand(NumOperands-1));
+ bool KillsOp0 = MI->killsRegister(X86::FP0+Op0);
+ bool KillsOp1 = MI->killsRegister(X86::FP0+Op1);
+ DebugLoc dl = MI->getDebugLoc();
+
+ unsigned TOS = getStackEntry(0);
+
+ // One of our operands must be on the top of the stack. If neither is yet, we
+ // need to move one.
+ if (Op0 != TOS && Op1 != TOS) { // No operand at TOS?
+ // We can choose to move either operand to the top of the stack. If one of
+ // the operands is killed by this instruction, we want that one so that we
+ // can update right on top of the old version.
+ if (KillsOp0) {
+ moveToTop(Op0, I); // Move dead operand to TOS.
+ TOS = Op0;
+ } else if (KillsOp1) {
+ moveToTop(Op1, I);
+ TOS = Op1;
+ } else {
+ // All of the operands are live after this instruction executes, so we
+ // cannot update on top of any operand. Because of this, we must
+ // duplicate one of the stack elements to the top. It doesn't matter
+ // which one we pick.
+ //
+ duplicateToTop(Op0, Dest, I);
+ Op0 = TOS = Dest;
+ KillsOp0 = true;
+ }
+ } else if (!KillsOp0 && !KillsOp1) {
+ // If we DO have one of our operands at the top of the stack, but we don't
+ // have a dead operand, we must duplicate one of the operands to a new slot
+ // on the stack.
+ duplicateToTop(Op0, Dest, I);
+ Op0 = TOS = Dest;
+ KillsOp0 = true;
+ }
+
+ // Now we know that one of our operands is on the top of the stack, and at
+ // least one of our operands is killed by this instruction.
+ assert((TOS == Op0 || TOS == Op1) && (KillsOp0 || KillsOp1) &&
+ "Stack conditions not set up right!");
+
+ // We decide which form to use based on what is on the top of the stack, and
+ // which operand is killed by this instruction.
+ const TableEntry *InstTable;
+ bool isForward = TOS == Op0;
+ bool updateST0 = (TOS == Op0 && !KillsOp1) || (TOS == Op1 && !KillsOp0);
+ if (updateST0) {
+ if (isForward)
+ InstTable = ForwardST0Table;
+ else
+ InstTable = ReverseST0Table;
+ } else {
+ if (isForward)
+ InstTable = ForwardSTiTable;
+ else
+ InstTable = ReverseSTiTable;
+ }
+
+ int Opcode = Lookup(InstTable, array_lengthof(ForwardST0Table),
+ MI->getOpcode());
+ assert(Opcode != -1 && "Unknown TwoArgFP pseudo instruction!");
+
+ // NotTOS - The register which is not on the top of stack...
+ unsigned NotTOS = (TOS == Op0) ? Op1 : Op0;
+
+ // Replace the old instruction with a new instruction
+ MBB->remove(I++);
+ I = BuildMI(*MBB, I, dl, TII->get(Opcode)).addReg(getSTReg(NotTOS));
+
+ // If both operands are killed, pop one off of the stack in addition to
+ // overwriting the other one.
+ if (KillsOp0 && KillsOp1 && Op0 != Op1) {
+ assert(!updateST0 && "Should have updated other operand!");
+ popStackAfter(I); // Pop the top of stack
+ }
+
+ // Update stack information so that we know the destination register is now on
+ // the stack.
+ unsigned UpdatedSlot = getSlot(updateST0 ? TOS : NotTOS);
+ assert(UpdatedSlot < StackTop && Dest < 7);
+ Stack[UpdatedSlot] = Dest;
+ RegMap[Dest] = UpdatedSlot;
+ MBB->getParent()->DeleteMachineInstr(MI); // Remove the old instruction
+}
+
+/// handleCompareFP - Handle FUCOM and FUCOMI instructions, which have two FP
+/// register arguments and no explicit destinations.
+///
+void FPS::handleCompareFP(MachineBasicBlock::iterator &I) {
+ ASSERT_SORTED(ForwardST0Table); ASSERT_SORTED(ReverseST0Table);
+ ASSERT_SORTED(ForwardSTiTable); ASSERT_SORTED(ReverseSTiTable);
+ MachineInstr *MI = I;
+
+ unsigned NumOperands = MI->getDesc().getNumOperands();
+ assert(NumOperands == 2 && "Illegal FUCOM* instruction!");
+ unsigned Op0 = getFPReg(MI->getOperand(NumOperands-2));
+ unsigned Op1 = getFPReg(MI->getOperand(NumOperands-1));
+ bool KillsOp0 = MI->killsRegister(X86::FP0+Op0);
+ bool KillsOp1 = MI->killsRegister(X86::FP0+Op1);
+
+ // Make sure the first operand is on the top of stack, the other one can be
+ // anywhere.
+ moveToTop(Op0, I);
+
+ // Change from the pseudo instruction to the concrete instruction.
+ MI->getOperand(0).setReg(getSTReg(Op1));
+ MI->RemoveOperand(1);
+ MI->setDesc(TII->get(getConcreteOpcode(MI->getOpcode())));
+
+ // If any of the operands are killed by this instruction, free them.
+ if (KillsOp0) freeStackSlotAfter(I, Op0);
+ if (KillsOp1 && Op0 != Op1) freeStackSlotAfter(I, Op1);
+}
+
+/// handleCondMovFP - Handle two address conditional move instructions. These
+/// instructions move a st(i) register to st(0) iff a condition is true. These
+/// instructions require that the first operand is at the top of the stack, but
+/// otherwise don't modify the stack at all.
+void FPS::handleCondMovFP(MachineBasicBlock::iterator &I) {
+ MachineInstr *MI = I;
+
+ unsigned Op0 = getFPReg(MI->getOperand(0));
+ unsigned Op1 = getFPReg(MI->getOperand(2));
+ bool KillsOp1 = MI->killsRegister(X86::FP0+Op1);
+
+ // The first operand *must* be on the top of the stack.
+ moveToTop(Op0, I);
+
+ // Change the second operand to the stack register that the operand is in.
+ // Change from the pseudo instruction to the concrete instruction.
+ MI->RemoveOperand(0);
+ MI->RemoveOperand(1);
+ MI->getOperand(0).setReg(getSTReg(Op1));
+ MI->setDesc(TII->get(getConcreteOpcode(MI->getOpcode())));
+
+ // If we kill the second operand, make sure to pop it from the stack.
+ if (Op0 != Op1 && KillsOp1) {
+ // Get this value off of the register stack.
+ freeStackSlotAfter(I, Op1);
+ }
+}
+
+
+/// handleSpecialFP - Handle special instructions which behave unlike other
+/// floating point instructions. This is primarily intended for use by pseudo
+/// instructions.
+///
+void FPS::handleSpecialFP(MachineBasicBlock::iterator &I) {
+ MachineInstr *MI = I;
+ switch (MI->getOpcode()) {
+ default: llvm_unreachable("Unknown SpecialFP instruction!");
+ case TargetOpcode::COPY: {
+ // We handle three kinds of copies: FP <- FP, FP <- ST, and ST <- FP.
+ const MachineOperand &MO1 = MI->getOperand(1);
+ const MachineOperand &MO0 = MI->getOperand(0);
+ unsigned DstST = MO0.getReg() - X86::ST0;
+ unsigned SrcST = MO1.getReg() - X86::ST0;
+ bool KillsSrc = MI->killsRegister(MO1.getReg());
+
+ // ST = COPY FP. Set up a pending ST register.
+ if (DstST < 8) {
+ unsigned SrcFP = getFPReg(MO1);
+ assert(isLive(SrcFP) && "Cannot copy dead register");
+ assert(!MO0.isDead() && "Cannot copy to dead ST register");
+
+ // Unallocated STs are marked as the nonexistent FP255.
+ while (NumPendingSTs <= DstST)
+ PendingST[NumPendingSTs++] = NumFPRegs;
+
+ // STi could still be live from a previous inline asm.
+ if (isScratchReg(PendingST[DstST])) {
+ DEBUG(dbgs() << "Clobbering old ST in FP" << unsigned(PendingST[DstST])
+ << '\n');
+ freeStackSlotBefore(MI, PendingST[DstST]);
+ }
+
+ // When the source is killed, allocate a scratch FP register.
+ if (KillsSrc) {
+ duplicatePendingSTBeforeKill(SrcFP, I);
+ unsigned Slot = getSlot(SrcFP);
+ unsigned SR = getScratchReg();
+ PendingST[DstST] = SR;
+ Stack[Slot] = SR;
+ RegMap[SR] = Slot;
+ } else
+ PendingST[DstST] = SrcFP;
+ break;
+ }
+
+ // FP = COPY ST. Extract fixed stack value.
+ // Any instruction defining ST registers must have assigned them to a
+ // scratch register.
+ if (SrcST < 8) {
+ unsigned DstFP = getFPReg(MO0);
+ assert(!isLive(DstFP) && "Cannot copy ST to live FP register");
+ assert(NumPendingSTs > SrcST && "Cannot copy from dead ST register");
+ unsigned SrcFP = PendingST[SrcST];
+ assert(isScratchReg(SrcFP) && "Expected ST in a scratch register");
+ assert(isLive(SrcFP) && "Scratch holding ST is dead");
+
+ // DstFP steals the stack slot from SrcFP.
+ unsigned Slot = getSlot(SrcFP);
+ Stack[Slot] = DstFP;
+ RegMap[DstFP] = Slot;
+
+ // Always treat the ST as killed.
+ PendingST[SrcST] = NumFPRegs;
+ while (NumPendingSTs && PendingST[NumPendingSTs - 1] == NumFPRegs)
+ --NumPendingSTs;
+ break;
+ }
+
+ // FP <- FP copy.
+ unsigned DstFP = getFPReg(MO0);
+ unsigned SrcFP = getFPReg(MO1);
+ assert(isLive(SrcFP) && "Cannot copy dead register");
+ if (KillsSrc) {
+ // If the input operand is killed, we can just change the owner of the
+ // incoming stack slot into the result.
+ unsigned Slot = getSlot(SrcFP);
+ Stack[Slot] = DstFP;
+ RegMap[DstFP] = Slot;
+ } else {
+ // For COPY we just duplicate the specified value to a new stack slot.
+ // This could be made better, but would require substantial changes.
+ duplicateToTop(SrcFP, DstFP, I);
+ }
+ break;
+ }
+
+ case TargetOpcode::IMPLICIT_DEF: {
+ // All FP registers must be explicitly defined, so load a 0 instead.
+ unsigned Reg = MI->getOperand(0).getReg() - X86::FP0;
+ DEBUG(dbgs() << "Emitting LD_F0 for implicit FP" << Reg << '\n');
+ BuildMI(*MBB, I, MI->getDebugLoc(), TII->get(X86::LD_F0));
+ pushReg(Reg);
+ break;
+ }
+
+ case X86::FpPOP_RETVAL: {
+ // The FpPOP_RETVAL instruction is used after calls that return a value on
+ // the floating point stack. We cannot model this with ST defs since CALL
+ // instructions have fixed clobber lists. This instruction is interpreted
+ // to mean that there is one more live register on the stack than we
+ // thought.
+ //
+ // This means that StackTop does not match the hardware stack between a
+ // call and the FpPOP_RETVAL instructions. We do tolerate FP instructions
+ // between CALL and FpPOP_RETVAL as long as they don't overflow the
+ // hardware stack.
+ unsigned DstFP = getFPReg(MI->getOperand(0));
+
+ // Move existing stack elements up to reflect reality.
+ assert(StackTop < 8 && "Stack overflowed before FpPOP_RETVAL");
+ if (StackTop) {
+ std::copy_backward(Stack, Stack + StackTop, Stack + StackTop + 1);
+ for (unsigned i = 0; i != NumFPRegs; ++i)
+ ++RegMap[i];
+ }
+ ++StackTop;
+
+ // DstFP is the new bottom of the stack.
+ Stack[0] = DstFP;
+ RegMap[DstFP] = 0;
+
+ // DstFP will be killed by processBasicBlock if this was a dead def.
+ break;
+ }
+
+ case TargetOpcode::INLINEASM: {
+ // The inline asm MachineInstr currently only *uses* FP registers for the
+ // 'f' constraint. These should be turned into the current ST(x) register
+ // in the machine instr.
+ //
+ // There are special rules for x87 inline assembly. The compiler must know
+ // exactly how many registers are popped and pushed implicitly by the asm.
+ // Otherwise it is not possible to restore the stack state after the inline
+ // asm.
+ //
+ // There are 3 kinds of input operands:
+ //
+ // 1. Popped inputs. These must appear at the stack top in ST0-STn. A
+ // popped input operand must be in a fixed stack slot, and it is either
+ // tied to an output operand, or in the clobber list. The MI has ST use
+ // and def operands for these inputs.
+ //
+ // 2. Fixed inputs. These inputs appear in fixed stack slots, but are
+ // preserved by the inline asm. The fixed stack slots must be STn-STm
+ // following the popped inputs. A fixed input operand cannot be tied to
+ // an output or appear in the clobber list. The MI has ST use operands
+ // and no defs for these inputs.
+ //
+ // 3. Preserved inputs. These inputs use the "f" constraint which is
+ // represented as an FP register. The inline asm won't change these
+ // stack slots.
+ //
+ // Outputs must be in ST registers, FP outputs are not allowed. Clobbered
+ // registers do not count as output operands. The inline asm changes the
+ // stack as if it popped all the popped inputs and then pushed all the
+ // output operands.
+
+ // Scan the assembly for ST registers used, defined and clobbered. We can
+ // only tell clobbers from defs by looking at the asm descriptor.
+ unsigned STUses = 0, STDefs = 0, STClobbers = 0, STDeadDefs = 0;
+ unsigned NumOps = 0;
+ for (unsigned i = InlineAsm::MIOp_FirstOperand, e = MI->getNumOperands();
+ i != e && MI->getOperand(i).isImm(); i += 1 + NumOps) {
+ unsigned Flags = MI->getOperand(i).getImm();
+ NumOps = InlineAsm::getNumOperandRegisters(Flags);
+ if (NumOps != 1)
+ continue;
+ const MachineOperand &MO = MI->getOperand(i + 1);
+ if (!MO.isReg())
+ continue;
+ unsigned STReg = MO.getReg() - X86::ST0;
+ if (STReg >= 8)
+ continue;
+
+ switch (InlineAsm::getKind(Flags)) {
+ case InlineAsm::Kind_RegUse:
+ STUses |= (1u << STReg);
+ break;
+ case InlineAsm::Kind_RegDef:
+ case InlineAsm::Kind_RegDefEarlyClobber:
+ STDefs |= (1u << STReg);
+ if (MO.isDead())
+ STDeadDefs |= (1u << STReg);
+ break;
+ case InlineAsm::Kind_Clobber:
+ STClobbers |= (1u << STReg);
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (STUses && !isMask_32(STUses))
+ MI->emitError("fixed input regs must be last on the x87 stack");
+ unsigned NumSTUses = CountTrailingOnes_32(STUses);
+
+ // Defs must be contiguous from the stack top. ST0-STn.
+ if (STDefs && !isMask_32(STDefs)) {
+ MI->emitError("output regs must be last on the x87 stack");
+ STDefs = NextPowerOf2(STDefs) - 1;
+ }
+ unsigned NumSTDefs = CountTrailingOnes_32(STDefs);
+
+ // So must the clobbered stack slots. ST0-STm, m >= n.
+ if (STClobbers && !isMask_32(STDefs | STClobbers))
+ MI->emitError("clobbers must be last on the x87 stack");
+
+ // Popped inputs are the ones that are also clobbered or defined.
+ unsigned STPopped = STUses & (STDefs | STClobbers);
+ if (STPopped && !isMask_32(STPopped))
+ MI->emitError("implicitly popped regs must be last on the x87 stack");
+ unsigned NumSTPopped = CountTrailingOnes_32(STPopped);
+
+ DEBUG(dbgs() << "Asm uses " << NumSTUses << " fixed regs, pops "
+ << NumSTPopped << ", and defines " << NumSTDefs << " regs.\n");
+
+ // Scan the instruction for FP uses corresponding to "f" constraints.
+ // Collect FP registers to kill afer the instruction.
+ // Always kill all the scratch regs.
+ unsigned FPKills = ((1u << NumFPRegs) - 1) & ~0xff;
+ unsigned FPUsed = 0;
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ MachineOperand &Op = MI->getOperand(i);
+ if (!Op.isReg() || Op.getReg() < X86::FP0 || Op.getReg() > X86::FP6)
+ continue;
+ if (!Op.isUse())
+ MI->emitError("illegal \"f\" output constraint");
+ unsigned FPReg = getFPReg(Op);
+ FPUsed |= 1U << FPReg;
+
+ // If we kill this operand, make sure to pop it from the stack after the
+ // asm. We just remember it for now, and pop them all off at the end in
+ // a batch.
+ if (Op.isKill())
+ FPKills |= 1U << FPReg;
+ }
+
+ // The popped inputs will be killed by the instruction, so duplicate them
+ // if the FP register needs to be live after the instruction, or if it is
+ // used in the instruction itself. We effectively treat the popped inputs
+ // as early clobbers.
+ for (unsigned i = 0; i < NumSTPopped; ++i) {
+ if ((FPKills & ~FPUsed) & (1u << PendingST[i]))
+ continue;
+ unsigned SR = getScratchReg();
+ duplicateToTop(PendingST[i], SR, I);
+ DEBUG(dbgs() << "Duplicating ST" << i << " in FP"
+ << unsigned(PendingST[i]) << " to avoid clobbering it.\n");
+ PendingST[i] = SR;
+ }
+
+ // Make sure we have a unique live register for every fixed use. Some of
+ // them could be undef uses, and we need to emit LD_F0 instructions.
+ for (unsigned i = 0; i < NumSTUses; ++i) {
+ if (i < NumPendingSTs && PendingST[i] < NumFPRegs) {
+ // Check for shared assignments.
+ for (unsigned j = 0; j < i; ++j) {
+ if (PendingST[j] != PendingST[i])
+ continue;
+ // STi and STj are inn the same register, create a copy.
+ unsigned SR = getScratchReg();
+ duplicateToTop(PendingST[i], SR, I);
+ DEBUG(dbgs() << "Duplicating ST" << i << " in FP"
+ << unsigned(PendingST[i])
+ << " to avoid collision with ST" << j << '\n');
+ PendingST[i] = SR;
+ }
+ continue;
+ }
+ unsigned SR = getScratchReg();
+ DEBUG(dbgs() << "Emitting LD_F0 for ST" << i << " in FP" << SR << '\n');
+ BuildMI(*MBB, I, MI->getDebugLoc(), TII->get(X86::LD_F0));
+ pushReg(SR);
+ PendingST[i] = SR;
+ if (NumPendingSTs == i)
+ ++NumPendingSTs;
+ }
+ assert(NumPendingSTs >= NumSTUses && "Fixed registers should be assigned");
+
+ // Now we can rearrange the live registers to match what was requested.
+ shuffleStackTop(PendingST, NumPendingSTs, I);
+ DEBUG({dbgs() << "Before asm: "; dumpStack();});
+
+ // With the stack layout fixed, rewrite the FP registers.
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ MachineOperand &Op = MI->getOperand(i);
+ if (!Op.isReg() || Op.getReg() < X86::FP0 || Op.getReg() > X86::FP6)
+ continue;
+ unsigned FPReg = getFPReg(Op);
+ Op.setReg(getSTReg(FPReg));
+ }
+
+ // Simulate the inline asm popping its inputs and pushing its outputs.
+ StackTop -= NumSTPopped;
+
+ // Hold the fixed output registers in scratch FP registers. They will be
+ // transferred to real FP registers by copies.
+ NumPendingSTs = 0;
+ for (unsigned i = 0; i < NumSTDefs; ++i) {
+ unsigned SR = getScratchReg();
+ pushReg(SR);
+ FPKills &= ~(1u << SR);
+ }
+ for (unsigned i = 0; i < NumSTDefs; ++i)
+ PendingST[NumPendingSTs++] = getStackEntry(i);
+ DEBUG({dbgs() << "After asm: "; dumpStack();});
+
+ // If any of the ST defs were dead, pop them immediately. Our caller only
+ // handles dead FP defs.
+ MachineBasicBlock::iterator InsertPt = MI;
+ for (unsigned i = 0; STDefs & (1u << i); ++i) {
+ if (!(STDeadDefs & (1u << i)))
+ continue;
+ freeStackSlotAfter(InsertPt, PendingST[i]);
+ PendingST[i] = NumFPRegs;
+ }
+ while (NumPendingSTs && PendingST[NumPendingSTs - 1] == NumFPRegs)
+ --NumPendingSTs;
+
+ // If this asm kills any FP registers (is the last use of them) we must
+ // explicitly emit pop instructions for them. Do this now after the asm has
+ // executed so that the ST(x) numbers are not off (which would happen if we
+ // did this inline with operand rewriting).
+ //
+ // Note: this might be a non-optimal pop sequence. We might be able to do
+ // better by trying to pop in stack order or something.
+ while (FPKills) {
+ unsigned FPReg = CountTrailingZeros_32(FPKills);
+ if (isLive(FPReg))
+ freeStackSlotAfter(InsertPt, FPReg);
+ FPKills &= ~(1U << FPReg);
+ }
+ // Don't delete the inline asm!
+ return;
+ }
+
+ case X86::WIN_FTOL_32:
+ case X86::WIN_FTOL_64: {
+ // Push the operand into ST0.
+ MachineOperand &Op = MI->getOperand(0);
+ assert(Op.isUse() && Op.isReg() &&
+ Op.getReg() >= X86::FP0 && Op.getReg() <= X86::FP6);
+ unsigned FPReg = getFPReg(Op);
+ if (Op.isKill())
+ moveToTop(FPReg, I);
+ else
+ duplicateToTop(FPReg, FPReg, I);
+
+ // Emit the call. This will pop the operand.
+ BuildMI(*MBB, I, MI->getDebugLoc(), TII->get(X86::CALLpcrel32))
+ .addExternalSymbol("_ftol2")
+ .addReg(X86::ST0, RegState::ImplicitKill)
+ .addReg(X86::EAX, RegState::Define | RegState::Implicit)
+ .addReg(X86::EDX, RegState::Define | RegState::Implicit)
+ .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
+ --StackTop;
+
+ break;
+ }
+
+ case X86::RET:
+ case X86::RETI:
+ // If RET has an FP register use operand, pass the first one in ST(0) and
+ // the second one in ST(1).
+
+ // Find the register operands.
+ unsigned FirstFPRegOp = ~0U, SecondFPRegOp = ~0U;
+ unsigned LiveMask = 0;
+
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ MachineOperand &Op = MI->getOperand(i);
+ if (!Op.isReg() || Op.getReg() < X86::FP0 || Op.getReg() > X86::FP6)
+ continue;
+ // FP Register uses must be kills unless there are two uses of the same
+ // register, in which case only one will be a kill.
+ assert(Op.isUse() &&
+ (Op.isKill() || // Marked kill.
+ getFPReg(Op) == FirstFPRegOp || // Second instance.
+ MI->killsRegister(Op.getReg())) && // Later use is marked kill.
+ "Ret only defs operands, and values aren't live beyond it");
+
+ if (FirstFPRegOp == ~0U)
+ FirstFPRegOp = getFPReg(Op);
+ else {
+ assert(SecondFPRegOp == ~0U && "More than two fp operands!");
+ SecondFPRegOp = getFPReg(Op);
+ }
+ LiveMask |= (1 << getFPReg(Op));
+
+ // Remove the operand so that later passes don't see it.
+ MI->RemoveOperand(i);
+ --i, --e;
+ }
+
+ // We may have been carrying spurious live-ins, so make sure only the returned
+ // registers are left live.
+ adjustLiveRegs(LiveMask, MI);
+ if (!LiveMask) return; // Quick check to see if any are possible.
+
+ // There are only four possibilities here:
+ // 1) we are returning a single FP value. In this case, it has to be in
+ // ST(0) already, so just declare success by removing the value from the
+ // FP Stack.
+ if (SecondFPRegOp == ~0U) {
+ // Assert that the top of stack contains the right FP register.
+ assert(StackTop == 1 && FirstFPRegOp == getStackEntry(0) &&
+ "Top of stack not the right register for RET!");
+
+ // Ok, everything is good, mark the value as not being on the stack
+ // anymore so that our assertion about the stack being empty at end of
+ // block doesn't fire.
+ StackTop = 0;
+ return;
+ }
+
+ // Otherwise, we are returning two values:
+ // 2) If returning the same value for both, we only have one thing in the FP
+ // stack. Consider: RET FP1, FP1
+ if (StackTop == 1) {
+ assert(FirstFPRegOp == SecondFPRegOp && FirstFPRegOp == getStackEntry(0)&&
+ "Stack misconfiguration for RET!");
+
+ // Duplicate the TOS so that we return it twice. Just pick some other FPx
+ // register to hold it.
+ unsigned NewReg = getScratchReg();
+ duplicateToTop(FirstFPRegOp, NewReg, MI);
+ FirstFPRegOp = NewReg;
+ }
+
+ /// Okay we know we have two different FPx operands now:
+ assert(StackTop == 2 && "Must have two values live!");
+
+ /// 3) If SecondFPRegOp is currently in ST(0) and FirstFPRegOp is currently
+ /// in ST(1). In this case, emit an fxch.
+ if (getStackEntry(0) == SecondFPRegOp) {
+ assert(getStackEntry(1) == FirstFPRegOp && "Unknown regs live");
+ moveToTop(FirstFPRegOp, MI);
+ }
+
+ /// 4) Finally, FirstFPRegOp must be in ST(0) and SecondFPRegOp must be in
+ /// ST(1). Just remove both from our understanding of the stack and return.
+ assert(getStackEntry(0) == FirstFPRegOp && "Unknown regs live");
+ assert(getStackEntry(1) == SecondFPRegOp && "Unknown regs live");
+ StackTop = 0;
+ return;
+ }
+
+ I = MBB->erase(I); // Remove the pseudo instruction
+
+ // We want to leave I pointing to the previous instruction, but what if we
+ // just erased the first instruction?
+ if (I == MBB->begin()) {
+ DEBUG(dbgs() << "Inserting dummy KILL\n");
+ I = BuildMI(*MBB, I, DebugLoc(), TII->get(TargetOpcode::KILL));
+ } else
+ --I;
+}
diff --git a/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp b/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp
new file mode 100644
index 0000000..42b4e73
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp
@@ -0,0 +1,1835 @@
+//===-- X86FrameLowering.cpp - X86 Frame Information ----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 implementation of TargetFrameLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86FrameLowering.h"
+#include "X86InstrBuilder.h"
+#include "X86InstrInfo.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace llvm;
+
+// FIXME: completely move here.
+extern cl::opt<bool> ForceStackAlign;
+
+bool X86FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
+ return !MF.getFrameInfo()->hasVarSizedObjects();
+}
+
+/// hasFP - Return true if the specified function should have a dedicated frame
+/// pointer register. This is true if the function has variable sized allocas
+/// or if frame pointer elimination is disabled.
+bool X86FrameLowering::hasFP(const MachineFunction &MF) const {
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ const MachineModuleInfo &MMI = MF.getMMI();
+ const TargetRegisterInfo *RegInfo = TM.getRegisterInfo();
+
+ return (MF.getTarget().Options.DisableFramePointerElim(MF) ||
+ RegInfo->needsStackRealignment(MF) ||
+ MFI->hasVarSizedObjects() ||
+ MFI->isFrameAddressTaken() || MF.hasMSInlineAsm() ||
+ MF.getInfo<X86MachineFunctionInfo>()->getForceFramePointer() ||
+ MMI.callsUnwindInit() || MMI.callsEHReturn());
+}
+
+static unsigned getSUBriOpcode(unsigned IsLP64, int64_t Imm) {
+ if (IsLP64) {
+ if (isInt<8>(Imm))
+ return X86::SUB64ri8;
+ return X86::SUB64ri32;
+ } else {
+ if (isInt<8>(Imm))
+ return X86::SUB32ri8;
+ return X86::SUB32ri;
+ }
+}
+
+static unsigned getADDriOpcode(unsigned IsLP64, int64_t Imm) {
+ if (IsLP64) {
+ if (isInt<8>(Imm))
+ return X86::ADD64ri8;
+ return X86::ADD64ri32;
+ } else {
+ if (isInt<8>(Imm))
+ return X86::ADD32ri8;
+ return X86::ADD32ri;
+ }
+}
+
+static unsigned getLEArOpcode(unsigned IsLP64) {
+ return IsLP64 ? X86::LEA64r : X86::LEA32r;
+}
+
+/// findDeadCallerSavedReg - Return a caller-saved register that isn't live
+/// when it reaches the "return" instruction. We can then pop a stack object
+/// to this register without worry about clobbering it.
+static unsigned findDeadCallerSavedReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MBBI,
+ const TargetRegisterInfo &TRI,
+ bool Is64Bit) {
+ const MachineFunction *MF = MBB.getParent();
+ const Function *F = MF->getFunction();
+ if (!F || MF->getMMI().callsEHReturn())
+ return 0;
+
+ static const uint16_t CallerSavedRegs32Bit[] = {
+ X86::EAX, X86::EDX, X86::ECX, 0
+ };
+
+ static const uint16_t CallerSavedRegs64Bit[] = {
+ X86::RAX, X86::RDX, X86::RCX, X86::RSI, X86::RDI,
+ X86::R8, X86::R9, X86::R10, X86::R11, 0
+ };
+
+ unsigned Opc = MBBI->getOpcode();
+ switch (Opc) {
+ default: return 0;
+ case X86::RET:
+ case X86::RETI:
+ case X86::TCRETURNdi:
+ case X86::TCRETURNri:
+ case X86::TCRETURNmi:
+ case X86::TCRETURNdi64:
+ case X86::TCRETURNri64:
+ case X86::TCRETURNmi64:
+ case X86::EH_RETURN:
+ case X86::EH_RETURN64: {
+ SmallSet<uint16_t, 8> Uses;
+ for (unsigned i = 0, e = MBBI->getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MBBI->getOperand(i);
+ if (!MO.isReg() || MO.isDef())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (!Reg)
+ continue;
+ for (MCRegAliasIterator AI(Reg, &TRI, true); AI.isValid(); ++AI)
+ Uses.insert(*AI);
+ }
+
+ const uint16_t *CS = Is64Bit ? CallerSavedRegs64Bit : CallerSavedRegs32Bit;
+ for (; *CS; ++CS)
+ if (!Uses.count(*CS))
+ return *CS;
+ }
+ }
+
+ return 0;
+}
+
+
+/// emitSPUpdate - Emit a series of instructions to increment / decrement the
+/// stack pointer by a constant value.
+static
+void emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
+ unsigned StackPtr, int64_t NumBytes,
+ bool Is64Bit, bool IsLP64, bool UseLEA,
+ const TargetInstrInfo &TII, const TargetRegisterInfo &TRI) {
+ bool isSub = NumBytes < 0;
+ uint64_t Offset = isSub ? -NumBytes : NumBytes;
+ unsigned Opc;
+ if (UseLEA)
+ Opc = getLEArOpcode(IsLP64);
+ else
+ Opc = isSub
+ ? getSUBriOpcode(IsLP64, Offset)
+ : getADDriOpcode(IsLP64, Offset);
+
+ uint64_t Chunk = (1LL << 31) - 1;
+ DebugLoc DL = MBB.findDebugLoc(MBBI);
+
+ while (Offset) {
+ uint64_t ThisVal = (Offset > Chunk) ? Chunk : Offset;
+ if (ThisVal == (Is64Bit ? 8 : 4)) {
+ // Use push / pop instead.
+ unsigned Reg = isSub
+ ? (unsigned)(Is64Bit ? X86::RAX : X86::EAX)
+ : findDeadCallerSavedReg(MBB, MBBI, TRI, Is64Bit);
+ if (Reg) {
+ Opc = isSub
+ ? (Is64Bit ? X86::PUSH64r : X86::PUSH32r)
+ : (Is64Bit ? X86::POP64r : X86::POP32r);
+ MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc))
+ .addReg(Reg, getDefRegState(!isSub) | getUndefRegState(isSub));
+ if (isSub)
+ MI->setFlag(MachineInstr::FrameSetup);
+ Offset -= ThisVal;
+ continue;
+ }
+ }
+
+ MachineInstr *MI = NULL;
+
+ if (UseLEA) {
+ MI = addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr),
+ StackPtr, false, isSub ? -ThisVal : ThisVal);
+ } else {
+ MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr)
+ .addReg(StackPtr)
+ .addImm(ThisVal);
+ MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.
+ }
+
+ if (isSub)
+ MI->setFlag(MachineInstr::FrameSetup);
+
+ Offset -= ThisVal;
+ }
+}
+
+/// mergeSPUpdatesUp - Merge two stack-manipulating instructions upper iterator.
+static
+void mergeSPUpdatesUp(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
+ unsigned StackPtr, uint64_t *NumBytes = NULL) {
+ if (MBBI == MBB.begin()) return;
+
+ MachineBasicBlock::iterator PI = prior(MBBI);
+ unsigned Opc = PI->getOpcode();
+ if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 ||
+ Opc == X86::ADD32ri || Opc == X86::ADD32ri8 ||
+ Opc == X86::LEA32r || Opc == X86::LEA64_32r) &&
+ PI->getOperand(0).getReg() == StackPtr) {
+ if (NumBytes)
+ *NumBytes += PI->getOperand(2).getImm();
+ MBB.erase(PI);
+ } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 ||
+ Opc == X86::SUB32ri || Opc == X86::SUB32ri8) &&
+ PI->getOperand(0).getReg() == StackPtr) {
+ if (NumBytes)
+ *NumBytes -= PI->getOperand(2).getImm();
+ MBB.erase(PI);
+ }
+}
+
+/// mergeSPUpdatesDown - Merge two stack-manipulating instructions lower iterator.
+static
+void mergeSPUpdatesDown(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MBBI,
+ unsigned StackPtr, uint64_t *NumBytes = NULL) {
+ // FIXME: THIS ISN'T RUN!!!
+ return;
+
+ if (MBBI == MBB.end()) return;
+
+ MachineBasicBlock::iterator NI = llvm::next(MBBI);
+ if (NI == MBB.end()) return;
+
+ unsigned Opc = NI->getOpcode();
+ if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 ||
+ Opc == X86::ADD32ri || Opc == X86::ADD32ri8) &&
+ NI->getOperand(0).getReg() == StackPtr) {
+ if (NumBytes)
+ *NumBytes -= NI->getOperand(2).getImm();
+ MBB.erase(NI);
+ MBBI = NI;
+ } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 ||
+ Opc == X86::SUB32ri || Opc == X86::SUB32ri8) &&
+ NI->getOperand(0).getReg() == StackPtr) {
+ if (NumBytes)
+ *NumBytes += NI->getOperand(2).getImm();
+ MBB.erase(NI);
+ MBBI = NI;
+ }
+}
+
+/// mergeSPUpdates - Checks the instruction before/after the passed
+/// instruction. If it is an ADD/SUB/LEA instruction it is deleted argument and the
+/// stack adjustment is returned as a positive value for ADD/LEA and a negative for
+/// SUB.
+static int mergeSPUpdates(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MBBI,
+ unsigned StackPtr,
+ bool doMergeWithPrevious) {
+ if ((doMergeWithPrevious && MBBI == MBB.begin()) ||
+ (!doMergeWithPrevious && MBBI == MBB.end()))
+ return 0;
+
+ MachineBasicBlock::iterator PI = doMergeWithPrevious ? prior(MBBI) : MBBI;
+ MachineBasicBlock::iterator NI = doMergeWithPrevious ? 0 : llvm::next(MBBI);
+ unsigned Opc = PI->getOpcode();
+ int Offset = 0;
+
+ if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 ||
+ Opc == X86::ADD32ri || Opc == X86::ADD32ri8 ||
+ Opc == X86::LEA32r || Opc == X86::LEA64_32r) &&
+ PI->getOperand(0).getReg() == StackPtr){
+ Offset += PI->getOperand(2).getImm();
+ MBB.erase(PI);
+ if (!doMergeWithPrevious) MBBI = NI;
+ } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 ||
+ Opc == X86::SUB32ri || Opc == X86::SUB32ri8) &&
+ PI->getOperand(0).getReg() == StackPtr) {
+ Offset -= PI->getOperand(2).getImm();
+ MBB.erase(PI);
+ if (!doMergeWithPrevious) MBBI = NI;
+ }
+
+ return Offset;
+}
+
+static bool isEAXLiveIn(MachineFunction &MF) {
+ for (MachineRegisterInfo::livein_iterator II = MF.getRegInfo().livein_begin(),
+ EE = MF.getRegInfo().livein_end(); II != EE; ++II) {
+ unsigned Reg = II->first;
+
+ if (Reg == X86::EAX || Reg == X86::AX ||
+ Reg == X86::AH || Reg == X86::AL)
+ return true;
+ }
+
+ return false;
+}
+
+void X86FrameLowering::emitCalleeSavedFrameMoves(MachineFunction &MF,
+ MCSymbol *Label,
+ unsigned FramePtr) const {
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ MachineModuleInfo &MMI = MF.getMMI();
+
+ // Add callee saved registers to move list.
+ const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
+ if (CSI.empty()) return;
+
+ std::vector<MachineMove> &Moves = MMI.getFrameMoves();
+ const X86RegisterInfo *RegInfo = TM.getRegisterInfo();
+ bool HasFP = hasFP(MF);
+
+ // Calculate amount of bytes used for return address storing.
+ int stackGrowth = -RegInfo->getSlotSize();
+
+ // FIXME: This is dirty hack. The code itself is pretty mess right now.
+ // It should be rewritten from scratch and generalized sometimes.
+
+ // Determine maximum offset (minimum due to stack growth).
+ int64_t MaxOffset = 0;
+ for (std::vector<CalleeSavedInfo>::const_iterator
+ I = CSI.begin(), E = CSI.end(); I != E; ++I)
+ MaxOffset = std::min(MaxOffset,
+ MFI->getObjectOffset(I->getFrameIdx()));
+
+ // Calculate offsets.
+ int64_t saveAreaOffset = (HasFP ? 3 : 2) * stackGrowth;
+ for (std::vector<CalleeSavedInfo>::const_iterator
+ I = CSI.begin(), E = CSI.end(); I != E; ++I) {
+ int64_t Offset = MFI->getObjectOffset(I->getFrameIdx());
+ unsigned Reg = I->getReg();
+ Offset = MaxOffset - Offset + saveAreaOffset;
+
+ // Don't output a new machine move if we're re-saving the frame
+ // pointer. This happens when the PrologEpilogInserter has inserted an extra
+ // "PUSH" of the frame pointer -- the "emitPrologue" method automatically
+ // generates one when frame pointers are used. If we generate a "machine
+ // move" for this extra "PUSH", the linker will lose track of the fact that
+ // the frame pointer should have the value of the first "PUSH" when it's
+ // trying to unwind.
+ //
+ // FIXME: This looks inelegant. It's possibly correct, but it's covering up
+ // another bug. I.e., one where we generate a prolog like this:
+ //
+ // pushl %ebp
+ // movl %esp, %ebp
+ // pushl %ebp
+ // pushl %esi
+ // ...
+ //
+ // The immediate re-push of EBP is unnecessary. At the least, it's an
+ // optimization bug. EBP can be used as a scratch register in certain
+ // cases, but probably not when we have a frame pointer.
+ if (HasFP && FramePtr == Reg)
+ continue;
+
+ MachineLocation CSDst(MachineLocation::VirtualFP, Offset);
+ MachineLocation CSSrc(Reg);
+ Moves.push_back(MachineMove(Label, CSDst, CSSrc));
+ }
+}
+
+/// getCompactUnwindRegNum - Get the compact unwind number for a given
+/// register. The number corresponds to the enum lists in
+/// compact_unwind_encoding.h.
+static int getCompactUnwindRegNum(unsigned Reg, bool is64Bit) {
+ static const uint16_t CU32BitRegs[] = {
+ X86::EBX, X86::ECX, X86::EDX, X86::EDI, X86::ESI, X86::EBP, 0
+ };
+ static const uint16_t CU64BitRegs[] = {
+ X86::RBX, X86::R12, X86::R13, X86::R14, X86::R15, X86::RBP, 0
+ };
+ const uint16_t *CURegs = is64Bit ? CU64BitRegs : CU32BitRegs;
+ for (int Idx = 1; *CURegs; ++CURegs, ++Idx)
+ if (*CURegs == Reg)
+ return Idx;
+
+ return -1;
+}
+
+// Number of registers that can be saved in a compact unwind encoding.
+#define CU_NUM_SAVED_REGS 6
+
+/// encodeCompactUnwindRegistersWithoutFrame - Create the permutation encoding
+/// used with frameless stacks. It is passed the number of registers to be saved
+/// and an array of the registers saved.
+static uint32_t
+encodeCompactUnwindRegistersWithoutFrame(unsigned SavedRegs[CU_NUM_SAVED_REGS],
+ unsigned RegCount, bool Is64Bit) {
+ // The saved registers are numbered from 1 to 6. In order to encode the order
+ // in which they were saved, we re-number them according to their place in the
+ // register order. The re-numbering is relative to the last re-numbered
+ // register. E.g., if we have registers {6, 2, 4, 5} saved in that order:
+ //
+ // Orig Re-Num
+ // ---- ------
+ // 6 6
+ // 2 2
+ // 4 3
+ // 5 3
+ //
+ for (unsigned i = 0; i != CU_NUM_SAVED_REGS; ++i) {
+ int CUReg = getCompactUnwindRegNum(SavedRegs[i], Is64Bit);
+ if (CUReg == -1) return ~0U;
+ SavedRegs[i] = CUReg;
+ }
+
+ // Reverse the list.
+ std::swap(SavedRegs[0], SavedRegs[5]);
+ std::swap(SavedRegs[1], SavedRegs[4]);
+ std::swap(SavedRegs[2], SavedRegs[3]);
+
+ uint32_t RenumRegs[CU_NUM_SAVED_REGS];
+ for (unsigned i = CU_NUM_SAVED_REGS - RegCount; i < CU_NUM_SAVED_REGS; ++i) {
+ unsigned Countless = 0;
+ for (unsigned j = CU_NUM_SAVED_REGS - RegCount; j < i; ++j)
+ if (SavedRegs[j] < SavedRegs[i])
+ ++Countless;
+
+ RenumRegs[i] = SavedRegs[i] - Countless - 1;
+ }
+
+ // Take the renumbered values and encode them into a 10-bit number.
+ uint32_t permutationEncoding = 0;
+ switch (RegCount) {
+ case 6:
+ permutationEncoding |= 120 * RenumRegs[0] + 24 * RenumRegs[1]
+ + 6 * RenumRegs[2] + 2 * RenumRegs[3]
+ + RenumRegs[4];
+ break;
+ case 5:
+ permutationEncoding |= 120 * RenumRegs[1] + 24 * RenumRegs[2]
+ + 6 * RenumRegs[3] + 2 * RenumRegs[4]
+ + RenumRegs[5];
+ break;
+ case 4:
+ permutationEncoding |= 60 * RenumRegs[2] + 12 * RenumRegs[3]
+ + 3 * RenumRegs[4] + RenumRegs[5];
+ break;
+ case 3:
+ permutationEncoding |= 20 * RenumRegs[3] + 4 * RenumRegs[4]
+ + RenumRegs[5];
+ break;
+ case 2:
+ permutationEncoding |= 5 * RenumRegs[4] + RenumRegs[5];
+ break;
+ case 1:
+ permutationEncoding |= RenumRegs[5];
+ break;
+ }
+
+ assert((permutationEncoding & 0x3FF) == permutationEncoding &&
+ "Invalid compact register encoding!");
+ return permutationEncoding;
+}
+
+/// encodeCompactUnwindRegistersWithFrame - Return the registers encoded for a
+/// compact encoding with a frame pointer.
+static uint32_t
+encodeCompactUnwindRegistersWithFrame(unsigned SavedRegs[CU_NUM_SAVED_REGS],
+ bool Is64Bit) {
+ // Encode the registers in the order they were saved, 3-bits per register. The
+ // registers are numbered from 1 to CU_NUM_SAVED_REGS.
+ uint32_t RegEnc = 0;
+ for (int I = CU_NUM_SAVED_REGS - 1, Idx = 0; I != -1; --I) {
+ unsigned Reg = SavedRegs[I];
+ if (Reg == 0) continue;
+
+ int CURegNum = getCompactUnwindRegNum(Reg, Is64Bit);
+ if (CURegNum == -1) return ~0U;
+
+ // Encode the 3-bit register number in order, skipping over 3-bits for each
+ // register.
+ RegEnc |= (CURegNum & 0x7) << (Idx++ * 3);
+ }
+
+ assert((RegEnc & 0x3FFFF) == RegEnc && "Invalid compact register encoding!");
+ return RegEnc;
+}
+
+uint32_t X86FrameLowering::getCompactUnwindEncoding(MachineFunction &MF) const {
+ const X86RegisterInfo *RegInfo = TM.getRegisterInfo();
+ unsigned FramePtr = RegInfo->getFrameRegister(MF);
+ unsigned StackPtr = RegInfo->getStackRegister();
+
+ bool Is64Bit = STI.is64Bit();
+ bool HasFP = hasFP(MF);
+
+ unsigned SavedRegs[CU_NUM_SAVED_REGS] = { 0, 0, 0, 0, 0, 0 };
+ unsigned SavedRegIdx = 0;
+
+ unsigned OffsetSize = (Is64Bit ? 8 : 4);
+
+ unsigned PushInstr = (Is64Bit ? X86::PUSH64r : X86::PUSH32r);
+ unsigned PushInstrSize = 1;
+ unsigned MoveInstr = (Is64Bit ? X86::MOV64rr : X86::MOV32rr);
+ unsigned MoveInstrSize = (Is64Bit ? 3 : 2);
+ unsigned SubtractInstrIdx = (Is64Bit ? 3 : 2);
+
+ unsigned StackDivide = (Is64Bit ? 8 : 4);
+
+ unsigned InstrOffset = 0;
+ unsigned StackAdjust = 0;
+ unsigned StackSize = 0;
+
+ MachineBasicBlock &MBB = MF.front(); // Prologue is in entry BB.
+ bool ExpectEnd = false;
+ for (MachineBasicBlock::iterator
+ MBBI = MBB.begin(), MBBE = MBB.end(); MBBI != MBBE; ++MBBI) {
+ MachineInstr &MI = *MBBI;
+ unsigned Opc = MI.getOpcode();
+ if (Opc == X86::PROLOG_LABEL) continue;
+ if (!MI.getFlag(MachineInstr::FrameSetup)) break;
+
+ // We don't exect any more prolog instructions.
+ if (ExpectEnd) return CU::UNWIND_MODE_DWARF;
+
+ if (Opc == PushInstr) {
+ // If there are too many saved registers, we cannot use compact encoding.
+ if (SavedRegIdx >= CU_NUM_SAVED_REGS) return CU::UNWIND_MODE_DWARF;
+
+ unsigned Reg = MI.getOperand(0).getReg();
+ if (Reg == (Is64Bit ? X86::RAX : X86::EAX)) {
+ ExpectEnd = true;
+ continue;
+ }
+
+ SavedRegs[SavedRegIdx++] = MI.getOperand(0).getReg();
+ StackAdjust += OffsetSize;
+ InstrOffset += PushInstrSize;
+ } else if (Opc == MoveInstr) {
+ unsigned SrcReg = MI.getOperand(1).getReg();
+ unsigned DstReg = MI.getOperand(0).getReg();
+
+ if (DstReg != FramePtr || SrcReg != StackPtr)
+ return CU::UNWIND_MODE_DWARF;
+
+ StackAdjust = 0;
+ memset(SavedRegs, 0, sizeof(SavedRegs));
+ SavedRegIdx = 0;
+ InstrOffset += MoveInstrSize;
+ } else if (Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 ||
+ Opc == X86::SUB32ri || Opc == X86::SUB32ri8) {
+ if (StackSize)
+ // We already have a stack size.
+ return CU::UNWIND_MODE_DWARF;
+
+ if (!MI.getOperand(0).isReg() ||
+ MI.getOperand(0).getReg() != MI.getOperand(1).getReg() ||
+ MI.getOperand(0).getReg() != StackPtr || !MI.getOperand(2).isImm())
+ // We need this to be a stack adjustment pointer. Something like:
+ //
+ // %RSP<def> = SUB64ri8 %RSP, 48
+ return CU::UNWIND_MODE_DWARF;
+
+ StackSize = MI.getOperand(2).getImm() / StackDivide;
+ SubtractInstrIdx += InstrOffset;
+ ExpectEnd = true;
+ }
+ }
+
+ // Encode that we are using EBP/RBP as the frame pointer.
+ uint32_t CompactUnwindEncoding = 0;
+ StackAdjust /= StackDivide;
+ if (HasFP) {
+ if ((StackAdjust & 0xFF) != StackAdjust)
+ // Offset was too big for compact encoding.
+ return CU::UNWIND_MODE_DWARF;
+
+ // Get the encoding of the saved registers when we have a frame pointer.
+ uint32_t RegEnc = encodeCompactUnwindRegistersWithFrame(SavedRegs, Is64Bit);
+ if (RegEnc == ~0U) return CU::UNWIND_MODE_DWARF;
+
+ CompactUnwindEncoding |= CU::UNWIND_MODE_BP_FRAME;
+ CompactUnwindEncoding |= (StackAdjust & 0xFF) << 16;
+ CompactUnwindEncoding |= RegEnc & CU::UNWIND_BP_FRAME_REGISTERS;
+ } else {
+ ++StackAdjust;
+ uint32_t TotalStackSize = StackAdjust + StackSize;
+ if ((TotalStackSize & 0xFF) == TotalStackSize) {
+ // Frameless stack with a small stack size.
+ CompactUnwindEncoding |= CU::UNWIND_MODE_STACK_IMMD;
+
+ // Encode the stack size.
+ CompactUnwindEncoding |= (TotalStackSize & 0xFF) << 16;
+ } else {
+ if ((StackAdjust & 0x7) != StackAdjust)
+ // The extra stack adjustments are too big for us to handle.
+ return CU::UNWIND_MODE_DWARF;
+
+ // Frameless stack with an offset too large for us to encode compactly.
+ CompactUnwindEncoding |= CU::UNWIND_MODE_STACK_IND;
+
+ // Encode the offset to the nnnnnn value in the 'subl $nnnnnn, ESP'
+ // instruction.
+ CompactUnwindEncoding |= (SubtractInstrIdx & 0xFF) << 16;
+
+ // Encode any extra stack stack adjustments (done via push instructions).
+ CompactUnwindEncoding |= (StackAdjust & 0x7) << 13;
+ }
+
+ // Encode the number of registers saved.
+ CompactUnwindEncoding |= (SavedRegIdx & 0x7) << 10;
+
+ // Get the encoding of the saved registers when we don't have a frame
+ // pointer.
+ uint32_t RegEnc =
+ encodeCompactUnwindRegistersWithoutFrame(SavedRegs, SavedRegIdx,
+ Is64Bit);
+ if (RegEnc == ~0U) return CU::UNWIND_MODE_DWARF;
+
+ // Encode the register encoding.
+ CompactUnwindEncoding |=
+ RegEnc & CU::UNWIND_FRAMELESS_STACK_REG_PERMUTATION;
+ }
+
+ return CompactUnwindEncoding;
+}
+
+/// usesTheStack - This function checks if any of the users of EFLAGS
+/// copies the EFLAGS. We know that the code that lowers COPY of EFLAGS has
+/// to use the stack, and if we don't adjust the stack we clobber the first
+/// frame index.
+/// See X86InstrInfo::copyPhysReg.
+static bool usesTheStack(MachineFunction &MF) {
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ for (MachineRegisterInfo::reg_iterator ri = MRI.reg_begin(X86::EFLAGS),
+ re = MRI.reg_end(); ri != re; ++ri)
+ if (ri->isCopy())
+ return true;
+
+ return false;
+}
+
+/// emitPrologue - Push callee-saved registers onto the stack, which
+/// automatically adjust the stack pointer. Adjust the stack pointer to allocate
+/// space for local variables. Also emit labels used by the exception handler to
+/// generate the exception handling frames.
+void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
+ MachineBasicBlock &MBB = MF.front(); // Prologue goes in entry BB.
+ MachineBasicBlock::iterator MBBI = MBB.begin();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ const Function *Fn = MF.getFunction();
+ const X86RegisterInfo *RegInfo = TM.getRegisterInfo();
+ const X86InstrInfo &TII = *TM.getInstrInfo();
+ MachineModuleInfo &MMI = MF.getMMI();
+ X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+ bool needsFrameMoves = MMI.hasDebugInfo() ||
+ Fn->needsUnwindTableEntry();
+ uint64_t MaxAlign = MFI->getMaxAlignment(); // Desired stack alignment.
+ uint64_t StackSize = MFI->getStackSize(); // Number of bytes to allocate.
+ bool HasFP = hasFP(MF);
+ bool Is64Bit = STI.is64Bit();
+ bool IsLP64 = STI.isTarget64BitLP64();
+ bool IsWin64 = STI.isTargetWin64();
+ bool UseLEA = STI.useLeaForSP();
+ unsigned StackAlign = getStackAlignment();
+ unsigned SlotSize = RegInfo->getSlotSize();
+ unsigned FramePtr = RegInfo->getFrameRegister(MF);
+ unsigned StackPtr = RegInfo->getStackRegister();
+ unsigned BasePtr = RegInfo->getBaseRegister();
+ DebugLoc DL;
+
+ // If we're forcing a stack realignment we can't rely on just the frame
+ // info, we need to know the ABI stack alignment as well in case we
+ // have a call out. Otherwise just make sure we have some alignment - we'll
+ // go with the minimum SlotSize.
+ if (ForceStackAlign) {
+ if (MFI->hasCalls())
+ MaxAlign = (StackAlign > MaxAlign) ? StackAlign : MaxAlign;
+ else if (MaxAlign < SlotSize)
+ MaxAlign = SlotSize;
+ }
+
+ // Add RETADDR move area to callee saved frame size.
+ int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
+ if (TailCallReturnAddrDelta < 0)
+ X86FI->setCalleeSavedFrameSize(
+ X86FI->getCalleeSavedFrameSize() - TailCallReturnAddrDelta);
+
+ // If this is x86-64 and the Red Zone is not disabled, if we are a leaf
+ // function, and use up to 128 bytes of stack space, don't have a frame
+ // pointer, calls, or dynamic alloca then we do not need to adjust the
+ // stack pointer (we fit in the Red Zone). We also check that we don't
+ // push and pop from the stack.
+ if (Is64Bit && !Fn->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
+ Attribute::NoRedZone) &&
+ !RegInfo->needsStackRealignment(MF) &&
+ !MFI->hasVarSizedObjects() && // No dynamic alloca.
+ !MFI->adjustsStack() && // No calls.
+ !IsWin64 && // Win64 has no Red Zone
+ !usesTheStack(MF) && // Don't push and pop.
+ !MF.getTarget().Options.EnableSegmentedStacks) { // Regular stack
+ uint64_t MinSize = X86FI->getCalleeSavedFrameSize();
+ if (HasFP) MinSize += SlotSize;
+ StackSize = std::max(MinSize, StackSize > 128 ? StackSize - 128 : 0);
+ MFI->setStackSize(StackSize);
+ }
+
+ // Insert stack pointer adjustment for later moving of return addr. Only
+ // applies to tail call optimized functions where the callee argument stack
+ // size is bigger than the callers.
+ if (TailCallReturnAddrDelta < 0) {
+ MachineInstr *MI =
+ BuildMI(MBB, MBBI, DL,
+ TII.get(getSUBriOpcode(IsLP64, -TailCallReturnAddrDelta)),
+ StackPtr)
+ .addReg(StackPtr)
+ .addImm(-TailCallReturnAddrDelta)
+ .setMIFlag(MachineInstr::FrameSetup);
+ MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.
+ }
+
+ // Mapping for machine moves:
+ //
+ // DST: VirtualFP AND
+ // SRC: VirtualFP => DW_CFA_def_cfa_offset
+ // ELSE => DW_CFA_def_cfa
+ //
+ // SRC: VirtualFP AND
+ // DST: Register => DW_CFA_def_cfa_register
+ //
+ // ELSE
+ // OFFSET < 0 => DW_CFA_offset_extended_sf
+ // REG < 64 => DW_CFA_offset + Reg
+ // ELSE => DW_CFA_offset_extended
+
+ std::vector<MachineMove> &Moves = MMI.getFrameMoves();
+ uint64_t NumBytes = 0;
+ int stackGrowth = -SlotSize;
+
+ if (HasFP) {
+ // Calculate required stack adjustment.
+ uint64_t FrameSize = StackSize - SlotSize;
+ if (RegInfo->needsStackRealignment(MF)) {
+ // Callee-saved registers are pushed on stack before the stack
+ // is realigned.
+ FrameSize -= X86FI->getCalleeSavedFrameSize();
+ NumBytes = (FrameSize + MaxAlign - 1) / MaxAlign * MaxAlign;
+ } else {
+ NumBytes = FrameSize - X86FI->getCalleeSavedFrameSize();
+ }
+
+ // Get the offset of the stack slot for the EBP register, which is
+ // guaranteed to be the last slot by processFunctionBeforeFrameFinalized.
+ // Update the frame offset adjustment.
+ MFI->setOffsetAdjustment(-NumBytes);
+
+ // Save EBP/RBP into the appropriate stack slot.
+ BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::PUSH64r : X86::PUSH32r))
+ .addReg(FramePtr, RegState::Kill)
+ .setMIFlag(MachineInstr::FrameSetup);
+
+ if (needsFrameMoves) {
+ // Mark the place where EBP/RBP was saved.
+ MCSymbol *FrameLabel = MMI.getContext().CreateTempSymbol();
+ BuildMI(MBB, MBBI, DL, TII.get(X86::PROLOG_LABEL))
+ .addSym(FrameLabel);
+
+ // Define the current CFA rule to use the provided offset.
+ if (StackSize) {
+ MachineLocation SPDst(MachineLocation::VirtualFP);
+ MachineLocation SPSrc(MachineLocation::VirtualFP, 2 * stackGrowth);
+ Moves.push_back(MachineMove(FrameLabel, SPDst, SPSrc));
+ } else {
+ MachineLocation SPDst(StackPtr);
+ MachineLocation SPSrc(StackPtr, stackGrowth);
+ Moves.push_back(MachineMove(FrameLabel, SPDst, SPSrc));
+ }
+
+ // Change the rule for the FramePtr to be an "offset" rule.
+ MachineLocation FPDst(MachineLocation::VirtualFP, 2 * stackGrowth);
+ MachineLocation FPSrc(FramePtr);
+ Moves.push_back(MachineMove(FrameLabel, FPDst, FPSrc));
+ }
+
+ // Update EBP with the new base value.
+ BuildMI(MBB, MBBI, DL,
+ TII.get(Is64Bit ? X86::MOV64rr : X86::MOV32rr), FramePtr)
+ .addReg(StackPtr)
+ .setMIFlag(MachineInstr::FrameSetup);
+
+ if (needsFrameMoves) {
+ // Mark effective beginning of when frame pointer becomes valid.
+ MCSymbol *FrameLabel = MMI.getContext().CreateTempSymbol();
+ BuildMI(MBB, MBBI, DL, TII.get(X86::PROLOG_LABEL))
+ .addSym(FrameLabel);
+
+ // Define the current CFA to use the EBP/RBP register.
+ MachineLocation FPDst(FramePtr);
+ MachineLocation FPSrc(MachineLocation::VirtualFP);
+ Moves.push_back(MachineMove(FrameLabel, FPDst, FPSrc));
+ }
+
+ // Mark the FramePtr as live-in in every block except the entry.
+ for (MachineFunction::iterator I = llvm::next(MF.begin()), E = MF.end();
+ I != E; ++I)
+ I->addLiveIn(FramePtr);
+ } else {
+ NumBytes = StackSize - X86FI->getCalleeSavedFrameSize();
+ }
+
+ // Skip the callee-saved push instructions.
+ bool PushedRegs = false;
+ int StackOffset = 2 * stackGrowth;
+
+ while (MBBI != MBB.end() &&
+ (MBBI->getOpcode() == X86::PUSH32r ||
+ MBBI->getOpcode() == X86::PUSH64r)) {
+ PushedRegs = true;
+ MBBI->setFlag(MachineInstr::FrameSetup);
+ ++MBBI;
+
+ if (!HasFP && needsFrameMoves) {
+ // Mark callee-saved push instruction.
+ MCSymbol *Label = MMI.getContext().CreateTempSymbol();
+ BuildMI(MBB, MBBI, DL, TII.get(X86::PROLOG_LABEL)).addSym(Label);
+
+ // Define the current CFA rule to use the provided offset.
+ unsigned Ptr = StackSize ? MachineLocation::VirtualFP : StackPtr;
+ MachineLocation SPDst(Ptr);
+ MachineLocation SPSrc(Ptr, StackOffset);
+ Moves.push_back(MachineMove(Label, SPDst, SPSrc));
+ StackOffset += stackGrowth;
+ }
+ }
+
+ // Realign stack after we pushed callee-saved registers (so that we'll be
+ // able to calculate their offsets from the frame pointer).
+
+ // NOTE: We push the registers before realigning the stack, so
+ // vector callee-saved (xmm) registers may be saved w/o proper
+ // alignment in this way. However, currently these regs are saved in
+ // stack slots (see X86FrameLowering::spillCalleeSavedRegisters()), so
+ // this shouldn't be a problem.
+ if (RegInfo->needsStackRealignment(MF)) {
+ assert(HasFP && "There should be a frame pointer if stack is realigned.");
+ MachineInstr *MI =
+ BuildMI(MBB, MBBI, DL,
+ TII.get(Is64Bit ? X86::AND64ri32 : X86::AND32ri), StackPtr)
+ .addReg(StackPtr)
+ .addImm(-MaxAlign)
+ .setMIFlag(MachineInstr::FrameSetup);
+
+ // The EFLAGS implicit def is dead.
+ MI->getOperand(3).setIsDead();
+ }
+
+ // If there is an SUB32ri of ESP immediately before this instruction, merge
+ // the two. This can be the case when tail call elimination is enabled and
+ // the callee has more arguments then the caller.
+ NumBytes -= mergeSPUpdates(MBB, MBBI, StackPtr, true);
+
+ // If there is an ADD32ri or SUB32ri of ESP immediately after this
+ // instruction, merge the two instructions.
+ mergeSPUpdatesDown(MBB, MBBI, StackPtr, &NumBytes);
+
+ // Adjust stack pointer: ESP -= numbytes.
+
+ // Windows and cygwin/mingw require a prologue helper routine when allocating
+ // more than 4K bytes on the stack. Windows uses __chkstk and cygwin/mingw
+ // uses __alloca. __alloca and the 32-bit version of __chkstk will probe the
+ // stack and adjust the stack pointer in one go. The 64-bit version of
+ // __chkstk is only responsible for probing the stack. The 64-bit prologue is
+ // responsible for adjusting the stack pointer. Touching the stack at 4K
+ // increments is necessary to ensure that the guard pages used by the OS
+ // virtual memory manager are allocated in correct sequence.
+ if (NumBytes >= 4096 && STI.isTargetCOFF() && !STI.isTargetEnvMacho()) {
+ const char *StackProbeSymbol;
+ bool isSPUpdateNeeded = false;
+
+ if (Is64Bit) {
+ if (STI.isTargetCygMing())
+ StackProbeSymbol = "___chkstk";
+ else {
+ StackProbeSymbol = "__chkstk";
+ isSPUpdateNeeded = true;
+ }
+ } else if (STI.isTargetCygMing())
+ StackProbeSymbol = "_alloca";
+ else
+ StackProbeSymbol = "_chkstk";
+
+ // Check whether EAX is livein for this function.
+ bool isEAXAlive = isEAXLiveIn(MF);
+
+ if (isEAXAlive) {
+ // Sanity check that EAX is not livein for this function.
+ // It should not be, so throw an assert.
+ assert(!Is64Bit && "EAX is livein in x64 case!");
+
+ // Save EAX
+ BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH32r))
+ .addReg(X86::EAX, RegState::Kill)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+
+ if (Is64Bit) {
+ // Handle the 64-bit Windows ABI case where we need to call __chkstk.
+ // Function prologue is responsible for adjusting the stack pointer.
+ BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri), X86::RAX)
+ .addImm(NumBytes)
+ .setMIFlag(MachineInstr::FrameSetup);
+ } else {
+ // Allocate NumBytes-4 bytes on stack in case of isEAXAlive.
+ // We'll also use 4 already allocated bytes for EAX.
+ BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX)
+ .addImm(isEAXAlive ? NumBytes - 4 : NumBytes)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+
+ BuildMI(MBB, MBBI, DL,
+ TII.get(Is64Bit ? X86::W64ALLOCA : X86::CALLpcrel32))
+ .addExternalSymbol(StackProbeSymbol)
+ .addReg(StackPtr, RegState::Define | RegState::Implicit)
+ .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit)
+ .setMIFlag(MachineInstr::FrameSetup);
+
+ // MSVC x64's __chkstk needs to adjust %rsp.
+ // FIXME: %rax preserves the offset and should be available.
+ if (isSPUpdateNeeded)
+ emitSPUpdate(MBB, MBBI, StackPtr, -(int64_t)NumBytes, Is64Bit, IsLP64,
+ UseLEA, TII, *RegInfo);
+
+ if (isEAXAlive) {
+ // Restore EAX
+ MachineInstr *MI = addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm),
+ X86::EAX),
+ StackPtr, false, NumBytes - 4);
+ MI->setFlag(MachineInstr::FrameSetup);
+ MBB.insert(MBBI, MI);
+ }
+ } else if (NumBytes)
+ emitSPUpdate(MBB, MBBI, StackPtr, -(int64_t)NumBytes, Is64Bit, IsLP64,
+ UseLEA, TII, *RegInfo);
+
+ // If we need a base pointer, set it up here. It's whatever the value
+ // of the stack pointer is at this point. Any variable size objects
+ // will be allocated after this, so we can still use the base pointer
+ // to reference locals.
+ if (RegInfo->hasBasePointer(MF)) {
+ // Update the frame pointer with the current stack pointer.
+ unsigned Opc = Is64Bit ? X86::MOV64rr : X86::MOV32rr;
+ BuildMI(MBB, MBBI, DL, TII.get(Opc), BasePtr)
+ .addReg(StackPtr)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+
+ if (( (!HasFP && NumBytes) || PushedRegs) && needsFrameMoves) {
+ // Mark end of stack pointer adjustment.
+ MCSymbol *Label = MMI.getContext().CreateTempSymbol();
+ BuildMI(MBB, MBBI, DL, TII.get(X86::PROLOG_LABEL))
+ .addSym(Label);
+
+ if (!HasFP && NumBytes) {
+ // Define the current CFA rule to use the provided offset.
+ if (StackSize) {
+ MachineLocation SPDst(MachineLocation::VirtualFP);
+ MachineLocation SPSrc(MachineLocation::VirtualFP,
+ -StackSize + stackGrowth);
+ Moves.push_back(MachineMove(Label, SPDst, SPSrc));
+ } else {
+ MachineLocation SPDst(StackPtr);
+ MachineLocation SPSrc(StackPtr, stackGrowth);
+ Moves.push_back(MachineMove(Label, SPDst, SPSrc));
+ }
+ }
+
+ // Emit DWARF info specifying the offsets of the callee-saved registers.
+ if (PushedRegs)
+ emitCalleeSavedFrameMoves(MF, Label, HasFP ? FramePtr : StackPtr);
+ }
+
+ // Darwin 10.7 and greater has support for compact unwind encoding.
+ if (STI.getTargetTriple().isMacOSX() &&
+ !STI.getTargetTriple().isMacOSXVersionLT(10, 7))
+ MMI.setCompactUnwindEncoding(getCompactUnwindEncoding(MF));
+}
+
+void X86FrameLowering::emitEpilogue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const {
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+ const X86RegisterInfo *RegInfo = TM.getRegisterInfo();
+ const X86InstrInfo &TII = *TM.getInstrInfo();
+ MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
+ assert(MBBI != MBB.end() && "Returning block has no instructions");
+ unsigned RetOpcode = MBBI->getOpcode();
+ DebugLoc DL = MBBI->getDebugLoc();
+ bool Is64Bit = STI.is64Bit();
+ bool IsLP64 = STI.isTarget64BitLP64();
+ bool UseLEA = STI.useLeaForSP();
+ unsigned StackAlign = getStackAlignment();
+ unsigned SlotSize = RegInfo->getSlotSize();
+ unsigned FramePtr = RegInfo->getFrameRegister(MF);
+ unsigned StackPtr = RegInfo->getStackRegister();
+
+ switch (RetOpcode) {
+ default:
+ llvm_unreachable("Can only insert epilog into returning blocks");
+ case X86::RET:
+ case X86::RETI:
+ case X86::TCRETURNdi:
+ case X86::TCRETURNri:
+ case X86::TCRETURNmi:
+ case X86::TCRETURNdi64:
+ case X86::TCRETURNri64:
+ case X86::TCRETURNmi64:
+ case X86::EH_RETURN:
+ case X86::EH_RETURN64:
+ break; // These are ok
+ }
+
+ // Get the number of bytes to allocate from the FrameInfo.
+ uint64_t StackSize = MFI->getStackSize();
+ uint64_t MaxAlign = MFI->getMaxAlignment();
+ unsigned CSSize = X86FI->getCalleeSavedFrameSize();
+ uint64_t NumBytes = 0;
+
+ // If we're forcing a stack realignment we can't rely on just the frame
+ // info, we need to know the ABI stack alignment as well in case we
+ // have a call out. Otherwise just make sure we have some alignment - we'll
+ // go with the minimum.
+ if (ForceStackAlign) {
+ if (MFI->hasCalls())
+ MaxAlign = (StackAlign > MaxAlign) ? StackAlign : MaxAlign;
+ else
+ MaxAlign = MaxAlign ? MaxAlign : 4;
+ }
+
+ if (hasFP(MF)) {
+ // Calculate required stack adjustment.
+ uint64_t FrameSize = StackSize - SlotSize;
+ if (RegInfo->needsStackRealignment(MF)) {
+ // Callee-saved registers were pushed on stack before the stack
+ // was realigned.
+ FrameSize -= CSSize;
+ NumBytes = (FrameSize + MaxAlign - 1) / MaxAlign * MaxAlign;
+ } else {
+ NumBytes = FrameSize - CSSize;
+ }
+
+ // Pop EBP.
+ BuildMI(MBB, MBBI, DL,
+ TII.get(Is64Bit ? X86::POP64r : X86::POP32r), FramePtr);
+ } else {
+ NumBytes = StackSize - CSSize;
+ }
+
+ // Skip the callee-saved pop instructions.
+ while (MBBI != MBB.begin()) {
+ MachineBasicBlock::iterator PI = prior(MBBI);
+ unsigned Opc = PI->getOpcode();
+
+ if (Opc != X86::POP32r && Opc != X86::POP64r && Opc != X86::DBG_VALUE &&
+ !PI->isTerminator())
+ break;
+
+ --MBBI;
+ }
+ MachineBasicBlock::iterator FirstCSPop = MBBI;
+
+ DL = MBBI->getDebugLoc();
+
+ // If there is an ADD32ri or SUB32ri of ESP immediately before this
+ // instruction, merge the two instructions.
+ if (NumBytes || MFI->hasVarSizedObjects())
+ mergeSPUpdatesUp(MBB, MBBI, StackPtr, &NumBytes);
+
+ // If dynamic alloca is used, then reset esp to point to the last callee-saved
+ // slot before popping them off! Same applies for the case, when stack was
+ // realigned.
+ if (RegInfo->needsStackRealignment(MF) || MFI->hasVarSizedObjects()) {
+ if (RegInfo->needsStackRealignment(MF))
+ MBBI = FirstCSPop;
+ if (CSSize != 0) {
+ unsigned Opc = getLEArOpcode(IsLP64);
+ addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr),
+ FramePtr, false, -CSSize);
+ } else {
+ unsigned Opc = (Is64Bit ? X86::MOV64rr : X86::MOV32rr);
+ BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr)
+ .addReg(FramePtr);
+ }
+ } else if (NumBytes) {
+ // Adjust stack pointer back: ESP += numbytes.
+ emitSPUpdate(MBB, MBBI, StackPtr, NumBytes, Is64Bit, IsLP64, UseLEA,
+ TII, *RegInfo);
+ }
+
+ // We're returning from function via eh_return.
+ if (RetOpcode == X86::EH_RETURN || RetOpcode == X86::EH_RETURN64) {
+ MBBI = MBB.getLastNonDebugInstr();
+ MachineOperand &DestAddr = MBBI->getOperand(0);
+ assert(DestAddr.isReg() && "Offset should be in register!");
+ BuildMI(MBB, MBBI, DL,
+ TII.get(Is64Bit ? X86::MOV64rr : X86::MOV32rr),
+ StackPtr).addReg(DestAddr.getReg());
+ } else if (RetOpcode == X86::TCRETURNri || RetOpcode == X86::TCRETURNdi ||
+ RetOpcode == X86::TCRETURNmi ||
+ RetOpcode == X86::TCRETURNri64 || RetOpcode == X86::TCRETURNdi64 ||
+ RetOpcode == X86::TCRETURNmi64) {
+ bool isMem = RetOpcode == X86::TCRETURNmi || RetOpcode == X86::TCRETURNmi64;
+ // Tail call return: adjust the stack pointer and jump to callee.
+ MBBI = MBB.getLastNonDebugInstr();
+ MachineOperand &JumpTarget = MBBI->getOperand(0);
+ MachineOperand &StackAdjust = MBBI->getOperand(isMem ? 5 : 1);
+ assert(StackAdjust.isImm() && "Expecting immediate value.");
+
+ // Adjust stack pointer.
+ int StackAdj = StackAdjust.getImm();
+ int MaxTCDelta = X86FI->getTCReturnAddrDelta();
+ int Offset = 0;
+ assert(MaxTCDelta <= 0 && "MaxTCDelta should never be positive");
+
+ // Incoporate the retaddr area.
+ Offset = StackAdj-MaxTCDelta;
+ assert(Offset >= 0 && "Offset should never be negative");
+
+ if (Offset) {
+ // Check for possible merge with preceding ADD instruction.
+ Offset += mergeSPUpdates(MBB, MBBI, StackPtr, true);
+ emitSPUpdate(MBB, MBBI, StackPtr, Offset, Is64Bit, IsLP64,
+ UseLEA, TII, *RegInfo);
+ }
+
+ // Jump to label or value in register.
+ if (RetOpcode == X86::TCRETURNdi || RetOpcode == X86::TCRETURNdi64) {
+ MachineInstrBuilder MIB =
+ BuildMI(MBB, MBBI, DL, TII.get((RetOpcode == X86::TCRETURNdi)
+ ? X86::TAILJMPd : X86::TAILJMPd64));
+ if (JumpTarget.isGlobal())
+ MIB.addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset(),
+ JumpTarget.getTargetFlags());
+ else {
+ assert(JumpTarget.isSymbol());
+ MIB.addExternalSymbol(JumpTarget.getSymbolName(),
+ JumpTarget.getTargetFlags());
+ }
+ } else if (RetOpcode == X86::TCRETURNmi || RetOpcode == X86::TCRETURNmi64) {
+ MachineInstrBuilder MIB =
+ BuildMI(MBB, MBBI, DL, TII.get((RetOpcode == X86::TCRETURNmi)
+ ? X86::TAILJMPm : X86::TAILJMPm64));
+ for (unsigned i = 0; i != 5; ++i)
+ MIB.addOperand(MBBI->getOperand(i));
+ } else if (RetOpcode == X86::TCRETURNri64) {
+ BuildMI(MBB, MBBI, DL, TII.get(X86::TAILJMPr64)).
+ addReg(JumpTarget.getReg(), RegState::Kill);
+ } else {
+ BuildMI(MBB, MBBI, DL, TII.get(X86::TAILJMPr)).
+ addReg(JumpTarget.getReg(), RegState::Kill);
+ }
+
+ MachineInstr *NewMI = prior(MBBI);
+ NewMI->copyImplicitOps(MF, MBBI);
+
+ // Delete the pseudo instruction TCRETURN.
+ MBB.erase(MBBI);
+ } else if ((RetOpcode == X86::RET || RetOpcode == X86::RETI) &&
+ (X86FI->getTCReturnAddrDelta() < 0)) {
+ // Add the return addr area delta back since we are not tail calling.
+ int delta = -1*X86FI->getTCReturnAddrDelta();
+ MBBI = MBB.getLastNonDebugInstr();
+
+ // Check for possible merge with preceding ADD instruction.
+ delta += mergeSPUpdates(MBB, MBBI, StackPtr, true);
+ emitSPUpdate(MBB, MBBI, StackPtr, delta, Is64Bit, IsLP64, UseLEA, TII,
+ *RegInfo);
+ }
+}
+
+int X86FrameLowering::getFrameIndexOffset(const MachineFunction &MF, int FI) const {
+ const X86RegisterInfo *RegInfo =
+ static_cast<const X86RegisterInfo*>(MF.getTarget().getRegisterInfo());
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ int Offset = MFI->getObjectOffset(FI) - getOffsetOfLocalArea();
+ uint64_t StackSize = MFI->getStackSize();
+
+ if (RegInfo->hasBasePointer(MF)) {
+ assert (hasFP(MF) && "VLAs and dynamic stack realign, but no FP?!");
+ if (FI < 0) {
+ // Skip the saved EBP.
+ return Offset + RegInfo->getSlotSize();
+ } else {
+ assert((-(Offset + StackSize)) % MFI->getObjectAlignment(FI) == 0);
+ return Offset + StackSize;
+ }
+ } else if (RegInfo->needsStackRealignment(MF)) {
+ if (FI < 0) {
+ // Skip the saved EBP.
+ return Offset + RegInfo->getSlotSize();
+ } else {
+ assert((-(Offset + StackSize)) % MFI->getObjectAlignment(FI) == 0);
+ return Offset + StackSize;
+ }
+ // FIXME: Support tail calls
+ } else {
+ if (!hasFP(MF))
+ return Offset + StackSize;
+
+ // Skip the saved EBP.
+ Offset += RegInfo->getSlotSize();
+
+ // Skip the RETADDR move area
+ const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+ int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
+ if (TailCallReturnAddrDelta < 0)
+ Offset -= TailCallReturnAddrDelta;
+ }
+
+ return Offset;
+}
+
+int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
+ unsigned &FrameReg) const {
+ const X86RegisterInfo *RegInfo =
+ static_cast<const X86RegisterInfo*>(MF.getTarget().getRegisterInfo());
+ // We can't calculate offset from frame pointer if the stack is realigned,
+ // so enforce usage of stack/base pointer. The base pointer is used when we
+ // have dynamic allocas in addition to dynamic realignment.
+ if (RegInfo->hasBasePointer(MF))
+ FrameReg = RegInfo->getBaseRegister();
+ else if (RegInfo->needsStackRealignment(MF))
+ FrameReg = RegInfo->getStackRegister();
+ else
+ FrameReg = RegInfo->getFrameRegister(MF);
+ return getFrameIndexOffset(MF, FI);
+}
+
+bool X86FrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const {
+ if (CSI.empty())
+ return false;
+
+ DebugLoc DL = MBB.findDebugLoc(MI);
+
+ MachineFunction &MF = *MBB.getParent();
+
+ unsigned SlotSize = STI.is64Bit() ? 8 : 4;
+ unsigned FPReg = TRI->getFrameRegister(MF);
+ unsigned CalleeFrameSize = 0;
+
+ const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+ X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+
+ // Push GPRs. It increases frame size.
+ unsigned Opc = STI.is64Bit() ? X86::PUSH64r : X86::PUSH32r;
+ for (unsigned i = CSI.size(); i != 0; --i) {
+ unsigned Reg = CSI[i-1].getReg();
+ if (!X86::GR64RegClass.contains(Reg) &&
+ !X86::GR32RegClass.contains(Reg))
+ continue;
+ // Add the callee-saved register as live-in. It's killed at the spill.
+ MBB.addLiveIn(Reg);
+ if (Reg == FPReg)
+ // X86RegisterInfo::emitPrologue will handle spilling of frame register.
+ continue;
+ CalleeFrameSize += SlotSize;
+ BuildMI(MBB, MI, DL, TII.get(Opc)).addReg(Reg, RegState::Kill)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+
+ X86FI->setCalleeSavedFrameSize(CalleeFrameSize);
+
+ // Make XMM regs spilled. X86 does not have ability of push/pop XMM.
+ // It can be done by spilling XMMs to stack frame.
+ // Note that only Win64 ABI might spill XMMs.
+ for (unsigned i = CSI.size(); i != 0; --i) {
+ unsigned Reg = CSI[i-1].getReg();
+ if (X86::GR64RegClass.contains(Reg) ||
+ X86::GR32RegClass.contains(Reg))
+ continue;
+ // Add the callee-saved register as live-in. It's killed at the spill.
+ MBB.addLiveIn(Reg);
+ const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+ TII.storeRegToStackSlot(MBB, MI, Reg, true, CSI[i-1].getFrameIdx(),
+ RC, TRI);
+ }
+
+ return true;
+}
+
+bool X86FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const {
+ if (CSI.empty())
+ return false;
+
+ DebugLoc DL = MBB.findDebugLoc(MI);
+
+ MachineFunction &MF = *MBB.getParent();
+ const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+
+ // Reload XMMs from stack frame.
+ for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+ unsigned Reg = CSI[i].getReg();
+ if (X86::GR64RegClass.contains(Reg) ||
+ X86::GR32RegClass.contains(Reg))
+ continue;
+ const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+ TII.loadRegFromStackSlot(MBB, MI, Reg, CSI[i].getFrameIdx(),
+ RC, TRI);
+ }
+
+ // POP GPRs.
+ unsigned FPReg = TRI->getFrameRegister(MF);
+ unsigned Opc = STI.is64Bit() ? X86::POP64r : X86::POP32r;
+ for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+ unsigned Reg = CSI[i].getReg();
+ if (!X86::GR64RegClass.contains(Reg) &&
+ !X86::GR32RegClass.contains(Reg))
+ continue;
+ if (Reg == FPReg)
+ // X86RegisterInfo::emitEpilogue will handle restoring of frame register.
+ continue;
+ BuildMI(MBB, MI, DL, TII.get(Opc), Reg);
+ }
+ return true;
+}
+
+void
+X86FrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+ RegScavenger *RS) const {
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ const X86RegisterInfo *RegInfo = TM.getRegisterInfo();
+ unsigned SlotSize = RegInfo->getSlotSize();
+
+ X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+ int32_t TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
+
+ if (TailCallReturnAddrDelta < 0) {
+ // create RETURNADDR area
+ // arg
+ // arg
+ // RETADDR
+ // { ...
+ // RETADDR area
+ // ...
+ // }
+ // [EBP]
+ MFI->CreateFixedObject(-TailCallReturnAddrDelta,
+ (-1U*SlotSize)+TailCallReturnAddrDelta, true);
+ }
+
+ if (hasFP(MF)) {
+ assert((TailCallReturnAddrDelta <= 0) &&
+ "The Delta should always be zero or negative");
+ const TargetFrameLowering &TFI = *MF.getTarget().getFrameLowering();
+
+ // Create a frame entry for the EBP register that must be saved.
+ int FrameIdx = MFI->CreateFixedObject(SlotSize,
+ -(int)SlotSize +
+ TFI.getOffsetOfLocalArea() +
+ TailCallReturnAddrDelta,
+ true);
+ assert(FrameIdx == MFI->getObjectIndexBegin() &&
+ "Slot for EBP register must be last in order to be found!");
+ (void)FrameIdx;
+ }
+
+ // Spill the BasePtr if it's used.
+ if (RegInfo->hasBasePointer(MF))
+ MF.getRegInfo().setPhysRegUsed(RegInfo->getBaseRegister());
+}
+
+static bool
+HasNestArgument(const MachineFunction *MF) {
+ const Function *F = MF->getFunction();
+ for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
+ I != E; I++) {
+ if (I->hasNestAttr())
+ return true;
+ }
+ return false;
+}
+
+/// GetScratchRegister - Get a temp register for performing work in the
+/// segmented stack and the Erlang/HiPE stack prologue. Depending on platform
+/// and the properties of the function either one or two registers will be
+/// needed. Set primary to true for the first register, false for the second.
+static unsigned
+GetScratchRegister(bool Is64Bit, const MachineFunction &MF, bool Primary) {
+ CallingConv::ID CallingConvention = MF.getFunction()->getCallingConv();
+
+ // Erlang stuff.
+ if (CallingConvention == CallingConv::HiPE) {
+ if (Is64Bit)
+ return Primary ? X86::R14 : X86::R13;
+ else
+ return Primary ? X86::EBX : X86::EDI;
+ }
+
+ if (Is64Bit)
+ return Primary ? X86::R11 : X86::R12;
+
+ bool IsNested = HasNestArgument(&MF);
+
+ if (CallingConvention == CallingConv::X86_FastCall ||
+ CallingConvention == CallingConv::Fast) {
+ if (IsNested)
+ report_fatal_error("Segmented stacks does not support fastcall with "
+ "nested function.");
+ return Primary ? X86::EAX : X86::ECX;
+ }
+ if (IsNested)
+ return Primary ? X86::EDX : X86::EAX;
+ return Primary ? X86::ECX : X86::EAX;
+}
+
+// The stack limit in the TCB is set to this many bytes above the actual stack
+// limit.
+static const uint64_t kSplitStackAvailable = 256;
+
+void
+X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const {
+ MachineBasicBlock &prologueMBB = MF.front();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ const X86InstrInfo &TII = *TM.getInstrInfo();
+ uint64_t StackSize;
+ bool Is64Bit = STI.is64Bit();
+ unsigned TlsReg, TlsOffset;
+ DebugLoc DL;
+
+ unsigned ScratchReg = GetScratchRegister(Is64Bit, MF, true);
+ assert(!MF.getRegInfo().isLiveIn(ScratchReg) &&
+ "Scratch register is live-in");
+
+ if (MF.getFunction()->isVarArg())
+ report_fatal_error("Segmented stacks do not support vararg functions.");
+ if (!STI.isTargetLinux() && !STI.isTargetDarwin() &&
+ !STI.isTargetWin32() && !STI.isTargetFreeBSD())
+ report_fatal_error("Segmented stacks not supported on this platform.");
+
+ MachineBasicBlock *allocMBB = MF.CreateMachineBasicBlock();
+ MachineBasicBlock *checkMBB = MF.CreateMachineBasicBlock();
+ X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+ bool IsNested = false;
+
+ // We need to know if the function has a nest argument only in 64 bit mode.
+ if (Is64Bit)
+ IsNested = HasNestArgument(&MF);
+
+ // The MOV R10, RAX needs to be in a different block, since the RET we emit in
+ // allocMBB needs to be last (terminating) instruction.
+
+ for (MachineBasicBlock::livein_iterator i = prologueMBB.livein_begin(),
+ e = prologueMBB.livein_end(); i != e; i++) {
+ allocMBB->addLiveIn(*i);
+ checkMBB->addLiveIn(*i);
+ }
+
+ if (IsNested)
+ allocMBB->addLiveIn(X86::R10);
+
+ MF.push_front(allocMBB);
+ MF.push_front(checkMBB);
+
+ // Eventually StackSize will be calculated by a link-time pass; which will
+ // also decide whether checking code needs to be injected into this particular
+ // prologue.
+ StackSize = MFI->getStackSize();
+
+ // When the frame size is less than 256 we just compare the stack
+ // boundary directly to the value of the stack pointer, per gcc.
+ bool CompareStackPointer = StackSize < kSplitStackAvailable;
+
+ // Read the limit off the current stacklet off the stack_guard location.
+ if (Is64Bit) {
+ if (STI.isTargetLinux()) {
+ TlsReg = X86::FS;
+ TlsOffset = 0x70;
+ } else if (STI.isTargetDarwin()) {
+ TlsReg = X86::GS;
+ TlsOffset = 0x60 + 90*8; // See pthread_machdep.h. Steal TLS slot 90.
+ } else if (STI.isTargetFreeBSD()) {
+ TlsReg = X86::FS;
+ TlsOffset = 0x18;
+ } else {
+ report_fatal_error("Segmented stacks not supported on this platform.");
+ }
+
+ if (CompareStackPointer)
+ ScratchReg = X86::RSP;
+ else
+ BuildMI(checkMBB, DL, TII.get(X86::LEA64r), ScratchReg).addReg(X86::RSP)
+ .addImm(1).addReg(0).addImm(-StackSize).addReg(0);
+
+ BuildMI(checkMBB, DL, TII.get(X86::CMP64rm)).addReg(ScratchReg)
+ .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg);
+ } else {
+ if (STI.isTargetLinux()) {
+ TlsReg = X86::GS;
+ TlsOffset = 0x30;
+ } else if (STI.isTargetDarwin()) {
+ TlsReg = X86::GS;
+ TlsOffset = 0x48 + 90*4;
+ } else if (STI.isTargetWin32()) {
+ TlsReg = X86::FS;
+ TlsOffset = 0x14; // pvArbitrary, reserved for application use
+ } else if (STI.isTargetFreeBSD()) {
+ report_fatal_error("Segmented stacks not supported on FreeBSD i386.");
+ } else {
+ report_fatal_error("Segmented stacks not supported on this platform.");
+ }
+
+ if (CompareStackPointer)
+ ScratchReg = X86::ESP;
+ else
+ BuildMI(checkMBB, DL, TII.get(X86::LEA32r), ScratchReg).addReg(X86::ESP)
+ .addImm(1).addReg(0).addImm(-StackSize).addReg(0);
+
+ if (STI.isTargetLinux() || STI.isTargetWin32()) {
+ BuildMI(checkMBB, DL, TII.get(X86::CMP32rm)).addReg(ScratchReg)
+ .addReg(0).addImm(0).addReg(0).addImm(TlsOffset).addReg(TlsReg);
+ } else if (STI.isTargetDarwin()) {
+
+ // TlsOffset doesn't fit into a mod r/m byte so we need an extra register
+ unsigned ScratchReg2;
+ bool SaveScratch2;
+ if (CompareStackPointer) {
+ // The primary scratch register is available for holding the TLS offset
+ ScratchReg2 = GetScratchRegister(Is64Bit, MF, true);
+ SaveScratch2 = false;
+ } else {
+ // Need to use a second register to hold the TLS offset
+ ScratchReg2 = GetScratchRegister(Is64Bit, MF, false);
+
+ // Unfortunately, with fastcc the second scratch register may hold an arg
+ SaveScratch2 = MF.getRegInfo().isLiveIn(ScratchReg2);
+ }
+
+ // If Scratch2 is live-in then it needs to be saved
+ assert((!MF.getRegInfo().isLiveIn(ScratchReg2) || SaveScratch2) &&
+ "Scratch register is live-in and not saved");
+
+ if (SaveScratch2)
+ BuildMI(checkMBB, DL, TII.get(X86::PUSH32r))
+ .addReg(ScratchReg2, RegState::Kill);
+
+ BuildMI(checkMBB, DL, TII.get(X86::MOV32ri), ScratchReg2)
+ .addImm(TlsOffset);
+ BuildMI(checkMBB, DL, TII.get(X86::CMP32rm))
+ .addReg(ScratchReg)
+ .addReg(ScratchReg2).addImm(1).addReg(0)
+ .addImm(0)
+ .addReg(TlsReg);
+
+ if (SaveScratch2)
+ BuildMI(checkMBB, DL, TII.get(X86::POP32r), ScratchReg2);
+ }
+ }
+
+ // This jump is taken if SP >= (Stacklet Limit + Stack Space required).
+ // It jumps to normal execution of the function body.
+ BuildMI(checkMBB, DL, TII.get(X86::JA_4)).addMBB(&prologueMBB);
+
+ // On 32 bit we first push the arguments size and then the frame size. On 64
+ // bit, we pass the stack frame size in r10 and the argument size in r11.
+ if (Is64Bit) {
+ // Functions with nested arguments use R10, so it needs to be saved across
+ // the call to _morestack
+
+ if (IsNested)
+ BuildMI(allocMBB, DL, TII.get(X86::MOV64rr), X86::RAX).addReg(X86::R10);
+
+ BuildMI(allocMBB, DL, TII.get(X86::MOV64ri), X86::R10)
+ .addImm(StackSize);
+ BuildMI(allocMBB, DL, TII.get(X86::MOV64ri), X86::R11)
+ .addImm(X86FI->getArgumentStackSize());
+ MF.getRegInfo().setPhysRegUsed(X86::R10);
+ MF.getRegInfo().setPhysRegUsed(X86::R11);
+ } else {
+ BuildMI(allocMBB, DL, TII.get(X86::PUSHi32))
+ .addImm(X86FI->getArgumentStackSize());
+ BuildMI(allocMBB, DL, TII.get(X86::PUSHi32))
+ .addImm(StackSize);
+ }
+
+ // __morestack is in libgcc
+ if (Is64Bit)
+ BuildMI(allocMBB, DL, TII.get(X86::CALL64pcrel32))
+ .addExternalSymbol("__morestack");
+ else
+ BuildMI(allocMBB, DL, TII.get(X86::CALLpcrel32))
+ .addExternalSymbol("__morestack");
+
+ if (IsNested)
+ BuildMI(allocMBB, DL, TII.get(X86::MORESTACK_RET_RESTORE_R10));
+ else
+ BuildMI(allocMBB, DL, TII.get(X86::MORESTACK_RET));
+
+ allocMBB->addSuccessor(&prologueMBB);
+
+ checkMBB->addSuccessor(allocMBB);
+ checkMBB->addSuccessor(&prologueMBB);
+
+#ifdef XDEBUG
+ MF.verify();
+#endif
+}
+
+/// Erlang programs may need a special prologue to handle the stack size they
+/// might need at runtime. That is because Erlang/OTP does not implement a C
+/// stack but uses a custom implementation of hybrid stack/heap architecture.
+/// (for more information see Eric Stenman's Ph.D. thesis:
+/// http://publications.uu.se/uu/fulltext/nbn_se_uu_diva-2688.pdf)
+///
+/// CheckStack:
+/// temp0 = sp - MaxStack
+/// if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart
+/// OldStart:
+/// ...
+/// IncStack:
+/// call inc_stack # doubles the stack space
+/// temp0 = sp - MaxStack
+/// if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart
+void X86FrameLowering::adjustForHiPEPrologue(MachineFunction &MF) const {
+ const X86InstrInfo &TII = *TM.getInstrInfo();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ const unsigned SlotSize = TM.getRegisterInfo()->getSlotSize();
+ const bool Is64Bit = STI.is64Bit();
+ DebugLoc DL;
+ // HiPE-specific values
+ const unsigned HipeLeafWords = 24;
+ const unsigned CCRegisteredArgs = Is64Bit ? 6 : 5;
+ const unsigned Guaranteed = HipeLeafWords * SlotSize;
+ unsigned CallerStkArity = MF.getFunction()->arg_size() > CCRegisteredArgs ?
+ MF.getFunction()->arg_size() - CCRegisteredArgs : 0;
+ unsigned MaxStack = MFI->getStackSize() + CallerStkArity*SlotSize + SlotSize;
+
+ assert(STI.isTargetLinux() &&
+ "HiPE prologue is only supported on Linux operating systems.");
+
+ // Compute the largest caller's frame that is needed to fit the callees'
+ // frames. This 'MaxStack' is computed from:
+ //
+ // a) the fixed frame size, which is the space needed for all spilled temps,
+ // b) outgoing on-stack parameter areas, and
+ // c) the minimum stack space this function needs to make available for the
+ // functions it calls (a tunable ABI property).
+ if (MFI->hasCalls()) {
+ unsigned MoreStackForCalls = 0;
+
+ for (MachineFunction::iterator MBBI = MF.begin(), MBBE = MF.end();
+ MBBI != MBBE; ++MBBI)
+ for (MachineBasicBlock::iterator MI = MBBI->begin(), ME = MBBI->end();
+ MI != ME; ++MI) {
+ if (!MI->isCall())
+ continue;
+
+ // Get callee operand.
+ const MachineOperand &MO = MI->getOperand(0);
+
+ // Only take account of global function calls (no closures etc.).
+ if (!MO.isGlobal())
+ continue;
+
+ const Function *F = dyn_cast<Function>(MO.getGlobal());
+ if (!F)
+ continue;
+
+ // Do not update 'MaxStack' for primitive and built-in functions
+ // (encoded with names either starting with "erlang."/"bif_" or not
+ // having a ".", such as a simple <Module>.<Function>.<Arity>, or an
+ // "_", such as the BIF "suspend_0") as they are executed on another
+ // stack.
+ if (F->getName().find("erlang.") != StringRef::npos ||
+ F->getName().find("bif_") != StringRef::npos ||
+ F->getName().find_first_of("._") == StringRef::npos)
+ continue;
+
+ unsigned CalleeStkArity =
+ F->arg_size() > CCRegisteredArgs ? F->arg_size()-CCRegisteredArgs : 0;
+ if (HipeLeafWords - 1 > CalleeStkArity)
+ MoreStackForCalls = std::max(MoreStackForCalls,
+ (HipeLeafWords - 1 - CalleeStkArity) * SlotSize);
+ }
+ MaxStack += MoreStackForCalls;
+ }
+
+ // If the stack frame needed is larger than the guaranteed then runtime checks
+ // and calls to "inc_stack_0" BIF should be inserted in the assembly prologue.
+ if (MaxStack > Guaranteed) {
+ MachineBasicBlock &prologueMBB = MF.front();
+ MachineBasicBlock *stackCheckMBB = MF.CreateMachineBasicBlock();
+ MachineBasicBlock *incStackMBB = MF.CreateMachineBasicBlock();
+
+ for (MachineBasicBlock::livein_iterator I = prologueMBB.livein_begin(),
+ E = prologueMBB.livein_end(); I != E; I++) {
+ stackCheckMBB->addLiveIn(*I);
+ incStackMBB->addLiveIn(*I);
+ }
+
+ MF.push_front(incStackMBB);
+ MF.push_front(stackCheckMBB);
+
+ unsigned ScratchReg, SPReg, PReg, SPLimitOffset;
+ unsigned LEAop, CMPop, CALLop;
+ if (Is64Bit) {
+ SPReg = X86::RSP;
+ PReg = X86::RBP;
+ LEAop = X86::LEA64r;
+ CMPop = X86::CMP64rm;
+ CALLop = X86::CALL64pcrel32;
+ SPLimitOffset = 0x90;
+ } else {
+ SPReg = X86::ESP;
+ PReg = X86::EBP;
+ LEAop = X86::LEA32r;
+ CMPop = X86::CMP32rm;
+ CALLop = X86::CALLpcrel32;
+ SPLimitOffset = 0x4c;
+ }
+
+ ScratchReg = GetScratchRegister(Is64Bit, MF, true);
+ assert(!MF.getRegInfo().isLiveIn(ScratchReg) &&
+ "HiPE prologue scratch register is live-in");
+
+ // Create new MBB for StackCheck:
+ addRegOffset(BuildMI(stackCheckMBB, DL, TII.get(LEAop), ScratchReg),
+ SPReg, false, -MaxStack);
+ // SPLimitOffset is in a fixed heap location (pointed by BP).
+ addRegOffset(BuildMI(stackCheckMBB, DL, TII.get(CMPop))
+ .addReg(ScratchReg), PReg, false, SPLimitOffset);
+ BuildMI(stackCheckMBB, DL, TII.get(X86::JAE_4)).addMBB(&prologueMBB);
+
+ // Create new MBB for IncStack:
+ BuildMI(incStackMBB, DL, TII.get(CALLop)).
+ addExternalSymbol("inc_stack_0");
+ addRegOffset(BuildMI(incStackMBB, DL, TII.get(LEAop), ScratchReg),
+ SPReg, false, -MaxStack);
+ addRegOffset(BuildMI(incStackMBB, DL, TII.get(CMPop))
+ .addReg(ScratchReg), PReg, false, SPLimitOffset);
+ BuildMI(incStackMBB, DL, TII.get(X86::JLE_4)).addMBB(incStackMBB);
+
+ stackCheckMBB->addSuccessor(&prologueMBB, 99);
+ stackCheckMBB->addSuccessor(incStackMBB, 1);
+ incStackMBB->addSuccessor(&prologueMBB, 99);
+ incStackMBB->addSuccessor(incStackMBB, 1);
+ }
+#ifdef XDEBUG
+ MF.verify();
+#endif
+}
+
+void X86FrameLowering::
+eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const {
+ const X86InstrInfo &TII = *TM.getInstrInfo();
+ const X86RegisterInfo &RegInfo = *TM.getRegisterInfo();
+ unsigned StackPtr = RegInfo.getStackRegister();
+ bool reseveCallFrame = hasReservedCallFrame(MF);
+ int Opcode = I->getOpcode();
+ bool isDestroy = Opcode == TII.getCallFrameDestroyOpcode();
+ bool IsLP64 = STI.isTarget64BitLP64();
+ DebugLoc DL = I->getDebugLoc();
+ uint64_t Amount = !reseveCallFrame ? I->getOperand(0).getImm() : 0;
+ uint64_t CalleeAmt = isDestroy ? I->getOperand(1).getImm() : 0;
+ I = MBB.erase(I);
+
+ if (!reseveCallFrame) {
+ // If the stack pointer can be changed after prologue, turn the
+ // adjcallstackup instruction into a 'sub ESP, <amt>' and the
+ // adjcallstackdown instruction into 'add ESP, <amt>'
+ // TODO: consider using push / pop instead of sub + store / add
+ if (Amount == 0)
+ return;
+
+ // We need to keep the stack aligned properly. To do this, we round the
+ // amount of space needed for the outgoing arguments up to the next
+ // alignment boundary.
+ unsigned StackAlign = TM.getFrameLowering()->getStackAlignment();
+ Amount = (Amount + StackAlign - 1) / StackAlign * StackAlign;
+
+ MachineInstr *New = 0;
+ if (Opcode == TII.getCallFrameSetupOpcode()) {
+ New = BuildMI(MF, DL, TII.get(getSUBriOpcode(IsLP64, Amount)),
+ StackPtr)
+ .addReg(StackPtr)
+ .addImm(Amount);
+ } else {
+ assert(Opcode == TII.getCallFrameDestroyOpcode());
+
+ // Factor out the amount the callee already popped.
+ Amount -= CalleeAmt;
+
+ if (Amount) {
+ unsigned Opc = getADDriOpcode(IsLP64, Amount);
+ New = BuildMI(MF, DL, TII.get(Opc), StackPtr)
+ .addReg(StackPtr).addImm(Amount);
+ }
+ }
+
+ if (New) {
+ // The EFLAGS implicit def is dead.
+ New->getOperand(3).setIsDead();
+
+ // Replace the pseudo instruction with a new instruction.
+ MBB.insert(I, New);
+ }
+
+ return;
+ }
+
+ if (Opcode == TII.getCallFrameDestroyOpcode() && CalleeAmt) {
+ // If we are performing frame pointer elimination and if the callee pops
+ // something off the stack pointer, add it back. We do this until we have
+ // more advanced stack pointer tracking ability.
+ unsigned Opc = getSUBriOpcode(IsLP64, CalleeAmt);
+ MachineInstr *New = BuildMI(MF, DL, TII.get(Opc), StackPtr)
+ .addReg(StackPtr).addImm(CalleeAmt);
+
+ // The EFLAGS implicit def is dead.
+ New->getOperand(3).setIsDead();
+
+ // We are not tracking the stack pointer adjustment by the callee, so make
+ // sure we restore the stack pointer immediately after the call, there may
+ // be spill code inserted between the CALL and ADJCALLSTACKUP instructions.
+ MachineBasicBlock::iterator B = MBB.begin();
+ while (I != B && !llvm::prior(I)->isCall())
+ --I;
+ MBB.insert(I, New);
+ }
+}
+
diff --git a/contrib/llvm/lib/Target/X86/X86FrameLowering.h b/contrib/llvm/lib/Target/X86/X86FrameLowering.h
new file mode 100644
index 0000000..6e309d8
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86FrameLowering.h
@@ -0,0 +1,103 @@
+//===-- X86TargetFrameLowering.h - Define frame lowering for X86 -*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class implements X86-specific bits of TargetFrameLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86_FRAMELOWERING_H
+#define X86_FRAMELOWERING_H
+
+#include "X86Subtarget.h"
+#include "llvm/MC/MCDwarf.h"
+#include "llvm/Target/TargetFrameLowering.h"
+
+namespace llvm {
+
+namespace CU {
+
+ /// Compact unwind encoding values.
+ enum CompactUnwindEncodings {
+ /// [RE]BP based frame where [RE]BP is pused on the stack immediately after
+ /// the return address, then [RE]SP is moved to [RE]BP.
+ UNWIND_MODE_BP_FRAME = 0x01000000,
+
+ /// A frameless function with a small constant stack size.
+ UNWIND_MODE_STACK_IMMD = 0x02000000,
+
+ /// A frameless function with a large constant stack size.
+ UNWIND_MODE_STACK_IND = 0x03000000,
+
+ /// No compact unwind encoding is available.
+ UNWIND_MODE_DWARF = 0x04000000,
+
+ /// Mask for encoding the frame registers.
+ UNWIND_BP_FRAME_REGISTERS = 0x00007FFF,
+
+ /// Mask for encoding the frameless registers.
+ UNWIND_FRAMELESS_STACK_REG_PERMUTATION = 0x000003FF
+ };
+
+} // end CU namespace
+
+class MCSymbol;
+class X86TargetMachine;
+
+class X86FrameLowering : public TargetFrameLowering {
+ const X86TargetMachine &TM;
+ const X86Subtarget &STI;
+public:
+ explicit X86FrameLowering(const X86TargetMachine &tm, const X86Subtarget &sti)
+ : TargetFrameLowering(StackGrowsDown,
+ sti.getStackAlignment(),
+ (sti.is64Bit() ? -8 : -4)),
+ TM(tm), STI(sti) {
+ }
+
+ void emitCalleeSavedFrameMoves(MachineFunction &MF, MCSymbol *Label,
+ unsigned FramePtr) const;
+
+ /// emitProlog/emitEpilog - These methods insert prolog and epilog code into
+ /// the function.
+ void emitPrologue(MachineFunction &MF) const;
+ void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+
+ void adjustForSegmentedStacks(MachineFunction &MF) const;
+
+ void adjustForHiPEPrologue(MachineFunction &MF) const;
+
+ void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+ RegScavenger *RS = NULL) const;
+
+ bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const;
+
+ bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const;
+
+ bool hasFP(const MachineFunction &MF) const;
+ bool hasReservedCallFrame(const MachineFunction &MF) const;
+
+ int getFrameIndexOffset(const MachineFunction &MF, int FI) const;
+ int getFrameIndexReference(const MachineFunction &MF, int FI,
+ unsigned &FrameReg) const;
+ uint32_t getCompactUnwindEncoding(MachineFunction &MF) const;
+
+ void eliminateCallFramePseudoInstr(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI) const;
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
new file mode 100644
index 0000000..968b358
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -0,0 +1,2720 @@
+//===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a DAG pattern matching instruction selector for X86,
+// converting from a legalized dag to a X86 dag.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "x86-isel"
+#include "X86.h"
+#include "X86InstrBuilder.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86RegisterInfo.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor");
+
+//===----------------------------------------------------------------------===//
+// Pattern Matcher Implementation
+//===----------------------------------------------------------------------===//
+
+namespace {
+ /// X86ISelAddressMode - This corresponds to X86AddressMode, but uses
+ /// SDValue's instead of register numbers for the leaves of the matched
+ /// tree.
+ struct X86ISelAddressMode {
+ enum {
+ RegBase,
+ FrameIndexBase
+ } BaseType;
+
+ // This is really a union, discriminated by BaseType!
+ SDValue Base_Reg;
+ int Base_FrameIndex;
+
+ unsigned Scale;
+ SDValue IndexReg;
+ int32_t Disp;
+ SDValue Segment;
+ const GlobalValue *GV;
+ const Constant *CP;
+ const BlockAddress *BlockAddr;
+ const char *ES;
+ int JT;
+ unsigned Align; // CP alignment.
+ unsigned char SymbolFlags; // X86II::MO_*
+
+ X86ISelAddressMode()
+ : BaseType(RegBase), Base_FrameIndex(0), Scale(1), IndexReg(), Disp(0),
+ Segment(), GV(0), CP(0), BlockAddr(0), ES(0), JT(-1), Align(0),
+ SymbolFlags(X86II::MO_NO_FLAG) {
+ }
+
+ bool hasSymbolicDisplacement() const {
+ return GV != 0 || CP != 0 || ES != 0 || JT != -1 || BlockAddr != 0;
+ }
+
+ bool hasBaseOrIndexReg() const {
+ return IndexReg.getNode() != 0 || Base_Reg.getNode() != 0;
+ }
+
+ /// isRIPRelative - Return true if this addressing mode is already RIP
+ /// relative.
+ bool isRIPRelative() const {
+ if (BaseType != RegBase) return false;
+ if (RegisterSDNode *RegNode =
+ dyn_cast_or_null<RegisterSDNode>(Base_Reg.getNode()))
+ return RegNode->getReg() == X86::RIP;
+ return false;
+ }
+
+ void setBaseReg(SDValue Reg) {
+ BaseType = RegBase;
+ Base_Reg = Reg;
+ }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ void dump() {
+ dbgs() << "X86ISelAddressMode " << this << '\n';
+ dbgs() << "Base_Reg ";
+ if (Base_Reg.getNode() != 0)
+ Base_Reg.getNode()->dump();
+ else
+ dbgs() << "nul";
+ dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n'
+ << " Scale" << Scale << '\n'
+ << "IndexReg ";
+ if (IndexReg.getNode() != 0)
+ IndexReg.getNode()->dump();
+ else
+ dbgs() << "nul";
+ dbgs() << " Disp " << Disp << '\n'
+ << "GV ";
+ if (GV)
+ GV->dump();
+ else
+ dbgs() << "nul";
+ dbgs() << " CP ";
+ if (CP)
+ CP->dump();
+ else
+ dbgs() << "nul";
+ dbgs() << '\n'
+ << "ES ";
+ if (ES)
+ dbgs() << ES;
+ else
+ dbgs() << "nul";
+ dbgs() << " JT" << JT << " Align" << Align << '\n';
+ }
+#endif
+ };
+}
+
+namespace {
+ //===--------------------------------------------------------------------===//
+ /// ISel - X86 specific code to select X86 machine instructions for
+ /// SelectionDAG operations.
+ ///
+ class X86DAGToDAGISel : public SelectionDAGISel {
+ /// X86Lowering - This object fully describes how to lower LLVM code to an
+ /// X86-specific SelectionDAG.
+ const X86TargetLowering &X86Lowering;
+
+ /// Subtarget - Keep a pointer to the X86Subtarget around so that we can
+ /// make the right decision when generating code for different targets.
+ const X86Subtarget *Subtarget;
+
+ /// OptForSize - If true, selector should try to optimize for code size
+ /// instead of performance.
+ bool OptForSize;
+
+ public:
+ explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOpt::Level OptLevel)
+ : SelectionDAGISel(tm, OptLevel),
+ X86Lowering(*tm.getTargetLowering()),
+ Subtarget(&tm.getSubtarget<X86Subtarget>()),
+ OptForSize(false) {}
+
+ virtual const char *getPassName() const {
+ return "X86 DAG->DAG Instruction Selection";
+ }
+
+ virtual void EmitFunctionEntryCode();
+
+ virtual bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const;
+
+ virtual void PreprocessISelDAG();
+
+ inline bool immSext8(SDNode *N) const {
+ return isInt<8>(cast<ConstantSDNode>(N)->getSExtValue());
+ }
+
+ // i64immSExt32 predicate - True if the 64-bit immediate fits in a 32-bit
+ // sign extended field.
+ inline bool i64immSExt32(SDNode *N) const {
+ uint64_t v = cast<ConstantSDNode>(N)->getZExtValue();
+ return (int64_t)v == (int32_t)v;
+ }
+
+// Include the pieces autogenerated from the target description.
+#include "X86GenDAGISel.inc"
+
+ private:
+ SDNode *Select(SDNode *N);
+ SDNode *SelectGather(SDNode *N, unsigned Opc);
+ SDNode *SelectAtomic64(SDNode *Node, unsigned Opc);
+ SDNode *SelectAtomicLoadArith(SDNode *Node, EVT NVT);
+
+ bool FoldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM);
+ bool MatchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM);
+ bool MatchWrapper(SDValue N, X86ISelAddressMode &AM);
+ bool MatchAddress(SDValue N, X86ISelAddressMode &AM);
+ bool MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
+ unsigned Depth);
+ bool MatchAddressBase(SDValue N, X86ISelAddressMode &AM);
+ bool SelectAddr(SDNode *Parent, SDValue N, SDValue &Base,
+ SDValue &Scale, SDValue &Index, SDValue &Disp,
+ SDValue &Segment);
+ bool SelectLEAAddr(SDValue N, SDValue &Base,
+ SDValue &Scale, SDValue &Index, SDValue &Disp,
+ SDValue &Segment);
+ bool SelectTLSADDRAddr(SDValue N, SDValue &Base,
+ SDValue &Scale, SDValue &Index, SDValue &Disp,
+ SDValue &Segment);
+ bool SelectScalarSSELoad(SDNode *Root, SDValue N,
+ SDValue &Base, SDValue &Scale,
+ SDValue &Index, SDValue &Disp,
+ SDValue &Segment,
+ SDValue &NodeWithChain);
+
+ bool TryFoldLoad(SDNode *P, SDValue N,
+ SDValue &Base, SDValue &Scale,
+ SDValue &Index, SDValue &Disp,
+ SDValue &Segment);
+
+ /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
+ /// inline asm expressions.
+ virtual bool SelectInlineAsmMemoryOperand(const SDValue &Op,
+ char ConstraintCode,
+ std::vector<SDValue> &OutOps);
+
+ void EmitSpecialCodeForMain(MachineBasicBlock *BB, MachineFrameInfo *MFI);
+
+ inline void getAddressOperands(X86ISelAddressMode &AM, SDValue &Base,
+ SDValue &Scale, SDValue &Index,
+ SDValue &Disp, SDValue &Segment) {
+ Base = (AM.BaseType == X86ISelAddressMode::FrameIndexBase) ?
+ CurDAG->getTargetFrameIndex(AM.Base_FrameIndex, TLI.getPointerTy()) :
+ AM.Base_Reg;
+ Scale = getI8Imm(AM.Scale);
+ Index = AM.IndexReg;
+ // These are 32-bit even in 64-bit mode since RIP relative offset
+ // is 32-bit.
+ if (AM.GV)
+ Disp = CurDAG->getTargetGlobalAddress(AM.GV, DebugLoc(),
+ MVT::i32, AM.Disp,
+ AM.SymbolFlags);
+ else if (AM.CP)
+ Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32,
+ AM.Align, AM.Disp, AM.SymbolFlags);
+ else if (AM.ES) {
+ assert(!AM.Disp && "Non-zero displacement is ignored with ES.");
+ Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i32, AM.SymbolFlags);
+ } else if (AM.JT != -1) {
+ assert(!AM.Disp && "Non-zero displacement is ignored with JT.");
+ Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i32, AM.SymbolFlags);
+ } else if (AM.BlockAddr)
+ Disp = CurDAG->getTargetBlockAddress(AM.BlockAddr, MVT::i32, AM.Disp,
+ AM.SymbolFlags);
+ else
+ Disp = CurDAG->getTargetConstant(AM.Disp, MVT::i32);
+
+ if (AM.Segment.getNode())
+ Segment = AM.Segment;
+ else
+ Segment = CurDAG->getRegister(0, MVT::i32);
+ }
+
+ /// getI8Imm - Return a target constant with the specified value, of type
+ /// i8.
+ inline SDValue getI8Imm(unsigned Imm) {
+ return CurDAG->getTargetConstant(Imm, MVT::i8);
+ }
+
+ /// getI32Imm - Return a target constant with the specified value, of type
+ /// i32.
+ inline SDValue getI32Imm(unsigned Imm) {
+ return CurDAG->getTargetConstant(Imm, MVT::i32);
+ }
+
+ /// getGlobalBaseReg - Return an SDNode that returns the value of
+ /// the global base register. Output instructions required to
+ /// initialize the global base register, if necessary.
+ ///
+ SDNode *getGlobalBaseReg();
+
+ /// getTargetMachine - Return a reference to the TargetMachine, casted
+ /// to the target-specific type.
+ const X86TargetMachine &getTargetMachine() const {
+ return static_cast<const X86TargetMachine &>(TM);
+ }
+
+ /// getInstrInfo - Return a reference to the TargetInstrInfo, casted
+ /// to the target-specific type.
+ const X86InstrInfo *getInstrInfo() const {
+ return getTargetMachine().getInstrInfo();
+ }
+ };
+}
+
+
+bool
+X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const {
+ if (OptLevel == CodeGenOpt::None) return false;
+
+ if (!N.hasOneUse())
+ return false;
+
+ if (N.getOpcode() != ISD::LOAD)
+ return true;
+
+ // If N is a load, do additional profitability checks.
+ if (U == Root) {
+ switch (U->getOpcode()) {
+ default: break;
+ case X86ISD::ADD:
+ case X86ISD::SUB:
+ case X86ISD::AND:
+ case X86ISD::XOR:
+ case X86ISD::OR:
+ case ISD::ADD:
+ case ISD::ADDC:
+ case ISD::ADDE:
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR: {
+ SDValue Op1 = U->getOperand(1);
+
+ // If the other operand is a 8-bit immediate we should fold the immediate
+ // instead. This reduces code size.
+ // e.g.
+ // movl 4(%esp), %eax
+ // addl $4, %eax
+ // vs.
+ // movl $4, %eax
+ // addl 4(%esp), %eax
+ // The former is 2 bytes shorter. In case where the increment is 1, then
+ // the saving can be 4 bytes (by using incl %eax).
+ if (ConstantSDNode *Imm = dyn_cast<ConstantSDNode>(Op1))
+ if (Imm->getAPIntValue().isSignedIntN(8))
+ return false;
+
+ // If the other operand is a TLS address, we should fold it instead.
+ // This produces
+ // movl %gs:0, %eax
+ // leal i@NTPOFF(%eax), %eax
+ // instead of
+ // movl $i@NTPOFF, %eax
+ // addl %gs:0, %eax
+ // if the block also has an access to a second TLS address this will save
+ // a load.
+ // FIXME: This is probably also true for non TLS addresses.
+ if (Op1.getOpcode() == X86ISD::Wrapper) {
+ SDValue Val = Op1.getOperand(0);
+ if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
+ return false;
+ }
+ }
+ }
+ }
+
+ return true;
+}
+
+/// MoveBelowCallOrigChain - Replace the original chain operand of the call with
+/// load's chain operand and move load below the call's chain operand.
+static void MoveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load,
+ SDValue Call, SDValue OrigChain) {
+ SmallVector<SDValue, 8> Ops;
+ SDValue Chain = OrigChain.getOperand(0);
+ if (Chain.getNode() == Load.getNode())
+ Ops.push_back(Load.getOperand(0));
+ else {
+ assert(Chain.getOpcode() == ISD::TokenFactor &&
+ "Unexpected chain operand");
+ for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i)
+ if (Chain.getOperand(i).getNode() == Load.getNode())
+ Ops.push_back(Load.getOperand(0));
+ else
+ Ops.push_back(Chain.getOperand(i));
+ SDValue NewChain =
+ CurDAG->getNode(ISD::TokenFactor, Load.getDebugLoc(),
+ MVT::Other, &Ops[0], Ops.size());
+ Ops.clear();
+ Ops.push_back(NewChain);
+ }
+ for (unsigned i = 1, e = OrigChain.getNumOperands(); i != e; ++i)
+ Ops.push_back(OrigChain.getOperand(i));
+ CurDAG->UpdateNodeOperands(OrigChain.getNode(), &Ops[0], Ops.size());
+ CurDAG->UpdateNodeOperands(Load.getNode(), Call.getOperand(0),
+ Load.getOperand(1), Load.getOperand(2));
+
+ unsigned NumOps = Call.getNode()->getNumOperands();
+ Ops.clear();
+ Ops.push_back(SDValue(Load.getNode(), 1));
+ for (unsigned i = 1, e = NumOps; i != e; ++i)
+ Ops.push_back(Call.getOperand(i));
+ CurDAG->UpdateNodeOperands(Call.getNode(), &Ops[0], NumOps);
+}
+
+/// isCalleeLoad - Return true if call address is a load and it can be
+/// moved below CALLSEQ_START and the chains leading up to the call.
+/// Return the CALLSEQ_START by reference as a second output.
+/// In the case of a tail call, there isn't a callseq node between the call
+/// chain and the load.
+static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
+ // The transformation is somewhat dangerous if the call's chain was glued to
+ // the call. After MoveBelowOrigChain the load is moved between the call and
+ // the chain, this can create a cycle if the load is not folded. So it is
+ // *really* important that we are sure the load will be folded.
+ if (Callee.getNode() == Chain.getNode() || !Callee.hasOneUse())
+ return false;
+ LoadSDNode *LD = dyn_cast<LoadSDNode>(Callee.getNode());
+ if (!LD ||
+ LD->isVolatile() ||
+ LD->getAddressingMode() != ISD::UNINDEXED ||
+ LD->getExtensionType() != ISD::NON_EXTLOAD)
+ return false;
+
+ // Now let's find the callseq_start.
+ while (HasCallSeq && Chain.getOpcode() != ISD::CALLSEQ_START) {
+ if (!Chain.hasOneUse())
+ return false;
+ Chain = Chain.getOperand(0);
+ }
+
+ if (!Chain.getNumOperands())
+ return false;
+ // Since we are not checking for AA here, conservatively abort if the chain
+ // writes to memory. It's not safe to move the callee (a load) across a store.
+ if (isa<MemSDNode>(Chain.getNode()) &&
+ cast<MemSDNode>(Chain.getNode())->writeMem())
+ return false;
+ if (Chain.getOperand(0).getNode() == Callee.getNode())
+ return true;
+ if (Chain.getOperand(0).getOpcode() == ISD::TokenFactor &&
+ Callee.getValue(1).isOperandOf(Chain.getOperand(0).getNode()) &&
+ Callee.getValue(1).hasOneUse())
+ return true;
+ return false;
+}
+
+void X86DAGToDAGISel::PreprocessISelDAG() {
+ // OptForSize is used in pattern predicates that isel is matching.
+ OptForSize = MF->getFunction()->getAttributes().
+ hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
+
+ for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
+ E = CurDAG->allnodes_end(); I != E; ) {
+ SDNode *N = I++; // Preincrement iterator to avoid invalidation issues.
+
+ if (OptLevel != CodeGenOpt::None &&
+ // Only does this when target favors doesn't favor register indirect
+ // call.
+ ((N->getOpcode() == X86ISD::CALL && !Subtarget->callRegIndirect()) ||
+ (N->getOpcode() == X86ISD::TC_RETURN &&
+ // Only does this if load can be folded into TC_RETURN.
+ (Subtarget->is64Bit() ||
+ getTargetMachine().getRelocationModel() != Reloc::PIC_)))) {
+ /// Also try moving call address load from outside callseq_start to just
+ /// before the call to allow it to be folded.
+ ///
+ /// [Load chain]
+ /// ^
+ /// |
+ /// [Load]
+ /// ^ ^
+ /// | |
+ /// / \--
+ /// / |
+ ///[CALLSEQ_START] |
+ /// ^ |
+ /// | |
+ /// [LOAD/C2Reg] |
+ /// | |
+ /// \ /
+ /// \ /
+ /// [CALL]
+ bool HasCallSeq = N->getOpcode() == X86ISD::CALL;
+ SDValue Chain = N->getOperand(0);
+ SDValue Load = N->getOperand(1);
+ if (!isCalleeLoad(Load, Chain, HasCallSeq))
+ continue;
+ MoveBelowOrigChain(CurDAG, Load, SDValue(N, 0), Chain);
+ ++NumLoadMoved;
+ continue;
+ }
+
+ // Lower fpround and fpextend nodes that target the FP stack to be store and
+ // load to the stack. This is a gross hack. We would like to simply mark
+ // these as being illegal, but when we do that, legalize produces these when
+ // it expands calls, then expands these in the same legalize pass. We would
+ // like dag combine to be able to hack on these between the call expansion
+ // and the node legalization. As such this pass basically does "really
+ // late" legalization of these inline with the X86 isel pass.
+ // FIXME: This should only happen when not compiled with -O0.
+ if (N->getOpcode() != ISD::FP_ROUND && N->getOpcode() != ISD::FP_EXTEND)
+ continue;
+
+ EVT SrcVT = N->getOperand(0).getValueType();
+ EVT DstVT = N->getValueType(0);
+
+ // If any of the sources are vectors, no fp stack involved.
+ if (SrcVT.isVector() || DstVT.isVector())
+ continue;
+
+ // If the source and destination are SSE registers, then this is a legal
+ // conversion that should not be lowered.
+ bool SrcIsSSE = X86Lowering.isScalarFPTypeInSSEReg(SrcVT);
+ bool DstIsSSE = X86Lowering.isScalarFPTypeInSSEReg(DstVT);
+ if (SrcIsSSE && DstIsSSE)
+ continue;
+
+ if (!SrcIsSSE && !DstIsSSE) {
+ // If this is an FPStack extension, it is a noop.
+ if (N->getOpcode() == ISD::FP_EXTEND)
+ continue;
+ // If this is a value-preserving FPStack truncation, it is a noop.
+ if (N->getConstantOperandVal(1))
+ continue;
+ }
+
+ // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
+ // FPStack has extload and truncstore. SSE can fold direct loads into other
+ // operations. Based on this, decide what we want to do.
+ EVT MemVT;
+ if (N->getOpcode() == ISD::FP_ROUND)
+ MemVT = DstVT; // FP_ROUND must use DstVT, we can't do a 'trunc load'.
+ else
+ MemVT = SrcIsSSE ? SrcVT : DstVT;
+
+ SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
+ DebugLoc dl = N->getDebugLoc();
+
+ // FIXME: optimize the case where the src/dest is a load or store?
+ SDValue Store = CurDAG->getTruncStore(CurDAG->getEntryNode(), dl,
+ N->getOperand(0),
+ MemTmp, MachinePointerInfo(), MemVT,
+ false, false, 0);
+ SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store, MemTmp,
+ MachinePointerInfo(),
+ MemVT, false, false, 0);
+
+ // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
+ // extload we created. This will cause general havok on the dag because
+ // anything below the conversion could be folded into other existing nodes.
+ // To avoid invalidating 'I', back it up to the convert node.
+ --I;
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
+
+ // Now that we did that, the node is dead. Increment the iterator to the
+ // next node to process, then delete N.
+ ++I;
+ CurDAG->DeleteNode(N);
+ }
+}
+
+
+/// EmitSpecialCodeForMain - Emit any code that needs to be executed only in
+/// the main function.
+void X86DAGToDAGISel::EmitSpecialCodeForMain(MachineBasicBlock *BB,
+ MachineFrameInfo *MFI) {
+ const TargetInstrInfo *TII = TM.getInstrInfo();
+ if (Subtarget->isTargetCygMing()) {
+ unsigned CallOp =
+ Subtarget->is64Bit() ? X86::CALL64pcrel32 : X86::CALLpcrel32;
+ BuildMI(BB, DebugLoc(),
+ TII->get(CallOp)).addExternalSymbol("__main");
+ }
+}
+
+void X86DAGToDAGISel::EmitFunctionEntryCode() {
+ // If this is main, emit special code for main.
+ if (const Function *Fn = MF->getFunction())
+ if (Fn->hasExternalLinkage() && Fn->getName() == "main")
+ EmitSpecialCodeForMain(MF->begin(), MF->getFrameInfo());
+}
+
+static bool isDispSafeForFrameIndex(int64_t Val) {
+ // On 64-bit platforms, we can run into an issue where a frame index
+ // includes a displacement that, when added to the explicit displacement,
+ // will overflow the displacement field. Assuming that the frame index
+ // displacement fits into a 31-bit integer (which is only slightly more
+ // aggressive than the current fundamental assumption that it fits into
+ // a 32-bit integer), a 31-bit disp should always be safe.
+ return isInt<31>(Val);
+}
+
+bool X86DAGToDAGISel::FoldOffsetIntoAddress(uint64_t Offset,
+ X86ISelAddressMode &AM) {
+ int64_t Val = AM.Disp + Offset;
+ CodeModel::Model M = TM.getCodeModel();
+ if (Subtarget->is64Bit()) {
+ if (!X86::isOffsetSuitableForCodeModel(Val, M,
+ AM.hasSymbolicDisplacement()))
+ return true;
+ // In addition to the checks required for a register base, check that
+ // we do not try to use an unsafe Disp with a frame index.
+ if (AM.BaseType == X86ISelAddressMode::FrameIndexBase &&
+ !isDispSafeForFrameIndex(Val))
+ return true;
+ }
+ AM.Disp = Val;
+ return false;
+
+}
+
+bool X86DAGToDAGISel::MatchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM){
+ SDValue Address = N->getOperand(1);
+
+ // load gs:0 -> GS segment register.
+ // load fs:0 -> FS segment register.
+ //
+ // This optimization is valid because the GNU TLS model defines that
+ // gs:0 (or fs:0 on X86-64) contains its own address.
+ // For more information see http://people.redhat.com/drepper/tls.pdf
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Address))
+ if (C->getSExtValue() == 0 && AM.Segment.getNode() == 0 &&
+ Subtarget->isTargetLinux())
+ switch (N->getPointerInfo().getAddrSpace()) {
+ case 256:
+ AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
+ return false;
+ case 257:
+ AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
+ return false;
+ }
+
+ return true;
+}
+
+/// MatchWrapper - Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes
+/// into an addressing mode. These wrap things that will resolve down into a
+/// symbol reference. If no match is possible, this returns true, otherwise it
+/// returns false.
+bool X86DAGToDAGISel::MatchWrapper(SDValue N, X86ISelAddressMode &AM) {
+ // If the addressing mode already has a symbol as the displacement, we can
+ // never match another symbol.
+ if (AM.hasSymbolicDisplacement())
+ return true;
+
+ SDValue N0 = N.getOperand(0);
+ CodeModel::Model M = TM.getCodeModel();
+
+ // Handle X86-64 rip-relative addresses. We check this before checking direct
+ // folding because RIP is preferable to non-RIP accesses.
+ if (Subtarget->is64Bit() && N.getOpcode() == X86ISD::WrapperRIP &&
+ // Under X86-64 non-small code model, GV (and friends) are 64-bits, so
+ // they cannot be folded into immediate fields.
+ // FIXME: This can be improved for kernel and other models?
+ (M == CodeModel::Small || M == CodeModel::Kernel)) {
+ // Base and index reg must be 0 in order to use %rip as base.
+ if (AM.hasBaseOrIndexReg())
+ return true;
+ if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) {
+ X86ISelAddressMode Backup = AM;
+ AM.GV = G->getGlobal();
+ AM.SymbolFlags = G->getTargetFlags();
+ if (FoldOffsetIntoAddress(G->getOffset(), AM)) {
+ AM = Backup;
+ return true;
+ }
+ } else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
+ X86ISelAddressMode Backup = AM;
+ AM.CP = CP->getConstVal();
+ AM.Align = CP->getAlignment();
+ AM.SymbolFlags = CP->getTargetFlags();
+ if (FoldOffsetIntoAddress(CP->getOffset(), AM)) {
+ AM = Backup;
+ return true;
+ }
+ } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(N0)) {
+ AM.ES = S->getSymbol();
+ AM.SymbolFlags = S->getTargetFlags();
+ } else if (JumpTableSDNode *J = dyn_cast<JumpTableSDNode>(N0)) {
+ AM.JT = J->getIndex();
+ AM.SymbolFlags = J->getTargetFlags();
+ } else if (BlockAddressSDNode *BA = dyn_cast<BlockAddressSDNode>(N0)) {
+ X86ISelAddressMode Backup = AM;
+ AM.BlockAddr = BA->getBlockAddress();
+ AM.SymbolFlags = BA->getTargetFlags();
+ if (FoldOffsetIntoAddress(BA->getOffset(), AM)) {
+ AM = Backup;
+ return true;
+ }
+ } else
+ llvm_unreachable("Unhandled symbol reference node.");
+
+ if (N.getOpcode() == X86ISD::WrapperRIP)
+ AM.setBaseReg(CurDAG->getRegister(X86::RIP, MVT::i64));
+ return false;
+ }
+
+ // Handle the case when globals fit in our immediate field: This is true for
+ // X86-32 always and X86-64 when in -mcmodel=small mode. In 64-bit
+ // mode, this only applies to a non-RIP-relative computation.
+ if (!Subtarget->is64Bit() ||
+ M == CodeModel::Small || M == CodeModel::Kernel) {
+ assert(N.getOpcode() != X86ISD::WrapperRIP &&
+ "RIP-relative addressing already handled");
+ if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) {
+ AM.GV = G->getGlobal();
+ AM.Disp += G->getOffset();
+ AM.SymbolFlags = G->getTargetFlags();
+ } else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
+ AM.CP = CP->getConstVal();
+ AM.Align = CP->getAlignment();
+ AM.Disp += CP->getOffset();
+ AM.SymbolFlags = CP->getTargetFlags();
+ } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(N0)) {
+ AM.ES = S->getSymbol();
+ AM.SymbolFlags = S->getTargetFlags();
+ } else if (JumpTableSDNode *J = dyn_cast<JumpTableSDNode>(N0)) {
+ AM.JT = J->getIndex();
+ AM.SymbolFlags = J->getTargetFlags();
+ } else if (BlockAddressSDNode *BA = dyn_cast<BlockAddressSDNode>(N0)) {
+ AM.BlockAddr = BA->getBlockAddress();
+ AM.Disp += BA->getOffset();
+ AM.SymbolFlags = BA->getTargetFlags();
+ } else
+ llvm_unreachable("Unhandled symbol reference node.");
+ return false;
+ }
+
+ return true;
+}
+
+/// MatchAddress - Add the specified node to the specified addressing mode,
+/// returning true if it cannot be done. This just pattern matches for the
+/// addressing mode.
+bool X86DAGToDAGISel::MatchAddress(SDValue N, X86ISelAddressMode &AM) {
+ if (MatchAddressRecursively(N, AM, 0))
+ return true;
+
+ // Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has
+ // a smaller encoding and avoids a scaled-index.
+ if (AM.Scale == 2 &&
+ AM.BaseType == X86ISelAddressMode::RegBase &&
+ AM.Base_Reg.getNode() == 0) {
+ AM.Base_Reg = AM.IndexReg;
+ AM.Scale = 1;
+ }
+
+ // Post-processing: Convert foo to foo(%rip), even in non-PIC mode,
+ // because it has a smaller encoding.
+ // TODO: Which other code models can use this?
+ if (TM.getCodeModel() == CodeModel::Small &&
+ Subtarget->is64Bit() &&
+ AM.Scale == 1 &&
+ AM.BaseType == X86ISelAddressMode::RegBase &&
+ AM.Base_Reg.getNode() == 0 &&
+ AM.IndexReg.getNode() == 0 &&
+ AM.SymbolFlags == X86II::MO_NO_FLAG &&
+ AM.hasSymbolicDisplacement())
+ AM.Base_Reg = CurDAG->getRegister(X86::RIP, MVT::i64);
+
+ return false;
+}
+
+// Insert a node into the DAG at least before the Pos node's position. This
+// will reposition the node as needed, and will assign it a node ID that is <=
+// the Pos node's ID. Note that this does *not* preserve the uniqueness of node
+// IDs! The selection DAG must no longer depend on their uniqueness when this
+// is used.
+static void InsertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) {
+ if (N.getNode()->getNodeId() == -1 ||
+ N.getNode()->getNodeId() > Pos.getNode()->getNodeId()) {
+ DAG.RepositionNode(Pos.getNode(), N.getNode());
+ N.getNode()->setNodeId(Pos.getNode()->getNodeId());
+ }
+}
+
+// Transform "(X >> (8-C1)) & C2" to "(X >> 8) & 0xff)" if safe. This
+// allows us to convert the shift and and into an h-register extract and
+// a scaled index. Returns false if the simplification is performed.
+static bool FoldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N,
+ uint64_t Mask,
+ SDValue Shift, SDValue X,
+ X86ISelAddressMode &AM) {
+ if (Shift.getOpcode() != ISD::SRL ||
+ !isa<ConstantSDNode>(Shift.getOperand(1)) ||
+ !Shift.hasOneUse())
+ return true;
+
+ int ScaleLog = 8 - Shift.getConstantOperandVal(1);
+ if (ScaleLog <= 0 || ScaleLog >= 4 ||
+ Mask != (0xffu << ScaleLog))
+ return true;
+
+ EVT VT = N.getValueType();
+ DebugLoc DL = N.getDebugLoc();
+ SDValue Eight = DAG.getConstant(8, MVT::i8);
+ SDValue NewMask = DAG.getConstant(0xff, VT);
+ SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, X, Eight);
+ SDValue And = DAG.getNode(ISD::AND, DL, VT, Srl, NewMask);
+ SDValue ShlCount = DAG.getConstant(ScaleLog, MVT::i8);
+ SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And, ShlCount);
+
+ // Insert the new nodes into the topological ordering. We must do this in
+ // a valid topological ordering as nothing is going to go back and re-sort
+ // these nodes. We continually insert before 'N' in sequence as this is
+ // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
+ // hierarchy left to express.
+ InsertDAGNode(DAG, N, Eight);
+ InsertDAGNode(DAG, N, Srl);
+ InsertDAGNode(DAG, N, NewMask);
+ InsertDAGNode(DAG, N, And);
+ InsertDAGNode(DAG, N, ShlCount);
+ InsertDAGNode(DAG, N, Shl);
+ DAG.ReplaceAllUsesWith(N, Shl);
+ AM.IndexReg = And;
+ AM.Scale = (1 << ScaleLog);
+ return false;
+}
+
+// Transforms "(X << C1) & C2" to "(X & (C2>>C1)) << C1" if safe and if this
+// allows us to fold the shift into this addressing mode. Returns false if the
+// transform succeeded.
+static bool FoldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N,
+ uint64_t Mask,
+ SDValue Shift, SDValue X,
+ X86ISelAddressMode &AM) {
+ if (Shift.getOpcode() != ISD::SHL ||
+ !isa<ConstantSDNode>(Shift.getOperand(1)))
+ return true;
+
+ // Not likely to be profitable if either the AND or SHIFT node has more
+ // than one use (unless all uses are for address computation). Besides,
+ // isel mechanism requires their node ids to be reused.
+ if (!N.hasOneUse() || !Shift.hasOneUse())
+ return true;
+
+ // Verify that the shift amount is something we can fold.
+ unsigned ShiftAmt = Shift.getConstantOperandVal(1);
+ if (ShiftAmt != 1 && ShiftAmt != 2 && ShiftAmt != 3)
+ return true;
+
+ EVT VT = N.getValueType();
+ DebugLoc DL = N.getDebugLoc();
+ SDValue NewMask = DAG.getConstant(Mask >> ShiftAmt, VT);
+ SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, NewMask);
+ SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, NewAnd, Shift.getOperand(1));
+
+ // Insert the new nodes into the topological ordering. We must do this in
+ // a valid topological ordering as nothing is going to go back and re-sort
+ // these nodes. We continually insert before 'N' in sequence as this is
+ // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
+ // hierarchy left to express.
+ InsertDAGNode(DAG, N, NewMask);
+ InsertDAGNode(DAG, N, NewAnd);
+ InsertDAGNode(DAG, N, NewShift);
+ DAG.ReplaceAllUsesWith(N, NewShift);
+
+ AM.Scale = 1 << ShiftAmt;
+ AM.IndexReg = NewAnd;
+ return false;
+}
+
+// Implement some heroics to detect shifts of masked values where the mask can
+// be replaced by extending the shift and undoing that in the addressing mode
+// scale. Patterns such as (shl (srl x, c1), c2) are canonicalized into (and
+// (srl x, SHIFT), MASK) by DAGCombines that don't know the shl can be done in
+// the addressing mode. This results in code such as:
+//
+// int f(short *y, int *lookup_table) {
+// ...
+// return *y + lookup_table[*y >> 11];
+// }
+//
+// Turning into:
+// movzwl (%rdi), %eax
+// movl %eax, %ecx
+// shrl $11, %ecx
+// addl (%rsi,%rcx,4), %eax
+//
+// Instead of:
+// movzwl (%rdi), %eax
+// movl %eax, %ecx
+// shrl $9, %ecx
+// andl $124, %rcx
+// addl (%rsi,%rcx), %eax
+//
+// Note that this function assumes the mask is provided as a mask *after* the
+// value is shifted. The input chain may or may not match that, but computing
+// such a mask is trivial.
+static bool FoldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
+ uint64_t Mask,
+ SDValue Shift, SDValue X,
+ X86ISelAddressMode &AM) {
+ if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse() ||
+ !isa<ConstantSDNode>(Shift.getOperand(1)))
+ return true;
+
+ unsigned ShiftAmt = Shift.getConstantOperandVal(1);
+ unsigned MaskLZ = CountLeadingZeros_64(Mask);
+ unsigned MaskTZ = CountTrailingZeros_64(Mask);
+
+ // The amount of shift we're trying to fit into the addressing mode is taken
+ // from the trailing zeros of the mask.
+ unsigned AMShiftAmt = MaskTZ;
+
+ // There is nothing we can do here unless the mask is removing some bits.
+ // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
+ if (AMShiftAmt <= 0 || AMShiftAmt > 3) return true;
+
+ // We also need to ensure that mask is a continuous run of bits.
+ if (CountTrailingOnes_64(Mask >> MaskTZ) + MaskTZ + MaskLZ != 64) return true;
+
+ // Scale the leading zero count down based on the actual size of the value.
+ // Also scale it down based on the size of the shift.
+ MaskLZ -= (64 - X.getValueSizeInBits()) + ShiftAmt;
+
+ // The final check is to ensure that any masked out high bits of X are
+ // already known to be zero. Otherwise, the mask has a semantic impact
+ // other than masking out a couple of low bits. Unfortunately, because of
+ // the mask, zero extensions will be removed from operands in some cases.
+ // This code works extra hard to look through extensions because we can
+ // replace them with zero extensions cheaply if necessary.
+ bool ReplacingAnyExtend = false;
+ if (X.getOpcode() == ISD::ANY_EXTEND) {
+ unsigned ExtendBits =
+ X.getValueSizeInBits() - X.getOperand(0).getValueSizeInBits();
+ // Assume that we'll replace the any-extend with a zero-extend, and
+ // narrow the search to the extended value.
+ X = X.getOperand(0);
+ MaskLZ = ExtendBits > MaskLZ ? 0 : MaskLZ - ExtendBits;
+ ReplacingAnyExtend = true;
+ }
+ APInt MaskedHighBits = APInt::getHighBitsSet(X.getValueSizeInBits(),
+ MaskLZ);
+ APInt KnownZero, KnownOne;
+ DAG.ComputeMaskedBits(X, KnownZero, KnownOne);
+ if (MaskedHighBits != KnownZero) return true;
+
+ // We've identified a pattern that can be transformed into a single shift
+ // and an addressing mode. Make it so.
+ EVT VT = N.getValueType();
+ if (ReplacingAnyExtend) {
+ assert(X.getValueType() != VT);
+ // We looked through an ANY_EXTEND node, insert a ZERO_EXTEND.
+ SDValue NewX = DAG.getNode(ISD::ZERO_EXTEND, X.getDebugLoc(), VT, X);
+ InsertDAGNode(DAG, N, NewX);
+ X = NewX;
+ }
+ DebugLoc DL = N.getDebugLoc();
+ SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, MVT::i8);
+ SDValue NewSRL = DAG.getNode(ISD::SRL, DL, VT, X, NewSRLAmt);
+ SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, MVT::i8);
+ SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewSRL, NewSHLAmt);
+
+ // Insert the new nodes into the topological ordering. We must do this in
+ // a valid topological ordering as nothing is going to go back and re-sort
+ // these nodes. We continually insert before 'N' in sequence as this is
+ // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
+ // hierarchy left to express.
+ InsertDAGNode(DAG, N, NewSRLAmt);
+ InsertDAGNode(DAG, N, NewSRL);
+ InsertDAGNode(DAG, N, NewSHLAmt);
+ InsertDAGNode(DAG, N, NewSHL);
+ DAG.ReplaceAllUsesWith(N, NewSHL);
+
+ AM.Scale = 1 << AMShiftAmt;
+ AM.IndexReg = NewSRL;
+ return false;
+}
+
+bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
+ unsigned Depth) {
+ DebugLoc dl = N.getDebugLoc();
+ DEBUG({
+ dbgs() << "MatchAddress: ";
+ AM.dump();
+ });
+ // Limit recursion.
+ if (Depth > 5)
+ return MatchAddressBase(N, AM);
+
+ // If this is already a %rip relative address, we can only merge immediates
+ // into it. Instead of handling this in every case, we handle it here.
+ // RIP relative addressing: %rip + 32-bit displacement!
+ if (AM.isRIPRelative()) {
+ // FIXME: JumpTable and ExternalSymbol address currently don't like
+ // displacements. It isn't very important, but this should be fixed for
+ // consistency.
+ if (!AM.ES && AM.JT != -1) return true;
+
+ if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N))
+ if (!FoldOffsetIntoAddress(Cst->getSExtValue(), AM))
+ return false;
+ return true;
+ }
+
+ switch (N.getOpcode()) {
+ default: break;
+ case ISD::Constant: {
+ uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
+ if (!FoldOffsetIntoAddress(Val, AM))
+ return false;
+ break;
+ }
+
+ case X86ISD::Wrapper:
+ case X86ISD::WrapperRIP:
+ if (!MatchWrapper(N, AM))
+ return false;
+ break;
+
+ case ISD::LOAD:
+ if (!MatchLoadInAddress(cast<LoadSDNode>(N), AM))
+ return false;
+ break;
+
+ case ISD::FrameIndex:
+ if (AM.BaseType == X86ISelAddressMode::RegBase &&
+ AM.Base_Reg.getNode() == 0 &&
+ (!Subtarget->is64Bit() || isDispSafeForFrameIndex(AM.Disp))) {
+ AM.BaseType = X86ISelAddressMode::FrameIndexBase;
+ AM.Base_FrameIndex = cast<FrameIndexSDNode>(N)->getIndex();
+ return false;
+ }
+ break;
+
+ case ISD::SHL:
+ if (AM.IndexReg.getNode() != 0 || AM.Scale != 1)
+ break;
+
+ if (ConstantSDNode
+ *CN = dyn_cast<ConstantSDNode>(N.getNode()->getOperand(1))) {
+ unsigned Val = CN->getZExtValue();
+ // Note that we handle x<<1 as (,x,2) rather than (x,x) here so
+ // that the base operand remains free for further matching. If
+ // the base doesn't end up getting used, a post-processing step
+ // in MatchAddress turns (,x,2) into (x,x), which is cheaper.
+ if (Val == 1 || Val == 2 || Val == 3) {
+ AM.Scale = 1 << Val;
+ SDValue ShVal = N.getNode()->getOperand(0);
+
+ // Okay, we know that we have a scale by now. However, if the scaled
+ // value is an add of something and a constant, we can fold the
+ // constant into the disp field here.
+ if (CurDAG->isBaseWithConstantOffset(ShVal)) {
+ AM.IndexReg = ShVal.getNode()->getOperand(0);
+ ConstantSDNode *AddVal =
+ cast<ConstantSDNode>(ShVal.getNode()->getOperand(1));
+ uint64_t Disp = (uint64_t)AddVal->getSExtValue() << Val;
+ if (!FoldOffsetIntoAddress(Disp, AM))
+ return false;
+ }
+
+ AM.IndexReg = ShVal;
+ return false;
+ }
+ }
+ break;
+
+ case ISD::SRL: {
+ // Scale must not be used already.
+ if (AM.IndexReg.getNode() != 0 || AM.Scale != 1) break;
+
+ SDValue And = N.getOperand(0);
+ if (And.getOpcode() != ISD::AND) break;
+ SDValue X = And.getOperand(0);
+
+ // We only handle up to 64-bit values here as those are what matter for
+ // addressing mode optimizations.
+ if (X.getValueSizeInBits() > 64) break;
+
+ // The mask used for the transform is expected to be post-shift, but we
+ // found the shift first so just apply the shift to the mask before passing
+ // it down.
+ if (!isa<ConstantSDNode>(N.getOperand(1)) ||
+ !isa<ConstantSDNode>(And.getOperand(1)))
+ break;
+ uint64_t Mask = And.getConstantOperandVal(1) >> N.getConstantOperandVal(1);
+
+ // Try to fold the mask and shift into the scale, and return false if we
+ // succeed.
+ if (!FoldMaskAndShiftToScale(*CurDAG, N, Mask, N, X, AM))
+ return false;
+ break;
+ }
+
+ case ISD::SMUL_LOHI:
+ case ISD::UMUL_LOHI:
+ // A mul_lohi where we need the low part can be folded as a plain multiply.
+ if (N.getResNo() != 0) break;
+ // FALL THROUGH
+ case ISD::MUL:
+ case X86ISD::MUL_IMM:
+ // X*[3,5,9] -> X+X*[2,4,8]
+ if (AM.BaseType == X86ISelAddressMode::RegBase &&
+ AM.Base_Reg.getNode() == 0 &&
+ AM.IndexReg.getNode() == 0) {
+ if (ConstantSDNode
+ *CN = dyn_cast<ConstantSDNode>(N.getNode()->getOperand(1)))
+ if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 ||
+ CN->getZExtValue() == 9) {
+ AM.Scale = unsigned(CN->getZExtValue())-1;
+
+ SDValue MulVal = N.getNode()->getOperand(0);
+ SDValue Reg;
+
+ // Okay, we know that we have a scale by now. However, if the scaled
+ // value is an add of something and a constant, we can fold the
+ // constant into the disp field here.
+ if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() &&
+ isa<ConstantSDNode>(MulVal.getNode()->getOperand(1))) {
+ Reg = MulVal.getNode()->getOperand(0);
+ ConstantSDNode *AddVal =
+ cast<ConstantSDNode>(MulVal.getNode()->getOperand(1));
+ uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue();
+ if (FoldOffsetIntoAddress(Disp, AM))
+ Reg = N.getNode()->getOperand(0);
+ } else {
+ Reg = N.getNode()->getOperand(0);
+ }
+
+ AM.IndexReg = AM.Base_Reg = Reg;
+ return false;
+ }
+ }
+ break;
+
+ case ISD::SUB: {
+ // Given A-B, if A can be completely folded into the address and
+ // the index field with the index field unused, use -B as the index.
+ // This is a win if a has multiple parts that can be folded into
+ // the address. Also, this saves a mov if the base register has
+ // other uses, since it avoids a two-address sub instruction, however
+ // it costs an additional mov if the index register has other uses.
+
+ // Add an artificial use to this node so that we can keep track of
+ // it if it gets CSE'd with a different node.
+ HandleSDNode Handle(N);
+
+ // Test if the LHS of the sub can be folded.
+ X86ISelAddressMode Backup = AM;
+ if (MatchAddressRecursively(N.getNode()->getOperand(0), AM, Depth+1)) {
+ AM = Backup;
+ break;
+ }
+ // Test if the index field is free for use.
+ if (AM.IndexReg.getNode() || AM.isRIPRelative()) {
+ AM = Backup;
+ break;
+ }
+
+ int Cost = 0;
+ SDValue RHS = Handle.getValue().getNode()->getOperand(1);
+ // If the RHS involves a register with multiple uses, this
+ // transformation incurs an extra mov, due to the neg instruction
+ // clobbering its operand.
+ if (!RHS.getNode()->hasOneUse() ||
+ RHS.getNode()->getOpcode() == ISD::CopyFromReg ||
+ RHS.getNode()->getOpcode() == ISD::TRUNCATE ||
+ RHS.getNode()->getOpcode() == ISD::ANY_EXTEND ||
+ (RHS.getNode()->getOpcode() == ISD::ZERO_EXTEND &&
+ RHS.getNode()->getOperand(0).getValueType() == MVT::i32))
+ ++Cost;
+ // If the base is a register with multiple uses, this
+ // transformation may save a mov.
+ if ((AM.BaseType == X86ISelAddressMode::RegBase &&
+ AM.Base_Reg.getNode() &&
+ !AM.Base_Reg.getNode()->hasOneUse()) ||
+ AM.BaseType == X86ISelAddressMode::FrameIndexBase)
+ --Cost;
+ // If the folded LHS was interesting, this transformation saves
+ // address arithmetic.
+ if ((AM.hasSymbolicDisplacement() && !Backup.hasSymbolicDisplacement()) +
+ ((AM.Disp != 0) && (Backup.Disp == 0)) +
+ (AM.Segment.getNode() && !Backup.Segment.getNode()) >= 2)
+ --Cost;
+ // If it doesn't look like it may be an overall win, don't do it.
+ if (Cost >= 0) {
+ AM = Backup;
+ break;
+ }
+
+ // Ok, the transformation is legal and appears profitable. Go for it.
+ SDValue Zero = CurDAG->getConstant(0, N.getValueType());
+ SDValue Neg = CurDAG->getNode(ISD::SUB, dl, N.getValueType(), Zero, RHS);
+ AM.IndexReg = Neg;
+ AM.Scale = 1;
+
+ // Insert the new nodes into the topological ordering.
+ InsertDAGNode(*CurDAG, N, Zero);
+ InsertDAGNode(*CurDAG, N, Neg);
+ return false;
+ }
+
+ case ISD::ADD: {
+ // Add an artificial use to this node so that we can keep track of
+ // it if it gets CSE'd with a different node.
+ HandleSDNode Handle(N);
+
+ X86ISelAddressMode Backup = AM;
+ if (!MatchAddressRecursively(N.getOperand(0), AM, Depth+1) &&
+ !MatchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1))
+ return false;
+ AM = Backup;
+
+ // Try again after commuting the operands.
+ if (!MatchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1)&&
+ !MatchAddressRecursively(Handle.getValue().getOperand(0), AM, Depth+1))
+ return false;
+ AM = Backup;
+
+ // If we couldn't fold both operands into the address at the same time,
+ // see if we can just put each operand into a register and fold at least
+ // the add.
+ if (AM.BaseType == X86ISelAddressMode::RegBase &&
+ !AM.Base_Reg.getNode() &&
+ !AM.IndexReg.getNode()) {
+ N = Handle.getValue();
+ AM.Base_Reg = N.getOperand(0);
+ AM.IndexReg = N.getOperand(1);
+ AM.Scale = 1;
+ return false;
+ }
+ N = Handle.getValue();
+ break;
+ }
+
+ case ISD::OR:
+ // Handle "X | C" as "X + C" iff X is known to have C bits clear.
+ if (CurDAG->isBaseWithConstantOffset(N)) {
+ X86ISelAddressMode Backup = AM;
+ ConstantSDNode *CN = cast<ConstantSDNode>(N.getOperand(1));
+
+ // Start with the LHS as an addr mode.
+ if (!MatchAddressRecursively(N.getOperand(0), AM, Depth+1) &&
+ !FoldOffsetIntoAddress(CN->getSExtValue(), AM))
+ return false;
+ AM = Backup;
+ }
+ break;
+
+ case ISD::AND: {
+ // Perform some heroic transforms on an and of a constant-count shift
+ // with a constant to enable use of the scaled offset field.
+
+ // Scale must not be used already.
+ if (AM.IndexReg.getNode() != 0 || AM.Scale != 1) break;
+
+ SDValue Shift = N.getOperand(0);
+ if (Shift.getOpcode() != ISD::SRL && Shift.getOpcode() != ISD::SHL) break;
+ SDValue X = Shift.getOperand(0);
+
+ // We only handle up to 64-bit values here as those are what matter for
+ // addressing mode optimizations.
+ if (X.getValueSizeInBits() > 64) break;
+
+ if (!isa<ConstantSDNode>(N.getOperand(1)))
+ break;
+ uint64_t Mask = N.getConstantOperandVal(1);
+
+ // Try to fold the mask and shift into an extract and scale.
+ if (!FoldMaskAndShiftToExtract(*CurDAG, N, Mask, Shift, X, AM))
+ return false;
+
+ // Try to fold the mask and shift directly into the scale.
+ if (!FoldMaskAndShiftToScale(*CurDAG, N, Mask, Shift, X, AM))
+ return false;
+
+ // Try to swap the mask and shift to place shifts which can be done as
+ // a scale on the outside of the mask.
+ if (!FoldMaskedShiftToScaledMask(*CurDAG, N, Mask, Shift, X, AM))
+ return false;
+ break;
+ }
+ }
+
+ return MatchAddressBase(N, AM);
+}
+
+/// MatchAddressBase - Helper for MatchAddress. Add the specified node to the
+/// specified addressing mode without any further recursion.
+bool X86DAGToDAGISel::MatchAddressBase(SDValue N, X86ISelAddressMode &AM) {
+ // Is the base register already occupied?
+ if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base_Reg.getNode()) {
+ // If so, check to see if the scale index register is set.
+ if (AM.IndexReg.getNode() == 0) {
+ AM.IndexReg = N;
+ AM.Scale = 1;
+ return false;
+ }
+
+ // Otherwise, we cannot select it.
+ return true;
+ }
+
+ // Default, generate it as a register.
+ AM.BaseType = X86ISelAddressMode::RegBase;
+ AM.Base_Reg = N;
+ return false;
+}
+
+/// SelectAddr - returns true if it is able pattern match an addressing mode.
+/// It returns the operands which make up the maximal addressing mode it can
+/// match by reference.
+///
+/// Parent is the parent node of the addr operand that is being matched. It
+/// is always a load, store, atomic node, or null. It is only null when
+/// checking memory operands for inline asm nodes.
+bool X86DAGToDAGISel::SelectAddr(SDNode *Parent, SDValue N, SDValue &Base,
+ SDValue &Scale, SDValue &Index,
+ SDValue &Disp, SDValue &Segment) {
+ X86ISelAddressMode AM;
+
+ if (Parent &&
+ // This list of opcodes are all the nodes that have an "addr:$ptr" operand
+ // that are not a MemSDNode, and thus don't have proper addrspace info.
+ Parent->getOpcode() != ISD::INTRINSIC_W_CHAIN && // unaligned loads, fixme
+ Parent->getOpcode() != ISD::INTRINSIC_VOID && // nontemporal stores
+ Parent->getOpcode() != X86ISD::TLSCALL && // Fixme
+ Parent->getOpcode() != X86ISD::EH_SJLJ_SETJMP && // setjmp
+ Parent->getOpcode() != X86ISD::EH_SJLJ_LONGJMP) { // longjmp
+ unsigned AddrSpace =
+ cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace();
+ // AddrSpace 256 -> GS, 257 -> FS.
+ if (AddrSpace == 256)
+ AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
+ if (AddrSpace == 257)
+ AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
+ }
+
+ if (MatchAddress(N, AM))
+ return false;
+
+ EVT VT = N.getValueType();
+ if (AM.BaseType == X86ISelAddressMode::RegBase) {
+ if (!AM.Base_Reg.getNode())
+ AM.Base_Reg = CurDAG->getRegister(0, VT);
+ }
+
+ if (!AM.IndexReg.getNode())
+ AM.IndexReg = CurDAG->getRegister(0, VT);
+
+ getAddressOperands(AM, Base, Scale, Index, Disp, Segment);
+ return true;
+}
+
+/// SelectScalarSSELoad - Match a scalar SSE load. In particular, we want to
+/// match a load whose top elements are either undef or zeros. The load flavor
+/// is derived from the type of N, which is either v4f32 or v2f64.
+///
+/// We also return:
+/// PatternChainNode: this is the matched node that has a chain input and
+/// output.
+bool X86DAGToDAGISel::SelectScalarSSELoad(SDNode *Root,
+ SDValue N, SDValue &Base,
+ SDValue &Scale, SDValue &Index,
+ SDValue &Disp, SDValue &Segment,
+ SDValue &PatternNodeWithChain) {
+ if (N.getOpcode() == ISD::SCALAR_TO_VECTOR) {
+ PatternNodeWithChain = N.getOperand(0);
+ if (ISD::isNON_EXTLoad(PatternNodeWithChain.getNode()) &&
+ PatternNodeWithChain.hasOneUse() &&
+ IsProfitableToFold(N.getOperand(0), N.getNode(), Root) &&
+ IsLegalToFold(N.getOperand(0), N.getNode(), Root, OptLevel)) {
+ LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain);
+ if (!SelectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp, Segment))
+ return false;
+ return true;
+ }
+ }
+
+ // Also handle the case where we explicitly require zeros in the top
+ // elements. This is a vector shuffle from the zero vector.
+ if (N.getOpcode() == X86ISD::VZEXT_MOVL && N.getNode()->hasOneUse() &&
+ // Check to see if the top elements are all zeros (or bitcast of zeros).
+ N.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
+ N.getOperand(0).getNode()->hasOneUse() &&
+ ISD::isNON_EXTLoad(N.getOperand(0).getOperand(0).getNode()) &&
+ N.getOperand(0).getOperand(0).hasOneUse() &&
+ IsProfitableToFold(N.getOperand(0), N.getNode(), Root) &&
+ IsLegalToFold(N.getOperand(0), N.getNode(), Root, OptLevel)) {
+ // Okay, this is a zero extending load. Fold it.
+ LoadSDNode *LD = cast<LoadSDNode>(N.getOperand(0).getOperand(0));
+ if (!SelectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp, Segment))
+ return false;
+ PatternNodeWithChain = SDValue(LD, 0);
+ return true;
+ }
+ return false;
+}
+
+
+/// SelectLEAAddr - it calls SelectAddr and determines if the maximal addressing
+/// mode it matches can be cost effectively emitted as an LEA instruction.
+bool X86DAGToDAGISel::SelectLEAAddr(SDValue N,
+ SDValue &Base, SDValue &Scale,
+ SDValue &Index, SDValue &Disp,
+ SDValue &Segment) {
+ X86ISelAddressMode AM;
+
+ // Set AM.Segment to prevent MatchAddress from using one. LEA doesn't support
+ // segments.
+ SDValue Copy = AM.Segment;
+ SDValue T = CurDAG->getRegister(0, MVT::i32);
+ AM.Segment = T;
+ if (MatchAddress(N, AM))
+ return false;
+ assert (T == AM.Segment);
+ AM.Segment = Copy;
+
+ EVT VT = N.getValueType();
+ unsigned Complexity = 0;
+ if (AM.BaseType == X86ISelAddressMode::RegBase)
+ if (AM.Base_Reg.getNode())
+ Complexity = 1;
+ else
+ AM.Base_Reg = CurDAG->getRegister(0, VT);
+ else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
+ Complexity = 4;
+
+ if (AM.IndexReg.getNode())
+ Complexity++;
+ else
+ AM.IndexReg = CurDAG->getRegister(0, VT);
+
+ // Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg, or with
+ // a simple shift.
+ if (AM.Scale > 1)
+ Complexity++;
+
+ // FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA
+ // to a LEA. This is determined with some expermentation but is by no means
+ // optimal (especially for code size consideration). LEA is nice because of
+ // its three-address nature. Tweak the cost function again when we can run
+ // convertToThreeAddress() at register allocation time.
+ if (AM.hasSymbolicDisplacement()) {
+ // For X86-64, we should always use lea to materialize RIP relative
+ // addresses.
+ if (Subtarget->is64Bit())
+ Complexity = 4;
+ else
+ Complexity += 2;
+ }
+
+ if (AM.Disp && (AM.Base_Reg.getNode() || AM.IndexReg.getNode()))
+ Complexity++;
+
+ // If it isn't worth using an LEA, reject it.
+ if (Complexity <= 2)
+ return false;
+
+ getAddressOperands(AM, Base, Scale, Index, Disp, Segment);
+ return true;
+}
+
+/// SelectTLSADDRAddr - This is only run on TargetGlobalTLSAddress nodes.
+bool X86DAGToDAGISel::SelectTLSADDRAddr(SDValue N, SDValue &Base,
+ SDValue &Scale, SDValue &Index,
+ SDValue &Disp, SDValue &Segment) {
+ assert(N.getOpcode() == ISD::TargetGlobalTLSAddress);
+ const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(N);
+
+ X86ISelAddressMode AM;
+ AM.GV = GA->getGlobal();
+ AM.Disp += GA->getOffset();
+ AM.Base_Reg = CurDAG->getRegister(0, N.getValueType());
+ AM.SymbolFlags = GA->getTargetFlags();
+
+ if (N.getValueType() == MVT::i32) {
+ AM.Scale = 1;
+ AM.IndexReg = CurDAG->getRegister(X86::EBX, MVT::i32);
+ } else {
+ AM.IndexReg = CurDAG->getRegister(0, MVT::i64);
+ }
+
+ getAddressOperands(AM, Base, Scale, Index, Disp, Segment);
+ return true;
+}
+
+
+bool X86DAGToDAGISel::TryFoldLoad(SDNode *P, SDValue N,
+ SDValue &Base, SDValue &Scale,
+ SDValue &Index, SDValue &Disp,
+ SDValue &Segment) {
+ if (!ISD::isNON_EXTLoad(N.getNode()) ||
+ !IsProfitableToFold(N, P, P) ||
+ !IsLegalToFold(N, P, P, OptLevel))
+ return false;
+
+ return SelectAddr(N.getNode(),
+ N.getOperand(1), Base, Scale, Index, Disp, Segment);
+}
+
+/// getGlobalBaseReg - Return an SDNode that returns the value of
+/// the global base register. Output instructions required to
+/// initialize the global base register, if necessary.
+///
+SDNode *X86DAGToDAGISel::getGlobalBaseReg() {
+ unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF);
+ return CurDAG->getRegister(GlobalBaseReg, TLI.getPointerTy()).getNode();
+}
+
+SDNode *X86DAGToDAGISel::SelectAtomic64(SDNode *Node, unsigned Opc) {
+ SDValue Chain = Node->getOperand(0);
+ SDValue In1 = Node->getOperand(1);
+ SDValue In2L = Node->getOperand(2);
+ SDValue In2H = Node->getOperand(3);
+
+ SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
+ if (!SelectAddr(Node, In1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4))
+ return NULL;
+ MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+ MemOp[0] = cast<MemSDNode>(Node)->getMemOperand();
+ const SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, In2L, In2H, Chain};
+ SDNode *ResNode = CurDAG->getMachineNode(Opc, Node->getDebugLoc(),
+ MVT::i32, MVT::i32, MVT::Other, Ops);
+ cast<MachineSDNode>(ResNode)->setMemRefs(MemOp, MemOp + 1);
+ return ResNode;
+}
+
+/// Atomic opcode table
+///
+enum AtomicOpc {
+ ADD,
+ SUB,
+ INC,
+ DEC,
+ OR,
+ AND,
+ XOR,
+ AtomicOpcEnd
+};
+
+enum AtomicSz {
+ ConstantI8,
+ I8,
+ SextConstantI16,
+ ConstantI16,
+ I16,
+ SextConstantI32,
+ ConstantI32,
+ I32,
+ SextConstantI64,
+ ConstantI64,
+ I64,
+ AtomicSzEnd
+};
+
+static const uint16_t AtomicOpcTbl[AtomicOpcEnd][AtomicSzEnd] = {
+ {
+ X86::LOCK_ADD8mi,
+ X86::LOCK_ADD8mr,
+ X86::LOCK_ADD16mi8,
+ X86::LOCK_ADD16mi,
+ X86::LOCK_ADD16mr,
+ X86::LOCK_ADD32mi8,
+ X86::LOCK_ADD32mi,
+ X86::LOCK_ADD32mr,
+ X86::LOCK_ADD64mi8,
+ X86::LOCK_ADD64mi32,
+ X86::LOCK_ADD64mr,
+ },
+ {
+ X86::LOCK_SUB8mi,
+ X86::LOCK_SUB8mr,
+ X86::LOCK_SUB16mi8,
+ X86::LOCK_SUB16mi,
+ X86::LOCK_SUB16mr,
+ X86::LOCK_SUB32mi8,
+ X86::LOCK_SUB32mi,
+ X86::LOCK_SUB32mr,
+ X86::LOCK_SUB64mi8,
+ X86::LOCK_SUB64mi32,
+ X86::LOCK_SUB64mr,
+ },
+ {
+ 0,
+ X86::LOCK_INC8m,
+ 0,
+ 0,
+ X86::LOCK_INC16m,
+ 0,
+ 0,
+ X86::LOCK_INC32m,
+ 0,
+ 0,
+ X86::LOCK_INC64m,
+ },
+ {
+ 0,
+ X86::LOCK_DEC8m,
+ 0,
+ 0,
+ X86::LOCK_DEC16m,
+ 0,
+ 0,
+ X86::LOCK_DEC32m,
+ 0,
+ 0,
+ X86::LOCK_DEC64m,
+ },
+ {
+ X86::LOCK_OR8mi,
+ X86::LOCK_OR8mr,
+ X86::LOCK_OR16mi8,
+ X86::LOCK_OR16mi,
+ X86::LOCK_OR16mr,
+ X86::LOCK_OR32mi8,
+ X86::LOCK_OR32mi,
+ X86::LOCK_OR32mr,
+ X86::LOCK_OR64mi8,
+ X86::LOCK_OR64mi32,
+ X86::LOCK_OR64mr,
+ },
+ {
+ X86::LOCK_AND8mi,
+ X86::LOCK_AND8mr,
+ X86::LOCK_AND16mi8,
+ X86::LOCK_AND16mi,
+ X86::LOCK_AND16mr,
+ X86::LOCK_AND32mi8,
+ X86::LOCK_AND32mi,
+ X86::LOCK_AND32mr,
+ X86::LOCK_AND64mi8,
+ X86::LOCK_AND64mi32,
+ X86::LOCK_AND64mr,
+ },
+ {
+ X86::LOCK_XOR8mi,
+ X86::LOCK_XOR8mr,
+ X86::LOCK_XOR16mi8,
+ X86::LOCK_XOR16mi,
+ X86::LOCK_XOR16mr,
+ X86::LOCK_XOR32mi8,
+ X86::LOCK_XOR32mi,
+ X86::LOCK_XOR32mr,
+ X86::LOCK_XOR64mi8,
+ X86::LOCK_XOR64mi32,
+ X86::LOCK_XOR64mr,
+ }
+};
+
+// Return the target constant operand for atomic-load-op and do simple
+// translations, such as from atomic-load-add to lock-sub. The return value is
+// one of the following 3 cases:
+// + target-constant, the operand could be supported as a target constant.
+// + empty, the operand is not needed any more with the new op selected.
+// + non-empty, otherwise.
+static SDValue getAtomicLoadArithTargetConstant(SelectionDAG *CurDAG,
+ DebugLoc dl,
+ enum AtomicOpc &Op, EVT NVT,
+ SDValue Val) {
+ if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Val)) {
+ int64_t CNVal = CN->getSExtValue();
+ // Quit if not 32-bit imm.
+ if ((int32_t)CNVal != CNVal)
+ return Val;
+ // For atomic-load-add, we could do some optimizations.
+ if (Op == ADD) {
+ // Translate to INC/DEC if ADD by 1 or -1.
+ if ((CNVal == 1) || (CNVal == -1)) {
+ Op = (CNVal == 1) ? INC : DEC;
+ // No more constant operand after being translated into INC/DEC.
+ return SDValue();
+ }
+ // Translate to SUB if ADD by negative value.
+ if (CNVal < 0) {
+ Op = SUB;
+ CNVal = -CNVal;
+ }
+ }
+ return CurDAG->getTargetConstant(CNVal, NVT);
+ }
+
+ // If the value operand is single-used, try to optimize it.
+ if (Op == ADD && Val.hasOneUse()) {
+ // Translate (atomic-load-add ptr (sub 0 x)) back to (lock-sub x).
+ if (Val.getOpcode() == ISD::SUB && X86::isZeroNode(Val.getOperand(0))) {
+ Op = SUB;
+ return Val.getOperand(1);
+ }
+ // A special case for i16, which needs truncating as, in most cases, it's
+ // promoted to i32. We will translate
+ // (atomic-load-add (truncate (sub 0 x))) to (lock-sub (EXTRACT_SUBREG x))
+ if (Val.getOpcode() == ISD::TRUNCATE && NVT == MVT::i16 &&
+ Val.getOperand(0).getOpcode() == ISD::SUB &&
+ X86::isZeroNode(Val.getOperand(0).getOperand(0))) {
+ Op = SUB;
+ Val = Val.getOperand(0);
+ return CurDAG->getTargetExtractSubreg(X86::sub_16bit, dl, NVT,
+ Val.getOperand(1));
+ }
+ }
+
+ return Val;
+}
+
+SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, EVT NVT) {
+ if (Node->hasAnyUseOfValue(0))
+ return 0;
+
+ DebugLoc dl = Node->getDebugLoc();
+
+ // Optimize common patterns for __sync_or_and_fetch and similar arith
+ // operations where the result is not used. This allows us to use the "lock"
+ // version of the arithmetic instruction.
+ SDValue Chain = Node->getOperand(0);
+ SDValue Ptr = Node->getOperand(1);
+ SDValue Val = Node->getOperand(2);
+ SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
+ if (!SelectAddr(Node, Ptr, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4))
+ return 0;
+
+ // Which index into the table.
+ enum AtomicOpc Op;
+ switch (Node->getOpcode()) {
+ default:
+ return 0;
+ case ISD::ATOMIC_LOAD_OR:
+ Op = OR;
+ break;
+ case ISD::ATOMIC_LOAD_AND:
+ Op = AND;
+ break;
+ case ISD::ATOMIC_LOAD_XOR:
+ Op = XOR;
+ break;
+ case ISD::ATOMIC_LOAD_ADD:
+ Op = ADD;
+ break;
+ }
+
+ Val = getAtomicLoadArithTargetConstant(CurDAG, dl, Op, NVT, Val);
+ bool isUnOp = !Val.getNode();
+ bool isCN = Val.getNode() && (Val.getOpcode() == ISD::TargetConstant);
+
+ unsigned Opc = 0;
+ switch (NVT.getSimpleVT().SimpleTy) {
+ default: return 0;
+ case MVT::i8:
+ if (isCN)
+ Opc = AtomicOpcTbl[Op][ConstantI8];
+ else
+ Opc = AtomicOpcTbl[Op][I8];
+ break;
+ case MVT::i16:
+ if (isCN) {
+ if (immSext8(Val.getNode()))
+ Opc = AtomicOpcTbl[Op][SextConstantI16];
+ else
+ Opc = AtomicOpcTbl[Op][ConstantI16];
+ } else
+ Opc = AtomicOpcTbl[Op][I16];
+ break;
+ case MVT::i32:
+ if (isCN) {
+ if (immSext8(Val.getNode()))
+ Opc = AtomicOpcTbl[Op][SextConstantI32];
+ else
+ Opc = AtomicOpcTbl[Op][ConstantI32];
+ } else
+ Opc = AtomicOpcTbl[Op][I32];
+ break;
+ case MVT::i64:
+ Opc = AtomicOpcTbl[Op][I64];
+ if (isCN) {
+ if (immSext8(Val.getNode()))
+ Opc = AtomicOpcTbl[Op][SextConstantI64];
+ else if (i64immSExt32(Val.getNode()))
+ Opc = AtomicOpcTbl[Op][ConstantI64];
+ }
+ break;
+ }
+
+ assert(Opc != 0 && "Invalid arith lock transform!");
+
+ SDValue Ret;
+ SDValue Undef = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
+ dl, NVT), 0);
+ MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+ MemOp[0] = cast<MemSDNode>(Node)->getMemOperand();
+ if (isUnOp) {
+ SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain };
+ Ret = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops), 0);
+ } else {
+ SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Val, Chain };
+ Ret = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops), 0);
+ }
+ cast<MachineSDNode>(Ret)->setMemRefs(MemOp, MemOp + 1);
+ SDValue RetVals[] = { Undef, Ret };
+ return CurDAG->getMergeValues(RetVals, 2, dl).getNode();
+}
+
+/// HasNoSignedComparisonUses - Test whether the given X86ISD::CMP node has
+/// any uses which require the SF or OF bits to be accurate.
+static bool HasNoSignedComparisonUses(SDNode *N) {
+ // Examine each user of the node.
+ for (SDNode::use_iterator UI = N->use_begin(),
+ UE = N->use_end(); UI != UE; ++UI) {
+ // Only examine CopyToReg uses.
+ if (UI->getOpcode() != ISD::CopyToReg)
+ return false;
+ // Only examine CopyToReg uses that copy to EFLAGS.
+ if (cast<RegisterSDNode>(UI->getOperand(1))->getReg() !=
+ X86::EFLAGS)
+ return false;
+ // Examine each user of the CopyToReg use.
+ for (SDNode::use_iterator FlagUI = UI->use_begin(),
+ FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) {
+ // Only examine the Flag result.
+ if (FlagUI.getUse().getResNo() != 1) continue;
+ // Anything unusual: assume conservatively.
+ if (!FlagUI->isMachineOpcode()) return false;
+ // Examine the opcode of the user.
+ switch (FlagUI->getMachineOpcode()) {
+ // These comparisons don't treat the most significant bit specially.
+ case X86::SETAr: case X86::SETAEr: case X86::SETBr: case X86::SETBEr:
+ case X86::SETEr: case X86::SETNEr: case X86::SETPr: case X86::SETNPr:
+ case X86::SETAm: case X86::SETAEm: case X86::SETBm: case X86::SETBEm:
+ case X86::SETEm: case X86::SETNEm: case X86::SETPm: case X86::SETNPm:
+ case X86::JA_4: case X86::JAE_4: case X86::JB_4: case X86::JBE_4:
+ case X86::JE_4: case X86::JNE_4: case X86::JP_4: case X86::JNP_4:
+ case X86::CMOVA16rr: case X86::CMOVA16rm:
+ case X86::CMOVA32rr: case X86::CMOVA32rm:
+ case X86::CMOVA64rr: case X86::CMOVA64rm:
+ case X86::CMOVAE16rr: case X86::CMOVAE16rm:
+ case X86::CMOVAE32rr: case X86::CMOVAE32rm:
+ case X86::CMOVAE64rr: case X86::CMOVAE64rm:
+ case X86::CMOVB16rr: case X86::CMOVB16rm:
+ case X86::CMOVB32rr: case X86::CMOVB32rm:
+ case X86::CMOVB64rr: case X86::CMOVB64rm:
+ case X86::CMOVBE16rr: case X86::CMOVBE16rm:
+ case X86::CMOVBE32rr: case X86::CMOVBE32rm:
+ case X86::CMOVBE64rr: case X86::CMOVBE64rm:
+ case X86::CMOVE16rr: case X86::CMOVE16rm:
+ case X86::CMOVE32rr: case X86::CMOVE32rm:
+ case X86::CMOVE64rr: case X86::CMOVE64rm:
+ case X86::CMOVNE16rr: case X86::CMOVNE16rm:
+ case X86::CMOVNE32rr: case X86::CMOVNE32rm:
+ case X86::CMOVNE64rr: case X86::CMOVNE64rm:
+ case X86::CMOVNP16rr: case X86::CMOVNP16rm:
+ case X86::CMOVNP32rr: case X86::CMOVNP32rm:
+ case X86::CMOVNP64rr: case X86::CMOVNP64rm:
+ case X86::CMOVP16rr: case X86::CMOVP16rm:
+ case X86::CMOVP32rr: case X86::CMOVP32rm:
+ case X86::CMOVP64rr: case X86::CMOVP64rm:
+ continue;
+ // Anything else: assume conservatively.
+ default: return false;
+ }
+ }
+ }
+ return true;
+}
+
+/// isLoadIncOrDecStore - Check whether or not the chain ending in StoreNode
+/// is suitable for doing the {load; increment or decrement; store} to modify
+/// transformation.
+static bool isLoadIncOrDecStore(StoreSDNode *StoreNode, unsigned Opc,
+ SDValue StoredVal, SelectionDAG *CurDAG,
+ LoadSDNode* &LoadNode, SDValue &InputChain) {
+
+ // is the value stored the result of a DEC or INC?
+ if (!(Opc == X86ISD::DEC || Opc == X86ISD::INC)) return false;
+
+ // is the stored value result 0 of the load?
+ if (StoredVal.getResNo() != 0) return false;
+
+ // are there other uses of the loaded value than the inc or dec?
+ if (!StoredVal.getNode()->hasNUsesOfValue(1, 0)) return false;
+
+ // is the store non-extending and non-indexed?
+ if (!ISD::isNormalStore(StoreNode) || StoreNode->isNonTemporal())
+ return false;
+
+ SDValue Load = StoredVal->getOperand(0);
+ // Is the stored value a non-extending and non-indexed load?
+ if (!ISD::isNormalLoad(Load.getNode())) return false;
+
+ // Return LoadNode by reference.
+ LoadNode = cast<LoadSDNode>(Load);
+ // is the size of the value one that we can handle? (i.e. 64, 32, 16, or 8)
+ EVT LdVT = LoadNode->getMemoryVT();
+ if (LdVT != MVT::i64 && LdVT != MVT::i32 && LdVT != MVT::i16 &&
+ LdVT != MVT::i8)
+ return false;
+
+ // Is store the only read of the loaded value?
+ if (!Load.hasOneUse())
+ return false;
+
+ // Is the address of the store the same as the load?
+ if (LoadNode->getBasePtr() != StoreNode->getBasePtr() ||
+ LoadNode->getOffset() != StoreNode->getOffset())
+ return false;
+
+ // Check if the chain is produced by the load or is a TokenFactor with
+ // the load output chain as an operand. Return InputChain by reference.
+ SDValue Chain = StoreNode->getChain();
+
+ bool ChainCheck = false;
+ if (Chain == Load.getValue(1)) {
+ ChainCheck = true;
+ InputChain = LoadNode->getChain();
+ } else if (Chain.getOpcode() == ISD::TokenFactor) {
+ SmallVector<SDValue, 4> ChainOps;
+ for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) {
+ SDValue Op = Chain.getOperand(i);
+ if (Op == Load.getValue(1)) {
+ ChainCheck = true;
+ continue;
+ }
+
+ // Make sure using Op as part of the chain would not cause a cycle here.
+ // In theory, we could check whether the chain node is a predecessor of
+ // the load. But that can be very expensive. Instead visit the uses and
+ // make sure they all have smaller node id than the load.
+ int LoadId = LoadNode->getNodeId();
+ for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
+ UE = UI->use_end(); UI != UE; ++UI) {
+ if (UI.getUse().getResNo() != 0)
+ continue;
+ if (UI->getNodeId() > LoadId)
+ return false;
+ }
+
+ ChainOps.push_back(Op);
+ }
+
+ if (ChainCheck)
+ // Make a new TokenFactor with all the other input chains except
+ // for the load.
+ InputChain = CurDAG->getNode(ISD::TokenFactor, Chain.getDebugLoc(),
+ MVT::Other, &ChainOps[0], ChainOps.size());
+ }
+ if (!ChainCheck)
+ return false;
+
+ return true;
+}
+
+/// getFusedLdStOpcode - Get the appropriate X86 opcode for an in memory
+/// increment or decrement. Opc should be X86ISD::DEC or X86ISD::INC.
+static unsigned getFusedLdStOpcode(EVT &LdVT, unsigned Opc) {
+ if (Opc == X86ISD::DEC) {
+ if (LdVT == MVT::i64) return X86::DEC64m;
+ if (LdVT == MVT::i32) return X86::DEC32m;
+ if (LdVT == MVT::i16) return X86::DEC16m;
+ if (LdVT == MVT::i8) return X86::DEC8m;
+ } else {
+ assert(Opc == X86ISD::INC && "unrecognized opcode");
+ if (LdVT == MVT::i64) return X86::INC64m;
+ if (LdVT == MVT::i32) return X86::INC32m;
+ if (LdVT == MVT::i16) return X86::INC16m;
+ if (LdVT == MVT::i8) return X86::INC8m;
+ }
+ llvm_unreachable("unrecognized size for LdVT");
+}
+
+/// SelectGather - Customized ISel for GATHER operations.
+///
+SDNode *X86DAGToDAGISel::SelectGather(SDNode *Node, unsigned Opc) {
+ // Operands of Gather: VSrc, Base, VIdx, VMask, Scale
+ SDValue Chain = Node->getOperand(0);
+ SDValue VSrc = Node->getOperand(2);
+ SDValue Base = Node->getOperand(3);
+ SDValue VIdx = Node->getOperand(4);
+ SDValue VMask = Node->getOperand(5);
+ ConstantSDNode *Scale = dyn_cast<ConstantSDNode>(Node->getOperand(6));
+ if (!Scale)
+ return 0;
+
+ SDVTList VTs = CurDAG->getVTList(VSrc.getValueType(), VSrc.getValueType(),
+ MVT::Other);
+
+ // Memory Operands: Base, Scale, Index, Disp, Segment
+ SDValue Disp = CurDAG->getTargetConstant(0, MVT::i32);
+ SDValue Segment = CurDAG->getRegister(0, MVT::i32);
+ const SDValue Ops[] = { VSrc, Base, getI8Imm(Scale->getSExtValue()), VIdx,
+ Disp, Segment, VMask, Chain};
+ SDNode *ResNode = CurDAG->getMachineNode(Opc, Node->getDebugLoc(), VTs, Ops);
+ // Node has 2 outputs: VDst and MVT::Other.
+ // ResNode has 3 outputs: VDst, VMask_wb, and MVT::Other.
+ // We replace VDst of Node with VDst of ResNode, and Other of Node with Other
+ // of ResNode.
+ ReplaceUses(SDValue(Node, 0), SDValue(ResNode, 0));
+ ReplaceUses(SDValue(Node, 1), SDValue(ResNode, 2));
+ return ResNode;
+}
+
+SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
+ EVT NVT = Node->getValueType(0);
+ unsigned Opc, MOpc;
+ unsigned Opcode = Node->getOpcode();
+ DebugLoc dl = Node->getDebugLoc();
+
+ DEBUG(dbgs() << "Selecting: "; Node->dump(CurDAG); dbgs() << '\n');
+
+ if (Node->isMachineOpcode()) {
+ DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n');
+ return NULL; // Already selected.
+ }
+
+ switch (Opcode) {
+ default: break;
+ case ISD::INTRINSIC_W_CHAIN: {
+ unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
+ switch (IntNo) {
+ default: break;
+ case Intrinsic::x86_avx2_gather_d_pd:
+ case Intrinsic::x86_avx2_gather_d_pd_256:
+ case Intrinsic::x86_avx2_gather_q_pd:
+ case Intrinsic::x86_avx2_gather_q_pd_256:
+ case Intrinsic::x86_avx2_gather_d_ps:
+ case Intrinsic::x86_avx2_gather_d_ps_256:
+ case Intrinsic::x86_avx2_gather_q_ps:
+ case Intrinsic::x86_avx2_gather_q_ps_256:
+ case Intrinsic::x86_avx2_gather_d_q:
+ case Intrinsic::x86_avx2_gather_d_q_256:
+ case Intrinsic::x86_avx2_gather_q_q:
+ case Intrinsic::x86_avx2_gather_q_q_256:
+ case Intrinsic::x86_avx2_gather_d_d:
+ case Intrinsic::x86_avx2_gather_d_d_256:
+ case Intrinsic::x86_avx2_gather_q_d:
+ case Intrinsic::x86_avx2_gather_q_d_256: {
+ unsigned Opc;
+ switch (IntNo) {
+ default: llvm_unreachable("Impossible intrinsic");
+ case Intrinsic::x86_avx2_gather_d_pd: Opc = X86::VGATHERDPDrm; break;
+ case Intrinsic::x86_avx2_gather_d_pd_256: Opc = X86::VGATHERDPDYrm; break;
+ case Intrinsic::x86_avx2_gather_q_pd: Opc = X86::VGATHERQPDrm; break;
+ case Intrinsic::x86_avx2_gather_q_pd_256: Opc = X86::VGATHERQPDYrm; break;
+ case Intrinsic::x86_avx2_gather_d_ps: Opc = X86::VGATHERDPSrm; break;
+ case Intrinsic::x86_avx2_gather_d_ps_256: Opc = X86::VGATHERDPSYrm; break;
+ case Intrinsic::x86_avx2_gather_q_ps: Opc = X86::VGATHERQPSrm; break;
+ case Intrinsic::x86_avx2_gather_q_ps_256: Opc = X86::VGATHERQPSYrm; break;
+ case Intrinsic::x86_avx2_gather_d_q: Opc = X86::VPGATHERDQrm; break;
+ case Intrinsic::x86_avx2_gather_d_q_256: Opc = X86::VPGATHERDQYrm; break;
+ case Intrinsic::x86_avx2_gather_q_q: Opc = X86::VPGATHERQQrm; break;
+ case Intrinsic::x86_avx2_gather_q_q_256: Opc = X86::VPGATHERQQYrm; break;
+ case Intrinsic::x86_avx2_gather_d_d: Opc = X86::VPGATHERDDrm; break;
+ case Intrinsic::x86_avx2_gather_d_d_256: Opc = X86::VPGATHERDDYrm; break;
+ case Intrinsic::x86_avx2_gather_q_d: Opc = X86::VPGATHERQDrm; break;
+ case Intrinsic::x86_avx2_gather_q_d_256: Opc = X86::VPGATHERQDYrm; break;
+ }
+ SDNode *RetVal = SelectGather(Node, Opc);
+ if (RetVal)
+ // We already called ReplaceUses inside SelectGather.
+ return NULL;
+ break;
+ }
+ }
+ break;
+ }
+ case X86ISD::GlobalBaseReg:
+ return getGlobalBaseReg();
+
+
+ case X86ISD::ATOMOR64_DAG:
+ case X86ISD::ATOMXOR64_DAG:
+ case X86ISD::ATOMADD64_DAG:
+ case X86ISD::ATOMSUB64_DAG:
+ case X86ISD::ATOMNAND64_DAG:
+ case X86ISD::ATOMAND64_DAG:
+ case X86ISD::ATOMMAX64_DAG:
+ case X86ISD::ATOMMIN64_DAG:
+ case X86ISD::ATOMUMAX64_DAG:
+ case X86ISD::ATOMUMIN64_DAG:
+ case X86ISD::ATOMSWAP64_DAG: {
+ unsigned Opc;
+ switch (Opcode) {
+ default: llvm_unreachable("Impossible opcode");
+ case X86ISD::ATOMOR64_DAG: Opc = X86::ATOMOR6432; break;
+ case X86ISD::ATOMXOR64_DAG: Opc = X86::ATOMXOR6432; break;
+ case X86ISD::ATOMADD64_DAG: Opc = X86::ATOMADD6432; break;
+ case X86ISD::ATOMSUB64_DAG: Opc = X86::ATOMSUB6432; break;
+ case X86ISD::ATOMNAND64_DAG: Opc = X86::ATOMNAND6432; break;
+ case X86ISD::ATOMAND64_DAG: Opc = X86::ATOMAND6432; break;
+ case X86ISD::ATOMMAX64_DAG: Opc = X86::ATOMMAX6432; break;
+ case X86ISD::ATOMMIN64_DAG: Opc = X86::ATOMMIN6432; break;
+ case X86ISD::ATOMUMAX64_DAG: Opc = X86::ATOMUMAX6432; break;
+ case X86ISD::ATOMUMIN64_DAG: Opc = X86::ATOMUMIN6432; break;
+ case X86ISD::ATOMSWAP64_DAG: Opc = X86::ATOMSWAP6432; break;
+ }
+ SDNode *RetVal = SelectAtomic64(Node, Opc);
+ if (RetVal)
+ return RetVal;
+ break;
+ }
+
+ case ISD::ATOMIC_LOAD_XOR:
+ case ISD::ATOMIC_LOAD_AND:
+ case ISD::ATOMIC_LOAD_OR:
+ case ISD::ATOMIC_LOAD_ADD: {
+ SDNode *RetVal = SelectAtomicLoadArith(Node, NVT);
+ if (RetVal)
+ return RetVal;
+ break;
+ }
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR: {
+ // For operations of the form (x << C1) op C2, check if we can use a smaller
+ // encoding for C2 by transforming it into (x op (C2>>C1)) << C1.
+ SDValue N0 = Node->getOperand(0);
+ SDValue N1 = Node->getOperand(1);
+
+ if (N0->getOpcode() != ISD::SHL || !N0->hasOneUse())
+ break;
+
+ // i8 is unshrinkable, i16 should be promoted to i32.
+ if (NVT != MVT::i32 && NVT != MVT::i64)
+ break;
+
+ ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N1);
+ ConstantSDNode *ShlCst = dyn_cast<ConstantSDNode>(N0->getOperand(1));
+ if (!Cst || !ShlCst)
+ break;
+
+ int64_t Val = Cst->getSExtValue();
+ uint64_t ShlVal = ShlCst->getZExtValue();
+
+ // Make sure that we don't change the operation by removing bits.
+ // This only matters for OR and XOR, AND is unaffected.
+ uint64_t RemovedBitsMask = (1ULL << ShlVal) - 1;
+ if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0)
+ break;
+
+ unsigned ShlOp, Op;
+ EVT CstVT = NVT;
+
+ // Check the minimum bitwidth for the new constant.
+ // TODO: AND32ri is the same as AND64ri32 with zext imm.
+ // TODO: MOV32ri+OR64r is cheaper than MOV64ri64+OR64rr
+ // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32.
+ if (!isInt<8>(Val) && isInt<8>(Val >> ShlVal))
+ CstVT = MVT::i8;
+ else if (!isInt<32>(Val) && isInt<32>(Val >> ShlVal))
+ CstVT = MVT::i32;
+
+ // Bail if there is no smaller encoding.
+ if (NVT == CstVT)
+ break;
+
+ switch (NVT.getSimpleVT().SimpleTy) {
+ default: llvm_unreachable("Unsupported VT!");
+ case MVT::i32:
+ assert(CstVT == MVT::i8);
+ ShlOp = X86::SHL32ri;
+
+ switch (Opcode) {
+ default: llvm_unreachable("Impossible opcode");
+ case ISD::AND: Op = X86::AND32ri8; break;
+ case ISD::OR: Op = X86::OR32ri8; break;
+ case ISD::XOR: Op = X86::XOR32ri8; break;
+ }
+ break;
+ case MVT::i64:
+ assert(CstVT == MVT::i8 || CstVT == MVT::i32);
+ ShlOp = X86::SHL64ri;
+
+ switch (Opcode) {
+ default: llvm_unreachable("Impossible opcode");
+ case ISD::AND: Op = CstVT==MVT::i8? X86::AND64ri8 : X86::AND64ri32; break;
+ case ISD::OR: Op = CstVT==MVT::i8? X86::OR64ri8 : X86::OR64ri32; break;
+ case ISD::XOR: Op = CstVT==MVT::i8? X86::XOR64ri8 : X86::XOR64ri32; break;
+ }
+ break;
+ }
+
+ // Emit the smaller op and the shift.
+ SDValue NewCst = CurDAG->getTargetConstant(Val >> ShlVal, CstVT);
+ SDNode *New = CurDAG->getMachineNode(Op, dl, NVT, N0->getOperand(0),NewCst);
+ return CurDAG->SelectNodeTo(Node, ShlOp, NVT, SDValue(New, 0),
+ getI8Imm(ShlVal));
+ }
+ case X86ISD::UMUL: {
+ SDValue N0 = Node->getOperand(0);
+ SDValue N1 = Node->getOperand(1);
+
+ unsigned LoReg;
+ switch (NVT.getSimpleVT().SimpleTy) {
+ default: llvm_unreachable("Unsupported VT!");
+ case MVT::i8: LoReg = X86::AL; Opc = X86::MUL8r; break;
+ case MVT::i16: LoReg = X86::AX; Opc = X86::MUL16r; break;
+ case MVT::i32: LoReg = X86::EAX; Opc = X86::MUL32r; break;
+ case MVT::i64: LoReg = X86::RAX; Opc = X86::MUL64r; break;
+ }
+
+ SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
+ N0, SDValue()).getValue(1);
+
+ SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::i32);
+ SDValue Ops[] = {N1, InFlag};
+ SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
+
+ ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
+ ReplaceUses(SDValue(Node, 1), SDValue(CNode, 1));
+ ReplaceUses(SDValue(Node, 2), SDValue(CNode, 2));
+ return NULL;
+ }
+
+ case ISD::SMUL_LOHI:
+ case ISD::UMUL_LOHI: {
+ SDValue N0 = Node->getOperand(0);
+ SDValue N1 = Node->getOperand(1);
+
+ bool isSigned = Opcode == ISD::SMUL_LOHI;
+ bool hasBMI2 = Subtarget->hasBMI2();
+ if (!isSigned) {
+ switch (NVT.getSimpleVT().SimpleTy) {
+ default: llvm_unreachable("Unsupported VT!");
+ case MVT::i8: Opc = X86::MUL8r; MOpc = X86::MUL8m; break;
+ case MVT::i16: Opc = X86::MUL16r; MOpc = X86::MUL16m; break;
+ case MVT::i32: Opc = hasBMI2 ? X86::MULX32rr : X86::MUL32r;
+ MOpc = hasBMI2 ? X86::MULX32rm : X86::MUL32m; break;
+ case MVT::i64: Opc = hasBMI2 ? X86::MULX64rr : X86::MUL64r;
+ MOpc = hasBMI2 ? X86::MULX64rm : X86::MUL64m; break;
+ }
+ } else {
+ switch (NVT.getSimpleVT().SimpleTy) {
+ default: llvm_unreachable("Unsupported VT!");
+ case MVT::i8: Opc = X86::IMUL8r; MOpc = X86::IMUL8m; break;
+ case MVT::i16: Opc = X86::IMUL16r; MOpc = X86::IMUL16m; break;
+ case MVT::i32: Opc = X86::IMUL32r; MOpc = X86::IMUL32m; break;
+ case MVT::i64: Opc = X86::IMUL64r; MOpc = X86::IMUL64m; break;
+ }
+ }
+
+ unsigned SrcReg, LoReg, HiReg;
+ switch (Opc) {
+ default: llvm_unreachable("Unknown MUL opcode!");
+ case X86::IMUL8r:
+ case X86::MUL8r:
+ SrcReg = LoReg = X86::AL; HiReg = X86::AH;
+ break;
+ case X86::IMUL16r:
+ case X86::MUL16r:
+ SrcReg = LoReg = X86::AX; HiReg = X86::DX;
+ break;
+ case X86::IMUL32r:
+ case X86::MUL32r:
+ SrcReg = LoReg = X86::EAX; HiReg = X86::EDX;
+ break;
+ case X86::IMUL64r:
+ case X86::MUL64r:
+ SrcReg = LoReg = X86::RAX; HiReg = X86::RDX;
+ break;
+ case X86::MULX32rr:
+ SrcReg = X86::EDX; LoReg = HiReg = 0;
+ break;
+ case X86::MULX64rr:
+ SrcReg = X86::RDX; LoReg = HiReg = 0;
+ break;
+ }
+
+ SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
+ bool foldedLoad = TryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
+ // Multiply is commmutative.
+ if (!foldedLoad) {
+ foldedLoad = TryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
+ if (foldedLoad)
+ std::swap(N0, N1);
+ }
+
+ SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, SrcReg,
+ N0, SDValue()).getValue(1);
+ SDValue ResHi, ResLo;
+
+ if (foldedLoad) {
+ SDValue Chain;
+ SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
+ InFlag };
+ if (MOpc == X86::MULX32rm || MOpc == X86::MULX64rm) {
+ SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Other, MVT::Glue);
+ SDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
+ ResHi = SDValue(CNode, 0);
+ ResLo = SDValue(CNode, 1);
+ Chain = SDValue(CNode, 2);
+ InFlag = SDValue(CNode, 3);
+ } else {
+ SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
+ SDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
+ Chain = SDValue(CNode, 0);
+ InFlag = SDValue(CNode, 1);
+ }
+
+ // Update the chain.
+ ReplaceUses(N1.getValue(1), Chain);
+ } else {
+ SDValue Ops[] = { N1, InFlag };
+ if (Opc == X86::MULX32rr || Opc == X86::MULX64rr) {
+ SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Glue);
+ SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
+ ResHi = SDValue(CNode, 0);
+ ResLo = SDValue(CNode, 1);
+ InFlag = SDValue(CNode, 2);
+ } else {
+ SDVTList VTs = CurDAG->getVTList(MVT::Glue);
+ SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
+ InFlag = SDValue(CNode, 0);
+ }
+ }
+
+ // Prevent use of AH in a REX instruction by referencing AX instead.
+ if (HiReg == X86::AH && Subtarget->is64Bit() &&
+ !SDValue(Node, 1).use_empty()) {
+ SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
+ X86::AX, MVT::i16, InFlag);
+ InFlag = Result.getValue(2);
+ // Get the low part if needed. Don't use getCopyFromReg for aliasing
+ // registers.
+ if (!SDValue(Node, 0).use_empty())
+ ReplaceUses(SDValue(Node, 1),
+ CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result));
+
+ // Shift AX down 8 bits.
+ Result = SDValue(CurDAG->getMachineNode(X86::SHR16ri, dl, MVT::i16,
+ Result,
+ CurDAG->getTargetConstant(8, MVT::i8)), 0);
+ // Then truncate it down to i8.
+ ReplaceUses(SDValue(Node, 1),
+ CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result));
+ }
+ // Copy the low half of the result, if it is needed.
+ if (!SDValue(Node, 0).use_empty()) {
+ if (ResLo.getNode() == 0) {
+ assert(LoReg && "Register for low half is not defined!");
+ ResLo = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg, NVT,
+ InFlag);
+ InFlag = ResLo.getValue(2);
+ }
+ ReplaceUses(SDValue(Node, 0), ResLo);
+ DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG); dbgs() << '\n');
+ }
+ // Copy the high half of the result, if it is needed.
+ if (!SDValue(Node, 1).use_empty()) {
+ if (ResHi.getNode() == 0) {
+ assert(HiReg && "Register for high half is not defined!");
+ ResHi = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg, NVT,
+ InFlag);
+ InFlag = ResHi.getValue(2);
+ }
+ ReplaceUses(SDValue(Node, 1), ResHi);
+ DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG); dbgs() << '\n');
+ }
+
+ // Propagate ordering to the last node, for now.
+ CurDAG->AssignOrdering(InFlag.getNode(), CurDAG->GetOrdering(Node));
+
+ return NULL;
+ }
+
+ case ISD::SDIVREM:
+ case ISD::UDIVREM: {
+ SDValue N0 = Node->getOperand(0);
+ SDValue N1 = Node->getOperand(1);
+
+ bool isSigned = Opcode == ISD::SDIVREM;
+ if (!isSigned) {
+ switch (NVT.getSimpleVT().SimpleTy) {
+ default: llvm_unreachable("Unsupported VT!");
+ case MVT::i8: Opc = X86::DIV8r; MOpc = X86::DIV8m; break;
+ case MVT::i16: Opc = X86::DIV16r; MOpc = X86::DIV16m; break;
+ case MVT::i32: Opc = X86::DIV32r; MOpc = X86::DIV32m; break;
+ case MVT::i64: Opc = X86::DIV64r; MOpc = X86::DIV64m; break;
+ }
+ } else {
+ switch (NVT.getSimpleVT().SimpleTy) {
+ default: llvm_unreachable("Unsupported VT!");
+ case MVT::i8: Opc = X86::IDIV8r; MOpc = X86::IDIV8m; break;
+ case MVT::i16: Opc = X86::IDIV16r; MOpc = X86::IDIV16m; break;
+ case MVT::i32: Opc = X86::IDIV32r; MOpc = X86::IDIV32m; break;
+ case MVT::i64: Opc = X86::IDIV64r; MOpc = X86::IDIV64m; break;
+ }
+ }
+
+ unsigned LoReg, HiReg, ClrReg;
+ unsigned ClrOpcode, SExtOpcode;
+ switch (NVT.getSimpleVT().SimpleTy) {
+ default: llvm_unreachable("Unsupported VT!");
+ case MVT::i8:
+ LoReg = X86::AL; ClrReg = HiReg = X86::AH;
+ ClrOpcode = 0;
+ SExtOpcode = X86::CBW;
+ break;
+ case MVT::i16:
+ LoReg = X86::AX; HiReg = X86::DX;
+ ClrOpcode = X86::MOV16r0; ClrReg = X86::DX;
+ SExtOpcode = X86::CWD;
+ break;
+ case MVT::i32:
+ LoReg = X86::EAX; ClrReg = HiReg = X86::EDX;
+ ClrOpcode = X86::MOV32r0;
+ SExtOpcode = X86::CDQ;
+ break;
+ case MVT::i64:
+ LoReg = X86::RAX; ClrReg = HiReg = X86::RDX;
+ ClrOpcode = X86::MOV64r0;
+ SExtOpcode = X86::CQO;
+ break;
+ }
+
+ SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
+ bool foldedLoad = TryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
+ bool signBitIsZero = CurDAG->SignBitIsZero(N0);
+
+ SDValue InFlag;
+ if (NVT == MVT::i8 && (!isSigned || signBitIsZero)) {
+ // Special case for div8, just use a move with zero extension to AX to
+ // clear the upper 8 bits (AH).
+ SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Move, Chain;
+ if (TryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
+ SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
+ Move =
+ SDValue(CurDAG->getMachineNode(X86::MOVZX32rm8, dl, MVT::i32,
+ MVT::Other, Ops), 0);
+ Chain = Move.getValue(1);
+ ReplaceUses(N0.getValue(1), Chain);
+ } else {
+ Move =
+ SDValue(CurDAG->getMachineNode(X86::MOVZX32rr8, dl, MVT::i32, N0),0);
+ Chain = CurDAG->getEntryNode();
+ }
+ Chain = CurDAG->getCopyToReg(Chain, dl, X86::EAX, Move, SDValue());
+ InFlag = Chain.getValue(1);
+ } else {
+ InFlag =
+ CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl,
+ LoReg, N0, SDValue()).getValue(1);
+ if (isSigned && !signBitIsZero) {
+ // Sign extend the low part into the high part.
+ InFlag =
+ SDValue(CurDAG->getMachineNode(SExtOpcode, dl, MVT::Glue, InFlag),0);
+ } else {
+ // Zero out the high part, effectively zero extending the input.
+ SDValue ClrNode =
+ SDValue(CurDAG->getMachineNode(ClrOpcode, dl, NVT), 0);
+ InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, ClrReg,
+ ClrNode, InFlag).getValue(1);
+ }
+ }
+
+ if (foldedLoad) {
+ SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
+ InFlag };
+ SDNode *CNode =
+ CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Glue, Ops);
+ InFlag = SDValue(CNode, 1);
+ // Update the chain.
+ ReplaceUses(N1.getValue(1), SDValue(CNode, 0));
+ } else {
+ InFlag =
+ SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, N1, InFlag), 0);
+ }
+
+ // Prevent use of AH in a REX instruction by referencing AX instead.
+ // Shift it down 8 bits.
+ if (HiReg == X86::AH && Subtarget->is64Bit() &&
+ !SDValue(Node, 1).use_empty()) {
+ SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
+ X86::AX, MVT::i16, InFlag);
+ InFlag = Result.getValue(2);
+
+ // If we also need AL (the quotient), get it by extracting a subreg from
+ // Result. The fast register allocator does not like multiple CopyFromReg
+ // nodes using aliasing registers.
+ if (!SDValue(Node, 0).use_empty())
+ ReplaceUses(SDValue(Node, 0),
+ CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result));
+
+ // Shift AX right by 8 bits instead of using AH.
+ Result = SDValue(CurDAG->getMachineNode(X86::SHR16ri, dl, MVT::i16,
+ Result,
+ CurDAG->getTargetConstant(8, MVT::i8)),
+ 0);
+ ReplaceUses(SDValue(Node, 1),
+ CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result));
+ }
+ // Copy the division (low) result, if it is needed.
+ if (!SDValue(Node, 0).use_empty()) {
+ SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
+ LoReg, NVT, InFlag);
+ InFlag = Result.getValue(2);
+ ReplaceUses(SDValue(Node, 0), Result);
+ DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n');
+ }
+ // Copy the remainder (high) result, if it is needed.
+ if (!SDValue(Node, 1).use_empty()) {
+ SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
+ HiReg, NVT, InFlag);
+ InFlag = Result.getValue(2);
+ ReplaceUses(SDValue(Node, 1), Result);
+ DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n');
+ }
+ return NULL;
+ }
+
+ case X86ISD::CMP:
+ case X86ISD::SUB: {
+ // Sometimes a SUB is used to perform comparison.
+ if (Opcode == X86ISD::SUB && Node->hasAnyUseOfValue(0))
+ // This node is not a CMP.
+ break;
+ SDValue N0 = Node->getOperand(0);
+ SDValue N1 = Node->getOperand(1);
+
+ // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to
+ // use a smaller encoding.
+ if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse() &&
+ HasNoSignedComparisonUses(Node))
+ // Look past the truncate if CMP is the only use of it.
+ N0 = N0.getOperand(0);
+ if ((N0.getNode()->getOpcode() == ISD::AND ||
+ (N0.getResNo() == 0 && N0.getNode()->getOpcode() == X86ISD::AND)) &&
+ N0.getNode()->hasOneUse() &&
+ N0.getValueType() != MVT::i8 &&
+ X86::isZeroNode(N1)) {
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getNode()->getOperand(1));
+ if (!C) break;
+
+ // For example, convert "testl %eax, $8" to "testb %al, $8"
+ if ((C->getZExtValue() & ~UINT64_C(0xff)) == 0 &&
+ (!(C->getZExtValue() & 0x80) ||
+ HasNoSignedComparisonUses(Node))) {
+ SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), MVT::i8);
+ SDValue Reg = N0.getNode()->getOperand(0);
+
+ // On x86-32, only the ABCD registers have 8-bit subregisters.
+ if (!Subtarget->is64Bit()) {
+ const TargetRegisterClass *TRC;
+ switch (N0.getValueType().getSimpleVT().SimpleTy) {
+ case MVT::i32: TRC = &X86::GR32_ABCDRegClass; break;
+ case MVT::i16: TRC = &X86::GR16_ABCDRegClass; break;
+ default: llvm_unreachable("Unsupported TEST operand type!");
+ }
+ SDValue RC = CurDAG->getTargetConstant(TRC->getID(), MVT::i32);
+ Reg = SDValue(CurDAG->getMachineNode(X86::COPY_TO_REGCLASS, dl,
+ Reg.getValueType(), Reg, RC), 0);
+ }
+
+ // Extract the l-register.
+ SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl,
+ MVT::i8, Reg);
+
+ // Emit a testb.
+ SDNode *NewNode = CurDAG->getMachineNode(X86::TEST8ri, dl, MVT::i32,
+ Subreg, Imm);
+ // Replace SUB|CMP with TEST, since SUB has two outputs while TEST has
+ // one, do not call ReplaceAllUsesWith.
+ ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
+ SDValue(NewNode, 0));
+ return NULL;
+ }
+
+ // For example, "testl %eax, $2048" to "testb %ah, $8".
+ if ((C->getZExtValue() & ~UINT64_C(0xff00)) == 0 &&
+ (!(C->getZExtValue() & 0x8000) ||
+ HasNoSignedComparisonUses(Node))) {
+ // Shift the immediate right by 8 bits.
+ SDValue ShiftedImm = CurDAG->getTargetConstant(C->getZExtValue() >> 8,
+ MVT::i8);
+ SDValue Reg = N0.getNode()->getOperand(0);
+
+ // Put the value in an ABCD register.
+ const TargetRegisterClass *TRC;
+ switch (N0.getValueType().getSimpleVT().SimpleTy) {
+ case MVT::i64: TRC = &X86::GR64_ABCDRegClass; break;
+ case MVT::i32: TRC = &X86::GR32_ABCDRegClass; break;
+ case MVT::i16: TRC = &X86::GR16_ABCDRegClass; break;
+ default: llvm_unreachable("Unsupported TEST operand type!");
+ }
+ SDValue RC = CurDAG->getTargetConstant(TRC->getID(), MVT::i32);
+ Reg = SDValue(CurDAG->getMachineNode(X86::COPY_TO_REGCLASS, dl,
+ Reg.getValueType(), Reg, RC), 0);
+
+ // Extract the h-register.
+ SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_8bit_hi, dl,
+ MVT::i8, Reg);
+
+ // Emit a testb. The EXTRACT_SUBREG becomes a COPY that can only
+ // target GR8_NOREX registers, so make sure the register class is
+ // forced.
+ SDNode *NewNode = CurDAG->getMachineNode(X86::TEST8ri_NOREX, dl,
+ MVT::i32, Subreg, ShiftedImm);
+ // Replace SUB|CMP with TEST, since SUB has two outputs while TEST has
+ // one, do not call ReplaceAllUsesWith.
+ ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
+ SDValue(NewNode, 0));
+ return NULL;
+ }
+
+ // For example, "testl %eax, $32776" to "testw %ax, $32776".
+ if ((C->getZExtValue() & ~UINT64_C(0xffff)) == 0 &&
+ N0.getValueType() != MVT::i16 &&
+ (!(C->getZExtValue() & 0x8000) ||
+ HasNoSignedComparisonUses(Node))) {
+ SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), MVT::i16);
+ SDValue Reg = N0.getNode()->getOperand(0);
+
+ // Extract the 16-bit subregister.
+ SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_16bit, dl,
+ MVT::i16, Reg);
+
+ // Emit a testw.
+ SDNode *NewNode = CurDAG->getMachineNode(X86::TEST16ri, dl, MVT::i32,
+ Subreg, Imm);
+ // Replace SUB|CMP with TEST, since SUB has two outputs while TEST has
+ // one, do not call ReplaceAllUsesWith.
+ ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
+ SDValue(NewNode, 0));
+ return NULL;
+ }
+
+ // For example, "testq %rax, $268468232" to "testl %eax, $268468232".
+ if ((C->getZExtValue() & ~UINT64_C(0xffffffff)) == 0 &&
+ N0.getValueType() == MVT::i64 &&
+ (!(C->getZExtValue() & 0x80000000) ||
+ HasNoSignedComparisonUses(Node))) {
+ SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), MVT::i32);
+ SDValue Reg = N0.getNode()->getOperand(0);
+
+ // Extract the 32-bit subregister.
+ SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_32bit, dl,
+ MVT::i32, Reg);
+
+ // Emit a testl.
+ SDNode *NewNode = CurDAG->getMachineNode(X86::TEST32ri, dl, MVT::i32,
+ Subreg, Imm);
+ // Replace SUB|CMP with TEST, since SUB has two outputs while TEST has
+ // one, do not call ReplaceAllUsesWith.
+ ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
+ SDValue(NewNode, 0));
+ return NULL;
+ }
+ }
+ break;
+ }
+ case ISD::STORE: {
+ // Change a chain of {load; incr or dec; store} of the same value into
+ // a simple increment or decrement through memory of that value, if the
+ // uses of the modified value and its address are suitable.
+ // The DEC64m tablegen pattern is currently not able to match the case where
+ // the EFLAGS on the original DEC are used. (This also applies to
+ // {INC,DEC}X{64,32,16,8}.)
+ // We'll need to improve tablegen to allow flags to be transferred from a
+ // node in the pattern to the result node. probably with a new keyword
+ // for example, we have this
+ // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
+ // [(store (add (loadi64 addr:$dst), -1), addr:$dst),
+ // (implicit EFLAGS)]>;
+ // but maybe need something like this
+ // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
+ // [(store (add (loadi64 addr:$dst), -1), addr:$dst),
+ // (transferrable EFLAGS)]>;
+
+ StoreSDNode *StoreNode = cast<StoreSDNode>(Node);
+ SDValue StoredVal = StoreNode->getOperand(1);
+ unsigned Opc = StoredVal->getOpcode();
+
+ LoadSDNode *LoadNode = 0;
+ SDValue InputChain;
+ if (!isLoadIncOrDecStore(StoreNode, Opc, StoredVal, CurDAG,
+ LoadNode, InputChain))
+ break;
+
+ SDValue Base, Scale, Index, Disp, Segment;
+ if (!SelectAddr(LoadNode, LoadNode->getBasePtr(),
+ Base, Scale, Index, Disp, Segment))
+ break;
+
+ MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(2);
+ MemOp[0] = StoreNode->getMemOperand();
+ MemOp[1] = LoadNode->getMemOperand();
+ const SDValue Ops[] = { Base, Scale, Index, Disp, Segment, InputChain };
+ EVT LdVT = LoadNode->getMemoryVT();
+ unsigned newOpc = getFusedLdStOpcode(LdVT, Opc);
+ MachineSDNode *Result = CurDAG->getMachineNode(newOpc,
+ Node->getDebugLoc(),
+ MVT::i32, MVT::Other, Ops);
+ Result->setMemRefs(MemOp, MemOp + 2);
+
+ ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1));
+ ReplaceUses(SDValue(StoredVal.getNode(), 1), SDValue(Result, 0));
+
+ return Result;
+ }
+ }
+
+ SDNode *ResNode = SelectCode(Node);
+
+ DEBUG(dbgs() << "=> ";
+ if (ResNode == NULL || ResNode == Node)
+ Node->dump(CurDAG);
+ else
+ ResNode->dump(CurDAG);
+ dbgs() << '\n');
+
+ return ResNode;
+}
+
+bool X86DAGToDAGISel::
+SelectInlineAsmMemoryOperand(const SDValue &Op, char ConstraintCode,
+ std::vector<SDValue> &OutOps) {
+ SDValue Op0, Op1, Op2, Op3, Op4;
+ switch (ConstraintCode) {
+ case 'o': // offsetable ??
+ case 'v': // not offsetable ??
+ default: return true;
+ case 'm': // memory
+ if (!SelectAddr(0, Op, Op0, Op1, Op2, Op3, Op4))
+ return true;
+ break;
+ }
+
+ OutOps.push_back(Op0);
+ OutOps.push_back(Op1);
+ OutOps.push_back(Op2);
+ OutOps.push_back(Op3);
+ OutOps.push_back(Op4);
+ return false;
+}
+
+/// createX86ISelDag - This pass converts a legalized DAG into a
+/// X86-specific DAG, ready for instruction scheduling.
+///
+FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM,
+ CodeGenOpt::Level OptLevel) {
+ return new X86DAGToDAGISel(TM, OptLevel);
+}
diff --git a/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp
new file mode 100644
index 0000000..f69f5d8
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -0,0 +1,18598 @@
+//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that X86 uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "x86-isel"
+#include "X86ISelLowering.h"
+#include "Utils/X86ShuffleDecode.h"
+#include "X86.h"
+#include "X86InstrBuilder.h"
+#include "X86TargetMachine.h"
+#include "X86TargetObjectFile.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/VariadicFunction.h"
+#include "llvm/CodeGen/IntrinsicLowering.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetOptions.h"
+#include <bitset>
+#include <cctype>
+using namespace llvm;
+
+STATISTIC(NumTailCalls, "Number of tail calls");
+
+// Forward declarations.
+static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
+ SDValue V2);
+
+/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
+/// sets things up to match to an AVX VEXTRACTF128 instruction or a
+/// simple subregister reference. Idx is an index in the 128 bits we
+/// want. It need not be aligned to a 128-bit bounday. That makes
+/// lowering EXTRACT_VECTOR_ELT operations easier.
+static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
+ SelectionDAG &DAG, DebugLoc dl) {
+ EVT VT = Vec.getValueType();
+ assert(VT.is256BitVector() && "Unexpected vector size!");
+ EVT ElVT = VT.getVectorElementType();
+ unsigned Factor = VT.getSizeInBits()/128;
+ EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
+ VT.getVectorNumElements()/Factor);
+
+ // Extract from UNDEF is UNDEF.
+ if (Vec.getOpcode() == ISD::UNDEF)
+ return DAG.getUNDEF(ResultVT);
+
+ // Extract the relevant 128 bits. Generate an EXTRACT_SUBVECTOR
+ // we can match to VEXTRACTF128.
+ unsigned ElemsPerChunk = 128 / ElVT.getSizeInBits();
+
+ // This is the index of the first element of the 128-bit chunk
+ // we want.
+ unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / 128)
+ * ElemsPerChunk);
+
+ // If the input is a buildvector just emit a smaller one.
+ if (Vec.getOpcode() == ISD::BUILD_VECTOR)
+ return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
+ Vec->op_begin()+NormalizedIdxVal, ElemsPerChunk);
+
+ SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
+ SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec,
+ VecIdx);
+
+ return Result;
+}
+
+/// Generate a DAG to put 128-bits into a vector > 128 bits. This
+/// sets things up to match to an AVX VINSERTF128 instruction or a
+/// simple superregister reference. Idx is an index in the 128 bits
+/// we want. It need not be aligned to a 128-bit bounday. That makes
+/// lowering INSERT_VECTOR_ELT operations easier.
+static SDValue Insert128BitVector(SDValue Result, SDValue Vec,
+ unsigned IdxVal, SelectionDAG &DAG,
+ DebugLoc dl) {
+ // Inserting UNDEF is Result
+ if (Vec.getOpcode() == ISD::UNDEF)
+ return Result;
+
+ EVT VT = Vec.getValueType();
+ assert(VT.is128BitVector() && "Unexpected vector size!");
+
+ EVT ElVT = VT.getVectorElementType();
+ EVT ResultVT = Result.getValueType();
+
+ // Insert the relevant 128 bits.
+ unsigned ElemsPerChunk = 128/ElVT.getSizeInBits();
+
+ // This is the index of the first element of the 128-bit chunk
+ // we want.
+ unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/128)
+ * ElemsPerChunk);
+
+ SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec,
+ VecIdx);
+}
+
+/// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128
+/// instructions. This is used because creating CONCAT_VECTOR nodes of
+/// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
+/// large BUILD_VECTORS.
+static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
+ unsigned NumElems, SelectionDAG &DAG,
+ DebugLoc dl) {
+ SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
+ return Insert128BitVector(V, V2, NumElems/2, DAG, dl);
+}
+
+static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
+ const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
+ bool is64Bit = Subtarget->is64Bit();
+
+ if (Subtarget->isTargetEnvMacho()) {
+ if (is64Bit)
+ return new X86_64MachoTargetObjectFile();
+ return new TargetLoweringObjectFileMachO();
+ }
+
+ if (Subtarget->isTargetLinux())
+ return new X86LinuxTargetObjectFile();
+ if (Subtarget->isTargetELF())
+ return new TargetLoweringObjectFileELF();
+ if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho())
+ return new TargetLoweringObjectFileCOFF();
+ llvm_unreachable("unknown subtarget type");
+}
+
+X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
+ : TargetLowering(TM, createTLOF(TM)) {
+ Subtarget = &TM.getSubtarget<X86Subtarget>();
+ X86ScalarSSEf64 = Subtarget->hasSSE2();
+ X86ScalarSSEf32 = Subtarget->hasSSE1();
+ RegInfo = TM.getRegisterInfo();
+ TD = getDataLayout();
+
+ resetOperationActions();
+}
+
+void X86TargetLowering::resetOperationActions() {
+ const TargetMachine &TM = getTargetMachine();
+ static bool FirstTimeThrough = true;
+
+ // If none of the target options have changed, then we don't need to reset the
+ // operation actions.
+ if (!FirstTimeThrough && TO == TM.Options) return;
+
+ if (!FirstTimeThrough) {
+ // Reinitialize the actions.
+ initActions();
+ FirstTimeThrough = false;
+ }
+
+ TO = TM.Options;
+
+ // Set up the TargetLowering object.
+ static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };
+
+ // X86 is weird, it always uses i8 for shift amounts and setcc results.
+ setBooleanContents(ZeroOrOneBooleanContent);
+ // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
+ setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
+
+ // For 64-bit since we have so many registers use the ILP scheduler, for
+ // 32-bit code use the register pressure specific scheduling.
+ // For Atom, always use ILP scheduling.
+ if (Subtarget->isAtom())
+ setSchedulingPreference(Sched::ILP);
+ else if (Subtarget->is64Bit())
+ setSchedulingPreference(Sched::ILP);
+ else
+ setSchedulingPreference(Sched::RegPressure);
+ setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
+
+ // Bypass expensive divides on Atom when compiling with O2
+ if (Subtarget->hasSlowDivide() && TM.getOptLevel() >= CodeGenOpt::Default) {
+ addBypassSlowDiv(32, 8);
+ if (Subtarget->is64Bit())
+ addBypassSlowDiv(64, 16);
+ }
+
+ if (Subtarget->isTargetWindows() && !Subtarget->isTargetCygMing()) {
+ // Setup Windows compiler runtime calls.
+ setLibcallName(RTLIB::SDIV_I64, "_alldiv");
+ setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
+ setLibcallName(RTLIB::SREM_I64, "_allrem");
+ setLibcallName(RTLIB::UREM_I64, "_aullrem");
+ setLibcallName(RTLIB::MUL_I64, "_allmul");
+ setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
+ setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
+ setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
+ setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
+ setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
+
+ // The _ftol2 runtime function has an unusual calling conv, which
+ // is modeled by a special pseudo-instruction.
+ setLibcallName(RTLIB::FPTOUINT_F64_I64, 0);
+ setLibcallName(RTLIB::FPTOUINT_F32_I64, 0);
+ setLibcallName(RTLIB::FPTOUINT_F64_I32, 0);
+ setLibcallName(RTLIB::FPTOUINT_F32_I32, 0);
+ }
+
+ if (Subtarget->isTargetDarwin()) {
+ // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
+ setUseUnderscoreSetJmp(false);
+ setUseUnderscoreLongJmp(false);
+ } else if (Subtarget->isTargetMingw()) {
+ // MS runtime is weird: it exports _setjmp, but longjmp!
+ setUseUnderscoreSetJmp(true);
+ setUseUnderscoreLongJmp(false);
+ } else {
+ setUseUnderscoreSetJmp(true);
+ setUseUnderscoreLongJmp(true);
+ }
+
+ // Set up the register classes.
+ addRegisterClass(MVT::i8, &X86::GR8RegClass);
+ addRegisterClass(MVT::i16, &X86::GR16RegClass);
+ addRegisterClass(MVT::i32, &X86::GR32RegClass);
+ if (Subtarget->is64Bit())
+ addRegisterClass(MVT::i64, &X86::GR64RegClass);
+
+ setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
+
+ // We don't accept any truncstore of integer registers.
+ setTruncStoreAction(MVT::i64, MVT::i32, Expand);
+ setTruncStoreAction(MVT::i64, MVT::i16, Expand);
+ setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
+ setTruncStoreAction(MVT::i32, MVT::i16, Expand);
+ setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
+ setTruncStoreAction(MVT::i16, MVT::i8, Expand);
+
+ // SETOEQ and SETUNE require checking two conditions.
+ setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
+ setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
+ setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
+ setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
+ setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
+ setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
+
+ // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
+ // operation.
+ setOperationAction(ISD::UINT_TO_FP , MVT::i1 , Promote);
+ setOperationAction(ISD::UINT_TO_FP , MVT::i8 , Promote);
+ setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote);
+
+ if (Subtarget->is64Bit()) {
+ setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote);
+ setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
+ } else if (!TM.Options.UseSoftFloat) {
+ // We have an algorithm for SSE2->double, and we turn this into a
+ // 64-bit FILD followed by conditional FADD for other targets.
+ setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
+ // We have an algorithm for SSE2, and we turn this into a 64-bit
+ // FILD for other targets.
+ setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom);
+ }
+
+ // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
+ // this operation.
+ setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote);
+ setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote);
+
+ if (!TM.Options.UseSoftFloat) {
+ // SSE has no i16 to fp conversion, only i32
+ if (X86ScalarSSEf32) {
+ setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
+ // f32 and f64 cases are Legal, f80 case is not
+ setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);
+ } else {
+ setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Custom);
+ setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);
+ }
+ } else {
+ setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
+ setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Promote);
+ }
+
+ // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
+ // are Legal, f80 is custom lowered.
+ setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom);
+ setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom);
+
+ // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
+ // this operation.
+ setOperationAction(ISD::FP_TO_SINT , MVT::i1 , Promote);
+ setOperationAction(ISD::FP_TO_SINT , MVT::i8 , Promote);
+
+ if (X86ScalarSSEf32) {
+ setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote);
+ // f32 and f64 cases are Legal, f80 case is not
+ setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
+ } else {
+ setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom);
+ setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
+ }
+
+ // Handle FP_TO_UINT by promoting the destination to a larger signed
+ // conversion.
+ setOperationAction(ISD::FP_TO_UINT , MVT::i1 , Promote);
+ setOperationAction(ISD::FP_TO_UINT , MVT::i8 , Promote);
+ setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote);
+
+ if (Subtarget->is64Bit()) {
+ setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand);
+ setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote);
+ } else if (!TM.Options.UseSoftFloat) {
+ // Since AVX is a superset of SSE3, only check for SSE here.
+ if (Subtarget->hasSSE1() && !Subtarget->hasSSE3())
+ // Expand FP_TO_UINT into a select.
+ // FIXME: We would like to use a Custom expander here eventually to do
+ // the optimal thing for SSE vs. the default expansion in the legalizer.
+ setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Expand);
+ else
+ // With SSE3 we can use fisttpll to convert to a signed i64; without
+ // SSE, we're stuck with a fistpll.
+ setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom);
+ }
+
+ if (isTargetFTOL()) {
+ // Use the _ftol2 runtime function, which has a pseudo-instruction
+ // to handle its weird calling convention.
+ setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom);
+ }
+
+ // TODO: when we have SSE, these could be more efficient, by using movd/movq.
+ if (!X86ScalarSSEf64) {
+ setOperationAction(ISD::BITCAST , MVT::f32 , Expand);
+ setOperationAction(ISD::BITCAST , MVT::i32 , Expand);
+ if (Subtarget->is64Bit()) {
+ setOperationAction(ISD::BITCAST , MVT::f64 , Expand);
+ // Without SSE, i64->f64 goes through memory.
+ setOperationAction(ISD::BITCAST , MVT::i64 , Expand);
+ }
+ }
+
+ // Scalar integer divide and remainder are lowered to use operations that
+ // produce two results, to match the available instructions. This exposes
+ // the two-result form to trivial CSE, which is able to combine x/y and x%y
+ // into a single instruction.
+ //
+ // Scalar integer multiply-high is also lowered to use two-result
+ // operations, to match the available instructions. However, plain multiply
+ // (low) operations are left as Legal, as there are single-result
+ // instructions for this in x86. Using the two-result multiply instructions
+ // when both high and low results are needed must be arranged by dagcombine.
+ for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
+ MVT VT = IntVTs[i];
+ setOperationAction(ISD::MULHS, VT, Expand);
+ setOperationAction(ISD::MULHU, VT, Expand);
+ setOperationAction(ISD::SDIV, VT, Expand);
+ setOperationAction(ISD::UDIV, VT, Expand);
+ setOperationAction(ISD::SREM, VT, Expand);
+ setOperationAction(ISD::UREM, VT, Expand);
+
+ // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences.
+ setOperationAction(ISD::ADDC, VT, Custom);
+ setOperationAction(ISD::ADDE, VT, Custom);
+ setOperationAction(ISD::SUBC, VT, Custom);
+ setOperationAction(ISD::SUBE, VT, Custom);
+ }
+
+ setOperationAction(ISD::BR_JT , MVT::Other, Expand);
+ setOperationAction(ISD::BRCOND , MVT::Other, Custom);
+ setOperationAction(ISD::BR_CC , MVT::f32, Expand);
+ setOperationAction(ISD::BR_CC , MVT::f64, Expand);
+ setOperationAction(ISD::BR_CC , MVT::f80, Expand);
+ setOperationAction(ISD::BR_CC , MVT::i8, Expand);
+ setOperationAction(ISD::BR_CC , MVT::i16, Expand);
+ setOperationAction(ISD::BR_CC , MVT::i32, Expand);
+ setOperationAction(ISD::BR_CC , MVT::i64, Expand);
+ setOperationAction(ISD::SELECT_CC , MVT::Other, Expand);
+ if (Subtarget->is64Bit())
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
+ setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand);
+ setOperationAction(ISD::FREM , MVT::f32 , Expand);
+ setOperationAction(ISD::FREM , MVT::f64 , Expand);
+ setOperationAction(ISD::FREM , MVT::f80 , Expand);
+ setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom);
+
+ // Promote the i8 variants and force them on up to i32 which has a shorter
+ // encoding.
+ setOperationAction(ISD::CTTZ , MVT::i8 , Promote);
+ AddPromotedToType (ISD::CTTZ , MVT::i8 , MVT::i32);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF , MVT::i8 , Promote);
+ AddPromotedToType (ISD::CTTZ_ZERO_UNDEF , MVT::i8 , MVT::i32);
+ if (Subtarget->hasBMI()) {
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , Expand);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Expand);
+ if (Subtarget->is64Bit())
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
+ } else {
+ setOperationAction(ISD::CTTZ , MVT::i16 , Custom);
+ setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
+ if (Subtarget->is64Bit())
+ setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
+ }
+
+ if (Subtarget->hasLZCNT()) {
+ // When promoting the i8 variants, force them to i32 for a shorter
+ // encoding.
+ setOperationAction(ISD::CTLZ , MVT::i8 , Promote);
+ AddPromotedToType (ISD::CTLZ , MVT::i8 , MVT::i32);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Promote);
+ AddPromotedToType (ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Expand);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Expand);
+ if (Subtarget->is64Bit())
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
+ } else {
+ setOperationAction(ISD::CTLZ , MVT::i8 , Custom);
+ setOperationAction(ISD::CTLZ , MVT::i16 , Custom);
+ setOperationAction(ISD::CTLZ , MVT::i32 , Custom);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Custom);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Custom);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Custom);
+ if (Subtarget->is64Bit()) {
+ setOperationAction(ISD::CTLZ , MVT::i64 , Custom);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
+ }
+ }
+
+ if (Subtarget->hasPOPCNT()) {
+ setOperationAction(ISD::CTPOP , MVT::i8 , Promote);
+ } else {
+ setOperationAction(ISD::CTPOP , MVT::i8 , Expand);
+ setOperationAction(ISD::CTPOP , MVT::i16 , Expand);
+ setOperationAction(ISD::CTPOP , MVT::i32 , Expand);
+ if (Subtarget->is64Bit())
+ setOperationAction(ISD::CTPOP , MVT::i64 , Expand);
+ }
+
+ setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
+ setOperationAction(ISD::BSWAP , MVT::i16 , Expand);
+
+ // These should be promoted to a larger select which is supported.
+ setOperationAction(ISD::SELECT , MVT::i1 , Promote);
+ // X86 wants to expand cmov itself.
+ setOperationAction(ISD::SELECT , MVT::i8 , Custom);
+ setOperationAction(ISD::SELECT , MVT::i16 , Custom);
+ setOperationAction(ISD::SELECT , MVT::i32 , Custom);
+ setOperationAction(ISD::SELECT , MVT::f32 , Custom);
+ setOperationAction(ISD::SELECT , MVT::f64 , Custom);
+ setOperationAction(ISD::SELECT , MVT::f80 , Custom);
+ setOperationAction(ISD::SETCC , MVT::i8 , Custom);
+ setOperationAction(ISD::SETCC , MVT::i16 , Custom);
+ setOperationAction(ISD::SETCC , MVT::i32 , Custom);
+ setOperationAction(ISD::SETCC , MVT::f32 , Custom);
+ setOperationAction(ISD::SETCC , MVT::f64 , Custom);
+ setOperationAction(ISD::SETCC , MVT::f80 , Custom);
+ if (Subtarget->is64Bit()) {
+ setOperationAction(ISD::SELECT , MVT::i64 , Custom);
+ setOperationAction(ISD::SETCC , MVT::i64 , Custom);
+ }
+ setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);
+ // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
+ // SjLj exception handling but a light-weight setjmp/longjmp replacement to
+ // support continuation, user-level threading, and etc.. As a result, no
+ // other SjLj exception interfaces are implemented and please don't build
+ // your own exception handling based on them.
+ // LLVM/Clang supports zero-cost DWARF exception handling.
+ setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
+ setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
+
+ // Darwin ABI issue.
+ setOperationAction(ISD::ConstantPool , MVT::i32 , Custom);
+ setOperationAction(ISD::JumpTable , MVT::i32 , Custom);
+ setOperationAction(ISD::GlobalAddress , MVT::i32 , Custom);
+ setOperationAction(ISD::GlobalTLSAddress, MVT::i32 , Custom);
+ if (Subtarget->is64Bit())
+ setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
+ setOperationAction(ISD::ExternalSymbol , MVT::i32 , Custom);
+ setOperationAction(ISD::BlockAddress , MVT::i32 , Custom);
+ if (Subtarget->is64Bit()) {
+ setOperationAction(ISD::ConstantPool , MVT::i64 , Custom);
+ setOperationAction(ISD::JumpTable , MVT::i64 , Custom);
+ setOperationAction(ISD::GlobalAddress , MVT::i64 , Custom);
+ setOperationAction(ISD::ExternalSymbol, MVT::i64 , Custom);
+ setOperationAction(ISD::BlockAddress , MVT::i64 , Custom);
+ }
+ // 64-bit addm sub, shl, sra, srl (iff 32-bit x86)
+ setOperationAction(ISD::SHL_PARTS , MVT::i32 , Custom);
+ setOperationAction(ISD::SRA_PARTS , MVT::i32 , Custom);
+ setOperationAction(ISD::SRL_PARTS , MVT::i32 , Custom);
+ if (Subtarget->is64Bit()) {
+ setOperationAction(ISD::SHL_PARTS , MVT::i64 , Custom);
+ setOperationAction(ISD::SRA_PARTS , MVT::i64 , Custom);
+ setOperationAction(ISD::SRL_PARTS , MVT::i64 , Custom);
+ }
+
+ if (Subtarget->hasSSE1())
+ setOperationAction(ISD::PREFETCH , MVT::Other, Legal);
+
+ setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom);
+
+ // Expand certain atomics
+ for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
+ MVT VT = IntVTs[i];
+ setOperationAction(ISD::ATOMIC_CMP_SWAP, VT, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
+ setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
+ }
+
+ if (!Subtarget->is64Bit()) {
+ setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom);
+ setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i64, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i64, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i64, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i64, Custom);
+ }
+
+ if (Subtarget->hasCmpxchg16b()) {
+ setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom);
+ }
+
+ // FIXME - use subtarget debug flags
+ if (!Subtarget->isTargetDarwin() &&
+ !Subtarget->isTargetELF() &&
+ !Subtarget->isTargetCygMing()) {
+ setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
+ }
+
+ setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand);
+ setOperationAction(ISD::EHSELECTION, MVT::i64, Expand);
+ setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand);
+ setOperationAction(ISD::EHSELECTION, MVT::i32, Expand);
+ if (Subtarget->is64Bit()) {
+ setExceptionPointerRegister(X86::RAX);
+ setExceptionSelectorRegister(X86::RDX);
+ } else {
+ setExceptionPointerRegister(X86::EAX);
+ setExceptionSelectorRegister(X86::EDX);
+ }
+ setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
+ setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
+
+ setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
+ setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
+
+ setOperationAction(ISD::TRAP, MVT::Other, Legal);
+ setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
+
+ // VASTART needs to be custom lowered to use the VarArgsFrameIndex
+ setOperationAction(ISD::VASTART , MVT::Other, Custom);
+ setOperationAction(ISD::VAEND , MVT::Other, Expand);
+ if (Subtarget->is64Bit()) {
+ setOperationAction(ISD::VAARG , MVT::Other, Custom);
+ setOperationAction(ISD::VACOPY , MVT::Other, Custom);
+ } else {
+ setOperationAction(ISD::VAARG , MVT::Other, Expand);
+ setOperationAction(ISD::VACOPY , MVT::Other, Expand);
+ }
+
+ setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
+ setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
+
+ if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho())
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
+ MVT::i64 : MVT::i32, Custom);
+ else if (TM.Options.EnableSegmentedStacks)
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
+ MVT::i64 : MVT::i32, Custom);
+ else
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
+ MVT::i64 : MVT::i32, Expand);
+
+ if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) {
+ // f32 and f64 use SSE.
+ // Set up the FP register classes.
+ addRegisterClass(MVT::f32, &X86::FR32RegClass);
+ addRegisterClass(MVT::f64, &X86::FR64RegClass);
+
+ // Use ANDPD to simulate FABS.
+ setOperationAction(ISD::FABS , MVT::f64, Custom);
+ setOperationAction(ISD::FABS , MVT::f32, Custom);
+
+ // Use XORP to simulate FNEG.
+ setOperationAction(ISD::FNEG , MVT::f64, Custom);
+ setOperationAction(ISD::FNEG , MVT::f32, Custom);
+
+ // Use ANDPD and ORPD to simulate FCOPYSIGN.
+ setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
+
+ // Lower this to FGETSIGNx86 plus an AND.
+ setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
+ setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
+
+ // We don't support sin/cos/fmod
+ setOperationAction(ISD::FSIN , MVT::f64, Expand);
+ setOperationAction(ISD::FCOS , MVT::f64, Expand);
+ setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
+ setOperationAction(ISD::FSIN , MVT::f32, Expand);
+ setOperationAction(ISD::FCOS , MVT::f32, Expand);
+ setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
+
+ // Expand FP immediates into loads from the stack, except for the special
+ // cases we handle.
+ addLegalFPImmediate(APFloat(+0.0)); // xorpd
+ addLegalFPImmediate(APFloat(+0.0f)); // xorps
+ } else if (!TM.Options.UseSoftFloat && X86ScalarSSEf32) {
+ // Use SSE for f32, x87 for f64.
+ // Set up the FP register classes.
+ addRegisterClass(MVT::f32, &X86::FR32RegClass);
+ addRegisterClass(MVT::f64, &X86::RFP64RegClass);
+
+ // Use ANDPS to simulate FABS.
+ setOperationAction(ISD::FABS , MVT::f32, Custom);
+
+ // Use XORP to simulate FNEG.
+ setOperationAction(ISD::FNEG , MVT::f32, Custom);
+
+ setOperationAction(ISD::UNDEF, MVT::f64, Expand);
+
+ // Use ANDPS and ORPS to simulate FCOPYSIGN.
+ setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
+
+ // We don't support sin/cos/fmod
+ setOperationAction(ISD::FSIN , MVT::f32, Expand);
+ setOperationAction(ISD::FCOS , MVT::f32, Expand);
+ setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
+
+ // Special cases we handle for FP constants.
+ addLegalFPImmediate(APFloat(+0.0f)); // xorps
+ addLegalFPImmediate(APFloat(+0.0)); // FLD0
+ addLegalFPImmediate(APFloat(+1.0)); // FLD1
+ addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
+ addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
+
+ if (!TM.Options.UnsafeFPMath) {
+ setOperationAction(ISD::FSIN , MVT::f64, Expand);
+ setOperationAction(ISD::FCOS , MVT::f64, Expand);
+ setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
+ }
+ } else if (!TM.Options.UseSoftFloat) {
+ // f32 and f64 in x87.
+ // Set up the FP register classes.
+ addRegisterClass(MVT::f64, &X86::RFP64RegClass);
+ addRegisterClass(MVT::f32, &X86::RFP32RegClass);
+
+ setOperationAction(ISD::UNDEF, MVT::f64, Expand);
+ setOperationAction(ISD::UNDEF, MVT::f32, Expand);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
+
+ if (!TM.Options.UnsafeFPMath) {
+ setOperationAction(ISD::FSIN , MVT::f64, Expand);
+ setOperationAction(ISD::FSIN , MVT::f32, Expand);
+ setOperationAction(ISD::FCOS , MVT::f64, Expand);
+ setOperationAction(ISD::FCOS , MVT::f32, Expand);
+ setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
+ setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
+ }
+ addLegalFPImmediate(APFloat(+0.0)); // FLD0
+ addLegalFPImmediate(APFloat(+1.0)); // FLD1
+ addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
+ addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
+ addLegalFPImmediate(APFloat(+0.0f)); // FLD0
+ addLegalFPImmediate(APFloat(+1.0f)); // FLD1
+ addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
+ addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
+ }
+
+ // We don't support FMA.
+ setOperationAction(ISD::FMA, MVT::f64, Expand);
+ setOperationAction(ISD::FMA, MVT::f32, Expand);
+
+ // Long double always uses X87.
+ if (!TM.Options.UseSoftFloat) {
+ addRegisterClass(MVT::f80, &X86::RFP80RegClass);
+ setOperationAction(ISD::UNDEF, MVT::f80, Expand);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
+ {
+ APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended);
+ addLegalFPImmediate(TmpFlt); // FLD0
+ TmpFlt.changeSign();
+ addLegalFPImmediate(TmpFlt); // FLD0/FCHS
+
+ bool ignored;
+ APFloat TmpFlt2(+1.0);
+ TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
+ &ignored);
+ addLegalFPImmediate(TmpFlt2); // FLD1
+ TmpFlt2.changeSign();
+ addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
+ }
+
+ if (!TM.Options.UnsafeFPMath) {
+ setOperationAction(ISD::FSIN , MVT::f80, Expand);
+ setOperationAction(ISD::FCOS , MVT::f80, Expand);
+ setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
+ }
+
+ setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
+ setOperationAction(ISD::FCEIL, MVT::f80, Expand);
+ setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
+ setOperationAction(ISD::FRINT, MVT::f80, Expand);
+ setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
+ setOperationAction(ISD::FMA, MVT::f80, Expand);
+ }
+
+ // Always use a library call for pow.
+ setOperationAction(ISD::FPOW , MVT::f32 , Expand);
+ setOperationAction(ISD::FPOW , MVT::f64 , Expand);
+ setOperationAction(ISD::FPOW , MVT::f80 , Expand);
+
+ setOperationAction(ISD::FLOG, MVT::f80, Expand);
+ setOperationAction(ISD::FLOG2, MVT::f80, Expand);
+ setOperationAction(ISD::FLOG10, MVT::f80, Expand);
+ setOperationAction(ISD::FEXP, MVT::f80, Expand);
+ setOperationAction(ISD::FEXP2, MVT::f80, Expand);
+
+ // First set operation action for all vector types to either promote
+ // (for widening) or expand (for scalarization). Then we will selectively
+ // turn on ones that can be effectively codegen'd.
+ for (int i = MVT::FIRST_VECTOR_VALUETYPE;
+ i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
+ MVT VT = (MVT::SimpleValueType)i;
+ setOperationAction(ISD::ADD , VT, Expand);
+ setOperationAction(ISD::SUB , VT, Expand);
+ setOperationAction(ISD::FADD, VT, Expand);
+ setOperationAction(ISD::FNEG, VT, Expand);
+ setOperationAction(ISD::FSUB, VT, Expand);
+ setOperationAction(ISD::MUL , VT, Expand);
+ setOperationAction(ISD::FMUL, VT, Expand);
+ setOperationAction(ISD::SDIV, VT, Expand);
+ setOperationAction(ISD::UDIV, VT, Expand);
+ setOperationAction(ISD::FDIV, VT, Expand);
+ setOperationAction(ISD::SREM, VT, Expand);
+ setOperationAction(ISD::UREM, VT, Expand);
+ setOperationAction(ISD::LOAD, VT, Expand);
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
+ setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
+ setOperationAction(ISD::FABS, VT, Expand);
+ setOperationAction(ISD::FSIN, VT, Expand);
+ setOperationAction(ISD::FSINCOS, VT, Expand);
+ setOperationAction(ISD::FCOS, VT, Expand);
+ setOperationAction(ISD::FSINCOS, VT, Expand);
+ setOperationAction(ISD::FREM, VT, Expand);
+ setOperationAction(ISD::FMA, VT, Expand);
+ setOperationAction(ISD::FPOWI, VT, Expand);
+ setOperationAction(ISD::FSQRT, VT, Expand);
+ setOperationAction(ISD::FCOPYSIGN, VT, Expand);
+ setOperationAction(ISD::FFLOOR, VT, Expand);
+ setOperationAction(ISD::FCEIL, VT, Expand);
+ setOperationAction(ISD::FTRUNC, VT, Expand);
+ setOperationAction(ISD::FRINT, VT, Expand);
+ setOperationAction(ISD::FNEARBYINT, VT, Expand);
+ setOperationAction(ISD::SMUL_LOHI, VT, Expand);
+ setOperationAction(ISD::UMUL_LOHI, VT, Expand);
+ setOperationAction(ISD::SDIVREM, VT, Expand);
+ setOperationAction(ISD::UDIVREM, VT, Expand);
+ setOperationAction(ISD::FPOW, VT, Expand);
+ setOperationAction(ISD::CTPOP, VT, Expand);
+ setOperationAction(ISD::CTTZ, VT, Expand);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
+ setOperationAction(ISD::CTLZ, VT, Expand);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
+ setOperationAction(ISD::SHL, VT, Expand);
+ setOperationAction(ISD::SRA, VT, Expand);
+ setOperationAction(ISD::SRL, VT, Expand);
+ setOperationAction(ISD::ROTL, VT, Expand);
+ setOperationAction(ISD::ROTR, VT, Expand);
+ setOperationAction(ISD::BSWAP, VT, Expand);
+ setOperationAction(ISD::SETCC, VT, Expand);
+ setOperationAction(ISD::FLOG, VT, Expand);
+ setOperationAction(ISD::FLOG2, VT, Expand);
+ setOperationAction(ISD::FLOG10, VT, Expand);
+ setOperationAction(ISD::FEXP, VT, Expand);
+ setOperationAction(ISD::FEXP2, VT, Expand);
+ setOperationAction(ISD::FP_TO_UINT, VT, Expand);
+ setOperationAction(ISD::FP_TO_SINT, VT, Expand);
+ setOperationAction(ISD::UINT_TO_FP, VT, Expand);
+ setOperationAction(ISD::SINT_TO_FP, VT, Expand);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
+ setOperationAction(ISD::TRUNCATE, VT, Expand);
+ setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
+ setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
+ setOperationAction(ISD::ANY_EXTEND, VT, Expand);
+ setOperationAction(ISD::VSELECT, VT, Expand);
+ for (int InnerVT = MVT::FIRST_VECTOR_VALUETYPE;
+ InnerVT <= MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
+ setTruncStoreAction(VT,
+ (MVT::SimpleValueType)InnerVT, Expand);
+ setLoadExtAction(ISD::SEXTLOAD, VT, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, Expand);
+ setLoadExtAction(ISD::EXTLOAD, VT, Expand);
+ }
+
+ // FIXME: In order to prevent SSE instructions being expanded to MMX ones
+ // with -msoft-float, disable use of MMX as well.
+ if (!TM.Options.UseSoftFloat && Subtarget->hasMMX()) {
+ addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
+ // No operations on x86mmx supported, everything uses intrinsics.
+ }
+
+ // MMX-sized vectors (other than x86mmx) are expected to be expanded
+ // into smaller operations.
+ setOperationAction(ISD::MULHS, MVT::v8i8, Expand);
+ setOperationAction(ISD::MULHS, MVT::v4i16, Expand);
+ setOperationAction(ISD::MULHS, MVT::v2i32, Expand);
+ setOperationAction(ISD::MULHS, MVT::v1i64, Expand);
+ setOperationAction(ISD::AND, MVT::v8i8, Expand);
+ setOperationAction(ISD::AND, MVT::v4i16, Expand);
+ setOperationAction(ISD::AND, MVT::v2i32, Expand);
+ setOperationAction(ISD::AND, MVT::v1i64, Expand);
+ setOperationAction(ISD::OR, MVT::v8i8, Expand);
+ setOperationAction(ISD::OR, MVT::v4i16, Expand);
+ setOperationAction(ISD::OR, MVT::v2i32, Expand);
+ setOperationAction(ISD::OR, MVT::v1i64, Expand);
+ setOperationAction(ISD::XOR, MVT::v8i8, Expand);
+ setOperationAction(ISD::XOR, MVT::v4i16, Expand);
+ setOperationAction(ISD::XOR, MVT::v2i32, Expand);
+ setOperationAction(ISD::XOR, MVT::v1i64, Expand);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i8, Expand);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i16, Expand);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2i32, Expand);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v1i64, Expand);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v1i64, Expand);
+ setOperationAction(ISD::SELECT, MVT::v8i8, Expand);
+ setOperationAction(ISD::SELECT, MVT::v4i16, Expand);
+ setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
+ setOperationAction(ISD::SELECT, MVT::v1i64, Expand);
+ setOperationAction(ISD::BITCAST, MVT::v8i8, Expand);
+ setOperationAction(ISD::BITCAST, MVT::v4i16, Expand);
+ setOperationAction(ISD::BITCAST, MVT::v2i32, Expand);
+ setOperationAction(ISD::BITCAST, MVT::v1i64, Expand);
+
+ if (!TM.Options.UseSoftFloat && Subtarget->hasSSE1()) {
+ addRegisterClass(MVT::v4f32, &X86::VR128RegClass);
+
+ setOperationAction(ISD::FADD, MVT::v4f32, Legal);
+ setOperationAction(ISD::FSUB, MVT::v4f32, Legal);
+ setOperationAction(ISD::FMUL, MVT::v4f32, Legal);
+ setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
+ setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
+ setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
+ setOperationAction(ISD::FABS, MVT::v4f32, Custom);
+ setOperationAction(ISD::LOAD, MVT::v4f32, Legal);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
+ setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
+ }
+
+ if (!TM.Options.UseSoftFloat && Subtarget->hasSSE2()) {
+ addRegisterClass(MVT::v2f64, &X86::VR128RegClass);
+
+ // FIXME: Unfortunately -soft-float and -no-implicit-float means XMM
+ // registers cannot be used even for integer operations.
+ addRegisterClass(MVT::v16i8, &X86::VR128RegClass);
+ addRegisterClass(MVT::v8i16, &X86::VR128RegClass);
+ addRegisterClass(MVT::v4i32, &X86::VR128RegClass);
+ addRegisterClass(MVT::v2i64, &X86::VR128RegClass);
+
+ setOperationAction(ISD::ADD, MVT::v16i8, Legal);
+ setOperationAction(ISD::ADD, MVT::v8i16, Legal);
+ setOperationAction(ISD::ADD, MVT::v4i32, Legal);
+ setOperationAction(ISD::ADD, MVT::v2i64, Legal);
+ setOperationAction(ISD::MUL, MVT::v4i32, Custom);
+ setOperationAction(ISD::MUL, MVT::v2i64, Custom);
+ setOperationAction(ISD::SUB, MVT::v16i8, Legal);
+ setOperationAction(ISD::SUB, MVT::v8i16, Legal);
+ setOperationAction(ISD::SUB, MVT::v4i32, Legal);
+ setOperationAction(ISD::SUB, MVT::v2i64, Legal);
+ setOperationAction(ISD::MUL, MVT::v8i16, Legal);
+ setOperationAction(ISD::FADD, MVT::v2f64, Legal);
+ setOperationAction(ISD::FSUB, MVT::v2f64, Legal);
+ setOperationAction(ISD::FMUL, MVT::v2f64, Legal);
+ setOperationAction(ISD::FDIV, MVT::v2f64, Legal);
+ setOperationAction(ISD::FSQRT, MVT::v2f64, Legal);
+ setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
+ setOperationAction(ISD::FABS, MVT::v2f64, Custom);
+
+ setOperationAction(ISD::SETCC, MVT::v2i64, Custom);
+ setOperationAction(ISD::SETCC, MVT::v16i8, Custom);
+ setOperationAction(ISD::SETCC, MVT::v8i16, Custom);
+ setOperationAction(ISD::SETCC, MVT::v4i32, Custom);
+
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
+
+ // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
+ for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
+ MVT VT = (MVT::SimpleValueType)i;
+ // Do not attempt to custom lower non-power-of-2 vectors
+ if (!isPowerOf2_32(VT.getVectorNumElements()))
+ continue;
+ // Do not attempt to custom lower non-128-bit vectors
+ if (!VT.is128BitVector())
+ continue;
+ setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+ }
+
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
+
+ if (Subtarget->is64Bit()) {
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
+ }
+
+ // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
+ for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
+ MVT VT = (MVT::SimpleValueType)i;
+
+ // Do not attempt to promote non-128-bit vectors
+ if (!VT.is128BitVector())
+ continue;
+
+ setOperationAction(ISD::AND, VT, Promote);
+ AddPromotedToType (ISD::AND, VT, MVT::v2i64);
+ setOperationAction(ISD::OR, VT, Promote);
+ AddPromotedToType (ISD::OR, VT, MVT::v2i64);
+ setOperationAction(ISD::XOR, VT, Promote);
+ AddPromotedToType (ISD::XOR, VT, MVT::v2i64);
+ setOperationAction(ISD::LOAD, VT, Promote);
+ AddPromotedToType (ISD::LOAD, VT, MVT::v2i64);
+ setOperationAction(ISD::SELECT, VT, Promote);
+ AddPromotedToType (ISD::SELECT, VT, MVT::v2i64);
+ }
+
+ setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+
+ // Custom lower v2i64 and v2f64 selects.
+ setOperationAction(ISD::LOAD, MVT::v2f64, Legal);
+ setOperationAction(ISD::LOAD, MVT::v2i64, Legal);
+ setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
+ setOperationAction(ISD::SELECT, MVT::v2i64, Custom);
+
+ setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
+
+ setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
+ // As there is no 64-bit GPR available, we need build a special custom
+ // sequence to convert from v2i32 to v2f32.
+ if (!Subtarget->is64Bit())
+ setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);
+
+ setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
+ setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);
+
+ setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, Legal);
+ }
+
+ if (Subtarget->hasSSE41()) {
+ setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
+ setOperationAction(ISD::FCEIL, MVT::f32, Legal);
+ setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
+ setOperationAction(ISD::FRINT, MVT::f32, Legal);
+ setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
+ setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
+ setOperationAction(ISD::FCEIL, MVT::f64, Legal);
+ setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
+ setOperationAction(ISD::FRINT, MVT::f64, Legal);
+ setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);
+
+ setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal);
+ setOperationAction(ISD::FCEIL, MVT::v4f32, Legal);
+ setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal);
+ setOperationAction(ISD::FRINT, MVT::v4f32, Legal);
+ setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal);
+ setOperationAction(ISD::FFLOOR, MVT::v2f64, Legal);
+ setOperationAction(ISD::FCEIL, MVT::v2f64, Legal);
+ setOperationAction(ISD::FTRUNC, MVT::v2f64, Legal);
+ setOperationAction(ISD::FRINT, MVT::v2f64, Legal);
+ setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Legal);
+
+ // FIXME: Do we need to handle scalar-to-vector here?
+ setOperationAction(ISD::MUL, MVT::v4i32, Legal);
+
+ setOperationAction(ISD::VSELECT, MVT::v2f64, Legal);
+ setOperationAction(ISD::VSELECT, MVT::v2i64, Legal);
+ setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
+ setOperationAction(ISD::VSELECT, MVT::v4i32, Legal);
+ setOperationAction(ISD::VSELECT, MVT::v4f32, Legal);
+
+ // i8 and i16 vectors are custom , because the source register and source
+ // source memory operand types are not the same width. f32 vectors are
+ // custom since the immediate controlling the insert encodes additional
+ // information.
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
+
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
+
+ // FIXME: these should be Legal but thats only for the case where
+ // the index is constant. For now custom expand to deal with that.
+ if (Subtarget->is64Bit()) {
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
+ }
+ }
+
+ if (Subtarget->hasSSE2()) {
+ setOperationAction(ISD::SRL, MVT::v8i16, Custom);
+ setOperationAction(ISD::SRL, MVT::v16i8, Custom);
+
+ setOperationAction(ISD::SHL, MVT::v8i16, Custom);
+ setOperationAction(ISD::SHL, MVT::v16i8, Custom);
+
+ setOperationAction(ISD::SRA, MVT::v8i16, Custom);
+ setOperationAction(ISD::SRA, MVT::v16i8, Custom);
+
+ // In the customized shift lowering, the legal cases in AVX2 will be
+ // recognized.
+ setOperationAction(ISD::SRL, MVT::v2i64, Custom);
+ setOperationAction(ISD::SRL, MVT::v4i32, Custom);
+
+ setOperationAction(ISD::SHL, MVT::v2i64, Custom);
+ setOperationAction(ISD::SHL, MVT::v4i32, Custom);
+
+ setOperationAction(ISD::SRA, MVT::v4i32, Custom);
+
+ setOperationAction(ISD::SDIV, MVT::v8i16, Custom);
+ setOperationAction(ISD::SDIV, MVT::v4i32, Custom);
+ }
+
+ if (!TM.Options.UseSoftFloat && Subtarget->hasFp256()) {
+ addRegisterClass(MVT::v32i8, &X86::VR256RegClass);
+ addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
+ addRegisterClass(MVT::v8i32, &X86::VR256RegClass);
+ addRegisterClass(MVT::v8f32, &X86::VR256RegClass);
+ addRegisterClass(MVT::v4i64, &X86::VR256RegClass);
+ addRegisterClass(MVT::v4f64, &X86::VR256RegClass);
+
+ setOperationAction(ISD::LOAD, MVT::v8f32, Legal);
+ setOperationAction(ISD::LOAD, MVT::v4f64, Legal);
+ setOperationAction(ISD::LOAD, MVT::v4i64, Legal);
+
+ setOperationAction(ISD::FADD, MVT::v8f32, Legal);
+ setOperationAction(ISD::FSUB, MVT::v8f32, Legal);
+ setOperationAction(ISD::FMUL, MVT::v8f32, Legal);
+ setOperationAction(ISD::FDIV, MVT::v8f32, Legal);
+ setOperationAction(ISD::FSQRT, MVT::v8f32, Legal);
+ setOperationAction(ISD::FFLOOR, MVT::v8f32, Legal);
+ setOperationAction(ISD::FCEIL, MVT::v8f32, Legal);
+ setOperationAction(ISD::FTRUNC, MVT::v8f32, Legal);
+ setOperationAction(ISD::FRINT, MVT::v8f32, Legal);
+ setOperationAction(ISD::FNEARBYINT, MVT::v8f32, Legal);
+ setOperationAction(ISD::FNEG, MVT::v8f32, Custom);
+ setOperationAction(ISD::FABS, MVT::v8f32, Custom);
+
+ setOperationAction(ISD::FADD, MVT::v4f64, Legal);
+ setOperationAction(ISD::FSUB, MVT::v4f64, Legal);
+ setOperationAction(ISD::FMUL, MVT::v4f64, Legal);
+ setOperationAction(ISD::FDIV, MVT::v4f64, Legal);
+ setOperationAction(ISD::FSQRT, MVT::v4f64, Legal);
+ setOperationAction(ISD::FFLOOR, MVT::v4f64, Legal);
+ setOperationAction(ISD::FCEIL, MVT::v4f64, Legal);
+ setOperationAction(ISD::FTRUNC, MVT::v4f64, Legal);
+ setOperationAction(ISD::FRINT, MVT::v4f64, Legal);
+ setOperationAction(ISD::FNEARBYINT, MVT::v4f64, Legal);
+ setOperationAction(ISD::FNEG, MVT::v4f64, Custom);
+ setOperationAction(ISD::FABS, MVT::v4f64, Custom);
+
+ setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
+
+ setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom);
+
+ setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Promote);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
+ setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal);
+
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);
+
+ setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, Legal);
+
+ setOperationAction(ISD::SRL, MVT::v16i16, Custom);
+ setOperationAction(ISD::SRL, MVT::v32i8, Custom);
+
+ setOperationAction(ISD::SHL, MVT::v16i16, Custom);
+ setOperationAction(ISD::SHL, MVT::v32i8, Custom);
+
+ setOperationAction(ISD::SRA, MVT::v16i16, Custom);
+ setOperationAction(ISD::SRA, MVT::v32i8, Custom);
+
+ setOperationAction(ISD::SDIV, MVT::v16i16, Custom);
+
+ setOperationAction(ISD::SETCC, MVT::v32i8, Custom);
+ setOperationAction(ISD::SETCC, MVT::v16i16, Custom);
+ setOperationAction(ISD::SETCC, MVT::v8i32, Custom);
+ setOperationAction(ISD::SETCC, MVT::v4i64, Custom);
+
+ setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
+ setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
+ setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
+
+ setOperationAction(ISD::VSELECT, MVT::v4f64, Legal);
+ setOperationAction(ISD::VSELECT, MVT::v4i64, Legal);
+ setOperationAction(ISD::VSELECT, MVT::v8i32, Legal);
+ setOperationAction(ISD::VSELECT, MVT::v8f32, Legal);
+
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v4i64, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Custom);
+ setOperationAction(ISD::ANY_EXTEND, MVT::v4i64, Custom);
+ setOperationAction(ISD::ANY_EXTEND, MVT::v8i32, Custom);
+
+ if (Subtarget->hasFMA() || Subtarget->hasFMA4()) {
+ setOperationAction(ISD::FMA, MVT::v8f32, Legal);
+ setOperationAction(ISD::FMA, MVT::v4f64, Legal);
+ setOperationAction(ISD::FMA, MVT::v4f32, Legal);
+ setOperationAction(ISD::FMA, MVT::v2f64, Legal);
+ setOperationAction(ISD::FMA, MVT::f32, Legal);
+ setOperationAction(ISD::FMA, MVT::f64, Legal);
+ }
+
+ if (Subtarget->hasInt256()) {
+ setOperationAction(ISD::ADD, MVT::v4i64, Legal);
+ setOperationAction(ISD::ADD, MVT::v8i32, Legal);
+ setOperationAction(ISD::ADD, MVT::v16i16, Legal);
+ setOperationAction(ISD::ADD, MVT::v32i8, Legal);
+
+ setOperationAction(ISD::SUB, MVT::v4i64, Legal);
+ setOperationAction(ISD::SUB, MVT::v8i32, Legal);
+ setOperationAction(ISD::SUB, MVT::v16i16, Legal);
+ setOperationAction(ISD::SUB, MVT::v32i8, Legal);
+
+ setOperationAction(ISD::MUL, MVT::v4i64, Custom);
+ setOperationAction(ISD::MUL, MVT::v8i32, Legal);
+ setOperationAction(ISD::MUL, MVT::v16i16, Legal);
+ // Don't lower v32i8 because there is no 128-bit byte mul
+
+ setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
+
+ setOperationAction(ISD::SDIV, MVT::v8i32, Custom);
+ } else {
+ setOperationAction(ISD::ADD, MVT::v4i64, Custom);
+ setOperationAction(ISD::ADD, MVT::v8i32, Custom);
+ setOperationAction(ISD::ADD, MVT::v16i16, Custom);
+ setOperationAction(ISD::ADD, MVT::v32i8, Custom);
+
+ setOperationAction(ISD::SUB, MVT::v4i64, Custom);
+ setOperationAction(ISD::SUB, MVT::v8i32, Custom);
+ setOperationAction(ISD::SUB, MVT::v16i16, Custom);
+ setOperationAction(ISD::SUB, MVT::v32i8, Custom);
+
+ setOperationAction(ISD::MUL, MVT::v4i64, Custom);
+ setOperationAction(ISD::MUL, MVT::v8i32, Custom);
+ setOperationAction(ISD::MUL, MVT::v16i16, Custom);
+ // Don't lower v32i8 because there is no 128-bit byte mul
+ }
+
+ // In the customized shift lowering, the legal cases in AVX2 will be
+ // recognized.
+ setOperationAction(ISD::SRL, MVT::v4i64, Custom);
+ setOperationAction(ISD::SRL, MVT::v8i32, Custom);
+
+ setOperationAction(ISD::SHL, MVT::v4i64, Custom);
+ setOperationAction(ISD::SHL, MVT::v8i32, Custom);
+
+ setOperationAction(ISD::SRA, MVT::v8i32, Custom);
+
+ // Custom lower several nodes for 256-bit types.
+ for (int i = MVT::FIRST_VECTOR_VALUETYPE;
+ i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
+ MVT VT = (MVT::SimpleValueType)i;
+
+ // Extract subvector is special because the value type
+ // (result) is 128-bit but the source is 256-bit wide.
+ if (VT.is128BitVector())
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
+
+ // Do not attempt to custom lower other non-256-bit vectors
+ if (!VT.is256BitVector())
+ continue;
+
+ setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
+ }
+
+ // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
+ for (int i = MVT::v32i8; i != MVT::v4i64; ++i) {
+ MVT VT = (MVT::SimpleValueType)i;
+
+ // Do not attempt to promote non-256-bit vectors
+ if (!VT.is256BitVector())
+ continue;
+
+ setOperationAction(ISD::AND, VT, Promote);
+ AddPromotedToType (ISD::AND, VT, MVT::v4i64);
+ setOperationAction(ISD::OR, VT, Promote);
+ AddPromotedToType (ISD::OR, VT, MVT::v4i64);
+ setOperationAction(ISD::XOR, VT, Promote);
+ AddPromotedToType (ISD::XOR, VT, MVT::v4i64);
+ setOperationAction(ISD::LOAD, VT, Promote);
+ AddPromotedToType (ISD::LOAD, VT, MVT::v4i64);
+ setOperationAction(ISD::SELECT, VT, Promote);
+ AddPromotedToType (ISD::SELECT, VT, MVT::v4i64);
+ }
+ }
+
+ // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion
+ // of this type with custom code.
+ for (int VT = MVT::FIRST_VECTOR_VALUETYPE;
+ VT != MVT::LAST_VECTOR_VALUETYPE; VT++) {
+ setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,
+ Custom);
+ }
+
+ // We want to custom lower some of our intrinsics.
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
+
+ // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
+ // handle type legalization for these operations here.
+ //
+ // FIXME: We really should do custom legalization for addition and
+ // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
+ // than generic legalization for 64-bit multiplication-with-overflow, though.
+ for (unsigned i = 0, e = 3+Subtarget->is64Bit(); i != e; ++i) {
+ // Add/Sub/Mul with overflow operations are custom lowered.
+ MVT VT = IntVTs[i];
+ setOperationAction(ISD::SADDO, VT, Custom);
+ setOperationAction(ISD::UADDO, VT, Custom);
+ setOperationAction(ISD::SSUBO, VT, Custom);
+ setOperationAction(ISD::USUBO, VT, Custom);
+ setOperationAction(ISD::SMULO, VT, Custom);
+ setOperationAction(ISD::UMULO, VT, Custom);
+ }
+
+ // There are no 8-bit 3-address imul/mul instructions
+ setOperationAction(ISD::SMULO, MVT::i8, Expand);
+ setOperationAction(ISD::UMULO, MVT::i8, Expand);
+
+ if (!Subtarget->is64Bit()) {
+ // These libcalls are not available in 32-bit.
+ setLibcallName(RTLIB::SHL_I128, 0);
+ setLibcallName(RTLIB::SRL_I128, 0);
+ setLibcallName(RTLIB::SRA_I128, 0);
+ }
+
+ // Combine sin / cos into one node or libcall if possible.
+ if (Subtarget->hasSinCos()) {
+ setLibcallName(RTLIB::SINCOS_F32, "sincosf");
+ setLibcallName(RTLIB::SINCOS_F64, "sincos");
+ if (Subtarget->isTargetDarwin()) {
+ // For MacOSX, we don't want to the normal expansion of a libcall to
+ // sincos. We want to issue a libcall to __sincos_stret to avoid memory
+ // traffic.
+ setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
+ setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
+ }
+ }
+
+ // We have target-specific dag combine patterns for the following nodes:
+ setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
+ setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
+ setTargetDAGCombine(ISD::VSELECT);
+ setTargetDAGCombine(ISD::SELECT);
+ setTargetDAGCombine(ISD::SHL);
+ setTargetDAGCombine(ISD::SRA);
+ setTargetDAGCombine(ISD::SRL);
+ setTargetDAGCombine(ISD::OR);
+ setTargetDAGCombine(ISD::AND);
+ setTargetDAGCombine(ISD::ADD);
+ setTargetDAGCombine(ISD::FADD);
+ setTargetDAGCombine(ISD::FSUB);
+ setTargetDAGCombine(ISD::FMA);
+ setTargetDAGCombine(ISD::SUB);
+ setTargetDAGCombine(ISD::LOAD);
+ setTargetDAGCombine(ISD::STORE);
+ setTargetDAGCombine(ISD::ZERO_EXTEND);
+ setTargetDAGCombine(ISD::ANY_EXTEND);
+ setTargetDAGCombine(ISD::SIGN_EXTEND);
+ setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
+ setTargetDAGCombine(ISD::TRUNCATE);
+ setTargetDAGCombine(ISD::SINT_TO_FP);
+ setTargetDAGCombine(ISD::SETCC);
+ if (Subtarget->is64Bit())
+ setTargetDAGCombine(ISD::MUL);
+ setTargetDAGCombine(ISD::XOR);
+
+ computeRegisterProperties();
+
+ // On Darwin, -Os means optimize for size without hurting performance,
+ // do not reduce the limit.
+ MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
+ MaxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8;
+ MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
+ MaxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
+ MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
+ MaxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
+ setPrefLoopAlignment(4); // 2^4 bytes.
+
+ // Predictable cmov don't hurt on atom because it's in-order.
+ PredictableSelectIsExpensive = !Subtarget->isAtom();
+
+ setPrefFunctionAlignment(4); // 2^4 bytes.
+}
+
+EVT X86TargetLowering::getSetCCResultType(EVT VT) const {
+ if (!VT.isVector()) return MVT::i8;
+ return VT.changeVectorElementTypeToInteger();
+}
+
+/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
+/// the desired ByVal argument alignment.
+static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
+ if (MaxAlign == 16)
+ return;
+ if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
+ if (VTy->getBitWidth() == 128)
+ MaxAlign = 16;
+ } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
+ unsigned EltAlign = 0;
+ getMaxByValAlign(ATy->getElementType(), EltAlign);
+ if (EltAlign > MaxAlign)
+ MaxAlign = EltAlign;
+ } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
+ for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+ unsigned EltAlign = 0;
+ getMaxByValAlign(STy->getElementType(i), EltAlign);
+ if (EltAlign > MaxAlign)
+ MaxAlign = EltAlign;
+ if (MaxAlign == 16)
+ break;
+ }
+ }
+}
+
+/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
+/// function arguments in the caller parameter area. For X86, aggregates
+/// that contain SSE vectors are placed at 16-byte boundaries while the rest
+/// are at 4-byte boundaries.
+unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const {
+ if (Subtarget->is64Bit()) {
+ // Max of 8 and alignment of type.
+ unsigned TyAlign = TD->getABITypeAlignment(Ty);
+ if (TyAlign > 8)
+ return TyAlign;
+ return 8;
+ }
+
+ unsigned Align = 4;
+ if (Subtarget->hasSSE1())
+ getMaxByValAlign(Ty, Align);
+ return Align;
+}
+
+/// getOptimalMemOpType - Returns the target specific optimal type for load
+/// and store operations as a result of memset, memcpy, and memmove
+/// lowering. If DstAlign is zero that means it's safe to destination
+/// alignment can satisfy any constraint. Similarly if SrcAlign is zero it
+/// means there isn't a need to check it against alignment requirement,
+/// probably because the source does not need to be loaded. If 'IsMemset' is
+/// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
+/// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
+/// source is constant so it does not need to be loaded.
+/// It returns EVT::Other if the type should be determined using generic
+/// target-independent logic.
+EVT
+X86TargetLowering::getOptimalMemOpType(uint64_t Size,
+ unsigned DstAlign, unsigned SrcAlign,
+ bool IsMemset, bool ZeroMemset,
+ bool MemcpyStrSrc,
+ MachineFunction &MF) const {
+ const Function *F = MF.getFunction();
+ if ((!IsMemset || ZeroMemset) &&
+ !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
+ Attribute::NoImplicitFloat)) {
+ if (Size >= 16 &&
+ (Subtarget->isUnalignedMemAccessFast() ||
+ ((DstAlign == 0 || DstAlign >= 16) &&
+ (SrcAlign == 0 || SrcAlign >= 16)))) {
+ if (Size >= 32) {
+ if (Subtarget->hasInt256())
+ return MVT::v8i32;
+ if (Subtarget->hasFp256())
+ return MVT::v8f32;
+ }
+ if (Subtarget->hasSSE2())
+ return MVT::v4i32;
+ if (Subtarget->hasSSE1())
+ return MVT::v4f32;
+ } else if (!MemcpyStrSrc && Size >= 8 &&
+ !Subtarget->is64Bit() &&
+ Subtarget->hasSSE2()) {
+ // Do not use f64 to lower memcpy if source is string constant. It's
+ // better to use i32 to avoid the loads.
+ return MVT::f64;
+ }
+ }
+ if (Subtarget->is64Bit() && Size >= 8)
+ return MVT::i64;
+ return MVT::i32;
+}
+
+bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
+ if (VT == MVT::f32)
+ return X86ScalarSSEf32;
+ else if (VT == MVT::f64)
+ return X86ScalarSSEf64;
+ return true;
+}
+
+bool
+X86TargetLowering::allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const {
+ if (Fast)
+ *Fast = Subtarget->isUnalignedMemAccessFast();
+ return true;
+}
+
+/// getJumpTableEncoding - Return the entry encoding for a jump table in the
+/// current function. The returned value is a member of the
+/// MachineJumpTableInfo::JTEntryKind enum.
+unsigned X86TargetLowering::getJumpTableEncoding() const {
+ // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
+ // symbol.
+ if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
+ Subtarget->isPICStyleGOT())
+ return MachineJumpTableInfo::EK_Custom32;
+
+ // Otherwise, use the normal jump table encoding heuristics.
+ return TargetLowering::getJumpTableEncoding();
+}
+
+const MCExpr *
+X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
+ const MachineBasicBlock *MBB,
+ unsigned uid,MCContext &Ctx) const{
+ assert(getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
+ Subtarget->isPICStyleGOT());
+ // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
+ // entries.
+ return MCSymbolRefExpr::Create(MBB->getSymbol(),
+ MCSymbolRefExpr::VK_GOTOFF, Ctx);
+}
+
+/// getPICJumpTableRelocaBase - Returns relocation base for the given PIC
+/// jumptable.
+SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
+ SelectionDAG &DAG) const {
+ if (!Subtarget->is64Bit())
+ // This doesn't have DebugLoc associated with it, but is not really the
+ // same as a Register.
+ return DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), getPointerTy());
+ return Table;
+}
+
+/// getPICJumpTableRelocBaseExpr - This returns the relocation base for the
+/// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an
+/// MCExpr.
+const MCExpr *X86TargetLowering::
+getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
+ MCContext &Ctx) const {
+ // X86-64 uses RIP relative addressing based on the jump table label.
+ if (Subtarget->isPICStyleRIPRel())
+ return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
+
+ // Otherwise, the reference is relative to the PIC base.
+ return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx);
+}
+
+// FIXME: Why this routine is here? Move to RegInfo!
+std::pair<const TargetRegisterClass*, uint8_t>
+X86TargetLowering::findRepresentativeClass(MVT VT) const{
+ const TargetRegisterClass *RRC = 0;
+ uint8_t Cost = 1;
+ switch (VT.SimpleTy) {
+ default:
+ return TargetLowering::findRepresentativeClass(VT);
+ case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
+ RRC = Subtarget->is64Bit() ?
+ (const TargetRegisterClass*)&X86::GR64RegClass :
+ (const TargetRegisterClass*)&X86::GR32RegClass;
+ break;
+ case MVT::x86mmx:
+ RRC = &X86::VR64RegClass;
+ break;
+ case MVT::f32: case MVT::f64:
+ case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
+ case MVT::v4f32: case MVT::v2f64:
+ case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32:
+ case MVT::v4f64:
+ RRC = &X86::VR128RegClass;
+ break;
+ }
+ return std::make_pair(RRC, Cost);
+}
+
+bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
+ unsigned &Offset) const {
+ if (!Subtarget->isTargetLinux())
+ return false;
+
+ if (Subtarget->is64Bit()) {
+ // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs:
+ Offset = 0x28;
+ if (getTargetMachine().getCodeModel() == CodeModel::Kernel)
+ AddressSpace = 256;
+ else
+ AddressSpace = 257;
+ } else {
+ // %gs:0x14 on i386
+ Offset = 0x14;
+ AddressSpace = 256;
+ }
+ return true;
+}
+
+//===----------------------------------------------------------------------===//
+// Return Value Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+#include "X86GenCallingConv.inc"
+
+bool
+X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv,
+ MachineFunction &MF, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ LLVMContext &Context) const {
+ SmallVector<CCValAssign, 16> RVLocs;
+ CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
+ RVLocs, Context);
+ return CCInfo.CheckReturn(Outs, RetCC_X86);
+}
+
+SDValue
+X86TargetLowering::LowerReturn(SDValue Chain,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ DebugLoc dl, SelectionDAG &DAG) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+
+ SmallVector<CCValAssign, 16> RVLocs;
+ CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
+ RVLocs, *DAG.getContext());
+ CCInfo.AnalyzeReturn(Outs, RetCC_X86);
+
+ SDValue Flag;
+ SmallVector<SDValue, 6> RetOps;
+ RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
+ // Operand #1 = Bytes To Pop
+ RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(),
+ MVT::i16));
+
+ // Copy the result values into the output registers.
+ for (unsigned i = 0; i != RVLocs.size(); ++i) {
+ CCValAssign &VA = RVLocs[i];
+ assert(VA.isRegLoc() && "Can only return in registers!");
+ SDValue ValToCopy = OutVals[i];
+ EVT ValVT = ValToCopy.getValueType();
+
+ // Promote values to the appropriate types
+ if (VA.getLocInfo() == CCValAssign::SExt)
+ ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
+ else if (VA.getLocInfo() == CCValAssign::ZExt)
+ ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
+ else if (VA.getLocInfo() == CCValAssign::AExt)
+ ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
+ else if (VA.getLocInfo() == CCValAssign::BCvt)
+ ValToCopy = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), ValToCopy);
+
+ // If this is x86-64, and we disabled SSE, we can't return FP values,
+ // or SSE or MMX vectors.
+ if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
+ VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
+ (Subtarget->is64Bit() && !Subtarget->hasSSE1())) {
+ report_fatal_error("SSE register return with SSE disabled");
+ }
+ // Likewise we can't return F64 values with SSE1 only. gcc does so, but
+ // llvm-gcc has never done it right and no one has noticed, so this
+ // should be OK for now.
+ if (ValVT == MVT::f64 &&
+ (Subtarget->is64Bit() && !Subtarget->hasSSE2()))
+ report_fatal_error("SSE2 register return with SSE2 disabled");
+
+ // Returns in ST0/ST1 are handled specially: these are pushed as operands to
+ // the RET instruction and handled by the FP Stackifier.
+ if (VA.getLocReg() == X86::ST0 ||
+ VA.getLocReg() == X86::ST1) {
+ // If this is a copy from an xmm register to ST(0), use an FPExtend to
+ // change the value to the FP stack register class.
+ if (isScalarFPTypeInSSEReg(VA.getValVT()))
+ ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
+ RetOps.push_back(ValToCopy);
+ // Don't emit a copytoreg.
+ continue;
+ }
+
+ // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
+ // which is returned in RAX / RDX.
+ if (Subtarget->is64Bit()) {
+ if (ValVT == MVT::x86mmx) {
+ if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
+ ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ValToCopy);
+ ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
+ ValToCopy);
+ // If we don't have SSE2 available, convert to v4f32 so the generated
+ // register is legal.
+ if (!Subtarget->hasSSE2())
+ ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,ValToCopy);
+ }
+ }
+ }
+
+ Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
+ Flag = Chain.getValue(1);
+ RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+ }
+
+ // The x86-64 ABIs require that for returning structs by value we copy
+ // the sret argument into %rax/%eax (depending on ABI) for the return.
+ // Win32 requires us to put the sret argument to %eax as well.
+ // We saved the argument into a virtual register in the entry block,
+ // so now we copy the value out and into %rax/%eax.
+ if (DAG.getMachineFunction().getFunction()->hasStructRetAttr() &&
+ (Subtarget->is64Bit() || Subtarget->isTargetWindows())) {
+ MachineFunction &MF = DAG.getMachineFunction();
+ X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+ unsigned Reg = FuncInfo->getSRetReturnReg();
+ assert(Reg &&
+ "SRetReturnReg should have been set in LowerFormalArguments().");
+ SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy());
+
+ unsigned RetValReg
+ = (Subtarget->is64Bit() && !Subtarget->isTarget64BitILP32()) ?
+ X86::RAX : X86::EAX;
+ Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
+ Flag = Chain.getValue(1);
+
+ // RAX/EAX now acts like a return value.
+ RetOps.push_back(DAG.getRegister(RetValReg, getPointerTy()));
+ }
+
+ RetOps[0] = Chain; // Update chain.
+
+ // Add the flag if we have it.
+ if (Flag.getNode())
+ RetOps.push_back(Flag);
+
+ return DAG.getNode(X86ISD::RET_FLAG, dl,
+ MVT::Other, &RetOps[0], RetOps.size());
+}
+
+bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
+ if (N->getNumValues() != 1)
+ return false;
+ if (!N->hasNUsesOfValue(1, 0))
+ return false;
+
+ SDValue TCChain = Chain;
+ SDNode *Copy = *N->use_begin();
+ if (Copy->getOpcode() == ISD::CopyToReg) {
+ // If the copy has a glue operand, we conservatively assume it isn't safe to
+ // perform a tail call.
+ if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
+ return false;
+ TCChain = Copy->getOperand(0);
+ } else if (Copy->getOpcode() != ISD::FP_EXTEND)
+ return false;
+
+ bool HasRet = false;
+ for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
+ UI != UE; ++UI) {
+ if (UI->getOpcode() != X86ISD::RET_FLAG)
+ return false;
+ HasRet = true;
+ }
+
+ if (!HasRet)
+ return false;
+
+ Chain = TCChain;
+ return true;
+}
+
+MVT
+X86TargetLowering::getTypeForExtArgOrReturn(MVT VT,
+ ISD::NodeType ExtendKind) const {
+ MVT ReturnMVT;
+ // TODO: Is this also valid on 32-bit?
+ if (Subtarget->is64Bit() && VT == MVT::i1 && ExtendKind == ISD::ZERO_EXTEND)
+ ReturnMVT = MVT::i8;
+ else
+ ReturnMVT = MVT::i32;
+
+ MVT MinVT = getRegisterType(ReturnMVT);
+ return VT.bitsLT(MinVT) ? MinVT : VT;
+}
+
+/// LowerCallResult - Lower the result values of a call into the
+/// appropriate copies out of appropriate physical registers.
+///
+SDValue
+X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const {
+
+ // Assign locations to each value returned by this call.
+ SmallVector<CCValAssign, 16> RVLocs;
+ bool Is64Bit = Subtarget->is64Bit();
+ CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+ getTargetMachine(), RVLocs, *DAG.getContext());
+ CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
+
+ // Copy all of the result registers out of their specified physreg.
+ for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
+ CCValAssign &VA = RVLocs[i];
+ EVT CopyVT = VA.getValVT();
+
+ // If this is x86-64, and we disabled SSE, we can't return FP values
+ if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
+ ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
+ report_fatal_error("SSE register return with SSE disabled");
+ }
+
+ SDValue Val;
+
+ // If this is a call to a function that returns an fp value on the floating
+ // point stack, we must guarantee the value is popped from the stack, so
+ // a CopyFromReg is not good enough - the copy instruction may be eliminated
+ // if the return value is not used. We use the FpPOP_RETVAL instruction
+ // instead.
+ if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) {
+ // If we prefer to use the value in xmm registers, copy it out as f80 and
+ // use a truncate to move it from fp stack reg to xmm reg.
+ if (isScalarFPTypeInSSEReg(VA.getValVT())) CopyVT = MVT::f80;
+ SDValue Ops[] = { Chain, InFlag };
+ Chain = SDValue(DAG.getMachineNode(X86::FpPOP_RETVAL, dl, CopyVT,
+ MVT::Other, MVT::Glue, Ops), 1);
+ Val = Chain.getValue(0);
+
+ // Round the f80 to the right size, which also moves it to the appropriate
+ // xmm register.
+ if (CopyVT != VA.getValVT())
+ Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
+ // This truncation won't change the value.
+ DAG.getIntPtrConstant(1));
+ } else {
+ Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
+ CopyVT, InFlag).getValue(1);
+ Val = Chain.getValue(0);
+ }
+ InFlag = Chain.getValue(2);
+ InVals.push_back(Val);
+ }
+
+ return Chain;
+}
+
+//===----------------------------------------------------------------------===//
+// C & StdCall & Fast Calling Convention implementation
+//===----------------------------------------------------------------------===//
+// StdCall calling convention seems to be standard for many Windows' API
+// routines and around. It differs from C calling convention just a little:
+// callee should clean up the stack, not caller. Symbols should be also
+// decorated in some fancy way :) It doesn't support any vector arguments.
+// For info on fast calling convention see Fast Calling Convention (tail call)
+// implementation LowerX86_32FastCCCallTo.
+
+/// CallIsStructReturn - Determines whether a call uses struct return
+/// semantics.
+enum StructReturnType {
+ NotStructReturn,
+ RegStructReturn,
+ StackStructReturn
+};
+static StructReturnType
+callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
+ if (Outs.empty())
+ return NotStructReturn;
+
+ const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
+ if (!Flags.isSRet())
+ return NotStructReturn;
+ if (Flags.isInReg())
+ return RegStructReturn;
+ return StackStructReturn;
+}
+
+/// ArgsAreStructReturn - Determines whether a function uses struct
+/// return semantics.
+static StructReturnType
+argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
+ if (Ins.empty())
+ return NotStructReturn;
+
+ const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
+ if (!Flags.isSRet())
+ return NotStructReturn;
+ if (Flags.isInReg())
+ return RegStructReturn;
+ return StackStructReturn;
+}
+
+/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified
+/// by "Src" to address "Dst" with size and alignment information specified by
+/// the specific parameter attribute. The copy will be passed as a byval
+/// function parameter.
+static SDValue
+CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
+ ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
+ DebugLoc dl) {
+ SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
+
+ return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
+ /*isVolatile*/false, /*AlwaysInline=*/true,
+ MachinePointerInfo(), MachinePointerInfo());
+}
+
+/// IsTailCallConvention - Return true if the calling convention is one that
+/// supports tail call optimization.
+static bool IsTailCallConvention(CallingConv::ID CC) {
+ return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
+ CC == CallingConv::HiPE);
+}
+
+bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
+ if (!CI->isTailCall() || getTargetMachine().Options.DisableTailCalls)
+ return false;
+
+ CallSite CS(CI);
+ CallingConv::ID CalleeCC = CS.getCallingConv();
+ if (!IsTailCallConvention(CalleeCC) && CalleeCC != CallingConv::C)
+ return false;
+
+ return true;
+}
+
+/// FuncIsMadeTailCallSafe - Return true if the function is being made into
+/// a tailcall target by changing its ABI.
+static bool FuncIsMadeTailCallSafe(CallingConv::ID CC,
+ bool GuaranteedTailCallOpt) {
+ return GuaranteedTailCallOpt && IsTailCallConvention(CC);
+}
+
+SDValue
+X86TargetLowering::LowerMemArgument(SDValue Chain,
+ CallingConv::ID CallConv,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ const CCValAssign &VA,
+ MachineFrameInfo *MFI,
+ unsigned i) const {
+ // Create the nodes corresponding to a load from this parameter slot.
+ ISD::ArgFlagsTy Flags = Ins[i].Flags;
+ bool AlwaysUseMutable = FuncIsMadeTailCallSafe(CallConv,
+ getTargetMachine().Options.GuaranteedTailCallOpt);
+ bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
+ EVT ValVT;
+
+ // If value is passed by pointer we have address passed instead of the value
+ // itself.
+ if (VA.getLocInfo() == CCValAssign::Indirect)
+ ValVT = VA.getLocVT();
+ else
+ ValVT = VA.getValVT();
+
+ // FIXME: For now, all byval parameter objects are marked mutable. This can be
+ // changed with more analysis.
+ // In case of tail call optimization mark all arguments mutable. Since they
+ // could be overwritten by lowering of arguments in case of a tail call.
+ if (Flags.isByVal()) {
+ unsigned Bytes = Flags.getByValSize();
+ if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
+ int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
+ return DAG.getFrameIndex(FI, getPointerTy());
+ } else {
+ int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
+ VA.getLocMemOffset(), isImmutable);
+ SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
+ return DAG.getLoad(ValVT, dl, Chain, FIN,
+ MachinePointerInfo::getFixedStack(FI),
+ false, false, false, 0);
+ }
+}
+
+SDValue
+X86TargetLowering::LowerFormalArguments(SDValue Chain,
+ CallingConv::ID CallConv,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl,
+ SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals)
+ const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+
+ const Function* Fn = MF.getFunction();
+ if (Fn->hasExternalLinkage() &&
+ Subtarget->isTargetCygMing() &&
+ Fn->getName() == "main")
+ FuncInfo->setForceFramePointer(true);
+
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ bool Is64Bit = Subtarget->is64Bit();
+ bool IsWindows = Subtarget->isTargetWindows();
+ bool IsWin64 = Subtarget->isTargetWin64();
+
+ assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
+ "Var args not supported with calling convention fastcc, ghc or hipe");
+
+ // Assign locations to all of the incoming arguments.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
+ ArgLocs, *DAG.getContext());
+
+ // Allocate shadow area for Win64
+ if (IsWin64) {
+ CCInfo.AllocateStack(32, 8);
+ }
+
+ CCInfo.AnalyzeFormalArguments(Ins, CC_X86);
+
+ unsigned LastVal = ~0U;
+ SDValue ArgValue;
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+ // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
+ // places.
+ assert(VA.getValNo() != LastVal &&
+ "Don't support value assigned to multiple locs yet");
+ (void)LastVal;
+ LastVal = VA.getValNo();
+
+ if (VA.isRegLoc()) {
+ EVT RegVT = VA.getLocVT();
+ const TargetRegisterClass *RC;
+ if (RegVT == MVT::i32)
+ RC = &X86::GR32RegClass;
+ else if (Is64Bit && RegVT == MVT::i64)
+ RC = &X86::GR64RegClass;
+ else if (RegVT == MVT::f32)
+ RC = &X86::FR32RegClass;
+ else if (RegVT == MVT::f64)
+ RC = &X86::FR64RegClass;
+ else if (RegVT.is256BitVector())
+ RC = &X86::VR256RegClass;
+ else if (RegVT.is128BitVector())
+ RC = &X86::VR128RegClass;
+ else if (RegVT == MVT::x86mmx)
+ RC = &X86::VR64RegClass;
+ else
+ llvm_unreachable("Unknown argument type!");
+
+ unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
+ ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
+
+ // If this is an 8 or 16-bit value, it is really passed promoted to 32
+ // bits. Insert an assert[sz]ext to capture this, then truncate to the
+ // right size.
+ if (VA.getLocInfo() == CCValAssign::SExt)
+ ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
+ DAG.getValueType(VA.getValVT()));
+ else if (VA.getLocInfo() == CCValAssign::ZExt)
+ ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
+ DAG.getValueType(VA.getValVT()));
+ else if (VA.getLocInfo() == CCValAssign::BCvt)
+ ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
+
+ if (VA.isExtInLoc()) {
+ // Handle MMX values passed in XMM regs.
+ if (RegVT.isVector())
+ ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
+ else
+ ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
+ }
+ } else {
+ assert(VA.isMemLoc());
+ ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i);
+ }
+
+ // If value is passed via pointer - do a load.
+ if (VA.getLocInfo() == CCValAssign::Indirect)
+ ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue,
+ MachinePointerInfo(), false, false, false, 0);
+
+ InVals.push_back(ArgValue);
+ }
+
+ // The x86-64 ABIs require that for returning structs by value we copy
+ // the sret argument into %rax/%eax (depending on ABI) for the return.
+ // Win32 requires us to put the sret argument to %eax as well.
+ // Save the argument into a virtual register so that we can access it
+ // from the return points.
+ if (MF.getFunction()->hasStructRetAttr() &&
+ (Subtarget->is64Bit() || Subtarget->isTargetWindows())) {
+ X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+ unsigned Reg = FuncInfo->getSRetReturnReg();
+ if (!Reg) {
+ MVT PtrTy = getPointerTy();
+ Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
+ FuncInfo->setSRetReturnReg(Reg);
+ }
+ SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]);
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
+ }
+
+ unsigned StackSize = CCInfo.getNextStackOffset();
+ // Align stack specially for tail calls.
+ if (FuncIsMadeTailCallSafe(CallConv,
+ MF.getTarget().Options.GuaranteedTailCallOpt))
+ StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
+
+ // If the function takes variable number of arguments, make a frame index for
+ // the start of the first vararg value... for expansion of llvm.va_start.
+ if (isVarArg) {
+ if (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
+ CallConv != CallingConv::X86_ThisCall)) {
+ FuncInfo->setVarArgsFrameIndex(MFI->CreateFixedObject(1, StackSize,true));
+ }
+ if (Is64Bit) {
+ unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0;
+
+ // FIXME: We should really autogenerate these arrays
+ static const uint16_t GPR64ArgRegsWin64[] = {
+ X86::RCX, X86::RDX, X86::R8, X86::R9
+ };
+ static const uint16_t GPR64ArgRegs64Bit[] = {
+ X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
+ };
+ static const uint16_t XMMArgRegs64Bit[] = {
+ X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
+ X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
+ };
+ const uint16_t *GPR64ArgRegs;
+ unsigned NumXMMRegs = 0;
+
+ if (IsWin64) {
+ // The XMM registers which might contain var arg parameters are shadowed
+ // in their paired GPR. So we only need to save the GPR to their home
+ // slots.
+ TotalNumIntRegs = 4;
+ GPR64ArgRegs = GPR64ArgRegsWin64;
+ } else {
+ TotalNumIntRegs = 6; TotalNumXMMRegs = 8;
+ GPR64ArgRegs = GPR64ArgRegs64Bit;
+
+ NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs64Bit,
+ TotalNumXMMRegs);
+ }
+ unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs,
+ TotalNumIntRegs);
+
+ bool NoImplicitFloatOps = Fn->getAttributes().
+ hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
+ assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
+ "SSE register cannot be used when SSE is disabled!");
+ assert(!(NumXMMRegs && MF.getTarget().Options.UseSoftFloat &&
+ NoImplicitFloatOps) &&
+ "SSE register cannot be used when SSE is disabled!");
+ if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps ||
+ !Subtarget->hasSSE1())
+ // Kernel mode asks for SSE to be disabled, so don't push them
+ // on the stack.
+ TotalNumXMMRegs = 0;
+
+ if (IsWin64) {
+ const TargetFrameLowering &TFI = *getTargetMachine().getFrameLowering();
+ // Get to the caller-allocated home save location. Add 8 to account
+ // for the return address.
+ int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
+ FuncInfo->setRegSaveFrameIndex(
+ MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
+ // Fixup to set vararg frame on shadow area (4 x i64).
+ if (NumIntRegs < 4)
+ FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
+ } else {
+ // For X86-64, if there are vararg parameters that are passed via
+ // registers, then we must store them to their spots on the stack so
+ // they may be loaded by deferencing the result of va_next.
+ FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
+ FuncInfo->setVarArgsFPOffset(TotalNumIntRegs * 8 + NumXMMRegs * 16);
+ FuncInfo->setRegSaveFrameIndex(
+ MFI->CreateStackObject(TotalNumIntRegs * 8 + TotalNumXMMRegs * 16, 16,
+ false));
+ }
+
+ // Store the integer parameter registers.
+ SmallVector<SDValue, 8> MemOps;
+ SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
+ getPointerTy());
+ unsigned Offset = FuncInfo->getVarArgsGPOffset();
+ for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) {
+ SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
+ DAG.getIntPtrConstant(Offset));
+ unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs],
+ &X86::GR64RegClass);
+ SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
+ SDValue Store =
+ DAG.getStore(Val.getValue(1), dl, Val, FIN,
+ MachinePointerInfo::getFixedStack(
+ FuncInfo->getRegSaveFrameIndex(), Offset),
+ false, false, 0);
+ MemOps.push_back(Store);
+ Offset += 8;
+ }
+
+ if (TotalNumXMMRegs != 0 && NumXMMRegs != TotalNumXMMRegs) {
+ // Now store the XMM (fp + vector) parameter registers.
+ SmallVector<SDValue, 11> SaveXMMOps;
+ SaveXMMOps.push_back(Chain);
+
+ unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
+ SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8);
+ SaveXMMOps.push_back(ALVal);
+
+ SaveXMMOps.push_back(DAG.getIntPtrConstant(
+ FuncInfo->getRegSaveFrameIndex()));
+ SaveXMMOps.push_back(DAG.getIntPtrConstant(
+ FuncInfo->getVarArgsFPOffset()));
+
+ for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) {
+ unsigned VReg = MF.addLiveIn(XMMArgRegs64Bit[NumXMMRegs],
+ &X86::VR128RegClass);
+ SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32);
+ SaveXMMOps.push_back(Val);
+ }
+ MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
+ MVT::Other,
+ &SaveXMMOps[0], SaveXMMOps.size()));
+ }
+
+ if (!MemOps.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ &MemOps[0], MemOps.size());
+ }
+ }
+
+ // Some CCs need callee pop.
+ if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
+ MF.getTarget().Options.GuaranteedTailCallOpt)) {
+ FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
+ } else {
+ FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
+ // If this is an sret function, the return should pop the hidden pointer.
+ if (!Is64Bit && !IsTailCallConvention(CallConv) && !IsWindows &&
+ argsAreStructReturn(Ins) == StackStructReturn)
+ FuncInfo->setBytesToPopOnReturn(4);
+ }
+
+ if (!Is64Bit) {
+ // RegSaveFrameIndex is X86-64 only.
+ FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
+ if (CallConv == CallingConv::X86_FastCall ||
+ CallConv == CallingConv::X86_ThisCall)
+ // fastcc functions can't have varargs.
+ FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
+ }
+
+ FuncInfo->setArgumentStackSize(StackSize);
+
+ return Chain;
+}
+
+SDValue
+X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
+ SDValue StackPtr, SDValue Arg,
+ DebugLoc dl, SelectionDAG &DAG,
+ const CCValAssign &VA,
+ ISD::ArgFlagsTy Flags) const {
+ unsigned LocMemOffset = VA.getLocMemOffset();
+ SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
+ PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
+ if (Flags.isByVal())
+ return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
+
+ return DAG.getStore(Chain, dl, Arg, PtrOff,
+ MachinePointerInfo::getStack(LocMemOffset),
+ false, false, 0);
+}
+
+/// EmitTailCallLoadRetAddr - Emit a load of return address if tail call
+/// optimization is performed and it is required.
+SDValue
+X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
+ SDValue &OutRetAddr, SDValue Chain,
+ bool IsTailCall, bool Is64Bit,
+ int FPDiff, DebugLoc dl) const {
+ // Adjust the Return address stack slot.
+ EVT VT = getPointerTy();
+ OutRetAddr = getReturnAddressFrameIndex(DAG);
+
+ // Load the "old" Return address.
+ OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(),
+ false, false, false, 0);
+ return SDValue(OutRetAddr.getNode(), 1);
+}
+
+/// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call
+/// optimization is performed and it is required (FPDiff!=0).
+static SDValue
+EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF,
+ SDValue Chain, SDValue RetAddrFrIdx, EVT PtrVT,
+ unsigned SlotSize, int FPDiff, DebugLoc dl) {
+ // Store the return address to the appropriate stack slot.
+ if (!FPDiff) return Chain;
+ // Calculate the new stack slot for the return address.
+ int NewReturnAddrFI =
+ MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize, false);
+ SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
+ Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
+ MachinePointerInfo::getFixedStack(NewReturnAddrFI),
+ false, false, 0);
+ return Chain;
+}
+
+SDValue
+X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
+ SmallVectorImpl<SDValue> &InVals) const {
+ SelectionDAG &DAG = CLI.DAG;
+ DebugLoc &dl = CLI.DL;
+ SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
+ SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
+ SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
+ SDValue Chain = CLI.Chain;
+ SDValue Callee = CLI.Callee;
+ CallingConv::ID CallConv = CLI.CallConv;
+ bool &isTailCall = CLI.IsTailCall;
+ bool isVarArg = CLI.IsVarArg;
+
+ MachineFunction &MF = DAG.getMachineFunction();
+ bool Is64Bit = Subtarget->is64Bit();
+ bool IsWin64 = Subtarget->isTargetWin64();
+ bool IsWindows = Subtarget->isTargetWindows();
+ StructReturnType SR = callIsStructReturn(Outs);
+ bool IsSibcall = false;
+
+ if (MF.getTarget().Options.DisableTailCalls)
+ isTailCall = false;
+
+ if (isTailCall) {
+ // Check if it's really possible to do a tail call.
+ isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
+ isVarArg, SR != NotStructReturn,
+ MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
+ Outs, OutVals, Ins, DAG);
+
+ // Sibcalls are automatically detected tailcalls which do not require
+ // ABI changes.
+ if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
+ IsSibcall = true;
+
+ if (isTailCall)
+ ++NumTailCalls;
+ }
+
+ assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
+ "Var args not supported with calling convention fastcc, ghc or hipe");
+
+ // Analyze operands of the call, assigning locations to each operand.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
+ ArgLocs, *DAG.getContext());
+
+ // Allocate shadow area for Win64
+ if (IsWin64) {
+ CCInfo.AllocateStack(32, 8);
+ }
+
+ CCInfo.AnalyzeCallOperands(Outs, CC_X86);
+
+ // Get a count of how many bytes are to be pushed on the stack.
+ unsigned NumBytes = CCInfo.getNextStackOffset();
+ if (IsSibcall)
+ // This is a sibcall. The memory operands are available in caller's
+ // own caller's stack.
+ NumBytes = 0;
+ else if (getTargetMachine().Options.GuaranteedTailCallOpt &&
+ IsTailCallConvention(CallConv))
+ NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
+
+ int FPDiff = 0;
+ if (isTailCall && !IsSibcall) {
+ // Lower arguments at fp - stackoffset + fpdiff.
+ X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
+ unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
+
+ FPDiff = NumBytesCallerPushed - NumBytes;
+
+ // Set the delta of movement of the returnaddr stackslot.
+ // But only set if delta is greater than previous delta.
+ if (FPDiff < X86Info->getTCReturnAddrDelta())
+ X86Info->setTCReturnAddrDelta(FPDiff);
+ }
+
+ if (!IsSibcall)
+ Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));
+
+ SDValue RetAddrFrIdx;
+ // Load return address for tail calls.
+ if (isTailCall && FPDiff)
+ Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
+ Is64Bit, FPDiff, dl);
+
+ SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+ SmallVector<SDValue, 8> MemOpChains;
+ SDValue StackPtr;
+
+ // Walk the register/memloc assignments, inserting copies/loads. In the case
+ // of tail call optimization arguments are handle later.
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+ EVT RegVT = VA.getLocVT();
+ SDValue Arg = OutVals[i];
+ ISD::ArgFlagsTy Flags = Outs[i].Flags;
+ bool isByVal = Flags.isByVal();
+
+ // Promote the value if needed.
+ switch (VA.getLocInfo()) {
+ default: llvm_unreachable("Unknown loc info!");
+ case CCValAssign::Full: break;
+ case CCValAssign::SExt:
+ Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
+ break;
+ case CCValAssign::ZExt:
+ Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
+ break;
+ case CCValAssign::AExt:
+ if (RegVT.is128BitVector()) {
+ // Special case: passing MMX values in XMM registers.
+ Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
+ Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
+ Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
+ } else
+ Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
+ break;
+ case CCValAssign::BCvt:
+ Arg = DAG.getNode(ISD::BITCAST, dl, RegVT, Arg);
+ break;
+ case CCValAssign::Indirect: {
+ // Store the argument.
+ SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
+ int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
+ Chain = DAG.getStore(Chain, dl, Arg, SpillSlot,
+ MachinePointerInfo::getFixedStack(FI),
+ false, false, 0);
+ Arg = SpillSlot;
+ break;
+ }
+ }
+
+ if (VA.isRegLoc()) {
+ RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+ if (isVarArg && IsWin64) {
+ // Win64 ABI requires argument XMM reg to be copied to the corresponding
+ // shadow reg if callee is a varargs function.
+ unsigned ShadowReg = 0;
+ switch (VA.getLocReg()) {
+ case X86::XMM0: ShadowReg = X86::RCX; break;
+ case X86::XMM1: ShadowReg = X86::RDX; break;
+ case X86::XMM2: ShadowReg = X86::R8; break;
+ case X86::XMM3: ShadowReg = X86::R9; break;
+ }
+ if (ShadowReg)
+ RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
+ }
+ } else if (!IsSibcall && (!isTailCall || isByVal)) {
+ assert(VA.isMemLoc());
+ if (StackPtr.getNode() == 0)
+ StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
+ getPointerTy());
+ MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
+ dl, DAG, VA, Flags));
+ }
+ }
+
+ if (!MemOpChains.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ &MemOpChains[0], MemOpChains.size());
+
+ if (Subtarget->isPICStyleGOT()) {
+ // ELF / PIC requires GOT in the EBX register before function calls via PLT
+ // GOT pointer.
+ if (!isTailCall) {
+ RegsToPass.push_back(std::make_pair(unsigned(X86::EBX),
+ DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), getPointerTy())));
+ } else {
+ // If we are tail calling and generating PIC/GOT style code load the
+ // address of the callee into ECX. The value in ecx is used as target of
+ // the tail jump. This is done to circumvent the ebx/callee-saved problem
+ // for tail calls on PIC/GOT architectures. Normally we would just put the
+ // address of GOT into ebx and then call target@PLT. But for tail calls
+ // ebx would be restored (since ebx is callee saved) before jumping to the
+ // target@PLT.
+
+ // Note: The actual moving to ECX is done further down.
+ GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
+ if (G && !G->getGlobal()->hasHiddenVisibility() &&
+ !G->getGlobal()->hasProtectedVisibility())
+ Callee = LowerGlobalAddress(Callee, DAG);
+ else if (isa<ExternalSymbolSDNode>(Callee))
+ Callee = LowerExternalSymbol(Callee, DAG);
+ }
+ }
+
+ if (Is64Bit && isVarArg && !IsWin64) {
+ // From AMD64 ABI document:
+ // For calls that may call functions that use varargs or stdargs
+ // (prototype-less calls or calls to functions containing ellipsis (...) in
+ // the declaration) %al is used as hidden argument to specify the number
+ // of SSE registers used. The contents of %al do not need to match exactly
+ // the number of registers, but must be an ubound on the number of SSE
+ // registers used and is in the range 0 - 8 inclusive.
+
+ // Count the number of XMM registers allocated.
+ static const uint16_t XMMArgRegs[] = {
+ X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
+ X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
+ };
+ unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
+ assert((Subtarget->hasSSE1() || !NumXMMRegs)
+ && "SSE registers cannot be used when SSE is disabled");
+
+ RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
+ DAG.getConstant(NumXMMRegs, MVT::i8)));
+ }
+
+ // For tail calls lower the arguments to the 'real' stack slot.
+ if (isTailCall) {
+ // Force all the incoming stack arguments to be loaded from the stack
+ // before any new outgoing arguments are stored to the stack, because the
+ // outgoing stack slots may alias the incoming argument stack slots, and
+ // the alias isn't otherwise explicit. This is slightly more conservative
+ // than necessary, because it means that each store effectively depends
+ // on every argument instead of just those arguments it would clobber.
+ SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
+
+ SmallVector<SDValue, 8> MemOpChains2;
+ SDValue FIN;
+ int FI = 0;
+ if (getTargetMachine().Options.GuaranteedTailCallOpt) {
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+ if (VA.isRegLoc())
+ continue;
+ assert(VA.isMemLoc());
+ SDValue Arg = OutVals[i];
+ ISD::ArgFlagsTy Flags = Outs[i].Flags;
+ // Create frame index.
+ int32_t Offset = VA.getLocMemOffset()+FPDiff;
+ uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
+ FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
+ FIN = DAG.getFrameIndex(FI, getPointerTy());
+
+ if (Flags.isByVal()) {
+ // Copy relative to framepointer.
+ SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
+ if (StackPtr.getNode() == 0)
+ StackPtr = DAG.getCopyFromReg(Chain, dl,
+ RegInfo->getStackRegister(),
+ getPointerTy());
+ Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
+
+ MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
+ ArgChain,
+ Flags, DAG, dl));
+ } else {
+ // Store relative to framepointer.
+ MemOpChains2.push_back(
+ DAG.getStore(ArgChain, dl, Arg, FIN,
+ MachinePointerInfo::getFixedStack(FI),
+ false, false, 0));
+ }
+ }
+ }
+
+ if (!MemOpChains2.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ &MemOpChains2[0], MemOpChains2.size());
+
+ // Store the return address to the appropriate stack slot.
+ Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
+ getPointerTy(), RegInfo->getSlotSize(),
+ FPDiff, dl);
+ }
+
+ // Build a sequence of copy-to-reg nodes chained together with token chain
+ // and flag operands which copy the outgoing args into registers.
+ SDValue InFlag;
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+ Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+ RegsToPass[i].second, InFlag);
+ InFlag = Chain.getValue(1);
+ }
+
+ if (getTargetMachine().getCodeModel() == CodeModel::Large) {
+ assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
+ // In the 64-bit large code model, we have to make all calls
+ // through a register, since the call instruction's 32-bit
+ // pc-relative offset may not be large enough to hold the whole
+ // address.
+ } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+ // If the callee is a GlobalAddress node (quite common, every direct call
+ // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
+ // it.
+
+ // We should use extra load for direct calls to dllimported functions in
+ // non-JIT mode.
+ const GlobalValue *GV = G->getGlobal();
+ if (!GV->hasDLLImportLinkage()) {
+ unsigned char OpFlags = 0;
+ bool ExtraLoad = false;
+ unsigned WrapperKind = ISD::DELETED_NODE;
+
+ // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
+ // external symbols most go through the PLT in PIC mode. If the symbol
+ // has hidden or protected visibility, or if it is static or local, then
+ // we don't need to use the PLT - we can directly call it.
+ if (Subtarget->isTargetELF() &&
+ getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
+ GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
+ OpFlags = X86II::MO_PLT;
+ } else if (Subtarget->isPICStyleStubAny() &&
+ (GV->isDeclaration() || GV->isWeakForLinker()) &&
+ (!Subtarget->getTargetTriple().isMacOSX() ||
+ Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
+ // PC-relative references to external symbols should go through $stub,
+ // unless we're building with the leopard linker or later, which
+ // automatically synthesizes these stubs.
+ OpFlags = X86II::MO_DARWIN_STUB;
+ } else if (Subtarget->isPICStyleRIPRel() &&
+ isa<Function>(GV) &&
+ cast<Function>(GV)->getAttributes().
+ hasAttribute(AttributeSet::FunctionIndex,
+ Attribute::NonLazyBind)) {
+ // If the function is marked as non-lazy, generate an indirect call
+ // which loads from the GOT directly. This avoids runtime overhead
+ // at the cost of eager binding (and one extra byte of encoding).
+ OpFlags = X86II::MO_GOTPCREL;
+ WrapperKind = X86ISD::WrapperRIP;
+ ExtraLoad = true;
+ }
+
+ Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(),
+ G->getOffset(), OpFlags);
+
+ // Add a wrapper if needed.
+ if (WrapperKind != ISD::DELETED_NODE)
+ Callee = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Callee);
+ // Add extra indirection if needed.
+ if (ExtraLoad)
+ Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee,
+ MachinePointerInfo::getGOT(),
+ false, false, false, 0);
+ }
+ } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
+ unsigned char OpFlags = 0;
+
+ // On ELF targets, in either X86-64 or X86-32 mode, direct calls to
+ // external symbols should go through the PLT.
+ if (Subtarget->isTargetELF() &&
+ getTargetMachine().getRelocationModel() == Reloc::PIC_) {
+ OpFlags = X86II::MO_PLT;
+ } else if (Subtarget->isPICStyleStubAny() &&
+ (!Subtarget->getTargetTriple().isMacOSX() ||
+ Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
+ // PC-relative references to external symbols should go through $stub,
+ // unless we're building with the leopard linker or later, which
+ // automatically synthesizes these stubs.
+ OpFlags = X86II::MO_DARWIN_STUB;
+ }
+
+ Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
+ OpFlags);
+ }
+
+ // Returns a chain & a flag for retval copy to use.
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+ SmallVector<SDValue, 8> Ops;
+
+ if (!IsSibcall && isTailCall) {
+ Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
+ DAG.getIntPtrConstant(0, true), InFlag);
+ InFlag = Chain.getValue(1);
+ }
+
+ Ops.push_back(Chain);
+ Ops.push_back(Callee);
+
+ if (isTailCall)
+ Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));
+
+ // Add argument registers to the end of the list so that they are known live
+ // into the call.
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
+ Ops.push_back(DAG.getRegister(RegsToPass[i].first,
+ RegsToPass[i].second.getValueType()));
+
+ // Add a register mask operand representing the call-preserved registers.
+ const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+ const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
+ assert(Mask && "Missing call preserved mask for calling convention");
+ Ops.push_back(DAG.getRegisterMask(Mask));
+
+ if (InFlag.getNode())
+ Ops.push_back(InFlag);
+
+ if (isTailCall) {
+ // We used to do:
+ //// If this is the first return lowered for this function, add the regs
+ //// to the liveout set for the function.
+ // This isn't right, although it's probably harmless on x86; liveouts
+ // should be computed from returns not tail calls. Consider a void
+ // function making a tail call to a function returning int.
+ return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, &Ops[0], Ops.size());
+ }
+
+ Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size());
+ InFlag = Chain.getValue(1);
+
+ // Create the CALLSEQ_END node.
+ unsigned NumBytesForCalleeToPush;
+ if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
+ getTargetMachine().Options.GuaranteedTailCallOpt))
+ NumBytesForCalleeToPush = NumBytes; // Callee pops everything
+ else if (!Is64Bit && !IsTailCallConvention(CallConv) && !IsWindows &&
+ SR == StackStructReturn)
+ // If this is a call to a struct-return function, the callee
+ // pops the hidden struct pointer, so we have to push it back.
+ // This is common for Darwin/X86, Linux & Mingw32 targets.
+ // For MSVC Win32 targets, the caller pops the hidden struct pointer.
+ NumBytesForCalleeToPush = 4;
+ else
+ NumBytesForCalleeToPush = 0; // Callee pops nothing.
+
+ // Returns a flag for retval copy to use.
+ if (!IsSibcall) {
+ Chain = DAG.getCALLSEQ_END(Chain,
+ DAG.getIntPtrConstant(NumBytes, true),
+ DAG.getIntPtrConstant(NumBytesForCalleeToPush,
+ true),
+ InFlag);
+ InFlag = Chain.getValue(1);
+ }
+
+ // Handle result values, copying them out of physregs into vregs that we
+ // return.
+ return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
+ Ins, dl, DAG, InVals);
+}
+
+//===----------------------------------------------------------------------===//
+// Fast Calling Convention (tail call) implementation
+//===----------------------------------------------------------------------===//
+
+// Like std call, callee cleans arguments, convention except that ECX is
+// reserved for storing the tail called function address. Only 2 registers are
+// free for argument passing (inreg). Tail call optimization is performed
+// provided:
+// * tailcallopt is enabled
+// * caller/callee are fastcc
+// On X86_64 architecture with GOT-style position independent code only local
+// (within module) calls are supported at the moment.
+// To keep the stack aligned according to platform abi the function
+// GetAlignedArgumentStackSize ensures that argument delta is always multiples
+// of stack alignment. (Dynamic linkers need this - darwin's dyld for example)
+// If a tail called function callee has more arguments than the caller the
+// caller needs to make sure that there is room to move the RETADDR to. This is
+// achieved by reserving an area the size of the argument delta right after the
+// original REtADDR, but before the saved framepointer or the spilled registers
+// e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
+// stack layout:
+// arg1
+// arg2
+// RETADDR
+// [ new RETADDR
+// move area ]
+// (possible EBP)
+// ESI
+// EDI
+// local1 ..
+
+/// GetAlignedArgumentStackSize - Make the stack size align e.g 16n + 12 aligned
+/// for a 16 byte align requirement.
+unsigned
+X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
+ SelectionDAG& DAG) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ const TargetMachine &TM = MF.getTarget();
+ const TargetFrameLowering &TFI = *TM.getFrameLowering();
+ unsigned StackAlignment = TFI.getStackAlignment();
+ uint64_t AlignMask = StackAlignment - 1;
+ int64_t Offset = StackSize;
+ unsigned SlotSize = RegInfo->getSlotSize();
+ if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
+ // Number smaller than 12 so just add the difference.
+ Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
+ } else {
+ // Mask out lower bits, add stackalignment once plus the 12 bytes.
+ Offset = ((~AlignMask) & Offset) + StackAlignment +
+ (StackAlignment-SlotSize);
+ }
+ return Offset;
+}
+
+/// MatchingStackOffset - Return true if the given stack call argument is
+/// already available in the same position (relatively) of the caller's
+/// incoming argument stack.
+static
+bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
+ MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
+ const X86InstrInfo *TII) {
+ unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
+ int FI = INT_MAX;
+ if (Arg.getOpcode() == ISD::CopyFromReg) {
+ unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(VR))
+ return false;
+ MachineInstr *Def = MRI->getVRegDef(VR);
+ if (!Def)
+ return false;
+ if (!Flags.isByVal()) {
+ if (!TII->isLoadFromStackSlot(Def, FI))
+ return false;
+ } else {
+ unsigned Opcode = Def->getOpcode();
+ if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) &&
+ Def->getOperand(1).isFI()) {
+ FI = Def->getOperand(1).getIndex();
+ Bytes = Flags.getByValSize();
+ } else
+ return false;
+ }
+ } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
+ if (Flags.isByVal())
+ // ByVal argument is passed in as a pointer but it's now being
+ // dereferenced. e.g.
+ // define @foo(%struct.X* %A) {
+ // tail call @bar(%struct.X* byval %A)
+ // }
+ return false;
+ SDValue Ptr = Ld->getBasePtr();
+ FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
+ if (!FINode)
+ return false;
+ FI = FINode->getIndex();
+ } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
+ FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
+ FI = FINode->getIndex();
+ Bytes = Flags.getByValSize();
+ } else
+ return false;
+
+ assert(FI != INT_MAX);
+ if (!MFI->isFixedObjectIndex(FI))
+ return false;
+ return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI);
+}
+
+/// IsEligibleForTailCallOptimization - Check whether the call is eligible
+/// for tail call optimization. Targets which want to do tail call
+/// optimization should implement this function.
+bool
+X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
+ CallingConv::ID CalleeCC,
+ bool isVarArg,
+ bool isCalleeStructRet,
+ bool isCallerStructRet,
+ Type *RetTy,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ SelectionDAG &DAG) const {
+ if (!IsTailCallConvention(CalleeCC) &&
+ CalleeCC != CallingConv::C)
+ return false;
+
+ // If -tailcallopt is specified, make fastcc functions tail-callable.
+ const MachineFunction &MF = DAG.getMachineFunction();
+ const Function *CallerF = DAG.getMachineFunction().getFunction();
+
+ // If the function return type is x86_fp80 and the callee return type is not,
+ // then the FP_EXTEND of the call result is not a nop. It's not safe to
+ // perform a tailcall optimization here.
+ if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
+ return false;
+
+ CallingConv::ID CallerCC = CallerF->getCallingConv();
+ bool CCMatch = CallerCC == CalleeCC;
+
+ if (getTargetMachine().Options.GuaranteedTailCallOpt) {
+ if (IsTailCallConvention(CalleeCC) && CCMatch)
+ return true;
+ return false;
+ }
+
+ // Look for obvious safe cases to perform tail call optimization that do not
+ // require ABI changes. This is what gcc calls sibcall.
+
+ // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
+ // emit a special epilogue.
+ if (RegInfo->needsStackRealignment(MF))
+ return false;
+
+ // Also avoid sibcall optimization if either caller or callee uses struct
+ // return semantics.
+ if (isCalleeStructRet || isCallerStructRet)
+ return false;
+
+ // An stdcall caller is expected to clean up its arguments; the callee
+ // isn't going to do that.
+ if (!CCMatch && CallerCC == CallingConv::X86_StdCall)
+ return false;
+
+ // Do not sibcall optimize vararg calls unless all arguments are passed via
+ // registers.
+ if (isVarArg && !Outs.empty()) {
+
+ // Optimizing for varargs on Win64 is unlikely to be safe without
+ // additional testing.
+ if (Subtarget->isTargetWin64())
+ return false;
+
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
+ getTargetMachine(), ArgLocs, *DAG.getContext());
+
+ CCInfo.AnalyzeCallOperands(Outs, CC_X86);
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
+ if (!ArgLocs[i].isRegLoc())
+ return false;
+ }
+
+ // If the call result is in ST0 / ST1, it needs to be popped off the x87
+ // stack. Therefore, if it's not used by the call it is not safe to optimize
+ // this into a sibcall.
+ bool Unused = false;
+ for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
+ if (!Ins[i].Used) {
+ Unused = true;
+ break;
+ }
+ }
+ if (Unused) {
+ SmallVector<CCValAssign, 16> RVLocs;
+ CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(),
+ getTargetMachine(), RVLocs, *DAG.getContext());
+ CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
+ for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
+ CCValAssign &VA = RVLocs[i];
+ if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1)
+ return false;
+ }
+ }
+
+ // If the calling conventions do not match, then we'd better make sure the
+ // results are returned in the same way as what the caller expects.
+ if (!CCMatch) {
+ SmallVector<CCValAssign, 16> RVLocs1;
+ CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(),
+ getTargetMachine(), RVLocs1, *DAG.getContext());
+ CCInfo1.AnalyzeCallResult(Ins, RetCC_X86);
+
+ SmallVector<CCValAssign, 16> RVLocs2;
+ CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(),
+ getTargetMachine(), RVLocs2, *DAG.getContext());
+ CCInfo2.AnalyzeCallResult(Ins, RetCC_X86);
+
+ if (RVLocs1.size() != RVLocs2.size())
+ return false;
+ for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
+ if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
+ return false;
+ if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
+ return false;
+ if (RVLocs1[i].isRegLoc()) {
+ if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
+ return false;
+ } else {
+ if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
+ return false;
+ }
+ }
+ }
+
+ // If the callee takes no arguments then go on to check the results of the
+ // call.
+ if (!Outs.empty()) {
+ // Check if stack adjustment is needed. For now, do not do this if any
+ // argument is passed on the stack.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
+ getTargetMachine(), ArgLocs, *DAG.getContext());
+
+ // Allocate shadow area for Win64
+ if (Subtarget->isTargetWin64()) {
+ CCInfo.AllocateStack(32, 8);
+ }
+
+ CCInfo.AnalyzeCallOperands(Outs, CC_X86);
+ if (CCInfo.getNextStackOffset()) {
+ MachineFunction &MF = DAG.getMachineFunction();
+ if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn())
+ return false;
+
+ // Check if the arguments are already laid out in the right way as
+ // the caller's fixed stack objects.
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ const MachineRegisterInfo *MRI = &MF.getRegInfo();
+ const X86InstrInfo *TII =
+ ((const X86TargetMachine&)getTargetMachine()).getInstrInfo();
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+ SDValue Arg = OutVals[i];
+ ISD::ArgFlagsTy Flags = Outs[i].Flags;
+ if (VA.getLocInfo() == CCValAssign::Indirect)
+ return false;
+ if (!VA.isRegLoc()) {
+ if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
+ MFI, MRI, TII))
+ return false;
+ }
+ }
+ }
+
+ // If the tailcall address may be in a register, then make sure it's
+ // possible to register allocate for it. In 32-bit, the call address can
+ // only target EAX, EDX, or ECX since the tail call must be scheduled after
+ // callee-saved registers are restored. These happen to be the same
+ // registers used to pass 'inreg' arguments so watch out for those.
+ if (!Subtarget->is64Bit() &&
+ ((!isa<GlobalAddressSDNode>(Callee) &&
+ !isa<ExternalSymbolSDNode>(Callee)) ||
+ getTargetMachine().getRelocationModel() == Reloc::PIC_)) {
+ unsigned NumInRegs = 0;
+ // In PIC we need an extra register to formulate the address computation
+ // for the callee.
+ unsigned MaxInRegs =
+ (getTargetMachine().getRelocationModel() == Reloc::PIC_) ? 2 : 3;
+
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+ if (!VA.isRegLoc())
+ continue;
+ unsigned Reg = VA.getLocReg();
+ switch (Reg) {
+ default: break;
+ case X86::EAX: case X86::EDX: case X86::ECX:
+ if (++NumInRegs == MaxInRegs)
+ return false;
+ break;
+ }
+ }
+ }
+ }
+
+ return true;
+}
+
+FastISel *
+X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
+ const TargetLibraryInfo *libInfo) const {
+ return X86::createFastISel(funcInfo, libInfo);
+}
+
+//===----------------------------------------------------------------------===//
+// Other Lowering Hooks
+//===----------------------------------------------------------------------===//
+
+static bool MayFoldLoad(SDValue Op) {
+ return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
+}
+
+static bool MayFoldIntoStore(SDValue Op) {
+ return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
+}
+
+static bool isTargetShuffle(unsigned Opcode) {
+ switch(Opcode) {
+ default: return false;
+ case X86ISD::PSHUFD:
+ case X86ISD::PSHUFHW:
+ case X86ISD::PSHUFLW:
+ case X86ISD::SHUFP:
+ case X86ISD::PALIGNR:
+ case X86ISD::MOVLHPS:
+ case X86ISD::MOVLHPD:
+ case X86ISD::MOVHLPS:
+ case X86ISD::MOVLPS:
+ case X86ISD::MOVLPD:
+ case X86ISD::MOVSHDUP:
+ case X86ISD::MOVSLDUP:
+ case X86ISD::MOVDDUP:
+ case X86ISD::MOVSS:
+ case X86ISD::MOVSD:
+ case X86ISD::UNPCKL:
+ case X86ISD::UNPCKH:
+ case X86ISD::VPERMILP:
+ case X86ISD::VPERM2X128:
+ case X86ISD::VPERMI:
+ return true;
+ }
+}
+
+static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
+ SDValue V1, SelectionDAG &DAG) {
+ switch(Opc) {
+ default: llvm_unreachable("Unknown x86 shuffle node");
+ case X86ISD::MOVSHDUP:
+ case X86ISD::MOVSLDUP:
+ case X86ISD::MOVDDUP:
+ return DAG.getNode(Opc, dl, VT, V1);
+ }
+}
+
+static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
+ SDValue V1, unsigned TargetMask,
+ SelectionDAG &DAG) {
+ switch(Opc) {
+ default: llvm_unreachable("Unknown x86 shuffle node");
+ case X86ISD::PSHUFD:
+ case X86ISD::PSHUFHW:
+ case X86ISD::PSHUFLW:
+ case X86ISD::VPERMILP:
+ case X86ISD::VPERMI:
+ return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8));
+ }
+}
+
+static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
+ SDValue V1, SDValue V2, unsigned TargetMask,
+ SelectionDAG &DAG) {
+ switch(Opc) {
+ default: llvm_unreachable("Unknown x86 shuffle node");
+ case X86ISD::PALIGNR:
+ case X86ISD::SHUFP:
+ case X86ISD::VPERM2X128:
+ return DAG.getNode(Opc, dl, VT, V1, V2,
+ DAG.getConstant(TargetMask, MVT::i8));
+ }
+}
+
+static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
+ SDValue V1, SDValue V2, SelectionDAG &DAG) {
+ switch(Opc) {
+ default: llvm_unreachable("Unknown x86 shuffle node");
+ case X86ISD::MOVLHPS:
+ case X86ISD::MOVLHPD:
+ case X86ISD::MOVHLPS:
+ case X86ISD::MOVLPS:
+ case X86ISD::MOVLPD:
+ case X86ISD::MOVSS:
+ case X86ISD::MOVSD:
+ case X86ISD::UNPCKL:
+ case X86ISD::UNPCKH:
+ return DAG.getNode(Opc, dl, VT, V1, V2);
+ }
+}
+
+SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+ int ReturnAddrIndex = FuncInfo->getRAIndex();
+
+ if (ReturnAddrIndex == 0) {
+ // Set up a frame object for the return address.
+ unsigned SlotSize = RegInfo->getSlotSize();
+ ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize,
+ false);
+ FuncInfo->setRAIndex(ReturnAddrIndex);
+ }
+
+ return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
+}
+
+bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
+ bool hasSymbolicDisplacement) {
+ // Offset should fit into 32 bit immediate field.
+ if (!isInt<32>(Offset))
+ return false;
+
+ // If we don't have a symbolic displacement - we don't have any extra
+ // restrictions.
+ if (!hasSymbolicDisplacement)
+ return true;
+
+ // FIXME: Some tweaks might be needed for medium code model.
+ if (M != CodeModel::Small && M != CodeModel::Kernel)
+ return false;
+
+ // For small code model we assume that latest object is 16MB before end of 31
+ // bits boundary. We may also accept pretty large negative constants knowing
+ // that all objects are in the positive half of address space.
+ if (M == CodeModel::Small && Offset < 16*1024*1024)
+ return true;
+
+ // For kernel code model we know that all object resist in the negative half
+ // of 32bits address space. We may not accept negative offsets, since they may
+ // be just off and we may accept pretty large positive ones.
+ if (M == CodeModel::Kernel && Offset > 0)
+ return true;
+
+ return false;
+}
+
+/// isCalleePop - Determines whether the callee is required to pop its
+/// own arguments. Callee pop is necessary to support tail calls.
+bool X86::isCalleePop(CallingConv::ID CallingConv,
+ bool is64Bit, bool IsVarArg, bool TailCallOpt) {
+ if (IsVarArg)
+ return false;
+
+ switch (CallingConv) {
+ default:
+ return false;
+ case CallingConv::X86_StdCall:
+ return !is64Bit;
+ case CallingConv::X86_FastCall:
+ return !is64Bit;
+ case CallingConv::X86_ThisCall:
+ return !is64Bit;
+ case CallingConv::Fast:
+ return TailCallOpt;
+ case CallingConv::GHC:
+ return TailCallOpt;
+ case CallingConv::HiPE:
+ return TailCallOpt;
+ }
+}
+
+/// TranslateX86CC - do a one to one translation of a ISD::CondCode to the X86
+/// specific condition code, returning the condition code and the LHS/RHS of the
+/// comparison to make.
+static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
+ SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
+ if (!isFP) {
+ if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
+ if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
+ // X > -1 -> X == 0, jump !sign.
+ RHS = DAG.getConstant(0, RHS.getValueType());
+ return X86::COND_NS;
+ }
+ if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
+ // X < 0 -> X == 0, jump on sign.
+ return X86::COND_S;
+ }
+ if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
+ // X < 1 -> X <= 0
+ RHS = DAG.getConstant(0, RHS.getValueType());
+ return X86::COND_LE;
+ }
+ }
+
+ switch (SetCCOpcode) {
+ default: llvm_unreachable("Invalid integer condition!");
+ case ISD::SETEQ: return X86::COND_E;
+ case ISD::SETGT: return X86::COND_G;
+ case ISD::SETGE: return X86::COND_GE;
+ case ISD::SETLT: return X86::COND_L;
+ case ISD::SETLE: return X86::COND_LE;
+ case ISD::SETNE: return X86::COND_NE;
+ case ISD::SETULT: return X86::COND_B;
+ case ISD::SETUGT: return X86::COND_A;
+ case ISD::SETULE: return X86::COND_BE;
+ case ISD::SETUGE: return X86::COND_AE;
+ }
+ }
+
+ // First determine if it is required or is profitable to flip the operands.
+
+ // If LHS is a foldable load, but RHS is not, flip the condition.
+ if (ISD::isNON_EXTLoad(LHS.getNode()) &&
+ !ISD::isNON_EXTLoad(RHS.getNode())) {
+ SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
+ std::swap(LHS, RHS);
+ }
+
+ switch (SetCCOpcode) {
+ default: break;
+ case ISD::SETOLT:
+ case ISD::SETOLE:
+ case ISD::SETUGT:
+ case ISD::SETUGE:
+ std::swap(LHS, RHS);
+ break;
+ }
+
+ // On a floating point condition, the flags are set as follows:
+ // ZF PF CF op
+ // 0 | 0 | 0 | X > Y
+ // 0 | 0 | 1 | X < Y
+ // 1 | 0 | 0 | X == Y
+ // 1 | 1 | 1 | unordered
+ switch (SetCCOpcode) {
+ default: llvm_unreachable("Condcode should be pre-legalized away");
+ case ISD::SETUEQ:
+ case ISD::SETEQ: return X86::COND_E;
+ case ISD::SETOLT: // flipped
+ case ISD::SETOGT:
+ case ISD::SETGT: return X86::COND_A;
+ case ISD::SETOLE: // flipped
+ case ISD::SETOGE:
+ case ISD::SETGE: return X86::COND_AE;
+ case ISD::SETUGT: // flipped
+ case ISD::SETULT:
+ case ISD::SETLT: return X86::COND_B;
+ case ISD::SETUGE: // flipped
+ case ISD::SETULE:
+ case ISD::SETLE: return X86::COND_BE;
+ case ISD::SETONE:
+ case ISD::SETNE: return X86::COND_NE;
+ case ISD::SETUO: return X86::COND_P;
+ case ISD::SETO: return X86::COND_NP;
+ case ISD::SETOEQ:
+ case ISD::SETUNE: return X86::COND_INVALID;
+ }
+}
+
+/// hasFPCMov - is there a floating point cmov for the specific X86 condition
+/// code. Current x86 isa includes the following FP cmov instructions:
+/// fcmovb, fcomvbe, fcomve, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
+static bool hasFPCMov(unsigned X86CC) {
+ switch (X86CC) {
+ default:
+ return false;
+ case X86::COND_B:
+ case X86::COND_BE:
+ case X86::COND_E:
+ case X86::COND_P:
+ case X86::COND_A:
+ case X86::COND_AE:
+ case X86::COND_NE:
+ case X86::COND_NP:
+ return true;
+ }
+}
+
+/// isFPImmLegal - Returns true if the target can instruction select the
+/// specified FP immediate natively. If false, the legalizer will
+/// materialize the FP immediate as a load from a constant pool.
+bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
+ for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
+ if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
+ return true;
+ }
+ return false;
+}
+
+/// isUndefOrInRange - Return true if Val is undef or if its value falls within
+/// the specified range (L, H].
+static bool isUndefOrInRange(int Val, int Low, int Hi) {
+ return (Val < 0) || (Val >= Low && Val < Hi);
+}
+
+/// isUndefOrEqual - Val is either less than zero (undef) or equal to the
+/// specified value.
+static bool isUndefOrEqual(int Val, int CmpVal) {
+ return (Val < 0 || Val == CmpVal);
+}
+
+/// isSequentialOrUndefInRange - Return true if every element in Mask, beginning
+/// from position Pos and ending in Pos+Size, falls within the specified
+/// sequential range (L, L+Pos]. or is undef.
+static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
+ unsigned Pos, unsigned Size, int Low) {
+ for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
+ if (!isUndefOrEqual(Mask[i], Low))
+ return false;
+ return true;
+}
+
+/// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
+/// is suitable for input to PSHUFD or PSHUFW. That is, it doesn't reference
+/// the second operand.
+static bool isPSHUFDMask(ArrayRef<int> Mask, EVT VT) {
+ if (VT == MVT::v4f32 || VT == MVT::v4i32 )
+ return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4);
+ if (VT == MVT::v2f64 || VT == MVT::v2i64)
+ return (Mask[0] < 2 && Mask[1] < 2);
+ return false;
+}
+
+/// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
+/// is suitable for input to PSHUFHW.
+static bool isPSHUFHWMask(ArrayRef<int> Mask, EVT VT, bool HasInt256) {
+ if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
+ return false;
+
+ // Lower quadword copied in order or undef.
+ if (!isSequentialOrUndefInRange(Mask, 0, 4, 0))
+ return false;
+
+ // Upper quadword shuffled.
+ for (unsigned i = 4; i != 8; ++i)
+ if (!isUndefOrInRange(Mask[i], 4, 8))
+ return false;
+
+ if (VT == MVT::v16i16) {
+ // Lower quadword copied in order or undef.
+ if (!isSequentialOrUndefInRange(Mask, 8, 4, 8))
+ return false;
+
+ // Upper quadword shuffled.
+ for (unsigned i = 12; i != 16; ++i)
+ if (!isUndefOrInRange(Mask[i], 12, 16))
+ return false;
+ }
+
+ return true;
+}
+
+/// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that
+/// is suitable for input to PSHUFLW.
+static bool isPSHUFLWMask(ArrayRef<int> Mask, EVT VT, bool HasInt256) {
+ if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
+ return false;
+
+ // Upper quadword copied in order.
+ if (!isSequentialOrUndefInRange(Mask, 4, 4, 4))
+ return false;
+
+ // Lower quadword shuffled.
+ for (unsigned i = 0; i != 4; ++i)
+ if (!isUndefOrInRange(Mask[i], 0, 4))
+ return false;
+
+ if (VT == MVT::v16i16) {
+ // Upper quadword copied in order.
+ if (!isSequentialOrUndefInRange(Mask, 12, 4, 12))
+ return false;
+
+ // Lower quadword shuffled.
+ for (unsigned i = 8; i != 12; ++i)
+ if (!isUndefOrInRange(Mask[i], 8, 12))
+ return false;
+ }
+
+ return true;
+}
+
+/// isPALIGNRMask - Return true if the node specifies a shuffle of elements that
+/// is suitable for input to PALIGNR.
+static bool isPALIGNRMask(ArrayRef<int> Mask, EVT VT,
+ const X86Subtarget *Subtarget) {
+ if ((VT.is128BitVector() && !Subtarget->hasSSSE3()) ||
+ (VT.is256BitVector() && !Subtarget->hasInt256()))
+ return false;
+
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned NumLanes = VT.getSizeInBits()/128;
+ unsigned NumLaneElts = NumElts/NumLanes;
+
+ // Do not handle 64-bit element shuffles with palignr.
+ if (NumLaneElts == 2)
+ return false;
+
+ for (unsigned l = 0; l != NumElts; l+=NumLaneElts) {
+ unsigned i;
+ for (i = 0; i != NumLaneElts; ++i) {
+ if (Mask[i+l] >= 0)
+ break;
+ }
+
+ // Lane is all undef, go to next lane
+ if (i == NumLaneElts)
+ continue;
+
+ int Start = Mask[i+l];
+
+ // Make sure its in this lane in one of the sources
+ if (!isUndefOrInRange(Start, l, l+NumLaneElts) &&
+ !isUndefOrInRange(Start, l+NumElts, l+NumElts+NumLaneElts))
+ return false;
+
+ // If not lane 0, then we must match lane 0
+ if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Start, Mask[i]+l))
+ return false;
+
+ // Correct second source to be contiguous with first source
+ if (Start >= (int)NumElts)
+ Start -= NumElts - NumLaneElts;
+
+ // Make sure we're shifting in the right direction.
+ if (Start <= (int)(i+l))
+ return false;
+
+ Start -= i;
+
+ // Check the rest of the elements to see if they are consecutive.
+ for (++i; i != NumLaneElts; ++i) {
+ int Idx = Mask[i+l];
+
+ // Make sure its in this lane
+ if (!isUndefOrInRange(Idx, l, l+NumLaneElts) &&
+ !isUndefOrInRange(Idx, l+NumElts, l+NumElts+NumLaneElts))
+ return false;
+
+ // If not lane 0, then we must match lane 0
+ if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Idx, Mask[i]+l))
+ return false;
+
+ if (Idx >= (int)NumElts)
+ Idx -= NumElts - NumLaneElts;
+
+ if (!isUndefOrEqual(Idx, Start+i))
+ return false;
+
+ }
+ }
+
+ return true;
+}
+
+/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
+/// the two vector operands have swapped position.
+static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask,
+ unsigned NumElems) {
+ for (unsigned i = 0; i != NumElems; ++i) {
+ int idx = Mask[i];
+ if (idx < 0)
+ continue;
+ else if (idx < (int)NumElems)
+ Mask[i] = idx + NumElems;
+ else
+ Mask[i] = idx - NumElems;
+ }
+}
+
+/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to 128/256-bit
+/// SHUFPS and SHUFPD. If Commuted is true, then it checks for sources to be
+/// reverse of what x86 shuffles want.
+static bool isSHUFPMask(ArrayRef<int> Mask, EVT VT, bool HasFp256,
+ bool Commuted = false) {
+ if (!HasFp256 && VT.is256BitVector())
+ return false;
+
+ unsigned NumElems = VT.getVectorNumElements();
+ unsigned NumLanes = VT.getSizeInBits()/128;
+ unsigned NumLaneElems = NumElems/NumLanes;
+
+ if (NumLaneElems != 2 && NumLaneElems != 4)
+ return false;
+
+ // VSHUFPSY divides the resulting vector into 4 chunks.
+ // The sources are also splitted into 4 chunks, and each destination
+ // chunk must come from a different source chunk.
+ //
+ // SRC1 => X7 X6 X5 X4 X3 X2 X1 X0
+ // SRC2 => Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y9
+ //
+ // DST => Y7..Y4, Y7..Y4, X7..X4, X7..X4,
+ // Y3..Y0, Y3..Y0, X3..X0, X3..X0
+ //
+ // VSHUFPDY divides the resulting vector into 4 chunks.
+ // The sources are also splitted into 4 chunks, and each destination
+ // chunk must come from a different source chunk.
+ //
+ // SRC1 => X3 X2 X1 X0
+ // SRC2 => Y3 Y2 Y1 Y0
+ //
+ // DST => Y3..Y2, X3..X2, Y1..Y0, X1..X0
+ //
+ unsigned HalfLaneElems = NumLaneElems/2;
+ for (unsigned l = 0; l != NumElems; l += NumLaneElems) {
+ for (unsigned i = 0; i != NumLaneElems; ++i) {
+ int Idx = Mask[i+l];
+ unsigned RngStart = l + ((Commuted == (i<HalfLaneElems)) ? NumElems : 0);
+ if (!isUndefOrInRange(Idx, RngStart, RngStart+NumLaneElems))
+ return false;
+ // For VSHUFPSY, the mask of the second half must be the same as the
+ // first but with the appropriate offsets. This works in the same way as
+ // VPERMILPS works with masks.
+ if (NumElems != 8 || l == 0 || Mask[i] < 0)
+ continue;
+ if (!isUndefOrEqual(Idx, Mask[i]+l))
+ return false;
+ }
+ }
+
+ return true;
+}
+
+/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to MOVHLPS.
+static bool isMOVHLPSMask(ArrayRef<int> Mask, EVT VT) {
+ if (!VT.is128BitVector())
+ return false;
+
+ unsigned NumElems = VT.getVectorNumElements();
+
+ if (NumElems != 4)
+ return false;
+
+ // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3
+ return isUndefOrEqual(Mask[0], 6) &&
+ isUndefOrEqual(Mask[1], 7) &&
+ isUndefOrEqual(Mask[2], 2) &&
+ isUndefOrEqual(Mask[3], 3);
+}
+
+/// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
+/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
+/// <2, 3, 2, 3>
+static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, EVT VT) {
+ if (!VT.is128BitVector())
+ return false;
+
+ unsigned NumElems = VT.getVectorNumElements();
+
+ if (NumElems != 4)
+ return false;
+
+ return isUndefOrEqual(Mask[0], 2) &&
+ isUndefOrEqual(Mask[1], 3) &&
+ isUndefOrEqual(Mask[2], 2) &&
+ isUndefOrEqual(Mask[3], 3);
+}
+
+/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
+static bool isMOVLPMask(ArrayRef<int> Mask, EVT VT) {
+ if (!VT.is128BitVector())
+ return false;
+
+ unsigned NumElems = VT.getVectorNumElements();
+
+ if (NumElems != 2 && NumElems != 4)
+ return false;
+
+ for (unsigned i = 0, e = NumElems/2; i != e; ++i)
+ if (!isUndefOrEqual(Mask[i], i + NumElems))
+ return false;
+
+ for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
+ if (!isUndefOrEqual(Mask[i], i))
+ return false;
+
+ return true;
+}
+
+/// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to MOVLHPS.
+static bool isMOVLHPSMask(ArrayRef<int> Mask, EVT VT) {
+ if (!VT.is128BitVector())
+ return false;
+
+ unsigned NumElems = VT.getVectorNumElements();
+
+ if (NumElems != 2 && NumElems != 4)
+ return false;
+
+ for (unsigned i = 0, e = NumElems/2; i != e; ++i)
+ if (!isUndefOrEqual(Mask[i], i))
+ return false;
+
+ for (unsigned i = 0, e = NumElems/2; i != e; ++i)
+ if (!isUndefOrEqual(Mask[i + e], i + NumElems))
+ return false;
+
+ return true;
+}
+
+//
+// Some special combinations that can be optimized.
+//
+static
+SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp,
+ SelectionDAG &DAG) {
+ MVT VT = SVOp->getValueType(0).getSimpleVT();
+ DebugLoc dl = SVOp->getDebugLoc();
+
+ if (VT != MVT::v8i32 && VT != MVT::v8f32)
+ return SDValue();
+
+ ArrayRef<int> Mask = SVOp->getMask();
+
+ // These are the special masks that may be optimized.
+ static const int MaskToOptimizeEven[] = {0, 8, 2, 10, 4, 12, 6, 14};
+ static const int MaskToOptimizeOdd[] = {1, 9, 3, 11, 5, 13, 7, 15};
+ bool MatchEvenMask = true;
+ bool MatchOddMask = true;
+ for (int i=0; i<8; ++i) {
+ if (!isUndefOrEqual(Mask[i], MaskToOptimizeEven[i]))
+ MatchEvenMask = false;
+ if (!isUndefOrEqual(Mask[i], MaskToOptimizeOdd[i]))
+ MatchOddMask = false;
+ }
+
+ if (!MatchEvenMask && !MatchOddMask)
+ return SDValue();
+
+ SDValue UndefNode = DAG.getNode(ISD::UNDEF, dl, VT);
+
+ SDValue Op0 = SVOp->getOperand(0);
+ SDValue Op1 = SVOp->getOperand(1);
+
+ if (MatchEvenMask) {
+ // Shift the second operand right to 32 bits.
+ static const int ShiftRightMask[] = {-1, 0, -1, 2, -1, 4, -1, 6 };
+ Op1 = DAG.getVectorShuffle(VT, dl, Op1, UndefNode, ShiftRightMask);
+ } else {
+ // Shift the first operand left to 32 bits.
+ static const int ShiftLeftMask[] = {1, -1, 3, -1, 5, -1, 7, -1 };
+ Op0 = DAG.getVectorShuffle(VT, dl, Op0, UndefNode, ShiftLeftMask);
+ }
+ static const int BlendMask[] = {0, 9, 2, 11, 4, 13, 6, 15};
+ return DAG.getVectorShuffle(VT, dl, Op0, Op1, BlendMask);
+}
+
+/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to UNPCKL.
+static bool isUNPCKLMask(ArrayRef<int> Mask, EVT VT,
+ bool HasInt256, bool V2IsSplat = false) {
+ unsigned NumElts = VT.getVectorNumElements();
+
+ assert((VT.is128BitVector() || VT.is256BitVector()) &&
+ "Unsupported vector type for unpckh");
+
+ if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
+ (!HasInt256 || (NumElts != 16 && NumElts != 32)))
+ return false;
+
+ // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
+ // independently on 128-bit lanes.
+ unsigned NumLanes = VT.getSizeInBits()/128;
+ unsigned NumLaneElts = NumElts/NumLanes;
+
+ for (unsigned l = 0; l != NumLanes; ++l) {
+ for (unsigned i = l*NumLaneElts, j = l*NumLaneElts;
+ i != (l+1)*NumLaneElts;
+ i += 2, ++j) {
+ int BitI = Mask[i];
+ int BitI1 = Mask[i+1];
+ if (!isUndefOrEqual(BitI, j))
+ return false;
+ if (V2IsSplat) {
+ if (!isUndefOrEqual(BitI1, NumElts))
+ return false;
+ } else {
+ if (!isUndefOrEqual(BitI1, j + NumElts))
+ return false;
+ }
+ }
+ }
+
+ return true;
+}
+
+/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to UNPCKH.
+static bool isUNPCKHMask(ArrayRef<int> Mask, EVT VT,
+ bool HasInt256, bool V2IsSplat = false) {
+ unsigned NumElts = VT.getVectorNumElements();
+
+ assert((VT.is128BitVector() || VT.is256BitVector()) &&
+ "Unsupported vector type for unpckh");
+
+ if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
+ (!HasInt256 || (NumElts != 16 && NumElts != 32)))
+ return false;
+
+ // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
+ // independently on 128-bit lanes.
+ unsigned NumLanes = VT.getSizeInBits()/128;
+ unsigned NumLaneElts = NumElts/NumLanes;
+
+ for (unsigned l = 0; l != NumLanes; ++l) {
+ for (unsigned i = l*NumLaneElts, j = (l*NumLaneElts)+NumLaneElts/2;
+ i != (l+1)*NumLaneElts; i += 2, ++j) {
+ int BitI = Mask[i];
+ int BitI1 = Mask[i+1];
+ if (!isUndefOrEqual(BitI, j))
+ return false;
+ if (V2IsSplat) {
+ if (isUndefOrEqual(BitI1, NumElts))
+ return false;
+ } else {
+ if (!isUndefOrEqual(BitI1, j+NumElts))
+ return false;
+ }
+ }
+ }
+ return true;
+}
+
+/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
+/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
+/// <0, 0, 1, 1>
+static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, EVT VT, bool HasInt256) {
+ unsigned NumElts = VT.getVectorNumElements();
+ bool Is256BitVec = VT.is256BitVector();
+
+ assert((VT.is128BitVector() || VT.is256BitVector()) &&
+ "Unsupported vector type for unpckh");
+
+ if (Is256BitVec && NumElts != 4 && NumElts != 8 &&
+ (!HasInt256 || (NumElts != 16 && NumElts != 32)))
+ return false;
+
+ // For 256-bit i64/f64, use MOVDDUPY instead, so reject the matching pattern
+ // FIXME: Need a better way to get rid of this, there's no latency difference
+ // between UNPCKLPD and MOVDDUP, the later should always be checked first and
+ // the former later. We should also remove the "_undef" special mask.
+ if (NumElts == 4 && Is256BitVec)
+ return false;
+
+ // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
+ // independently on 128-bit lanes.
+ unsigned NumLanes = VT.getSizeInBits()/128;
+ unsigned NumLaneElts = NumElts/NumLanes;
+
+ for (unsigned l = 0; l != NumLanes; ++l) {
+ for (unsigned i = l*NumLaneElts, j = l*NumLaneElts;
+ i != (l+1)*NumLaneElts;
+ i += 2, ++j) {
+ int BitI = Mask[i];
+ int BitI1 = Mask[i+1];
+
+ if (!isUndefOrEqual(BitI, j))
+ return false;
+ if (!isUndefOrEqual(BitI1, j))
+ return false;
+ }
+ }
+
+ return true;
+}
+
+/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
+/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
+/// <2, 2, 3, 3>
+static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, EVT VT, bool HasInt256) {
+ unsigned NumElts = VT.getVectorNumElements();
+
+ assert((VT.is128BitVector() || VT.is256BitVector()) &&
+ "Unsupported vector type for unpckh");
+
+ if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
+ (!HasInt256 || (NumElts != 16 && NumElts != 32)))
+ return false;
+
+ // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
+ // independently on 128-bit lanes.
+ unsigned NumLanes = VT.getSizeInBits()/128;
+ unsigned NumLaneElts = NumElts/NumLanes;
+
+ for (unsigned l = 0; l != NumLanes; ++l) {
+ for (unsigned i = l*NumLaneElts, j = (l*NumLaneElts)+NumLaneElts/2;
+ i != (l+1)*NumLaneElts; i += 2, ++j) {
+ int BitI = Mask[i];
+ int BitI1 = Mask[i+1];
+ if (!isUndefOrEqual(BitI, j))
+ return false;
+ if (!isUndefOrEqual(BitI1, j))
+ return false;
+ }
+ }
+ return true;
+}
+
+/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to MOVSS,
+/// MOVSD, and MOVD, i.e. setting the lowest element.
+static bool isMOVLMask(ArrayRef<int> Mask, EVT VT) {
+ if (VT.getVectorElementType().getSizeInBits() < 32)
+ return false;
+ if (!VT.is128BitVector())
+ return false;
+
+ unsigned NumElts = VT.getVectorNumElements();
+
+ if (!isUndefOrEqual(Mask[0], NumElts))
+ return false;
+
+ for (unsigned i = 1; i != NumElts; ++i)
+ if (!isUndefOrEqual(Mask[i], i))
+ return false;
+
+ return true;
+}
+
+/// isVPERM2X128Mask - Match 256-bit shuffles where the elements are considered
+/// as permutations between 128-bit chunks or halves. As an example: this
+/// shuffle bellow:
+/// vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15>
+/// The first half comes from the second half of V1 and the second half from the
+/// the second half of V2.
+static bool isVPERM2X128Mask(ArrayRef<int> Mask, EVT VT, bool HasFp256) {
+ if (!HasFp256 || !VT.is256BitVector())
+ return false;
+
+ // The shuffle result is divided into half A and half B. In total the two
+ // sources have 4 halves, namely: C, D, E, F. The final values of A and
+ // B must come from C, D, E or F.
+ unsigned HalfSize = VT.getVectorNumElements()/2;
+ bool MatchA = false, MatchB = false;
+
+ // Check if A comes from one of C, D, E, F.
+ for (unsigned Half = 0; Half != 4; ++Half) {
+ if (isSequentialOrUndefInRange(Mask, 0, HalfSize, Half*HalfSize)) {
+ MatchA = true;
+ break;
+ }
+ }
+
+ // Check if B comes from one of C, D, E, F.
+ for (unsigned Half = 0; Half != 4; ++Half) {
+ if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, Half*HalfSize)) {
+ MatchB = true;
+ break;
+ }
+ }
+
+ return MatchA && MatchB;
+}
+
+/// getShuffleVPERM2X128Immediate - Return the appropriate immediate to shuffle
+/// the specified VECTOR_MASK mask with VPERM2F128/VPERM2I128 instructions.
+static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) {
+ MVT VT = SVOp->getValueType(0).getSimpleVT();
+
+ unsigned HalfSize = VT.getVectorNumElements()/2;
+
+ unsigned FstHalf = 0, SndHalf = 0;
+ for (unsigned i = 0; i < HalfSize; ++i) {
+ if (SVOp->getMaskElt(i) > 0) {
+ FstHalf = SVOp->getMaskElt(i)/HalfSize;
+ break;
+ }
+ }
+ for (unsigned i = HalfSize; i < HalfSize*2; ++i) {
+ if (SVOp->getMaskElt(i) > 0) {
+ SndHalf = SVOp->getMaskElt(i)/HalfSize;
+ break;
+ }
+ }
+
+ return (FstHalf | (SndHalf << 4));
+}
+
+/// isVPERMILPMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to VPERMILPD*.
+/// Note that VPERMIL mask matching is different depending whether theunderlying
+/// type is 32 or 64. In the VPERMILPS the high half of the mask should point
+/// to the same elements of the low, but to the higher half of the source.
+/// In VPERMILPD the two lanes could be shuffled independently of each other
+/// with the same restriction that lanes can't be crossed. Also handles PSHUFDY.
+static bool isVPERMILPMask(ArrayRef<int> Mask, EVT VT, bool HasFp256) {
+ if (!HasFp256)
+ return false;
+
+ unsigned NumElts = VT.getVectorNumElements();
+ // Only match 256-bit with 32/64-bit types
+ if (!VT.is256BitVector() || (NumElts != 4 && NumElts != 8))
+ return false;
+
+ unsigned NumLanes = VT.getSizeInBits()/128;
+ unsigned LaneSize = NumElts/NumLanes;
+ for (unsigned l = 0; l != NumElts; l += LaneSize) {
+ for (unsigned i = 0; i != LaneSize; ++i) {
+ if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
+ return false;
+ if (NumElts != 8 || l == 0)
+ continue;
+ // VPERMILPS handling
+ if (Mask[i] < 0)
+ continue;
+ if (!isUndefOrEqual(Mask[i+l], Mask[i]+l))
+ return false;
+ }
+ }
+
+ return true;
+}
+
+/// isCommutedMOVLMask - Returns true if the shuffle mask is except the reverse
+/// of what x86 movss want. X86 movs requires the lowest element to be lowest
+/// element of vector 2 and the other elements to come from vector 1 in order.
+static bool isCommutedMOVLMask(ArrayRef<int> Mask, EVT VT,
+ bool V2IsSplat = false, bool V2IsUndef = false) {
+ if (!VT.is128BitVector())
+ return false;
+
+ unsigned NumOps = VT.getVectorNumElements();
+ if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
+ return false;
+
+ if (!isUndefOrEqual(Mask[0], 0))
+ return false;
+
+ for (unsigned i = 1; i != NumOps; ++i)
+ if (!(isUndefOrEqual(Mask[i], i+NumOps) ||
+ (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) ||
+ (V2IsSplat && isUndefOrEqual(Mask[i], NumOps))))
+ return false;
+
+ return true;
+}
+
+/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
+/// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7>
+static bool isMOVSHDUPMask(ArrayRef<int> Mask, EVT VT,
+ const X86Subtarget *Subtarget) {
+ if (!Subtarget->hasSSE3())
+ return false;
+
+ unsigned NumElems = VT.getVectorNumElements();
+
+ if ((VT.is128BitVector() && NumElems != 4) ||
+ (VT.is256BitVector() && NumElems != 8))
+ return false;
+
+ // "i+1" is the value the indexed mask element must have
+ for (unsigned i = 0; i != NumElems; i += 2)
+ if (!isUndefOrEqual(Mask[i], i+1) ||
+ !isUndefOrEqual(Mask[i+1], i+1))
+ return false;
+
+ return true;
+}
+
+/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
+/// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6>
+static bool isMOVSLDUPMask(ArrayRef<int> Mask, EVT VT,
+ const X86Subtarget *Subtarget) {
+ if (!Subtarget->hasSSE3())
+ return false;
+
+ unsigned NumElems = VT.getVectorNumElements();
+
+ if ((VT.is128BitVector() && NumElems != 4) ||
+ (VT.is256BitVector() && NumElems != 8))
+ return false;
+
+ // "i" is the value the indexed mask element must have
+ for (unsigned i = 0; i != NumElems; i += 2)
+ if (!isUndefOrEqual(Mask[i], i) ||
+ !isUndefOrEqual(Mask[i+1], i))
+ return false;
+
+ return true;
+}
+
+/// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to 256-bit
+/// version of MOVDDUP.
+static bool isMOVDDUPYMask(ArrayRef<int> Mask, EVT VT, bool HasFp256) {
+ if (!HasFp256 || !VT.is256BitVector())
+ return false;
+
+ unsigned NumElts = VT.getVectorNumElements();
+ if (NumElts != 4)
+ return false;
+
+ for (unsigned i = 0; i != NumElts/2; ++i)
+ if (!isUndefOrEqual(Mask[i], 0))
+ return false;
+ for (unsigned i = NumElts/2; i != NumElts; ++i)
+ if (!isUndefOrEqual(Mask[i], NumElts/2))
+ return false;
+ return true;
+}
+
+/// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to 128-bit
+/// version of MOVDDUP.
+static bool isMOVDDUPMask(ArrayRef<int> Mask, EVT VT) {
+ if (!VT.is128BitVector())
+ return false;
+
+ unsigned e = VT.getVectorNumElements() / 2;
+ for (unsigned i = 0; i != e; ++i)
+ if (!isUndefOrEqual(Mask[i], i))
+ return false;
+ for (unsigned i = 0; i != e; ++i)
+ if (!isUndefOrEqual(Mask[e+i], i))
+ return false;
+ return true;
+}
+
+/// isVEXTRACTF128Index - Return true if the specified
+/// EXTRACT_SUBVECTOR operand specifies a vector extract that is
+/// suitable for input to VEXTRACTF128.
+bool X86::isVEXTRACTF128Index(SDNode *N) {
+ if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
+ return false;
+
+ // The index should be aligned on a 128-bit boundary.
+ uint64_t Index =
+ cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
+
+ MVT VT = N->getValueType(0).getSimpleVT();
+ unsigned ElSize = VT.getVectorElementType().getSizeInBits();
+ bool Result = (Index * ElSize) % 128 == 0;
+
+ return Result;
+}
+
+/// isVINSERTF128Index - Return true if the specified INSERT_SUBVECTOR
+/// operand specifies a subvector insert that is suitable for input to
+/// VINSERTF128.
+bool X86::isVINSERTF128Index(SDNode *N) {
+ if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
+ return false;
+
+ // The index should be aligned on a 128-bit boundary.
+ uint64_t Index =
+ cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
+
+ MVT VT = N->getValueType(0).getSimpleVT();
+ unsigned ElSize = VT.getVectorElementType().getSizeInBits();
+ bool Result = (Index * ElSize) % 128 == 0;
+
+ return Result;
+}
+
+/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
+/// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions.
+/// Handles 128-bit and 256-bit.
+static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) {
+ MVT VT = N->getValueType(0).getSimpleVT();
+
+ assert((VT.is128BitVector() || VT.is256BitVector()) &&
+ "Unsupported vector type for PSHUF/SHUFP");
+
+ // Handle 128 and 256-bit vector lengths. AVX defines PSHUF/SHUFP to operate
+ // independently on 128-bit lanes.
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned NumLanes = VT.getSizeInBits()/128;
+ unsigned NumLaneElts = NumElts/NumLanes;
+
+ assert((NumLaneElts == 2 || NumLaneElts == 4) &&
+ "Only supports 2 or 4 elements per lane");
+
+ unsigned Shift = (NumLaneElts == 4) ? 1 : 0;
+ unsigned Mask = 0;
+ for (unsigned i = 0; i != NumElts; ++i) {
+ int Elt = N->getMaskElt(i);
+ if (Elt < 0) continue;
+ Elt &= NumLaneElts - 1;
+ unsigned ShAmt = (i << Shift) % 8;
+ Mask |= Elt << ShAmt;
+ }
+
+ return Mask;
+}
+
+/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
+/// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction.
+static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) {
+ MVT VT = N->getValueType(0).getSimpleVT();
+
+ assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
+ "Unsupported vector type for PSHUFHW");
+
+ unsigned NumElts = VT.getVectorNumElements();
+
+ unsigned Mask = 0;
+ for (unsigned l = 0; l != NumElts; l += 8) {
+ // 8 nodes per lane, but we only care about the last 4.
+ for (unsigned i = 0; i < 4; ++i) {
+ int Elt = N->getMaskElt(l+i+4);
+ if (Elt < 0) continue;
+ Elt &= 0x3; // only 2-bits.
+ Mask |= Elt << (i * 2);
+ }
+ }
+
+ return Mask;
+}
+
+/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
+/// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction.
+static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) {
+ MVT VT = N->getValueType(0).getSimpleVT();
+
+ assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
+ "Unsupported vector type for PSHUFHW");
+
+ unsigned NumElts = VT.getVectorNumElements();
+
+ unsigned Mask = 0;
+ for (unsigned l = 0; l != NumElts; l += 8) {
+ // 8 nodes per lane, but we only care about the first 4.
+ for (unsigned i = 0; i < 4; ++i) {
+ int Elt = N->getMaskElt(l+i);
+ if (Elt < 0) continue;
+ Elt &= 0x3; // only 2-bits
+ Mask |= Elt << (i * 2);
+ }
+ }
+
+ return Mask;
+}
+
+/// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle
+/// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction.
+static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) {
+ MVT VT = SVOp->getValueType(0).getSimpleVT();
+ unsigned EltSize = VT.getVectorElementType().getSizeInBits() >> 3;
+
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned NumLanes = VT.getSizeInBits()/128;
+ unsigned NumLaneElts = NumElts/NumLanes;
+
+ int Val = 0;
+ unsigned i;
+ for (i = 0; i != NumElts; ++i) {
+ Val = SVOp->getMaskElt(i);
+ if (Val >= 0)
+ break;
+ }
+ if (Val >= (int)NumElts)
+ Val -= NumElts - NumLaneElts;
+
+ assert(Val - i > 0 && "PALIGNR imm should be positive");
+ return (Val - i) * EltSize;
+}
+
+/// getExtractVEXTRACTF128Immediate - Return the appropriate immediate
+/// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128
+/// instructions.
+unsigned X86::getExtractVEXTRACTF128Immediate(SDNode *N) {
+ if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
+ llvm_unreachable("Illegal extract subvector for VEXTRACTF128");
+
+ uint64_t Index =
+ cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
+
+ MVT VecVT = N->getOperand(0).getValueType().getSimpleVT();
+ MVT ElVT = VecVT.getVectorElementType();
+
+ unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits();
+ return Index / NumElemsPerChunk;
+}
+
+/// getInsertVINSERTF128Immediate - Return the appropriate immediate
+/// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128
+/// instructions.
+unsigned X86::getInsertVINSERTF128Immediate(SDNode *N) {
+ if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
+ llvm_unreachable("Illegal insert subvector for VINSERTF128");
+
+ uint64_t Index =
+ cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
+
+ MVT VecVT = N->getValueType(0).getSimpleVT();
+ MVT ElVT = VecVT.getVectorElementType();
+
+ unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits();
+ return Index / NumElemsPerChunk;
+}
+
+/// getShuffleCLImmediate - Return the appropriate immediate to shuffle
+/// the specified VECTOR_SHUFFLE mask with VPERMQ and VPERMPD instructions.
+/// Handles 256-bit.
+static unsigned getShuffleCLImmediate(ShuffleVectorSDNode *N) {
+ MVT VT = N->getValueType(0).getSimpleVT();
+
+ unsigned NumElts = VT.getVectorNumElements();
+
+ assert((VT.is256BitVector() && NumElts == 4) &&
+ "Unsupported vector type for VPERMQ/VPERMPD");
+
+ unsigned Mask = 0;
+ for (unsigned i = 0; i != NumElts; ++i) {
+ int Elt = N->getMaskElt(i);
+ if (Elt < 0)
+ continue;
+ Mask |= Elt << (i*2);
+ }
+
+ return Mask;
+}
+/// isZeroNode - Returns true if Elt is a constant zero or a floating point
+/// constant +0.0.
+bool X86::isZeroNode(SDValue Elt) {
+ if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Elt))
+ return CN->isNullValue();
+ if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Elt))
+ return CFP->getValueAPF().isPosZero();
+ return false;
+}
+
+/// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in
+/// their permute mask.
+static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp,
+ SelectionDAG &DAG) {
+ MVT VT = SVOp->getValueType(0).getSimpleVT();
+ unsigned NumElems = VT.getVectorNumElements();
+ SmallVector<int, 8> MaskVec;
+
+ for (unsigned i = 0; i != NumElems; ++i) {
+ int Idx = SVOp->getMaskElt(i);
+ if (Idx >= 0) {
+ if (Idx < (int)NumElems)
+ Idx += NumElems;
+ else
+ Idx -= NumElems;
+ }
+ MaskVec.push_back(Idx);
+ }
+ return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(1),
+ SVOp->getOperand(0), &MaskVec[0]);
+}
+
+/// ShouldXformToMOVHLPS - Return true if the node should be transformed to
+/// match movhlps. The lower half elements should come from upper half of
+/// V1 (and in order), and the upper half elements should come from the upper
+/// half of V2 (and in order).
+static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, EVT VT) {
+ if (!VT.is128BitVector())
+ return false;
+ if (VT.getVectorNumElements() != 4)
+ return false;
+ for (unsigned i = 0, e = 2; i != e; ++i)
+ if (!isUndefOrEqual(Mask[i], i+2))
+ return false;
+ for (unsigned i = 2; i != 4; ++i)
+ if (!isUndefOrEqual(Mask[i], i+4))
+ return false;
+ return true;
+}
+
+/// isScalarLoadToVector - Returns true if the node is a scalar load that
+/// is promoted to a vector. It also returns the LoadSDNode by reference if
+/// required.
+static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) {
+ if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
+ return false;
+ N = N->getOperand(0).getNode();
+ if (!ISD::isNON_EXTLoad(N))
+ return false;
+ if (LD)
+ *LD = cast<LoadSDNode>(N);
+ return true;
+}
+
+// Test whether the given value is a vector value which will be legalized
+// into a load.
+static bool WillBeConstantPoolLoad(SDNode *N) {
+ if (N->getOpcode() != ISD::BUILD_VECTOR)
+ return false;
+
+ // Check for any non-constant elements.
+ for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
+ switch (N->getOperand(i).getNode()->getOpcode()) {
+ case ISD::UNDEF:
+ case ISD::ConstantFP:
+ case ISD::Constant:
+ break;
+ default:
+ return false;
+ }
+
+ // Vectors of all-zeros and all-ones are materialized with special
+ // instructions rather than being loaded.
+ return !ISD::isBuildVectorAllZeros(N) &&
+ !ISD::isBuildVectorAllOnes(N);
+}
+
+/// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
+/// match movlp{s|d}. The lower half elements should come from lower half of
+/// V1 (and in order), and the upper half elements should come from the upper
+/// half of V2 (and in order). And since V1 will become the source of the
+/// MOVLP, it must be either a vector load or a scalar load to vector.
+static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
+ ArrayRef<int> Mask, EVT VT) {
+ if (!VT.is128BitVector())
+ return false;
+
+ if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
+ return false;
+ // Is V2 is a vector load, don't do this transformation. We will try to use
+ // load folding shufps op.
+ if (ISD::isNON_EXTLoad(V2) || WillBeConstantPoolLoad(V2))
+ return false;
+
+ unsigned NumElems = VT.getVectorNumElements();
+
+ if (NumElems != 2 && NumElems != 4)
+ return false;
+ for (unsigned i = 0, e = NumElems/2; i != e; ++i)
+ if (!isUndefOrEqual(Mask[i], i))
+ return false;
+ for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
+ if (!isUndefOrEqual(Mask[i], i+NumElems))
+ return false;
+ return true;
+}
+
+/// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are
+/// all the same.
+static bool isSplatVector(SDNode *N) {
+ if (N->getOpcode() != ISD::BUILD_VECTOR)
+ return false;
+
+ SDValue SplatValue = N->getOperand(0);
+ for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
+ if (N->getOperand(i) != SplatValue)
+ return false;
+ return true;
+}
+
+/// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
+/// to an zero vector.
+/// FIXME: move to dag combiner / method on ShuffleVectorSDNode
+static bool isZeroShuffle(ShuffleVectorSDNode *N) {
+ SDValue V1 = N->getOperand(0);
+ SDValue V2 = N->getOperand(1);
+ unsigned NumElems = N->getValueType(0).getVectorNumElements();
+ for (unsigned i = 0; i != NumElems; ++i) {
+ int Idx = N->getMaskElt(i);
+ if (Idx >= (int)NumElems) {
+ unsigned Opc = V2.getOpcode();
+ if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode()))
+ continue;
+ if (Opc != ISD::BUILD_VECTOR ||
+ !X86::isZeroNode(V2.getOperand(Idx-NumElems)))
+ return false;
+ } else if (Idx >= 0) {
+ unsigned Opc = V1.getOpcode();
+ if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode()))
+ continue;
+ if (Opc != ISD::BUILD_VECTOR ||
+ !X86::isZeroNode(V1.getOperand(Idx)))
+ return false;
+ }
+ }
+ return true;
+}
+
+/// getZeroVector - Returns a vector of specified type with all zero elements.
+///
+static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG, DebugLoc dl) {
+ assert(VT.isVector() && "Expected a vector type");
+
+ // Always build SSE zero vectors as <4 x i32> bitcasted
+ // to their dest type. This ensures they get CSE'd.
+ SDValue Vec;
+ if (VT.is128BitVector()) { // SSE
+ if (Subtarget->hasSSE2()) { // SSE2
+ SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
+ Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
+ } else { // SSE1
+ SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
+ Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
+ }
+ } else if (VT.is256BitVector()) { // AVX
+ if (Subtarget->hasInt256()) { // AVX2
+ SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
+ SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
+ Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops,
+ array_lengthof(Ops));
+ } else {
+ // 256-bit logic and arithmetic instructions in AVX are all
+ // floating-point, no support for integer ops. Emit fp zeroed vectors.
+ SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
+ SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
+ Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops,
+ array_lengthof(Ops));
+ }
+ } else
+ llvm_unreachable("Unexpected vector type");
+
+ return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
+}
+
+/// getOnesVector - Returns a vector of specified type with all bits set.
+/// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
+/// no AVX2 supprt, use two <4 x i32> inserted in a <8 x i32> appropriately.
+/// Then bitcast to their original type, ensuring they get CSE'd.
+static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG,
+ DebugLoc dl) {
+ assert(VT.isVector() && "Expected a vector type");
+
+ SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
+ SDValue Vec;
+ if (VT.is256BitVector()) {
+ if (HasInt256) { // AVX2
+ SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
+ Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops,
+ array_lengthof(Ops));
+ } else { // AVX
+ Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
+ Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
+ }
+ } else if (VT.is128BitVector()) {
+ Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
+ } else
+ llvm_unreachable("Unexpected vector type");
+
+ return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
+}
+
+/// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements
+/// that point to V2 points to its first element.
+static void NormalizeMask(SmallVectorImpl<int> &Mask, unsigned NumElems) {
+ for (unsigned i = 0; i != NumElems; ++i) {
+ if (Mask[i] > (int)NumElems) {
+ Mask[i] = NumElems;
+ }
+ }
+}
+
+/// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd
+/// operation of specified width.
+static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
+ SDValue V2) {
+ unsigned NumElems = VT.getVectorNumElements();
+ SmallVector<int, 8> Mask;
+ Mask.push_back(NumElems);
+ for (unsigned i = 1; i != NumElems; ++i)
+ Mask.push_back(i);
+ return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
+}
+
+/// getUnpackl - Returns a vector_shuffle node for an unpackl operation.
+static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
+ SDValue V2) {
+ unsigned NumElems = VT.getVectorNumElements();
+ SmallVector<int, 8> Mask;
+ for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
+ Mask.push_back(i);
+ Mask.push_back(i + NumElems);
+ }
+ return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
+}
+
+/// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
+static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
+ SDValue V2) {
+ unsigned NumElems = VT.getVectorNumElements();
+ SmallVector<int, 8> Mask;
+ for (unsigned i = 0, Half = NumElems/2; i != Half; ++i) {
+ Mask.push_back(i + Half);
+ Mask.push_back(i + NumElems + Half);
+ }
+ return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
+}
+
+// PromoteSplati8i16 - All i16 and i8 vector types can't be used directly by
+// a generic shuffle instruction because the target has no such instructions.
+// Generate shuffles which repeat i16 and i8 several times until they can be
+// represented by v4f32 and then be manipulated by target suported shuffles.
+static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) {
+ EVT VT = V.getValueType();
+ int NumElems = VT.getVectorNumElements();
+ DebugLoc dl = V.getDebugLoc();
+
+ while (NumElems > 4) {
+ if (EltNo < NumElems/2) {
+ V = getUnpackl(DAG, dl, VT, V, V);
+ } else {
+ V = getUnpackh(DAG, dl, VT, V, V);
+ EltNo -= NumElems/2;
+ }
+ NumElems >>= 1;
+ }
+ return V;
+}
+
+/// getLegalSplat - Generate a legal splat with supported x86 shuffles
+static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
+ EVT VT = V.getValueType();
+ DebugLoc dl = V.getDebugLoc();
+
+ if (VT.is128BitVector()) {
+ V = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V);
+ int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
+ V = DAG.getVectorShuffle(MVT::v4f32, dl, V, DAG.getUNDEF(MVT::v4f32),
+ &SplatMask[0]);
+ } else if (VT.is256BitVector()) {
+ // To use VPERMILPS to splat scalars, the second half of indicies must
+ // refer to the higher part, which is a duplication of the lower one,
+ // because VPERMILPS can only handle in-lane permutations.
+ int SplatMask[8] = { EltNo, EltNo, EltNo, EltNo,
+ EltNo+4, EltNo+4, EltNo+4, EltNo+4 };
+
+ V = DAG.getNode(ISD::BITCAST, dl, MVT::v8f32, V);
+ V = DAG.getVectorShuffle(MVT::v8f32, dl, V, DAG.getUNDEF(MVT::v8f32),
+ &SplatMask[0]);
+ } else
+ llvm_unreachable("Vector size not supported");
+
+ return DAG.getNode(ISD::BITCAST, dl, VT, V);
+}
+
+/// PromoteSplat - Splat is promoted to target supported vector shuffles.
+static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
+ EVT SrcVT = SV->getValueType(0);
+ SDValue V1 = SV->getOperand(0);
+ DebugLoc dl = SV->getDebugLoc();
+
+ int EltNo = SV->getSplatIndex();
+ int NumElems = SrcVT.getVectorNumElements();
+ bool Is256BitVec = SrcVT.is256BitVector();
+
+ assert(((SrcVT.is128BitVector() && NumElems > 4) || Is256BitVec) &&
+ "Unknown how to promote splat for type");
+
+ // Extract the 128-bit part containing the splat element and update
+ // the splat element index when it refers to the higher register.
+ if (Is256BitVec) {
+ V1 = Extract128BitVector(V1, EltNo, DAG, dl);
+ if (EltNo >= NumElems/2)
+ EltNo -= NumElems/2;
+ }
+
+ // All i16 and i8 vector types can't be used directly by a generic shuffle
+ // instruction because the target has no such instruction. Generate shuffles
+ // which repeat i16 and i8 several times until they fit in i32, and then can
+ // be manipulated by target suported shuffles.
+ EVT EltVT = SrcVT.getVectorElementType();
+ if (EltVT == MVT::i8 || EltVT == MVT::i16)
+ V1 = PromoteSplati8i16(V1, DAG, EltNo);
+
+ // Recreate the 256-bit vector and place the same 128-bit vector
+ // into the low and high part. This is necessary because we want
+ // to use VPERM* to shuffle the vectors
+ if (Is256BitVec) {
+ V1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT, V1, V1);
+ }
+
+ return getLegalSplat(DAG, V1, EltNo);
+}
+
+/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
+/// vector of zero or undef vector. This produces a shuffle where the low
+/// element of V2 is swizzled into the zero/undef vector, landing at element
+/// Idx. This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
+static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
+ bool IsZero,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ EVT VT = V2.getValueType();
+ SDValue V1 = IsZero
+ ? getZeroVector(VT, Subtarget, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT);
+ unsigned NumElems = VT.getVectorNumElements();
+ SmallVector<int, 16> MaskVec;
+ for (unsigned i = 0; i != NumElems; ++i)
+ // If this is the insertion idx, put the low elt of V2 here.
+ MaskVec.push_back(i == Idx ? NumElems : i);
+ return DAG.getVectorShuffle(VT, V2.getDebugLoc(), V1, V2, &MaskVec[0]);
+}
+
+/// getTargetShuffleMask - Calculates the shuffle mask corresponding to the
+/// target specific opcode. Returns true if the Mask could be calculated.
+/// Sets IsUnary to true if only uses one source.
+static bool getTargetShuffleMask(SDNode *N, MVT VT,
+ SmallVectorImpl<int> &Mask, bool &IsUnary) {
+ unsigned NumElems = VT.getVectorNumElements();
+ SDValue ImmN;
+
+ IsUnary = false;
+ switch(N->getOpcode()) {
+ case X86ISD::SHUFP:
+ ImmN = N->getOperand(N->getNumOperands()-1);
+ DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ break;
+ case X86ISD::UNPCKH:
+ DecodeUNPCKHMask(VT, Mask);
+ break;
+ case X86ISD::UNPCKL:
+ DecodeUNPCKLMask(VT, Mask);
+ break;
+ case X86ISD::MOVHLPS:
+ DecodeMOVHLPSMask(NumElems, Mask);
+ break;
+ case X86ISD::MOVLHPS:
+ DecodeMOVLHPSMask(NumElems, Mask);
+ break;
+ case X86ISD::PALIGNR:
+ ImmN = N->getOperand(N->getNumOperands()-1);
+ DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ break;
+ case X86ISD::PSHUFD:
+ case X86ISD::VPERMILP:
+ ImmN = N->getOperand(N->getNumOperands()-1);
+ DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ IsUnary = true;
+ break;
+ case X86ISD::PSHUFHW:
+ ImmN = N->getOperand(N->getNumOperands()-1);
+ DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ IsUnary = true;
+ break;
+ case X86ISD::PSHUFLW:
+ ImmN = N->getOperand(N->getNumOperands()-1);
+ DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ IsUnary = true;
+ break;
+ case X86ISD::VPERMI:
+ ImmN = N->getOperand(N->getNumOperands()-1);
+ DecodeVPERMMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ IsUnary = true;
+ break;
+ case X86ISD::MOVSS:
+ case X86ISD::MOVSD: {
+ // The index 0 always comes from the first element of the second source,
+ // this is why MOVSS and MOVSD are used in the first place. The other
+ // elements come from the other positions of the first source vector
+ Mask.push_back(NumElems);
+ for (unsigned i = 1; i != NumElems; ++i) {
+ Mask.push_back(i);
+ }
+ break;
+ }
+ case X86ISD::VPERM2X128:
+ ImmN = N->getOperand(N->getNumOperands()-1);
+ DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ if (Mask.empty()) return false;
+ break;
+ case X86ISD::MOVDDUP:
+ case X86ISD::MOVLHPD:
+ case X86ISD::MOVLPD:
+ case X86ISD::MOVLPS:
+ case X86ISD::MOVSHDUP:
+ case X86ISD::MOVSLDUP:
+ // Not yet implemented
+ return false;
+ default: llvm_unreachable("unknown target shuffle node");
+ }
+
+ return true;
+}
+
+/// getShuffleScalarElt - Returns the scalar element that will make up the ith
+/// element of the result of the vector shuffle.
+static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
+ unsigned Depth) {
+ if (Depth == 6)
+ return SDValue(); // Limit search depth.
+
+ SDValue V = SDValue(N, 0);
+ EVT VT = V.getValueType();
+ unsigned Opcode = V.getOpcode();
+
+ // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
+ if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
+ int Elt = SV->getMaskElt(Index);
+
+ if (Elt < 0)
+ return DAG.getUNDEF(VT.getVectorElementType());
+
+ unsigned NumElems = VT.getVectorNumElements();
+ SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
+ : SV->getOperand(1);
+ return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
+ }
+
+ // Recurse into target specific vector shuffles to find scalars.
+ if (isTargetShuffle(Opcode)) {
+ MVT ShufVT = V.getValueType().getSimpleVT();
+ unsigned NumElems = ShufVT.getVectorNumElements();
+ SmallVector<int, 16> ShuffleMask;
+ bool IsUnary;
+
+ if (!getTargetShuffleMask(N, ShufVT, ShuffleMask, IsUnary))
+ return SDValue();
+
+ int Elt = ShuffleMask[Index];
+ if (Elt < 0)
+ return DAG.getUNDEF(ShufVT.getVectorElementType());
+
+ SDValue NewV = (Elt < (int)NumElems) ? N->getOperand(0)
+ : N->getOperand(1);
+ return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
+ Depth+1);
+ }
+
+ // Actual nodes that may contain scalar elements
+ if (Opcode == ISD::BITCAST) {
+ V = V.getOperand(0);
+ EVT SrcVT = V.getValueType();
+ unsigned NumElems = VT.getVectorNumElements();
+
+ if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
+ return SDValue();
+ }
+
+ if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
+ return (Index == 0) ? V.getOperand(0)
+ : DAG.getUNDEF(VT.getVectorElementType());
+
+ if (V.getOpcode() == ISD::BUILD_VECTOR)
+ return V.getOperand(Index);
+
+ return SDValue();
+}
+
+/// getNumOfConsecutiveZeros - Return the number of elements of a vector
+/// shuffle operation which come from a consecutively from a zero. The
+/// search can start in two different directions, from left or right.
+static
+unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp, unsigned NumElems,
+ bool ZerosFromLeft, SelectionDAG &DAG) {
+ unsigned i;
+ for (i = 0; i != NumElems; ++i) {
+ unsigned Index = ZerosFromLeft ? i : NumElems-i-1;
+ SDValue Elt = getShuffleScalarElt(SVOp, Index, DAG, 0);
+ if (!(Elt.getNode() &&
+ (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt))))
+ break;
+ }
+
+ return i;
+}
+
+/// isShuffleMaskConsecutive - Check if the shuffle mask indicies [MaskI, MaskE)
+/// correspond consecutively to elements from one of the vector operands,
+/// starting from its index OpIdx. Also tell OpNum which source vector operand.
+static
+bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp,
+ unsigned MaskI, unsigned MaskE, unsigned OpIdx,
+ unsigned NumElems, unsigned &OpNum) {
+ bool SeenV1 = false;
+ bool SeenV2 = false;
+
+ for (unsigned i = MaskI; i != MaskE; ++i, ++OpIdx) {
+ int Idx = SVOp->getMaskElt(i);
+ // Ignore undef indicies
+ if (Idx < 0)
+ continue;
+
+ if (Idx < (int)NumElems)
+ SeenV1 = true;
+ else
+ SeenV2 = true;
+
+ // Only accept consecutive elements from the same vector
+ if ((Idx % NumElems != OpIdx) || (SeenV1 && SeenV2))
+ return false;
+ }
+
+ OpNum = SeenV1 ? 0 : 1;
+ return true;
+}
+
+/// isVectorShiftRight - Returns true if the shuffle can be implemented as a
+/// logical left shift of a vector.
+static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
+ bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
+ unsigned NumElems = SVOp->getValueType(0).getVectorNumElements();
+ unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems,
+ false /* check zeros from right */, DAG);
+ unsigned OpSrc;
+
+ if (!NumZeros)
+ return false;
+
+ // Considering the elements in the mask that are not consecutive zeros,
+ // check if they consecutively come from only one of the source vectors.
+ //
+ // V1 = {X, A, B, C} 0
+ // \ \ \ /
+ // vector_shuffle V1, V2 <1, 2, 3, X>
+ //
+ if (!isShuffleMaskConsecutive(SVOp,
+ 0, // Mask Start Index
+ NumElems-NumZeros, // Mask End Index(exclusive)
+ NumZeros, // Where to start looking in the src vector
+ NumElems, // Number of elements in vector
+ OpSrc)) // Which source operand ?
+ return false;
+
+ isLeft = false;
+ ShAmt = NumZeros;
+ ShVal = SVOp->getOperand(OpSrc);
+ return true;
+}
+
+/// isVectorShiftLeft - Returns true if the shuffle can be implemented as a
+/// logical left shift of a vector.
+static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
+ bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
+ unsigned NumElems = SVOp->getValueType(0).getVectorNumElements();
+ unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems,
+ true /* check zeros from left */, DAG);
+ unsigned OpSrc;
+
+ if (!NumZeros)
+ return false;
+
+ // Considering the elements in the mask that are not consecutive zeros,
+ // check if they consecutively come from only one of the source vectors.
+ //
+ // 0 { A, B, X, X } = V2
+ // / \ / /
+ // vector_shuffle V1, V2 <X, X, 4, 5>
+ //
+ if (!isShuffleMaskConsecutive(SVOp,
+ NumZeros, // Mask Start Index
+ NumElems, // Mask End Index(exclusive)
+ 0, // Where to start looking in the src vector
+ NumElems, // Number of elements in vector
+ OpSrc)) // Which source operand ?
+ return false;
+
+ isLeft = true;
+ ShAmt = NumZeros;
+ ShVal = SVOp->getOperand(OpSrc);
+ return true;
+}
+
+/// isVectorShift - Returns true if the shuffle can be implemented as a
+/// logical left or right shift of a vector.
+static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
+ bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
+ // Although the logic below support any bitwidth size, there are no
+ // shift instructions which handle more than 128-bit vectors.
+ if (!SVOp->getValueType(0).is128BitVector())
+ return false;
+
+ if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) ||
+ isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt))
+ return true;
+
+ return false;
+}
+
+/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
+///
+static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
+ unsigned NumNonZero, unsigned NumZero,
+ SelectionDAG &DAG,
+ const X86Subtarget* Subtarget,
+ const TargetLowering &TLI) {
+ if (NumNonZero > 8)
+ return SDValue();
+
+ DebugLoc dl = Op.getDebugLoc();
+ SDValue V(0, 0);
+ bool First = true;
+ for (unsigned i = 0; i < 16; ++i) {
+ bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
+ if (ThisIsNonZero && First) {
+ if (NumZero)
+ V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
+ else
+ V = DAG.getUNDEF(MVT::v8i16);
+ First = false;
+ }
+
+ if ((i & 1) != 0) {
+ SDValue ThisElt(0, 0), LastElt(0, 0);
+ bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
+ if (LastIsNonZero) {
+ LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
+ MVT::i16, Op.getOperand(i-1));
+ }
+ if (ThisIsNonZero) {
+ ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
+ ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
+ ThisElt, DAG.getConstant(8, MVT::i8));
+ if (LastIsNonZero)
+ ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
+ } else
+ ThisElt = LastElt;
+
+ if (ThisElt.getNode())
+ V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
+ DAG.getIntPtrConstant(i/2));
+ }
+ }
+
+ return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V);
+}
+
+/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16.
+///
+static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
+ unsigned NumNonZero, unsigned NumZero,
+ SelectionDAG &DAG,
+ const X86Subtarget* Subtarget,
+ const TargetLowering &TLI) {
+ if (NumNonZero > 4)
+ return SDValue();
+
+ DebugLoc dl = Op.getDebugLoc();
+ SDValue V(0, 0);
+ bool First = true;
+ for (unsigned i = 0; i < 8; ++i) {
+ bool isNonZero = (NonZeros & (1 << i)) != 0;
+ if (isNonZero) {
+ if (First) {
+ if (NumZero)
+ V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
+ else
+ V = DAG.getUNDEF(MVT::v8i16);
+ First = false;
+ }
+ V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
+ MVT::v8i16, V, Op.getOperand(i),
+ DAG.getIntPtrConstant(i));
+ }
+ }
+
+ return V;
+}
+
+/// getVShift - Return a vector logical shift node.
+///
+static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
+ unsigned NumBits, SelectionDAG &DAG,
+ const TargetLowering &TLI, DebugLoc dl) {
+ assert(VT.is128BitVector() && "Unknown type for VShift");
+ EVT ShVT = MVT::v2i64;
+ unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
+ SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp);
+ return DAG.getNode(ISD::BITCAST, dl, VT,
+ DAG.getNode(Opc, dl, ShVT, SrcOp,
+ DAG.getConstant(NumBits,
+ TLI.getScalarShiftAmountTy(SrcOp.getValueType()))));
+}
+
+SDValue
+X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl,
+ SelectionDAG &DAG) const {
+
+ // Check if the scalar load can be widened into a vector load. And if
+ // the address is "base + cst" see if the cst can be "absorbed" into
+ // the shuffle mask.
+ if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
+ SDValue Ptr = LD->getBasePtr();
+ if (!ISD::isNormalLoad(LD) || LD->isVolatile())
+ return SDValue();
+ EVT PVT = LD->getValueType(0);
+ if (PVT != MVT::i32 && PVT != MVT::f32)
+ return SDValue();
+
+ int FI = -1;
+ int64_t Offset = 0;
+ if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
+ FI = FINode->getIndex();
+ Offset = 0;
+ } else if (DAG.isBaseWithConstantOffset(Ptr) &&
+ isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
+ FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
+ Offset = Ptr.getConstantOperandVal(1);
+ Ptr = Ptr.getOperand(0);
+ } else {
+ return SDValue();
+ }
+
+ // FIXME: 256-bit vector instructions don't require a strict alignment,
+ // improve this code to support it better.
+ unsigned RequiredAlign = VT.getSizeInBits()/8;
+ SDValue Chain = LD->getChain();
+ // Make sure the stack object alignment is at least 16 or 32.
+ MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+ if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
+ if (MFI->isFixedObjectIndex(FI)) {
+ // Can't change the alignment. FIXME: It's possible to compute
+ // the exact stack offset and reference FI + adjust offset instead.
+ // If someone *really* cares about this. That's the way to implement it.
+ return SDValue();
+ } else {
+ MFI->setObjectAlignment(FI, RequiredAlign);
+ }
+ }
+
+ // (Offset % 16 or 32) must be multiple of 4. Then address is then
+ // Ptr + (Offset & ~15).
+ if (Offset < 0)
+ return SDValue();
+ if ((Offset % RequiredAlign) & 3)
+ return SDValue();
+ int64_t StartOffset = Offset & ~(RequiredAlign-1);
+ if (StartOffset)
+ Ptr = DAG.getNode(ISD::ADD, Ptr.getDebugLoc(), Ptr.getValueType(),
+ Ptr,DAG.getConstant(StartOffset, Ptr.getValueType()));
+
+ int EltNo = (Offset - StartOffset) >> 2;
+ unsigned NumElems = VT.getVectorNumElements();
+
+ EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
+ SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
+ LD->getPointerInfo().getWithOffset(StartOffset),
+ false, false, false, 0);
+
+ SmallVector<int, 8> Mask;
+ for (unsigned i = 0; i != NumElems; ++i)
+ Mask.push_back(EltNo);
+
+ return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]);
+ }
+
+ return SDValue();
+}
+
+/// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a
+/// vector of type 'VT', see if the elements can be replaced by a single large
+/// load which has the same value as a build_vector whose operands are 'elts'.
+///
+/// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a
+///
+/// FIXME: we'd also like to handle the case where the last elements are zero
+/// rather than undef via VZEXT_LOAD, but we do not detect that case today.
+/// There's even a handy isZeroNode for that purpose.
+static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
+ DebugLoc &DL, SelectionDAG &DAG) {
+ EVT EltVT = VT.getVectorElementType();
+ unsigned NumElems = Elts.size();
+
+ LoadSDNode *LDBase = NULL;
+ unsigned LastLoadedElt = -1U;
+
+ // For each element in the initializer, see if we've found a load or an undef.
+ // If we don't find an initial load element, or later load elements are
+ // non-consecutive, bail out.
+ for (unsigned i = 0; i < NumElems; ++i) {
+ SDValue Elt = Elts[i];
+
+ if (!Elt.getNode() ||
+ (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
+ return SDValue();
+ if (!LDBase) {
+ if (Elt.getNode()->getOpcode() == ISD::UNDEF)
+ return SDValue();
+ LDBase = cast<LoadSDNode>(Elt.getNode());
+ LastLoadedElt = i;
+ continue;
+ }
+ if (Elt.getOpcode() == ISD::UNDEF)
+ continue;
+
+ LoadSDNode *LD = cast<LoadSDNode>(Elt);
+ if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i))
+ return SDValue();
+ LastLoadedElt = i;
+ }
+
+ // If we have found an entire vector of loads and undefs, then return a large
+ // load of the entire vector width starting at the base pointer. If we found
+ // consecutive loads for the low half, generate a vzext_load node.
+ if (LastLoadedElt == NumElems - 1) {
+ if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16)
+ return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
+ LDBase->getPointerInfo(),
+ LDBase->isVolatile(), LDBase->isNonTemporal(),
+ LDBase->isInvariant(), 0);
+ return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
+ LDBase->getPointerInfo(),
+ LDBase->isVolatile(), LDBase->isNonTemporal(),
+ LDBase->isInvariant(), LDBase->getAlignment());
+ }
+ if (NumElems == 4 && LastLoadedElt == 1 &&
+ DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
+ SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
+ SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
+ SDValue ResNode =
+ DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops,
+ array_lengthof(Ops), MVT::i64,
+ LDBase->getPointerInfo(),
+ LDBase->getAlignment(),
+ false/*isVolatile*/, true/*ReadMem*/,
+ false/*WriteMem*/);
+
+ // Make sure the newly-created LOAD is in the same position as LDBase in
+ // terms of dependency. We create a TokenFactor for LDBase and ResNode, and
+ // update uses of LDBase's output chain to use the TokenFactor.
+ if (LDBase->hasAnyUseOfValue(1)) {
+ SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
+ SDValue(LDBase, 1), SDValue(ResNode.getNode(), 1));
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
+ DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
+ SDValue(ResNode.getNode(), 1));
+ }
+
+ return DAG.getNode(ISD::BITCAST, DL, VT, ResNode);
+ }
+ return SDValue();
+}
+
+/// LowerVectorBroadcast - Attempt to use the vbroadcast instruction
+/// to generate a splat value for the following cases:
+/// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant.
+/// 2. A splat shuffle which uses a scalar_to_vector node which comes from
+/// a scalar load, or a constant.
+/// The VBROADCAST node is returned when a pattern is found,
+/// or SDValue() otherwise.
+SDValue
+X86TargetLowering::LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const {
+ if (!Subtarget->hasFp256())
+ return SDValue();
+
+ MVT VT = Op.getValueType().getSimpleVT();
+ DebugLoc dl = Op.getDebugLoc();
+
+ assert((VT.is128BitVector() || VT.is256BitVector()) &&
+ "Unsupported vector type for broadcast.");
+
+ SDValue Ld;
+ bool ConstSplatVal;
+
+ switch (Op.getOpcode()) {
+ default:
+ // Unknown pattern found.
+ return SDValue();
+
+ case ISD::BUILD_VECTOR: {
+ // The BUILD_VECTOR node must be a splat.
+ if (!isSplatVector(Op.getNode()))
+ return SDValue();
+
+ Ld = Op.getOperand(0);
+ ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
+ Ld.getOpcode() == ISD::ConstantFP);
+
+ // The suspected load node has several users. Make sure that all
+ // of its users are from the BUILD_VECTOR node.
+ // Constants may have multiple users.
+ if (!ConstSplatVal && !Ld->hasNUsesOfValue(VT.getVectorNumElements(), 0))
+ return SDValue();
+ break;
+ }
+
+ case ISD::VECTOR_SHUFFLE: {
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+
+ // Shuffles must have a splat mask where the first element is
+ // broadcasted.
+ if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0)
+ return SDValue();
+
+ SDValue Sc = Op.getOperand(0);
+ if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR &&
+ Sc.getOpcode() != ISD::BUILD_VECTOR) {
+
+ if (!Subtarget->hasInt256())
+ return SDValue();
+
+ // Use the register form of the broadcast instruction available on AVX2.
+ if (VT.is256BitVector())
+ Sc = Extract128BitVector(Sc, 0, DAG, dl);
+ return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc);
+ }
+
+ Ld = Sc.getOperand(0);
+ ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
+ Ld.getOpcode() == ISD::ConstantFP);
+
+ // The scalar_to_vector node and the suspected
+ // load node must have exactly one user.
+ // Constants may have multiple users.
+ if (!ConstSplatVal && (!Sc.hasOneUse() || !Ld.hasOneUse()))
+ return SDValue();
+ break;
+ }
+ }
+
+ bool Is256 = VT.is256BitVector();
+
+ // Handle the broadcasting a single constant scalar from the constant pool
+ // into a vector. On Sandybridge it is still better to load a constant vector
+ // from the constant pool and not to broadcast it from a scalar.
+ if (ConstSplatVal && Subtarget->hasInt256()) {
+ EVT CVT = Ld.getValueType();
+ assert(!CVT.isVector() && "Must not broadcast a vector type");
+ unsigned ScalarSize = CVT.getSizeInBits();
+
+ if (ScalarSize == 32 || (Is256 && ScalarSize == 64)) {
+ const Constant *C = 0;
+ if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
+ C = CI->getConstantIntValue();
+ else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
+ C = CF->getConstantFPValue();
+
+ assert(C && "Invalid constant type");
+
+ SDValue CP = DAG.getConstantPool(C, getPointerTy());
+ unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
+ Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP,
+ MachinePointerInfo::getConstantPool(),
+ false, false, false, Alignment);
+
+ return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
+ }
+ }
+
+ bool IsLoad = ISD::isNormalLoad(Ld.getNode());
+ unsigned ScalarSize = Ld.getValueType().getSizeInBits();
+
+ // Handle AVX2 in-register broadcasts.
+ if (!IsLoad && Subtarget->hasInt256() &&
+ (ScalarSize == 32 || (Is256 && ScalarSize == 64)))
+ return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
+
+ // The scalar source must be a normal load.
+ if (!IsLoad)
+ return SDValue();
+
+ if (ScalarSize == 32 || (Is256 && ScalarSize == 64))
+ return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
+
+ // The integer check is needed for the 64-bit into 128-bit so it doesn't match
+ // double since there is no vbroadcastsd xmm
+ if (Subtarget->hasInt256() && Ld.getValueType().isInteger()) {
+ if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
+ return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
+ }
+
+ // Unsupported broadcast.
+ return SDValue();
+}
+
+SDValue
+X86TargetLowering::buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+
+ // Skip if insert_vec_elt is not supported.
+ if (!isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
+ return SDValue();
+
+ DebugLoc DL = Op.getDebugLoc();
+ unsigned NumElems = Op.getNumOperands();
+
+ SDValue VecIn1;
+ SDValue VecIn2;
+ SmallVector<unsigned, 4> InsertIndices;
+ SmallVector<int, 8> Mask(NumElems, -1);
+
+ for (unsigned i = 0; i != NumElems; ++i) {
+ unsigned Opc = Op.getOperand(i).getOpcode();
+
+ if (Opc == ISD::UNDEF)
+ continue;
+
+ if (Opc != ISD::EXTRACT_VECTOR_ELT) {
+ // Quit if more than 1 elements need inserting.
+ if (InsertIndices.size() > 1)
+ return SDValue();
+
+ InsertIndices.push_back(i);
+ continue;
+ }
+
+ SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
+ SDValue ExtIdx = Op.getOperand(i).getOperand(1);
+
+ // Quit if extracted from vector of different type.
+ if (ExtractedFromVec.getValueType() != VT)
+ return SDValue();
+
+ // Quit if non-constant index.
+ if (!isa<ConstantSDNode>(ExtIdx))
+ return SDValue();
+
+ if (VecIn1.getNode() == 0)
+ VecIn1 = ExtractedFromVec;
+ else if (VecIn1 != ExtractedFromVec) {
+ if (VecIn2.getNode() == 0)
+ VecIn2 = ExtractedFromVec;
+ else if (VecIn2 != ExtractedFromVec)
+ // Quit if more than 2 vectors to shuffle
+ return SDValue();
+ }
+
+ unsigned Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
+
+ if (ExtractedFromVec == VecIn1)
+ Mask[i] = Idx;
+ else if (ExtractedFromVec == VecIn2)
+ Mask[i] = Idx + NumElems;
+ }
+
+ if (VecIn1.getNode() == 0)
+ return SDValue();
+
+ VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
+ SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, &Mask[0]);
+ for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) {
+ unsigned Idx = InsertIndices[i];
+ NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
+ DAG.getIntPtrConstant(Idx));
+ }
+
+ return NV;
+}
+
+SDValue
+X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
+ DebugLoc dl = Op.getDebugLoc();
+
+ MVT VT = Op.getValueType().getSimpleVT();
+ MVT ExtVT = VT.getVectorElementType();
+ unsigned NumElems = Op.getNumOperands();
+
+ // Vectors containing all zeros can be matched by pxor and xorps later
+ if (ISD::isBuildVectorAllZeros(Op.getNode())) {
+ // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
+ // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
+ if (VT == MVT::v4i32 || VT == MVT::v8i32)
+ return Op;
+
+ return getZeroVector(VT, Subtarget, DAG, dl);
+ }
+
+ // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
+ // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
+ // vpcmpeqd on 256-bit vectors.
+ if (Subtarget->hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
+ if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasInt256()))
+ return Op;
+
+ return getOnesVector(VT, Subtarget->hasInt256(), DAG, dl);
+ }
+
+ SDValue Broadcast = LowerVectorBroadcast(Op, DAG);
+ if (Broadcast.getNode())
+ return Broadcast;
+
+ unsigned EVTBits = ExtVT.getSizeInBits();
+
+ unsigned NumZero = 0;
+ unsigned NumNonZero = 0;
+ unsigned NonZeros = 0;
+ bool IsAllConstants = true;
+ SmallSet<SDValue, 8> Values;
+ for (unsigned i = 0; i < NumElems; ++i) {
+ SDValue Elt = Op.getOperand(i);
+ if (Elt.getOpcode() == ISD::UNDEF)
+ continue;
+ Values.insert(Elt);
+ if (Elt.getOpcode() != ISD::Constant &&
+ Elt.getOpcode() != ISD::ConstantFP)
+ IsAllConstants = false;
+ if (X86::isZeroNode(Elt))
+ NumZero++;
+ else {
+ NonZeros |= (1 << i);
+ NumNonZero++;
+ }
+ }
+
+ // All undef vector. Return an UNDEF. All zero vectors were handled above.
+ if (NumNonZero == 0)
+ return DAG.getUNDEF(VT);
+
+ // Special case for single non-zero, non-undef, element.
+ if (NumNonZero == 1) {
+ unsigned Idx = CountTrailingZeros_32(NonZeros);
+ SDValue Item = Op.getOperand(Idx);
+
+ // If this is an insertion of an i64 value on x86-32, and if the top bits of
+ // the value are obviously zero, truncate the value to i32 and do the
+ // insertion that way. Only do this if the value is non-constant or if the
+ // value is a constant being inserted into element 0. It is cheaper to do
+ // a constant pool load than it is to do a movd + shuffle.
+ if (ExtVT == MVT::i64 && !Subtarget->is64Bit() &&
+ (!IsAllConstants || Idx == 0)) {
+ if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
+ // Handle SSE only.
+ assert(VT == MVT::v2i64 && "Expected an SSE value type!");
+ EVT VecVT = MVT::v4i32;
+ unsigned VecElts = 4;
+
+ // Truncate the value (which may itself be a constant) to i32, and
+ // convert it to a vector with movd (S2V+shuffle to zero extend).
+ Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
+ Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
+ Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
+
+ // Now we have our 32-bit value zero extended in the low element of
+ // a vector. If Idx != 0, swizzle it into place.
+ if (Idx != 0) {
+ SmallVector<int, 4> Mask;
+ Mask.push_back(Idx);
+ for (unsigned i = 1; i != VecElts; ++i)
+ Mask.push_back(i);
+ Item = DAG.getVectorShuffle(VecVT, dl, Item, DAG.getUNDEF(VecVT),
+ &Mask[0]);
+ }
+ return DAG.getNode(ISD::BITCAST, dl, VT, Item);
+ }
+ }
+
+ // If we have a constant or non-constant insertion into the low element of
+ // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
+ // the rest of the elements. This will be matched as movd/movq/movss/movsd
+ // depending on what the source datatype is.
+ if (Idx == 0) {
+ if (NumZero == 0)
+ return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
+
+ if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
+ (ExtVT == MVT::i64 && Subtarget->is64Bit())) {
+ if (VT.is256BitVector()) {
+ SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl);
+ return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec,
+ Item, DAG.getIntPtrConstant(0));
+ }
+ assert(VT.is128BitVector() && "Expected an SSE value type!");
+ Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
+ // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
+ return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
+ }
+
+ if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
+ Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
+ Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
+ if (VT.is256BitVector()) {
+ SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl);
+ Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl);
+ } else {
+ assert(VT.is128BitVector() && "Expected an SSE value type!");
+ Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
+ }
+ return DAG.getNode(ISD::BITCAST, dl, VT, Item);
+ }
+ }
+
+ // Is it a vector logical left shift?
+ if (NumElems == 2 && Idx == 1 &&
+ X86::isZeroNode(Op.getOperand(0)) &&
+ !X86::isZeroNode(Op.getOperand(1))) {
+ unsigned NumBits = VT.getSizeInBits();
+ return getVShift(true, VT,
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
+ VT, Op.getOperand(1)),
+ NumBits/2, DAG, *this, dl);
+ }
+
+ if (IsAllConstants) // Otherwise, it's better to do a constpool load.
+ return SDValue();
+
+ // Otherwise, if this is a vector with i32 or f32 elements, and the element
+ // is a non-constant being inserted into an element other than the low one,
+ // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
+ // movd/movss) to move this into the low element, then shuffle it into
+ // place.
+ if (EVTBits == 32) {
+ Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
+
+ // Turn it into a shuffle of zero and zero-extended scalar to vector.
+ Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, Subtarget, DAG);
+ SmallVector<int, 8> MaskVec;
+ for (unsigned i = 0; i != NumElems; ++i)
+ MaskVec.push_back(i == Idx ? 0 : 1);
+ return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]);
+ }
+ }
+
+ // Splat is obviously ok. Let legalizer expand it to a shuffle.
+ if (Values.size() == 1) {
+ if (EVTBits == 32) {
+ // Instead of a shuffle like this:
+ // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
+ // Check if it's possible to issue this instead.
+ // shuffle (vload ptr)), undef, <1, 1, 1, 1>
+ unsigned Idx = CountTrailingZeros_32(NonZeros);
+ SDValue Item = Op.getOperand(Idx);
+ if (Op.getNode()->isOnlyUserOf(Item.getNode()))
+ return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
+ }
+ return SDValue();
+ }
+
+ // A vector full of immediates; various special cases are already
+ // handled, so this is best done with a single constant-pool load.
+ if (IsAllConstants)
+ return SDValue();
+
+ // For AVX-length vectors, build the individual 128-bit pieces and use
+ // shuffles to put them in place.
+ if (VT.is256BitVector()) {
+ SmallVector<SDValue, 32> V;
+ for (unsigned i = 0; i != NumElems; ++i)
+ V.push_back(Op.getOperand(i));
+
+ EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
+
+ // Build both the lower and upper subvector.
+ SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[0], NumElems/2);
+ SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[NumElems / 2],
+ NumElems/2);
+
+ // Recreate the wider vector with the lower and upper part.
+ return Concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
+ }
+
+ // Let legalizer expand 2-wide build_vectors.
+ if (EVTBits == 64) {
+ if (NumNonZero == 1) {
+ // One half is zero or undef.
+ unsigned Idx = CountTrailingZeros_32(NonZeros);
+ SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
+ Op.getOperand(Idx));
+ return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
+ }
+ return SDValue();
+ }
+
+ // If element VT is < 32 bits, convert it to inserts into a zero vector.
+ if (EVTBits == 8 && NumElems == 16) {
+ SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG,
+ Subtarget, *this);
+ if (V.getNode()) return V;
+ }
+
+ if (EVTBits == 16 && NumElems == 8) {
+ SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG,
+ Subtarget, *this);
+ if (V.getNode()) return V;
+ }
+
+ // If element VT is == 32 bits, turn it into a number of shuffles.
+ SmallVector<SDValue, 8> V(NumElems);
+ if (NumElems == 4 && NumZero > 0) {
+ for (unsigned i = 0; i < 4; ++i) {
+ bool isZero = !(NonZeros & (1 << i));
+ if (isZero)
+ V[i] = getZeroVector(VT, Subtarget, DAG, dl);
+ else
+ V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
+ }
+
+ for (unsigned i = 0; i < 2; ++i) {
+ switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
+ default: break;
+ case 0:
+ V[i] = V[i*2]; // Must be a zero vector.
+ break;
+ case 1:
+ V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]);
+ break;
+ case 2:
+ V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]);
+ break;
+ case 3:
+ V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]);
+ break;
+ }
+ }
+
+ bool Reverse1 = (NonZeros & 0x3) == 2;
+ bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
+ int MaskVec[] = {
+ Reverse1 ? 1 : 0,
+ Reverse1 ? 0 : 1,
+ static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
+ static_cast<int>(Reverse2 ? NumElems : NumElems+1)
+ };
+ return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]);
+ }
+
+ if (Values.size() > 1 && VT.is128BitVector()) {
+ // Check for a build vector of consecutive loads.
+ for (unsigned i = 0; i < NumElems; ++i)
+ V[i] = Op.getOperand(i);
+
+ // Check for elements which are consecutive loads.
+ SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG);
+ if (LD.getNode())
+ return LD;
+
+ // Check for a build vector from mostly shuffle plus few inserting.
+ SDValue Sh = buildFromShuffleMostly(Op, DAG);
+ if (Sh.getNode())
+ return Sh;
+
+ // For SSE 4.1, use insertps to put the high elements into the low element.
+ if (getSubtarget()->hasSSE41()) {
+ SDValue Result;
+ if (Op.getOperand(0).getOpcode() != ISD::UNDEF)
+ Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
+ else
+ Result = DAG.getUNDEF(VT);
+
+ for (unsigned i = 1; i < NumElems; ++i) {
+ if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue;
+ Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
+ Op.getOperand(i), DAG.getIntPtrConstant(i));
+ }
+ return Result;
+ }
+
+ // Otherwise, expand into a number of unpckl*, start by extending each of
+ // our (non-undef) elements to the full vector width with the element in the
+ // bottom slot of the vector (which generates no code for SSE).
+ for (unsigned i = 0; i < NumElems; ++i) {
+ if (Op.getOperand(i).getOpcode() != ISD::UNDEF)
+ V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
+ else
+ V[i] = DAG.getUNDEF(VT);
+ }
+
+ // Next, we iteratively mix elements, e.g. for v4f32:
+ // Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
+ // : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
+ // Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
+ unsigned EltStride = NumElems >> 1;
+ while (EltStride != 0) {
+ for (unsigned i = 0; i < EltStride; ++i) {
+ // If V[i+EltStride] is undef and this is the first round of mixing,
+ // then it is safe to just drop this shuffle: V[i] is already in the
+ // right place, the one element (since it's the first round) being
+ // inserted as undef can be dropped. This isn't safe for successive
+ // rounds because they will permute elements within both vectors.
+ if (V[i+EltStride].getOpcode() == ISD::UNDEF &&
+ EltStride == NumElems/2)
+ continue;
+
+ V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]);
+ }
+ EltStride >>= 1;
+ }
+ return V[0];
+ }
+ return SDValue();
+}
+
+// LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction
+// to create 256-bit vectors from two other 128-bit ones.
+static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
+ DebugLoc dl = Op.getDebugLoc();
+ MVT ResVT = Op.getValueType().getSimpleVT();
+
+ assert(ResVT.is256BitVector() && "Value type must be 256-bit wide");
+
+ SDValue V1 = Op.getOperand(0);
+ SDValue V2 = Op.getOperand(1);
+ unsigned NumElems = ResVT.getVectorNumElements();
+
+ return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
+}
+
+static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
+ assert(Op.getNumOperands() == 2);
+
+ // 256-bit AVX can use the vinsertf128 instruction to create 256-bit vectors
+ // from two other 128-bit ones.
+ return LowerAVXCONCAT_VECTORS(Op, DAG);
+}
+
+// Try to lower a shuffle node into a simple blend instruction.
+static SDValue
+LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp,
+ const X86Subtarget *Subtarget, SelectionDAG &DAG) {
+ SDValue V1 = SVOp->getOperand(0);
+ SDValue V2 = SVOp->getOperand(1);
+ DebugLoc dl = SVOp->getDebugLoc();
+ MVT VT = SVOp->getValueType(0).getSimpleVT();
+ MVT EltVT = VT.getVectorElementType();
+ unsigned NumElems = VT.getVectorNumElements();
+
+ if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
+ return SDValue();
+ if (!Subtarget->hasInt256() && VT == MVT::v16i16)
+ return SDValue();
+
+ // Check the mask for BLEND and build the value.
+ unsigned MaskValue = 0;
+ // There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
+ unsigned NumLanes = (NumElems-1)/8 + 1;
+ unsigned NumElemsInLane = NumElems / NumLanes;
+
+ // Blend for v16i16 should be symetric for the both lanes.
+ for (unsigned i = 0; i < NumElemsInLane; ++i) {
+
+ int SndLaneEltIdx = (NumLanes == 2) ?
+ SVOp->getMaskElt(i + NumElemsInLane) : -1;
+ int EltIdx = SVOp->getMaskElt(i);
+
+ if ((EltIdx < 0 || EltIdx == (int)i) &&
+ (SndLaneEltIdx < 0 || SndLaneEltIdx == (int)(i + NumElemsInLane)))
+ continue;
+
+ if (((unsigned)EltIdx == (i + NumElems)) &&
+ (SndLaneEltIdx < 0 ||
+ (unsigned)SndLaneEltIdx == i + NumElems + NumElemsInLane))
+ MaskValue |= (1<<i);
+ else
+ return SDValue();
+ }
+
+ // Convert i32 vectors to floating point if it is not AVX2.
+ // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors.
+ MVT BlendVT = VT;
+ if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) {
+ BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()),
+ NumElems);
+ V1 = DAG.getNode(ISD::BITCAST, dl, VT, V1);
+ V2 = DAG.getNode(ISD::BITCAST, dl, VT, V2);
+ }
+
+ SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, V1, V2,
+ DAG.getConstant(MaskValue, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
+}
+
+// v8i16 shuffles - Prefer shuffles in the following order:
+// 1. [all] pshuflw, pshufhw, optional move
+// 2. [ssse3] 1 x pshufb
+// 3. [ssse3] 2 x pshufb + 1 x por
+// 4. [all] mov + pshuflw + pshufhw + N x (pextrw + pinsrw)
+static SDValue
+LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+ SDValue V1 = SVOp->getOperand(0);
+ SDValue V2 = SVOp->getOperand(1);
+ DebugLoc dl = SVOp->getDebugLoc();
+ SmallVector<int, 8> MaskVals;
+
+ // Determine if more than 1 of the words in each of the low and high quadwords
+ // of the result come from the same quadword of one of the two inputs. Undef
+ // mask values count as coming from any quadword, for better codegen.
+ unsigned LoQuad[] = { 0, 0, 0, 0 };
+ unsigned HiQuad[] = { 0, 0, 0, 0 };
+ std::bitset<4> InputQuads;
+ for (unsigned i = 0; i < 8; ++i) {
+ unsigned *Quad = i < 4 ? LoQuad : HiQuad;
+ int EltIdx = SVOp->getMaskElt(i);
+ MaskVals.push_back(EltIdx);
+ if (EltIdx < 0) {
+ ++Quad[0];
+ ++Quad[1];
+ ++Quad[2];
+ ++Quad[3];
+ continue;
+ }
+ ++Quad[EltIdx / 4];
+ InputQuads.set(EltIdx / 4);
+ }
+
+ int BestLoQuad = -1;
+ unsigned MaxQuad = 1;
+ for (unsigned i = 0; i < 4; ++i) {
+ if (LoQuad[i] > MaxQuad) {
+ BestLoQuad = i;
+ MaxQuad = LoQuad[i];
+ }
+ }
+
+ int BestHiQuad = -1;
+ MaxQuad = 1;
+ for (unsigned i = 0; i < 4; ++i) {
+ if (HiQuad[i] > MaxQuad) {
+ BestHiQuad = i;
+ MaxQuad = HiQuad[i];
+ }
+ }
+
+ // For SSSE3, If all 8 words of the result come from only 1 quadword of each
+ // of the two input vectors, shuffle them into one input vector so only a
+ // single pshufb instruction is necessary. If There are more than 2 input
+ // quads, disable the next transformation since it does not help SSSE3.
+ bool V1Used = InputQuads[0] || InputQuads[1];
+ bool V2Used = InputQuads[2] || InputQuads[3];
+ if (Subtarget->hasSSSE3()) {
+ if (InputQuads.count() == 2 && V1Used && V2Used) {
+ BestLoQuad = InputQuads[0] ? 0 : 1;
+ BestHiQuad = InputQuads[2] ? 2 : 3;
+ }
+ if (InputQuads.count() > 2) {
+ BestLoQuad = -1;
+ BestHiQuad = -1;
+ }
+ }
+
+ // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update
+ // the shuffle mask. If a quad is scored as -1, that means that it contains
+ // words from all 4 input quadwords.
+ SDValue NewV;
+ if (BestLoQuad >= 0 || BestHiQuad >= 0) {
+ int MaskV[] = {
+ BestLoQuad < 0 ? 0 : BestLoQuad,
+ BestHiQuad < 0 ? 1 : BestHiQuad
+ };
+ NewV = DAG.getVectorShuffle(MVT::v2i64, dl,
+ DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1),
+ DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2), &MaskV[0]);
+ NewV = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, NewV);
+
+ // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the
+ // source words for the shuffle, to aid later transformations.
+ bool AllWordsInNewV = true;
+ bool InOrder[2] = { true, true };
+ for (unsigned i = 0; i != 8; ++i) {
+ int idx = MaskVals[i];
+ if (idx != (int)i)
+ InOrder[i/4] = false;
+ if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad)
+ continue;
+ AllWordsInNewV = false;
+ break;
+ }
+
+ bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV;
+ if (AllWordsInNewV) {
+ for (int i = 0; i != 8; ++i) {
+ int idx = MaskVals[i];
+ if (idx < 0)
+ continue;
+ idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4;
+ if ((idx != i) && idx < 4)
+ pshufhw = false;
+ if ((idx != i) && idx > 3)
+ pshuflw = false;
+ }
+ V1 = NewV;
+ V2Used = false;
+ BestLoQuad = 0;
+ BestHiQuad = 1;
+ }
+
+ // If we've eliminated the use of V2, and the new mask is a pshuflw or
+ // pshufhw, that's as cheap as it gets. Return the new shuffle.
+ if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) {
+ unsigned Opc = pshufhw ? X86ISD::PSHUFHW : X86ISD::PSHUFLW;
+ unsigned TargetMask = 0;
+ NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV,
+ DAG.getUNDEF(MVT::v8i16), &MaskVals[0]);
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
+ TargetMask = pshufhw ? getShufflePSHUFHWImmediate(SVOp):
+ getShufflePSHUFLWImmediate(SVOp);
+ V1 = NewV.getOperand(0);
+ return getTargetShuffleNode(Opc, dl, MVT::v8i16, V1, TargetMask, DAG);
+ }
+ }
+
+ // Promote splats to a larger type which usually leads to more efficient code.
+ // FIXME: Is this true if pshufb is available?
+ if (SVOp->isSplat())
+ return PromoteSplat(SVOp, DAG);
+
+ // If we have SSSE3, and all words of the result are from 1 input vector,
+ // case 2 is generated, otherwise case 3 is generated. If no SSSE3
+ // is present, fall back to case 4.
+ if (Subtarget->hasSSSE3()) {
+ SmallVector<SDValue,16> pshufbMask;
+
+ // If we have elements from both input vectors, set the high bit of the
+ // shuffle mask element to zero out elements that come from V2 in the V1
+ // mask, and elements that come from V1 in the V2 mask, so that the two
+ // results can be OR'd together.
+ bool TwoInputs = V1Used && V2Used;
+ for (unsigned i = 0; i != 8; ++i) {
+ int EltIdx = MaskVals[i] * 2;
+ int Idx0 = (TwoInputs && (EltIdx >= 16)) ? 0x80 : EltIdx;
+ int Idx1 = (TwoInputs && (EltIdx >= 16)) ? 0x80 : EltIdx+1;
+ pshufbMask.push_back(DAG.getConstant(Idx0, MVT::i8));
+ pshufbMask.push_back(DAG.getConstant(Idx1, MVT::i8));
+ }
+ V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V1);
+ V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
+ DAG.getNode(ISD::BUILD_VECTOR, dl,
+ MVT::v16i8, &pshufbMask[0], 16));
+ if (!TwoInputs)
+ return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
+
+ // Calculate the shuffle mask for the second input, shuffle it, and
+ // OR it with the first shuffled input.
+ pshufbMask.clear();
+ for (unsigned i = 0; i != 8; ++i) {
+ int EltIdx = MaskVals[i] * 2;
+ int Idx0 = (EltIdx < 16) ? 0x80 : EltIdx - 16;
+ int Idx1 = (EltIdx < 16) ? 0x80 : EltIdx - 15;
+ pshufbMask.push_back(DAG.getConstant(Idx0, MVT::i8));
+ pshufbMask.push_back(DAG.getConstant(Idx1, MVT::i8));
+ }
+ V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V2);
+ V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
+ DAG.getNode(ISD::BUILD_VECTOR, dl,
+ MVT::v16i8, &pshufbMask[0], 16));
+ V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
+ return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
+ }
+
+ // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order,
+ // and update MaskVals with new element order.
+ std::bitset<8> InOrder;
+ if (BestLoQuad >= 0) {
+ int MaskV[] = { -1, -1, -1, -1, 4, 5, 6, 7 };
+ for (int i = 0; i != 4; ++i) {
+ int idx = MaskVals[i];
+ if (idx < 0) {
+ InOrder.set(i);
+ } else if ((idx / 4) == BestLoQuad) {
+ MaskV[i] = idx & 3;
+ InOrder.set(i);
+ }
+ }
+ NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
+ &MaskV[0]);
+
+ if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) {
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
+ NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16,
+ NewV.getOperand(0),
+ getShufflePSHUFLWImmediate(SVOp), DAG);
+ }
+ }
+
+ // If BestHi >= 0, generate a pshufhw to put the high elements in order,
+ // and update MaskVals with the new element order.
+ if (BestHiQuad >= 0) {
+ int MaskV[] = { 0, 1, 2, 3, -1, -1, -1, -1 };
+ for (unsigned i = 4; i != 8; ++i) {
+ int idx = MaskVals[i];
+ if (idx < 0) {
+ InOrder.set(i);
+ } else if ((idx / 4) == BestHiQuad) {
+ MaskV[i] = (idx & 3) + 4;
+ InOrder.set(i);
+ }
+ }
+ NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
+ &MaskV[0]);
+
+ if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) {
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
+ NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16,
+ NewV.getOperand(0),
+ getShufflePSHUFHWImmediate(SVOp), DAG);
+ }
+ }
+
+ // In case BestHi & BestLo were both -1, which means each quadword has a word
+ // from each of the four input quadwords, calculate the InOrder bitvector now
+ // before falling through to the insert/extract cleanup.
+ if (BestLoQuad == -1 && BestHiQuad == -1) {
+ NewV = V1;
+ for (int i = 0; i != 8; ++i)
+ if (MaskVals[i] < 0 || MaskVals[i] == i)
+ InOrder.set(i);
+ }
+
+ // The other elements are put in the right place using pextrw and pinsrw.
+ for (unsigned i = 0; i != 8; ++i) {
+ if (InOrder[i])
+ continue;
+ int EltIdx = MaskVals[i];
+ if (EltIdx < 0)
+ continue;
+ SDValue ExtOp = (EltIdx < 8) ?
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1,
+ DAG.getIntPtrConstant(EltIdx)) :
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2,
+ DAG.getIntPtrConstant(EltIdx - 8));
+ NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp,
+ DAG.getIntPtrConstant(i));
+ }
+ return NewV;
+}
+
+// v16i8 shuffles - Prefer shuffles in the following order:
+// 1. [ssse3] 1 x pshufb
+// 2. [ssse3] 2 x pshufb + 1 x por
+// 3. [all] v8i16 shuffle + N x pextrw + rotate + pinsrw
+static
+SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
+ SelectionDAG &DAG,
+ const X86TargetLowering &TLI) {
+ SDValue V1 = SVOp->getOperand(0);
+ SDValue V2 = SVOp->getOperand(1);
+ DebugLoc dl = SVOp->getDebugLoc();
+ ArrayRef<int> MaskVals = SVOp->getMask();
+
+ // Promote splats to a larger type which usually leads to more efficient code.
+ // FIXME: Is this true if pshufb is available?
+ if (SVOp->isSplat())
+ return PromoteSplat(SVOp, DAG);
+
+ // If we have SSSE3, case 1 is generated when all result bytes come from
+ // one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is
+ // present, fall back to case 3.
+
+ // If SSSE3, use 1 pshufb instruction per vector with elements in the result.
+ if (TLI.getSubtarget()->hasSSSE3()) {
+ SmallVector<SDValue,16> pshufbMask;
+
+ // If all result elements are from one input vector, then only translate
+ // undef mask values to 0x80 (zero out result) in the pshufb mask.
+ //
+ // Otherwise, we have elements from both input vectors, and must zero out
+ // elements that come from V2 in the first mask, and V1 in the second mask
+ // so that we can OR them together.
+ for (unsigned i = 0; i != 16; ++i) {
+ int EltIdx = MaskVals[i];
+ if (EltIdx < 0 || EltIdx >= 16)
+ EltIdx = 0x80;
+ pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
+ }
+ V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
+ DAG.getNode(ISD::BUILD_VECTOR, dl,
+ MVT::v16i8, &pshufbMask[0], 16));
+
+ // As PSHUFB will zero elements with negative indices, it's safe to ignore
+ // the 2nd operand if it's undefined or zero.
+ if (V2.getOpcode() == ISD::UNDEF ||
+ ISD::isBuildVectorAllZeros(V2.getNode()))
+ return V1;
+
+ // Calculate the shuffle mask for the second input, shuffle it, and
+ // OR it with the first shuffled input.
+ pshufbMask.clear();
+ for (unsigned i = 0; i != 16; ++i) {
+ int EltIdx = MaskVals[i];
+ EltIdx = (EltIdx < 16) ? 0x80 : EltIdx - 16;
+ pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
+ }
+ V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
+ DAG.getNode(ISD::BUILD_VECTOR, dl,
+ MVT::v16i8, &pshufbMask[0], 16));
+ return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
+ }
+
+ // No SSSE3 - Calculate in place words and then fix all out of place words
+ // With 0-16 extracts & inserts. Worst case is 16 bytes out of order from
+ // the 16 different words that comprise the two doublequadword input vectors.
+ V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
+ V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2);
+ SDValue NewV = V1;
+ for (int i = 0; i != 8; ++i) {
+ int Elt0 = MaskVals[i*2];
+ int Elt1 = MaskVals[i*2+1];
+
+ // This word of the result is all undef, skip it.
+ if (Elt0 < 0 && Elt1 < 0)
+ continue;
+
+ // This word of the result is already in the correct place, skip it.
+ if ((Elt0 == i*2) && (Elt1 == i*2+1))
+ continue;
+
+ SDValue Elt0Src = Elt0 < 16 ? V1 : V2;
+ SDValue Elt1Src = Elt1 < 16 ? V1 : V2;
+ SDValue InsElt;
+
+ // If Elt0 and Elt1 are defined, are consecutive, and can be load
+ // using a single extract together, load it and store it.
+ if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) {
+ InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
+ DAG.getIntPtrConstant(Elt1 / 2));
+ NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
+ DAG.getIntPtrConstant(i));
+ continue;
+ }
+
+ // If Elt1 is defined, extract it from the appropriate source. If the
+ // source byte is not also odd, shift the extracted word left 8 bits
+ // otherwise clear the bottom 8 bits if we need to do an or.
+ if (Elt1 >= 0) {
+ InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
+ DAG.getIntPtrConstant(Elt1 / 2));
+ if ((Elt1 & 1) == 0)
+ InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt,
+ DAG.getConstant(8,
+ TLI.getShiftAmountTy(InsElt.getValueType())));
+ else if (Elt0 >= 0)
+ InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt,
+ DAG.getConstant(0xFF00, MVT::i16));
+ }
+ // If Elt0 is defined, extract it from the appropriate source. If the
+ // source byte is not also even, shift the extracted word right 8 bits. If
+ // Elt1 was also defined, OR the extracted values together before
+ // inserting them in the result.
+ if (Elt0 >= 0) {
+ SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
+ Elt0Src, DAG.getIntPtrConstant(Elt0 / 2));
+ if ((Elt0 & 1) != 0)
+ InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0,
+ DAG.getConstant(8,
+ TLI.getShiftAmountTy(InsElt0.getValueType())));
+ else if (Elt1 >= 0)
+ InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0,
+ DAG.getConstant(0x00FF, MVT::i16));
+ InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0)
+ : InsElt0;
+ }
+ NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
+ DAG.getIntPtrConstant(i));
+ }
+ return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, NewV);
+}
+
+// v32i8 shuffles - Translate to VPSHUFB if possible.
+static
+SDValue LowerVECTOR_SHUFFLEv32i8(ShuffleVectorSDNode *SVOp,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ MVT VT = SVOp->getValueType(0).getSimpleVT();
+ SDValue V1 = SVOp->getOperand(0);
+ SDValue V2 = SVOp->getOperand(1);
+ DebugLoc dl = SVOp->getDebugLoc();
+ SmallVector<int, 32> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end());
+
+ bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
+ bool V1IsAllZero = ISD::isBuildVectorAllZeros(V1.getNode());
+ bool V2IsAllZero = ISD::isBuildVectorAllZeros(V2.getNode());
+
+ // VPSHUFB may be generated if
+ // (1) one of input vector is undefined or zeroinitializer.
+ // The mask value 0x80 puts 0 in the corresponding slot of the vector.
+ // And (2) the mask indexes don't cross the 128-bit lane.
+ if (VT != MVT::v32i8 || !Subtarget->hasInt256() ||
+ (!V2IsUndef && !V2IsAllZero && !V1IsAllZero))
+ return SDValue();
+
+ if (V1IsAllZero && !V2IsAllZero) {
+ CommuteVectorShuffleMask(MaskVals, 32);
+ V1 = V2;
+ }
+ SmallVector<SDValue, 32> pshufbMask;
+ for (unsigned i = 0; i != 32; i++) {
+ int EltIdx = MaskVals[i];
+ if (EltIdx < 0 || EltIdx >= 32)
+ EltIdx = 0x80;
+ else {
+ if ((EltIdx >= 16 && i < 16) || (EltIdx < 16 && i >= 16))
+ // Cross lane is not allowed.
+ return SDValue();
+ EltIdx &= 0xf;
+ }
+ pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
+ }
+ return DAG.getNode(X86ISD::PSHUFB, dl, MVT::v32i8, V1,
+ DAG.getNode(ISD::BUILD_VECTOR, dl,
+ MVT::v32i8, &pshufbMask[0], 32));
+}
+
+/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide
+/// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be
+/// done when every pair / quad of shuffle mask elements point to elements in
+/// the right sequence. e.g.
+/// vector_shuffle X, Y, <2, 3, | 10, 11, | 0, 1, | 14, 15>
+static
+SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
+ SelectionDAG &DAG) {
+ MVT VT = SVOp->getValueType(0).getSimpleVT();
+ DebugLoc dl = SVOp->getDebugLoc();
+ unsigned NumElems = VT.getVectorNumElements();
+ MVT NewVT;
+ unsigned Scale;
+ switch (VT.SimpleTy) {
+ default: llvm_unreachable("Unexpected!");
+ case MVT::v4f32: NewVT = MVT::v2f64; Scale = 2; break;
+ case MVT::v4i32: NewVT = MVT::v2i64; Scale = 2; break;
+ case MVT::v8i16: NewVT = MVT::v4i32; Scale = 2; break;
+ case MVT::v16i8: NewVT = MVT::v4i32; Scale = 4; break;
+ case MVT::v16i16: NewVT = MVT::v8i32; Scale = 2; break;
+ case MVT::v32i8: NewVT = MVT::v8i32; Scale = 4; break;
+ }
+
+ SmallVector<int, 8> MaskVec;
+ for (unsigned i = 0; i != NumElems; i += Scale) {
+ int StartIdx = -1;
+ for (unsigned j = 0; j != Scale; ++j) {
+ int EltIdx = SVOp->getMaskElt(i+j);
+ if (EltIdx < 0)
+ continue;
+ if (StartIdx < 0)
+ StartIdx = (EltIdx / Scale);
+ if (EltIdx != (int)(StartIdx*Scale + j))
+ return SDValue();
+ }
+ MaskVec.push_back(StartIdx);
+ }
+
+ SDValue V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(0));
+ SDValue V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(1));
+ return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]);
+}
+
+/// getVZextMovL - Return a zero-extending vector move low node.
+///
+static SDValue getVZextMovL(MVT VT, EVT OpVT,
+ SDValue SrcOp, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget, DebugLoc dl) {
+ if (VT == MVT::v2f64 || VT == MVT::v4f32) {
+ LoadSDNode *LD = NULL;
+ if (!isScalarLoadToVector(SrcOp.getNode(), &LD))
+ LD = dyn_cast<LoadSDNode>(SrcOp);
+ if (!LD) {
+ // movssrr and movsdrr do not clear top bits. Try to use movd, movq
+ // instead.
+ MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32;
+ if ((ExtVT != MVT::i64 || Subtarget->is64Bit()) &&
+ SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR &&
+ SrcOp.getOperand(0).getOpcode() == ISD::BITCAST &&
+ SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) {
+ // PR2108
+ OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32;
+ return DAG.getNode(ISD::BITCAST, dl, VT,
+ DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
+ OpVT,
+ SrcOp.getOperand(0)
+ .getOperand(0))));
+ }
+ }
+ }
+
+ return DAG.getNode(ISD::BITCAST, dl, VT,
+ DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
+ DAG.getNode(ISD::BITCAST, dl,
+ OpVT, SrcOp)));
+}
+
+/// LowerVECTOR_SHUFFLE_256 - Handle all 256-bit wide vectors shuffles
+/// which could not be matched by any known target speficic shuffle
+static SDValue
+LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
+
+ SDValue NewOp = Compact8x32ShuffleNode(SVOp, DAG);
+ if (NewOp.getNode())
+ return NewOp;
+
+ MVT VT = SVOp->getValueType(0).getSimpleVT();
+
+ unsigned NumElems = VT.getVectorNumElements();
+ unsigned NumLaneElems = NumElems / 2;
+
+ DebugLoc dl = SVOp->getDebugLoc();
+ MVT EltVT = VT.getVectorElementType();
+ MVT NVT = MVT::getVectorVT(EltVT, NumLaneElems);
+ SDValue Output[2];
+
+ SmallVector<int, 16> Mask;
+ for (unsigned l = 0; l < 2; ++l) {
+ // Build a shuffle mask for the output, discovering on the fly which
+ // input vectors to use as shuffle operands (recorded in InputUsed).
+ // If building a suitable shuffle vector proves too hard, then bail
+ // out with UseBuildVector set.
+ bool UseBuildVector = false;
+ int InputUsed[2] = { -1, -1 }; // Not yet discovered.
+ unsigned LaneStart = l * NumLaneElems;
+ for (unsigned i = 0; i != NumLaneElems; ++i) {
+ // The mask element. This indexes into the input.
+ int Idx = SVOp->getMaskElt(i+LaneStart);
+ if (Idx < 0) {
+ // the mask element does not index into any input vector.
+ Mask.push_back(-1);
+ continue;
+ }
+
+ // The input vector this mask element indexes into.
+ int Input = Idx / NumLaneElems;
+
+ // Turn the index into an offset from the start of the input vector.
+ Idx -= Input * NumLaneElems;
+
+ // Find or create a shuffle vector operand to hold this input.
+ unsigned OpNo;
+ for (OpNo = 0; OpNo < array_lengthof(InputUsed); ++OpNo) {
+ if (InputUsed[OpNo] == Input)
+ // This input vector is already an operand.
+ break;
+ if (InputUsed[OpNo] < 0) {
+ // Create a new operand for this input vector.
+ InputUsed[OpNo] = Input;
+ break;
+ }
+ }
+
+ if (OpNo >= array_lengthof(InputUsed)) {
+ // More than two input vectors used! Give up on trying to create a
+ // shuffle vector. Insert all elements into a BUILD_VECTOR instead.
+ UseBuildVector = true;
+ break;
+ }
+
+ // Add the mask index for the new shuffle vector.
+ Mask.push_back(Idx + OpNo * NumLaneElems);
+ }
+
+ if (UseBuildVector) {
+ SmallVector<SDValue, 16> SVOps;
+ for (unsigned i = 0; i != NumLaneElems; ++i) {
+ // The mask element. This indexes into the input.
+ int Idx = SVOp->getMaskElt(i+LaneStart);
+ if (Idx < 0) {
+ SVOps.push_back(DAG.getUNDEF(EltVT));
+ continue;
+ }
+
+ // The input vector this mask element indexes into.
+ int Input = Idx / NumElems;
+
+ // Turn the index into an offset from the start of the input vector.
+ Idx -= Input * NumElems;
+
+ // Extract the vector element by hand.
+ SVOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
+ SVOp->getOperand(Input),
+ DAG.getIntPtrConstant(Idx)));
+ }
+
+ // Construct the output using a BUILD_VECTOR.
+ Output[l] = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, &SVOps[0],
+ SVOps.size());
+ } else if (InputUsed[0] < 0) {
+ // No input vectors were used! The result is undefined.
+ Output[l] = DAG.getUNDEF(NVT);
+ } else {
+ SDValue Op0 = Extract128BitVector(SVOp->getOperand(InputUsed[0] / 2),
+ (InputUsed[0] % 2) * NumLaneElems,
+ DAG, dl);
+ // If only one input was used, use an undefined vector for the other.
+ SDValue Op1 = (InputUsed[1] < 0) ? DAG.getUNDEF(NVT) :
+ Extract128BitVector(SVOp->getOperand(InputUsed[1] / 2),
+ (InputUsed[1] % 2) * NumLaneElems, DAG, dl);
+ // At least one input vector was used. Create a new shuffle vector.
+ Output[l] = DAG.getVectorShuffle(NVT, dl, Op0, Op1, &Mask[0]);
+ }
+
+ Mask.clear();
+ }
+
+ // Concatenate the result back
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Output[0], Output[1]);
+}
+
+/// LowerVECTOR_SHUFFLE_128v4 - Handle all 128-bit wide vectors with
+/// 4 elements, and match them with several different shuffle types.
+static SDValue
+LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
+ SDValue V1 = SVOp->getOperand(0);
+ SDValue V2 = SVOp->getOperand(1);
+ DebugLoc dl = SVOp->getDebugLoc();
+ MVT VT = SVOp->getValueType(0).getSimpleVT();
+
+ assert(VT.is128BitVector() && "Unsupported vector size");
+
+ std::pair<int, int> Locs[4];
+ int Mask1[] = { -1, -1, -1, -1 };
+ SmallVector<int, 8> PermMask(SVOp->getMask().begin(), SVOp->getMask().end());
+
+ unsigned NumHi = 0;
+ unsigned NumLo = 0;
+ for (unsigned i = 0; i != 4; ++i) {
+ int Idx = PermMask[i];
+ if (Idx < 0) {
+ Locs[i] = std::make_pair(-1, -1);
+ } else {
+ assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!");
+ if (Idx < 4) {
+ Locs[i] = std::make_pair(0, NumLo);
+ Mask1[NumLo] = Idx;
+ NumLo++;
+ } else {
+ Locs[i] = std::make_pair(1, NumHi);
+ if (2+NumHi < 4)
+ Mask1[2+NumHi] = Idx;
+ NumHi++;
+ }
+ }
+ }
+
+ if (NumLo <= 2 && NumHi <= 2) {
+ // If no more than two elements come from either vector. This can be
+ // implemented with two shuffles. First shuffle gather the elements.
+ // The second shuffle, which takes the first shuffle as both of its
+ // vector operands, put the elements into the right order.
+ V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
+
+ int Mask2[] = { -1, -1, -1, -1 };
+
+ for (unsigned i = 0; i != 4; ++i)
+ if (Locs[i].first != -1) {
+ unsigned Idx = (i < 2) ? 0 : 4;
+ Idx += Locs[i].first * 2 + Locs[i].second;
+ Mask2[i] = Idx;
+ }
+
+ return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]);
+ }
+
+ if (NumLo == 3 || NumHi == 3) {
+ // Otherwise, we must have three elements from one vector, call it X, and
+ // one element from the other, call it Y. First, use a shufps to build an
+ // intermediate vector with the one element from Y and the element from X
+ // that will be in the same half in the final destination (the indexes don't
+ // matter). Then, use a shufps to build the final vector, taking the half
+ // containing the element from Y from the intermediate, and the other half
+ // from X.
+ if (NumHi == 3) {
+ // Normalize it so the 3 elements come from V1.
+ CommuteVectorShuffleMask(PermMask, 4);
+ std::swap(V1, V2);
+ }
+
+ // Find the element from V2.
+ unsigned HiIndex;
+ for (HiIndex = 0; HiIndex < 3; ++HiIndex) {
+ int Val = PermMask[HiIndex];
+ if (Val < 0)
+ continue;
+ if (Val >= 4)
+ break;
+ }
+
+ Mask1[0] = PermMask[HiIndex];
+ Mask1[1] = -1;
+ Mask1[2] = PermMask[HiIndex^1];
+ Mask1[3] = -1;
+ V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
+
+ if (HiIndex >= 2) {
+ Mask1[0] = PermMask[0];
+ Mask1[1] = PermMask[1];
+ Mask1[2] = HiIndex & 1 ? 6 : 4;
+ Mask1[3] = HiIndex & 1 ? 4 : 6;
+ return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
+ }
+
+ Mask1[0] = HiIndex & 1 ? 2 : 0;
+ Mask1[1] = HiIndex & 1 ? 0 : 2;
+ Mask1[2] = PermMask[2];
+ Mask1[3] = PermMask[3];
+ if (Mask1[2] >= 0)
+ Mask1[2] += 4;
+ if (Mask1[3] >= 0)
+ Mask1[3] += 4;
+ return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]);
+ }
+
+ // Break it into (shuffle shuffle_hi, shuffle_lo).
+ int LoMask[] = { -1, -1, -1, -1 };
+ int HiMask[] = { -1, -1, -1, -1 };
+
+ int *MaskPtr = LoMask;
+ unsigned MaskIdx = 0;
+ unsigned LoIdx = 0;
+ unsigned HiIdx = 2;
+ for (unsigned i = 0; i != 4; ++i) {
+ if (i == 2) {
+ MaskPtr = HiMask;
+ MaskIdx = 1;
+ LoIdx = 0;
+ HiIdx = 2;
+ }
+ int Idx = PermMask[i];
+ if (Idx < 0) {
+ Locs[i] = std::make_pair(-1, -1);
+ } else if (Idx < 4) {
+ Locs[i] = std::make_pair(MaskIdx, LoIdx);
+ MaskPtr[LoIdx] = Idx;
+ LoIdx++;
+ } else {
+ Locs[i] = std::make_pair(MaskIdx, HiIdx);
+ MaskPtr[HiIdx] = Idx;
+ HiIdx++;
+ }
+ }
+
+ SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]);
+ SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]);
+ int MaskOps[] = { -1, -1, -1, -1 };
+ for (unsigned i = 0; i != 4; ++i)
+ if (Locs[i].first != -1)
+ MaskOps[i] = Locs[i].first * 4 + Locs[i].second;
+ return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]);
+}
+
+static bool MayFoldVectorLoad(SDValue V) {
+ while (V.hasOneUse() && V.getOpcode() == ISD::BITCAST)
+ V = V.getOperand(0);
+
+ if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR)
+ V = V.getOperand(0);
+ if (V.hasOneUse() && V.getOpcode() == ISD::BUILD_VECTOR &&
+ V.getNumOperands() == 2 && V.getOperand(1).getOpcode() == ISD::UNDEF)
+ // BUILD_VECTOR (load), undef
+ V = V.getOperand(0);
+
+ return MayFoldLoad(V);
+}
+
+static
+SDValue getMOVDDup(SDValue &Op, DebugLoc &dl, SDValue V1, SelectionDAG &DAG) {
+ EVT VT = Op.getValueType();
+
+ // Canonizalize to v2f64.
+ V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1);
+ return DAG.getNode(ISD::BITCAST, dl, VT,
+ getTargetShuffleNode(X86ISD::MOVDDUP, dl, MVT::v2f64,
+ V1, DAG));
+}
+
+static
+SDValue getMOVLowToHigh(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG,
+ bool HasSSE2) {
+ SDValue V1 = Op.getOperand(0);
+ SDValue V2 = Op.getOperand(1);
+ EVT VT = Op.getValueType();
+
+ assert(VT != MVT::v2i64 && "unsupported shuffle type");
+
+ if (HasSSE2 && VT == MVT::v2f64)
+ return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG);
+
+ // v4f32 or v4i32: canonizalized to v4f32 (which is legal for SSE1)
+ return DAG.getNode(ISD::BITCAST, dl, VT,
+ getTargetShuffleNode(X86ISD::MOVLHPS, dl, MVT::v4f32,
+ DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V1),
+ DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V2), DAG));
+}
+
+static
+SDValue getMOVHighToLow(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG) {
+ SDValue V1 = Op.getOperand(0);
+ SDValue V2 = Op.getOperand(1);
+ EVT VT = Op.getValueType();
+
+ assert((VT == MVT::v4i32 || VT == MVT::v4f32) &&
+ "unsupported shuffle type");
+
+ if (V2.getOpcode() == ISD::UNDEF)
+ V2 = V1;
+
+ // v4i32 or v4f32
+ return getTargetShuffleNode(X86ISD::MOVHLPS, dl, VT, V1, V2, DAG);
+}
+
+static
+SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
+ SDValue V1 = Op.getOperand(0);
+ SDValue V2 = Op.getOperand(1);
+ EVT VT = Op.getValueType();
+ unsigned NumElems = VT.getVectorNumElements();
+
+ // Use MOVLPS and MOVLPD in case V1 or V2 are loads. During isel, the second
+ // operand of these instructions is only memory, so check if there's a
+ // potencial load folding here, otherwise use SHUFPS or MOVSD to match the
+ // same masks.
+ bool CanFoldLoad = false;
+
+ // Trivial case, when V2 comes from a load.
+ if (MayFoldVectorLoad(V2))
+ CanFoldLoad = true;
+
+ // When V1 is a load, it can be folded later into a store in isel, example:
+ // (store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), addr:$src1)
+ // turns into:
+ // (MOVLPSmr addr:$src1, VR128:$src2)
+ // So, recognize this potential and also use MOVLPS or MOVLPD
+ else if (MayFoldVectorLoad(V1) && MayFoldIntoStore(Op))
+ CanFoldLoad = true;
+
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+ if (CanFoldLoad) {
+ if (HasSSE2 && NumElems == 2)
+ return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG);
+
+ if (NumElems == 4)
+ // If we don't care about the second element, proceed to use movss.
+ if (SVOp->getMaskElt(1) != -1)
+ return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG);
+ }
+
+ // movl and movlp will both match v2i64, but v2i64 is never matched by
+ // movl earlier because we make it strict to avoid messing with the movlp load
+ // folding logic (see the code above getMOVLP call). Match it here then,
+ // this is horrible, but will stay like this until we move all shuffle
+ // matching to x86 specific nodes. Note that for the 1st condition all
+ // types are matched with movsd.
+ if (HasSSE2) {
+ // FIXME: isMOVLMask should be checked and matched before getMOVLP,
+ // as to remove this logic from here, as much as possible
+ if (NumElems == 2 || !isMOVLMask(SVOp->getMask(), VT))
+ return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
+ return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
+ }
+
+ assert(VT != MVT::v4i32 && "unsupported shuffle type");
+
+ // Invert the operand order and use SHUFPS to match it.
+ return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V2, V1,
+ getShuffleSHUFImmediate(SVOp), DAG);
+}
+
+// Reduce a vector shuffle to zext.
+SDValue
+X86TargetLowering::LowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const {
+ // PMOVZX is only available from SSE41.
+ if (!Subtarget->hasSSE41())
+ return SDValue();
+
+ EVT VT = Op.getValueType();
+
+ // Only AVX2 support 256-bit vector integer extending.
+ if (!Subtarget->hasInt256() && VT.is256BitVector())
+ return SDValue();
+
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+ DebugLoc DL = Op.getDebugLoc();
+ SDValue V1 = Op.getOperand(0);
+ SDValue V2 = Op.getOperand(1);
+ unsigned NumElems = VT.getVectorNumElements();
+
+ // Extending is an unary operation and the element type of the source vector
+ // won't be equal to or larger than i64.
+ if (V2.getOpcode() != ISD::UNDEF || !VT.isInteger() ||
+ VT.getVectorElementType() == MVT::i64)
+ return SDValue();
+
+ // Find the expansion ratio, e.g. expanding from i8 to i32 has a ratio of 4.
+ unsigned Shift = 1; // Start from 2, i.e. 1 << 1.
+ while ((1U << Shift) < NumElems) {
+ if (SVOp->getMaskElt(1U << Shift) == 1)
+ break;
+ Shift += 1;
+ // The maximal ratio is 8, i.e. from i8 to i64.
+ if (Shift > 3)
+ return SDValue();
+ }
+
+ // Check the shuffle mask.
+ unsigned Mask = (1U << Shift) - 1;
+ for (unsigned i = 0; i != NumElems; ++i) {
+ int EltIdx = SVOp->getMaskElt(i);
+ if ((i & Mask) != 0 && EltIdx != -1)
+ return SDValue();
+ if ((i & Mask) == 0 && (unsigned)EltIdx != (i >> Shift))
+ return SDValue();
+ }
+
+ LLVMContext *Context = DAG.getContext();
+ unsigned NBits = VT.getVectorElementType().getSizeInBits() << Shift;
+ EVT NeVT = EVT::getIntegerVT(*Context, NBits);
+ EVT NVT = EVT::getVectorVT(*Context, NeVT, NumElems >> Shift);
+
+ if (!isTypeLegal(NVT))
+ return SDValue();
+
+ // Simplify the operand as it's prepared to be fed into shuffle.
+ unsigned SignificantBits = NVT.getSizeInBits() >> Shift;
+ if (V1.getOpcode() == ISD::BITCAST &&
+ V1.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
+ V1.getOperand(0).getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ V1.getOperand(0)
+ .getOperand(0).getValueType().getSizeInBits() == SignificantBits) {
+ // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
+ SDValue V = V1.getOperand(0).getOperand(0).getOperand(0);
+ ConstantSDNode *CIdx =
+ dyn_cast<ConstantSDNode>(V1.getOperand(0).getOperand(0).getOperand(1));
+ // If it's foldable, i.e. normal load with single use, we will let code
+ // selection to fold it. Otherwise, we will short the conversion sequence.
+ if (CIdx && CIdx->getZExtValue() == 0 &&
+ (!ISD::isNormalLoad(V.getNode()) || !V.hasOneUse())) {
+ if (V.getValueSizeInBits() > V1.getValueSizeInBits()) {
+ // The "ext_vec_elt" node is wider than the result node.
+ // In this case we should extract subvector from V.
+ // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast (extract_subvector x)).
+ unsigned Ratio = V.getValueSizeInBits() / V1.getValueSizeInBits();
+ EVT FullVT = V.getValueType();
+ EVT SubVecVT = EVT::getVectorVT(*Context,
+ FullVT.getVectorElementType(),
+ FullVT.getVectorNumElements()/Ratio);
+ V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVecVT, V,
+ DAG.getIntPtrConstant(0));
+ }
+ V1 = DAG.getNode(ISD::BITCAST, DL, V1.getValueType(), V);
+ }
+ }
+
+ return DAG.getNode(ISD::BITCAST, DL, VT,
+ DAG.getNode(X86ISD::VZEXT, DL, NVT, V1));
+}
+
+SDValue
+X86TargetLowering::NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const {
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+ MVT VT = Op.getValueType().getSimpleVT();
+ DebugLoc dl = Op.getDebugLoc();
+ SDValue V1 = Op.getOperand(0);
+ SDValue V2 = Op.getOperand(1);
+
+ if (isZeroShuffle(SVOp))
+ return getZeroVector(VT, Subtarget, DAG, dl);
+
+ // Handle splat operations
+ if (SVOp->isSplat()) {
+ // Use vbroadcast whenever the splat comes from a foldable load
+ SDValue Broadcast = LowerVectorBroadcast(Op, DAG);
+ if (Broadcast.getNode())
+ return Broadcast;
+ }
+
+ // Check integer expanding shuffles.
+ SDValue NewOp = LowerVectorIntExtend(Op, DAG);
+ if (NewOp.getNode())
+ return NewOp;
+
+ // If the shuffle can be profitably rewritten as a narrower shuffle, then
+ // do it!
+ if (VT == MVT::v8i16 || VT == MVT::v16i8 ||
+ VT == MVT::v16i16 || VT == MVT::v32i8) {
+ SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
+ if (NewOp.getNode())
+ return DAG.getNode(ISD::BITCAST, dl, VT, NewOp);
+ } else if ((VT == MVT::v4i32 ||
+ (VT == MVT::v4f32 && Subtarget->hasSSE2()))) {
+ // FIXME: Figure out a cleaner way to do this.
+ // Try to make use of movq to zero out the top part.
+ if (ISD::isBuildVectorAllZeros(V2.getNode())) {
+ SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
+ if (NewOp.getNode()) {
+ MVT NewVT = NewOp.getValueType().getSimpleVT();
+ if (isCommutedMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(),
+ NewVT, true, false))
+ return getVZextMovL(VT, NewVT, NewOp.getOperand(0),
+ DAG, Subtarget, dl);
+ }
+ } else if (ISD::isBuildVectorAllZeros(V1.getNode())) {
+ SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
+ if (NewOp.getNode()) {
+ MVT NewVT = NewOp.getValueType().getSimpleVT();
+ if (isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), NewVT))
+ return getVZextMovL(VT, NewVT, NewOp.getOperand(1),
+ DAG, Subtarget, dl);
+ }
+ }
+ }
+ return SDValue();
+}
+
+SDValue
+X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+ SDValue V1 = Op.getOperand(0);
+ SDValue V2 = Op.getOperand(1);
+ MVT VT = Op.getValueType().getSimpleVT();
+ DebugLoc dl = Op.getDebugLoc();
+ unsigned NumElems = VT.getVectorNumElements();
+ bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
+ bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
+ bool V1IsSplat = false;
+ bool V2IsSplat = false;
+ bool HasSSE2 = Subtarget->hasSSE2();
+ bool HasFp256 = Subtarget->hasFp256();
+ bool HasInt256 = Subtarget->hasInt256();
+ MachineFunction &MF = DAG.getMachineFunction();
+ bool OptForSize = MF.getFunction()->getAttributes().
+ hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
+
+ assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles");
+
+ if (V1IsUndef && V2IsUndef)
+ return DAG.getUNDEF(VT);
+
+ assert(!V1IsUndef && "Op 1 of shuffle should not be undef");
+
+ // Vector shuffle lowering takes 3 steps:
+ //
+ // 1) Normalize the input vectors. Here splats, zeroed vectors, profitable
+ // narrowing and commutation of operands should be handled.
+ // 2) Matching of shuffles with known shuffle masks to x86 target specific
+ // shuffle nodes.
+ // 3) Rewriting of unmatched masks into new generic shuffle operations,
+ // so the shuffle can be broken into other shuffles and the legalizer can
+ // try the lowering again.
+ //
+ // The general idea is that no vector_shuffle operation should be left to
+ // be matched during isel, all of them must be converted to a target specific
+ // node here.
+
+ // Normalize the input vectors. Here splats, zeroed vectors, profitable
+ // narrowing and commutation of operands should be handled. The actual code
+ // doesn't include all of those, work in progress...
+ SDValue NewOp = NormalizeVectorShuffle(Op, DAG);
+ if (NewOp.getNode())
+ return NewOp;
+
+ SmallVector<int, 8> M(SVOp->getMask().begin(), SVOp->getMask().end());
+
+ // NOTE: isPSHUFDMask can also match both masks below (unpckl_undef and
+ // unpckh_undef). Only use pshufd if speed is more important than size.
+ if (OptForSize && isUNPCKL_v_undef_Mask(M, VT, HasInt256))
+ return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
+ if (OptForSize && isUNPCKH_v_undef_Mask(M, VT, HasInt256))
+ return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
+
+ if (isMOVDDUPMask(M, VT) && Subtarget->hasSSE3() &&
+ V2IsUndef && MayFoldVectorLoad(V1))
+ return getMOVDDup(Op, dl, V1, DAG);
+
+ if (isMOVHLPS_v_undef_Mask(M, VT))
+ return getMOVHighToLow(Op, dl, DAG);
+
+ // Use to match splats
+ if (HasSSE2 && isUNPCKHMask(M, VT, HasInt256) && V2IsUndef &&
+ (VT == MVT::v2f64 || VT == MVT::v2i64))
+ return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
+
+ if (isPSHUFDMask(M, VT)) {
+ // The actual implementation will match the mask in the if above and then
+ // during isel it can match several different instructions, not only pshufd
+ // as its name says, sad but true, emulate the behavior for now...
+ if (isMOVDDUPMask(M, VT) && ((VT == MVT::v4f32 || VT == MVT::v2i64)))
+ return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V1, DAG);
+
+ unsigned TargetMask = getShuffleSHUFImmediate(SVOp);
+
+ if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32))
+ return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG);
+
+ if (HasFp256 && (VT == MVT::v4f32 || VT == MVT::v2f64))
+ return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1, TargetMask,
+ DAG);
+
+ return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V1,
+ TargetMask, DAG);
+ }
+
+ // Check if this can be converted into a logical shift.
+ bool isLeft = false;
+ unsigned ShAmt = 0;
+ SDValue ShVal;
+ bool isShift = HasSSE2 && isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt);
+ if (isShift && ShVal.hasOneUse()) {
+ // If the shifted value has multiple uses, it may be cheaper to use
+ // v_set0 + movlhps or movhlps, etc.
+ MVT EltVT = VT.getVectorElementType();
+ ShAmt *= EltVT.getSizeInBits();
+ return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
+ }
+
+ if (isMOVLMask(M, VT)) {
+ if (ISD::isBuildVectorAllZeros(V1.getNode()))
+ return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl);
+ if (!isMOVLPMask(M, VT)) {
+ if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64))
+ return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
+
+ if (VT == MVT::v4i32 || VT == MVT::v4f32)
+ return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
+ }
+ }
+
+ // FIXME: fold these into legal mask.
+ if (isMOVLHPSMask(M, VT) && !isUNPCKLMask(M, VT, HasInt256))
+ return getMOVLowToHigh(Op, dl, DAG, HasSSE2);
+
+ if (isMOVHLPSMask(M, VT))
+ return getMOVHighToLow(Op, dl, DAG);
+
+ if (V2IsUndef && isMOVSHDUPMask(M, VT, Subtarget))
+ return getTargetShuffleNode(X86ISD::MOVSHDUP, dl, VT, V1, DAG);
+
+ if (V2IsUndef && isMOVSLDUPMask(M, VT, Subtarget))
+ return getTargetShuffleNode(X86ISD::MOVSLDUP, dl, VT, V1, DAG);
+
+ if (isMOVLPMask(M, VT))
+ return getMOVLP(Op, dl, DAG, HasSSE2);
+
+ if (ShouldXformToMOVHLPS(M, VT) ||
+ ShouldXformToMOVLP(V1.getNode(), V2.getNode(), M, VT))
+ return CommuteVectorShuffle(SVOp, DAG);
+
+ if (isShift) {
+ // No better options. Use a vshldq / vsrldq.
+ MVT EltVT = VT.getVectorElementType();
+ ShAmt *= EltVT.getSizeInBits();
+ return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
+ }
+
+ bool Commuted = false;
+ // FIXME: This should also accept a bitcast of a splat? Be careful, not
+ // 1,1,1,1 -> v8i16 though.
+ V1IsSplat = isSplatVector(V1.getNode());
+ V2IsSplat = isSplatVector(V2.getNode());
+
+ // Canonicalize the splat or undef, if present, to be on the RHS.
+ if (!V2IsUndef && V1IsSplat && !V2IsSplat) {
+ CommuteVectorShuffleMask(M, NumElems);
+ std::swap(V1, V2);
+ std::swap(V1IsSplat, V2IsSplat);
+ Commuted = true;
+ }
+
+ if (isCommutedMOVLMask(M, VT, V2IsSplat, V2IsUndef)) {
+ // Shuffling low element of v1 into undef, just return v1.
+ if (V2IsUndef)
+ return V1;
+ // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which
+ // the instruction selector will not match, so get a canonical MOVL with
+ // swapped operands to undo the commute.
+ return getMOVL(DAG, dl, VT, V2, V1);
+ }
+
+ if (isUNPCKLMask(M, VT, HasInt256))
+ return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
+
+ if (isUNPCKHMask(M, VT, HasInt256))
+ return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
+
+ if (V2IsSplat) {
+ // Normalize mask so all entries that point to V2 points to its first
+ // element then try to match unpck{h|l} again. If match, return a
+ // new vector_shuffle with the corrected mask.p
+ SmallVector<int, 8> NewMask(M.begin(), M.end());
+ NormalizeMask(NewMask, NumElems);
+ if (isUNPCKLMask(NewMask, VT, HasInt256, true))
+ return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
+ if (isUNPCKHMask(NewMask, VT, HasInt256, true))
+ return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
+ }
+
+ if (Commuted) {
+ // Commute is back and try unpck* again.
+ // FIXME: this seems wrong.
+ CommuteVectorShuffleMask(M, NumElems);
+ std::swap(V1, V2);
+ std::swap(V1IsSplat, V2IsSplat);
+ Commuted = false;
+
+ if (isUNPCKLMask(M, VT, HasInt256))
+ return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
+
+ if (isUNPCKHMask(M, VT, HasInt256))
+ return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
+ }
+
+ // Normalize the node to match x86 shuffle ops if needed
+ if (!V2IsUndef && (isSHUFPMask(M, VT, HasFp256, /* Commuted */ true)))
+ return CommuteVectorShuffle(SVOp, DAG);
+
+ // The checks below are all present in isShuffleMaskLegal, but they are
+ // inlined here right now to enable us to directly emit target specific
+ // nodes, and remove one by one until they don't return Op anymore.
+
+ if (isPALIGNRMask(M, VT, Subtarget))
+ return getTargetShuffleNode(X86ISD::PALIGNR, dl, VT, V1, V2,
+ getShufflePALIGNRImmediate(SVOp),
+ DAG);
+
+ if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) &&
+ SVOp->getSplatIndex() == 0 && V2IsUndef) {
+ if (VT == MVT::v2f64 || VT == MVT::v2i64)
+ return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
+ }
+
+ if (isPSHUFHWMask(M, VT, HasInt256))
+ return getTargetShuffleNode(X86ISD::PSHUFHW, dl, VT, V1,
+ getShufflePSHUFHWImmediate(SVOp),
+ DAG);
+
+ if (isPSHUFLWMask(M, VT, HasInt256))
+ return getTargetShuffleNode(X86ISD::PSHUFLW, dl, VT, V1,
+ getShufflePSHUFLWImmediate(SVOp),
+ DAG);
+
+ if (isSHUFPMask(M, VT, HasFp256))
+ return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V2,
+ getShuffleSHUFImmediate(SVOp), DAG);
+
+ if (isUNPCKL_v_undef_Mask(M, VT, HasInt256))
+ return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
+ if (isUNPCKH_v_undef_Mask(M, VT, HasInt256))
+ return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
+
+ //===--------------------------------------------------------------------===//
+ // Generate target specific nodes for 128 or 256-bit shuffles only
+ // supported in the AVX instruction set.
+ //
+
+ // Handle VMOVDDUPY permutations
+ if (V2IsUndef && isMOVDDUPYMask(M, VT, HasFp256))
+ return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG);
+
+ // Handle VPERMILPS/D* permutations
+ if (isVPERMILPMask(M, VT, HasFp256)) {
+ if (HasInt256 && VT == MVT::v8i32)
+ return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1,
+ getShuffleSHUFImmediate(SVOp), DAG);
+ return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1,
+ getShuffleSHUFImmediate(SVOp), DAG);
+ }
+
+ // Handle VPERM2F128/VPERM2I128 permutations
+ if (isVPERM2X128Mask(M, VT, HasFp256))
+ return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1,
+ V2, getShuffleVPERM2X128Immediate(SVOp), DAG);
+
+ SDValue BlendOp = LowerVECTOR_SHUFFLEtoBlend(SVOp, Subtarget, DAG);
+ if (BlendOp.getNode())
+ return BlendOp;
+
+ if (V2IsUndef && HasInt256 && (VT == MVT::v8i32 || VT == MVT::v8f32)) {
+ SmallVector<SDValue, 8> permclMask;
+ for (unsigned i = 0; i != 8; ++i) {
+ permclMask.push_back(DAG.getConstant((M[i]>=0) ? M[i] : 0, MVT::i32));
+ }
+ SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32,
+ &permclMask[0], 8);
+ // Bitcast is for VPERMPS since mask is v8i32 but node takes v8f32
+ return DAG.getNode(X86ISD::VPERMV, dl, VT,
+ DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1);
+ }
+
+ if (V2IsUndef && HasInt256 && (VT == MVT::v4i64 || VT == MVT::v4f64))
+ return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1,
+ getShuffleCLImmediate(SVOp), DAG);
+
+ //===--------------------------------------------------------------------===//
+ // Since no target specific shuffle was selected for this generic one,
+ // lower it into other known shuffles. FIXME: this isn't true yet, but
+ // this is the plan.
+ //
+
+ // Handle v8i16 specifically since SSE can do byte extraction and insertion.
+ if (VT == MVT::v8i16) {
+ SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, Subtarget, DAG);
+ if (NewOp.getNode())
+ return NewOp;
+ }
+
+ if (VT == MVT::v16i8) {
+ SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, DAG, *this);
+ if (NewOp.getNode())
+ return NewOp;
+ }
+
+ if (VT == MVT::v32i8) {
+ SDValue NewOp = LowerVECTOR_SHUFFLEv32i8(SVOp, Subtarget, DAG);
+ if (NewOp.getNode())
+ return NewOp;
+ }
+
+ // Handle all 128-bit wide vectors with 4 elements, and match them with
+ // several different shuffle types.
+ if (NumElems == 4 && VT.is128BitVector())
+ return LowerVECTOR_SHUFFLE_128v4(SVOp, DAG);
+
+ // Handle general 256-bit shuffles
+ if (VT.is256BitVector())
+ return LowerVECTOR_SHUFFLE_256(SVOp, DAG);
+
+ return SDValue();
+}
+
+static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
+ MVT VT = Op.getValueType().getSimpleVT();
+ DebugLoc dl = Op.getDebugLoc();
+
+ if (!Op.getOperand(0).getValueType().getSimpleVT().is128BitVector())
+ return SDValue();
+
+ if (VT.getSizeInBits() == 8) {
+ SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
+ Op.getOperand(0), Op.getOperand(1));
+ SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
+ DAG.getValueType(VT));
+ return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
+ }
+
+ if (VT.getSizeInBits() == 16) {
+ unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+ // If Idx is 0, it's cheaper to do a move instead of a pextrw.
+ if (Idx == 0)
+ return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
+ DAG.getNode(ISD::BITCAST, dl,
+ MVT::v4i32,
+ Op.getOperand(0)),
+ Op.getOperand(1)));
+ SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
+ Op.getOperand(0), Op.getOperand(1));
+ SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
+ DAG.getValueType(VT));
+ return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
+ }
+
+ if (VT == MVT::f32) {
+ // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
+ // the result back to FR32 register. It's only worth matching if the
+ // result has a single use which is a store or a bitcast to i32. And in
+ // the case of a store, it's not worth it if the index is a constant 0,
+ // because a MOVSSmr can be used instead, which is smaller and faster.
+ if (!Op.hasOneUse())
+ return SDValue();
+ SDNode *User = *Op.getNode()->use_begin();
+ if ((User->getOpcode() != ISD::STORE ||
+ (isa<ConstantSDNode>(Op.getOperand(1)) &&
+ cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) &&
+ (User->getOpcode() != ISD::BITCAST ||
+ User->getValueType(0) != MVT::i32))
+ return SDValue();
+ SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
+ DAG.getNode(ISD::BITCAST, dl, MVT::v4i32,
+ Op.getOperand(0)),
+ Op.getOperand(1));
+ return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Extract);
+ }
+
+ if (VT == MVT::i32 || VT == MVT::i64) {
+ // ExtractPS/pextrq works with constant index.
+ if (isa<ConstantSDNode>(Op.getOperand(1)))
+ return Op;
+ }
+ return SDValue();
+}
+
+SDValue
+X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
+ SelectionDAG &DAG) const {
+ if (!isa<ConstantSDNode>(Op.getOperand(1)))
+ return SDValue();
+
+ SDValue Vec = Op.getOperand(0);
+ MVT VecVT = Vec.getValueType().getSimpleVT();
+
+ // If this is a 256-bit vector result, first extract the 128-bit vector and
+ // then extract the element from the 128-bit vector.
+ if (VecVT.is256BitVector()) {
+ DebugLoc dl = Op.getNode()->getDebugLoc();
+ unsigned NumElems = VecVT.getVectorNumElements();
+ SDValue Idx = Op.getOperand(1);
+ unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+
+ // Get the 128-bit vector.
+ Vec = Extract128BitVector(Vec, IdxVal, DAG, dl);
+
+ if (IdxVal >= NumElems/2)
+ IdxVal -= NumElems/2;
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
+ DAG.getConstant(IdxVal, MVT::i32));
+ }
+
+ assert(VecVT.is128BitVector() && "Unexpected vector length");
+
+ if (Subtarget->hasSSE41()) {
+ SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG);
+ if (Res.getNode())
+ return Res;
+ }
+
+ MVT VT = Op.getValueType().getSimpleVT();
+ DebugLoc dl = Op.getDebugLoc();
+ // TODO: handle v16i8.
+ if (VT.getSizeInBits() == 16) {
+ SDValue Vec = Op.getOperand(0);
+ unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+ if (Idx == 0)
+ return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
+ DAG.getNode(ISD::BITCAST, dl,
+ MVT::v4i32, Vec),
+ Op.getOperand(1)));
+ // Transform it so it match pextrw which produces a 32-bit result.
+ MVT EltVT = MVT::i32;
+ SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT,
+ Op.getOperand(0), Op.getOperand(1));
+ SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract,
+ DAG.getValueType(VT));
+ return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
+ }
+
+ if (VT.getSizeInBits() == 32) {
+ unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+ if (Idx == 0)
+ return Op;
+
+ // SHUFPS the element to the lowest double word, then movss.
+ int Mask[4] = { static_cast<int>(Idx), -1, -1, -1 };
+ MVT VVT = Op.getOperand(0).getValueType().getSimpleVT();
+ SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
+ DAG.getUNDEF(VVT), Mask);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
+ DAG.getIntPtrConstant(0));
+ }
+
+ if (VT.getSizeInBits() == 64) {
+ // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
+ // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
+ // to match extract_elt for f64.
+ unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+ if (Idx == 0)
+ return Op;
+
+ // UNPCKHPD the element to the lowest double word, then movsd.
+ // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
+ // to a f64mem, the whole operation is folded into a single MOVHPDmr.
+ int Mask[2] = { 1, -1 };
+ MVT VVT = Op.getOperand(0).getValueType().getSimpleVT();
+ SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
+ DAG.getUNDEF(VVT), Mask);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
+ DAG.getIntPtrConstant(0));
+ }
+
+ return SDValue();
+}
+
+static SDValue LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
+ MVT VT = Op.getValueType().getSimpleVT();
+ MVT EltVT = VT.getVectorElementType();
+ DebugLoc dl = Op.getDebugLoc();
+
+ SDValue N0 = Op.getOperand(0);
+ SDValue N1 = Op.getOperand(1);
+ SDValue N2 = Op.getOperand(2);
+
+ if (!VT.is128BitVector())
+ return SDValue();
+
+ if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) &&
+ isa<ConstantSDNode>(N2)) {
+ unsigned Opc;
+ if (VT == MVT::v8i16)
+ Opc = X86ISD::PINSRW;
+ else if (VT == MVT::v16i8)
+ Opc = X86ISD::PINSRB;
+ else
+ Opc = X86ISD::PINSRB;
+
+ // Transform it so it match pinsr{b,w} which expects a GR32 as its second
+ // argument.
+ if (N1.getValueType() != MVT::i32)
+ N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
+ if (N2.getValueType() != MVT::i32)
+ N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
+ return DAG.getNode(Opc, dl, VT, N0, N1, N2);
+ }
+
+ if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) {
+ // Bits [7:6] of the constant are the source select. This will always be
+ // zero here. The DAG Combiner may combine an extract_elt index into these
+ // bits. For example (insert (extract, 3), 2) could be matched by putting
+ // the '3' into bits [7:6] of X86ISD::INSERTPS.
+ // Bits [5:4] of the constant are the destination select. This is the
+ // value of the incoming immediate.
+ // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
+ // combine either bitwise AND or insert of float 0.0 to set these bits.
+ N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4);
+ // Create this as a scalar to vector..
+ N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
+ return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
+ }
+
+ if ((EltVT == MVT::i32 || EltVT == MVT::i64) && isa<ConstantSDNode>(N2)) {
+ // PINSR* works with constant index.
+ return Op;
+ }
+ return SDValue();
+}
+
+SDValue
+X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
+ MVT VT = Op.getValueType().getSimpleVT();
+ MVT EltVT = VT.getVectorElementType();
+
+ DebugLoc dl = Op.getDebugLoc();
+ SDValue N0 = Op.getOperand(0);
+ SDValue N1 = Op.getOperand(1);
+ SDValue N2 = Op.getOperand(2);
+
+ // If this is a 256-bit vector result, first extract the 128-bit vector,
+ // insert the element into the extracted half and then place it back.
+ if (VT.is256BitVector()) {
+ if (!isa<ConstantSDNode>(N2))
+ return SDValue();
+
+ // Get the desired 128-bit vector half.
+ unsigned NumElems = VT.getVectorNumElements();
+ unsigned IdxVal = cast<ConstantSDNode>(N2)->getZExtValue();
+ SDValue V = Extract128BitVector(N0, IdxVal, DAG, dl);
+
+ // Insert the element into the desired half.
+ bool Upper = IdxVal >= NumElems/2;
+ V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
+ DAG.getConstant(Upper ? IdxVal-NumElems/2 : IdxVal, MVT::i32));
+
+ // Insert the changed part back to the 256-bit vector
+ return Insert128BitVector(N0, V, IdxVal, DAG, dl);
+ }
+
+ if (Subtarget->hasSSE41())
+ return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG);
+
+ if (EltVT == MVT::i8)
+ return SDValue();
+
+ if (EltVT.getSizeInBits() == 16 && isa<ConstantSDNode>(N2)) {
+ // Transform it so it match pinsrw which expects a 16-bit value in a GR32
+ // as its second argument.
+ if (N1.getValueType() != MVT::i32)
+ N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
+ if (N2.getValueType() != MVT::i32)
+ N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
+ return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2);
+ }
+ return SDValue();
+}
+
+static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
+ LLVMContext *Context = DAG.getContext();
+ DebugLoc dl = Op.getDebugLoc();
+ MVT OpVT = Op.getValueType().getSimpleVT();
+
+ // If this is a 256-bit vector result, first insert into a 128-bit
+ // vector and then insert into the 256-bit vector.
+ if (!OpVT.is128BitVector()) {
+ // Insert into a 128-bit vector.
+ EVT VT128 = EVT::getVectorVT(*Context,
+ OpVT.getVectorElementType(),
+ OpVT.getVectorNumElements() / 2);
+
+ Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
+
+ // Insert the 128-bit vector.
+ return Insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
+ }
+
+ if (OpVT == MVT::v1i64 &&
+ Op.getOperand(0).getValueType() == MVT::i64)
+ return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0));
+
+ SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
+ assert(OpVT.is128BitVector() && "Expected an SSE type!");
+ return DAG.getNode(ISD::BITCAST, dl, OpVT,
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,AnyExt));
+}
+
+// Lower a node with an EXTRACT_SUBVECTOR opcode. This may result in
+// a simple subregister reference or explicit instructions to grab
+// upper bits of a vector.
+static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ if (Subtarget->hasFp256()) {
+ DebugLoc dl = Op.getNode()->getDebugLoc();
+ SDValue Vec = Op.getNode()->getOperand(0);
+ SDValue Idx = Op.getNode()->getOperand(1);
+
+ if (Op.getNode()->getValueType(0).is128BitVector() &&
+ Vec.getNode()->getValueType(0).is256BitVector() &&
+ isa<ConstantSDNode>(Idx)) {
+ unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+ return Extract128BitVector(Vec, IdxVal, DAG, dl);
+ }
+ }
+ return SDValue();
+}
+
+// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
+// simple superregister reference or explicit instructions to insert
+// the upper bits of a vector.
+static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ if (Subtarget->hasFp256()) {
+ DebugLoc dl = Op.getNode()->getDebugLoc();
+ SDValue Vec = Op.getNode()->getOperand(0);
+ SDValue SubVec = Op.getNode()->getOperand(1);
+ SDValue Idx = Op.getNode()->getOperand(2);
+
+ if (Op.getNode()->getValueType(0).is256BitVector() &&
+ SubVec.getNode()->getValueType(0).is128BitVector() &&
+ isa<ConstantSDNode>(Idx)) {
+ unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+ return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
+ }
+ }
+ return SDValue();
+}
+
+// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
+// their target countpart wrapped in the X86ISD::Wrapper node. Suppose N is
+// one of the above mentioned nodes. It has to be wrapped because otherwise
+// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
+// be used to form addressing mode. These wrapped nodes will be selected
+// into MOV32ri.
+SDValue
+X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
+ ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
+
+ // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
+ // global base reg.
+ unsigned char OpFlag = 0;
+ unsigned WrapperKind = X86ISD::Wrapper;
+ CodeModel::Model M = getTargetMachine().getCodeModel();
+
+ if (Subtarget->isPICStyleRIPRel() &&
+ (M == CodeModel::Small || M == CodeModel::Kernel))
+ WrapperKind = X86ISD::WrapperRIP;
+ else if (Subtarget->isPICStyleGOT())
+ OpFlag = X86II::MO_GOTOFF;
+ else if (Subtarget->isPICStyleStubPIC())
+ OpFlag = X86II::MO_PIC_BASE_OFFSET;
+
+ SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(),
+ CP->getAlignment(),
+ CP->getOffset(), OpFlag);
+ DebugLoc DL = CP->getDebugLoc();
+ Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
+ // With PIC, the address is actually $g + Offset.
+ if (OpFlag) {
+ Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
+ DAG.getNode(X86ISD::GlobalBaseReg,
+ DebugLoc(), getPointerTy()),
+ Result);
+ }
+
+ return Result;
+}
+
+SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
+ JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
+
+ // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
+ // global base reg.
+ unsigned char OpFlag = 0;
+ unsigned WrapperKind = X86ISD::Wrapper;
+ CodeModel::Model M = getTargetMachine().getCodeModel();
+
+ if (Subtarget->isPICStyleRIPRel() &&
+ (M == CodeModel::Small || M == CodeModel::Kernel))
+ WrapperKind = X86ISD::WrapperRIP;
+ else if (Subtarget->isPICStyleGOT())
+ OpFlag = X86II::MO_GOTOFF;
+ else if (Subtarget->isPICStyleStubPIC())
+ OpFlag = X86II::MO_PIC_BASE_OFFSET;
+
+ SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(),
+ OpFlag);
+ DebugLoc DL = JT->getDebugLoc();
+ Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
+
+ // With PIC, the address is actually $g + Offset.
+ if (OpFlag)
+ Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
+ DAG.getNode(X86ISD::GlobalBaseReg,
+ DebugLoc(), getPointerTy()),
+ Result);
+
+ return Result;
+}
+
+SDValue
+X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
+ const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
+
+ // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
+ // global base reg.
+ unsigned char OpFlag = 0;
+ unsigned WrapperKind = X86ISD::Wrapper;
+ CodeModel::Model M = getTargetMachine().getCodeModel();
+
+ if (Subtarget->isPICStyleRIPRel() &&
+ (M == CodeModel::Small || M == CodeModel::Kernel)) {
+ if (Subtarget->isTargetDarwin() || Subtarget->isTargetELF())
+ OpFlag = X86II::MO_GOTPCREL;
+ WrapperKind = X86ISD::WrapperRIP;
+ } else if (Subtarget->isPICStyleGOT()) {
+ OpFlag = X86II::MO_GOT;
+ } else if (Subtarget->isPICStyleStubPIC()) {
+ OpFlag = X86II::MO_DARWIN_NONLAZY_PIC_BASE;
+ } else if (Subtarget->isPICStyleStubNoDynamic()) {
+ OpFlag = X86II::MO_DARWIN_NONLAZY;
+ }
+
+ SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag);
+
+ DebugLoc DL = Op.getDebugLoc();
+ Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
+
+ // With PIC, the address is actually $g + Offset.
+ if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
+ !Subtarget->is64Bit()) {
+ Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
+ DAG.getNode(X86ISD::GlobalBaseReg,
+ DebugLoc(), getPointerTy()),
+ Result);
+ }
+
+ // For symbols that require a load from a stub to get the address, emit the
+ // load.
+ if (isGlobalStubReference(OpFlag))
+ Result = DAG.getLoad(getPointerTy(), DL, DAG.getEntryNode(), Result,
+ MachinePointerInfo::getGOT(), false, false, false, 0);
+
+ return Result;
+}
+
+SDValue
+X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
+ // Create the TargetBlockAddressAddress node.
+ unsigned char OpFlags =
+ Subtarget->ClassifyBlockAddressReference();
+ CodeModel::Model M = getTargetMachine().getCodeModel();
+ const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
+ int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
+ DebugLoc dl = Op.getDebugLoc();
+ SDValue Result = DAG.getTargetBlockAddress(BA, getPointerTy(), Offset,
+ OpFlags);
+
+ if (Subtarget->isPICStyleRIPRel() &&
+ (M == CodeModel::Small || M == CodeModel::Kernel))
+ Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
+ else
+ Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
+
+ // With PIC, the address is actually $g + Offset.
+ if (isGlobalRelativeToPICBase(OpFlags)) {
+ Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
+ DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
+ Result);
+ }
+
+ return Result;
+}
+
+SDValue
+X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl,
+ int64_t Offset, SelectionDAG &DAG) const {
+ // Create the TargetGlobalAddress node, folding in the constant
+ // offset if it is legal.
+ unsigned char OpFlags =
+ Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
+ CodeModel::Model M = getTargetMachine().getCodeModel();
+ SDValue Result;
+ if (OpFlags == X86II::MO_NO_FLAG &&
+ X86::isOffsetSuitableForCodeModel(Offset, M)) {
+ // A direct static reference to a global.
+ Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset);
+ Offset = 0;
+ } else {
+ Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags);
+ }
+
+ if (Subtarget->isPICStyleRIPRel() &&
+ (M == CodeModel::Small || M == CodeModel::Kernel))
+ Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
+ else
+ Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
+
+ // With PIC, the address is actually $g + Offset.
+ if (isGlobalRelativeToPICBase(OpFlags)) {
+ Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
+ DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
+ Result);
+ }
+
+ // For globals that require a load from a stub to get the address, emit the
+ // load.
+ if (isGlobalStubReference(OpFlags))
+ Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result,
+ MachinePointerInfo::getGOT(), false, false, false, 0);
+
+ // If there was a non-zero offset that we didn't fold, create an explicit
+ // addition for it.
+ if (Offset != 0)
+ Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result,
+ DAG.getConstant(Offset, getPointerTy()));
+
+ return Result;
+}
+
+SDValue
+X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
+ const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+ int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
+ return LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG);
+}
+
+static SDValue
+GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
+ SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
+ unsigned char OperandFlags, bool LocalDynamic = false) {
+ MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+ DebugLoc dl = GA->getDebugLoc();
+ SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
+ GA->getValueType(0),
+ GA->getOffset(),
+ OperandFlags);
+
+ X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
+ : X86ISD::TLSADDR;
+
+ if (InFlag) {
+ SDValue Ops[] = { Chain, TGA, *InFlag };
+ Chain = DAG.getNode(CallType, dl, NodeTys, Ops, array_lengthof(Ops));
+ } else {
+ SDValue Ops[] = { Chain, TGA };
+ Chain = DAG.getNode(CallType, dl, NodeTys, Ops, array_lengthof(Ops));
+ }
+
+ // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
+ MFI->setAdjustsStack(true);
+
+ SDValue Flag = Chain.getValue(1);
+ return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
+}
+
+// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
+static SDValue
+LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
+ const EVT PtrVT) {
+ SDValue InFlag;
+ DebugLoc dl = GA->getDebugLoc(); // ? function entry point might be better
+ SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
+ DAG.getNode(X86ISD::GlobalBaseReg,
+ DebugLoc(), PtrVT), InFlag);
+ InFlag = Chain.getValue(1);
+
+ return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
+}
+
+// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
+static SDValue
+LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
+ const EVT PtrVT) {
+ return GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT,
+ X86::RAX, X86II::MO_TLSGD);
+}
+
+static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
+ SelectionDAG &DAG,
+ const EVT PtrVT,
+ bool is64Bit) {
+ DebugLoc dl = GA->getDebugLoc();
+
+ // Get the start address of the TLS block for this module.
+ X86MachineFunctionInfo* MFI = DAG.getMachineFunction()
+ .getInfo<X86MachineFunctionInfo>();
+ MFI->incNumLocalDynamicTLSAccesses();
+
+ SDValue Base;
+ if (is64Bit) {
+ Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT, X86::RAX,
+ X86II::MO_TLSLD, /*LocalDynamic=*/true);
+ } else {
+ SDValue InFlag;
+ SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
+ DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), PtrVT), InFlag);
+ InFlag = Chain.getValue(1);
+ Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
+ X86II::MO_TLSLDM, /*LocalDynamic=*/true);
+ }
+
+ // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
+ // of Base.
+
+ // Build x@dtpoff.
+ unsigned char OperandFlags = X86II::MO_DTPOFF;
+ unsigned WrapperKind = X86ISD::Wrapper;
+ SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
+ GA->getValueType(0),
+ GA->getOffset(), OperandFlags);
+ SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
+
+ // Add x@dtpoff with the base.
+ return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
+}
+
+// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
+static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
+ const EVT PtrVT, TLSModel::Model model,
+ bool is64Bit, bool isPIC) {
+ DebugLoc dl = GA->getDebugLoc();
+
+ // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
+ Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
+ is64Bit ? 257 : 256));
+
+ SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
+ DAG.getIntPtrConstant(0),
+ MachinePointerInfo(Ptr),
+ false, false, false, 0);
+
+ unsigned char OperandFlags = 0;
+ // Most TLS accesses are not RIP relative, even on x86-64. One exception is
+ // initialexec.
+ unsigned WrapperKind = X86ISD::Wrapper;
+ if (model == TLSModel::LocalExec) {
+ OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
+ } else if (model == TLSModel::InitialExec) {
+ if (is64Bit) {
+ OperandFlags = X86II::MO_GOTTPOFF;
+ WrapperKind = X86ISD::WrapperRIP;
+ } else {
+ OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
+ }
+ } else {
+ llvm_unreachable("Unexpected model");
+ }
+
+ // emit "addl x@ntpoff,%eax" (local exec)
+ // or "addl x@indntpoff,%eax" (initial exec)
+ // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
+ SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
+ GA->getValueType(0),
+ GA->getOffset(), OperandFlags);
+ SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
+
+ if (model == TLSModel::InitialExec) {
+ if (isPIC && !is64Bit) {
+ Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
+ DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), PtrVT),
+ Offset);
+ }
+
+ Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
+ MachinePointerInfo::getGOT(), false, false, false,
+ 0);
+ }
+
+ // The address of the thread local variable is the add of the thread
+ // pointer with the offset of the variable.
+ return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
+}
+
+SDValue
+X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
+
+ GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
+ const GlobalValue *GV = GA->getGlobal();
+
+ if (Subtarget->isTargetELF()) {
+ TLSModel::Model model = getTargetMachine().getTLSModel(GV);
+
+ switch (model) {
+ case TLSModel::GeneralDynamic:
+ if (Subtarget->is64Bit())
+ return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy());
+ return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy());
+ case TLSModel::LocalDynamic:
+ return LowerToTLSLocalDynamicModel(GA, DAG, getPointerTy(),
+ Subtarget->is64Bit());
+ case TLSModel::InitialExec:
+ case TLSModel::LocalExec:
+ return LowerToTLSExecModel(GA, DAG, getPointerTy(), model,
+ Subtarget->is64Bit(),
+ getTargetMachine().getRelocationModel() == Reloc::PIC_);
+ }
+ llvm_unreachable("Unknown TLS model.");
+ }
+
+ if (Subtarget->isTargetDarwin()) {
+ // Darwin only has one model of TLS. Lower to that.
+ unsigned char OpFlag = 0;
+ unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ?
+ X86ISD::WrapperRIP : X86ISD::Wrapper;
+
+ // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
+ // global base reg.
+ bool PIC32 = (getTargetMachine().getRelocationModel() == Reloc::PIC_) &&
+ !Subtarget->is64Bit();
+ if (PIC32)
+ OpFlag = X86II::MO_TLVP_PIC_BASE;
+ else
+ OpFlag = X86II::MO_TLVP;
+ DebugLoc DL = Op.getDebugLoc();
+ SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
+ GA->getValueType(0),
+ GA->getOffset(), OpFlag);
+ SDValue Offset = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
+
+ // With PIC32, the address is actually $g + Offset.
+ if (PIC32)
+ Offset = DAG.getNode(ISD::ADD, DL, getPointerTy(),
+ DAG.getNode(X86ISD::GlobalBaseReg,
+ DebugLoc(), getPointerTy()),
+ Offset);
+
+ // Lowering the machine isd will make sure everything is in the right
+ // location.
+ SDValue Chain = DAG.getEntryNode();
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue Args[] = { Chain, Offset };
+ Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args, 2);
+
+ // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
+ MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+ MFI->setAdjustsStack(true);
+
+ // And our return value (tls address) is in the standard call return value
+ // location.
+ unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
+ return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy(),
+ Chain.getValue(1));
+ }
+
+ if (Subtarget->isTargetWindows() || Subtarget->isTargetMingw()) {
+ // Just use the implicit TLS architecture
+ // Need to generate someting similar to:
+ // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
+ // ; from TEB
+ // mov ecx, dword [rel _tls_index]: Load index (from C runtime)
+ // mov rcx, qword [rdx+rcx*8]
+ // mov eax, .tls$:tlsvar
+ // [rax+rcx] contains the address
+ // Windows 64bit: gs:0x58
+ // Windows 32bit: fs:__tls_array
+
+ // If GV is an alias then use the aliasee for determining
+ // thread-localness.
+ if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
+ GV = GA->resolveAliasedGlobal(false);
+ DebugLoc dl = GA->getDebugLoc();
+ SDValue Chain = DAG.getEntryNode();
+
+ // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
+ // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
+ // use its literal value of 0x2C.
+ Value *Ptr = Constant::getNullValue(Subtarget->is64Bit()
+ ? Type::getInt8PtrTy(*DAG.getContext(),
+ 256)
+ : Type::getInt32PtrTy(*DAG.getContext(),
+ 257));
+
+ SDValue TlsArray = Subtarget->is64Bit() ? DAG.getIntPtrConstant(0x58) :
+ (Subtarget->isTargetMingw() ? DAG.getIntPtrConstant(0x2C) :
+ DAG.getExternalSymbol("_tls_array", getPointerTy()));
+
+ SDValue ThreadPointer = DAG.getLoad(getPointerTy(), dl, Chain, TlsArray,
+ MachinePointerInfo(Ptr),
+ false, false, false, 0);
+
+ // Load the _tls_index variable
+ SDValue IDX = DAG.getExternalSymbol("_tls_index", getPointerTy());
+ if (Subtarget->is64Bit())
+ IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, getPointerTy(), Chain,
+ IDX, MachinePointerInfo(), MVT::i32,
+ false, false, 0);
+ else
+ IDX = DAG.getLoad(getPointerTy(), dl, Chain, IDX, MachinePointerInfo(),
+ false, false, false, 0);
+
+ SDValue Scale = DAG.getConstant(Log2_64_Ceil(TD->getPointerSize()),
+ getPointerTy());
+ IDX = DAG.getNode(ISD::SHL, dl, getPointerTy(), IDX, Scale);
+
+ SDValue res = DAG.getNode(ISD::ADD, dl, getPointerTy(), ThreadPointer, IDX);
+ res = DAG.getLoad(getPointerTy(), dl, Chain, res, MachinePointerInfo(),
+ false, false, false, 0);
+
+ // Get the offset of start of .tls section
+ SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
+ GA->getValueType(0),
+ GA->getOffset(), X86II::MO_SECREL);
+ SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), TGA);
+
+ // The address of the thread local variable is the add of the thread
+ // pointer with the offset of the variable.
+ return DAG.getNode(ISD::ADD, dl, getPointerTy(), res, Offset);
+ }
+
+ llvm_unreachable("TLS not implemented for this target.");
+}
+
+/// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values
+/// and take a 2 x i32 value to shift plus a shift amount.
+SDValue X86TargetLowering::LowerShiftParts(SDValue Op, SelectionDAG &DAG) const{
+ assert(Op.getNumOperands() == 3 && "Not a double-shift!");
+ EVT VT = Op.getValueType();
+ unsigned VTBits = VT.getSizeInBits();
+ DebugLoc dl = Op.getDebugLoc();
+ bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
+ SDValue ShOpLo = Op.getOperand(0);
+ SDValue ShOpHi = Op.getOperand(1);
+ SDValue ShAmt = Op.getOperand(2);
+ SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
+ DAG.getConstant(VTBits - 1, MVT::i8))
+ : DAG.getConstant(0, VT);
+
+ SDValue Tmp2, Tmp3;
+ if (Op.getOpcode() == ISD::SHL_PARTS) {
+ Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
+ Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
+ } else {
+ Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
+ Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, ShAmt);
+ }
+
+ SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
+ DAG.getConstant(VTBits, MVT::i8));
+ SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
+ AndNode, DAG.getConstant(0, MVT::i8));
+
+ SDValue Hi, Lo;
+ SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8);
+ SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
+ SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };
+
+ if (Op.getOpcode() == ISD::SHL_PARTS) {
+ Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4);
+ Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4);
+ } else {
+ Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4);
+ Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4);
+ }
+
+ SDValue Ops[2] = { Lo, Hi };
+ return DAG.getMergeValues(Ops, array_lengthof(Ops), dl);
+}
+
+SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
+ SelectionDAG &DAG) const {
+ EVT SrcVT = Op.getOperand(0).getValueType();
+
+ if (SrcVT.isVector())
+ return SDValue();
+
+ assert(SrcVT.getSimpleVT() <= MVT::i64 && SrcVT.getSimpleVT() >= MVT::i16 &&
+ "Unknown SINT_TO_FP to lower!");
+
+ // These are really Legal; return the operand so the caller accepts it as
+ // Legal.
+ if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
+ return Op;
+ if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
+ Subtarget->is64Bit()) {
+ return Op;
+ }
+
+ DebugLoc dl = Op.getDebugLoc();
+ unsigned Size = SrcVT.getSizeInBits()/8;
+ MachineFunction &MF = DAG.getMachineFunction();
+ int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false);
+ SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
+ SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
+ StackSlot,
+ MachinePointerInfo::getFixedStack(SSFI),
+ false, false, 0);
+ return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
+}
+
+SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
+ SDValue StackSlot,
+ SelectionDAG &DAG) const {
+ // Build the FILD
+ DebugLoc DL = Op.getDebugLoc();
+ SDVTList Tys;
+ bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
+ if (useSSE)
+ Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
+ else
+ Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
+
+ unsigned ByteSize = SrcVT.getSizeInBits()/8;
+
+ FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
+ MachineMemOperand *MMO;
+ if (FI) {
+ int SSFI = FI->getIndex();
+ MMO =
+ DAG.getMachineFunction()
+ .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
+ MachineMemOperand::MOLoad, ByteSize, ByteSize);
+ } else {
+ MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
+ StackSlot = StackSlot.getOperand(1);
+ }
+ SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
+ SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
+ X86ISD::FILD, DL,
+ Tys, Ops, array_lengthof(Ops),
+ SrcVT, MMO);
+
+ if (useSSE) {
+ Chain = Result.getValue(1);
+ SDValue InFlag = Result.getValue(2);
+
+ // FIXME: Currently the FST is flagged to the FILD_FLAG. This
+ // shouldn't be necessary except that RFP cannot be live across
+ // multiple blocks. When stackifier is fixed, they can be uncoupled.
+ MachineFunction &MF = DAG.getMachineFunction();
+ unsigned SSFISize = Op.getValueType().getSizeInBits()/8;
+ int SSFI = MF.getFrameInfo()->CreateStackObject(SSFISize, SSFISize, false);
+ SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
+ Tys = DAG.getVTList(MVT::Other);
+ SDValue Ops[] = {
+ Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
+ };
+ MachineMemOperand *MMO =
+ DAG.getMachineFunction()
+ .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
+ MachineMemOperand::MOStore, SSFISize, SSFISize);
+
+ Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
+ Ops, array_lengthof(Ops),
+ Op.getValueType(), MMO);
+ Result = DAG.getLoad(Op.getValueType(), DL, Chain, StackSlot,
+ MachinePointerInfo::getFixedStack(SSFI),
+ false, false, false, 0);
+ }
+
+ return Result;
+}
+
+// LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion.
+SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
+ SelectionDAG &DAG) const {
+ // This algorithm is not obvious. Here it is what we're trying to output:
+ /*
+ movq %rax, %xmm0
+ punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
+ subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
+ #ifdef __SSE3__
+ haddpd %xmm0, %xmm0
+ #else
+ pshufd $0x4e, %xmm0, %xmm1
+ addpd %xmm1, %xmm0
+ #endif
+ */
+
+ DebugLoc dl = Op.getDebugLoc();
+ LLVMContext *Context = DAG.getContext();
+
+ // Build some magic constants.
+ const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
+ Constant *C0 = ConstantDataVector::get(*Context, CV0);
+ SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16);
+
+ SmallVector<Constant*,2> CV1;
+ CV1.push_back(
+ ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
+ APInt(64, 0x4330000000000000ULL))));
+ CV1.push_back(
+ ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
+ APInt(64, 0x4530000000000000ULL))));
+ Constant *C1 = ConstantVector::get(CV1);
+ SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16);
+
+ // Load the 64-bit value into an XMM register.
+ SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
+ Op.getOperand(0));
+ SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
+ MachinePointerInfo::getConstantPool(),
+ false, false, false, 16);
+ SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32,
+ DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, XR1),
+ CLod0);
+
+ SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
+ MachinePointerInfo::getConstantPool(),
+ false, false, false, 16);
+ SDValue XR2F = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Unpck1);
+ SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
+ SDValue Result;
+
+ if (Subtarget->hasSSE3()) {
+ // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
+ Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
+ } else {
+ SDValue S2F = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Sub);
+ SDValue Shuffle = getTargetShuffleNode(X86ISD::PSHUFD, dl, MVT::v4i32,
+ S2F, 0x4E, DAG);
+ Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
+ DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Shuffle),
+ Sub);
+ }
+
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
+ DAG.getIntPtrConstant(0));
+}
+
+// LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion.
+SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
+ SelectionDAG &DAG) const {
+ DebugLoc dl = Op.getDebugLoc();
+ // FP constant to bias correct the final result.
+ SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
+ MVT::f64);
+
+ // Load the 32-bit value into an XMM register.
+ SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
+ Op.getOperand(0));
+
+ // Zero out the upper parts of the register.
+ Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
+
+ Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
+ DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Load),
+ DAG.getIntPtrConstant(0));
+
+ // Or the load with the bias.
+ SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64,
+ DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
+ MVT::v2f64, Load)),
+ DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
+ MVT::v2f64, Bias)));
+ Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
+ DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or),
+ DAG.getIntPtrConstant(0));
+
+ // Subtract the bias.
+ SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
+
+ // Handle final rounding.
+ EVT DestVT = Op.getValueType();
+
+ if (DestVT.bitsLT(MVT::f64))
+ return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
+ DAG.getIntPtrConstant(0));
+ if (DestVT.bitsGT(MVT::f64))
+ return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
+
+ // Handle final rounding.
+ return Sub;
+}
+
+SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDValue N0 = Op.getOperand(0);
+ EVT SVT = N0.getValueType();
+ DebugLoc dl = Op.getDebugLoc();
+
+ assert((SVT == MVT::v4i8 || SVT == MVT::v4i16 ||
+ SVT == MVT::v8i8 || SVT == MVT::v8i16) &&
+ "Custom UINT_TO_FP is not supported!");
+
+ EVT NVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
+ SVT.getVectorNumElements());
+ return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
+ DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0));
+}
+
+SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDValue N0 = Op.getOperand(0);
+ DebugLoc dl = Op.getDebugLoc();
+
+ if (Op.getValueType().isVector())
+ return lowerUINT_TO_FP_vec(Op, DAG);
+
+ // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
+ // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
+ // the optimization here.
+ if (DAG.SignBitIsZero(N0))
+ return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);
+
+ EVT SrcVT = N0.getValueType();
+ EVT DstVT = Op.getValueType();
+ if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
+ return LowerUINT_TO_FP_i64(Op, DAG);
+ if (SrcVT == MVT::i32 && X86ScalarSSEf64)
+ return LowerUINT_TO_FP_i32(Op, DAG);
+ if (Subtarget->is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
+ return SDValue();
+
+ // Make a 64-bit buffer, and use it to build an FILD.
+ SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
+ if (SrcVT == MVT::i32) {
+ SDValue WordOff = DAG.getConstant(4, getPointerTy());
+ SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl,
+ getPointerTy(), StackSlot, WordOff);
+ SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
+ StackSlot, MachinePointerInfo(),
+ false, false, 0);
+ SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32),
+ OffsetSlot, MachinePointerInfo(),
+ false, false, 0);
+ SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
+ return Fild;
+ }
+
+ assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
+ SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
+ StackSlot, MachinePointerInfo(),
+ false, false, 0);
+ // For i64 source, we need to add the appropriate power of 2 if the input
+ // was negative. This is the same as the optimization in
+ // DAGTypeLegalizer::ExpandIntOp_UNIT_TO_FP, and for it to be safe here,
+ // we must be careful to do the computation in x87 extended precision, not
+ // in SSE. (The generic code can't know it's OK to do this, or how to.)
+ int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
+ MachineMemOperand *MMO =
+ DAG.getMachineFunction()
+ .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
+ MachineMemOperand::MOLoad, 8, 8);
+
+ SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
+ SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
+ SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
+ array_lengthof(Ops), MVT::i64, MMO);
+
+ APInt FF(32, 0x5F800000ULL);
+
+ // Check whether the sign bit is set.
+ SDValue SignSet = DAG.getSetCC(dl, getSetCCResultType(MVT::i64),
+ Op.getOperand(0), DAG.getConstant(0, MVT::i64),
+ ISD::SETLT);
+
+ // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
+ SDValue FudgePtr = DAG.getConstantPool(
+ ConstantInt::get(*DAG.getContext(), FF.zext(64)),
+ getPointerTy());
+
+ // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
+ SDValue Zero = DAG.getIntPtrConstant(0);
+ SDValue Four = DAG.getIntPtrConstant(4);
+ SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet,
+ Zero, Four);
+ FudgePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(), FudgePtr, Offset);
+
+ // Load the value out, extending it from f32 to f80.
+ // FIXME: Avoid the extend by constructing the right constant pool?
+ SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(),
+ FudgePtr, MachinePointerInfo::getConstantPool(),
+ MVT::f32, false, false, 4);
+ // Extend everything to 80 bits to force it to be done on x87.
+ SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
+ return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0));
+}
+
+std::pair<SDValue,SDValue>
+X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
+ bool IsSigned, bool IsReplace) const {
+ DebugLoc DL = Op.getDebugLoc();
+
+ EVT DstTy = Op.getValueType();
+
+ if (!IsSigned && !isIntegerTypeFTOL(DstTy)) {
+ assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
+ DstTy = MVT::i64;
+ }
+
+ assert(DstTy.getSimpleVT() <= MVT::i64 &&
+ DstTy.getSimpleVT() >= MVT::i16 &&
+ "Unknown FP_TO_INT to lower!");
+
+ // These are really Legal.
+ if (DstTy == MVT::i32 &&
+ isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
+ return std::make_pair(SDValue(), SDValue());
+ if (Subtarget->is64Bit() &&
+ DstTy == MVT::i64 &&
+ isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
+ return std::make_pair(SDValue(), SDValue());
+
+ // We lower FP->int64 either into FISTP64 followed by a load from a temporary
+ // stack slot, or into the FTOL runtime function.
+ MachineFunction &MF = DAG.getMachineFunction();
+ unsigned MemSize = DstTy.getSizeInBits()/8;
+ int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
+ SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
+
+ unsigned Opc;
+ if (!IsSigned && isIntegerTypeFTOL(DstTy))
+ Opc = X86ISD::WIN_FTOL;
+ else
+ switch (DstTy.getSimpleVT().SimpleTy) {
+ default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
+ case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
+ case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
+ case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
+ }
+
+ SDValue Chain = DAG.getEntryNode();
+ SDValue Value = Op.getOperand(0);
+ EVT TheVT = Op.getOperand(0).getValueType();
+ // FIXME This causes a redundant load/store if the SSE-class value is already
+ // in memory, such as if it is on the callstack.
+ if (isScalarFPTypeInSSEReg(TheVT)) {
+ assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
+ Chain = DAG.getStore(Chain, DL, Value, StackSlot,
+ MachinePointerInfo::getFixedStack(SSFI),
+ false, false, 0);
+ SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
+ SDValue Ops[] = {
+ Chain, StackSlot, DAG.getValueType(TheVT)
+ };
+
+ MachineMemOperand *MMO =
+ MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
+ MachineMemOperand::MOLoad, MemSize, MemSize);
+ Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops,
+ array_lengthof(Ops), DstTy, MMO);
+ Chain = Value.getValue(1);
+ SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
+ StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
+ }
+
+ MachineMemOperand *MMO =
+ MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
+ MachineMemOperand::MOStore, MemSize, MemSize);
+
+ if (Opc != X86ISD::WIN_FTOL) {
+ // Build the FP_TO_INT*_IN_MEM
+ SDValue Ops[] = { Chain, Value, StackSlot };
+ SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
+ Ops, array_lengthof(Ops), DstTy,
+ MMO);
+ return std::make_pair(FIST, StackSlot);
+ } else {
+ SDValue ftol = DAG.getNode(X86ISD::WIN_FTOL, DL,
+ DAG.getVTList(MVT::Other, MVT::Glue),
+ Chain, Value);
+ SDValue eax = DAG.getCopyFromReg(ftol, DL, X86::EAX,
+ MVT::i32, ftol.getValue(1));
+ SDValue edx = DAG.getCopyFromReg(eax.getValue(1), DL, X86::EDX,
+ MVT::i32, eax.getValue(2));
+ SDValue Ops[] = { eax, edx };
+ SDValue pair = IsReplace
+ ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops, array_lengthof(Ops))
+ : DAG.getMergeValues(Ops, array_lengthof(Ops), DL);
+ return std::make_pair(pair, SDValue());
+ }
+}
+
+static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ MVT VT = Op->getValueType(0).getSimpleVT();
+ SDValue In = Op->getOperand(0);
+ MVT InVT = In.getValueType().getSimpleVT();
+ DebugLoc dl = Op->getDebugLoc();
+
+ // Optimize vectors in AVX mode:
+ //
+ // v8i16 -> v8i32
+ // Use vpunpcklwd for 4 lower elements v8i16 -> v4i32.
+ // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
+ // Concat upper and lower parts.
+ //
+ // v4i32 -> v4i64
+ // Use vpunpckldq for 4 lower elements v4i32 -> v2i64.
+ // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
+ // Concat upper and lower parts.
+ //
+
+ if (((VT != MVT::v8i32) || (InVT != MVT::v8i16)) &&
+ ((VT != MVT::v4i64) || (InVT != MVT::v4i32)))
+ return SDValue();
+
+ if (Subtarget->hasInt256())
+ return DAG.getNode(X86ISD::VZEXT_MOVL, dl, VT, In);
+
+ SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl);
+ SDValue Undef = DAG.getUNDEF(InVT);
+ bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND;
+ SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
+ SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
+
+ MVT HVT = MVT::getVectorVT(VT.getVectorElementType(),
+ VT.getVectorNumElements()/2);
+
+ OpLo = DAG.getNode(ISD::BITCAST, dl, HVT, OpLo);
+ OpHi = DAG.getNode(ISD::BITCAST, dl, HVT, OpHi);
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
+}
+
+SDValue X86TargetLowering::LowerANY_EXTEND(SDValue Op,
+ SelectionDAG &DAG) const {
+ if (Subtarget->hasFp256()) {
+ SDValue Res = LowerAVXExtend(Op, DAG, Subtarget);
+ if (Res.getNode())
+ return Res;
+ }
+
+ return SDValue();
+}
+SDValue X86TargetLowering::LowerZERO_EXTEND(SDValue Op,
+ SelectionDAG &DAG) const {
+ DebugLoc DL = Op.getDebugLoc();
+ MVT VT = Op.getValueType().getSimpleVT();
+ SDValue In = Op.getOperand(0);
+ MVT SVT = In.getValueType().getSimpleVT();
+
+ if (Subtarget->hasFp256()) {
+ SDValue Res = LowerAVXExtend(Op, DAG, Subtarget);
+ if (Res.getNode())
+ return Res;
+ }
+
+ if (!VT.is256BitVector() || !SVT.is128BitVector() ||
+ VT.getVectorNumElements() != SVT.getVectorNumElements())
+ return SDValue();
+
+ assert(Subtarget->hasFp256() && "256-bit vector is observed without AVX!");
+
+ // AVX2 has better support of integer extending.
+ if (Subtarget->hasInt256())
+ return DAG.getNode(X86ISD::VZEXT, DL, VT, In);
+
+ SDValue Lo = DAG.getNode(X86ISD::VZEXT, DL, MVT::v4i32, In);
+ static const int Mask[] = {4, 5, 6, 7, -1, -1, -1, -1};
+ SDValue Hi = DAG.getNode(X86ISD::VZEXT, DL, MVT::v4i32,
+ DAG.getVectorShuffle(MVT::v8i16, DL, In,
+ DAG.getUNDEF(MVT::v8i16),
+ &Mask[0]));
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i32, Lo, Hi);
+}
+
+SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
+ DebugLoc DL = Op.getDebugLoc();
+ MVT VT = Op.getValueType().getSimpleVT();
+ SDValue In = Op.getOperand(0);
+ MVT SVT = In.getValueType().getSimpleVT();
+
+ if ((VT == MVT::v4i32) && (SVT == MVT::v4i64)) {
+ // On AVX2, v4i64 -> v4i32 becomes VPERMD.
+ if (Subtarget->hasInt256()) {
+ static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
+ In = DAG.getNode(ISD::BITCAST, DL, MVT::v8i32, In);
+ In = DAG.getVectorShuffle(MVT::v8i32, DL, In, DAG.getUNDEF(MVT::v8i32),
+ ShufMask);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
+ DAG.getIntPtrConstant(0));
+ }
+
+ // On AVX, v4i64 -> v4i32 becomes a sequence that uses PSHUFD and MOVLHPS.
+ SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
+ DAG.getIntPtrConstant(0));
+ SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
+ DAG.getIntPtrConstant(2));
+
+ OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpLo);
+ OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpHi);
+
+ // The PSHUFD mask:
+ static const int ShufMask1[] = {0, 2, 0, 0};
+ SDValue Undef = DAG.getUNDEF(VT);
+ OpLo = DAG.getVectorShuffle(VT, DL, OpLo, Undef, ShufMask1);
+ OpHi = DAG.getVectorShuffle(VT, DL, OpHi, Undef, ShufMask1);
+
+ // The MOVLHPS mask:
+ static const int ShufMask2[] = {0, 1, 4, 5};
+ return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask2);
+ }
+
+ if ((VT == MVT::v8i16) && (SVT == MVT::v8i32)) {
+ // On AVX2, v8i32 -> v8i16 becomed PSHUFB.
+ if (Subtarget->hasInt256()) {
+ In = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, In);
+
+ SmallVector<SDValue,32> pshufbMask;
+ for (unsigned i = 0; i < 2; ++i) {
+ pshufbMask.push_back(DAG.getConstant(0x0, MVT::i8));
+ pshufbMask.push_back(DAG.getConstant(0x1, MVT::i8));
+ pshufbMask.push_back(DAG.getConstant(0x4, MVT::i8));
+ pshufbMask.push_back(DAG.getConstant(0x5, MVT::i8));
+ pshufbMask.push_back(DAG.getConstant(0x8, MVT::i8));
+ pshufbMask.push_back(DAG.getConstant(0x9, MVT::i8));
+ pshufbMask.push_back(DAG.getConstant(0xc, MVT::i8));
+ pshufbMask.push_back(DAG.getConstant(0xd, MVT::i8));
+ for (unsigned j = 0; j < 8; ++j)
+ pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
+ }
+ SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8,
+ &pshufbMask[0], 32);
+ In = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, In, BV);
+ In = DAG.getNode(ISD::BITCAST, DL, MVT::v4i64, In);
+
+ static const int ShufMask[] = {0, 2, -1, -1};
+ In = DAG.getVectorShuffle(MVT::v4i64, DL, In, DAG.getUNDEF(MVT::v4i64),
+ &ShufMask[0]);
+ In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
+ DAG.getIntPtrConstant(0));
+ return DAG.getNode(ISD::BITCAST, DL, VT, In);
+ }
+
+ SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
+ DAG.getIntPtrConstant(0));
+
+ SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
+ DAG.getIntPtrConstant(4));
+
+ OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, OpLo);
+ OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, OpHi);
+
+ // The PSHUFB mask:
+ static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13,
+ -1, -1, -1, -1, -1, -1, -1, -1};
+
+ SDValue Undef = DAG.getUNDEF(MVT::v16i8);
+ OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, Undef, ShufMask1);
+ OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, Undef, ShufMask1);
+
+ OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpLo);
+ OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpHi);
+
+ // The MOVLHPS Mask:
+ static const int ShufMask2[] = {0, 1, 4, 5};
+ SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
+ return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, res);
+ }
+
+ // Handle truncation of V256 to V128 using shuffles.
+ if (!VT.is128BitVector() || !SVT.is256BitVector())
+ return SDValue();
+
+ assert(VT.getVectorNumElements() != SVT.getVectorNumElements() &&
+ "Invalid op");
+ assert(Subtarget->hasFp256() && "256-bit vector without AVX!");
+
+ unsigned NumElems = VT.getVectorNumElements();
+ EVT NVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
+ NumElems * 2);
+
+ SmallVector<int, 16> MaskVec(NumElems * 2, -1);
+ // Prepare truncation shuffle mask
+ for (unsigned i = 0; i != NumElems; ++i)
+ MaskVec[i] = i * 2;
+ SDValue V = DAG.getVectorShuffle(NVT, DL,
+ DAG.getNode(ISD::BITCAST, DL, NVT, In),
+ DAG.getUNDEF(NVT), &MaskVec[0]);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
+ DAG.getIntPtrConstant(0));
+}
+
+SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op,
+ SelectionDAG &DAG) const {
+ MVT VT = Op.getValueType().getSimpleVT();
+ if (VT.isVector()) {
+ if (VT == MVT::v8i16)
+ return DAG.getNode(ISD::TRUNCATE, Op.getDebugLoc(), VT,
+ DAG.getNode(ISD::FP_TO_SINT, Op.getDebugLoc(),
+ MVT::v8i32, Op.getOperand(0)));
+ return SDValue();
+ }
+
+ std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
+ /*IsSigned=*/ true, /*IsReplace=*/ false);
+ SDValue FIST = Vals.first, StackSlot = Vals.second;
+ // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
+ if (FIST.getNode() == 0) return Op;
+
+ if (StackSlot.getNode())
+ // Load the result.
+ return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(),
+ FIST, StackSlot, MachinePointerInfo(),
+ false, false, false, 0);
+
+ // The node is the result.
+ return FIST;
+}
+
+SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op,
+ SelectionDAG &DAG) const {
+ std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
+ /*IsSigned=*/ false, /*IsReplace=*/ false);
+ SDValue FIST = Vals.first, StackSlot = Vals.second;
+ assert(FIST.getNode() && "Unexpected failure");
+
+ if (StackSlot.getNode())
+ // Load the result.
+ return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(),
+ FIST, StackSlot, MachinePointerInfo(),
+ false, false, false, 0);
+
+ // The node is the result.
+ return FIST;
+}
+
+static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
+ DebugLoc DL = Op.getDebugLoc();
+ MVT VT = Op.getValueType().getSimpleVT();
+ SDValue In = Op.getOperand(0);
+ MVT SVT = In.getValueType().getSimpleVT();
+
+ assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
+
+ return DAG.getNode(X86ISD::VFPEXT, DL, VT,
+ DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
+ In, DAG.getUNDEF(SVT)));
+}
+
+SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) const {
+ LLVMContext *Context = DAG.getContext();
+ DebugLoc dl = Op.getDebugLoc();
+ MVT VT = Op.getValueType().getSimpleVT();
+ MVT EltVT = VT;
+ unsigned NumElts = VT == MVT::f64 ? 2 : 4;
+ if (VT.isVector()) {
+ EltVT = VT.getVectorElementType();
+ NumElts = VT.getVectorNumElements();
+ }
+ Constant *C;
+ if (EltVT == MVT::f64)
+ C = ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
+ APInt(64, ~(1ULL << 63))));
+ else
+ C = ConstantFP::get(*Context, APFloat(APFloat::IEEEsingle,
+ APInt(32, ~(1U << 31))));
+ C = ConstantVector::getSplat(NumElts, C);
+ SDValue CPIdx = DAG.getConstantPool(C, getPointerTy());
+ unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
+ SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
+ MachinePointerInfo::getConstantPool(),
+ false, false, false, Alignment);
+ if (VT.isVector()) {
+ MVT ANDVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
+ return DAG.getNode(ISD::BITCAST, dl, VT,
+ DAG.getNode(ISD::AND, dl, ANDVT,
+ DAG.getNode(ISD::BITCAST, dl, ANDVT,
+ Op.getOperand(0)),
+ DAG.getNode(ISD::BITCAST, dl, ANDVT, Mask)));
+ }
+ return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask);
+}
+
+SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const {
+ LLVMContext *Context = DAG.getContext();
+ DebugLoc dl = Op.getDebugLoc();
+ MVT VT = Op.getValueType().getSimpleVT();
+ MVT EltVT = VT;
+ unsigned NumElts = VT == MVT::f64 ? 2 : 4;
+ if (VT.isVector()) {
+ EltVT = VT.getVectorElementType();
+ NumElts = VT.getVectorNumElements();
+ }
+ Constant *C;
+ if (EltVT == MVT::f64)
+ C = ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
+ APInt(64, 1ULL << 63)));
+ else
+ C = ConstantFP::get(*Context, APFloat(APFloat::IEEEsingle,
+ APInt(32, 1U << 31)));
+ C = ConstantVector::getSplat(NumElts, C);
+ SDValue CPIdx = DAG.getConstantPool(C, getPointerTy());
+ unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
+ SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
+ MachinePointerInfo::getConstantPool(),
+ false, false, false, Alignment);
+ if (VT.isVector()) {
+ MVT XORVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
+ return DAG.getNode(ISD::BITCAST, dl, VT,
+ DAG.getNode(ISD::XOR, dl, XORVT,
+ DAG.getNode(ISD::BITCAST, dl, XORVT,
+ Op.getOperand(0)),
+ DAG.getNode(ISD::BITCAST, dl, XORVT, Mask)));
+ }
+
+ return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask);
+}
+
+SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
+ LLVMContext *Context = DAG.getContext();
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+ DebugLoc dl = Op.getDebugLoc();
+ MVT VT = Op.getValueType().getSimpleVT();
+ MVT SrcVT = Op1.getValueType().getSimpleVT();
+
+ // If second operand is smaller, extend it first.
+ if (SrcVT.bitsLT(VT)) {
+ Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1);
+ SrcVT = VT;
+ }
+ // And if it is bigger, shrink it first.
+ if (SrcVT.bitsGT(VT)) {
+ Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1));
+ SrcVT = VT;
+ }
+
+ // At this point the operands and the result should have the same
+ // type, and that won't be f80 since that is not custom lowered.
+
+ // First get the sign bit of second operand.
+ SmallVector<Constant*,4> CV;
+ if (SrcVT == MVT::f64) {
+ const fltSemantics &Sem = APFloat::IEEEdouble;
+ CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(64, 1ULL << 63))));
+ CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(64, 0))));
+ } else {
+ const fltSemantics &Sem = APFloat::IEEEsingle;
+ CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 1U << 31))));
+ CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
+ CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
+ CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
+ }
+ Constant *C = ConstantVector::get(CV);
+ SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
+ SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx,
+ MachinePointerInfo::getConstantPool(),
+ false, false, false, 16);
+ SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1);
+
+ // Shift sign bit right or left if the two operands have different types.
+ if (SrcVT.bitsGT(VT)) {
+ // Op0 is MVT::f32, Op1 is MVT::f64.
+ SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit);
+ SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit,
+ DAG.getConstant(32, MVT::i32));
+ SignBit = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, SignBit);
+ SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit,
+ DAG.getIntPtrConstant(0));
+ }
+
+ // Clear first operand sign bit.
+ CV.clear();
+ if (VT == MVT::f64) {
+ const fltSemantics &Sem = APFloat::IEEEdouble;
+ CV.push_back(ConstantFP::get(*Context, APFloat(Sem,
+ APInt(64, ~(1ULL << 63)))));
+ CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(64, 0))));
+ } else {
+ const fltSemantics &Sem = APFloat::IEEEsingle;
+ CV.push_back(ConstantFP::get(*Context, APFloat(Sem,
+ APInt(32, ~(1U << 31)))));
+ CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
+ CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
+ CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
+ }
+ C = ConstantVector::get(CV);
+ CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
+ SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
+ MachinePointerInfo::getConstantPool(),
+ false, false, false, 16);
+ SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2);
+
+ // Or the value with the sign bit.
+ return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit);
+}
+
+static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
+ SDValue N0 = Op.getOperand(0);
+ DebugLoc dl = Op.getDebugLoc();
+ MVT VT = Op.getValueType().getSimpleVT();
+
+ // Lower ISD::FGETSIGN to (AND (X86ISD::FGETSIGNx86 ...) 1).
+ SDValue xFGETSIGN = DAG.getNode(X86ISD::FGETSIGNx86, dl, VT, N0,
+ DAG.getConstant(1, VT));
+ return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, VT));
+}
+
+// LowerVectorAllZeroTest - Check whether an OR'd tree is PTEST-able.
+//
+SDValue X86TargetLowering::LowerVectorAllZeroTest(SDValue Op,
+ SelectionDAG &DAG) const {
+ assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
+
+ if (!Subtarget->hasSSE41())
+ return SDValue();
+
+ if (!Op->hasOneUse())
+ return SDValue();
+
+ SDNode *N = Op.getNode();
+ DebugLoc DL = N->getDebugLoc();
+
+ SmallVector<SDValue, 8> Opnds;
+ DenseMap<SDValue, unsigned> VecInMap;
+ EVT VT = MVT::Other;
+
+ // Recognize a special case where a vector is casted into wide integer to
+ // test all 0s.
+ Opnds.push_back(N->getOperand(0));
+ Opnds.push_back(N->getOperand(1));
+
+ for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
+ SmallVector<SDValue, 8>::const_iterator I = Opnds.begin() + Slot;
+ // BFS traverse all OR'd operands.
+ if (I->getOpcode() == ISD::OR) {
+ Opnds.push_back(I->getOperand(0));
+ Opnds.push_back(I->getOperand(1));
+ // Re-evaluate the number of nodes to be traversed.
+ e += 2; // 2 more nodes (LHS and RHS) are pushed.
+ continue;
+ }
+
+ // Quit if a non-EXTRACT_VECTOR_ELT
+ if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+ return SDValue();
+
+ // Quit if without a constant index.
+ SDValue Idx = I->getOperand(1);
+ if (!isa<ConstantSDNode>(Idx))
+ return SDValue();
+
+ SDValue ExtractedFromVec = I->getOperand(0);
+ DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
+ if (M == VecInMap.end()) {
+ VT = ExtractedFromVec.getValueType();
+ // Quit if not 128/256-bit vector.
+ if (!VT.is128BitVector() && !VT.is256BitVector())
+ return SDValue();
+ // Quit if not the same type.
+ if (VecInMap.begin() != VecInMap.end() &&
+ VT != VecInMap.begin()->first.getValueType())
+ return SDValue();
+ M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
+ }
+ M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
+ }
+
+ assert((VT.is128BitVector() || VT.is256BitVector()) &&
+ "Not extracted from 128-/256-bit vector.");
+
+ unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;
+ SmallVector<SDValue, 8> VecIns;
+
+ for (DenseMap<SDValue, unsigned>::const_iterator
+ I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
+ // Quit if not all elements are used.
+ if (I->second != FullMask)
+ return SDValue();
+ VecIns.push_back(I->first);
+ }
+
+ EVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
+
+ // Cast all vectors into TestVT for PTEST.
+ for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
+ VecIns[i] = DAG.getNode(ISD::BITCAST, DL, TestVT, VecIns[i]);
+
+ // If more than one full vectors are evaluated, OR them first before PTEST.
+ for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
+ // Each iteration will OR 2 nodes and append the result until there is only
+ // 1 node left, i.e. the final OR'd value of all vectors.
+ SDValue LHS = VecIns[Slot];
+ SDValue RHS = VecIns[Slot + 1];
+ VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
+ }
+
+ return DAG.getNode(X86ISD::PTEST, DL, MVT::i32,
+ VecIns.back(), VecIns.back());
+}
+
+/// Emit nodes that will be selected as "test Op0,Op0", or something
+/// equivalent.
+SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
+ SelectionDAG &DAG) const {
+ DebugLoc dl = Op.getDebugLoc();
+
+ // CF and OF aren't always set the way we want. Determine which
+ // of these we need.
+ bool NeedCF = false;
+ bool NeedOF = false;
+ switch (X86CC) {
+ default: break;
+ case X86::COND_A: case X86::COND_AE:
+ case X86::COND_B: case X86::COND_BE:
+ NeedCF = true;
+ break;
+ case X86::COND_G: case X86::COND_GE:
+ case X86::COND_L: case X86::COND_LE:
+ case X86::COND_O: case X86::COND_NO:
+ NeedOF = true;
+ break;
+ }
+
+ // See if we can use the EFLAGS value from the operand instead of
+ // doing a separate TEST. TEST always sets OF and CF to 0, so unless
+ // we prove that the arithmetic won't overflow, we can't use OF or CF.
+ if (Op.getResNo() != 0 || NeedOF || NeedCF)
+ // Emit a CMP with 0, which is the TEST pattern.
+ return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
+ DAG.getConstant(0, Op.getValueType()));
+
+ unsigned Opcode = 0;
+ unsigned NumOperands = 0;
+
+ // Truncate operations may prevent the merge of the SETCC instruction
+ // and the arithmetic intruction before it. Attempt to truncate the operands
+ // of the arithmetic instruction and use a reduced bit-width instruction.
+ bool NeedTruncation = false;
+ SDValue ArithOp = Op;
+ if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
+ SDValue Arith = Op->getOperand(0);
+ // Both the trunc and the arithmetic op need to have one user each.
+ if (Arith->hasOneUse())
+ switch (Arith.getOpcode()) {
+ default: break;
+ case ISD::ADD:
+ case ISD::SUB:
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR: {
+ NeedTruncation = true;
+ ArithOp = Arith;
+ }
+ }
+ }
+
+ // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
+ // which may be the result of a CAST. We use the variable 'Op', which is the
+ // non-casted variable when we check for possible users.
+ switch (ArithOp.getOpcode()) {
+ case ISD::ADD:
+ // Due to an isel shortcoming, be conservative if this add is likely to be
+ // selected as part of a load-modify-store instruction. When the root node
+ // in a match is a store, isel doesn't know how to remap non-chain non-flag
+ // uses of other nodes in the match, such as the ADD in this case. This
+ // leads to the ADD being left around and reselected, with the result being
+ // two adds in the output. Alas, even if none our users are stores, that
+ // doesn't prove we're O.K. Ergo, if we have any parents that aren't
+ // CopyToReg or SETCC, eschew INC/DEC. A better fix seems to require
+ // climbing the DAG back to the root, and it doesn't seem to be worth the
+ // effort.
+ for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
+ UE = Op.getNode()->use_end(); UI != UE; ++UI)
+ if (UI->getOpcode() != ISD::CopyToReg &&
+ UI->getOpcode() != ISD::SETCC &&
+ UI->getOpcode() != ISD::STORE)
+ goto default_case;
+
+ if (ConstantSDNode *C =
+ dyn_cast<ConstantSDNode>(ArithOp.getNode()->getOperand(1))) {
+ // An add of one will be selected as an INC.
+ if (C->getAPIntValue() == 1) {
+ Opcode = X86ISD::INC;
+ NumOperands = 1;
+ break;
+ }
+
+ // An add of negative one (subtract of one) will be selected as a DEC.
+ if (C->getAPIntValue().isAllOnesValue()) {
+ Opcode = X86ISD::DEC;
+ NumOperands = 1;
+ break;
+ }
+ }
+
+ // Otherwise use a regular EFLAGS-setting add.
+ Opcode = X86ISD::ADD;
+ NumOperands = 2;
+ break;
+ case ISD::AND: {
+ // If the primary and result isn't used, don't bother using X86ISD::AND,
+ // because a TEST instruction will be better.
+ bool NonFlagUse = false;
+ for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
+ UE = Op.getNode()->use_end(); UI != UE; ++UI) {
+ SDNode *User = *UI;
+ unsigned UOpNo = UI.getOperandNo();
+ if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
+ // Look pass truncate.
+ UOpNo = User->use_begin().getOperandNo();
+ User = *User->use_begin();
+ }
+
+ if (User->getOpcode() != ISD::BRCOND &&
+ User->getOpcode() != ISD::SETCC &&
+ !(User->getOpcode() == ISD::SELECT && UOpNo == 0)) {
+ NonFlagUse = true;
+ break;
+ }
+ }
+
+ if (!NonFlagUse)
+ break;
+ }
+ // FALL THROUGH
+ case ISD::SUB:
+ case ISD::OR:
+ case ISD::XOR:
+ // Due to the ISEL shortcoming noted above, be conservative if this op is
+ // likely to be selected as part of a load-modify-store instruction.
+ for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
+ UE = Op.getNode()->use_end(); UI != UE; ++UI)
+ if (UI->getOpcode() == ISD::STORE)
+ goto default_case;
+
+ // Otherwise use a regular EFLAGS-setting instruction.
+ switch (ArithOp.getOpcode()) {
+ default: llvm_unreachable("unexpected operator!");
+ case ISD::SUB: Opcode = X86ISD::SUB; break;
+ case ISD::XOR: Opcode = X86ISD::XOR; break;
+ case ISD::AND: Opcode = X86ISD::AND; break;
+ case ISD::OR: {
+ if (!NeedTruncation && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
+ SDValue EFLAGS = LowerVectorAllZeroTest(Op, DAG);
+ if (EFLAGS.getNode())
+ return EFLAGS;
+ }
+ Opcode = X86ISD::OR;
+ break;
+ }
+ }
+
+ NumOperands = 2;
+ break;
+ case X86ISD::ADD:
+ case X86ISD::SUB:
+ case X86ISD::INC:
+ case X86ISD::DEC:
+ case X86ISD::OR:
+ case X86ISD::XOR:
+ case X86ISD::AND:
+ return SDValue(Op.getNode(), 1);
+ default:
+ default_case:
+ break;
+ }
+
+ // If we found that truncation is beneficial, perform the truncation and
+ // update 'Op'.
+ if (NeedTruncation) {
+ EVT VT = Op.getValueType();
+ SDValue WideVal = Op->getOperand(0);
+ EVT WideVT = WideVal.getValueType();
+ unsigned ConvertedOp = 0;
+ // Use a target machine opcode to prevent further DAGCombine
+ // optimizations that may separate the arithmetic operations
+ // from the setcc node.
+ switch (WideVal.getOpcode()) {
+ default: break;
+ case ISD::ADD: ConvertedOp = X86ISD::ADD; break;
+ case ISD::SUB: ConvertedOp = X86ISD::SUB; break;
+ case ISD::AND: ConvertedOp = X86ISD::AND; break;
+ case ISD::OR: ConvertedOp = X86ISD::OR; break;
+ case ISD::XOR: ConvertedOp = X86ISD::XOR; break;
+ }
+
+ if (ConvertedOp) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
+ SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
+ SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
+ Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1);
+ }
+ }
+ }
+
+ if (Opcode == 0)
+ // Emit a CMP with 0, which is the TEST pattern.
+ return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
+ DAG.getConstant(0, Op.getValueType()));
+
+ SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
+ SmallVector<SDValue, 4> Ops;
+ for (unsigned i = 0; i != NumOperands; ++i)
+ Ops.push_back(Op.getOperand(i));
+
+ SDValue New = DAG.getNode(Opcode, dl, VTs, &Ops[0], NumOperands);
+ DAG.ReplaceAllUsesWith(Op, New);
+ return SDValue(New.getNode(), 1);
+}
+
+/// Emit nodes that will be selected as "cmp Op0,Op1", or something
+/// equivalent.
+SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
+ SelectionDAG &DAG) const {
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1))
+ if (C->getAPIntValue() == 0)
+ return EmitTest(Op0, X86CC, DAG);
+
+ DebugLoc dl = Op0.getDebugLoc();
+ if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
+ Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
+ // Use SUB instead of CMP to enable CSE between SUB and CMP.
+ SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
+ SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs,
+ Op0, Op1);
+ return SDValue(Sub.getNode(), 1);
+ }
+ return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
+}
+
+/// Convert a comparison if required by the subtarget.
+SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
+ SelectionDAG &DAG) const {
+ // If the subtarget does not support the FUCOMI instruction, floating-point
+ // comparisons have to be converted.
+ if (Subtarget->hasCMov() ||
+ Cmp.getOpcode() != X86ISD::CMP ||
+ !Cmp.getOperand(0).getValueType().isFloatingPoint() ||
+ !Cmp.getOperand(1).getValueType().isFloatingPoint())
+ return Cmp;
+
+ // The instruction selector will select an FUCOM instruction instead of
+ // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
+ // build an SDNode sequence that transfers the result from FPSW into EFLAGS:
+ // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
+ DebugLoc dl = Cmp.getDebugLoc();
+ SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
+ SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
+ SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
+ DAG.getConstant(8, MVT::i8));
+ SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
+ return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
+}
+
+static bool isAllOnes(SDValue V) {
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
+ return C && C->isAllOnesValue();
+}
+
+/// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node
+/// if it's possible.
+SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
+ DebugLoc dl, SelectionDAG &DAG) const {
+ SDValue Op0 = And.getOperand(0);
+ SDValue Op1 = And.getOperand(1);
+ if (Op0.getOpcode() == ISD::TRUNCATE)
+ Op0 = Op0.getOperand(0);
+ if (Op1.getOpcode() == ISD::TRUNCATE)
+ Op1 = Op1.getOperand(0);
+
+ SDValue LHS, RHS;
+ if (Op1.getOpcode() == ISD::SHL)
+ std::swap(Op0, Op1);
+ if (Op0.getOpcode() == ISD::SHL) {
+ if (ConstantSDNode *And00C = dyn_cast<ConstantSDNode>(Op0.getOperand(0)))
+ if (And00C->getZExtValue() == 1) {
+ // If we looked past a truncate, check that it's only truncating away
+ // known zeros.
+ unsigned BitWidth = Op0.getValueSizeInBits();
+ unsigned AndBitWidth = And.getValueSizeInBits();
+ if (BitWidth > AndBitWidth) {
+ APInt Zeros, Ones;
+ DAG.ComputeMaskedBits(Op0, Zeros, Ones);
+ if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth)
+ return SDValue();
+ }
+ LHS = Op1;
+ RHS = Op0.getOperand(1);
+ }
+ } else if (Op1.getOpcode() == ISD::Constant) {
+ ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
+ uint64_t AndRHSVal = AndRHS->getZExtValue();
+ SDValue AndLHS = Op0;
+
+ if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
+ LHS = AndLHS.getOperand(0);
+ RHS = AndLHS.getOperand(1);
+ }
+
+ // Use BT if the immediate can't be encoded in a TEST instruction.
+ if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) {
+ LHS = AndLHS;
+ RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), LHS.getValueType());
+ }
+ }
+
+ if (LHS.getNode()) {
+ // If LHS is i8, promote it to i32 with any_extend. There is no i8 BT
+ // instruction. Since the shift amount is in-range-or-undefined, we know
+ // that doing a bittest on the i32 value is ok. We extend to i32 because
+ // the encoding for the i16 version is larger than the i32 version.
+ // Also promote i16 to i32 for performance / code size reason.
+ if (LHS.getValueType() == MVT::i8 ||
+ LHS.getValueType() == MVT::i16)
+ LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);
+
+ // If the operand types disagree, extend the shift amount to match. Since
+ // BT ignores high bits (like shifts) we can use anyextend.
+ if (LHS.getValueType() != RHS.getValueType())
+ RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS);
+
+ SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS);
+ X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
+ return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
+ DAG.getConstant(Cond, MVT::i8), BT);
+ }
+
+ return SDValue();
+}
+
+// Lower256IntVSETCC - Break a VSETCC 256-bit integer VSETCC into two new 128
+// ones, and then concatenate the result back.
+static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
+ MVT VT = Op.getValueType().getSimpleVT();
+
+ assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
+ "Unsupported value type for operation");
+
+ unsigned NumElems = VT.getVectorNumElements();
+ DebugLoc dl = Op.getDebugLoc();
+ SDValue CC = Op.getOperand(2);
+
+ // Extract the LHS vectors
+ SDValue LHS = Op.getOperand(0);
+ SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
+ SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
+
+ // Extract the RHS vectors
+ SDValue RHS = Op.getOperand(1);
+ SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl);
+ SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl);
+
+ // Issue the operation on the smaller types and concatenate the result back
+ MVT EltVT = VT.getVectorElementType();
+ MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
+ DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
+ DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
+}
+
+static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ SDValue Cond;
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+ SDValue CC = Op.getOperand(2);
+ MVT VT = Op.getValueType().getSimpleVT();
+ ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
+ bool isFP = Op.getOperand(1).getValueType().getSimpleVT().isFloatingPoint();
+ DebugLoc dl = Op.getDebugLoc();
+
+ if (isFP) {
+#ifndef NDEBUG
+ MVT EltVT = Op0.getValueType().getVectorElementType().getSimpleVT();
+ assert(EltVT == MVT::f32 || EltVT == MVT::f64);
+#endif
+
+ unsigned SSECC;
+ bool Swap = false;
+
+ // SSE Condition code mapping:
+ // 0 - EQ
+ // 1 - LT
+ // 2 - LE
+ // 3 - UNORD
+ // 4 - NEQ
+ // 5 - NLT
+ // 6 - NLE
+ // 7 - ORD
+ switch (SetCCOpcode) {
+ default: llvm_unreachable("Unexpected SETCC condition");
+ case ISD::SETOEQ:
+ case ISD::SETEQ: SSECC = 0; break;
+ case ISD::SETOGT:
+ case ISD::SETGT: Swap = true; // Fallthrough
+ case ISD::SETLT:
+ case ISD::SETOLT: SSECC = 1; break;
+ case ISD::SETOGE:
+ case ISD::SETGE: Swap = true; // Fallthrough
+ case ISD::SETLE:
+ case ISD::SETOLE: SSECC = 2; break;
+ case ISD::SETUO: SSECC = 3; break;
+ case ISD::SETUNE:
+ case ISD::SETNE: SSECC = 4; break;
+ case ISD::SETULE: Swap = true; // Fallthrough
+ case ISD::SETUGE: SSECC = 5; break;
+ case ISD::SETULT: Swap = true; // Fallthrough
+ case ISD::SETUGT: SSECC = 6; break;
+ case ISD::SETO: SSECC = 7; break;
+ case ISD::SETUEQ:
+ case ISD::SETONE: SSECC = 8; break;
+ }
+ if (Swap)
+ std::swap(Op0, Op1);
+
+ // In the two special cases we can't handle, emit two comparisons.
+ if (SSECC == 8) {
+ unsigned CC0, CC1;
+ unsigned CombineOpc;
+ if (SetCCOpcode == ISD::SETUEQ) {
+ CC0 = 3; CC1 = 0; CombineOpc = ISD::OR;
+ } else {
+ assert(SetCCOpcode == ISD::SETONE);
+ CC0 = 7; CC1 = 4; CombineOpc = ISD::AND;
+ }
+
+ SDValue Cmp0 = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
+ DAG.getConstant(CC0, MVT::i8));
+ SDValue Cmp1 = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
+ DAG.getConstant(CC1, MVT::i8));
+ return DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
+ }
+ // Handle all other FP comparisons here.
+ return DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
+ DAG.getConstant(SSECC, MVT::i8));
+ }
+
+ // Break 256-bit integer vector compare into smaller ones.
+ if (VT.is256BitVector() && !Subtarget->hasInt256())
+ return Lower256IntVSETCC(Op, DAG);
+
+ // We are handling one of the integer comparisons here. Since SSE only has
+ // GT and EQ comparisons for integer, swapping operands and multiple
+ // operations may be required for some comparisons.
+ unsigned Opc;
+ bool Swap = false, Invert = false, FlipSigns = false;
+
+ switch (SetCCOpcode) {
+ default: llvm_unreachable("Unexpected SETCC condition");
+ case ISD::SETNE: Invert = true;
+ case ISD::SETEQ: Opc = X86ISD::PCMPEQ; break;
+ case ISD::SETLT: Swap = true;
+ case ISD::SETGT: Opc = X86ISD::PCMPGT; break;
+ case ISD::SETGE: Swap = true;
+ case ISD::SETLE: Opc = X86ISD::PCMPGT; Invert = true; break;
+ case ISD::SETULT: Swap = true;
+ case ISD::SETUGT: Opc = X86ISD::PCMPGT; FlipSigns = true; break;
+ case ISD::SETUGE: Swap = true;
+ case ISD::SETULE: Opc = X86ISD::PCMPGT; FlipSigns = true; Invert = true; break;
+ }
+ if (Swap)
+ std::swap(Op0, Op1);
+
+ // Check that the operation in question is available (most are plain SSE2,
+ // but PCMPGTQ and PCMPEQQ have different requirements).
+ if (VT == MVT::v2i64) {
+ if (Opc == X86ISD::PCMPGT && !Subtarget->hasSSE42()) {
+ assert(Subtarget->hasSSE2() && "Don't know how to lower!");
+
+ // First cast everything to the right type.
+ Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0);
+ Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1);
+
+ // Since SSE has no unsigned integer comparisons, we need to flip the sign
+ // bits of the inputs before performing those operations. The lower
+ // compare is always unsigned.
+ SDValue SB;
+ if (FlipSigns) {
+ SB = DAG.getConstant(0x80000000U, MVT::v4i32);
+ } else {
+ SDValue Sign = DAG.getConstant(0x80000000U, MVT::i32);
+ SDValue Zero = DAG.getConstant(0x00000000U, MVT::i32);
+ SB = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
+ Sign, Zero, Sign, Zero);
+ }
+ Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB);
+ Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB);
+
+ // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
+ SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
+ SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
+
+ // Create masks for only the low parts/high parts of the 64 bit integers.
+ const int MaskHi[] = { 1, 1, 3, 3 };
+ const int MaskLo[] = { 0, 0, 2, 2 };
+ SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
+ SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
+ SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
+
+ SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
+ Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
+
+ if (Invert)
+ Result = DAG.getNOT(dl, Result, MVT::v4i32);
+
+ return DAG.getNode(ISD::BITCAST, dl, VT, Result);
+ }
+
+ if (Opc == X86ISD::PCMPEQ && !Subtarget->hasSSE41()) {
+ // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
+ // pcmpeqd + pshufd + pand.
+ assert(Subtarget->hasSSE2() && !FlipSigns && "Don't know how to lower!");
+
+ // First cast everything to the right type.
+ Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0);
+ Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1);
+
+ // Do the compare.
+ SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
+
+ // Make sure the lower and upper halves are both all-ones.
+ const int Mask[] = { 1, 0, 3, 2 };
+ SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
+ Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
+
+ if (Invert)
+ Result = DAG.getNOT(dl, Result, MVT::v4i32);
+
+ return DAG.getNode(ISD::BITCAST, dl, VT, Result);
+ }
+ }
+
+ // Since SSE has no unsigned integer comparisons, we need to flip the sign
+ // bits of the inputs before performing those operations.
+ if (FlipSigns) {
+ EVT EltVT = VT.getVectorElementType();
+ SDValue SB = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), VT);
+ Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SB);
+ Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SB);
+ }
+
+ SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
+
+ // If the logical-not of the result is required, perform that now.
+ if (Invert)
+ Result = DAG.getNOT(dl, Result, VT);
+
+ return Result;
+}
+
+SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
+
+ MVT VT = Op.getValueType().getSimpleVT();
+
+ if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
+
+ assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+ DebugLoc dl = Op.getDebugLoc();
+ ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+
+ // Optimize to BT if possible.
+ // Lower (X & (1 << N)) == 0 to BT(X, N).
+ // Lower ((X >>u N) & 1) != 0 to BT(X, N).
+ // Lower ((X >>s N) & 1) != 0 to BT(X, N).
+ if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() &&
+ Op1.getOpcode() == ISD::Constant &&
+ cast<ConstantSDNode>(Op1)->isNullValue() &&
+ (CC == ISD::SETEQ || CC == ISD::SETNE)) {
+ SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG);
+ if (NewSetCC.getNode())
+ return NewSetCC;
+ }
+
+ // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of
+ // these.
+ if (Op1.getOpcode() == ISD::Constant &&
+ (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 ||
+ cast<ConstantSDNode>(Op1)->isNullValue()) &&
+ (CC == ISD::SETEQ || CC == ISD::SETNE)) {
+
+ // If the input is a setcc, then reuse the input setcc or use a new one with
+ // the inverted condition.
+ if (Op0.getOpcode() == X86ISD::SETCC) {
+ X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
+ bool Invert = (CC == ISD::SETNE) ^
+ cast<ConstantSDNode>(Op1)->isNullValue();
+ if (!Invert) return Op0;
+
+ CCode = X86::GetOppositeBranchCondition(CCode);
+ return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
+ DAG.getConstant(CCode, MVT::i8), Op0.getOperand(1));
+ }
+ }
+
+ bool isFP = Op1.getValueType().getSimpleVT().isFloatingPoint();
+ unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG);
+ if (X86CC == X86::COND_INVALID)
+ return SDValue();
+
+ SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, DAG);
+ EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
+ return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
+ DAG.getConstant(X86CC, MVT::i8), EFLAGS);
+}
+
+// isX86LogicalCmp - Return true if opcode is a X86 logical comparison.
+static bool isX86LogicalCmp(SDValue Op) {
+ unsigned Opc = Op.getNode()->getOpcode();
+ if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
+ Opc == X86ISD::SAHF)
+ return true;
+ if (Op.getResNo() == 1 &&
+ (Opc == X86ISD::ADD ||
+ Opc == X86ISD::SUB ||
+ Opc == X86ISD::ADC ||
+ Opc == X86ISD::SBB ||
+ Opc == X86ISD::SMUL ||
+ Opc == X86ISD::UMUL ||
+ Opc == X86ISD::INC ||
+ Opc == X86ISD::DEC ||
+ Opc == X86ISD::OR ||
+ Opc == X86ISD::XOR ||
+ Opc == X86ISD::AND))
+ return true;
+
+ if (Op.getResNo() == 2 && Opc == X86ISD::UMUL)
+ return true;
+
+ return false;
+}
+
+static bool isZero(SDValue V) {
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
+ return C && C->isNullValue();
+}
+
+static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
+ if (V.getOpcode() != ISD::TRUNCATE)
+ return false;
+
+ SDValue VOp0 = V.getOperand(0);
+ unsigned InBits = VOp0.getValueSizeInBits();
+ unsigned Bits = V.getValueSizeInBits();
+ return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
+}
+
+SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
+ bool addTest = true;
+ SDValue Cond = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+ SDValue Op2 = Op.getOperand(2);
+ DebugLoc DL = Op.getDebugLoc();
+ SDValue CC;
+
+ if (Cond.getOpcode() == ISD::SETCC) {
+ SDValue NewCond = LowerSETCC(Cond, DAG);
+ if (NewCond.getNode())
+ Cond = NewCond;
+ }
+
+ // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
+ // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
+ // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
+ // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
+ if (Cond.getOpcode() == X86ISD::SETCC &&
+ Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
+ isZero(Cond.getOperand(1).getOperand(1))) {
+ SDValue Cmp = Cond.getOperand(1);
+
+ unsigned CondCode =cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
+
+ if ((isAllOnes(Op1) || isAllOnes(Op2)) &&
+ (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
+ SDValue Y = isAllOnes(Op2) ? Op1 : Op2;
+
+ SDValue CmpOp0 = Cmp.getOperand(0);
+ // Apply further optimizations for special cases
+ // (select (x != 0), -1, 0) -> neg & sbb
+ // (select (x == 0), 0, -1) -> neg & sbb
+ if (ConstantSDNode *YC = dyn_cast<ConstantSDNode>(Y))
+ if (YC->isNullValue() &&
+ (isAllOnes(Op1) == (CondCode == X86::COND_NE))) {
+ SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
+ SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs,
+ DAG.getConstant(0, CmpOp0.getValueType()),
+ CmpOp0);
+ SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
+ DAG.getConstant(X86::COND_B, MVT::i8),
+ SDValue(Neg.getNode(), 1));
+ return Res;
+ }
+
+ Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
+ CmpOp0, DAG.getConstant(1, CmpOp0.getValueType()));
+ Cmp = ConvertCmpIfNecessary(Cmp, DAG);
+
+ SDValue Res = // Res = 0 or -1.
+ DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
+ DAG.getConstant(X86::COND_B, MVT::i8), Cmp);
+
+ if (isAllOnes(Op1) != (CondCode == X86::COND_E))
+ Res = DAG.getNOT(DL, Res, Res.getValueType());
+
+ ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2);
+ if (N2C == 0 || !N2C->isNullValue())
+ Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
+ return Res;
+ }
+ }
+
+ // Look past (and (setcc_carry (cmp ...)), 1).
+ if (Cond.getOpcode() == ISD::AND &&
+ Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
+ if (C && C->getAPIntValue() == 1)
+ Cond = Cond.getOperand(0);
+ }
+
+ // If condition flag is set by a X86ISD::CMP, then use it as the condition
+ // setting operand in place of the X86ISD::SETCC.
+ unsigned CondOpcode = Cond.getOpcode();
+ if (CondOpcode == X86ISD::SETCC ||
+ CondOpcode == X86ISD::SETCC_CARRY) {
+ CC = Cond.getOperand(0);
+
+ SDValue Cmp = Cond.getOperand(1);
+ unsigned Opc = Cmp.getOpcode();
+ MVT VT = Op.getValueType().getSimpleVT();
+
+ bool IllegalFPCMov = false;
+ if (VT.isFloatingPoint() && !VT.isVector() &&
+ !isScalarFPTypeInSSEReg(VT)) // FPStack?
+ IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
+
+ if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
+ Opc == X86ISD::BT) { // FIXME
+ Cond = Cmp;
+ addTest = false;
+ }
+ } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
+ CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
+ ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
+ Cond.getOperand(0).getValueType() != MVT::i8)) {
+ SDValue LHS = Cond.getOperand(0);
+ SDValue RHS = Cond.getOperand(1);
+ unsigned X86Opcode;
+ unsigned X86Cond;
+ SDVTList VTs;
+ switch (CondOpcode) {
+ case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
+ case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
+ case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
+ case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
+ case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
+ case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
+ default: llvm_unreachable("unexpected overflowing operator");
+ }
+ if (CondOpcode == ISD::UMULO)
+ VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
+ MVT::i32);
+ else
+ VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
+
+ SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS);
+
+ if (CondOpcode == ISD::UMULO)
+ Cond = X86Op.getValue(2);
+ else
+ Cond = X86Op.getValue(1);
+
+ CC = DAG.getConstant(X86Cond, MVT::i8);
+ addTest = false;
+ }
+
+ if (addTest) {
+ // Look pass the truncate if the high bits are known zero.
+ if (isTruncWithZeroHighBitsInput(Cond, DAG))
+ Cond = Cond.getOperand(0);
+
+ // We know the result of AND is compared against zero. Try to match
+ // it to BT.
+ if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
+ SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG);
+ if (NewSetCC.getNode()) {
+ CC = NewSetCC.getOperand(0);
+ Cond = NewSetCC.getOperand(1);
+ addTest = false;
+ }
+ }
+ }
+
+ if (addTest) {
+ CC = DAG.getConstant(X86::COND_NE, MVT::i8);
+ Cond = EmitTest(Cond, X86::COND_NE, DAG);
+ }
+
+ // a < b ? -1 : 0 -> RES = ~setcc_carry
+ // a < b ? 0 : -1 -> RES = setcc_carry
+ // a >= b ? -1 : 0 -> RES = setcc_carry
+ // a >= b ? 0 : -1 -> RES = ~setcc_carry
+ if (Cond.getOpcode() == X86ISD::SUB) {
+ Cond = ConvertCmpIfNecessary(Cond, DAG);
+ unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
+
+ if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
+ (isAllOnes(Op1) || isAllOnes(Op2)) && (isZero(Op1) || isZero(Op2))) {
+ SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
+ DAG.getConstant(X86::COND_B, MVT::i8), Cond);
+ if (isAllOnes(Op1) != (CondCode == X86::COND_B))
+ return DAG.getNOT(DL, Res, Res.getValueType());
+ return Res;
+ }
+ }
+
+ // X86 doesn't have an i8 cmov. If both operands are the result of a truncate
+ // widen the cmov and push the truncate through. This avoids introducing a new
+ // branch during isel and doesn't add any extensions.
+ if (Op.getValueType() == MVT::i8 &&
+ Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
+ SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
+ if (T1.getValueType() == T2.getValueType() &&
+ // Blacklist CopyFromReg to avoid partial register stalls.
+ T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
+ SDVTList VTs = DAG.getVTList(T1.getValueType(), MVT::Glue);
+ SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VTs, T2, T1, CC, Cond);
+ return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
+ }
+ }
+
+ // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
+ // condition is true.
+ SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
+ SDValue Ops[] = { Op2, Op1, CC, Cond };
+ return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops, array_lengthof(Ops));
+}
+
+SDValue X86TargetLowering::LowerSIGN_EXTEND(SDValue Op,
+ SelectionDAG &DAG) const {
+ MVT VT = Op->getValueType(0).getSimpleVT();
+ SDValue In = Op->getOperand(0);
+ MVT InVT = In.getValueType().getSimpleVT();
+ DebugLoc dl = Op->getDebugLoc();
+
+ if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
+ (VT != MVT::v8i32 || InVT != MVT::v8i16))
+ return SDValue();
+
+ if (Subtarget->hasInt256())
+ return DAG.getNode(X86ISD::VSEXT_MOVL, dl, VT, In);
+
+ // Optimize vectors in AVX mode
+ // Sign extend v8i16 to v8i32 and
+ // v4i32 to v4i64
+ //
+ // Divide input vector into two parts
+ // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1}
+ // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
+ // concat the vectors to original VT
+
+ unsigned NumElems = InVT.getVectorNumElements();
+ SDValue Undef = DAG.getUNDEF(InVT);
+
+ SmallVector<int,8> ShufMask1(NumElems, -1);
+ for (unsigned i = 0; i != NumElems/2; ++i)
+ ShufMask1[i] = i;
+
+ SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask1[0]);
+
+ SmallVector<int,8> ShufMask2(NumElems, -1);
+ for (unsigned i = 0; i != NumElems/2; ++i)
+ ShufMask2[i] = i + NumElems/2;
+
+ SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask2[0]);
+
+ MVT HalfVT = MVT::getVectorVT(VT.getScalarType(),
+ VT.getVectorNumElements()/2);
+
+ OpLo = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpLo);
+ OpHi = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpHi);
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
+}
+
+// isAndOrOfSingleUseSetCCs - Return true if node is an ISD::AND or
+// ISD::OR of two X86ISD::SETCC nodes each of which has no other use apart
+// from the AND / OR.
+static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
+ Opc = Op.getOpcode();
+ if (Opc != ISD::OR && Opc != ISD::AND)
+ return false;
+ return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
+ Op.getOperand(0).hasOneUse() &&
+ Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
+ Op.getOperand(1).hasOneUse());
+}
+
+// isXor1OfSetCC - Return true if node is an ISD::XOR of a X86ISD::SETCC and
+// 1 and that the SETCC node has a single use.
+static bool isXor1OfSetCC(SDValue Op) {
+ if (Op.getOpcode() != ISD::XOR)
+ return false;
+ ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+ if (N1C && N1C->getAPIntValue() == 1) {
+ return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
+ Op.getOperand(0).hasOneUse();
+ }
+ return false;
+}
+
+SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
+ bool addTest = true;
+ SDValue Chain = Op.getOperand(0);
+ SDValue Cond = Op.getOperand(1);
+ SDValue Dest = Op.getOperand(2);
+ DebugLoc dl = Op.getDebugLoc();
+ SDValue CC;
+ bool Inverted = false;
+
+ if (Cond.getOpcode() == ISD::SETCC) {
+ // Check for setcc([su]{add,sub,mul}o == 0).
+ if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
+ isa<ConstantSDNode>(Cond.getOperand(1)) &&
+ cast<ConstantSDNode>(Cond.getOperand(1))->isNullValue() &&
+ Cond.getOperand(0).getResNo() == 1 &&
+ (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
+ Cond.getOperand(0).getOpcode() == ISD::UADDO ||
+ Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
+ Cond.getOperand(0).getOpcode() == ISD::USUBO ||
+ Cond.getOperand(0).getOpcode() == ISD::SMULO ||
+ Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
+ Inverted = true;
+ Cond = Cond.getOperand(0);
+ } else {
+ SDValue NewCond = LowerSETCC(Cond, DAG);
+ if (NewCond.getNode())
+ Cond = NewCond;
+ }
+ }
+#if 0
+ // FIXME: LowerXALUO doesn't handle these!!
+ else if (Cond.getOpcode() == X86ISD::ADD ||
+ Cond.getOpcode() == X86ISD::SUB ||
+ Cond.getOpcode() == X86ISD::SMUL ||
+ Cond.getOpcode() == X86ISD::UMUL)
+ Cond = LowerXALUO(Cond, DAG);
+#endif
+
+ // Look pass (and (setcc_carry (cmp ...)), 1).
+ if (Cond.getOpcode() == ISD::AND &&
+ Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
+ if (C && C->getAPIntValue() == 1)
+ Cond = Cond.getOperand(0);
+ }
+
+ // If condition flag is set by a X86ISD::CMP, then use it as the condition
+ // setting operand in place of the X86ISD::SETCC.
+ unsigned CondOpcode = Cond.getOpcode();
+ if (CondOpcode == X86ISD::SETCC ||
+ CondOpcode == X86ISD::SETCC_CARRY) {
+ CC = Cond.getOperand(0);
+
+ SDValue Cmp = Cond.getOperand(1);
+ unsigned Opc = Cmp.getOpcode();
+ // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
+ if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
+ Cond = Cmp;
+ addTest = false;
+ } else {
+ switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
+ default: break;
+ case X86::COND_O:
+ case X86::COND_B:
+ // These can only come from an arithmetic instruction with overflow,
+ // e.g. SADDO, UADDO.
+ Cond = Cond.getNode()->getOperand(1);
+ addTest = false;
+ break;
+ }
+ }
+ }
+ CondOpcode = Cond.getOpcode();
+ if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
+ CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
+ ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
+ Cond.getOperand(0).getValueType() != MVT::i8)) {
+ SDValue LHS = Cond.getOperand(0);
+ SDValue RHS = Cond.getOperand(1);
+ unsigned X86Opcode;
+ unsigned X86Cond;
+ SDVTList VTs;
+ switch (CondOpcode) {
+ case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
+ case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
+ case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
+ case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
+ case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
+ case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
+ default: llvm_unreachable("unexpected overflowing operator");
+ }
+ if (Inverted)
+ X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond);
+ if (CondOpcode == ISD::UMULO)
+ VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
+ MVT::i32);
+ else
+ VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
+
+ SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS);
+
+ if (CondOpcode == ISD::UMULO)
+ Cond = X86Op.getValue(2);
+ else
+ Cond = X86Op.getValue(1);
+
+ CC = DAG.getConstant(X86Cond, MVT::i8);
+ addTest = false;
+ } else {
+ unsigned CondOpc;
+ if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
+ SDValue Cmp = Cond.getOperand(0).getOperand(1);
+ if (CondOpc == ISD::OR) {
+ // Also, recognize the pattern generated by an FCMP_UNE. We can emit
+ // two branches instead of an explicit OR instruction with a
+ // separate test.
+ if (Cmp == Cond.getOperand(1).getOperand(1) &&
+ isX86LogicalCmp(Cmp)) {
+ CC = Cond.getOperand(0).getOperand(0);
+ Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
+ Chain, Dest, CC, Cmp);
+ CC = Cond.getOperand(1).getOperand(0);
+ Cond = Cmp;
+ addTest = false;
+ }
+ } else { // ISD::AND
+ // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
+ // two branches instead of an explicit AND instruction with a
+ // separate test. However, we only do this if this block doesn't
+ // have a fall-through edge, because this requires an explicit
+ // jmp when the condition is false.
+ if (Cmp == Cond.getOperand(1).getOperand(1) &&
+ isX86LogicalCmp(Cmp) &&
+ Op.getNode()->hasOneUse()) {
+ X86::CondCode CCode =
+ (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
+ CCode = X86::GetOppositeBranchCondition(CCode);
+ CC = DAG.getConstant(CCode, MVT::i8);
+ SDNode *User = *Op.getNode()->use_begin();
+ // Look for an unconditional branch following this conditional branch.
+ // We need this because we need to reverse the successors in order
+ // to implement FCMP_OEQ.
+ if (User->getOpcode() == ISD::BR) {
+ SDValue FalseBB = User->getOperand(1);
+ SDNode *NewBR =
+ DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
+ assert(NewBR == User);
+ (void)NewBR;
+ Dest = FalseBB;
+
+ Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
+ Chain, Dest, CC, Cmp);
+ X86::CondCode CCode =
+ (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
+ CCode = X86::GetOppositeBranchCondition(CCode);
+ CC = DAG.getConstant(CCode, MVT::i8);
+ Cond = Cmp;
+ addTest = false;
+ }
+ }
+ }
+ } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
+ // Recognize for xorb (setcc), 1 patterns. The xor inverts the condition.
+ // It should be transformed during dag combiner except when the condition
+ // is set by a arithmetics with overflow node.
+ X86::CondCode CCode =
+ (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
+ CCode = X86::GetOppositeBranchCondition(CCode);
+ CC = DAG.getConstant(CCode, MVT::i8);
+ Cond = Cond.getOperand(0).getOperand(1);
+ addTest = false;
+ } else if (Cond.getOpcode() == ISD::SETCC &&
+ cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
+ // For FCMP_OEQ, we can emit
+ // two branches instead of an explicit AND instruction with a
+ // separate test. However, we only do this if this block doesn't
+ // have a fall-through edge, because this requires an explicit
+ // jmp when the condition is false.
+ if (Op.getNode()->hasOneUse()) {
+ SDNode *User = *Op.getNode()->use_begin();
+ // Look for an unconditional branch following this conditional branch.
+ // We need this because we need to reverse the successors in order
+ // to implement FCMP_OEQ.
+ if (User->getOpcode() == ISD::BR) {
+ SDValue FalseBB = User->getOperand(1);
+ SDNode *NewBR =
+ DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
+ assert(NewBR == User);
+ (void)NewBR;
+ Dest = FalseBB;
+
+ SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
+ Cond.getOperand(0), Cond.getOperand(1));
+ Cmp = ConvertCmpIfNecessary(Cmp, DAG);
+ CC = DAG.getConstant(X86::COND_NE, MVT::i8);
+ Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
+ Chain, Dest, CC, Cmp);
+ CC = DAG.getConstant(X86::COND_P, MVT::i8);
+ Cond = Cmp;
+ addTest = false;
+ }
+ }
+ } else if (Cond.getOpcode() == ISD::SETCC &&
+ cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
+ // For FCMP_UNE, we can emit
+ // two branches instead of an explicit AND instruction with a
+ // separate test. However, we only do this if this block doesn't
+ // have a fall-through edge, because this requires an explicit
+ // jmp when the condition is false.
+ if (Op.getNode()->hasOneUse()) {
+ SDNode *User = *Op.getNode()->use_begin();
+ // Look for an unconditional branch following this conditional branch.
+ // We need this because we need to reverse the successors in order
+ // to implement FCMP_UNE.
+ if (User->getOpcode() == ISD::BR) {
+ SDValue FalseBB = User->getOperand(1);
+ SDNode *NewBR =
+ DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
+ assert(NewBR == User);
+ (void)NewBR;
+
+ SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
+ Cond.getOperand(0), Cond.getOperand(1));
+ Cmp = ConvertCmpIfNecessary(Cmp, DAG);
+ CC = DAG.getConstant(X86::COND_NE, MVT::i8);
+ Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
+ Chain, Dest, CC, Cmp);
+ CC = DAG.getConstant(X86::COND_NP, MVT::i8);
+ Cond = Cmp;
+ addTest = false;
+ Dest = FalseBB;
+ }
+ }
+ }
+ }
+
+ if (addTest) {
+ // Look pass the truncate if the high bits are known zero.
+ if (isTruncWithZeroHighBitsInput(Cond, DAG))
+ Cond = Cond.getOperand(0);
+
+ // We know the result of AND is compared against zero. Try to match
+ // it to BT.
+ if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
+ SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG);
+ if (NewSetCC.getNode()) {
+ CC = NewSetCC.getOperand(0);
+ Cond = NewSetCC.getOperand(1);
+ addTest = false;
+ }
+ }
+ }
+
+ if (addTest) {
+ CC = DAG.getConstant(X86::COND_NE, MVT::i8);
+ Cond = EmitTest(Cond, X86::COND_NE, DAG);
+ }
+ Cond = ConvertCmpIfNecessary(Cond, DAG);
+ return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
+ Chain, Dest, CC, Cond);
+}
+
+// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
+// Calls to _alloca is needed to probe the stack when allocating more than 4k
+// bytes in one go. Touching the stack at 4K increments is necessary to ensure
+// that the guard pages used by the OS virtual memory manager are allocated in
+// correct sequence.
+SDValue
+X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
+ SelectionDAG &DAG) const {
+ assert((Subtarget->isTargetCygMing() || Subtarget->isTargetWindows() ||
+ getTargetMachine().Options.EnableSegmentedStacks) &&
+ "This should be used only on Windows targets or when segmented stacks "
+ "are being used");
+ assert(!Subtarget->isTargetEnvMacho() && "Not implemented");
+ DebugLoc dl = Op.getDebugLoc();
+
+ // Get the inputs.
+ SDValue Chain = Op.getOperand(0);
+ SDValue Size = Op.getOperand(1);
+ // FIXME: Ensure alignment here
+
+ bool Is64Bit = Subtarget->is64Bit();
+ EVT SPTy = Is64Bit ? MVT::i64 : MVT::i32;
+
+ if (getTargetMachine().Options.EnableSegmentedStacks) {
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ if (Is64Bit) {
+ // The 64 bit implementation of segmented stacks needs to clobber both r10
+ // r11. This makes it impossible to use it along with nested parameters.
+ const Function *F = MF.getFunction();
+
+ for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
+ I != E; ++I)
+ if (I->hasNestAttr())
+ report_fatal_error("Cannot use segmented stacks with functions that "
+ "have nested arguments.");
+ }
+
+ const TargetRegisterClass *AddrRegClass =
+ getRegClassFor(Subtarget->is64Bit() ? MVT::i64:MVT::i32);
+ unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
+ Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
+ SDValue Value = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
+ DAG.getRegister(Vreg, SPTy));
+ SDValue Ops1[2] = { Value, Chain };
+ return DAG.getMergeValues(Ops1, 2, dl);
+ } else {
+ SDValue Flag;
+ unsigned Reg = (Subtarget->is64Bit() ? X86::RAX : X86::EAX);
+
+ Chain = DAG.getCopyToReg(Chain, dl, Reg, Size, Flag);
+ Flag = Chain.getValue(1);
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+
+ Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag);
+ Flag = Chain.getValue(1);
+
+ Chain = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
+ SPTy).getValue(1);
+
+ SDValue Ops1[2] = { Chain.getValue(0), Chain };
+ return DAG.getMergeValues(Ops1, 2, dl);
+ }
+}
+
+SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+
+ const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+ DebugLoc DL = Op.getDebugLoc();
+
+ if (!Subtarget->is64Bit() || Subtarget->isTargetWin64()) {
+ // vastart just stores the address of the VarArgsFrameIndex slot into the
+ // memory location argument.
+ SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
+ getPointerTy());
+ return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
+ MachinePointerInfo(SV), false, false, 0);
+ }
+
+ // __va_list_tag:
+ // gp_offset (0 - 6 * 8)
+ // fp_offset (48 - 48 + 8 * 16)
+ // overflow_arg_area (point to parameters coming in memory).
+ // reg_save_area
+ SmallVector<SDValue, 8> MemOps;
+ SDValue FIN = Op.getOperand(1);
+ // Store gp_offset
+ SDValue Store = DAG.getStore(Op.getOperand(0), DL,
+ DAG.getConstant(FuncInfo->getVarArgsGPOffset(),
+ MVT::i32),
+ FIN, MachinePointerInfo(SV), false, false, 0);
+ MemOps.push_back(Store);
+
+ // Store fp_offset
+ FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
+ FIN, DAG.getIntPtrConstant(4));
+ Store = DAG.getStore(Op.getOperand(0), DL,
+ DAG.getConstant(FuncInfo->getVarArgsFPOffset(),
+ MVT::i32),
+ FIN, MachinePointerInfo(SV, 4), false, false, 0);
+ MemOps.push_back(Store);
+
+ // Store ptr to overflow_arg_area
+ FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
+ FIN, DAG.getIntPtrConstant(4));
+ SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
+ getPointerTy());
+ Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN,
+ MachinePointerInfo(SV, 8),
+ false, false, 0);
+ MemOps.push_back(Store);
+
+ // Store ptr to reg_save_area.
+ FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
+ FIN, DAG.getIntPtrConstant(8));
+ SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
+ getPointerTy());
+ Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN,
+ MachinePointerInfo(SV, 16), false, false, 0);
+ MemOps.push_back(Store);
+ return DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
+ &MemOps[0], MemOps.size());
+}
+
+SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
+ assert(Subtarget->is64Bit() &&
+ "LowerVAARG only handles 64-bit va_arg!");
+ assert((Subtarget->isTargetLinux() ||
+ Subtarget->isTargetDarwin()) &&
+ "Unhandled target in LowerVAARG");
+ assert(Op.getNode()->getNumOperands() == 4);
+ SDValue Chain = Op.getOperand(0);
+ SDValue SrcPtr = Op.getOperand(1);
+ const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+ unsigned Align = Op.getConstantOperandVal(3);
+ DebugLoc dl = Op.getDebugLoc();
+
+ EVT ArgVT = Op.getNode()->getValueType(0);
+ Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
+ uint32_t ArgSize = getDataLayout()->getTypeAllocSize(ArgTy);
+ uint8_t ArgMode;
+
+ // Decide which area this value should be read from.
+ // TODO: Implement the AMD64 ABI in its entirety. This simple
+ // selection mechanism works only for the basic types.
+ if (ArgVT == MVT::f80) {
+ llvm_unreachable("va_arg for f80 not yet implemented");
+ } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
+ ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
+ } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
+ ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
+ } else {
+ llvm_unreachable("Unhandled argument type in LowerVAARG");
+ }
+
+ if (ArgMode == 2) {
+ // Sanity Check: Make sure using fp_offset makes sense.
+ assert(!getTargetMachine().Options.UseSoftFloat &&
+ !(DAG.getMachineFunction()
+ .getFunction()->getAttributes()
+ .hasAttribute(AttributeSet::FunctionIndex,
+ Attribute::NoImplicitFloat)) &&
+ Subtarget->hasSSE1());
+ }
+
+ // Insert VAARG_64 node into the DAG
+ // VAARG_64 returns two values: Variable Argument Address, Chain
+ SmallVector<SDValue, 11> InstOps;
+ InstOps.push_back(Chain);
+ InstOps.push_back(SrcPtr);
+ InstOps.push_back(DAG.getConstant(ArgSize, MVT::i32));
+ InstOps.push_back(DAG.getConstant(ArgMode, MVT::i8));
+ InstOps.push_back(DAG.getConstant(Align, MVT::i32));
+ SDVTList VTs = DAG.getVTList(getPointerTy(), MVT::Other);
+ SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl,
+ VTs, &InstOps[0], InstOps.size(),
+ MVT::i64,
+ MachinePointerInfo(SV),
+ /*Align=*/0,
+ /*Volatile=*/false,
+ /*ReadMem=*/true,
+ /*WriteMem=*/true);
+ Chain = VAARG.getValue(1);
+
+ // Load the next argument and return it
+ return DAG.getLoad(ArgVT, dl,
+ Chain,
+ VAARG,
+ MachinePointerInfo(),
+ false, false, false, 0);
+}
+
+static SDValue LowerVACOPY(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ // X86-64 va_list is a struct { i32, i32, i8*, i8* }.
+ assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!");
+ SDValue Chain = Op.getOperand(0);
+ SDValue DstPtr = Op.getOperand(1);
+ SDValue SrcPtr = Op.getOperand(2);
+ const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
+ const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
+ DebugLoc DL = Op.getDebugLoc();
+
+ return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
+ DAG.getIntPtrConstant(24), 8, /*isVolatile*/false,
+ false,
+ MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
+}
+
+// getTargetVShiftNode - Handle vector element shifts where the shift amount
+// may or may not be a constant. Takes immediate version of shift as input.
+static SDValue getTargetVShiftNode(unsigned Opc, DebugLoc dl, EVT VT,
+ SDValue SrcOp, SDValue ShAmt,
+ SelectionDAG &DAG) {
+ assert(ShAmt.getValueType() == MVT::i32 && "ShAmt is not i32");
+
+ if (isa<ConstantSDNode>(ShAmt)) {
+ // Constant may be a TargetConstant. Use a regular constant.
+ uint32_t ShiftAmt = cast<ConstantSDNode>(ShAmt)->getZExtValue();
+ switch (Opc) {
+ default: llvm_unreachable("Unknown target vector shift node");
+ case X86ISD::VSHLI:
+ case X86ISD::VSRLI:
+ case X86ISD::VSRAI:
+ return DAG.getNode(Opc, dl, VT, SrcOp,
+ DAG.getConstant(ShiftAmt, MVT::i32));
+ }
+ }
+
+ // Change opcode to non-immediate version
+ switch (Opc) {
+ default: llvm_unreachable("Unknown target vector shift node");
+ case X86ISD::VSHLI: Opc = X86ISD::VSHL; break;
+ case X86ISD::VSRLI: Opc = X86ISD::VSRL; break;
+ case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;
+ }
+
+ // Need to build a vector containing shift amount
+ // Shift amount is 32-bits, but SSE instructions read 64-bit, so fill with 0
+ SDValue ShOps[4];
+ ShOps[0] = ShAmt;
+ ShOps[1] = DAG.getConstant(0, MVT::i32);
+ ShOps[2] = ShOps[3] = DAG.getUNDEF(MVT::i32);
+ ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, &ShOps[0], 4);
+
+ // The return type has to be a 128-bit type with the same element
+ // type as the input type.
+ MVT EltVT = VT.getVectorElementType().getSimpleVT();
+ EVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());
+
+ ShAmt = DAG.getNode(ISD::BITCAST, dl, ShVT, ShAmt);
+ return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
+}
+
+static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
+ DebugLoc dl = Op.getDebugLoc();
+ unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ switch (IntNo) {
+ default: return SDValue(); // Don't custom lower most intrinsics.
+ // Comparison intrinsics.
+ case Intrinsic::x86_sse_comieq_ss:
+ case Intrinsic::x86_sse_comilt_ss:
+ case Intrinsic::x86_sse_comile_ss:
+ case Intrinsic::x86_sse_comigt_ss:
+ case Intrinsic::x86_sse_comige_ss:
+ case Intrinsic::x86_sse_comineq_ss:
+ case Intrinsic::x86_sse_ucomieq_ss:
+ case Intrinsic::x86_sse_ucomilt_ss:
+ case Intrinsic::x86_sse_ucomile_ss:
+ case Intrinsic::x86_sse_ucomigt_ss:
+ case Intrinsic::x86_sse_ucomige_ss:
+ case Intrinsic::x86_sse_ucomineq_ss:
+ case Intrinsic::x86_sse2_comieq_sd:
+ case Intrinsic::x86_sse2_comilt_sd:
+ case Intrinsic::x86_sse2_comile_sd:
+ case Intrinsic::x86_sse2_comigt_sd:
+ case Intrinsic::x86_sse2_comige_sd:
+ case Intrinsic::x86_sse2_comineq_sd:
+ case Intrinsic::x86_sse2_ucomieq_sd:
+ case Intrinsic::x86_sse2_ucomilt_sd:
+ case Intrinsic::x86_sse2_ucomile_sd:
+ case Intrinsic::x86_sse2_ucomigt_sd:
+ case Intrinsic::x86_sse2_ucomige_sd:
+ case Intrinsic::x86_sse2_ucomineq_sd: {
+ unsigned Opc;
+ ISD::CondCode CC;
+ switch (IntNo) {
+ default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
+ case Intrinsic::x86_sse_comieq_ss:
+ case Intrinsic::x86_sse2_comieq_sd:
+ Opc = X86ISD::COMI;
+ CC = ISD::SETEQ;
+ break;
+ case Intrinsic::x86_sse_comilt_ss:
+ case Intrinsic::x86_sse2_comilt_sd:
+ Opc = X86ISD::COMI;
+ CC = ISD::SETLT;
+ break;
+ case Intrinsic::x86_sse_comile_ss:
+ case Intrinsic::x86_sse2_comile_sd:
+ Opc = X86ISD::COMI;
+ CC = ISD::SETLE;
+ break;
+ case Intrinsic::x86_sse_comigt_ss:
+ case Intrinsic::x86_sse2_comigt_sd:
+ Opc = X86ISD::COMI;
+ CC = ISD::SETGT;
+ break;
+ case Intrinsic::x86_sse_comige_ss:
+ case Intrinsic::x86_sse2_comige_sd:
+ Opc = X86ISD::COMI;
+ CC = ISD::SETGE;
+ break;
+ case Intrinsic::x86_sse_comineq_ss:
+ case Intrinsic::x86_sse2_comineq_sd:
+ Opc = X86ISD::COMI;
+ CC = ISD::SETNE;
+ break;
+ case Intrinsic::x86_sse_ucomieq_ss:
+ case Intrinsic::x86_sse2_ucomieq_sd:
+ Opc = X86ISD::UCOMI;
+ CC = ISD::SETEQ;
+ break;
+ case Intrinsic::x86_sse_ucomilt_ss:
+ case Intrinsic::x86_sse2_ucomilt_sd:
+ Opc = X86ISD::UCOMI;
+ CC = ISD::SETLT;
+ break;
+ case Intrinsic::x86_sse_ucomile_ss:
+ case Intrinsic::x86_sse2_ucomile_sd:
+ Opc = X86ISD::UCOMI;
+ CC = ISD::SETLE;
+ break;
+ case Intrinsic::x86_sse_ucomigt_ss:
+ case Intrinsic::x86_sse2_ucomigt_sd:
+ Opc = X86ISD::UCOMI;
+ CC = ISD::SETGT;
+ break;
+ case Intrinsic::x86_sse_ucomige_ss:
+ case Intrinsic::x86_sse2_ucomige_sd:
+ Opc = X86ISD::UCOMI;
+ CC = ISD::SETGE;
+ break;
+ case Intrinsic::x86_sse_ucomineq_ss:
+ case Intrinsic::x86_sse2_ucomineq_sd:
+ Opc = X86ISD::UCOMI;
+ CC = ISD::SETNE;
+ break;
+ }
+
+ SDValue LHS = Op.getOperand(1);
+ SDValue RHS = Op.getOperand(2);
+ unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG);
+ assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!");
+ SDValue Cond = DAG.getNode(Opc, dl, MVT::i32, LHS, RHS);
+ SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
+ DAG.getConstant(X86CC, MVT::i8), Cond);
+ return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
+ }
+
+ // Arithmetic intrinsics.
+ case Intrinsic::x86_sse2_pmulu_dq:
+ case Intrinsic::x86_avx2_pmulu_dq:
+ return DAG.getNode(X86ISD::PMULUDQ, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+
+ // SSE2/AVX2 sub with unsigned saturation intrinsics
+ case Intrinsic::x86_sse2_psubus_b:
+ case Intrinsic::x86_sse2_psubus_w:
+ case Intrinsic::x86_avx2_psubus_b:
+ case Intrinsic::x86_avx2_psubus_w:
+ return DAG.getNode(X86ISD::SUBUS, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+
+ // SSE3/AVX horizontal add/sub intrinsics
+ case Intrinsic::x86_sse3_hadd_ps:
+ case Intrinsic::x86_sse3_hadd_pd:
+ case Intrinsic::x86_avx_hadd_ps_256:
+ case Intrinsic::x86_avx_hadd_pd_256:
+ case Intrinsic::x86_sse3_hsub_ps:
+ case Intrinsic::x86_sse3_hsub_pd:
+ case Intrinsic::x86_avx_hsub_ps_256:
+ case Intrinsic::x86_avx_hsub_pd_256:
+ case Intrinsic::x86_ssse3_phadd_w_128:
+ case Intrinsic::x86_ssse3_phadd_d_128:
+ case Intrinsic::x86_avx2_phadd_w:
+ case Intrinsic::x86_avx2_phadd_d:
+ case Intrinsic::x86_ssse3_phsub_w_128:
+ case Intrinsic::x86_ssse3_phsub_d_128:
+ case Intrinsic::x86_avx2_phsub_w:
+ case Intrinsic::x86_avx2_phsub_d: {
+ unsigned Opcode;
+ switch (IntNo) {
+ default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
+ case Intrinsic::x86_sse3_hadd_ps:
+ case Intrinsic::x86_sse3_hadd_pd:
+ case Intrinsic::x86_avx_hadd_ps_256:
+ case Intrinsic::x86_avx_hadd_pd_256:
+ Opcode = X86ISD::FHADD;
+ break;
+ case Intrinsic::x86_sse3_hsub_ps:
+ case Intrinsic::x86_sse3_hsub_pd:
+ case Intrinsic::x86_avx_hsub_ps_256:
+ case Intrinsic::x86_avx_hsub_pd_256:
+ Opcode = X86ISD::FHSUB;
+ break;
+ case Intrinsic::x86_ssse3_phadd_w_128:
+ case Intrinsic::x86_ssse3_phadd_d_128:
+ case Intrinsic::x86_avx2_phadd_w:
+ case Intrinsic::x86_avx2_phadd_d:
+ Opcode = X86ISD::HADD;
+ break;
+ case Intrinsic::x86_ssse3_phsub_w_128:
+ case Intrinsic::x86_ssse3_phsub_d_128:
+ case Intrinsic::x86_avx2_phsub_w:
+ case Intrinsic::x86_avx2_phsub_d:
+ Opcode = X86ISD::HSUB;
+ break;
+ }
+ return DAG.getNode(Opcode, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+ }
+
+ // SSE2/SSE41/AVX2 integer max/min intrinsics.
+ case Intrinsic::x86_sse2_pmaxu_b:
+ case Intrinsic::x86_sse41_pmaxuw:
+ case Intrinsic::x86_sse41_pmaxud:
+ case Intrinsic::x86_avx2_pmaxu_b:
+ case Intrinsic::x86_avx2_pmaxu_w:
+ case Intrinsic::x86_avx2_pmaxu_d:
+ case Intrinsic::x86_sse2_pminu_b:
+ case Intrinsic::x86_sse41_pminuw:
+ case Intrinsic::x86_sse41_pminud:
+ case Intrinsic::x86_avx2_pminu_b:
+ case Intrinsic::x86_avx2_pminu_w:
+ case Intrinsic::x86_avx2_pminu_d:
+ case Intrinsic::x86_sse41_pmaxsb:
+ case Intrinsic::x86_sse2_pmaxs_w:
+ case Intrinsic::x86_sse41_pmaxsd:
+ case Intrinsic::x86_avx2_pmaxs_b:
+ case Intrinsic::x86_avx2_pmaxs_w:
+ case Intrinsic::x86_avx2_pmaxs_d:
+ case Intrinsic::x86_sse41_pminsb:
+ case Intrinsic::x86_sse2_pmins_w:
+ case Intrinsic::x86_sse41_pminsd:
+ case Intrinsic::x86_avx2_pmins_b:
+ case Intrinsic::x86_avx2_pmins_w:
+ case Intrinsic::x86_avx2_pmins_d: {
+ unsigned Opcode;
+ switch (IntNo) {
+ default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
+ case Intrinsic::x86_sse2_pmaxu_b:
+ case Intrinsic::x86_sse41_pmaxuw:
+ case Intrinsic::x86_sse41_pmaxud:
+ case Intrinsic::x86_avx2_pmaxu_b:
+ case Intrinsic::x86_avx2_pmaxu_w:
+ case Intrinsic::x86_avx2_pmaxu_d:
+ Opcode = X86ISD::UMAX;
+ break;
+ case Intrinsic::x86_sse2_pminu_b:
+ case Intrinsic::x86_sse41_pminuw:
+ case Intrinsic::x86_sse41_pminud:
+ case Intrinsic::x86_avx2_pminu_b:
+ case Intrinsic::x86_avx2_pminu_w:
+ case Intrinsic::x86_avx2_pminu_d:
+ Opcode = X86ISD::UMIN;
+ break;
+ case Intrinsic::x86_sse41_pmaxsb:
+ case Intrinsic::x86_sse2_pmaxs_w:
+ case Intrinsic::x86_sse41_pmaxsd:
+ case Intrinsic::x86_avx2_pmaxs_b:
+ case Intrinsic::x86_avx2_pmaxs_w:
+ case Intrinsic::x86_avx2_pmaxs_d:
+ Opcode = X86ISD::SMAX;
+ break;
+ case Intrinsic::x86_sse41_pminsb:
+ case Intrinsic::x86_sse2_pmins_w:
+ case Intrinsic::x86_sse41_pminsd:
+ case Intrinsic::x86_avx2_pmins_b:
+ case Intrinsic::x86_avx2_pmins_w:
+ case Intrinsic::x86_avx2_pmins_d:
+ Opcode = X86ISD::SMIN;
+ break;
+ }
+ return DAG.getNode(Opcode, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+ }
+
+ // SSE/SSE2/AVX floating point max/min intrinsics.
+ case Intrinsic::x86_sse_max_ps:
+ case Intrinsic::x86_sse2_max_pd:
+ case Intrinsic::x86_avx_max_ps_256:
+ case Intrinsic::x86_avx_max_pd_256:
+ case Intrinsic::x86_sse_min_ps:
+ case Intrinsic::x86_sse2_min_pd:
+ case Intrinsic::x86_avx_min_ps_256:
+ case Intrinsic::x86_avx_min_pd_256: {
+ unsigned Opcode;
+ switch (IntNo) {
+ default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
+ case Intrinsic::x86_sse_max_ps:
+ case Intrinsic::x86_sse2_max_pd:
+ case Intrinsic::x86_avx_max_ps_256:
+ case Intrinsic::x86_avx_max_pd_256:
+ Opcode = X86ISD::FMAX;
+ break;
+ case Intrinsic::x86_sse_min_ps:
+ case Intrinsic::x86_sse2_min_pd:
+ case Intrinsic::x86_avx_min_ps_256:
+ case Intrinsic::x86_avx_min_pd_256:
+ Opcode = X86ISD::FMIN;
+ break;
+ }
+ return DAG.getNode(Opcode, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+ }
+
+ // AVX2 variable shift intrinsics
+ case Intrinsic::x86_avx2_psllv_d:
+ case Intrinsic::x86_avx2_psllv_q:
+ case Intrinsic::x86_avx2_psllv_d_256:
+ case Intrinsic::x86_avx2_psllv_q_256:
+ case Intrinsic::x86_avx2_psrlv_d:
+ case Intrinsic::x86_avx2_psrlv_q:
+ case Intrinsic::x86_avx2_psrlv_d_256:
+ case Intrinsic::x86_avx2_psrlv_q_256:
+ case Intrinsic::x86_avx2_psrav_d:
+ case Intrinsic::x86_avx2_psrav_d_256: {
+ unsigned Opcode;
+ switch (IntNo) {
+ default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
+ case Intrinsic::x86_avx2_psllv_d:
+ case Intrinsic::x86_avx2_psllv_q:
+ case Intrinsic::x86_avx2_psllv_d_256:
+ case Intrinsic::x86_avx2_psllv_q_256:
+ Opcode = ISD::SHL;
+ break;
+ case Intrinsic::x86_avx2_psrlv_d:
+ case Intrinsic::x86_avx2_psrlv_q:
+ case Intrinsic::x86_avx2_psrlv_d_256:
+ case Intrinsic::x86_avx2_psrlv_q_256:
+ Opcode = ISD::SRL;
+ break;
+ case Intrinsic::x86_avx2_psrav_d:
+ case Intrinsic::x86_avx2_psrav_d_256:
+ Opcode = ISD::SRA;
+ break;
+ }
+ return DAG.getNode(Opcode, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+ }
+
+ case Intrinsic::x86_ssse3_pshuf_b_128:
+ case Intrinsic::x86_avx2_pshuf_b:
+ return DAG.getNode(X86ISD::PSHUFB, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+
+ case Intrinsic::x86_ssse3_psign_b_128:
+ case Intrinsic::x86_ssse3_psign_w_128:
+ case Intrinsic::x86_ssse3_psign_d_128:
+ case Intrinsic::x86_avx2_psign_b:
+ case Intrinsic::x86_avx2_psign_w:
+ case Intrinsic::x86_avx2_psign_d:
+ return DAG.getNode(X86ISD::PSIGN, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+
+ case Intrinsic::x86_sse41_insertps:
+ return DAG.getNode(X86ISD::INSERTPS, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+
+ case Intrinsic::x86_avx_vperm2f128_ps_256:
+ case Intrinsic::x86_avx_vperm2f128_pd_256:
+ case Intrinsic::x86_avx_vperm2f128_si_256:
+ case Intrinsic::x86_avx2_vperm2i128:
+ return DAG.getNode(X86ISD::VPERM2X128, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+
+ case Intrinsic::x86_avx2_permd:
+ case Intrinsic::x86_avx2_permps:
+ // Operands intentionally swapped. Mask is last operand to intrinsic,
+ // but second operand for node/intruction.
+ return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),
+ Op.getOperand(2), Op.getOperand(1));
+
+ case Intrinsic::x86_sse_sqrt_ps:
+ case Intrinsic::x86_sse2_sqrt_pd:
+ case Intrinsic::x86_avx_sqrt_ps_256:
+ case Intrinsic::x86_avx_sqrt_pd_256:
+ return DAG.getNode(ISD::FSQRT, dl, Op.getValueType(), Op.getOperand(1));
+
+ // ptest and testp intrinsics. The intrinsic these come from are designed to
+ // return an integer value, not just an instruction so lower it to the ptest
+ // or testp pattern and a setcc for the result.
+ case Intrinsic::x86_sse41_ptestz:
+ case Intrinsic::x86_sse41_ptestc:
+ case Intrinsic::x86_sse41_ptestnzc:
+ case Intrinsic::x86_avx_ptestz_256:
+ case Intrinsic::x86_avx_ptestc_256:
+ case Intrinsic::x86_avx_ptestnzc_256:
+ case Intrinsic::x86_avx_vtestz_ps:
+ case Intrinsic::x86_avx_vtestc_ps:
+ case Intrinsic::x86_avx_vtestnzc_ps:
+ case Intrinsic::x86_avx_vtestz_pd:
+ case Intrinsic::x86_avx_vtestc_pd:
+ case Intrinsic::x86_avx_vtestnzc_pd:
+ case Intrinsic::x86_avx_vtestz_ps_256:
+ case Intrinsic::x86_avx_vtestc_ps_256:
+ case Intrinsic::x86_avx_vtestnzc_ps_256:
+ case Intrinsic::x86_avx_vtestz_pd_256:
+ case Intrinsic::x86_avx_vtestc_pd_256:
+ case Intrinsic::x86_avx_vtestnzc_pd_256: {
+ bool IsTestPacked = false;
+ unsigned X86CC;
+ switch (IntNo) {
+ default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
+ case Intrinsic::x86_avx_vtestz_ps:
+ case Intrinsic::x86_avx_vtestz_pd:
+ case Intrinsic::x86_avx_vtestz_ps_256:
+ case Intrinsic::x86_avx_vtestz_pd_256:
+ IsTestPacked = true; // Fallthrough
+ case Intrinsic::x86_sse41_ptestz:
+ case Intrinsic::x86_avx_ptestz_256:
+ // ZF = 1
+ X86CC = X86::COND_E;
+ break;
+ case Intrinsic::x86_avx_vtestc_ps:
+ case Intrinsic::x86_avx_vtestc_pd:
+ case Intrinsic::x86_avx_vtestc_ps_256:
+ case Intrinsic::x86_avx_vtestc_pd_256:
+ IsTestPacked = true; // Fallthrough
+ case Intrinsic::x86_sse41_ptestc:
+ case Intrinsic::x86_avx_ptestc_256:
+ // CF = 1
+ X86CC = X86::COND_B;
+ break;
+ case Intrinsic::x86_avx_vtestnzc_ps:
+ case Intrinsic::x86_avx_vtestnzc_pd:
+ case Intrinsic::x86_avx_vtestnzc_ps_256:
+ case Intrinsic::x86_avx_vtestnzc_pd_256:
+ IsTestPacked = true; // Fallthrough
+ case Intrinsic::x86_sse41_ptestnzc:
+ case Intrinsic::x86_avx_ptestnzc_256:
+ // ZF and CF = 0
+ X86CC = X86::COND_A;
+ break;
+ }
+
+ SDValue LHS = Op.getOperand(1);
+ SDValue RHS = Op.getOperand(2);
+ unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST;
+ SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
+ SDValue CC = DAG.getConstant(X86CC, MVT::i8);
+ SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
+ return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
+ }
+
+ // SSE/AVX shift intrinsics
+ case Intrinsic::x86_sse2_psll_w:
+ case Intrinsic::x86_sse2_psll_d:
+ case Intrinsic::x86_sse2_psll_q:
+ case Intrinsic::x86_avx2_psll_w:
+ case Intrinsic::x86_avx2_psll_d:
+ case Intrinsic::x86_avx2_psll_q:
+ case Intrinsic::x86_sse2_psrl_w:
+ case Intrinsic::x86_sse2_psrl_d:
+ case Intrinsic::x86_sse2_psrl_q:
+ case Intrinsic::x86_avx2_psrl_w:
+ case Intrinsic::x86_avx2_psrl_d:
+ case Intrinsic::x86_avx2_psrl_q:
+ case Intrinsic::x86_sse2_psra_w:
+ case Intrinsic::x86_sse2_psra_d:
+ case Intrinsic::x86_avx2_psra_w:
+ case Intrinsic::x86_avx2_psra_d: {
+ unsigned Opcode;
+ switch (IntNo) {
+ default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
+ case Intrinsic::x86_sse2_psll_w:
+ case Intrinsic::x86_sse2_psll_d:
+ case Intrinsic::x86_sse2_psll_q:
+ case Intrinsic::x86_avx2_psll_w:
+ case Intrinsic::x86_avx2_psll_d:
+ case Intrinsic::x86_avx2_psll_q:
+ Opcode = X86ISD::VSHL;
+ break;
+ case Intrinsic::x86_sse2_psrl_w:
+ case Intrinsic::x86_sse2_psrl_d:
+ case Intrinsic::x86_sse2_psrl_q:
+ case Intrinsic::x86_avx2_psrl_w:
+ case Intrinsic::x86_avx2_psrl_d:
+ case Intrinsic::x86_avx2_psrl_q:
+ Opcode = X86ISD::VSRL;
+ break;
+ case Intrinsic::x86_sse2_psra_w:
+ case Intrinsic::x86_sse2_psra_d:
+ case Intrinsic::x86_avx2_psra_w:
+ case Intrinsic::x86_avx2_psra_d:
+ Opcode = X86ISD::VSRA;
+ break;
+ }
+ return DAG.getNode(Opcode, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+ }
+
+ // SSE/AVX immediate shift intrinsics
+ case Intrinsic::x86_sse2_pslli_w:
+ case Intrinsic::x86_sse2_pslli_d:
+ case Intrinsic::x86_sse2_pslli_q:
+ case Intrinsic::x86_avx2_pslli_w:
+ case Intrinsic::x86_avx2_pslli_d:
+ case Intrinsic::x86_avx2_pslli_q:
+ case Intrinsic::x86_sse2_psrli_w:
+ case Intrinsic::x86_sse2_psrli_d:
+ case Intrinsic::x86_sse2_psrli_q:
+ case Intrinsic::x86_avx2_psrli_w:
+ case Intrinsic::x86_avx2_psrli_d:
+ case Intrinsic::x86_avx2_psrli_q:
+ case Intrinsic::x86_sse2_psrai_w:
+ case Intrinsic::x86_sse2_psrai_d:
+ case Intrinsic::x86_avx2_psrai_w:
+ case Intrinsic::x86_avx2_psrai_d: {
+ unsigned Opcode;
+ switch (IntNo) {
+ default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
+ case Intrinsic::x86_sse2_pslli_w:
+ case Intrinsic::x86_sse2_pslli_d:
+ case Intrinsic::x86_sse2_pslli_q:
+ case Intrinsic::x86_avx2_pslli_w:
+ case Intrinsic::x86_avx2_pslli_d:
+ case Intrinsic::x86_avx2_pslli_q:
+ Opcode = X86ISD::VSHLI;
+ break;
+ case Intrinsic::x86_sse2_psrli_w:
+ case Intrinsic::x86_sse2_psrli_d:
+ case Intrinsic::x86_sse2_psrli_q:
+ case Intrinsic::x86_avx2_psrli_w:
+ case Intrinsic::x86_avx2_psrli_d:
+ case Intrinsic::x86_avx2_psrli_q:
+ Opcode = X86ISD::VSRLI;
+ break;
+ case Intrinsic::x86_sse2_psrai_w:
+ case Intrinsic::x86_sse2_psrai_d:
+ case Intrinsic::x86_avx2_psrai_w:
+ case Intrinsic::x86_avx2_psrai_d:
+ Opcode = X86ISD::VSRAI;
+ break;
+ }
+ return getTargetVShiftNode(Opcode, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2), DAG);
+ }
+
+ case Intrinsic::x86_sse42_pcmpistria128:
+ case Intrinsic::x86_sse42_pcmpestria128:
+ case Intrinsic::x86_sse42_pcmpistric128:
+ case Intrinsic::x86_sse42_pcmpestric128:
+ case Intrinsic::x86_sse42_pcmpistrio128:
+ case Intrinsic::x86_sse42_pcmpestrio128:
+ case Intrinsic::x86_sse42_pcmpistris128:
+ case Intrinsic::x86_sse42_pcmpestris128:
+ case Intrinsic::x86_sse42_pcmpistriz128:
+ case Intrinsic::x86_sse42_pcmpestriz128: {
+ unsigned Opcode;
+ unsigned X86CC;
+ switch (IntNo) {
+ default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
+ case Intrinsic::x86_sse42_pcmpistria128:
+ Opcode = X86ISD::PCMPISTRI;
+ X86CC = X86::COND_A;
+ break;
+ case Intrinsic::x86_sse42_pcmpestria128:
+ Opcode = X86ISD::PCMPESTRI;
+ X86CC = X86::COND_A;
+ break;
+ case Intrinsic::x86_sse42_pcmpistric128:
+ Opcode = X86ISD::PCMPISTRI;
+ X86CC = X86::COND_B;
+ break;
+ case Intrinsic::x86_sse42_pcmpestric128:
+ Opcode = X86ISD::PCMPESTRI;
+ X86CC = X86::COND_B;
+ break;
+ case Intrinsic::x86_sse42_pcmpistrio128:
+ Opcode = X86ISD::PCMPISTRI;
+ X86CC = X86::COND_O;
+ break;
+ case Intrinsic::x86_sse42_pcmpestrio128:
+ Opcode = X86ISD::PCMPESTRI;
+ X86CC = X86::COND_O;
+ break;
+ case Intrinsic::x86_sse42_pcmpistris128:
+ Opcode = X86ISD::PCMPISTRI;
+ X86CC = X86::COND_S;
+ break;
+ case Intrinsic::x86_sse42_pcmpestris128:
+ Opcode = X86ISD::PCMPESTRI;
+ X86CC = X86::COND_S;
+ break;
+ case Intrinsic::x86_sse42_pcmpistriz128:
+ Opcode = X86ISD::PCMPISTRI;
+ X86CC = X86::COND_E;
+ break;
+ case Intrinsic::x86_sse42_pcmpestriz128:
+ Opcode = X86ISD::PCMPESTRI;
+ X86CC = X86::COND_E;
+ break;
+ }
+ SmallVector<SDValue, 5> NewOps;
+ NewOps.append(Op->op_begin()+1, Op->op_end());
+ SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
+ SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps.data(), NewOps.size());
+ SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
+ DAG.getConstant(X86CC, MVT::i8),
+ SDValue(PCMP.getNode(), 1));
+ return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
+ }
+
+ case Intrinsic::x86_sse42_pcmpistri128:
+ case Intrinsic::x86_sse42_pcmpestri128: {
+ unsigned Opcode;
+ if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
+ Opcode = X86ISD::PCMPISTRI;
+ else
+ Opcode = X86ISD::PCMPESTRI;
+
+ SmallVector<SDValue, 5> NewOps;
+ NewOps.append(Op->op_begin()+1, Op->op_end());
+ SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
+ return DAG.getNode(Opcode, dl, VTs, NewOps.data(), NewOps.size());
+ }
+ case Intrinsic::x86_fma_vfmadd_ps:
+ case Intrinsic::x86_fma_vfmadd_pd:
+ case Intrinsic::x86_fma_vfmsub_ps:
+ case Intrinsic::x86_fma_vfmsub_pd:
+ case Intrinsic::x86_fma_vfnmadd_ps:
+ case Intrinsic::x86_fma_vfnmadd_pd:
+ case Intrinsic::x86_fma_vfnmsub_ps:
+ case Intrinsic::x86_fma_vfnmsub_pd:
+ case Intrinsic::x86_fma_vfmaddsub_ps:
+ case Intrinsic::x86_fma_vfmaddsub_pd:
+ case Intrinsic::x86_fma_vfmsubadd_ps:
+ case Intrinsic::x86_fma_vfmsubadd_pd:
+ case Intrinsic::x86_fma_vfmadd_ps_256:
+ case Intrinsic::x86_fma_vfmadd_pd_256:
+ case Intrinsic::x86_fma_vfmsub_ps_256:
+ case Intrinsic::x86_fma_vfmsub_pd_256:
+ case Intrinsic::x86_fma_vfnmadd_ps_256:
+ case Intrinsic::x86_fma_vfnmadd_pd_256:
+ case Intrinsic::x86_fma_vfnmsub_ps_256:
+ case Intrinsic::x86_fma_vfnmsub_pd_256:
+ case Intrinsic::x86_fma_vfmaddsub_ps_256:
+ case Intrinsic::x86_fma_vfmaddsub_pd_256:
+ case Intrinsic::x86_fma_vfmsubadd_ps_256:
+ case Intrinsic::x86_fma_vfmsubadd_pd_256: {
+ unsigned Opc;
+ switch (IntNo) {
+ default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
+ case Intrinsic::x86_fma_vfmadd_ps:
+ case Intrinsic::x86_fma_vfmadd_pd:
+ case Intrinsic::x86_fma_vfmadd_ps_256:
+ case Intrinsic::x86_fma_vfmadd_pd_256:
+ Opc = X86ISD::FMADD;
+ break;
+ case Intrinsic::x86_fma_vfmsub_ps:
+ case Intrinsic::x86_fma_vfmsub_pd:
+ case Intrinsic::x86_fma_vfmsub_ps_256:
+ case Intrinsic::x86_fma_vfmsub_pd_256:
+ Opc = X86ISD::FMSUB;
+ break;
+ case Intrinsic::x86_fma_vfnmadd_ps:
+ case Intrinsic::x86_fma_vfnmadd_pd:
+ case Intrinsic::x86_fma_vfnmadd_ps_256:
+ case Intrinsic::x86_fma_vfnmadd_pd_256:
+ Opc = X86ISD::FNMADD;
+ break;
+ case Intrinsic::x86_fma_vfnmsub_ps:
+ case Intrinsic::x86_fma_vfnmsub_pd:
+ case Intrinsic::x86_fma_vfnmsub_ps_256:
+ case Intrinsic::x86_fma_vfnmsub_pd_256:
+ Opc = X86ISD::FNMSUB;
+ break;
+ case Intrinsic::x86_fma_vfmaddsub_ps:
+ case Intrinsic::x86_fma_vfmaddsub_pd:
+ case Intrinsic::x86_fma_vfmaddsub_ps_256:
+ case Intrinsic::x86_fma_vfmaddsub_pd_256:
+ Opc = X86ISD::FMADDSUB;
+ break;
+ case Intrinsic::x86_fma_vfmsubadd_ps:
+ case Intrinsic::x86_fma_vfmsubadd_pd:
+ case Intrinsic::x86_fma_vfmsubadd_ps_256:
+ case Intrinsic::x86_fma_vfmsubadd_pd_256:
+ Opc = X86ISD::FMSUBADD;
+ break;
+ }
+
+ return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),
+ Op.getOperand(2), Op.getOperand(3));
+ }
+ }
+}
+
+static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) {
+ DebugLoc dl = Op.getDebugLoc();
+ unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+ switch (IntNo) {
+ default: return SDValue(); // Don't custom lower most intrinsics.
+
+ // RDRAND/RDSEED intrinsics.
+ case Intrinsic::x86_rdrand_16:
+ case Intrinsic::x86_rdrand_32:
+ case Intrinsic::x86_rdrand_64:
+ case Intrinsic::x86_rdseed_16:
+ case Intrinsic::x86_rdseed_32:
+ case Intrinsic::x86_rdseed_64: {
+ unsigned Opcode = (IntNo == Intrinsic::x86_rdseed_16 ||
+ IntNo == Intrinsic::x86_rdseed_32 ||
+ IntNo == Intrinsic::x86_rdseed_64) ? X86ISD::RDSEED :
+ X86ISD::RDRAND;
+ // Emit the node with the right value type.
+ SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other);
+ SDValue Result = DAG.getNode(Opcode, dl, VTs, Op.getOperand(0));
+
+ // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
+ // Otherwise return the value from Rand, which is always 0, casted to i32.
+ SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
+ DAG.getConstant(1, Op->getValueType(1)),
+ DAG.getConstant(X86::COND_B, MVT::i32),
+ SDValue(Result.getNode(), 1) };
+ SDValue isValid = DAG.getNode(X86ISD::CMOV, dl,
+ DAG.getVTList(Op->getValueType(1), MVT::Glue),
+ Ops, array_lengthof(Ops));
+
+ // Return { result, isValid, chain }.
+ return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
+ SDValue(Result.getNode(), 2));
+ }
+
+ // XTEST intrinsics.
+ case Intrinsic::x86_xtest: {
+ SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
+ SDValue InTrans = DAG.getNode(X86ISD::XTEST, dl, VTs, Op.getOperand(0));
+ SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
+ DAG.getConstant(X86::COND_NE, MVT::i8),
+ InTrans);
+ SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
+ return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
+ Ret, SDValue(InTrans.getNode(), 1));
+ }
+ }
+}
+
+SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
+ SelectionDAG &DAG) const {
+ MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+ MFI->setReturnAddressIsTaken(true);
+
+ unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ DebugLoc dl = Op.getDebugLoc();
+ EVT PtrVT = getPointerTy();
+
+ if (Depth > 0) {
+ SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
+ SDValue Offset =
+ DAG.getConstant(RegInfo->getSlotSize(), PtrVT);
+ return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
+ DAG.getNode(ISD::ADD, dl, PtrVT,
+ FrameAddr, Offset),
+ MachinePointerInfo(), false, false, false, 0);
+ }
+
+ // Just load the return address.
+ SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
+ return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
+ RetAddrFI, MachinePointerInfo(), false, false, false, 0);
+}
+
+SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
+ MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+ MFI->setFrameAddressIsTaken(true);
+
+ EVT VT = Op.getValueType();
+ DebugLoc dl = Op.getDebugLoc(); // FIXME probably not meaningful
+ unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
+ assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
+ (FrameReg == X86::EBP && VT == MVT::i32)) &&
+ "Invalid Frame Register!");
+ SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
+ while (Depth--)
+ FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
+ MachinePointerInfo(),
+ false, false, false, 0);
+ return FrameAddr;
+}
+
+SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
+ SelectionDAG &DAG) const {
+ return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize());
+}
+
+SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
+ SDValue Chain = Op.getOperand(0);
+ SDValue Offset = Op.getOperand(1);
+ SDValue Handler = Op.getOperand(2);
+ DebugLoc dl = Op.getDebugLoc();
+
+ EVT PtrVT = getPointerTy();
+ unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
+ assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
+ (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
+ "Invalid Frame Register!");
+ SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
+ unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
+
+ SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
+ DAG.getIntPtrConstant(RegInfo->getSlotSize()));
+ StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
+ Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(),
+ false, false, 0);
+ Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
+
+ return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
+ DAG.getRegister(StoreAddrReg, PtrVT));
+}
+
+SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
+ SelectionDAG &DAG) const {
+ DebugLoc DL = Op.getDebugLoc();
+ return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
+ DAG.getVTList(MVT::i32, MVT::Other),
+ Op.getOperand(0), Op.getOperand(1));
+}
+
+SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
+ SelectionDAG &DAG) const {
+ DebugLoc DL = Op.getDebugLoc();
+ return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
+ Op.getOperand(0), Op.getOperand(1));
+}
+
+static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
+ return Op.getOperand(0);
+}
+
+SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDValue Root = Op.getOperand(0);
+ SDValue Trmp = Op.getOperand(1); // trampoline
+ SDValue FPtr = Op.getOperand(2); // nested function
+ SDValue Nest = Op.getOperand(3); // 'nest' parameter value
+ DebugLoc dl = Op.getDebugLoc();
+
+ const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
+ const TargetRegisterInfo* TRI = getTargetMachine().getRegisterInfo();
+
+ if (Subtarget->is64Bit()) {
+ SDValue OutChains[6];
+
+ // Large code-model.
+ const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
+ const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
+
+ const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
+ const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
+
+ const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
+
+ // Load the pointer to the nested function into R11.
+ unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
+ SDValue Addr = Trmp;
+ OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
+ Addr, MachinePointerInfo(TrmpAddr),
+ false, false, 0);
+
+ Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
+ DAG.getConstant(2, MVT::i64));
+ OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
+ MachinePointerInfo(TrmpAddr, 2),
+ false, false, 2);
+
+ // Load the 'nest' parameter value into R10.
+ // R10 is specified in X86CallingConv.td
+ OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
+ Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
+ DAG.getConstant(10, MVT::i64));
+ OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
+ Addr, MachinePointerInfo(TrmpAddr, 10),
+ false, false, 0);
+
+ Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
+ DAG.getConstant(12, MVT::i64));
+ OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
+ MachinePointerInfo(TrmpAddr, 12),
+ false, false, 2);
+
+ // Jump to the nested function.
+ OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
+ Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
+ DAG.getConstant(20, MVT::i64));
+ OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
+ Addr, MachinePointerInfo(TrmpAddr, 20),
+ false, false, 0);
+
+ unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
+ Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
+ DAG.getConstant(22, MVT::i64));
+ OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr,
+ MachinePointerInfo(TrmpAddr, 22),
+ false, false, 0);
+
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6);
+ } else {
+ const Function *Func =
+ cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
+ CallingConv::ID CC = Func->getCallingConv();
+ unsigned NestReg;
+
+ switch (CC) {
+ default:
+ llvm_unreachable("Unsupported calling convention");
+ case CallingConv::C:
+ case CallingConv::X86_StdCall: {
+ // Pass 'nest' parameter in ECX.
+ // Must be kept in sync with X86CallingConv.td
+ NestReg = X86::ECX;
+
+ // Check that ECX wasn't needed by an 'inreg' parameter.
+ FunctionType *FTy = Func->getFunctionType();
+ const AttributeSet &Attrs = Func->getAttributes();
+
+ if (!Attrs.isEmpty() && !Func->isVarArg()) {
+ unsigned InRegCount = 0;
+ unsigned Idx = 1;
+
+ for (FunctionType::param_iterator I = FTy->param_begin(),
+ E = FTy->param_end(); I != E; ++I, ++Idx)
+ if (Attrs.hasAttribute(Idx, Attribute::InReg))
+ // FIXME: should only count parameters that are lowered to integers.
+ InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32;
+
+ if (InRegCount > 2) {
+ report_fatal_error("Nest register in use - reduce number of inreg"
+ " parameters!");
+ }
+ }
+ break;
+ }
+ case CallingConv::X86_FastCall:
+ case CallingConv::X86_ThisCall:
+ case CallingConv::Fast:
+ // Pass 'nest' parameter in EAX.
+ // Must be kept in sync with X86CallingConv.td
+ NestReg = X86::EAX;
+ break;
+ }
+
+ SDValue OutChains[4];
+ SDValue Addr, Disp;
+
+ Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
+ DAG.getConstant(10, MVT::i32));
+ Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
+
+ // This is storing the opcode for MOV32ri.
+ const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
+ const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
+ OutChains[0] = DAG.getStore(Root, dl,
+ DAG.getConstant(MOV32ri|N86Reg, MVT::i8),
+ Trmp, MachinePointerInfo(TrmpAddr),
+ false, false, 0);
+
+ Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
+ DAG.getConstant(1, MVT::i32));
+ OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
+ MachinePointerInfo(TrmpAddr, 1),
+ false, false, 1);
+
+ const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
+ Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
+ DAG.getConstant(5, MVT::i32));
+ OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr,
+ MachinePointerInfo(TrmpAddr, 5),
+ false, false, 1);
+
+ Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
+ DAG.getConstant(6, MVT::i32));
+ OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
+ MachinePointerInfo(TrmpAddr, 6),
+ false, false, 1);
+
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4);
+ }
+}
+
+SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
+ SelectionDAG &DAG) const {
+ /*
+ The rounding mode is in bits 11:10 of FPSR, and has the following
+ settings:
+ 00 Round to nearest
+ 01 Round to -inf
+ 10 Round to +inf
+ 11 Round to 0
+
+ FLT_ROUNDS, on the other hand, expects the following:
+ -1 Undefined
+ 0 Round to 0
+ 1 Round to nearest
+ 2 Round to +inf
+ 3 Round to -inf
+
+ To perform the conversion, we do:
+ (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3)
+ */
+
+ MachineFunction &MF = DAG.getMachineFunction();
+ const TargetMachine &TM = MF.getTarget();
+ const TargetFrameLowering &TFI = *TM.getFrameLowering();
+ unsigned StackAlignment = TFI.getStackAlignment();
+ EVT VT = Op.getValueType();
+ DebugLoc DL = Op.getDebugLoc();
+
+ // Save FP Control Word to stack slot
+ int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false);
+ SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
+
+ MachineMemOperand *MMO =
+ MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
+ MachineMemOperand::MOStore, 2, 2);
+
+ SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
+ SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
+ DAG.getVTList(MVT::Other),
+ Ops, array_lengthof(Ops), MVT::i16,
+ MMO);
+
+ // Load FP Control Word from stack slot
+ SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot,
+ MachinePointerInfo(), false, false, false, 0);
+
+ // Transform as necessary
+ SDValue CWD1 =
+ DAG.getNode(ISD::SRL, DL, MVT::i16,
+ DAG.getNode(ISD::AND, DL, MVT::i16,
+ CWD, DAG.getConstant(0x800, MVT::i16)),
+ DAG.getConstant(11, MVT::i8));
+ SDValue CWD2 =
+ DAG.getNode(ISD::SRL, DL, MVT::i16,
+ DAG.getNode(ISD::AND, DL, MVT::i16,
+ CWD, DAG.getConstant(0x400, MVT::i16)),
+ DAG.getConstant(9, MVT::i8));
+
+ SDValue RetVal =
+ DAG.getNode(ISD::AND, DL, MVT::i16,
+ DAG.getNode(ISD::ADD, DL, MVT::i16,
+ DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
+ DAG.getConstant(1, MVT::i16)),
+ DAG.getConstant(3, MVT::i16));
+
+ return DAG.getNode((VT.getSizeInBits() < 16 ?
+ ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
+}
+
+static SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) {
+ EVT VT = Op.getValueType();
+ EVT OpVT = VT;
+ unsigned NumBits = VT.getSizeInBits();
+ DebugLoc dl = Op.getDebugLoc();
+
+ Op = Op.getOperand(0);
+ if (VT == MVT::i8) {
+ // Zero extend to i32 since there is not an i8 bsr.
+ OpVT = MVT::i32;
+ Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
+ }
+
+ // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
+ SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
+ Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
+
+ // If src is zero (i.e. bsr sets ZF), returns NumBits.
+ SDValue Ops[] = {
+ Op,
+ DAG.getConstant(NumBits+NumBits-1, OpVT),
+ DAG.getConstant(X86::COND_E, MVT::i8),
+ Op.getValue(1)
+ };
+ Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops));
+
+ // Finally xor with NumBits-1.
+ Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));
+
+ if (VT == MVT::i8)
+ Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
+ return Op;
+}
+
+static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) {
+ EVT VT = Op.getValueType();
+ EVT OpVT = VT;
+ unsigned NumBits = VT.getSizeInBits();
+ DebugLoc dl = Op.getDebugLoc();
+
+ Op = Op.getOperand(0);
+ if (VT == MVT::i8) {
+ // Zero extend to i32 since there is not an i8 bsr.
+ OpVT = MVT::i32;
+ Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
+ }
+
+ // Issue a bsr (scan bits in reverse).
+ SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
+ Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
+
+ // And xor with NumBits-1.
+ Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));
+
+ if (VT == MVT::i8)
+ Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
+ return Op;
+}
+
+static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
+ EVT VT = Op.getValueType();
+ unsigned NumBits = VT.getSizeInBits();
+ DebugLoc dl = Op.getDebugLoc();
+ Op = Op.getOperand(0);
+
+ // Issue a bsf (scan bits forward) which also sets EFLAGS.
+ SDVTList VTs = DAG.getVTList(VT, MVT::i32);
+ Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op);
+
+ // If src is zero (i.e. bsf sets ZF), returns NumBits.
+ SDValue Ops[] = {
+ Op,
+ DAG.getConstant(NumBits, VT),
+ DAG.getConstant(X86::COND_E, MVT::i8),
+ Op.getValue(1)
+ };
+ return DAG.getNode(X86ISD::CMOV, dl, VT, Ops, array_lengthof(Ops));
+}
+
+// Lower256IntArith - Break a 256-bit integer operation into two new 128-bit
+// ones, and then concatenate the result back.
+static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
+ EVT VT = Op.getValueType();
+
+ assert(VT.is256BitVector() && VT.isInteger() &&
+ "Unsupported value type for operation");
+
+ unsigned NumElems = VT.getVectorNumElements();
+ DebugLoc dl = Op.getDebugLoc();
+
+ // Extract the LHS vectors
+ SDValue LHS = Op.getOperand(0);
+ SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
+ SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
+
+ // Extract the RHS vectors
+ SDValue RHS = Op.getOperand(1);
+ SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl);
+ SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl);
+
+ MVT EltVT = VT.getVectorElementType().getSimpleVT();
+ EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
+ DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
+ DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
+}
+
+static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) {
+ assert(Op.getValueType().is256BitVector() &&
+ Op.getValueType().isInteger() &&
+ "Only handle AVX 256-bit vector integer operation");
+ return Lower256IntArith(Op, DAG);
+}
+
+static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) {
+ assert(Op.getValueType().is256BitVector() &&
+ Op.getValueType().isInteger() &&
+ "Only handle AVX 256-bit vector integer operation");
+ return Lower256IntArith(Op, DAG);
+}
+
+static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ DebugLoc dl = Op.getDebugLoc();
+ EVT VT = Op.getValueType();
+
+ // Decompose 256-bit ops into smaller 128-bit ops.
+ if (VT.is256BitVector() && !Subtarget->hasInt256())
+ return Lower256IntArith(Op, DAG);
+
+ SDValue A = Op.getOperand(0);
+ SDValue B = Op.getOperand(1);
+
+ // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
+ if (VT == MVT::v4i32) {
+ assert(Subtarget->hasSSE2() && !Subtarget->hasSSE41() &&
+ "Should not custom lower when pmuldq is available!");
+
+ // Extract the odd parts.
+ const int UnpackMask[] = { 1, -1, 3, -1 };
+ SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
+ SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
+
+ // Multiply the even parts.
+ SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B);
+ // Now multiply odd parts.
+ SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds);
+
+ Evens = DAG.getNode(ISD::BITCAST, dl, VT, Evens);
+ Odds = DAG.getNode(ISD::BITCAST, dl, VT, Odds);
+
+ // Merge the two vectors back together with a shuffle. This expands into 2
+ // shuffles.
+ const int ShufMask[] = { 0, 4, 2, 6 };
+ return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
+ }
+
+ assert((VT == MVT::v2i64 || VT == MVT::v4i64) &&
+ "Only know how to lower V2I64/V4I64 multiply");
+
+ // Ahi = psrlqi(a, 32);
+ // Bhi = psrlqi(b, 32);
+ //
+ // AloBlo = pmuludq(a, b);
+ // AloBhi = pmuludq(a, Bhi);
+ // AhiBlo = pmuludq(Ahi, b);
+
+ // AloBhi = psllqi(AloBhi, 32);
+ // AhiBlo = psllqi(AhiBlo, 32);
+ // return AloBlo + AloBhi + AhiBlo;
+
+ SDValue ShAmt = DAG.getConstant(32, MVT::i32);
+
+ SDValue Ahi = DAG.getNode(X86ISD::VSRLI, dl, VT, A, ShAmt);
+ SDValue Bhi = DAG.getNode(X86ISD::VSRLI, dl, VT, B, ShAmt);
+
+ // Bit cast to 32-bit vectors for MULUDQ
+ EVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 : MVT::v8i32;
+ A = DAG.getNode(ISD::BITCAST, dl, MulVT, A);
+ B = DAG.getNode(ISD::BITCAST, dl, MulVT, B);
+ Ahi = DAG.getNode(ISD::BITCAST, dl, MulVT, Ahi);
+ Bhi = DAG.getNode(ISD::BITCAST, dl, MulVT, Bhi);
+
+ SDValue AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
+ SDValue AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
+ SDValue AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
+
+ AloBhi = DAG.getNode(X86ISD::VSHLI, dl, VT, AloBhi, ShAmt);
+ AhiBlo = DAG.getNode(X86ISD::VSHLI, dl, VT, AhiBlo, ShAmt);
+
+ SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi);
+ return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
+}
+
+SDValue X86TargetLowering::LowerSDIV(SDValue Op, SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+ EVT EltTy = VT.getVectorElementType();
+ unsigned NumElts = VT.getVectorNumElements();
+ SDValue N0 = Op.getOperand(0);
+ DebugLoc dl = Op.getDebugLoc();
+
+ // Lower sdiv X, pow2-const.
+ BuildVectorSDNode *C = dyn_cast<BuildVectorSDNode>(Op.getOperand(1));
+ if (!C)
+ return SDValue();
+
+ APInt SplatValue, SplatUndef;
+ unsigned MinSplatBits;
+ bool HasAnyUndefs;
+ if (!C->isConstantSplat(SplatValue, SplatUndef, MinSplatBits, HasAnyUndefs))
+ return SDValue();
+
+ if ((SplatValue != 0) &&
+ (SplatValue.isPowerOf2() || (-SplatValue).isPowerOf2())) {
+ unsigned lg2 = SplatValue.countTrailingZeros();
+ // Splat the sign bit.
+ SDValue Sz = DAG.getConstant(EltTy.getSizeInBits()-1, MVT::i32);
+ SDValue SGN = getTargetVShiftNode(X86ISD::VSRAI, dl, VT, N0, Sz, DAG);
+ // Add (N0 < 0) ? abs2 - 1 : 0;
+ SDValue Amt = DAG.getConstant(EltTy.getSizeInBits() - lg2, MVT::i32);
+ SDValue SRL = getTargetVShiftNode(X86ISD::VSRLI, dl, VT, SGN, Amt, DAG);
+ SDValue ADD = DAG.getNode(ISD::ADD, dl, VT, N0, SRL);
+ SDValue Lg2Amt = DAG.getConstant(lg2, MVT::i32);
+ SDValue SRA = getTargetVShiftNode(X86ISD::VSRAI, dl, VT, ADD, Lg2Amt, DAG);
+
+ // If we're dividing by a positive value, we're done. Otherwise, we must
+ // negate the result.
+ if (SplatValue.isNonNegative())
+ return SRA;
+
+ SmallVector<SDValue, 16> V(NumElts, DAG.getConstant(0, EltTy));
+ SDValue Zero = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], NumElts);
+ return DAG.getNode(ISD::SUB, dl, VT, Zero, SRA);
+ }
+ return SDValue();
+}
+
+static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ EVT VT = Op.getValueType();
+ DebugLoc dl = Op.getDebugLoc();
+ SDValue R = Op.getOperand(0);
+ SDValue Amt = Op.getOperand(1);
+
+ // Optimize shl/srl/sra with constant shift amount.
+ if (isSplatVector(Amt.getNode())) {
+ SDValue SclrAmt = Amt->getOperand(0);
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt)) {
+ uint64_t ShiftAmt = C->getZExtValue();
+
+ if (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 ||
+ (Subtarget->hasInt256() &&
+ (VT == MVT::v4i64 || VT == MVT::v8i32 || VT == MVT::v16i16))) {
+ if (Op.getOpcode() == ISD::SHL)
+ return DAG.getNode(X86ISD::VSHLI, dl, VT, R,
+ DAG.getConstant(ShiftAmt, MVT::i32));
+ if (Op.getOpcode() == ISD::SRL)
+ return DAG.getNode(X86ISD::VSRLI, dl, VT, R,
+ DAG.getConstant(ShiftAmt, MVT::i32));
+ if (Op.getOpcode() == ISD::SRA && VT != MVT::v2i64 && VT != MVT::v4i64)
+ return DAG.getNode(X86ISD::VSRAI, dl, VT, R,
+ DAG.getConstant(ShiftAmt, MVT::i32));
+ }
+
+ if (VT == MVT::v16i8) {
+ if (Op.getOpcode() == ISD::SHL) {
+ // Make a large shift.
+ SDValue SHL = DAG.getNode(X86ISD::VSHLI, dl, MVT::v8i16, R,
+ DAG.getConstant(ShiftAmt, MVT::i32));
+ SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL);
+ // Zero out the rightmost bits.
+ SmallVector<SDValue, 16> V(16,
+ DAG.getConstant(uint8_t(-1U << ShiftAmt),
+ MVT::i8));
+ return DAG.getNode(ISD::AND, dl, VT, SHL,
+ DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16));
+ }
+ if (Op.getOpcode() == ISD::SRL) {
+ // Make a large shift.
+ SDValue SRL = DAG.getNode(X86ISD::VSRLI, dl, MVT::v8i16, R,
+ DAG.getConstant(ShiftAmt, MVT::i32));
+ SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL);
+ // Zero out the leftmost bits.
+ SmallVector<SDValue, 16> V(16,
+ DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
+ MVT::i8));
+ return DAG.getNode(ISD::AND, dl, VT, SRL,
+ DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16));
+ }
+ if (Op.getOpcode() == ISD::SRA) {
+ if (ShiftAmt == 7) {
+ // R s>> 7 === R s< 0
+ SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
+ return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
+ }
+
+ // R s>> a === ((R u>> a) ^ m) - m
+ SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
+ SmallVector<SDValue, 16> V(16, DAG.getConstant(128 >> ShiftAmt,
+ MVT::i8));
+ SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16);
+ Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
+ Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
+ return Res;
+ }
+ llvm_unreachable("Unknown shift opcode.");
+ }
+
+ if (Subtarget->hasInt256() && VT == MVT::v32i8) {
+ if (Op.getOpcode() == ISD::SHL) {
+ // Make a large shift.
+ SDValue SHL = DAG.getNode(X86ISD::VSHLI, dl, MVT::v16i16, R,
+ DAG.getConstant(ShiftAmt, MVT::i32));
+ SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL);
+ // Zero out the rightmost bits.
+ SmallVector<SDValue, 32> V(32,
+ DAG.getConstant(uint8_t(-1U << ShiftAmt),
+ MVT::i8));
+ return DAG.getNode(ISD::AND, dl, VT, SHL,
+ DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32));
+ }
+ if (Op.getOpcode() == ISD::SRL) {
+ // Make a large shift.
+ SDValue SRL = DAG.getNode(X86ISD::VSRLI, dl, MVT::v16i16, R,
+ DAG.getConstant(ShiftAmt, MVT::i32));
+ SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL);
+ // Zero out the leftmost bits.
+ SmallVector<SDValue, 32> V(32,
+ DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
+ MVT::i8));
+ return DAG.getNode(ISD::AND, dl, VT, SRL,
+ DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32));
+ }
+ if (Op.getOpcode() == ISD::SRA) {
+ if (ShiftAmt == 7) {
+ // R s>> 7 === R s< 0
+ SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
+ return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
+ }
+
+ // R s>> a === ((R u>> a) ^ m) - m
+ SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
+ SmallVector<SDValue, 32> V(32, DAG.getConstant(128 >> ShiftAmt,
+ MVT::i8));
+ SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32);
+ Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
+ Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
+ return Res;
+ }
+ llvm_unreachable("Unknown shift opcode.");
+ }
+ }
+ }
+
+ // Special case in 32-bit mode, where i64 is expanded into high and low parts.
+ if (!Subtarget->is64Bit() &&
+ (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64)) &&
+ Amt.getOpcode() == ISD::BITCAST &&
+ Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
+ Amt = Amt.getOperand(0);
+ unsigned Ratio = Amt.getValueType().getVectorNumElements() /
+ VT.getVectorNumElements();
+ unsigned RatioInLog2 = Log2_32_Ceil(Ratio);
+ uint64_t ShiftAmt = 0;
+ for (unsigned i = 0; i != Ratio; ++i) {
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i));
+ if (C == 0)
+ return SDValue();
+ // 6 == Log2(64)
+ ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2)));
+ }
+ // Check remaining shift amounts.
+ for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
+ uint64_t ShAmt = 0;
+ for (unsigned j = 0; j != Ratio; ++j) {
+ ConstantSDNode *C =
+ dyn_cast<ConstantSDNode>(Amt.getOperand(i + j));
+ if (C == 0)
+ return SDValue();
+ // 6 == Log2(64)
+ ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2)));
+ }
+ if (ShAmt != ShiftAmt)
+ return SDValue();
+ }
+ switch (Op.getOpcode()) {
+ default:
+ llvm_unreachable("Unknown shift opcode!");
+ case ISD::SHL:
+ return DAG.getNode(X86ISD::VSHLI, dl, VT, R,
+ DAG.getConstant(ShiftAmt, MVT::i32));
+ case ISD::SRL:
+ return DAG.getNode(X86ISD::VSRLI, dl, VT, R,
+ DAG.getConstant(ShiftAmt, MVT::i32));
+ case ISD::SRA:
+ return DAG.getNode(X86ISD::VSRAI, dl, VT, R,
+ DAG.getConstant(ShiftAmt, MVT::i32));
+ }
+ }
+
+ return SDValue();
+}
+
+static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
+ const X86Subtarget* Subtarget) {
+ EVT VT = Op.getValueType();
+ DebugLoc dl = Op.getDebugLoc();
+ SDValue R = Op.getOperand(0);
+ SDValue Amt = Op.getOperand(1);
+
+ if ((VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) ||
+ VT == MVT::v4i32 || VT == MVT::v8i16 ||
+ (Subtarget->hasInt256() &&
+ ((VT == MVT::v4i64 && Op.getOpcode() != ISD::SRA) ||
+ VT == MVT::v8i32 || VT == MVT::v16i16))) {
+ SDValue BaseShAmt;
+ EVT EltVT = VT.getVectorElementType();
+
+ if (Amt.getOpcode() == ISD::BUILD_VECTOR) {
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned i, j;
+ for (i = 0; i != NumElts; ++i) {
+ if (Amt.getOperand(i).getOpcode() == ISD::UNDEF)
+ continue;
+ break;
+ }
+ for (j = i; j != NumElts; ++j) {
+ SDValue Arg = Amt.getOperand(j);
+ if (Arg.getOpcode() == ISD::UNDEF) continue;
+ if (Arg != Amt.getOperand(i))
+ break;
+ }
+ if (i != NumElts && j == NumElts)
+ BaseShAmt = Amt.getOperand(i);
+ } else {
+ if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR)
+ Amt = Amt.getOperand(0);
+ if (Amt.getOpcode() == ISD::VECTOR_SHUFFLE &&
+ cast<ShuffleVectorSDNode>(Amt)->isSplat()) {
+ SDValue InVec = Amt.getOperand(0);
+ if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
+ unsigned NumElts = InVec.getValueType().getVectorNumElements();
+ unsigned i = 0;
+ for (; i != NumElts; ++i) {
+ SDValue Arg = InVec.getOperand(i);
+ if (Arg.getOpcode() == ISD::UNDEF) continue;
+ BaseShAmt = Arg;
+ break;
+ }
+ } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
+ if (ConstantSDNode *C =
+ dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
+ unsigned SplatIdx =
+ cast<ShuffleVectorSDNode>(Amt)->getSplatIndex();
+ if (C->getZExtValue() == SplatIdx)
+ BaseShAmt = InVec.getOperand(1);
+ }
+ }
+ if (BaseShAmt.getNode() == 0)
+ BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Amt,
+ DAG.getIntPtrConstant(0));
+ }
+ }
+
+ if (BaseShAmt.getNode()) {
+ if (EltVT.bitsGT(MVT::i32))
+ BaseShAmt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BaseShAmt);
+ else if (EltVT.bitsLT(MVT::i32))
+ BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
+
+ switch (Op.getOpcode()) {
+ default:
+ llvm_unreachable("Unknown shift opcode!");
+ case ISD::SHL:
+ switch (VT.getSimpleVT().SimpleTy) {
+ default: return SDValue();
+ case MVT::v2i64:
+ case MVT::v4i32:
+ case MVT::v8i16:
+ case MVT::v4i64:
+ case MVT::v8i32:
+ case MVT::v16i16:
+ return getTargetVShiftNode(X86ISD::VSHLI, dl, VT, R, BaseShAmt, DAG);
+ }
+ case ISD::SRA:
+ switch (VT.getSimpleVT().SimpleTy) {
+ default: return SDValue();
+ case MVT::v4i32:
+ case MVT::v8i16:
+ case MVT::v8i32:
+ case MVT::v16i16:
+ return getTargetVShiftNode(X86ISD::VSRAI, dl, VT, R, BaseShAmt, DAG);
+ }
+ case ISD::SRL:
+ switch (VT.getSimpleVT().SimpleTy) {
+ default: return SDValue();
+ case MVT::v2i64:
+ case MVT::v4i32:
+ case MVT::v8i16:
+ case MVT::v4i64:
+ case MVT::v8i32:
+ case MVT::v16i16:
+ return getTargetVShiftNode(X86ISD::VSRLI, dl, VT, R, BaseShAmt, DAG);
+ }
+ }
+ }
+ }
+
+ // Special case in 32-bit mode, where i64 is expanded into high and low parts.
+ if (!Subtarget->is64Bit() &&
+ (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64)) &&
+ Amt.getOpcode() == ISD::BITCAST &&
+ Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
+ Amt = Amt.getOperand(0);
+ unsigned Ratio = Amt.getValueType().getVectorNumElements() /
+ VT.getVectorNumElements();
+ std::vector<SDValue> Vals(Ratio);
+ for (unsigned i = 0; i != Ratio; ++i)
+ Vals[i] = Amt.getOperand(i);
+ for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
+ for (unsigned j = 0; j != Ratio; ++j)
+ if (Vals[j] != Amt.getOperand(i + j))
+ return SDValue();
+ }
+ switch (Op.getOpcode()) {
+ default:
+ llvm_unreachable("Unknown shift opcode!");
+ case ISD::SHL:
+ return DAG.getNode(X86ISD::VSHL, dl, VT, R, Op.getOperand(1));
+ case ISD::SRL:
+ return DAG.getNode(X86ISD::VSRL, dl, VT, R, Op.getOperand(1));
+ case ISD::SRA:
+ return DAG.getNode(X86ISD::VSRA, dl, VT, R, Op.getOperand(1));
+ }
+ }
+
+ return SDValue();
+}
+
+SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
+
+ EVT VT = Op.getValueType();
+ DebugLoc dl = Op.getDebugLoc();
+ SDValue R = Op.getOperand(0);
+ SDValue Amt = Op.getOperand(1);
+ SDValue V;
+
+ if (!Subtarget->hasSSE2())
+ return SDValue();
+
+ V = LowerScalarImmediateShift(Op, DAG, Subtarget);
+ if (V.getNode())
+ return V;
+
+ V = LowerScalarVariableShift(Op, DAG, Subtarget);
+ if (V.getNode())
+ return V;
+
+ // AVX2 has VPSLLV/VPSRAV/VPSRLV.
+ if (Subtarget->hasInt256()) {
+ if (Op.getOpcode() == ISD::SRL &&
+ (VT == MVT::v2i64 || VT == MVT::v4i32 ||
+ VT == MVT::v4i64 || VT == MVT::v8i32))
+ return Op;
+ if (Op.getOpcode() == ISD::SHL &&
+ (VT == MVT::v2i64 || VT == MVT::v4i32 ||
+ VT == MVT::v4i64 || VT == MVT::v8i32))
+ return Op;
+ if (Op.getOpcode() == ISD::SRA && (VT == MVT::v4i32 || VT == MVT::v8i32))
+ return Op;
+ }
+
+ // Lower SHL with variable shift amount.
+ if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
+ Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, VT));
+
+ Op = DAG.getNode(ISD::ADD, dl, VT, Op, DAG.getConstant(0x3f800000U, VT));
+ Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, Op);
+ Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
+ return DAG.getNode(ISD::MUL, dl, VT, Op, R);
+ }
+ if (VT == MVT::v16i8 && Op->getOpcode() == ISD::SHL) {
+ assert(Subtarget->hasSSE2() && "Need SSE2 for pslli/pcmpeq.");
+
+ // a = a << 5;
+ Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(5, VT));
+ Op = DAG.getNode(ISD::BITCAST, dl, VT, Op);
+
+ // Turn 'a' into a mask suitable for VSELECT
+ SDValue VSelM = DAG.getConstant(0x80, VT);
+ SDValue OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
+ OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
+
+ SDValue CM1 = DAG.getConstant(0x0f, VT);
+ SDValue CM2 = DAG.getConstant(0x3f, VT);
+
+ // r = VSELECT(r, psllw(r & (char16)15, 4), a);
+ SDValue M = DAG.getNode(ISD::AND, dl, VT, R, CM1);
+ M = getTargetVShiftNode(X86ISD::VSHLI, dl, MVT::v8i16, M,
+ DAG.getConstant(4, MVT::i32), DAG);
+ M = DAG.getNode(ISD::BITCAST, dl, VT, M);
+ R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R);
+
+ // a += a
+ Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
+ OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
+ OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
+
+ // r = VSELECT(r, psllw(r & (char16)63, 2), a);
+ M = DAG.getNode(ISD::AND, dl, VT, R, CM2);
+ M = getTargetVShiftNode(X86ISD::VSHLI, dl, MVT::v8i16, M,
+ DAG.getConstant(2, MVT::i32), DAG);
+ M = DAG.getNode(ISD::BITCAST, dl, VT, M);
+ R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R);
+
+ // a += a
+ Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
+ OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
+ OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
+
+ // return VSELECT(r, r+r, a);
+ R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel,
+ DAG.getNode(ISD::ADD, dl, VT, R, R), R);
+ return R;
+ }
+
+ // Decompose 256-bit shifts into smaller 128-bit shifts.
+ if (VT.is256BitVector()) {
+ unsigned NumElems = VT.getVectorNumElements();
+ MVT EltVT = VT.getVectorElementType().getSimpleVT();
+ EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
+
+ // Extract the two vectors
+ SDValue V1 = Extract128BitVector(R, 0, DAG, dl);
+ SDValue V2 = Extract128BitVector(R, NumElems/2, DAG, dl);
+
+ // Recreate the shift amount vectors
+ SDValue Amt1, Amt2;
+ if (Amt.getOpcode() == ISD::BUILD_VECTOR) {
+ // Constant shift amount
+ SmallVector<SDValue, 4> Amt1Csts;
+ SmallVector<SDValue, 4> Amt2Csts;
+ for (unsigned i = 0; i != NumElems/2; ++i)
+ Amt1Csts.push_back(Amt->getOperand(i));
+ for (unsigned i = NumElems/2; i != NumElems; ++i)
+ Amt2Csts.push_back(Amt->getOperand(i));
+
+ Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT,
+ &Amt1Csts[0], NumElems/2);
+ Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT,
+ &Amt2Csts[0], NumElems/2);
+ } else {
+ // Variable shift amount
+ Amt1 = Extract128BitVector(Amt, 0, DAG, dl);
+ Amt2 = Extract128BitVector(Amt, NumElems/2, DAG, dl);
+ }
+
+ // Issue new vector shifts for the smaller types
+ V1 = DAG.getNode(Op.getOpcode(), dl, NewVT, V1, Amt1);
+ V2 = DAG.getNode(Op.getOpcode(), dl, NewVT, V2, Amt2);
+
+ // Concatenate the result back
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, V1, V2);
+ }
+
+ return SDValue();
+}
+
+static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
+ // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
+ // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
+ // looks for this combo and may remove the "setcc" instruction if the "setcc"
+ // has only one use.
+ SDNode *N = Op.getNode();
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ unsigned BaseOp = 0;
+ unsigned Cond = 0;
+ DebugLoc DL = Op.getDebugLoc();
+ switch (Op.getOpcode()) {
+ default: llvm_unreachable("Unknown ovf instruction!");
+ case ISD::SADDO:
+ // A subtract of one will be selected as a INC. Note that INC doesn't
+ // set CF, so we can't do this for UADDO.
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
+ if (C->isOne()) {
+ BaseOp = X86ISD::INC;
+ Cond = X86::COND_O;
+ break;
+ }
+ BaseOp = X86ISD::ADD;
+ Cond = X86::COND_O;
+ break;
+ case ISD::UADDO:
+ BaseOp = X86ISD::ADD;
+ Cond = X86::COND_B;
+ break;
+ case ISD::SSUBO:
+ // A subtract of one will be selected as a DEC. Note that DEC doesn't
+ // set CF, so we can't do this for USUBO.
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
+ if (C->isOne()) {
+ BaseOp = X86ISD::DEC;
+ Cond = X86::COND_O;
+ break;
+ }
+ BaseOp = X86ISD::SUB;
+ Cond = X86::COND_O;
+ break;
+ case ISD::USUBO:
+ BaseOp = X86ISD::SUB;
+ Cond = X86::COND_B;
+ break;
+ case ISD::SMULO:
+ BaseOp = X86ISD::SMUL;
+ Cond = X86::COND_O;
+ break;
+ case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
+ SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0),
+ MVT::i32);
+ SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS);
+
+ SDValue SetCC =
+ DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
+ DAG.getConstant(X86::COND_O, MVT::i32),
+ SDValue(Sum.getNode(), 2));
+
+ return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
+ }
+ }
+
+ // Also sets EFLAGS.
+ SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
+ SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
+
+ SDValue SetCC =
+ DAG.getNode(X86ISD::SETCC, DL, N->getValueType(1),
+ DAG.getConstant(Cond, MVT::i32),
+ SDValue(Sum.getNode(), 1));
+
+ return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
+}
+
+SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
+ SelectionDAG &DAG) const {
+ DebugLoc dl = Op.getDebugLoc();
+ EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
+ EVT VT = Op.getValueType();
+
+ if (!Subtarget->hasSSE2() || !VT.isVector())
+ return SDValue();
+
+ unsigned BitsDiff = VT.getScalarType().getSizeInBits() -
+ ExtraVT.getScalarType().getSizeInBits();
+ SDValue ShAmt = DAG.getConstant(BitsDiff, MVT::i32);
+
+ switch (VT.getSimpleVT().SimpleTy) {
+ default: return SDValue();
+ case MVT::v8i32:
+ case MVT::v16i16:
+ if (!Subtarget->hasFp256())
+ return SDValue();
+ if (!Subtarget->hasInt256()) {
+ // needs to be split
+ unsigned NumElems = VT.getVectorNumElements();
+
+ // Extract the LHS vectors
+ SDValue LHS = Op.getOperand(0);
+ SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
+ SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
+
+ MVT EltVT = VT.getVectorElementType().getSimpleVT();
+ EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
+
+ EVT ExtraEltVT = ExtraVT.getVectorElementType();
+ unsigned ExtraNumElems = ExtraVT.getVectorNumElements();
+ ExtraVT = EVT::getVectorVT(*DAG.getContext(), ExtraEltVT,
+ ExtraNumElems/2);
+ SDValue Extra = DAG.getValueType(ExtraVT);
+
+ LHS1 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, Extra);
+ LHS2 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, Extra);
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LHS1, LHS2);
+ }
+ // fall through
+ case MVT::v4i32:
+ case MVT::v8i16: {
+ // (sext (vzext x)) -> (vsext x)
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op00 = Op0.getOperand(0);
+ SDValue Tmp1;
+ // Hopefully, this VECTOR_SHUFFLE is just a VZEXT.
+ if (Op0.getOpcode() == ISD::BITCAST &&
+ Op00.getOpcode() == ISD::VECTOR_SHUFFLE)
+ Tmp1 = LowerVectorIntExtend(Op00, DAG);
+ if (Tmp1.getNode()) {
+ SDValue Tmp1Op0 = Tmp1.getOperand(0);
+ assert(Tmp1Op0.getOpcode() == X86ISD::VZEXT &&
+ "This optimization is invalid without a VZEXT.");
+ return DAG.getNode(X86ISD::VSEXT, dl, VT, Tmp1Op0.getOperand(0));
+ }
+
+ // If the above didn't work, then just use Shift-Left + Shift-Right.
+ Tmp1 = getTargetVShiftNode(X86ISD::VSHLI, dl, VT, Op0, ShAmt, DAG);
+ return getTargetVShiftNode(X86ISD::VSRAI, dl, VT, Tmp1, ShAmt, DAG);
+ }
+ }
+}
+
+static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ DebugLoc dl = Op.getDebugLoc();
+ AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
+ cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
+ SynchronizationScope FenceScope = static_cast<SynchronizationScope>(
+ cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
+
+ // The only fence that needs an instruction is a sequentially-consistent
+ // cross-thread fence.
+ if (FenceOrdering == SequentiallyConsistent && FenceScope == CrossThread) {
+ // Use mfence if we have SSE2 or we're on x86-64 (even if we asked for
+ // no-sse2). There isn't any reason to disable it if the target processor
+ // supports it.
+ if (Subtarget->hasSSE2() || Subtarget->is64Bit())
+ return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
+
+ SDValue Chain = Op.getOperand(0);
+ SDValue Zero = DAG.getConstant(0, MVT::i32);
+ SDValue Ops[] = {
+ DAG.getRegister(X86::ESP, MVT::i32), // Base
+ DAG.getTargetConstant(1, MVT::i8), // Scale
+ DAG.getRegister(0, MVT::i32), // Index
+ DAG.getTargetConstant(0, MVT::i32), // Disp
+ DAG.getRegister(0, MVT::i32), // Segment.
+ Zero,
+ Chain
+ };
+ SDNode *Res = DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops);
+ return SDValue(Res, 0);
+ }
+
+ // MEMBARRIER is a compiler barrier; it codegens to a no-op.
+ return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
+}
+
+static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ EVT T = Op.getValueType();
+ DebugLoc DL = Op.getDebugLoc();
+ unsigned Reg = 0;
+ unsigned size = 0;
+ switch(T.getSimpleVT().SimpleTy) {
+ default: llvm_unreachable("Invalid value type!");
+ case MVT::i8: Reg = X86::AL; size = 1; break;
+ case MVT::i16: Reg = X86::AX; size = 2; break;
+ case MVT::i32: Reg = X86::EAX; size = 4; break;
+ case MVT::i64:
+ assert(Subtarget->is64Bit() && "Node not type legal!");
+ Reg = X86::RAX; size = 8;
+ break;
+ }
+ SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
+ Op.getOperand(2), SDValue());
+ SDValue Ops[] = { cpIn.getValue(0),
+ Op.getOperand(1),
+ Op.getOperand(3),
+ DAG.getTargetConstant(size, MVT::i8),
+ cpIn.getValue(1) };
+ SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
+ MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
+ SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
+ Ops, array_lengthof(Ops), T, MMO);
+ SDValue cpOut =
+ DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
+ return cpOut;
+}
+
+static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ assert(Subtarget->is64Bit() && "Result not type legalized?");
+ SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue TheChain = Op.getOperand(0);
+ DebugLoc dl = Op.getDebugLoc();
+ SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1);
+ SDValue rax = DAG.getCopyFromReg(rd, dl, X86::RAX, MVT::i64, rd.getValue(1));
+ SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), dl, X86::RDX, MVT::i64,
+ rax.getValue(2));
+ SDValue Tmp = DAG.getNode(ISD::SHL, dl, MVT::i64, rdx,
+ DAG.getConstant(32, MVT::i8));
+ SDValue Ops[] = {
+ DAG.getNode(ISD::OR, dl, MVT::i64, rax, Tmp),
+ rdx.getValue(1)
+ };
+ return DAG.getMergeValues(Ops, array_lengthof(Ops), dl);
+}
+
+SDValue X86TargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {
+ EVT SrcVT = Op.getOperand(0).getValueType();
+ EVT DstVT = Op.getValueType();
+ assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() &&
+ Subtarget->hasMMX() && "Unexpected custom BITCAST");
+ assert((DstVT == MVT::i64 ||
+ (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
+ "Unexpected custom BITCAST");
+ // i64 <=> MMX conversions are Legal.
+ if (SrcVT==MVT::i64 && DstVT.isVector())
+ return Op;
+ if (DstVT==MVT::i64 && SrcVT.isVector())
+ return Op;
+ // MMX <=> MMX conversions are Legal.
+ if (SrcVT.isVector() && DstVT.isVector())
+ return Op;
+ // All other conversions need to be expanded.
+ return SDValue();
+}
+
+static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) {
+ SDNode *Node = Op.getNode();
+ DebugLoc dl = Node->getDebugLoc();
+ EVT T = Node->getValueType(0);
+ SDValue negOp = DAG.getNode(ISD::SUB, dl, T,
+ DAG.getConstant(0, T), Node->getOperand(2));
+ return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl,
+ cast<AtomicSDNode>(Node)->getMemoryVT(),
+ Node->getOperand(0),
+ Node->getOperand(1), negOp,
+ cast<AtomicSDNode>(Node)->getSrcValue(),
+ cast<AtomicSDNode>(Node)->getAlignment(),
+ cast<AtomicSDNode>(Node)->getOrdering(),
+ cast<AtomicSDNode>(Node)->getSynchScope());
+}
+
+static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
+ SDNode *Node = Op.getNode();
+ DebugLoc dl = Node->getDebugLoc();
+ EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
+
+ // Convert seq_cst store -> xchg
+ // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
+ // FIXME: On 32-bit, store -> fist or movq would be more efficient
+ // (The only way to get a 16-byte store is cmpxchg16b)
+ // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
+ if (cast<AtomicSDNode>(Node)->getOrdering() == SequentiallyConsistent ||
+ !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
+ SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
+ cast<AtomicSDNode>(Node)->getMemoryVT(),
+ Node->getOperand(0),
+ Node->getOperand(1), Node->getOperand(2),
+ cast<AtomicSDNode>(Node)->getMemOperand(),
+ cast<AtomicSDNode>(Node)->getOrdering(),
+ cast<AtomicSDNode>(Node)->getSynchScope());
+ return Swap.getValue(1);
+ }
+ // Other atomic stores have a simple pattern.
+ return Op;
+}
+
+static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
+ EVT VT = Op.getNode()->getValueType(0);
+
+ // Let legalize expand this if it isn't a legal type yet.
+ if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
+ return SDValue();
+
+ SDVTList VTs = DAG.getVTList(VT, MVT::i32);
+
+ unsigned Opc;
+ bool ExtraOp = false;
+ switch (Op.getOpcode()) {
+ default: llvm_unreachable("Invalid code");
+ case ISD::ADDC: Opc = X86ISD::ADD; break;
+ case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break;
+ case ISD::SUBC: Opc = X86ISD::SUB; break;
+ case ISD::SUBE: Opc = X86ISD::SBB; ExtraOp = true; break;
+ }
+
+ if (!ExtraOp)
+ return DAG.getNode(Opc, Op->getDebugLoc(), VTs, Op.getOperand(0),
+ Op.getOperand(1));
+ return DAG.getNode(Opc, Op->getDebugLoc(), VTs, Op.getOperand(0),
+ Op.getOperand(1), Op.getOperand(2));
+}
+
+SDValue X86TargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
+ assert(Subtarget->isTargetDarwin() && Subtarget->is64Bit());
+
+ // For MacOSX, we want to call an alternative entry point: __sincos_stret,
+ // which returns the values as { float, float } (in XMM0) or
+ // { double, double } (which is returned in XMM0, XMM1).
+ DebugLoc dl = Op.getDebugLoc();
+ SDValue Arg = Op.getOperand(0);
+ EVT ArgVT = Arg.getValueType();
+ Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
+
+ ArgListTy Args;
+ ArgListEntry Entry;
+
+ Entry.Node = Arg;
+ Entry.Ty = ArgTy;
+ Entry.isSExt = false;
+ Entry.isZExt = false;
+ Args.push_back(Entry);
+
+ bool isF64 = ArgVT == MVT::f64;
+ // Only optimize x86_64 for now. i386 is a bit messy. For f32,
+ // the small struct {f32, f32} is returned in (eax, edx). For f64,
+ // the results are returned via SRet in memory.
+ const char *LibcallName = isF64 ? "__sincos_stret" : "__sincosf_stret";
+ SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy());
+
+ Type *RetTy = isF64
+ ? (Type*)StructType::get(ArgTy, ArgTy, NULL)
+ : (Type*)VectorType::get(ArgTy, 4);
+ TargetLowering::
+ CallLoweringInfo CLI(DAG.getEntryNode(), RetTy,
+ false, false, false, false, 0,
+ CallingConv::C, /*isTaillCall=*/false,
+ /*doesNotRet=*/false, /*isReturnValueUsed*/true,
+ Callee, Args, DAG, dl);
+ std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
+
+ if (isF64)
+ // Returned in xmm0 and xmm1.
+ return CallResult.first;
+
+ // Returned in bits 0:31 and 32:64 xmm0.
+ SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
+ CallResult.first, DAG.getIntPtrConstant(0));
+ SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
+ CallResult.first, DAG.getIntPtrConstant(1));
+ SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
+ return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
+}
+
+/// LowerOperation - Provide custom lowering hooks for some operations.
+///
+SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
+ switch (Op.getOpcode()) {
+ default: llvm_unreachable("Should not custom lower this!");
+ case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op,DAG);
+ case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
+ case ISD::ATOMIC_CMP_SWAP: return LowerCMP_SWAP(Op, Subtarget, DAG);
+ case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG);
+ case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op,DAG);
+ case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
+ case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
+ case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG);
+ case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
+ case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
+ case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
+ case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
+ case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG);
+ case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
+ case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
+ case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
+ case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
+ case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
+ case ISD::SHL_PARTS:
+ case ISD::SRA_PARTS:
+ case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
+ case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
+ case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
+ case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
+ case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, DAG);
+ case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, DAG);
+ case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, DAG);
+ case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);
+ case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG);
+ case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
+ case ISD::FABS: return LowerFABS(Op, DAG);
+ case ISD::FNEG: return LowerFNEG(Op, DAG);
+ case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
+ case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
+ case ISD::SETCC: return LowerSETCC(Op, DAG);
+ case ISD::SELECT: return LowerSELECT(Op, DAG);
+ case ISD::BRCOND: return LowerBRCOND(Op, DAG);
+ case ISD::JumpTable: return LowerJumpTable(Op, DAG);
+ case ISD::VASTART: return LowerVASTART(Op, DAG);
+ case ISD::VAARG: return LowerVAARG(Op, DAG);
+ case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
+ case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+ case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);
+ case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
+ case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
+ case ISD::FRAME_TO_ARGS_OFFSET:
+ return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
+ case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
+ case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
+ case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
+ case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
+ case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
+ case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
+ case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
+ case ISD::CTLZ: return LowerCTLZ(Op, DAG);
+ case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ_ZERO_UNDEF(Op, DAG);
+ case ISD::CTTZ: return LowerCTTZ(Op, DAG);
+ case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
+ case ISD::SRA:
+ case ISD::SRL:
+ case ISD::SHL: return LowerShift(Op, DAG);
+ case ISD::SADDO:
+ case ISD::UADDO:
+ case ISD::SSUBO:
+ case ISD::USUBO:
+ case ISD::SMULO:
+ case ISD::UMULO: return LowerXALUO(Op, DAG);
+ case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
+ case ISD::BITCAST: return LowerBITCAST(Op, DAG);
+ case ISD::ADDC:
+ case ISD::ADDE:
+ case ISD::SUBC:
+ case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
+ case ISD::ADD: return LowerADD(Op, DAG);
+ case ISD::SUB: return LowerSUB(Op, DAG);
+ case ISD::SDIV: return LowerSDIV(Op, DAG);
+ case ISD::FSINCOS: return LowerFSINCOS(Op, DAG);
+ }
+}
+
+static void ReplaceATOMIC_LOAD(SDNode *Node,
+ SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) {
+ DebugLoc dl = Node->getDebugLoc();
+ EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
+
+ // Convert wide load -> cmpxchg8b/cmpxchg16b
+ // FIXME: On 32-bit, load -> fild or movq would be more efficient
+ // (The only way to get a 16-byte load is cmpxchg16b)
+ // FIXME: 16-byte ATOMIC_CMP_SWAP isn't actually hooked up at the moment.
+ SDValue Zero = DAG.getConstant(0, VT);
+ SDValue Swap = DAG.getAtomic(ISD::ATOMIC_CMP_SWAP, dl, VT,
+ Node->getOperand(0),
+ Node->getOperand(1), Zero, Zero,
+ cast<AtomicSDNode>(Node)->getMemOperand(),
+ cast<AtomicSDNode>(Node)->getOrdering(),
+ cast<AtomicSDNode>(Node)->getSynchScope());
+ Results.push_back(Swap.getValue(0));
+ Results.push_back(Swap.getValue(1));
+}
+
+static void
+ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results,
+ SelectionDAG &DAG, unsigned NewOp) {
+ DebugLoc dl = Node->getDebugLoc();
+ assert (Node->getValueType(0) == MVT::i64 &&
+ "Only know how to expand i64 atomics");
+
+ SDValue Chain = Node->getOperand(0);
+ SDValue In1 = Node->getOperand(1);
+ SDValue In2L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+ Node->getOperand(2), DAG.getIntPtrConstant(0));
+ SDValue In2H = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+ Node->getOperand(2), DAG.getIntPtrConstant(1));
+ SDValue Ops[] = { Chain, In1, In2L, In2H };
+ SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
+ SDValue Result =
+ DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, array_lengthof(Ops), MVT::i64,
+ cast<MemSDNode>(Node)->getMemOperand());
+ SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)};
+ Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2));
+ Results.push_back(Result.getValue(2));
+}
+
+/// ReplaceNodeResults - Replace a node with an illegal result type
+/// with a new node built out of custom code.
+void X86TargetLowering::ReplaceNodeResults(SDNode *N,
+ SmallVectorImpl<SDValue>&Results,
+ SelectionDAG &DAG) const {
+ DebugLoc dl = N->getDebugLoc();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ switch (N->getOpcode()) {
+ default:
+ llvm_unreachable("Do not know how to custom type legalize this operation!");
+ case ISD::SIGN_EXTEND_INREG:
+ case ISD::ADDC:
+ case ISD::ADDE:
+ case ISD::SUBC:
+ case ISD::SUBE:
+ // We don't want to expand or promote these.
+ return;
+ case ISD::FP_TO_SINT:
+ case ISD::FP_TO_UINT: {
+ bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
+
+ if (!IsSigned && !isIntegerTypeFTOL(SDValue(N, 0).getValueType()))
+ return;
+
+ std::pair<SDValue,SDValue> Vals =
+ FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
+ SDValue FIST = Vals.first, StackSlot = Vals.second;
+ if (FIST.getNode() != 0) {
+ EVT VT = N->getValueType(0);
+ // Return a load from the stack slot.
+ if (StackSlot.getNode() != 0)
+ Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot,
+ MachinePointerInfo(),
+ false, false, false, 0));
+ else
+ Results.push_back(FIST);
+ }
+ return;
+ }
+ case ISD::UINT_TO_FP: {
+ assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
+ if (N->getOperand(0).getValueType() != MVT::v2i32 ||
+ N->getValueType(0) != MVT::v2f32)
+ return;
+ SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64,
+ N->getOperand(0));
+ SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
+ MVT::f64);
+ SDValue VBias = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2f64, Bias, Bias);
+ SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
+ DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, VBias));
+ Or = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or);
+ SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
+ Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
+ return;
+ }
+ case ISD::FP_ROUND: {
+ if (!TLI.isTypeLegal(N->getOperand(0).getValueType()))
+ return;
+ SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
+ Results.push_back(V);
+ return;
+ }
+ case ISD::READCYCLECOUNTER: {
+ SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue TheChain = N->getOperand(0);
+ SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1);
+ SDValue eax = DAG.getCopyFromReg(rd, dl, X86::EAX, MVT::i32,
+ rd.getValue(1));
+ SDValue edx = DAG.getCopyFromReg(eax.getValue(1), dl, X86::EDX, MVT::i32,
+ eax.getValue(2));
+ // Use a buildpair to merge the two 32-bit values into a 64-bit one.
+ SDValue Ops[] = { eax, edx };
+ Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops,
+ array_lengthof(Ops)));
+ Results.push_back(edx.getValue(1));
+ return;
+ }
+ case ISD::ATOMIC_CMP_SWAP: {
+ EVT T = N->getValueType(0);
+ assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
+ bool Regs64bit = T == MVT::i128;
+ EVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
+ SDValue cpInL, cpInH;
+ cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
+ DAG.getConstant(0, HalfT));
+ cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
+ DAG.getConstant(1, HalfT));
+ cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
+ Regs64bit ? X86::RAX : X86::EAX,
+ cpInL, SDValue());
+ cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
+ Regs64bit ? X86::RDX : X86::EDX,
+ cpInH, cpInL.getValue(1));
+ SDValue swapInL, swapInH;
+ swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
+ DAG.getConstant(0, HalfT));
+ swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
+ DAG.getConstant(1, HalfT));
+ swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl,
+ Regs64bit ? X86::RBX : X86::EBX,
+ swapInL, cpInH.getValue(1));
+ swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl,
+ Regs64bit ? X86::RCX : X86::ECX,
+ swapInH, swapInL.getValue(1));
+ SDValue Ops[] = { swapInH.getValue(0),
+ N->getOperand(1),
+ swapInH.getValue(1) };
+ SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
+ MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
+ unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_DAG :
+ X86ISD::LCMPXCHG8_DAG;
+ SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys,
+ Ops, array_lengthof(Ops), T, MMO);
+ SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
+ Regs64bit ? X86::RAX : X86::EAX,
+ HalfT, Result.getValue(1));
+ SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
+ Regs64bit ? X86::RDX : X86::EDX,
+ HalfT, cpOutL.getValue(2));
+ SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
+ Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF, 2));
+ Results.push_back(cpOutH.getValue(1));
+ return;
+ }
+ case ISD::ATOMIC_LOAD_ADD:
+ case ISD::ATOMIC_LOAD_AND:
+ case ISD::ATOMIC_LOAD_NAND:
+ case ISD::ATOMIC_LOAD_OR:
+ case ISD::ATOMIC_LOAD_SUB:
+ case ISD::ATOMIC_LOAD_XOR:
+ case ISD::ATOMIC_LOAD_MAX:
+ case ISD::ATOMIC_LOAD_MIN:
+ case ISD::ATOMIC_LOAD_UMAX:
+ case ISD::ATOMIC_LOAD_UMIN:
+ case ISD::ATOMIC_SWAP: {
+ unsigned Opc;
+ switch (N->getOpcode()) {
+ default: llvm_unreachable("Unexpected opcode");
+ case ISD::ATOMIC_LOAD_ADD:
+ Opc = X86ISD::ATOMADD64_DAG;
+ break;
+ case ISD::ATOMIC_LOAD_AND:
+ Opc = X86ISD::ATOMAND64_DAG;
+ break;
+ case ISD::ATOMIC_LOAD_NAND:
+ Opc = X86ISD::ATOMNAND64_DAG;
+ break;
+ case ISD::ATOMIC_LOAD_OR:
+ Opc = X86ISD::ATOMOR64_DAG;
+ break;
+ case ISD::ATOMIC_LOAD_SUB:
+ Opc = X86ISD::ATOMSUB64_DAG;
+ break;
+ case ISD::ATOMIC_LOAD_XOR:
+ Opc = X86ISD::ATOMXOR64_DAG;
+ break;
+ case ISD::ATOMIC_LOAD_MAX:
+ Opc = X86ISD::ATOMMAX64_DAG;
+ break;
+ case ISD::ATOMIC_LOAD_MIN:
+ Opc = X86ISD::ATOMMIN64_DAG;
+ break;
+ case ISD::ATOMIC_LOAD_UMAX:
+ Opc = X86ISD::ATOMUMAX64_DAG;
+ break;
+ case ISD::ATOMIC_LOAD_UMIN:
+ Opc = X86ISD::ATOMUMIN64_DAG;
+ break;
+ case ISD::ATOMIC_SWAP:
+ Opc = X86ISD::ATOMSWAP64_DAG;
+ break;
+ }
+ ReplaceATOMIC_BINARY_64(N, Results, DAG, Opc);
+ return;
+ }
+ case ISD::ATOMIC_LOAD:
+ ReplaceATOMIC_LOAD(N, Results, DAG);
+ }
+}
+
+const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
+ switch (Opcode) {
+ default: return NULL;
+ case X86ISD::BSF: return "X86ISD::BSF";
+ case X86ISD::BSR: return "X86ISD::BSR";
+ case X86ISD::SHLD: return "X86ISD::SHLD";
+ case X86ISD::SHRD: return "X86ISD::SHRD";
+ case X86ISD::FAND: return "X86ISD::FAND";
+ case X86ISD::FOR: return "X86ISD::FOR";
+ case X86ISD::FXOR: return "X86ISD::FXOR";
+ case X86ISD::FSRL: return "X86ISD::FSRL";
+ case X86ISD::FILD: return "X86ISD::FILD";
+ case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG";
+ case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
+ case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
+ case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
+ case X86ISD::FLD: return "X86ISD::FLD";
+ case X86ISD::FST: return "X86ISD::FST";
+ case X86ISD::CALL: return "X86ISD::CALL";
+ case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG";
+ case X86ISD::BT: return "X86ISD::BT";
+ case X86ISD::CMP: return "X86ISD::CMP";
+ case X86ISD::COMI: return "X86ISD::COMI";
+ case X86ISD::UCOMI: return "X86ISD::UCOMI";
+ case X86ISD::SETCC: return "X86ISD::SETCC";
+ case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY";
+ case X86ISD::FSETCCsd: return "X86ISD::FSETCCsd";
+ case X86ISD::FSETCCss: return "X86ISD::FSETCCss";
+ case X86ISD::CMOV: return "X86ISD::CMOV";
+ case X86ISD::BRCOND: return "X86ISD::BRCOND";
+ case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG";
+ case X86ISD::REP_STOS: return "X86ISD::REP_STOS";
+ case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS";
+ case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg";
+ case X86ISD::Wrapper: return "X86ISD::Wrapper";
+ case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP";
+ case X86ISD::PEXTRB: return "X86ISD::PEXTRB";
+ case X86ISD::PEXTRW: return "X86ISD::PEXTRW";
+ case X86ISD::INSERTPS: return "X86ISD::INSERTPS";
+ case X86ISD::PINSRB: return "X86ISD::PINSRB";
+ case X86ISD::PINSRW: return "X86ISD::PINSRW";
+ case X86ISD::PSHUFB: return "X86ISD::PSHUFB";
+ case X86ISD::ANDNP: return "X86ISD::ANDNP";
+ case X86ISD::PSIGN: return "X86ISD::PSIGN";
+ case X86ISD::BLENDV: return "X86ISD::BLENDV";
+ case X86ISD::BLENDI: return "X86ISD::BLENDI";
+ case X86ISD::SUBUS: return "X86ISD::SUBUS";
+ case X86ISD::HADD: return "X86ISD::HADD";
+ case X86ISD::HSUB: return "X86ISD::HSUB";
+ case X86ISD::FHADD: return "X86ISD::FHADD";
+ case X86ISD::FHSUB: return "X86ISD::FHSUB";
+ case X86ISD::UMAX: return "X86ISD::UMAX";
+ case X86ISD::UMIN: return "X86ISD::UMIN";
+ case X86ISD::SMAX: return "X86ISD::SMAX";
+ case X86ISD::SMIN: return "X86ISD::SMIN";
+ case X86ISD::FMAX: return "X86ISD::FMAX";
+ case X86ISD::FMIN: return "X86ISD::FMIN";
+ case X86ISD::FMAXC: return "X86ISD::FMAXC";
+ case X86ISD::FMINC: return "X86ISD::FMINC";
+ case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
+ case X86ISD::FRCP: return "X86ISD::FRCP";
+ case X86ISD::TLSADDR: return "X86ISD::TLSADDR";
+ case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR";
+ case X86ISD::TLSCALL: return "X86ISD::TLSCALL";
+ case X86ISD::EH_SJLJ_SETJMP: return "X86ISD::EH_SJLJ_SETJMP";
+ case X86ISD::EH_SJLJ_LONGJMP: return "X86ISD::EH_SJLJ_LONGJMP";
+ case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN";
+ case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN";
+ case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m";
+ case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r";
+ case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG";
+ case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG";
+ case X86ISD::ATOMADD64_DAG: return "X86ISD::ATOMADD64_DAG";
+ case X86ISD::ATOMSUB64_DAG: return "X86ISD::ATOMSUB64_DAG";
+ case X86ISD::ATOMOR64_DAG: return "X86ISD::ATOMOR64_DAG";
+ case X86ISD::ATOMXOR64_DAG: return "X86ISD::ATOMXOR64_DAG";
+ case X86ISD::ATOMAND64_DAG: return "X86ISD::ATOMAND64_DAG";
+ case X86ISD::ATOMNAND64_DAG: return "X86ISD::ATOMNAND64_DAG";
+ case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL";
+ case X86ISD::VSEXT_MOVL: return "X86ISD::VSEXT_MOVL";
+ case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD";
+ case X86ISD::VZEXT: return "X86ISD::VZEXT";
+ case X86ISD::VSEXT: return "X86ISD::VSEXT";
+ case X86ISD::VFPEXT: return "X86ISD::VFPEXT";
+ case X86ISD::VFPROUND: return "X86ISD::VFPROUND";
+ case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ";
+ case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ";
+ case X86ISD::VSHL: return "X86ISD::VSHL";
+ case X86ISD::VSRL: return "X86ISD::VSRL";
+ case X86ISD::VSRA: return "X86ISD::VSRA";
+ case X86ISD::VSHLI: return "X86ISD::VSHLI";
+ case X86ISD::VSRLI: return "X86ISD::VSRLI";
+ case X86ISD::VSRAI: return "X86ISD::VSRAI";
+ case X86ISD::CMPP: return "X86ISD::CMPP";
+ case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ";
+ case X86ISD::PCMPGT: return "X86ISD::PCMPGT";
+ case X86ISD::ADD: return "X86ISD::ADD";
+ case X86ISD::SUB: return "X86ISD::SUB";
+ case X86ISD::ADC: return "X86ISD::ADC";
+ case X86ISD::SBB: return "X86ISD::SBB";
+ case X86ISD::SMUL: return "X86ISD::SMUL";
+ case X86ISD::UMUL: return "X86ISD::UMUL";
+ case X86ISD::INC: return "X86ISD::INC";
+ case X86ISD::DEC: return "X86ISD::DEC";
+ case X86ISD::OR: return "X86ISD::OR";
+ case X86ISD::XOR: return "X86ISD::XOR";
+ case X86ISD::AND: return "X86ISD::AND";
+ case X86ISD::BLSI: return "X86ISD::BLSI";
+ case X86ISD::BLSMSK: return "X86ISD::BLSMSK";
+ case X86ISD::BLSR: return "X86ISD::BLSR";
+ case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM";
+ case X86ISD::PTEST: return "X86ISD::PTEST";
+ case X86ISD::TESTP: return "X86ISD::TESTP";
+ case X86ISD::PALIGNR: return "X86ISD::PALIGNR";
+ case X86ISD::PSHUFD: return "X86ISD::PSHUFD";
+ case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW";
+ case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW";
+ case X86ISD::SHUFP: return "X86ISD::SHUFP";
+ case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS";
+ case X86ISD::MOVLHPD: return "X86ISD::MOVLHPD";
+ case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS";
+ case X86ISD::MOVLPS: return "X86ISD::MOVLPS";
+ case X86ISD::MOVLPD: return "X86ISD::MOVLPD";
+ case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP";
+ case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP";
+ case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP";
+ case X86ISD::MOVSD: return "X86ISD::MOVSD";
+ case X86ISD::MOVSS: return "X86ISD::MOVSS";
+ case X86ISD::UNPCKL: return "X86ISD::UNPCKL";
+ case X86ISD::UNPCKH: return "X86ISD::UNPCKH";
+ case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST";
+ case X86ISD::VPERMILP: return "X86ISD::VPERMILP";
+ case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128";
+ case X86ISD::VPERMV: return "X86ISD::VPERMV";
+ case X86ISD::VPERMI: return "X86ISD::VPERMI";
+ case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ";
+ case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
+ case X86ISD::VAARG_64: return "X86ISD::VAARG_64";
+ case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA";
+ case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER";
+ case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA";
+ case X86ISD::WIN_FTOL: return "X86ISD::WIN_FTOL";
+ case X86ISD::SAHF: return "X86ISD::SAHF";
+ case X86ISD::RDRAND: return "X86ISD::RDRAND";
+ case X86ISD::RDSEED: return "X86ISD::RDSEED";
+ case X86ISD::FMADD: return "X86ISD::FMADD";
+ case X86ISD::FMSUB: return "X86ISD::FMSUB";
+ case X86ISD::FNMADD: return "X86ISD::FNMADD";
+ case X86ISD::FNMSUB: return "X86ISD::FNMSUB";
+ case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB";
+ case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD";
+ case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI";
+ case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI";
+ case X86ISD::XTEST: return "X86ISD::XTEST";
+ }
+}
+
+// isLegalAddressingMode - Return true if the addressing mode represented
+// by AM is legal for this target, for a load/store of the specified type.
+bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM,
+ Type *Ty) const {
+ // X86 supports extremely general addressing modes.
+ CodeModel::Model M = getTargetMachine().getCodeModel();
+ Reloc::Model R = getTargetMachine().getRelocationModel();
+
+ // X86 allows a sign-extended 32-bit immediate field as a displacement.
+ if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != NULL))
+ return false;
+
+ if (AM.BaseGV) {
+ unsigned GVFlags =
+ Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine());
+
+ // If a reference to this global requires an extra load, we can't fold it.
+ if (isGlobalStubReference(GVFlags))
+ return false;
+
+ // If BaseGV requires a register for the PIC base, we cannot also have a
+ // BaseReg specified.
+ if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
+ return false;
+
+ // If lower 4G is not available, then we must use rip-relative addressing.
+ if ((M != CodeModel::Small || R != Reloc::Static) &&
+ Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1))
+ return false;
+ }
+
+ switch (AM.Scale) {
+ case 0:
+ case 1:
+ case 2:
+ case 4:
+ case 8:
+ // These scales always work.
+ break;
+ case 3:
+ case 5:
+ case 9:
+ // These scales are formed with basereg+scalereg. Only accept if there is
+ // no basereg yet.
+ if (AM.HasBaseReg)
+ return false;
+ break;
+ default: // Other stuff never works.
+ return false;
+ }
+
+ return true;
+}
+
+bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
+ if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
+ return false;
+ unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
+ unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
+ return NumBits1 > NumBits2;
+}
+
+bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
+ return isInt<32>(Imm);
+}
+
+bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
+ // Can also use sub to handle negated immediates.
+ return isInt<32>(Imm);
+}
+
+bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
+ if (!VT1.isInteger() || !VT2.isInteger())
+ return false;
+ unsigned NumBits1 = VT1.getSizeInBits();
+ unsigned NumBits2 = VT2.getSizeInBits();
+ return NumBits1 > NumBits2;
+}
+
+bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
+ // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
+ return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit();
+}
+
+bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
+ // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
+ return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit();
+}
+
+bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
+ EVT VT1 = Val.getValueType();
+ if (isZExtFree(VT1, VT2))
+ return true;
+
+ if (Val.getOpcode() != ISD::LOAD)
+ return false;
+
+ if (!VT1.isSimple() || !VT1.isInteger() ||
+ !VT2.isSimple() || !VT2.isInteger())
+ return false;
+
+ switch (VT1.getSimpleVT().SimpleTy) {
+ default: break;
+ case MVT::i8:
+ case MVT::i16:
+ case MVT::i32:
+ // X86 has 8, 16, and 32-bit zero-extending loads.
+ return true;
+ }
+
+ return false;
+}
+
+bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
+ // i16 instructions are longer (0x66 prefix) and potentially slower.
+ return !(VT1 == MVT::i32 && VT2 == MVT::i16);
+}
+
+/// isShuffleMaskLegal - Targets can use this to indicate that they only
+/// support *some* VECTOR_SHUFFLE operations, those with specific masks.
+/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
+/// are assumed to be legal.
+bool
+X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
+ EVT VT) const {
+ // Very little shuffling can be done for 64-bit vectors right now.
+ if (VT.getSizeInBits() == 64)
+ return false;
+
+ // FIXME: pshufb, blends, shifts.
+ return (VT.getVectorNumElements() == 2 ||
+ ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
+ isMOVLMask(M, VT) ||
+ isSHUFPMask(M, VT, Subtarget->hasFp256()) ||
+ isPSHUFDMask(M, VT) ||
+ isPSHUFHWMask(M, VT, Subtarget->hasInt256()) ||
+ isPSHUFLWMask(M, VT, Subtarget->hasInt256()) ||
+ isPALIGNRMask(M, VT, Subtarget) ||
+ isUNPCKLMask(M, VT, Subtarget->hasInt256()) ||
+ isUNPCKHMask(M, VT, Subtarget->hasInt256()) ||
+ isUNPCKL_v_undef_Mask(M, VT, Subtarget->hasInt256()) ||
+ isUNPCKH_v_undef_Mask(M, VT, Subtarget->hasInt256()));
+}
+
+bool
+X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
+ EVT VT) const {
+ unsigned NumElts = VT.getVectorNumElements();
+ // FIXME: This collection of masks seems suspect.
+ if (NumElts == 2)
+ return true;
+ if (NumElts == 4 && VT.is128BitVector()) {
+ return (isMOVLMask(Mask, VT) ||
+ isCommutedMOVLMask(Mask, VT, true) ||
+ isSHUFPMask(Mask, VT, Subtarget->hasFp256()) ||
+ isSHUFPMask(Mask, VT, Subtarget->hasFp256(), /* Commuted */ true));
+ }
+ return false;
+}
+
+//===----------------------------------------------------------------------===//
+// X86 Scheduler Hooks
+//===----------------------------------------------------------------------===//
+
+/// Utility function to emit xbegin specifying the start of an RTM region.
+static MachineBasicBlock *EmitXBegin(MachineInstr *MI, MachineBasicBlock *MBB,
+ const TargetInstrInfo *TII) {
+ DebugLoc DL = MI->getDebugLoc();
+
+ const BasicBlock *BB = MBB->getBasicBlock();
+ MachineFunction::iterator I = MBB;
+ ++I;
+
+ // For the v = xbegin(), we generate
+ //
+ // thisMBB:
+ // xbegin sinkMBB
+ //
+ // mainMBB:
+ // eax = -1
+ //
+ // sinkMBB:
+ // v = eax
+
+ MachineBasicBlock *thisMBB = MBB;
+ MachineFunction *MF = MBB->getParent();
+ MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
+ MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
+ MF->insert(I, mainMBB);
+ MF->insert(I, sinkMBB);
+
+ // Transfer the remainder of BB and its successor edges to sinkMBB.
+ sinkMBB->splice(sinkMBB->begin(), MBB,
+ llvm::next(MachineBasicBlock::iterator(MI)), MBB->end());
+ sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
+
+ // thisMBB:
+ // xbegin sinkMBB
+ // # fallthrough to mainMBB
+ // # abortion to sinkMBB
+ BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(sinkMBB);
+ thisMBB->addSuccessor(mainMBB);
+ thisMBB->addSuccessor(sinkMBB);
+
+ // mainMBB:
+ // EAX = -1
+ BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), X86::EAX).addImm(-1);
+ mainMBB->addSuccessor(sinkMBB);
+
+ // sinkMBB:
+ // EAX is live into the sinkMBB
+ sinkMBB->addLiveIn(X86::EAX);
+ BuildMI(*sinkMBB, sinkMBB->begin(), DL,
+ TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
+ .addReg(X86::EAX);
+
+ MI->eraseFromParent();
+ return sinkMBB;
+}
+
+// Get CMPXCHG opcode for the specified data type.
+static unsigned getCmpXChgOpcode(EVT VT) {
+ switch (VT.getSimpleVT().SimpleTy) {
+ case MVT::i8: return X86::LCMPXCHG8;
+ case MVT::i16: return X86::LCMPXCHG16;
+ case MVT::i32: return X86::LCMPXCHG32;
+ case MVT::i64: return X86::LCMPXCHG64;
+ default:
+ break;
+ }
+ llvm_unreachable("Invalid operand size!");
+}
+
+// Get LOAD opcode for the specified data type.
+static unsigned getLoadOpcode(EVT VT) {
+ switch (VT.getSimpleVT().SimpleTy) {
+ case MVT::i8: return X86::MOV8rm;
+ case MVT::i16: return X86::MOV16rm;
+ case MVT::i32: return X86::MOV32rm;
+ case MVT::i64: return X86::MOV64rm;
+ default:
+ break;
+ }
+ llvm_unreachable("Invalid operand size!");
+}
+
+// Get opcode of the non-atomic one from the specified atomic instruction.
+static unsigned getNonAtomicOpcode(unsigned Opc) {
+ switch (Opc) {
+ case X86::ATOMAND8: return X86::AND8rr;
+ case X86::ATOMAND16: return X86::AND16rr;
+ case X86::ATOMAND32: return X86::AND32rr;
+ case X86::ATOMAND64: return X86::AND64rr;
+ case X86::ATOMOR8: return X86::OR8rr;
+ case X86::ATOMOR16: return X86::OR16rr;
+ case X86::ATOMOR32: return X86::OR32rr;
+ case X86::ATOMOR64: return X86::OR64rr;
+ case X86::ATOMXOR8: return X86::XOR8rr;
+ case X86::ATOMXOR16: return X86::XOR16rr;
+ case X86::ATOMXOR32: return X86::XOR32rr;
+ case X86::ATOMXOR64: return X86::XOR64rr;
+ }
+ llvm_unreachable("Unhandled atomic-load-op opcode!");
+}
+
+// Get opcode of the non-atomic one from the specified atomic instruction with
+// extra opcode.
+static unsigned getNonAtomicOpcodeWithExtraOpc(unsigned Opc,
+ unsigned &ExtraOpc) {
+ switch (Opc) {
+ case X86::ATOMNAND8: ExtraOpc = X86::NOT8r; return X86::AND8rr;
+ case X86::ATOMNAND16: ExtraOpc = X86::NOT16r; return X86::AND16rr;
+ case X86::ATOMNAND32: ExtraOpc = X86::NOT32r; return X86::AND32rr;
+ case X86::ATOMNAND64: ExtraOpc = X86::NOT64r; return X86::AND64rr;
+ case X86::ATOMMAX8: ExtraOpc = X86::CMP8rr; return X86::CMOVL32rr;
+ case X86::ATOMMAX16: ExtraOpc = X86::CMP16rr; return X86::CMOVL16rr;
+ case X86::ATOMMAX32: ExtraOpc = X86::CMP32rr; return X86::CMOVL32rr;
+ case X86::ATOMMAX64: ExtraOpc = X86::CMP64rr; return X86::CMOVL64rr;
+ case X86::ATOMMIN8: ExtraOpc = X86::CMP8rr; return X86::CMOVG32rr;
+ case X86::ATOMMIN16: ExtraOpc = X86::CMP16rr; return X86::CMOVG16rr;
+ case X86::ATOMMIN32: ExtraOpc = X86::CMP32rr; return X86::CMOVG32rr;
+ case X86::ATOMMIN64: ExtraOpc = X86::CMP64rr; return X86::CMOVG64rr;
+ case X86::ATOMUMAX8: ExtraOpc = X86::CMP8rr; return X86::CMOVB32rr;
+ case X86::ATOMUMAX16: ExtraOpc = X86::CMP16rr; return X86::CMOVB16rr;
+ case X86::ATOMUMAX32: ExtraOpc = X86::CMP32rr; return X86::CMOVB32rr;
+ case X86::ATOMUMAX64: ExtraOpc = X86::CMP64rr; return X86::CMOVB64rr;
+ case X86::ATOMUMIN8: ExtraOpc = X86::CMP8rr; return X86::CMOVA32rr;
+ case X86::ATOMUMIN16: ExtraOpc = X86::CMP16rr; return X86::CMOVA16rr;
+ case X86::ATOMUMIN32: ExtraOpc = X86::CMP32rr; return X86::CMOVA32rr;
+ case X86::ATOMUMIN64: ExtraOpc = X86::CMP64rr; return X86::CMOVA64rr;
+ }
+ llvm_unreachable("Unhandled atomic-load-op opcode!");
+}
+
+// Get opcode of the non-atomic one from the specified atomic instruction for
+// 64-bit data type on 32-bit target.
+static unsigned getNonAtomic6432Opcode(unsigned Opc, unsigned &HiOpc) {
+ switch (Opc) {
+ case X86::ATOMAND6432: HiOpc = X86::AND32rr; return X86::AND32rr;
+ case X86::ATOMOR6432: HiOpc = X86::OR32rr; return X86::OR32rr;
+ case X86::ATOMXOR6432: HiOpc = X86::XOR32rr; return X86::XOR32rr;
+ case X86::ATOMADD6432: HiOpc = X86::ADC32rr; return X86::ADD32rr;
+ case X86::ATOMSUB6432: HiOpc = X86::SBB32rr; return X86::SUB32rr;
+ case X86::ATOMSWAP6432: HiOpc = X86::MOV32rr; return X86::MOV32rr;
+ case X86::ATOMMAX6432: HiOpc = X86::SETLr; return X86::SETLr;
+ case X86::ATOMMIN6432: HiOpc = X86::SETGr; return X86::SETGr;
+ case X86::ATOMUMAX6432: HiOpc = X86::SETBr; return X86::SETBr;
+ case X86::ATOMUMIN6432: HiOpc = X86::SETAr; return X86::SETAr;
+ }
+ llvm_unreachable("Unhandled atomic-load-op opcode!");
+}
+
+// Get opcode of the non-atomic one from the specified atomic instruction for
+// 64-bit data type on 32-bit target with extra opcode.
+static unsigned getNonAtomic6432OpcodeWithExtraOpc(unsigned Opc,
+ unsigned &HiOpc,
+ unsigned &ExtraOpc) {
+ switch (Opc) {
+ case X86::ATOMNAND6432:
+ ExtraOpc = X86::NOT32r;
+ HiOpc = X86::AND32rr;
+ return X86::AND32rr;
+ }
+ llvm_unreachable("Unhandled atomic-load-op opcode!");
+}
+
+// Get pseudo CMOV opcode from the specified data type.
+static unsigned getPseudoCMOVOpc(EVT VT) {
+ switch (VT.getSimpleVT().SimpleTy) {
+ case MVT::i8: return X86::CMOV_GR8;
+ case MVT::i16: return X86::CMOV_GR16;
+ case MVT::i32: return X86::CMOV_GR32;
+ default:
+ break;
+ }
+ llvm_unreachable("Unknown CMOV opcode!");
+}
+
+// EmitAtomicLoadArith - emit the code sequence for pseudo atomic instructions.
+// They will be translated into a spin-loop or compare-exchange loop from
+//
+// ...
+// dst = atomic-fetch-op MI.addr, MI.val
+// ...
+//
+// to
+//
+// ...
+// t1 = LOAD MI.addr
+// loop:
+// t4 = phi(t1, t3 / loop)
+// t2 = OP MI.val, t4
+// EAX = t4
+// LCMPXCHG [MI.addr], t2, [EAX is implicitly used & defined]
+// t3 = EAX
+// JNE loop
+// sink:
+// dst = t3
+// ...
+MachineBasicBlock *
+X86TargetLowering::EmitAtomicLoadArith(MachineInstr *MI,
+ MachineBasicBlock *MBB) const {
+ const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+ DebugLoc DL = MI->getDebugLoc();
+
+ MachineFunction *MF = MBB->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+
+ const BasicBlock *BB = MBB->getBasicBlock();
+ MachineFunction::iterator I = MBB;
+ ++I;
+
+ assert(MI->getNumOperands() <= X86::AddrNumOperands + 4 &&
+ "Unexpected number of operands");
+
+ assert(MI->hasOneMemOperand() &&
+ "Expected atomic-load-op to have one memoperand");
+
+ // Memory Reference
+ MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
+ MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
+
+ unsigned DstReg, SrcReg;
+ unsigned MemOpndSlot;
+
+ unsigned CurOp = 0;
+
+ DstReg = MI->getOperand(CurOp++).getReg();
+ MemOpndSlot = CurOp;
+ CurOp += X86::AddrNumOperands;
+ SrcReg = MI->getOperand(CurOp++).getReg();
+
+ const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
+ MVT::SimpleValueType VT = *RC->vt_begin();
+ unsigned t1 = MRI.createVirtualRegister(RC);
+ unsigned t2 = MRI.createVirtualRegister(RC);
+ unsigned t3 = MRI.createVirtualRegister(RC);
+ unsigned t4 = MRI.createVirtualRegister(RC);
+ unsigned PhyReg = getX86SubSuperRegister(X86::EAX, VT);
+
+ unsigned LCMPXCHGOpc = getCmpXChgOpcode(VT);
+ unsigned LOADOpc = getLoadOpcode(VT);
+
+ // For the atomic load-arith operator, we generate
+ //
+ // thisMBB:
+ // t1 = LOAD [MI.addr]
+ // mainMBB:
+ // t4 = phi(t1 / thisMBB, t3 / mainMBB)
+ // t1 = OP MI.val, EAX
+ // EAX = t4
+ // LCMPXCHG [MI.addr], t1, [EAX is implicitly used & defined]
+ // t3 = EAX
+ // JNE mainMBB
+ // sinkMBB:
+ // dst = t3
+
+ MachineBasicBlock *thisMBB = MBB;
+ MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
+ MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
+ MF->insert(I, mainMBB);
+ MF->insert(I, sinkMBB);
+
+ MachineInstrBuilder MIB;
+
+ // Transfer the remainder of BB and its successor edges to sinkMBB.
+ sinkMBB->splice(sinkMBB->begin(), MBB,
+ llvm::next(MachineBasicBlock::iterator(MI)), MBB->end());
+ sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
+
+ // thisMBB:
+ MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), t1);
+ for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
+ MachineOperand NewMO = MI->getOperand(MemOpndSlot + i);
+ if (NewMO.isReg())
+ NewMO.setIsKill(false);
+ MIB.addOperand(NewMO);
+ }
+ for (MachineInstr::mmo_iterator MMOI = MMOBegin; MMOI != MMOEnd; ++MMOI) {
+ unsigned flags = (*MMOI)->getFlags();
+ flags = (flags & ~MachineMemOperand::MOStore) | MachineMemOperand::MOLoad;
+ MachineMemOperand *MMO =
+ MF->getMachineMemOperand((*MMOI)->getPointerInfo(), flags,
+ (*MMOI)->getSize(),
+ (*MMOI)->getBaseAlignment(),
+ (*MMOI)->getTBAAInfo(),
+ (*MMOI)->getRanges());
+ MIB.addMemOperand(MMO);
+ }
+
+ thisMBB->addSuccessor(mainMBB);
+
+ // mainMBB:
+ MachineBasicBlock *origMainMBB = mainMBB;
+
+ // Add a PHI.
+ MachineInstr *Phi = BuildMI(mainMBB, DL, TII->get(X86::PHI), t4)
+ .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(mainMBB);
+
+ unsigned Opc = MI->getOpcode();
+ switch (Opc) {
+ default:
+ llvm_unreachable("Unhandled atomic-load-op opcode!");
+ case X86::ATOMAND8:
+ case X86::ATOMAND16:
+ case X86::ATOMAND32:
+ case X86::ATOMAND64:
+ case X86::ATOMOR8:
+ case X86::ATOMOR16:
+ case X86::ATOMOR32:
+ case X86::ATOMOR64:
+ case X86::ATOMXOR8:
+ case X86::ATOMXOR16:
+ case X86::ATOMXOR32:
+ case X86::ATOMXOR64: {
+ unsigned ARITHOpc = getNonAtomicOpcode(Opc);
+ BuildMI(mainMBB, DL, TII->get(ARITHOpc), t2).addReg(SrcReg)
+ .addReg(t4);
+ break;
+ }
+ case X86::ATOMNAND8:
+ case X86::ATOMNAND16:
+ case X86::ATOMNAND32:
+ case X86::ATOMNAND64: {
+ unsigned Tmp = MRI.createVirtualRegister(RC);
+ unsigned NOTOpc;
+ unsigned ANDOpc = getNonAtomicOpcodeWithExtraOpc(Opc, NOTOpc);
+ BuildMI(mainMBB, DL, TII->get(ANDOpc), Tmp).addReg(SrcReg)
+ .addReg(t4);
+ BuildMI(mainMBB, DL, TII->get(NOTOpc), t2).addReg(Tmp);
+ break;
+ }
+ case X86::ATOMMAX8:
+ case X86::ATOMMAX16:
+ case X86::ATOMMAX32:
+ case X86::ATOMMAX64:
+ case X86::ATOMMIN8:
+ case X86::ATOMMIN16:
+ case X86::ATOMMIN32:
+ case X86::ATOMMIN64:
+ case X86::ATOMUMAX8:
+ case X86::ATOMUMAX16:
+ case X86::ATOMUMAX32:
+ case X86::ATOMUMAX64:
+ case X86::ATOMUMIN8:
+ case X86::ATOMUMIN16:
+ case X86::ATOMUMIN32:
+ case X86::ATOMUMIN64: {
+ unsigned CMPOpc;
+ unsigned CMOVOpc = getNonAtomicOpcodeWithExtraOpc(Opc, CMPOpc);
+
+ BuildMI(mainMBB, DL, TII->get(CMPOpc))
+ .addReg(SrcReg)
+ .addReg(t4);
+
+ if (Subtarget->hasCMov()) {
+ if (VT != MVT::i8) {
+ // Native support
+ BuildMI(mainMBB, DL, TII->get(CMOVOpc), t2)
+ .addReg(SrcReg)
+ .addReg(t4);
+ } else {
+ // Promote i8 to i32 to use CMOV32
+ const TargetRegisterInfo* TRI = getTargetMachine().getRegisterInfo();
+ const TargetRegisterClass *RC32 =
+ TRI->getSubClassWithSubReg(getRegClassFor(MVT::i32), X86::sub_8bit);
+ unsigned SrcReg32 = MRI.createVirtualRegister(RC32);
+ unsigned AccReg32 = MRI.createVirtualRegister(RC32);
+ unsigned Tmp = MRI.createVirtualRegister(RC32);
+
+ unsigned Undef = MRI.createVirtualRegister(RC32);
+ BuildMI(mainMBB, DL, TII->get(TargetOpcode::IMPLICIT_DEF), Undef);
+
+ BuildMI(mainMBB, DL, TII->get(TargetOpcode::INSERT_SUBREG), SrcReg32)
+ .addReg(Undef)
+ .addReg(SrcReg)
+ .addImm(X86::sub_8bit);
+ BuildMI(mainMBB, DL, TII->get(TargetOpcode::INSERT_SUBREG), AccReg32)
+ .addReg(Undef)
+ .addReg(t4)
+ .addImm(X86::sub_8bit);
+
+ BuildMI(mainMBB, DL, TII->get(CMOVOpc), Tmp)
+ .addReg(SrcReg32)
+ .addReg(AccReg32);
+
+ BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t2)
+ .addReg(Tmp, 0, X86::sub_8bit);
+ }
+ } else {
+ // Use pseudo select and lower them.
+ assert((VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32) &&
+ "Invalid atomic-load-op transformation!");
+ unsigned SelOpc = getPseudoCMOVOpc(VT);
+ X86::CondCode CC = X86::getCondFromCMovOpc(CMOVOpc);
+ assert(CC != X86::COND_INVALID && "Invalid atomic-load-op transformation!");
+ MIB = BuildMI(mainMBB, DL, TII->get(SelOpc), t2)
+ .addReg(SrcReg).addReg(t4)
+ .addImm(CC);
+ mainMBB = EmitLoweredSelect(MIB, mainMBB);
+ // Replace the original PHI node as mainMBB is changed after CMOV
+ // lowering.
+ BuildMI(*origMainMBB, Phi, DL, TII->get(X86::PHI), t4)
+ .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(mainMBB);
+ Phi->eraseFromParent();
+ }
+ break;
+ }
+ }
+
+ // Copy PhyReg back from virtual register.
+ BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), PhyReg)
+ .addReg(t4);
+
+ MIB = BuildMI(mainMBB, DL, TII->get(LCMPXCHGOpc));
+ for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
+ MachineOperand NewMO = MI->getOperand(MemOpndSlot + i);
+ if (NewMO.isReg())
+ NewMO.setIsKill(false);
+ MIB.addOperand(NewMO);
+ }
+ MIB.addReg(t2);
+ MIB.setMemRefs(MMOBegin, MMOEnd);
+
+ // Copy PhyReg back to virtual register.
+ BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t3)
+ .addReg(PhyReg);
+
+ BuildMI(mainMBB, DL, TII->get(X86::JNE_4)).addMBB(origMainMBB);
+
+ mainMBB->addSuccessor(origMainMBB);
+ mainMBB->addSuccessor(sinkMBB);
+
+ // sinkMBB:
+ BuildMI(*sinkMBB, sinkMBB->begin(), DL,
+ TII->get(TargetOpcode::COPY), DstReg)
+ .addReg(t3);
+
+ MI->eraseFromParent();
+ return sinkMBB;
+}
+
+// EmitAtomicLoadArith6432 - emit the code sequence for pseudo atomic
+// instructions. They will be translated into a spin-loop or compare-exchange
+// loop from
+//
+// ...
+// dst = atomic-fetch-op MI.addr, MI.val
+// ...
+//
+// to
+//
+// ...
+// t1L = LOAD [MI.addr + 0]
+// t1H = LOAD [MI.addr + 4]
+// loop:
+// t4L = phi(t1L, t3L / loop)
+// t4H = phi(t1H, t3H / loop)
+// t2L = OP MI.val.lo, t4L
+// t2H = OP MI.val.hi, t4H
+// EAX = t4L
+// EDX = t4H
+// EBX = t2L
+// ECX = t2H
+// LCMPXCHG8B [MI.addr], [ECX:EBX & EDX:EAX are implicitly used and EDX:EAX is implicitly defined]
+// t3L = EAX
+// t3H = EDX
+// JNE loop
+// sink:
+// dstL = t3L
+// dstH = t3H
+// ...
+MachineBasicBlock *
+X86TargetLowering::EmitAtomicLoadArith6432(MachineInstr *MI,
+ MachineBasicBlock *MBB) const {
+ const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+ DebugLoc DL = MI->getDebugLoc();
+
+ MachineFunction *MF = MBB->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+
+ const BasicBlock *BB = MBB->getBasicBlock();
+ MachineFunction::iterator I = MBB;
+ ++I;
+
+ assert(MI->getNumOperands() <= X86::AddrNumOperands + 7 &&
+ "Unexpected number of operands");
+
+ assert(MI->hasOneMemOperand() &&
+ "Expected atomic-load-op32 to have one memoperand");
+
+ // Memory Reference
+ MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
+ MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
+
+ unsigned DstLoReg, DstHiReg;
+ unsigned SrcLoReg, SrcHiReg;
+ unsigned MemOpndSlot;
+
+ unsigned CurOp = 0;
+
+ DstLoReg = MI->getOperand(CurOp++).getReg();
+ DstHiReg = MI->getOperand(CurOp++).getReg();
+ MemOpndSlot = CurOp;
+ CurOp += X86::AddrNumOperands;
+ SrcLoReg = MI->getOperand(CurOp++).getReg();
+ SrcHiReg = MI->getOperand(CurOp++).getReg();
+
+ const TargetRegisterClass *RC = &X86::GR32RegClass;
+ const TargetRegisterClass *RC8 = &X86::GR8RegClass;
+
+ unsigned t1L = MRI.createVirtualRegister(RC);
+ unsigned t1H = MRI.createVirtualRegister(RC);
+ unsigned t2L = MRI.createVirtualRegister(RC);
+ unsigned t2H = MRI.createVirtualRegister(RC);
+ unsigned t3L = MRI.createVirtualRegister(RC);
+ unsigned t3H = MRI.createVirtualRegister(RC);
+ unsigned t4L = MRI.createVirtualRegister(RC);
+ unsigned t4H = MRI.createVirtualRegister(RC);
+
+ unsigned LCMPXCHGOpc = X86::LCMPXCHG8B;
+ unsigned LOADOpc = X86::MOV32rm;
+
+ // For the atomic load-arith operator, we generate
+ //
+ // thisMBB:
+ // t1L = LOAD [MI.addr + 0]
+ // t1H = LOAD [MI.addr + 4]
+ // mainMBB:
+ // t4L = phi(t1L / thisMBB, t3L / mainMBB)
+ // t4H = phi(t1H / thisMBB, t3H / mainMBB)
+ // t2L = OP MI.val.lo, t4L
+ // t2H = OP MI.val.hi, t4H
+ // EBX = t2L
+ // ECX = t2H
+ // LCMPXCHG8B [MI.addr], [ECX:EBX & EDX:EAX are implicitly used and EDX:EAX is implicitly defined]
+ // t3L = EAX
+ // t3H = EDX
+ // JNE loop
+ // sinkMBB:
+ // dstL = t3L
+ // dstH = t3H
+
+ MachineBasicBlock *thisMBB = MBB;
+ MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
+ MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
+ MF->insert(I, mainMBB);
+ MF->insert(I, sinkMBB);
+
+ MachineInstrBuilder MIB;
+
+ // Transfer the remainder of BB and its successor edges to sinkMBB.
+ sinkMBB->splice(sinkMBB->begin(), MBB,
+ llvm::next(MachineBasicBlock::iterator(MI)), MBB->end());
+ sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
+
+ // thisMBB:
+ // Lo
+ MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), t1L);
+ for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
+ MachineOperand NewMO = MI->getOperand(MemOpndSlot + i);
+ if (NewMO.isReg())
+ NewMO.setIsKill(false);
+ MIB.addOperand(NewMO);
+ }
+ for (MachineInstr::mmo_iterator MMOI = MMOBegin; MMOI != MMOEnd; ++MMOI) {
+ unsigned flags = (*MMOI)->getFlags();
+ flags = (flags & ~MachineMemOperand::MOStore) | MachineMemOperand::MOLoad;
+ MachineMemOperand *MMO =
+ MF->getMachineMemOperand((*MMOI)->getPointerInfo(), flags,
+ (*MMOI)->getSize(),
+ (*MMOI)->getBaseAlignment(),
+ (*MMOI)->getTBAAInfo(),
+ (*MMOI)->getRanges());
+ MIB.addMemOperand(MMO);
+ };
+ MachineInstr *LowMI = MIB;
+
+ // Hi
+ MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), t1H);
+ for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
+ if (i == X86::AddrDisp) {
+ MIB.addDisp(MI->getOperand(MemOpndSlot + i), 4); // 4 == sizeof(i32)
+ } else {
+ MachineOperand NewMO = MI->getOperand(MemOpndSlot + i);
+ if (NewMO.isReg())
+ NewMO.setIsKill(false);
+ MIB.addOperand(NewMO);
+ }
+ }
+ MIB.setMemRefs(LowMI->memoperands_begin(), LowMI->memoperands_end());
+
+ thisMBB->addSuccessor(mainMBB);
+
+ // mainMBB:
+ MachineBasicBlock *origMainMBB = mainMBB;
+
+ // Add PHIs.
+ MachineInstr *PhiL = BuildMI(mainMBB, DL, TII->get(X86::PHI), t4L)
+ .addReg(t1L).addMBB(thisMBB).addReg(t3L).addMBB(mainMBB);
+ MachineInstr *PhiH = BuildMI(mainMBB, DL, TII->get(X86::PHI), t4H)
+ .addReg(t1H).addMBB(thisMBB).addReg(t3H).addMBB(mainMBB);
+
+ unsigned Opc = MI->getOpcode();
+ switch (Opc) {
+ default:
+ llvm_unreachable("Unhandled atomic-load-op6432 opcode!");
+ case X86::ATOMAND6432:
+ case X86::ATOMOR6432:
+ case X86::ATOMXOR6432:
+ case X86::ATOMADD6432:
+ case X86::ATOMSUB6432: {
+ unsigned HiOpc;
+ unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc);
+ BuildMI(mainMBB, DL, TII->get(LoOpc), t2L).addReg(t4L)
+ .addReg(SrcLoReg);
+ BuildMI(mainMBB, DL, TII->get(HiOpc), t2H).addReg(t4H)
+ .addReg(SrcHiReg);
+ break;
+ }
+ case X86::ATOMNAND6432: {
+ unsigned HiOpc, NOTOpc;
+ unsigned LoOpc = getNonAtomic6432OpcodeWithExtraOpc(Opc, HiOpc, NOTOpc);
+ unsigned TmpL = MRI.createVirtualRegister(RC);
+ unsigned TmpH = MRI.createVirtualRegister(RC);
+ BuildMI(mainMBB, DL, TII->get(LoOpc), TmpL).addReg(SrcLoReg)
+ .addReg(t4L);
+ BuildMI(mainMBB, DL, TII->get(HiOpc), TmpH).addReg(SrcHiReg)
+ .addReg(t4H);
+ BuildMI(mainMBB, DL, TII->get(NOTOpc), t2L).addReg(TmpL);
+ BuildMI(mainMBB, DL, TII->get(NOTOpc), t2H).addReg(TmpH);
+ break;
+ }
+ case X86::ATOMMAX6432:
+ case X86::ATOMMIN6432:
+ case X86::ATOMUMAX6432:
+ case X86::ATOMUMIN6432: {
+ unsigned HiOpc;
+ unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc);
+ unsigned cL = MRI.createVirtualRegister(RC8);
+ unsigned cH = MRI.createVirtualRegister(RC8);
+ unsigned cL32 = MRI.createVirtualRegister(RC);
+ unsigned cH32 = MRI.createVirtualRegister(RC);
+ unsigned cc = MRI.createVirtualRegister(RC);
+ // cl := cmp src_lo, lo
+ BuildMI(mainMBB, DL, TII->get(X86::CMP32rr))
+ .addReg(SrcLoReg).addReg(t4L);
+ BuildMI(mainMBB, DL, TII->get(LoOpc), cL);
+ BuildMI(mainMBB, DL, TII->get(X86::MOVZX32rr8), cL32).addReg(cL);
+ // ch := cmp src_hi, hi
+ BuildMI(mainMBB, DL, TII->get(X86::CMP32rr))
+ .addReg(SrcHiReg).addReg(t4H);
+ BuildMI(mainMBB, DL, TII->get(HiOpc), cH);
+ BuildMI(mainMBB, DL, TII->get(X86::MOVZX32rr8), cH32).addReg(cH);
+ // cc := if (src_hi == hi) ? cl : ch;
+ if (Subtarget->hasCMov()) {
+ BuildMI(mainMBB, DL, TII->get(X86::CMOVE32rr), cc)
+ .addReg(cH32).addReg(cL32);
+ } else {
+ MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), cc)
+ .addReg(cH32).addReg(cL32)
+ .addImm(X86::COND_E);
+ mainMBB = EmitLoweredSelect(MIB, mainMBB);
+ }
+ BuildMI(mainMBB, DL, TII->get(X86::TEST32rr)).addReg(cc).addReg(cc);
+ if (Subtarget->hasCMov()) {
+ BuildMI(mainMBB, DL, TII->get(X86::CMOVNE32rr), t2L)
+ .addReg(SrcLoReg).addReg(t4L);
+ BuildMI(mainMBB, DL, TII->get(X86::CMOVNE32rr), t2H)
+ .addReg(SrcHiReg).addReg(t4H);
+ } else {
+ MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), t2L)
+ .addReg(SrcLoReg).addReg(t4L)
+ .addImm(X86::COND_NE);
+ mainMBB = EmitLoweredSelect(MIB, mainMBB);
+ // As the lowered CMOV won't clobber EFLAGS, we could reuse it for the
+ // 2nd CMOV lowering.
+ mainMBB->addLiveIn(X86::EFLAGS);
+ MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), t2H)
+ .addReg(SrcHiReg).addReg(t4H)
+ .addImm(X86::COND_NE);
+ mainMBB = EmitLoweredSelect(MIB, mainMBB);
+ // Replace the original PHI node as mainMBB is changed after CMOV
+ // lowering.
+ BuildMI(*origMainMBB, PhiL, DL, TII->get(X86::PHI), t4L)
+ .addReg(t1L).addMBB(thisMBB).addReg(t3L).addMBB(mainMBB);
+ BuildMI(*origMainMBB, PhiH, DL, TII->get(X86::PHI), t4H)
+ .addReg(t1H).addMBB(thisMBB).addReg(t3H).addMBB(mainMBB);
+ PhiL->eraseFromParent();
+ PhiH->eraseFromParent();
+ }
+ break;
+ }
+ case X86::ATOMSWAP6432: {
+ unsigned HiOpc;
+ unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc);
+ BuildMI(mainMBB, DL, TII->get(LoOpc), t2L).addReg(SrcLoReg);
+ BuildMI(mainMBB, DL, TII->get(HiOpc), t2H).addReg(SrcHiReg);
+ break;
+ }
+ }
+
+ // Copy EDX:EAX back from HiReg:LoReg
+ BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EAX).addReg(t4L);
+ BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EDX).addReg(t4H);
+ // Copy ECX:EBX from t1H:t1L
+ BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EBX).addReg(t2L);
+ BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::ECX).addReg(t2H);
+
+ MIB = BuildMI(mainMBB, DL, TII->get(LCMPXCHGOpc));
+ for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
+ MachineOperand NewMO = MI->getOperand(MemOpndSlot + i);
+ if (NewMO.isReg())
+ NewMO.setIsKill(false);
+ MIB.addOperand(NewMO);
+ }
+ MIB.setMemRefs(MMOBegin, MMOEnd);
+
+ // Copy EDX:EAX back to t3H:t3L
+ BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t3L).addReg(X86::EAX);
+ BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t3H).addReg(X86::EDX);
+
+ BuildMI(mainMBB, DL, TII->get(X86::JNE_4)).addMBB(origMainMBB);
+
+ mainMBB->addSuccessor(origMainMBB);
+ mainMBB->addSuccessor(sinkMBB);
+
+ // sinkMBB:
+ BuildMI(*sinkMBB, sinkMBB->begin(), DL,
+ TII->get(TargetOpcode::COPY), DstLoReg)
+ .addReg(t3L);
+ BuildMI(*sinkMBB, sinkMBB->begin(), DL,
+ TII->get(TargetOpcode::COPY), DstHiReg)
+ .addReg(t3H);
+
+ MI->eraseFromParent();
+ return sinkMBB;
+}
+
+// FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
+// or XMM0_V32I8 in AVX all of this code can be replaced with that
+// in the .td file.
+static MachineBasicBlock *EmitPCMPSTRM(MachineInstr *MI, MachineBasicBlock *BB,
+ const TargetInstrInfo *TII) {
+ unsigned Opc;
+ switch (MI->getOpcode()) {
+ default: llvm_unreachable("illegal opcode!");
+ case X86::PCMPISTRM128REG: Opc = X86::PCMPISTRM128rr; break;
+ case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break;
+ case X86::PCMPISTRM128MEM: Opc = X86::PCMPISTRM128rm; break;
+ case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break;
+ case X86::PCMPESTRM128REG: Opc = X86::PCMPESTRM128rr; break;
+ case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break;
+ case X86::PCMPESTRM128MEM: Opc = X86::PCMPESTRM128rm; break;
+ case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break;
+ }
+
+ DebugLoc dl = MI->getDebugLoc();
+ MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
+
+ unsigned NumArgs = MI->getNumOperands();
+ for (unsigned i = 1; i < NumArgs; ++i) {
+ MachineOperand &Op = MI->getOperand(i);
+ if (!(Op.isReg() && Op.isImplicit()))
+ MIB.addOperand(Op);
+ }
+ if (MI->hasOneMemOperand())
+ MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+
+ BuildMI(*BB, MI, dl,
+ TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
+ .addReg(X86::XMM0);
+
+ MI->eraseFromParent();
+ return BB;
+}
+
+// FIXME: Custom handling because TableGen doesn't support multiple implicit
+// defs in an instruction pattern
+static MachineBasicBlock *EmitPCMPSTRI(MachineInstr *MI, MachineBasicBlock *BB,
+ const TargetInstrInfo *TII) {
+ unsigned Opc;
+ switch (MI->getOpcode()) {
+ default: llvm_unreachable("illegal opcode!");
+ case X86::PCMPISTRIREG: Opc = X86::PCMPISTRIrr; break;
+ case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break;
+ case X86::PCMPISTRIMEM: Opc = X86::PCMPISTRIrm; break;
+ case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break;
+ case X86::PCMPESTRIREG: Opc = X86::PCMPESTRIrr; break;
+ case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break;
+ case X86::PCMPESTRIMEM: Opc = X86::PCMPESTRIrm; break;
+ case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break;
+ }
+
+ DebugLoc dl = MI->getDebugLoc();
+ MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
+
+ unsigned NumArgs = MI->getNumOperands(); // remove the results
+ for (unsigned i = 1; i < NumArgs; ++i) {
+ MachineOperand &Op = MI->getOperand(i);
+ if (!(Op.isReg() && Op.isImplicit()))
+ MIB.addOperand(Op);
+ }
+ if (MI->hasOneMemOperand())
+ MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+
+ BuildMI(*BB, MI, dl,
+ TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
+ .addReg(X86::ECX);
+
+ MI->eraseFromParent();
+ return BB;
+}
+
+static MachineBasicBlock * EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB,
+ const TargetInstrInfo *TII,
+ const X86Subtarget* Subtarget) {
+ DebugLoc dl = MI->getDebugLoc();
+
+ // Address into RAX/EAX, other two args into ECX, EDX.
+ unsigned MemOpc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r;
+ unsigned MemReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
+ MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
+ for (int i = 0; i < X86::AddrNumOperands; ++i)
+ MIB.addOperand(MI->getOperand(i));
+
+ unsigned ValOps = X86::AddrNumOperands;
+ BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
+ .addReg(MI->getOperand(ValOps).getReg());
+ BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
+ .addReg(MI->getOperand(ValOps+1).getReg());
+
+ // The instruction doesn't actually take any operands though.
+ BuildMI(*BB, MI, dl, TII->get(X86::MONITORrrr));
+
+ MI->eraseFromParent(); // The pseudo is gone now.
+ return BB;
+}
+
+MachineBasicBlock *
+X86TargetLowering::EmitVAARG64WithCustomInserter(
+ MachineInstr *MI,
+ MachineBasicBlock *MBB) const {
+ // Emit va_arg instruction on X86-64.
+
+ // Operands to this pseudo-instruction:
+ // 0 ) Output : destination address (reg)
+ // 1-5) Input : va_list address (addr, i64mem)
+ // 6 ) ArgSize : Size (in bytes) of vararg type
+ // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
+ // 8 ) Align : Alignment of type
+ // 9 ) EFLAGS (implicit-def)
+
+ assert(MI->getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
+ assert(X86::AddrNumOperands == 5 && "VAARG_64 assumes 5 address operands");
+
+ unsigned DestReg = MI->getOperand(0).getReg();
+ MachineOperand &Base = MI->getOperand(1);
+ MachineOperand &Scale = MI->getOperand(2);
+ MachineOperand &Index = MI->getOperand(3);
+ MachineOperand &Disp = MI->getOperand(4);
+ MachineOperand &Segment = MI->getOperand(5);
+ unsigned ArgSize = MI->getOperand(6).getImm();
+ unsigned ArgMode = MI->getOperand(7).getImm();
+ unsigned Align = MI->getOperand(8).getImm();
+
+ // Memory Reference
+ assert(MI->hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
+ MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
+ MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
+
+ // Machine Information
+ const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+ MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+ const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
+ const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
+ DebugLoc DL = MI->getDebugLoc();
+
+ // struct va_list {
+ // i32 gp_offset
+ // i32 fp_offset
+ // i64 overflow_area (address)
+ // i64 reg_save_area (address)
+ // }
+ // sizeof(va_list) = 24
+ // alignment(va_list) = 8
+
+ unsigned TotalNumIntRegs = 6;
+ unsigned TotalNumXMMRegs = 8;
+ bool UseGPOffset = (ArgMode == 1);
+ bool UseFPOffset = (ArgMode == 2);
+ unsigned MaxOffset = TotalNumIntRegs * 8 +
+ (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
+
+ /* Align ArgSize to a multiple of 8 */
+ unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
+ bool NeedsAlign = (Align > 8);
+
+ MachineBasicBlock *thisMBB = MBB;
+ MachineBasicBlock *overflowMBB;
+ MachineBasicBlock *offsetMBB;
+ MachineBasicBlock *endMBB;
+
+ unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB
+ unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB
+ unsigned OffsetReg = 0;
+
+ if (!UseGPOffset && !UseFPOffset) {
+ // If we only pull from the overflow region, we don't create a branch.
+ // We don't need to alter control flow.
+ OffsetDestReg = 0; // unused
+ OverflowDestReg = DestReg;
+
+ offsetMBB = NULL;
+ overflowMBB = thisMBB;
+ endMBB = thisMBB;
+ } else {
+ // First emit code to check if gp_offset (or fp_offset) is below the bound.
+ // If so, pull the argument from reg_save_area. (branch to offsetMBB)
+ // If not, pull from overflow_area. (branch to overflowMBB)
+ //
+ // thisMBB
+ // | .
+ // | .
+ // offsetMBB overflowMBB
+ // | .
+ // | .
+ // endMBB
+
+ // Registers for the PHI in endMBB
+ OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
+ OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
+
+ const BasicBlock *LLVM_BB = MBB->getBasicBlock();
+ MachineFunction *MF = MBB->getParent();
+ overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+
+ MachineFunction::iterator MBBIter = MBB;
+ ++MBBIter;
+
+ // Insert the new basic blocks
+ MF->insert(MBBIter, offsetMBB);
+ MF->insert(MBBIter, overflowMBB);
+ MF->insert(MBBIter, endMBB);
+
+ // Transfer the remainder of MBB and its successor edges to endMBB.
+ endMBB->splice(endMBB->begin(), thisMBB,
+ llvm::next(MachineBasicBlock::iterator(MI)),
+ thisMBB->end());
+ endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
+
+ // Make offsetMBB and overflowMBB successors of thisMBB
+ thisMBB->addSuccessor(offsetMBB);
+ thisMBB->addSuccessor(overflowMBB);
+
+ // endMBB is a successor of both offsetMBB and overflowMBB
+ offsetMBB->addSuccessor(endMBB);
+ overflowMBB->addSuccessor(endMBB);
+
+ // Load the offset value into a register
+ OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
+ BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
+ .addOperand(Base)
+ .addOperand(Scale)
+ .addOperand(Index)
+ .addDisp(Disp, UseFPOffset ? 4 : 0)
+ .addOperand(Segment)
+ .setMemRefs(MMOBegin, MMOEnd);
+
+ // Check if there is enough room left to pull this argument.
+ BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
+ .addReg(OffsetReg)
+ .addImm(MaxOffset + 8 - ArgSizeA8);
+
+ // Branch to "overflowMBB" if offset >= max
+ // Fall through to "offsetMBB" otherwise
+ BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
+ .addMBB(overflowMBB);
+ }
+
+ // In offsetMBB, emit code to use the reg_save_area.
+ if (offsetMBB) {
+ assert(OffsetReg != 0);
+
+ // Read the reg_save_area address.
+ unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
+ BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
+ .addOperand(Base)
+ .addOperand(Scale)
+ .addOperand(Index)
+ .addDisp(Disp, 16)
+ .addOperand(Segment)
+ .setMemRefs(MMOBegin, MMOEnd);
+
+ // Zero-extend the offset
+ unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
+ BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
+ .addImm(0)
+ .addReg(OffsetReg)
+ .addImm(X86::sub_32bit);
+
+ // Add the offset to the reg_save_area to get the final address.
+ BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
+ .addReg(OffsetReg64)
+ .addReg(RegSaveReg);
+
+ // Compute the offset for the next argument
+ unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
+ BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
+ .addReg(OffsetReg)
+ .addImm(UseFPOffset ? 16 : 8);
+
+ // Store it back into the va_list.
+ BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
+ .addOperand(Base)
+ .addOperand(Scale)
+ .addOperand(Index)
+ .addDisp(Disp, UseFPOffset ? 4 : 0)
+ .addOperand(Segment)
+ .addReg(NextOffsetReg)
+ .setMemRefs(MMOBegin, MMOEnd);
+
+ // Jump to endMBB
+ BuildMI(offsetMBB, DL, TII->get(X86::JMP_4))
+ .addMBB(endMBB);
+ }
+
+ //
+ // Emit code to use overflow area
+ //
+
+ // Load the overflow_area address into a register.
+ unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
+ BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
+ .addOperand(Base)
+ .addOperand(Scale)
+ .addOperand(Index)
+ .addDisp(Disp, 8)
+ .addOperand(Segment)
+ .setMemRefs(MMOBegin, MMOEnd);
+
+ // If we need to align it, do so. Otherwise, just copy the address
+ // to OverflowDestReg.
+ if (NeedsAlign) {
+ // Align the overflow address
+ assert((Align & (Align-1)) == 0 && "Alignment must be a power of 2");
+ unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);
+
+ // aligned_addr = (addr + (align-1)) & ~(align-1)
+ BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
+ .addReg(OverflowAddrReg)
+ .addImm(Align-1);
+
+ BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
+ .addReg(TmpReg)
+ .addImm(~(uint64_t)(Align-1));
+ } else {
+ BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
+ .addReg(OverflowAddrReg);
+ }
+
+ // Compute the next overflow address after this argument.
+ // (the overflow address should be kept 8-byte aligned)
+ unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
+ BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
+ .addReg(OverflowDestReg)
+ .addImm(ArgSizeA8);
+
+ // Store the new overflow address.
+ BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
+ .addOperand(Base)
+ .addOperand(Scale)
+ .addOperand(Index)
+ .addDisp(Disp, 8)
+ .addOperand(Segment)
+ .addReg(NextAddrReg)
+ .setMemRefs(MMOBegin, MMOEnd);
+
+ // If we branched, emit the PHI to the front of endMBB.
+ if (offsetMBB) {
+ BuildMI(*endMBB, endMBB->begin(), DL,
+ TII->get(X86::PHI), DestReg)
+ .addReg(OffsetDestReg).addMBB(offsetMBB)
+ .addReg(OverflowDestReg).addMBB(overflowMBB);
+ }
+
+ // Erase the pseudo instruction
+ MI->eraseFromParent();
+
+ return endMBB;
+}
+
+MachineBasicBlock *
+X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
+ MachineInstr *MI,
+ MachineBasicBlock *MBB) const {
+ // Emit code to save XMM registers to the stack. The ABI says that the
+ // number of registers to save is given in %al, so it's theoretically
+ // possible to do an indirect jump trick to avoid saving all of them,
+ // however this code takes a simpler approach and just executes all
+ // of the stores if %al is non-zero. It's less code, and it's probably
+ // easier on the hardware branch predictor, and stores aren't all that
+ // expensive anyway.
+
+ // Create the new basic blocks. One block contains all the XMM stores,
+ // and one block is the final destination regardless of whether any
+ // stores were performed.
+ const BasicBlock *LLVM_BB = MBB->getBasicBlock();
+ MachineFunction *F = MBB->getParent();
+ MachineFunction::iterator MBBIter = MBB;
+ ++MBBIter;
+ MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
+ F->insert(MBBIter, XMMSaveMBB);
+ F->insert(MBBIter, EndMBB);
+
+ // Transfer the remainder of MBB and its successor edges to EndMBB.
+ EndMBB->splice(EndMBB->begin(), MBB,
+ llvm::next(MachineBasicBlock::iterator(MI)),
+ MBB->end());
+ EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
+
+ // The original block will now fall through to the XMM save block.
+ MBB->addSuccessor(XMMSaveMBB);
+ // The XMMSaveMBB will fall through to the end block.
+ XMMSaveMBB->addSuccessor(EndMBB);
+
+ // Now add the instructions.
+ const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+ DebugLoc DL = MI->getDebugLoc();
+
+ unsigned CountReg = MI->getOperand(0).getReg();
+ int64_t RegSaveFrameIndex = MI->getOperand(1).getImm();
+ int64_t VarArgsFPOffset = MI->getOperand(2).getImm();
+
+ if (!Subtarget->isTargetWin64()) {
+ // If %al is 0, branch around the XMM save block.
+ BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
+ BuildMI(MBB, DL, TII->get(X86::JE_4)).addMBB(EndMBB);
+ MBB->addSuccessor(EndMBB);
+ }
+
+ unsigned MOVOpc = Subtarget->hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr;
+ // In the XMM save block, save all the XMM argument registers.
+ for (int i = 3, e = MI->getNumOperands(); i != e; ++i) {
+ int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
+ MachineMemOperand *MMO =
+ F->getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(RegSaveFrameIndex, Offset),
+ MachineMemOperand::MOStore,
+ /*Size=*/16, /*Align=*/16);
+ BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
+ .addFrameIndex(RegSaveFrameIndex)
+ .addImm(/*Scale=*/1)
+ .addReg(/*IndexReg=*/0)
+ .addImm(/*Disp=*/Offset)
+ .addReg(/*Segment=*/0)
+ .addReg(MI->getOperand(i).getReg())
+ .addMemOperand(MMO);
+ }
+
+ MI->eraseFromParent(); // The pseudo instruction is gone now.
+
+ return EndMBB;
+}
+
+// The EFLAGS operand of SelectItr might be missing a kill marker
+// because there were multiple uses of EFLAGS, and ISel didn't know
+// which to mark. Figure out whether SelectItr should have had a
+// kill marker, and set it if it should. Returns the correct kill
+// marker value.
+static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
+ MachineBasicBlock* BB,
+ const TargetRegisterInfo* TRI) {
+ // Scan forward through BB for a use/def of EFLAGS.
+ MachineBasicBlock::iterator miI(llvm::next(SelectItr));
+ for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
+ const MachineInstr& mi = *miI;
+ if (mi.readsRegister(X86::EFLAGS))
+ return false;
+ if (mi.definesRegister(X86::EFLAGS))
+ break; // Should have kill-flag - update below.
+ }
+
+ // If we hit the end of the block, check whether EFLAGS is live into a
+ // successor.
+ if (miI == BB->end()) {
+ for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
+ sEnd = BB->succ_end();
+ sItr != sEnd; ++sItr) {
+ MachineBasicBlock* succ = *sItr;
+ if (succ->isLiveIn(X86::EFLAGS))
+ return false;
+ }
+ }
+
+ // We found a def, or hit the end of the basic block and EFLAGS wasn't live
+ // out. SelectMI should have a kill flag on EFLAGS.
+ SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
+ return true;
+}
+
+MachineBasicBlock *
+X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
+ MachineBasicBlock *BB) const {
+ const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+ DebugLoc DL = MI->getDebugLoc();
+
+ // To "insert" a SELECT_CC instruction, we actually have to insert the
+ // diamond control-flow pattern. The incoming instruction knows the
+ // destination vreg to set, the condition code register to branch on, the
+ // true/false values to select between, and a branch opcode to use.
+ const BasicBlock *LLVM_BB = BB->getBasicBlock();
+ MachineFunction::iterator It = BB;
+ ++It;
+
+ // thisMBB:
+ // ...
+ // TrueVal = ...
+ // cmpTY ccX, r1, r2
+ // bCC copy1MBB
+ // fallthrough --> copy0MBB
+ MachineBasicBlock *thisMBB = BB;
+ MachineFunction *F = BB->getParent();
+ MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
+ F->insert(It, copy0MBB);
+ F->insert(It, sinkMBB);
+
+ // If the EFLAGS register isn't dead in the terminator, then claim that it's
+ // live into the sink and copy blocks.
+ const TargetRegisterInfo* TRI = getTargetMachine().getRegisterInfo();
+ if (!MI->killsRegister(X86::EFLAGS) &&
+ !checkAndUpdateEFLAGSKill(MI, BB, TRI)) {
+ copy0MBB->addLiveIn(X86::EFLAGS);
+ sinkMBB->addLiveIn(X86::EFLAGS);
+ }
+
+ // Transfer the remainder of BB and its successor edges to sinkMBB.
+ sinkMBB->splice(sinkMBB->begin(), BB,
+ llvm::next(MachineBasicBlock::iterator(MI)),
+ BB->end());
+ sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+ // Add the true and fallthrough blocks as its successors.
+ BB->addSuccessor(copy0MBB);
+ BB->addSuccessor(sinkMBB);
+
+ // Create the conditional branch instruction.
+ unsigned Opc =
+ X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm());
+ BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);
+
+ // copy0MBB:
+ // %FalseValue = ...
+ // # fallthrough to sinkMBB
+ copy0MBB->addSuccessor(sinkMBB);
+
+ // sinkMBB:
+ // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
+ // ...
+ BuildMI(*sinkMBB, sinkMBB->begin(), DL,
+ TII->get(X86::PHI), MI->getOperand(0).getReg())
+ .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
+ .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
+
+ MI->eraseFromParent(); // The pseudo instruction is gone now.
+ return sinkMBB;
+}
+
+MachineBasicBlock *
+X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB,
+ bool Is64Bit) const {
+ const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+ DebugLoc DL = MI->getDebugLoc();
+ MachineFunction *MF = BB->getParent();
+ const BasicBlock *LLVM_BB = BB->getBasicBlock();
+
+ assert(getTargetMachine().Options.EnableSegmentedStacks);
+
+ unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
+ unsigned TlsOffset = Is64Bit ? 0x70 : 0x30;
+
+ // BB:
+ // ... [Till the alloca]
+ // If stacklet is not large enough, jump to mallocMBB
+ //
+ // bumpMBB:
+ // Allocate by subtracting from RSP
+ // Jump to continueMBB
+ //
+ // mallocMBB:
+ // Allocate by call to runtime
+ //
+ // continueMBB:
+ // ...
+ // [rest of original BB]
+ //
+
+ MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ const TargetRegisterClass *AddrRegClass =
+ getRegClassFor(Is64Bit ? MVT::i64:MVT::i32);
+
+ unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
+ bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
+ tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
+ SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
+ sizeVReg = MI->getOperand(1).getReg(),
+ physSPReg = Is64Bit ? X86::RSP : X86::ESP;
+
+ MachineFunction::iterator MBBIter = BB;
+ ++MBBIter;
+
+ MF->insert(MBBIter, bumpMBB);
+ MF->insert(MBBIter, mallocMBB);
+ MF->insert(MBBIter, continueMBB);
+
+ continueMBB->splice(continueMBB->begin(), BB, llvm::next
+ (MachineBasicBlock::iterator(MI)), BB->end());
+ continueMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+ // Add code to the main basic block to check if the stack limit has been hit,
+ // and if so, jump to mallocMBB otherwise to bumpMBB.
+ BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
+ BuildMI(BB, DL, TII->get(Is64Bit ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
+ .addReg(tmpSPVReg).addReg(sizeVReg);
+ BuildMI(BB, DL, TII->get(Is64Bit ? X86::CMP64mr:X86::CMP32mr))
+ .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
+ .addReg(SPLimitVReg);
+ BuildMI(BB, DL, TII->get(X86::JG_4)).addMBB(mallocMBB);
+
+ // bumpMBB simply decreases the stack pointer, since we know the current
+ // stacklet has enough space.
+ BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
+ .addReg(SPLimitVReg);
+ BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
+ .addReg(SPLimitVReg);
+ BuildMI(bumpMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB);
+
+ // Calls into a routine in libgcc to allocate more space from the heap.
+ const uint32_t *RegMask =
+ getTargetMachine().getRegisterInfo()->getCallPreservedMask(CallingConv::C);
+ if (Is64Bit) {
+ BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
+ .addReg(sizeVReg);
+ BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
+ .addExternalSymbol("__morestack_allocate_stack_space")
+ .addRegMask(RegMask)
+ .addReg(X86::RDI, RegState::Implicit)
+ .addReg(X86::RAX, RegState::ImplicitDefine);
+ } else {
+ BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
+ .addImm(12);
+ BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
+ BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
+ .addExternalSymbol("__morestack_allocate_stack_space")
+ .addRegMask(RegMask)
+ .addReg(X86::EAX, RegState::ImplicitDefine);
+ }
+
+ if (!Is64Bit)
+ BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
+ .addImm(16);
+
+ BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
+ .addReg(Is64Bit ? X86::RAX : X86::EAX);
+ BuildMI(mallocMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB);
+
+ // Set up the CFG correctly.
+ BB->addSuccessor(bumpMBB);
+ BB->addSuccessor(mallocMBB);
+ mallocMBB->addSuccessor(continueMBB);
+ bumpMBB->addSuccessor(continueMBB);
+
+ // Take care of the PHI nodes.
+ BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
+ MI->getOperand(0).getReg())
+ .addReg(mallocPtrVReg).addMBB(mallocMBB)
+ .addReg(bumpSPPtrVReg).addMBB(bumpMBB);
+
+ // Delete the original pseudo instruction.
+ MI->eraseFromParent();
+
+ // And we're done.
+ return continueMBB;
+}
+
+MachineBasicBlock *
+X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI,
+ MachineBasicBlock *BB) const {
+ const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+ DebugLoc DL = MI->getDebugLoc();
+
+ assert(!Subtarget->isTargetEnvMacho());
+
+ // The lowering is pretty easy: we're just emitting the call to _alloca. The
+ // non-trivial part is impdef of ESP.
+
+ if (Subtarget->isTargetWin64()) {
+ if (Subtarget->isTargetCygMing()) {
+ // ___chkstk(Mingw64):
+ // Clobbers R10, R11, RAX and EFLAGS.
+ // Updates RSP.
+ BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA))
+ .addExternalSymbol("___chkstk")
+ .addReg(X86::RAX, RegState::Implicit)
+ .addReg(X86::RSP, RegState::Implicit)
+ .addReg(X86::RAX, RegState::Define | RegState::Implicit)
+ .addReg(X86::RSP, RegState::Define | RegState::Implicit)
+ .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
+ } else {
+ // __chkstk(MSVCRT): does not update stack pointer.
+ // Clobbers R10, R11 and EFLAGS.
+ // FIXME: RAX(allocated size) might be reused and not killed.
+ BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA))
+ .addExternalSymbol("__chkstk")
+ .addReg(X86::RAX, RegState::Implicit)
+ .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
+ // RAX has the offset to subtracted from RSP.
+ BuildMI(*BB, MI, DL, TII->get(X86::SUB64rr), X86::RSP)
+ .addReg(X86::RSP)
+ .addReg(X86::RAX);
+ }
+ } else {
+ const char *StackProbeSymbol =
+ Subtarget->isTargetWindows() ? "_chkstk" : "_alloca";
+
+ BuildMI(*BB, MI, DL, TII->get(X86::CALLpcrel32))
+ .addExternalSymbol(StackProbeSymbol)
+ .addReg(X86::EAX, RegState::Implicit)
+ .addReg(X86::ESP, RegState::Implicit)
+ .addReg(X86::EAX, RegState::Define | RegState::Implicit)
+ .addReg(X86::ESP, RegState::Define | RegState::Implicit)
+ .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
+ }
+
+ MI->eraseFromParent(); // The pseudo instruction is gone now.
+ return BB;
+}
+
+MachineBasicBlock *
+X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI,
+ MachineBasicBlock *BB) const {
+ // This is pretty easy. We're taking the value that we received from
+ // our load from the relocation, sticking it in either RDI (x86-64)
+ // or EAX and doing an indirect call. The return value will then
+ // be in the normal return register.
+ const X86InstrInfo *TII
+ = static_cast<const X86InstrInfo*>(getTargetMachine().getInstrInfo());
+ DebugLoc DL = MI->getDebugLoc();
+ MachineFunction *F = BB->getParent();
+
+ assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?");
+ assert(MI->getOperand(3).isGlobal() && "This should be a global");
+
+ // Get a register mask for the lowered call.
+ // FIXME: The 32-bit calls have non-standard calling conventions. Use a
+ // proper register mask.
+ const uint32_t *RegMask =
+ getTargetMachine().getRegisterInfo()->getCallPreservedMask(CallingConv::C);
+ if (Subtarget->is64Bit()) {
+ MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
+ TII->get(X86::MOV64rm), X86::RDI)
+ .addReg(X86::RIP)
+ .addImm(0).addReg(0)
+ .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
+ MI->getOperand(3).getTargetFlags())
+ .addReg(0);
+ MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
+ addDirectMem(MIB, X86::RDI);
+ MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
+ } else if (getTargetMachine().getRelocationModel() != Reloc::PIC_) {
+ MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
+ TII->get(X86::MOV32rm), X86::EAX)
+ .addReg(0)
+ .addImm(0).addReg(0)
+ .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
+ MI->getOperand(3).getTargetFlags())
+ .addReg(0);
+ MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
+ addDirectMem(MIB, X86::EAX);
+ MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
+ } else {
+ MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
+ TII->get(X86::MOV32rm), X86::EAX)
+ .addReg(TII->getGlobalBaseReg(F))
+ .addImm(0).addReg(0)
+ .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
+ MI->getOperand(3).getTargetFlags())
+ .addReg(0);
+ MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
+ addDirectMem(MIB, X86::EAX);
+ MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
+ }
+
+ MI->eraseFromParent(); // The pseudo instruction is gone now.
+ return BB;
+}
+
+MachineBasicBlock *
+X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
+ MachineBasicBlock *MBB) const {
+ DebugLoc DL = MI->getDebugLoc();
+ const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+
+ MachineFunction *MF = MBB->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+
+ const BasicBlock *BB = MBB->getBasicBlock();
+ MachineFunction::iterator I = MBB;
+ ++I;
+
+ // Memory Reference
+ MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
+ MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
+
+ unsigned DstReg;
+ unsigned MemOpndSlot = 0;
+
+ unsigned CurOp = 0;
+
+ DstReg = MI->getOperand(CurOp++).getReg();
+ const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
+ assert(RC->hasType(MVT::i32) && "Invalid destination!");
+ unsigned mainDstReg = MRI.createVirtualRegister(RC);
+ unsigned restoreDstReg = MRI.createVirtualRegister(RC);
+
+ MemOpndSlot = CurOp;
+
+ MVT PVT = getPointerTy();
+ assert((PVT == MVT::i64 || PVT == MVT::i32) &&
+ "Invalid Pointer Size!");
+
+ // For v = setjmp(buf), we generate
+ //
+ // thisMBB:
+ // buf[LabelOffset] = restoreMBB
+ // SjLjSetup restoreMBB
+ //
+ // mainMBB:
+ // v_main = 0
+ //
+ // sinkMBB:
+ // v = phi(main, restore)
+ //
+ // restoreMBB:
+ // v_restore = 1
+
+ MachineBasicBlock *thisMBB = MBB;
+ MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
+ MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
+ MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
+ MF->insert(I, mainMBB);
+ MF->insert(I, sinkMBB);
+ MF->push_back(restoreMBB);
+
+ MachineInstrBuilder MIB;
+
+ // Transfer the remainder of BB and its successor edges to sinkMBB.
+ sinkMBB->splice(sinkMBB->begin(), MBB,
+ llvm::next(MachineBasicBlock::iterator(MI)), MBB->end());
+ sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
+
+ // thisMBB:
+ unsigned PtrStoreOpc = 0;
+ unsigned LabelReg = 0;
+ const int64_t LabelOffset = 1 * PVT.getStoreSize();
+ Reloc::Model RM = getTargetMachine().getRelocationModel();
+ bool UseImmLabel = (getTargetMachine().getCodeModel() == CodeModel::Small) &&
+ (RM == Reloc::Static || RM == Reloc::DynamicNoPIC);
+
+ // Prepare IP either in reg or imm.
+ if (!UseImmLabel) {
+ PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
+ const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
+ LabelReg = MRI.createVirtualRegister(PtrRC);
+ if (Subtarget->is64Bit()) {
+ MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
+ .addReg(X86::RIP)
+ .addImm(0)
+ .addReg(0)
+ .addMBB(restoreMBB)
+ .addReg(0);
+ } else {
+ const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
+ MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
+ .addReg(XII->getGlobalBaseReg(MF))
+ .addImm(0)
+ .addReg(0)
+ .addMBB(restoreMBB, Subtarget->ClassifyBlockAddressReference())
+ .addReg(0);
+ }
+ } else
+ PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
+ // Store IP
+ MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
+ for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
+ if (i == X86::AddrDisp)
+ MIB.addDisp(MI->getOperand(MemOpndSlot + i), LabelOffset);
+ else
+ MIB.addOperand(MI->getOperand(MemOpndSlot + i));
+ }
+ if (!UseImmLabel)
+ MIB.addReg(LabelReg);
+ else
+ MIB.addMBB(restoreMBB);
+ MIB.setMemRefs(MMOBegin, MMOEnd);
+ // Setup
+ MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
+ .addMBB(restoreMBB);
+ MIB.addRegMask(RegInfo->getNoPreservedMask());
+ thisMBB->addSuccessor(mainMBB);
+ thisMBB->addSuccessor(restoreMBB);
+
+ // mainMBB:
+ // EAX = 0
+ BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
+ mainMBB->addSuccessor(sinkMBB);
+
+ // sinkMBB:
+ BuildMI(*sinkMBB, sinkMBB->begin(), DL,
+ TII->get(X86::PHI), DstReg)
+ .addReg(mainDstReg).addMBB(mainMBB)
+ .addReg(restoreDstReg).addMBB(restoreMBB);
+
+ // restoreMBB:
+ BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
+ BuildMI(restoreMBB, DL, TII->get(X86::JMP_4)).addMBB(sinkMBB);
+ restoreMBB->addSuccessor(sinkMBB);
+
+ MI->eraseFromParent();
+ return sinkMBB;
+}
+
+MachineBasicBlock *
+X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
+ MachineBasicBlock *MBB) const {
+ DebugLoc DL = MI->getDebugLoc();
+ const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+
+ MachineFunction *MF = MBB->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+
+ // Memory Reference
+ MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
+ MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
+
+ MVT PVT = getPointerTy();
+ assert((PVT == MVT::i64 || PVT == MVT::i32) &&
+ "Invalid Pointer Size!");
+
+ const TargetRegisterClass *RC =
+ (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
+ unsigned Tmp = MRI.createVirtualRegister(RC);
+ // Since FP is only updated here but NOT referenced, it's treated as GPR.
+ unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
+ unsigned SP = RegInfo->getStackRegister();
+
+ MachineInstrBuilder MIB;
+
+ const int64_t LabelOffset = 1 * PVT.getStoreSize();
+ const int64_t SPOffset = 2 * PVT.getStoreSize();
+
+ unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
+ unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
+
+ // Reload FP
+ MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP);
+ for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
+ MIB.addOperand(MI->getOperand(i));
+ MIB.setMemRefs(MMOBegin, MMOEnd);
+ // Reload IP
+ MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
+ for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
+ if (i == X86::AddrDisp)
+ MIB.addDisp(MI->getOperand(i), LabelOffset);
+ else
+ MIB.addOperand(MI->getOperand(i));
+ }
+ MIB.setMemRefs(MMOBegin, MMOEnd);
+ // Reload SP
+ MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP);
+ for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
+ if (i == X86::AddrDisp)
+ MIB.addDisp(MI->getOperand(i), SPOffset);
+ else
+ MIB.addOperand(MI->getOperand(i));
+ }
+ MIB.setMemRefs(MMOBegin, MMOEnd);
+ // Jump
+ BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
+
+ MI->eraseFromParent();
+ return MBB;
+}
+
+MachineBasicBlock *
+X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
+ MachineBasicBlock *BB) const {
+ switch (MI->getOpcode()) {
+ default: llvm_unreachable("Unexpected instr type to insert");
+ case X86::TAILJMPd64:
+ case X86::TAILJMPr64:
+ case X86::TAILJMPm64:
+ llvm_unreachable("TAILJMP64 would not be touched here.");
+ case X86::TCRETURNdi64:
+ case X86::TCRETURNri64:
+ case X86::TCRETURNmi64:
+ return BB;
+ case X86::WIN_ALLOCA:
+ return EmitLoweredWinAlloca(MI, BB);
+ case X86::SEG_ALLOCA_32:
+ return EmitLoweredSegAlloca(MI, BB, false);
+ case X86::SEG_ALLOCA_64:
+ return EmitLoweredSegAlloca(MI, BB, true);
+ case X86::TLSCall_32:
+ case X86::TLSCall_64:
+ return EmitLoweredTLSCall(MI, BB);
+ case X86::CMOV_GR8:
+ case X86::CMOV_FR32:
+ case X86::CMOV_FR64:
+ case X86::CMOV_V4F32:
+ case X86::CMOV_V2F64:
+ case X86::CMOV_V2I64:
+ case X86::CMOV_V8F32:
+ case X86::CMOV_V4F64:
+ case X86::CMOV_V4I64:
+ case X86::CMOV_GR16:
+ case X86::CMOV_GR32:
+ case X86::CMOV_RFP32:
+ case X86::CMOV_RFP64:
+ case X86::CMOV_RFP80:
+ return EmitLoweredSelect(MI, BB);
+
+ case X86::FP32_TO_INT16_IN_MEM:
+ case X86::FP32_TO_INT32_IN_MEM:
+ case X86::FP32_TO_INT64_IN_MEM:
+ case X86::FP64_TO_INT16_IN_MEM:
+ case X86::FP64_TO_INT32_IN_MEM:
+ case X86::FP64_TO_INT64_IN_MEM:
+ case X86::FP80_TO_INT16_IN_MEM:
+ case X86::FP80_TO_INT32_IN_MEM:
+ case X86::FP80_TO_INT64_IN_MEM: {
+ const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+ DebugLoc DL = MI->getDebugLoc();
+
+ // Change the floating point control register to use "round towards zero"
+ // mode when truncating to an integer value.
+ MachineFunction *F = BB->getParent();
+ int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false);
+ addFrameReference(BuildMI(*BB, MI, DL,
+ TII->get(X86::FNSTCW16m)), CWFrameIdx);
+
+ // Load the old value of the high byte of the control word...
+ unsigned OldCW =
+ F->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
+ addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
+ CWFrameIdx);
+
+ // Set the high part to be round to zero...
+ addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
+ .addImm(0xC7F);
+
+ // Reload the modified control word now...
+ addFrameReference(BuildMI(*BB, MI, DL,
+ TII->get(X86::FLDCW16m)), CWFrameIdx);
+
+ // Restore the memory image of control word to original value
+ addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
+ .addReg(OldCW);
+
+ // Get the X86 opcode to use.
+ unsigned Opc;
+ switch (MI->getOpcode()) {
+ default: llvm_unreachable("illegal opcode!");
+ case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
+ case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
+ case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
+ case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
+ case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
+ case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
+ case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
+ case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
+ case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
+ }
+
+ X86AddressMode AM;
+ MachineOperand &Op = MI->getOperand(0);
+ if (Op.isReg()) {
+ AM.BaseType = X86AddressMode::RegBase;
+ AM.Base.Reg = Op.getReg();
+ } else {
+ AM.BaseType = X86AddressMode::FrameIndexBase;
+ AM.Base.FrameIndex = Op.getIndex();
+ }
+ Op = MI->getOperand(1);
+ if (Op.isImm())
+ AM.Scale = Op.getImm();
+ Op = MI->getOperand(2);
+ if (Op.isImm())
+ AM.IndexReg = Op.getImm();
+ Op = MI->getOperand(3);
+ if (Op.isGlobal()) {
+ AM.GV = Op.getGlobal();
+ } else {
+ AM.Disp = Op.getImm();
+ }
+ addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
+ .addReg(MI->getOperand(X86::AddrNumOperands).getReg());
+
+ // Reload the original control word now.
+ addFrameReference(BuildMI(*BB, MI, DL,
+ TII->get(X86::FLDCW16m)), CWFrameIdx);
+
+ MI->eraseFromParent(); // The pseudo instruction is gone now.
+ return BB;
+ }
+ // String/text processing lowering.
+ case X86::PCMPISTRM128REG:
+ case X86::VPCMPISTRM128REG:
+ case X86::PCMPISTRM128MEM:
+ case X86::VPCMPISTRM128MEM:
+ case X86::PCMPESTRM128REG:
+ case X86::VPCMPESTRM128REG:
+ case X86::PCMPESTRM128MEM:
+ case X86::VPCMPESTRM128MEM:
+ assert(Subtarget->hasSSE42() &&
+ "Target must have SSE4.2 or AVX features enabled");
+ return EmitPCMPSTRM(MI, BB, getTargetMachine().getInstrInfo());
+
+ // String/text processing lowering.
+ case X86::PCMPISTRIREG:
+ case X86::VPCMPISTRIREG:
+ case X86::PCMPISTRIMEM:
+ case X86::VPCMPISTRIMEM:
+ case X86::PCMPESTRIREG:
+ case X86::VPCMPESTRIREG:
+ case X86::PCMPESTRIMEM:
+ case X86::VPCMPESTRIMEM:
+ assert(Subtarget->hasSSE42() &&
+ "Target must have SSE4.2 or AVX features enabled");
+ return EmitPCMPSTRI(MI, BB, getTargetMachine().getInstrInfo());
+
+ // Thread synchronization.
+ case X86::MONITOR:
+ return EmitMonitor(MI, BB, getTargetMachine().getInstrInfo(), Subtarget);
+
+ // xbegin
+ case X86::XBEGIN:
+ return EmitXBegin(MI, BB, getTargetMachine().getInstrInfo());
+
+ // Atomic Lowering.
+ case X86::ATOMAND8:
+ case X86::ATOMAND16:
+ case X86::ATOMAND32:
+ case X86::ATOMAND64:
+ // Fall through
+ case X86::ATOMOR8:
+ case X86::ATOMOR16:
+ case X86::ATOMOR32:
+ case X86::ATOMOR64:
+ // Fall through
+ case X86::ATOMXOR16:
+ case X86::ATOMXOR8:
+ case X86::ATOMXOR32:
+ case X86::ATOMXOR64:
+ // Fall through
+ case X86::ATOMNAND8:
+ case X86::ATOMNAND16:
+ case X86::ATOMNAND32:
+ case X86::ATOMNAND64:
+ // Fall through
+ case X86::ATOMMAX8:
+ case X86::ATOMMAX16:
+ case X86::ATOMMAX32:
+ case X86::ATOMMAX64:
+ // Fall through
+ case X86::ATOMMIN8:
+ case X86::ATOMMIN16:
+ case X86::ATOMMIN32:
+ case X86::ATOMMIN64:
+ // Fall through
+ case X86::ATOMUMAX8:
+ case X86::ATOMUMAX16:
+ case X86::ATOMUMAX32:
+ case X86::ATOMUMAX64:
+ // Fall through
+ case X86::ATOMUMIN8:
+ case X86::ATOMUMIN16:
+ case X86::ATOMUMIN32:
+ case X86::ATOMUMIN64:
+ return EmitAtomicLoadArith(MI, BB);
+
+ // This group does 64-bit operations on a 32-bit host.
+ case X86::ATOMAND6432:
+ case X86::ATOMOR6432:
+ case X86::ATOMXOR6432:
+ case X86::ATOMNAND6432:
+ case X86::ATOMADD6432:
+ case X86::ATOMSUB6432:
+ case X86::ATOMMAX6432:
+ case X86::ATOMMIN6432:
+ case X86::ATOMUMAX6432:
+ case X86::ATOMUMIN6432:
+ case X86::ATOMSWAP6432:
+ return EmitAtomicLoadArith6432(MI, BB);
+
+ case X86::VASTART_SAVE_XMM_REGS:
+ return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
+
+ case X86::VAARG_64:
+ return EmitVAARG64WithCustomInserter(MI, BB);
+
+ case X86::EH_SjLj_SetJmp32:
+ case X86::EH_SjLj_SetJmp64:
+ return emitEHSjLjSetJmp(MI, BB);
+
+ case X86::EH_SjLj_LongJmp32:
+ case X86::EH_SjLj_LongJmp64:
+ return emitEHSjLjLongJmp(MI, BB);
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// X86 Optimization Hooks
+//===----------------------------------------------------------------------===//
+
+void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
+ APInt &KnownZero,
+ APInt &KnownOne,
+ const SelectionDAG &DAG,
+ unsigned Depth) const {
+ unsigned BitWidth = KnownZero.getBitWidth();
+ unsigned Opc = Op.getOpcode();
+ assert((Opc >= ISD::BUILTIN_OP_END ||
+ Opc == ISD::INTRINSIC_WO_CHAIN ||
+ Opc == ISD::INTRINSIC_W_CHAIN ||
+ Opc == ISD::INTRINSIC_VOID) &&
+ "Should use MaskedValueIsZero if you don't know whether Op"
+ " is a target node!");
+
+ KnownZero = KnownOne = APInt(BitWidth, 0); // Don't know anything.
+ switch (Opc) {
+ default: break;
+ case X86ISD::ADD:
+ case X86ISD::SUB:
+ case X86ISD::ADC:
+ case X86ISD::SBB:
+ case X86ISD::SMUL:
+ case X86ISD::UMUL:
+ case X86ISD::INC:
+ case X86ISD::DEC:
+ case X86ISD::OR:
+ case X86ISD::XOR:
+ case X86ISD::AND:
+ // These nodes' second result is a boolean.
+ if (Op.getResNo() == 0)
+ break;
+ // Fallthrough
+ case X86ISD::SETCC:
+ KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
+ break;
+ case ISD::INTRINSIC_WO_CHAIN: {
+ unsigned IntId = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ unsigned NumLoBits = 0;
+ switch (IntId) {
+ default: break;
+ case Intrinsic::x86_sse_movmsk_ps:
+ case Intrinsic::x86_avx_movmsk_ps_256:
+ case Intrinsic::x86_sse2_movmsk_pd:
+ case Intrinsic::x86_avx_movmsk_pd_256:
+ case Intrinsic::x86_mmx_pmovmskb:
+ case Intrinsic::x86_sse2_pmovmskb_128:
+ case Intrinsic::x86_avx2_pmovmskb: {
+ // High bits of movmskp{s|d}, pmovmskb are known zero.
+ switch (IntId) {
+ default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
+ case Intrinsic::x86_sse_movmsk_ps: NumLoBits = 4; break;
+ case Intrinsic::x86_avx_movmsk_ps_256: NumLoBits = 8; break;
+ case Intrinsic::x86_sse2_movmsk_pd: NumLoBits = 2; break;
+ case Intrinsic::x86_avx_movmsk_pd_256: NumLoBits = 4; break;
+ case Intrinsic::x86_mmx_pmovmskb: NumLoBits = 8; break;
+ case Intrinsic::x86_sse2_pmovmskb_128: NumLoBits = 16; break;
+ case Intrinsic::x86_avx2_pmovmskb: NumLoBits = 32; break;
+ }
+ KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits);
+ break;
+ }
+ }
+ break;
+ }
+ }
+}
+
+unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op,
+ unsigned Depth) const {
+ // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
+ if (Op.getOpcode() == X86ISD::SETCC_CARRY)
+ return Op.getValueType().getScalarType().getSizeInBits();
+
+ // Fallback case.
+ return 1;
+}
+
+/// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the
+/// node is a GlobalAddress + offset.
+bool X86TargetLowering::isGAPlusOffset(SDNode *N,
+ const GlobalValue* &GA,
+ int64_t &Offset) const {
+ if (N->getOpcode() == X86ISD::Wrapper) {
+ if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
+ GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
+ Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
+ return true;
+ }
+ }
+ return TargetLowering::isGAPlusOffset(N, GA, Offset);
+}
+
+/// isShuffleHigh128VectorInsertLow - Checks whether the shuffle node is the
+/// same as extracting the high 128-bit part of 256-bit vector and then
+/// inserting the result into the low part of a new 256-bit vector
+static bool isShuffleHigh128VectorInsertLow(ShuffleVectorSDNode *SVOp) {
+ EVT VT = SVOp->getValueType(0);
+ unsigned NumElems = VT.getVectorNumElements();
+
+ // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
+ for (unsigned i = 0, j = NumElems/2; i != NumElems/2; ++i, ++j)
+ if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
+ SVOp->getMaskElt(j) >= 0)
+ return false;
+
+ return true;
+}
+
+/// isShuffleLow128VectorInsertHigh - Checks whether the shuffle node is the
+/// same as extracting the low 128-bit part of 256-bit vector and then
+/// inserting the result into the high part of a new 256-bit vector
+static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) {
+ EVT VT = SVOp->getValueType(0);
+ unsigned NumElems = VT.getVectorNumElements();
+
+ // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
+ for (unsigned i = NumElems/2, j = 0; i != NumElems; ++i, ++j)
+ if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
+ SVOp->getMaskElt(j) >= 0)
+ return false;
+
+ return true;
+}
+
+/// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors.
+static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget* Subtarget) {
+ DebugLoc dl = N->getDebugLoc();
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+ SDValue V1 = SVOp->getOperand(0);
+ SDValue V2 = SVOp->getOperand(1);
+ EVT VT = SVOp->getValueType(0);
+ unsigned NumElems = VT.getVectorNumElements();
+
+ if (V1.getOpcode() == ISD::CONCAT_VECTORS &&
+ V2.getOpcode() == ISD::CONCAT_VECTORS) {
+ //
+ // 0,0,0,...
+ // |
+ // V UNDEF BUILD_VECTOR UNDEF
+ // \ / \ /
+ // CONCAT_VECTOR CONCAT_VECTOR
+ // \ /
+ // \ /
+ // RESULT: V + zero extended
+ //
+ if (V2.getOperand(0).getOpcode() != ISD::BUILD_VECTOR ||
+ V2.getOperand(1).getOpcode() != ISD::UNDEF ||
+ V1.getOperand(1).getOpcode() != ISD::UNDEF)
+ return SDValue();
+
+ if (!ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode()))
+ return SDValue();
+
+ // To match the shuffle mask, the first half of the mask should
+ // be exactly the first vector, and all the rest a splat with the
+ // first element of the second one.
+ for (unsigned i = 0; i != NumElems/2; ++i)
+ if (!isUndefOrEqual(SVOp->getMaskElt(i), i) ||
+ !isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems))
+ return SDValue();
+
+ // If V1 is coming from a vector load then just fold to a VZEXT_LOAD.
+ if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(V1.getOperand(0))) {
+ if (Ld->hasNUsesOfValue(1, 0)) {
+ SDVTList Tys = DAG.getVTList(MVT::v4i64, MVT::Other);
+ SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() };
+ SDValue ResNode =
+ DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
+ array_lengthof(Ops),
+ Ld->getMemoryVT(),
+ Ld->getPointerInfo(),
+ Ld->getAlignment(),
+ false/*isVolatile*/, true/*ReadMem*/,
+ false/*WriteMem*/);
+
+ // Make sure the newly-created LOAD is in the same position as Ld in
+ // terms of dependency. We create a TokenFactor for Ld and ResNode,
+ // and update uses of Ld's output chain to use the TokenFactor.
+ if (Ld->hasAnyUseOfValue(1)) {
+ SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ SDValue(Ld, 1), SDValue(ResNode.getNode(), 1));
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
+ DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(Ld, 1),
+ SDValue(ResNode.getNode(), 1));
+ }
+
+ return DAG.getNode(ISD::BITCAST, dl, VT, ResNode);
+ }
+ }
+
+ // Emit a zeroed vector and insert the desired subvector on its
+ // first half.
+ SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
+ SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0), 0, DAG, dl);
+ return DCI.CombineTo(N, InsV);
+ }
+
+ //===--------------------------------------------------------------------===//
+ // Combine some shuffles into subvector extracts and inserts:
+ //
+
+ // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
+ if (isShuffleHigh128VectorInsertLow(SVOp)) {
+ SDValue V = Extract128BitVector(V1, NumElems/2, DAG, dl);
+ SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, 0, DAG, dl);
+ return DCI.CombineTo(N, InsV);
+ }
+
+ // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
+ if (isShuffleLow128VectorInsertHigh(SVOp)) {
+ SDValue V = Extract128BitVector(V1, 0, DAG, dl);
+ SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, NumElems/2, DAG, dl);
+ return DCI.CombineTo(N, InsV);
+ }
+
+ return SDValue();
+}
+
+/// PerformShuffleCombine - Performs several different shuffle combines.
+static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget *Subtarget) {
+ DebugLoc dl = N->getDebugLoc();
+ EVT VT = N->getValueType(0);
+
+ // Don't create instructions with illegal types after legalize types has run.
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType()))
+ return SDValue();
+
+ // Combine 256-bit vector shuffles. This is only profitable when in AVX mode
+ if (Subtarget->hasFp256() && VT.is256BitVector() &&
+ N->getOpcode() == ISD::VECTOR_SHUFFLE)
+ return PerformShuffleCombine256(N, DAG, DCI, Subtarget);
+
+ // Only handle 128 wide vector from here on.
+ if (!VT.is128BitVector())
+ return SDValue();
+
+ // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
+ // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
+ // consecutive, non-overlapping, and in the right order.
+ SmallVector<SDValue, 16> Elts;
+ for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
+ Elts.push_back(getShuffleScalarElt(N, i, DAG, 0));
+
+ return EltsFromConsecutiveLoads(VT, Elts, dl, DAG);
+}
+
+/// PerformTruncateCombine - Converts truncate operation to
+/// a sequence of vector shuffle operations.
+/// It is possible when we truncate 256-bit vector to 128-bit vector
+static SDValue PerformTruncateCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget *Subtarget) {
+ return SDValue();
+}
+
+/// XFormVExtractWithShuffleIntoLoad - Check if a vector extract from a target
+/// specific shuffle of a load can be folded into a single element load.
+/// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
+/// shuffles have been customed lowered so we need to handle those here.
+static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ if (DCI.isBeforeLegalizeOps())
+ return SDValue();
+
+ SDValue InVec = N->getOperand(0);
+ SDValue EltNo = N->getOperand(1);
+
+ if (!isa<ConstantSDNode>(EltNo))
+ return SDValue();
+
+ EVT VT = InVec.getValueType();
+
+ bool HasShuffleIntoBitcast = false;
+ if (InVec.getOpcode() == ISD::BITCAST) {
+ // Don't duplicate a load with other uses.
+ if (!InVec.hasOneUse())
+ return SDValue();
+ EVT BCVT = InVec.getOperand(0).getValueType();
+ if (BCVT.getVectorNumElements() != VT.getVectorNumElements())
+ return SDValue();
+ InVec = InVec.getOperand(0);
+ HasShuffleIntoBitcast = true;
+ }
+
+ if (!isTargetShuffle(InVec.getOpcode()))
+ return SDValue();
+
+ // Don't duplicate a load with other uses.
+ if (!InVec.hasOneUse())
+ return SDValue();
+
+ SmallVector<int, 16> ShuffleMask;
+ bool UnaryShuffle;
+ if (!getTargetShuffleMask(InVec.getNode(), VT.getSimpleVT(), ShuffleMask,
+ UnaryShuffle))
+ return SDValue();
+
+ // Select the input vector, guarding against out of range extract vector.
+ unsigned NumElems = VT.getVectorNumElements();
+ int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
+ int Idx = (Elt > (int)NumElems) ? -1 : ShuffleMask[Elt];
+ SDValue LdNode = (Idx < (int)NumElems) ? InVec.getOperand(0)
+ : InVec.getOperand(1);
+
+ // If inputs to shuffle are the same for both ops, then allow 2 uses
+ unsigned AllowedUses = InVec.getOperand(0) == InVec.getOperand(1) ? 2 : 1;
+
+ if (LdNode.getOpcode() == ISD::BITCAST) {
+ // Don't duplicate a load with other uses.
+ if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
+ return SDValue();
+
+ AllowedUses = 1; // only allow 1 load use if we have a bitcast
+ LdNode = LdNode.getOperand(0);
+ }
+
+ if (!ISD::isNormalLoad(LdNode.getNode()))
+ return SDValue();
+
+ LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);
+
+ if (!LN0 ||!LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
+ return SDValue();
+
+ if (HasShuffleIntoBitcast) {
+ // If there's a bitcast before the shuffle, check if the load type and
+ // alignment is valid.
+ unsigned Align = LN0->getAlignment();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ unsigned NewAlign = TLI.getDataLayout()->
+ getABITypeAlignment(VT.getTypeForEVT(*DAG.getContext()));
+
+ if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VT))
+ return SDValue();
+ }
+
+ // All checks match so transform back to vector_shuffle so that DAG combiner
+ // can finish the job
+ DebugLoc dl = N->getDebugLoc();
+
+ // Create shuffle node taking into account the case that its a unary shuffle
+ SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(VT) : InVec.getOperand(1);
+ Shuffle = DAG.getVectorShuffle(InVec.getValueType(), dl,
+ InVec.getOperand(0), Shuffle,
+ &ShuffleMask[0]);
+ Shuffle = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
+ EltNo);
+}
+
+/// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index
+/// generation and convert it from being a bunch of shuffles and extracts
+/// to a simple store and scalar loads to extract the elements.
+static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI);
+ if (NewOp.getNode())
+ return NewOp;
+
+ SDValue InputVector = N->getOperand(0);
+ // Detect whether we are trying to convert from mmx to i32 and the bitcast
+ // from mmx to v2i32 has a single usage.
+ if (InputVector.getNode()->getOpcode() == llvm::ISD::BITCAST &&
+ InputVector.getNode()->getOperand(0).getValueType() == MVT::x86mmx &&
+ InputVector.hasOneUse() && N->getValueType(0) == MVT::i32)
+ return DAG.getNode(X86ISD::MMX_MOVD2W, InputVector.getDebugLoc(),
+ N->getValueType(0),
+ InputVector.getNode()->getOperand(0));
+
+ // Only operate on vectors of 4 elements, where the alternative shuffling
+ // gets to be more expensive.
+ if (InputVector.getValueType() != MVT::v4i32)
+ return SDValue();
+
+ // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
+ // single use which is a sign-extend or zero-extend, and all elements are
+ // used.
+ SmallVector<SDNode *, 4> Uses;
+ unsigned ExtractedElements = 0;
+ for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
+ UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
+ if (UI.getUse().getResNo() != InputVector.getResNo())
+ return SDValue();
+
+ SDNode *Extract = *UI;
+ if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+ return SDValue();
+
+ if (Extract->getValueType(0) != MVT::i32)
+ return SDValue();
+ if (!Extract->hasOneUse())
+ return SDValue();
+ if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
+ Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
+ return SDValue();
+ if (!isa<ConstantSDNode>(Extract->getOperand(1)))
+ return SDValue();
+
+ // Record which element was extracted.
+ ExtractedElements |=
+ 1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue();
+
+ Uses.push_back(Extract);
+ }
+
+ // If not all the elements were used, this may not be worthwhile.
+ if (ExtractedElements != 15)
+ return SDValue();
+
+ // Ok, we've now decided to do the transformation.
+ DebugLoc dl = InputVector.getDebugLoc();
+
+ // Store the value to a temporary stack slot.
+ SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
+ SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
+ MachinePointerInfo(), false, false, 0);
+
+ // Replace each use (extract) with a load of the appropriate element.
+ for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
+ UE = Uses.end(); UI != UE; ++UI) {
+ SDNode *Extract = *UI;
+
+ // cOMpute the element's address.
+ SDValue Idx = Extract->getOperand(1);
+ unsigned EltSize =
+ InputVector.getValueType().getVectorElementType().getSizeInBits()/8;
+ uint64_t Offset = EltSize * cast<ConstantSDNode>(Idx)->getZExtValue();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy());
+
+ SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(),
+ StackPtr, OffsetVal);
+
+ // Load the scalar.
+ SDValue LoadScalar = DAG.getLoad(Extract->getValueType(0), dl, Ch,
+ ScalarAddr, MachinePointerInfo(),
+ false, false, false, 0);
+
+ // Replace the exact with the load.
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), LoadScalar);
+ }
+
+ // The replacement was made in place; don't return anything.
+ return SDValue();
+}
+
+/// \brief Matches a VSELECT onto min/max or return 0 if the node doesn't match.
+static unsigned matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS,
+ SDValue RHS, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ if (!VT.isVector())
+ return 0;
+
+ switch (VT.getSimpleVT().SimpleTy) {
+ default: return 0;
+ case MVT::v32i8:
+ case MVT::v16i16:
+ case MVT::v8i32:
+ if (!Subtarget->hasAVX2())
+ return 0;
+ case MVT::v16i8:
+ case MVT::v8i16:
+ case MVT::v4i32:
+ if (!Subtarget->hasSSE2())
+ return 0;
+ }
+
+ // SSE2 has only a small subset of the operations.
+ bool hasUnsigned = Subtarget->hasSSE41() ||
+ (Subtarget->hasSSE2() && VT == MVT::v16i8);
+ bool hasSigned = Subtarget->hasSSE41() ||
+ (Subtarget->hasSSE2() && VT == MVT::v8i16);
+
+ ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+
+ // Check for x CC y ? x : y.
+ if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
+ DAG.isEqualTo(RHS, Cond.getOperand(1))) {
+ switch (CC) {
+ default: break;
+ case ISD::SETULT:
+ case ISD::SETULE:
+ return hasUnsigned ? X86ISD::UMIN : 0;
+ case ISD::SETUGT:
+ case ISD::SETUGE:
+ return hasUnsigned ? X86ISD::UMAX : 0;
+ case ISD::SETLT:
+ case ISD::SETLE:
+ return hasSigned ? X86ISD::SMIN : 0;
+ case ISD::SETGT:
+ case ISD::SETGE:
+ return hasSigned ? X86ISD::SMAX : 0;
+ }
+ // Check for x CC y ? y : x -- a min/max with reversed arms.
+ } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
+ DAG.isEqualTo(RHS, Cond.getOperand(0))) {
+ switch (CC) {
+ default: break;
+ case ISD::SETULT:
+ case ISD::SETULE:
+ return hasUnsigned ? X86ISD::UMAX : 0;
+ case ISD::SETUGT:
+ case ISD::SETUGE:
+ return hasUnsigned ? X86ISD::UMIN : 0;
+ case ISD::SETLT:
+ case ISD::SETLE:
+ return hasSigned ? X86ISD::SMAX : 0;
+ case ISD::SETGT:
+ case ISD::SETGE:
+ return hasSigned ? X86ISD::SMIN : 0;
+ }
+ }
+
+ return 0;
+}
+
+/// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT
+/// nodes.
+static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget *Subtarget) {
+ DebugLoc DL = N->getDebugLoc();
+ SDValue Cond = N->getOperand(0);
+ // Get the LHS/RHS of the select.
+ SDValue LHS = N->getOperand(1);
+ SDValue RHS = N->getOperand(2);
+ EVT VT = LHS.getValueType();
+
+ // If we have SSE[12] support, try to form min/max nodes. SSE min/max
+ // instructions match the semantics of the common C idiom x<y?x:y but not
+ // x<=y?x:y, because of how they handle negative zero (which can be
+ // ignored in unsafe-math mode).
+ if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
+ VT != MVT::f80 && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
+ (Subtarget->hasSSE2() ||
+ (Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) {
+ ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+
+ unsigned Opcode = 0;
+ // Check for x CC y ? x : y.
+ if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
+ DAG.isEqualTo(RHS, Cond.getOperand(1))) {
+ switch (CC) {
+ default: break;
+ case ISD::SETULT:
+ // Converting this to a min would handle NaNs incorrectly, and swapping
+ // the operands would cause it to handle comparisons between positive
+ // and negative zero incorrectly.
+ if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
+ if (!DAG.getTarget().Options.UnsafeFPMath &&
+ !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
+ break;
+ std::swap(LHS, RHS);
+ }
+ Opcode = X86ISD::FMIN;
+ break;
+ case ISD::SETOLE:
+ // Converting this to a min would handle comparisons between positive
+ // and negative zero incorrectly.
+ if (!DAG.getTarget().Options.UnsafeFPMath &&
+ !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
+ break;
+ Opcode = X86ISD::FMIN;
+ break;
+ case ISD::SETULE:
+ // Converting this to a min would handle both negative zeros and NaNs
+ // incorrectly, but we can swap the operands to fix both.
+ std::swap(LHS, RHS);
+ case ISD::SETOLT:
+ case ISD::SETLT:
+ case ISD::SETLE:
+ Opcode = X86ISD::FMIN;
+ break;
+
+ case ISD::SETOGE:
+ // Converting this to a max would handle comparisons between positive
+ // and negative zero incorrectly.
+ if (!DAG.getTarget().Options.UnsafeFPMath &&
+ !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
+ break;
+ Opcode = X86ISD::FMAX;
+ break;
+ case ISD::SETUGT:
+ // Converting this to a max would handle NaNs incorrectly, and swapping
+ // the operands would cause it to handle comparisons between positive
+ // and negative zero incorrectly.
+ if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
+ if (!DAG.getTarget().Options.UnsafeFPMath &&
+ !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
+ break;
+ std::swap(LHS, RHS);
+ }
+ Opcode = X86ISD::FMAX;
+ break;
+ case ISD::SETUGE:
+ // Converting this to a max would handle both negative zeros and NaNs
+ // incorrectly, but we can swap the operands to fix both.
+ std::swap(LHS, RHS);
+ case ISD::SETOGT:
+ case ISD::SETGT:
+ case ISD::SETGE:
+ Opcode = X86ISD::FMAX;
+ break;
+ }
+ // Check for x CC y ? y : x -- a min/max with reversed arms.
+ } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
+ DAG.isEqualTo(RHS, Cond.getOperand(0))) {
+ switch (CC) {
+ default: break;
+ case ISD::SETOGE:
+ // Converting this to a min would handle comparisons between positive
+ // and negative zero incorrectly, and swapping the operands would
+ // cause it to handle NaNs incorrectly.
+ if (!DAG.getTarget().Options.UnsafeFPMath &&
+ !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
+ if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
+ break;
+ std::swap(LHS, RHS);
+ }
+ Opcode = X86ISD::FMIN;
+ break;
+ case ISD::SETUGT:
+ // Converting this to a min would handle NaNs incorrectly.
+ if (!DAG.getTarget().Options.UnsafeFPMath &&
+ (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
+ break;
+ Opcode = X86ISD::FMIN;
+ break;
+ case ISD::SETUGE:
+ // Converting this to a min would handle both negative zeros and NaNs
+ // incorrectly, but we can swap the operands to fix both.
+ std::swap(LHS, RHS);
+ case ISD::SETOGT:
+ case ISD::SETGT:
+ case ISD::SETGE:
+ Opcode = X86ISD::FMIN;
+ break;
+
+ case ISD::SETULT:
+ // Converting this to a max would handle NaNs incorrectly.
+ if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
+ break;
+ Opcode = X86ISD::FMAX;
+ break;
+ case ISD::SETOLE:
+ // Converting this to a max would handle comparisons between positive
+ // and negative zero incorrectly, and swapping the operands would
+ // cause it to handle NaNs incorrectly.
+ if (!DAG.getTarget().Options.UnsafeFPMath &&
+ !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
+ if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
+ break;
+ std::swap(LHS, RHS);
+ }
+ Opcode = X86ISD::FMAX;
+ break;
+ case ISD::SETULE:
+ // Converting this to a max would handle both negative zeros and NaNs
+ // incorrectly, but we can swap the operands to fix both.
+ std::swap(LHS, RHS);
+ case ISD::SETOLT:
+ case ISD::SETLT:
+ case ISD::SETLE:
+ Opcode = X86ISD::FMAX;
+ break;
+ }
+ }
+
+ if (Opcode)
+ return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
+ }
+
+ // If this is a select between two integer constants, try to do some
+ // optimizations.
+ if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) {
+ if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS))
+ // Don't do this for crazy integer types.
+ if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) {
+ // If this is efficiently invertible, canonicalize the LHSC/RHSC values
+ // so that TrueC (the true value) is larger than FalseC.
+ bool NeedsCondInvert = false;
+
+ if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
+ // Efficiently invertible.
+ (Cond.getOpcode() == ISD::SETCC || // setcc -> invertible.
+ (Cond.getOpcode() == ISD::XOR && // xor(X, C) -> invertible.
+ isa<ConstantSDNode>(Cond.getOperand(1))))) {
+ NeedsCondInvert = true;
+ std::swap(TrueC, FalseC);
+ }
+
+ // Optimize C ? 8 : 0 -> zext(C) << 3. Likewise for any pow2/0.
+ if (FalseC->getAPIntValue() == 0 &&
+ TrueC->getAPIntValue().isPowerOf2()) {
+ if (NeedsCondInvert) // Invert the condition if needed.
+ Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
+ DAG.getConstant(1, Cond.getValueType()));
+
+ // Zero extend the condition if needed.
+ Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);
+
+ unsigned ShAmt = TrueC->getAPIntValue().logBase2();
+ return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
+ DAG.getConstant(ShAmt, MVT::i8));
+ }
+
+ // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst.
+ if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
+ if (NeedsCondInvert) // Invert the condition if needed.
+ Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
+ DAG.getConstant(1, Cond.getValueType()));
+
+ // Zero extend the condition if needed.
+ Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
+ FalseC->getValueType(0), Cond);
+ return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
+ SDValue(FalseC, 0));
+ }
+
+ // Optimize cases that will turn into an LEA instruction. This requires
+ // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
+ if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
+ uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
+ if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
+
+ bool isFastMultiplier = false;
+ if (Diff < 10) {
+ switch ((unsigned char)Diff) {
+ default: break;
+ case 1: // result = add base, cond
+ case 2: // result = lea base( , cond*2)
+ case 3: // result = lea base(cond, cond*2)
+ case 4: // result = lea base( , cond*4)
+ case 5: // result = lea base(cond, cond*4)
+ case 8: // result = lea base( , cond*8)
+ case 9: // result = lea base(cond, cond*8)
+ isFastMultiplier = true;
+ break;
+ }
+ }
+
+ if (isFastMultiplier) {
+ APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
+ if (NeedsCondInvert) // Invert the condition if needed.
+ Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
+ DAG.getConstant(1, Cond.getValueType()));
+
+ // Zero extend the condition if needed.
+ Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
+ Cond);
+ // Scale the condition by the difference.
+ if (Diff != 1)
+ Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
+ DAG.getConstant(Diff, Cond.getValueType()));
+
+ // Add the base if non-zero.
+ if (FalseC->getAPIntValue() != 0)
+ Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
+ SDValue(FalseC, 0));
+ return Cond;
+ }
+ }
+ }
+ }
+
+ // Canonicalize max and min:
+ // (x > y) ? x : y -> (x >= y) ? x : y
+ // (x < y) ? x : y -> (x <= y) ? x : y
+ // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
+ // the need for an extra compare
+ // against zero. e.g.
+ // (x - y) > 0 : (x - y) ? 0 -> (x - y) >= 0 : (x - y) ? 0
+ // subl %esi, %edi
+ // testl %edi, %edi
+ // movl $0, %eax
+ // cmovgl %edi, %eax
+ // =>
+ // xorl %eax, %eax
+ // subl %esi, $edi
+ // cmovsl %eax, %edi
+ if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
+ DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
+ DAG.isEqualTo(RHS, Cond.getOperand(1))) {
+ ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+ switch (CC) {
+ default: break;
+ case ISD::SETLT:
+ case ISD::SETGT: {
+ ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
+ Cond = DAG.getSetCC(Cond.getDebugLoc(), Cond.getValueType(),
+ Cond.getOperand(0), Cond.getOperand(1), NewCC);
+ return DAG.getNode(ISD::SELECT, DL, VT, Cond, LHS, RHS);
+ }
+ }
+ }
+
+ // Match VSELECTs into subs with unsigned saturation.
+ if (!DCI.isBeforeLegalize() &&
+ N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
+ // psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
+ ((Subtarget->hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
+ (Subtarget->hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
+ ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+
+ // Check if one of the arms of the VSELECT is a zero vector. If it's on the
+ // left side invert the predicate to simplify logic below.
+ SDValue Other;
+ if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
+ Other = RHS;
+ CC = ISD::getSetCCInverse(CC, true);
+ } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
+ Other = LHS;
+ }
+
+ if (Other.getNode() && Other->getNumOperands() == 2 &&
+ DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) {
+ SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
+ SDValue CondRHS = Cond->getOperand(1);
+
+ // Look for a general sub with unsigned saturation first.
+ // x >= y ? x-y : 0 --> subus x, y
+ // x > y ? x-y : 0 --> subus x, y
+ if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
+ Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
+ return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
+
+ // If the RHS is a constant we have to reverse the const canonicalization.
+ // x > C-1 ? x+-C : 0 --> subus x, C
+ if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
+ isSplatVector(CondRHS.getNode()) && isSplatVector(OpRHS.getNode())) {
+ APInt A = cast<ConstantSDNode>(OpRHS.getOperand(0))->getAPIntValue();
+ if (CondRHS.getConstantOperandVal(0) == -A-1)
+ return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS,
+ DAG.getConstant(-A, VT));
+ }
+
+ // Another special case: If C was a sign bit, the sub has been
+ // canonicalized into a xor.
+ // FIXME: Would it be better to use ComputeMaskedBits to determine whether
+ // it's safe to decanonicalize the xor?
+ // x s< 0 ? x^C : 0 --> subus x, C
+ if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
+ ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
+ isSplatVector(OpRHS.getNode())) {
+ APInt A = cast<ConstantSDNode>(OpRHS.getOperand(0))->getAPIntValue();
+ if (A.isSignBit())
+ return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
+ }
+ }
+ }
+
+ // Try to match a min/max vector operation.
+ if (!DCI.isBeforeLegalize() &&
+ N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC)
+ if (unsigned Op = matchIntegerMINMAX(Cond, VT, LHS, RHS, DAG, Subtarget))
+ return DAG.getNode(Op, DL, N->getValueType(0), LHS, RHS);
+
+ // Simplify vector selection if the selector will be produced by CMPP*/PCMP*.
+ if (!DCI.isBeforeLegalize() && N->getOpcode() == ISD::VSELECT &&
+ Cond.getOpcode() == ISD::SETCC) {
+
+ assert(Cond.getValueType().isVector() &&
+ "vector select expects a vector selector!");
+
+ EVT IntVT = Cond.getValueType();
+ bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
+ bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
+
+ if (!TValIsAllOnes && !FValIsAllZeros) {
+ // Try invert the condition if true value is not all 1s and false value
+ // is not all 0s.
+ bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
+ bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
+
+ if (TValIsAllZeros || FValIsAllOnes) {
+ SDValue CC = Cond.getOperand(2);
+ ISD::CondCode NewCC =
+ ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
+ Cond.getOperand(0).getValueType().isInteger());
+ Cond = DAG.getSetCC(DL, IntVT, Cond.getOperand(0), Cond.getOperand(1), NewCC);
+ std::swap(LHS, RHS);
+ TValIsAllOnes = FValIsAllOnes;
+ FValIsAllZeros = TValIsAllZeros;
+ }
+ }
+
+ if (TValIsAllOnes || FValIsAllZeros) {
+ SDValue Ret;
+
+ if (TValIsAllOnes && FValIsAllZeros)
+ Ret = Cond;
+ else if (TValIsAllOnes)
+ Ret = DAG.getNode(ISD::OR, DL, IntVT, Cond,
+ DAG.getNode(ISD::BITCAST, DL, IntVT, RHS));
+ else if (FValIsAllZeros)
+ Ret = DAG.getNode(ISD::AND, DL, IntVT, Cond,
+ DAG.getNode(ISD::BITCAST, DL, IntVT, LHS));
+
+ return DAG.getNode(ISD::BITCAST, DL, VT, Ret);
+ }
+ }
+
+ // If we know that this node is legal then we know that it is going to be
+ // matched by one of the SSE/AVX BLEND instructions. These instructions only
+ // depend on the highest bit in each word. Try to use SimplifyDemandedBits
+ // to simplify previous instructions.
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
+ !DCI.isBeforeLegalize() && TLI.isOperationLegal(ISD::VSELECT, VT)) {
+ unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits();
+
+ // Don't optimize vector selects that map to mask-registers.
+ if (BitWidth == 1)
+ return SDValue();
+
+ assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
+ APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);
+
+ APInt KnownZero, KnownOne;
+ TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
+ DCI.isBeforeLegalizeOps());
+ if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) ||
+ TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne, TLO))
+ DCI.CommitTargetLoweringOpt(TLO);
+ }
+
+ return SDValue();
+}
+
+// Check whether a boolean test is testing a boolean value generated by
+// X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
+// code.
+//
+// Simplify the following patterns:
+// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
+// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
+// to (Op EFLAGS Cond)
+//
+// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
+// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
+// to (Op EFLAGS !Cond)
+//
+// where Op could be BRCOND or CMOV.
+//
+static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
+ // Quit if not CMP and SUB with its value result used.
+ if (Cmp.getOpcode() != X86ISD::CMP &&
+ (Cmp.getOpcode() != X86ISD::SUB || Cmp.getNode()->hasAnyUseOfValue(0)))
+ return SDValue();
+
+ // Quit if not used as a boolean value.
+ if (CC != X86::COND_E && CC != X86::COND_NE)
+ return SDValue();
+
+ // Check CMP operands. One of them should be 0 or 1 and the other should be
+ // an SetCC or extended from it.
+ SDValue Op1 = Cmp.getOperand(0);
+ SDValue Op2 = Cmp.getOperand(1);
+
+ SDValue SetCC;
+ const ConstantSDNode* C = 0;
+ bool needOppositeCond = (CC == X86::COND_E);
+ bool checkAgainstTrue = false; // Is it a comparison against 1?
+
+ if ((C = dyn_cast<ConstantSDNode>(Op1)))
+ SetCC = Op2;
+ else if ((C = dyn_cast<ConstantSDNode>(Op2)))
+ SetCC = Op1;
+ else // Quit if all operands are not constants.
+ return SDValue();
+
+ if (C->getZExtValue() == 1) {
+ needOppositeCond = !needOppositeCond;
+ checkAgainstTrue = true;
+ } else if (C->getZExtValue() != 0)
+ // Quit if the constant is neither 0 or 1.
+ return SDValue();
+
+ bool truncatedToBoolWithAnd = false;
+ // Skip (zext $x), (trunc $x), or (and $x, 1) node.
+ while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
+ SetCC.getOpcode() == ISD::TRUNCATE ||
+ SetCC.getOpcode() == ISD::AND) {
+ if (SetCC.getOpcode() == ISD::AND) {
+ int OpIdx = -1;
+ ConstantSDNode *CS;
+ if ((CS = dyn_cast<ConstantSDNode>(SetCC.getOperand(0))) &&
+ CS->getZExtValue() == 1)
+ OpIdx = 1;
+ if ((CS = dyn_cast<ConstantSDNode>(SetCC.getOperand(1))) &&
+ CS->getZExtValue() == 1)
+ OpIdx = 0;
+ if (OpIdx == -1)
+ break;
+ SetCC = SetCC.getOperand(OpIdx);
+ truncatedToBoolWithAnd = true;
+ } else
+ SetCC = SetCC.getOperand(0);
+ }
+
+ switch (SetCC.getOpcode()) {
+ case X86ISD::SETCC_CARRY:
+ // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
+ // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
+ // i.e. it's a comparison against true but the result of SETCC_CARRY is not
+ // truncated to i1 using 'and'.
+ if (checkAgainstTrue && !truncatedToBoolWithAnd)
+ break;
+ assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
+ "Invalid use of SETCC_CARRY!");
+ // FALL THROUGH
+ case X86ISD::SETCC:
+ // Set the condition code or opposite one if necessary.
+ CC = X86::CondCode(SetCC.getConstantOperandVal(0));
+ if (needOppositeCond)
+ CC = X86::GetOppositeBranchCondition(CC);
+ return SetCC.getOperand(1);
+ case X86ISD::CMOV: {
+ // Check whether false/true value has canonical one, i.e. 0 or 1.
+ ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
+ ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
+ // Quit if true value is not a constant.
+ if (!TVal)
+ return SDValue();
+ // Quit if false value is not a constant.
+ if (!FVal) {
+ SDValue Op = SetCC.getOperand(0);
+ // Skip 'zext' or 'trunc' node.
+ if (Op.getOpcode() == ISD::ZERO_EXTEND ||
+ Op.getOpcode() == ISD::TRUNCATE)
+ Op = Op.getOperand(0);
+ // A special case for rdrand/rdseed, where 0 is set if false cond is
+ // found.
+ if ((Op.getOpcode() != X86ISD::RDRAND &&
+ Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
+ return SDValue();
+ }
+ // Quit if false value is not the constant 0 or 1.
+ bool FValIsFalse = true;
+ if (FVal && FVal->getZExtValue() != 0) {
+ if (FVal->getZExtValue() != 1)
+ return SDValue();
+ // If FVal is 1, opposite cond is needed.
+ needOppositeCond = !needOppositeCond;
+ FValIsFalse = false;
+ }
+ // Quit if TVal is not the constant opposite of FVal.
+ if (FValIsFalse && TVal->getZExtValue() != 1)
+ return SDValue();
+ if (!FValIsFalse && TVal->getZExtValue() != 0)
+ return SDValue();
+ CC = X86::CondCode(SetCC.getConstantOperandVal(2));
+ if (needOppositeCond)
+ CC = X86::GetOppositeBranchCondition(CC);
+ return SetCC.getOperand(3);
+ }
+ }
+
+ return SDValue();
+}
+
+/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
+static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget *Subtarget) {
+ DebugLoc DL = N->getDebugLoc();
+
+ // If the flag operand isn't dead, don't touch this CMOV.
+ if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
+ return SDValue();
+
+ SDValue FalseOp = N->getOperand(0);
+ SDValue TrueOp = N->getOperand(1);
+ X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
+ SDValue Cond = N->getOperand(3);
+
+ if (CC == X86::COND_E || CC == X86::COND_NE) {
+ switch (Cond.getOpcode()) {
+ default: break;
+ case X86ISD::BSR:
+ case X86ISD::BSF:
+ // If operand of BSR / BSF are proven never zero, then ZF cannot be set.
+ if (DAG.isKnownNeverZero(Cond.getOperand(0)))
+ return (CC == X86::COND_E) ? FalseOp : TrueOp;
+ }
+ }
+
+ SDValue Flags;
+
+ Flags = checkBoolTestSetCCCombine(Cond, CC);
+ if (Flags.getNode() &&
+ // Extra check as FCMOV only supports a subset of X86 cond.
+ (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC))) {
+ SDValue Ops[] = { FalseOp, TrueOp,
+ DAG.getConstant(CC, MVT::i8), Flags };
+ return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(),
+ Ops, array_lengthof(Ops));
+ }
+
+ // If this is a select between two integer constants, try to do some
+ // optimizations. Note that the operands are ordered the opposite of SELECT
+ // operands.
+ if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
+ if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
+ // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
+ // larger than FalseC (the false value).
+ if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
+ CC = X86::GetOppositeBranchCondition(CC);
+ std::swap(TrueC, FalseC);
+ std::swap(TrueOp, FalseOp);
+ }
+
+ // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
+ // This is efficient for any integer data type (including i8/i16) and
+ // shift amount.
+ if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
+ Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
+ DAG.getConstant(CC, MVT::i8), Cond);
+
+ // Zero extend the condition if needed.
+ Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
+
+ unsigned ShAmt = TrueC->getAPIntValue().logBase2();
+ Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
+ DAG.getConstant(ShAmt, MVT::i8));
+ if (N->getNumValues() == 2) // Dead flag value?
+ return DCI.CombineTo(N, Cond, SDValue());
+ return Cond;
+ }
+
+ // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. This is efficient
+ // for any integer data type, including i8/i16.
+ if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
+ Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
+ DAG.getConstant(CC, MVT::i8), Cond);
+
+ // Zero extend the condition if needed.
+ Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
+ FalseC->getValueType(0), Cond);
+ Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
+ SDValue(FalseC, 0));
+
+ if (N->getNumValues() == 2) // Dead flag value?
+ return DCI.CombineTo(N, Cond, SDValue());
+ return Cond;
+ }
+
+ // Optimize cases that will turn into an LEA instruction. This requires
+ // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
+ if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
+ uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
+ if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
+
+ bool isFastMultiplier = false;
+ if (Diff < 10) {
+ switch ((unsigned char)Diff) {
+ default: break;
+ case 1: // result = add base, cond
+ case 2: // result = lea base( , cond*2)
+ case 3: // result = lea base(cond, cond*2)
+ case 4: // result = lea base( , cond*4)
+ case 5: // result = lea base(cond, cond*4)
+ case 8: // result = lea base( , cond*8)
+ case 9: // result = lea base(cond, cond*8)
+ isFastMultiplier = true;
+ break;
+ }
+ }
+
+ if (isFastMultiplier) {
+ APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
+ Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
+ DAG.getConstant(CC, MVT::i8), Cond);
+ // Zero extend the condition if needed.
+ Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
+ Cond);
+ // Scale the condition by the difference.
+ if (Diff != 1)
+ Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
+ DAG.getConstant(Diff, Cond.getValueType()));
+
+ // Add the base if non-zero.
+ if (FalseC->getAPIntValue() != 0)
+ Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
+ SDValue(FalseC, 0));
+ if (N->getNumValues() == 2) // Dead flag value?
+ return DCI.CombineTo(N, Cond, SDValue());
+ return Cond;
+ }
+ }
+ }
+ }
+
+ // Handle these cases:
+ // (select (x != c), e, c) -> select (x != c), e, x),
+ // (select (x == c), c, e) -> select (x == c), x, e)
+ // where the c is an integer constant, and the "select" is the combination
+ // of CMOV and CMP.
+ //
+ // The rationale for this change is that the conditional-move from a constant
+ // needs two instructions, however, conditional-move from a register needs
+ // only one instruction.
+ //
+ // CAVEAT: By replacing a constant with a symbolic value, it may obscure
+ // some instruction-combining opportunities. This opt needs to be
+ // postponed as late as possible.
+ //
+ if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
+ // the DCI.xxxx conditions are provided to postpone the optimization as
+ // late as possible.
+
+ ConstantSDNode *CmpAgainst = 0;
+ if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
+ (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
+ !isa<ConstantSDNode>(Cond.getOperand(0))) {
+
+ if (CC == X86::COND_NE &&
+ CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
+ CC = X86::GetOppositeBranchCondition(CC);
+ std::swap(TrueOp, FalseOp);
+ }
+
+ if (CC == X86::COND_E &&
+ CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
+ SDValue Ops[] = { FalseOp, Cond.getOperand(0),
+ DAG.getConstant(CC, MVT::i8), Cond };
+ return DAG.getNode(X86ISD::CMOV, DL, N->getVTList (), Ops,
+ array_lengthof(Ops));
+ }
+ }
+ }
+
+ return SDValue();
+}
+
+/// PerformMulCombine - Optimize a single multiply with constant into two
+/// in order to implement it with two cheaper instructions, e.g.
+/// LEA + SHL, LEA + LEA.
+static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
+ return SDValue();
+
+ EVT VT = N->getValueType(0);
+ if (VT != MVT::i64)
+ return SDValue();
+
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ if (!C)
+ return SDValue();
+ uint64_t MulAmt = C->getZExtValue();
+ if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
+ return SDValue();
+
+ uint64_t MulAmt1 = 0;
+ uint64_t MulAmt2 = 0;
+ if ((MulAmt % 9) == 0) {
+ MulAmt1 = 9;
+ MulAmt2 = MulAmt / 9;
+ } else if ((MulAmt % 5) == 0) {
+ MulAmt1 = 5;
+ MulAmt2 = MulAmt / 5;
+ } else if ((MulAmt % 3) == 0) {
+ MulAmt1 = 3;
+ MulAmt2 = MulAmt / 3;
+ }
+ if (MulAmt2 &&
+ (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
+ DebugLoc DL = N->getDebugLoc();
+
+ if (isPowerOf2_64(MulAmt2) &&
+ !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
+ // If second multiplifer is pow2, issue it first. We want the multiply by
+ // 3, 5, or 9 to be folded into the addressing mode unless the lone use
+ // is an add.
+ std::swap(MulAmt1, MulAmt2);
+
+ SDValue NewMul;
+ if (isPowerOf2_64(MulAmt1))
+ NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
+ DAG.getConstant(Log2_64(MulAmt1), MVT::i8));
+ else
+ NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
+ DAG.getConstant(MulAmt1, VT));
+
+ if (isPowerOf2_64(MulAmt2))
+ NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
+ DAG.getConstant(Log2_64(MulAmt2), MVT::i8));
+ else
+ NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
+ DAG.getConstant(MulAmt2, VT));
+
+ // Do not add new nodes to DAG combiner worklist.
+ DCI.CombineTo(N, NewMul, false);
+ }
+ return SDValue();
+}
+
+static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+ EVT VT = N0.getValueType();
+
+ // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
+ // since the result of setcc_c is all zero's or all ones.
+ if (VT.isInteger() && !VT.isVector() &&
+ N1C && N0.getOpcode() == ISD::AND &&
+ N0.getOperand(1).getOpcode() == ISD::Constant) {
+ SDValue N00 = N0.getOperand(0);
+ if (N00.getOpcode() == X86ISD::SETCC_CARRY ||
+ ((N00.getOpcode() == ISD::ANY_EXTEND ||
+ N00.getOpcode() == ISD::ZERO_EXTEND) &&
+ N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) {
+ APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
+ APInt ShAmt = N1C->getAPIntValue();
+ Mask = Mask.shl(ShAmt);
+ if (Mask != 0)
+ return DAG.getNode(ISD::AND, N->getDebugLoc(), VT,
+ N00, DAG.getConstant(Mask, VT));
+ }
+ }
+
+ // Hardware support for vector shifts is sparse which makes us scalarize the
+ // vector operations in many cases. Also, on sandybridge ADD is faster than
+ // shl.
+ // (shl V, 1) -> add V,V
+ if (isSplatVector(N1.getNode())) {
+ assert(N0.getValueType().isVector() && "Invalid vector shift type");
+ ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1->getOperand(0));
+ // We shift all of the values by one. In many cases we do not have
+ // hardware support for this operation. This is better expressed as an ADD
+ // of two values.
+ if (N1C && (1 == N1C->getZExtValue())) {
+ return DAG.getNode(ISD::ADD, N->getDebugLoc(), VT, N0, N0);
+ }
+ }
+
+ return SDValue();
+}
+
+/// PerformShiftCombine - Combine shifts.
+static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget *Subtarget) {
+ if (N->getOpcode() == ISD::SHL) {
+ SDValue V = PerformSHLCombine(N, DAG);
+ if (V.getNode()) return V;
+ }
+
+ return SDValue();
+}
+
+// CMPEQCombine - Recognize the distinctive (AND (setcc ...) (setcc ..))
+// where both setccs reference the same FP CMP, and rewrite for CMPEQSS
+// and friends. Likewise for OR -> CMPNEQSS.
+static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget *Subtarget) {
+ unsigned opcode;
+
+ // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
+ // we're requiring SSE2 for both.
+ if (Subtarget->hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDValue CMP0 = N0->getOperand(1);
+ SDValue CMP1 = N1->getOperand(1);
+ DebugLoc DL = N->getDebugLoc();
+
+ // The SETCCs should both refer to the same CMP.
+ if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
+ return SDValue();
+
+ SDValue CMP00 = CMP0->getOperand(0);
+ SDValue CMP01 = CMP0->getOperand(1);
+ EVT VT = CMP00.getValueType();
+
+ if (VT == MVT::f32 || VT == MVT::f64) {
+ bool ExpectingFlags = false;
+ // Check for any users that want flags:
+ for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
+ !ExpectingFlags && UI != UE; ++UI)
+ switch (UI->getOpcode()) {
+ default:
+ case ISD::BR_CC:
+ case ISD::BRCOND:
+ case ISD::SELECT:
+ ExpectingFlags = true;
+ break;
+ case ISD::CopyToReg:
+ case ISD::SIGN_EXTEND:
+ case ISD::ZERO_EXTEND:
+ case ISD::ANY_EXTEND:
+ break;
+ }
+
+ if (!ExpectingFlags) {
+ enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
+ enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
+
+ if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
+ X86::CondCode tmp = cc0;
+ cc0 = cc1;
+ cc1 = tmp;
+ }
+
+ if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
+ (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
+ bool is64BitFP = (CMP00.getValueType() == MVT::f64);
+ X86ISD::NodeType NTOperator = is64BitFP ?
+ X86ISD::FSETCCsd : X86ISD::FSETCCss;
+ // FIXME: need symbolic constants for these magic numbers.
+ // See X86ATTInstPrinter.cpp:printSSECC().
+ unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
+ SDValue OnesOrZeroesF = DAG.getNode(NTOperator, DL, MVT::f32, CMP00, CMP01,
+ DAG.getConstant(x86cc, MVT::i8));
+ SDValue OnesOrZeroesI = DAG.getNode(ISD::BITCAST, DL, MVT::i32,
+ OnesOrZeroesF);
+ SDValue ANDed = DAG.getNode(ISD::AND, DL, MVT::i32, OnesOrZeroesI,
+ DAG.getConstant(1, MVT::i32));
+ SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ANDed);
+ return OneBitOfTruth;
+ }
+ }
+ }
+ }
+ return SDValue();
+}
+
+/// CanFoldXORWithAllOnes - Test whether the XOR operand is a AllOnes vector
+/// so it can be folded inside ANDNP.
+static bool CanFoldXORWithAllOnes(const SDNode *N) {
+ EVT VT = N->getValueType(0);
+
+ // Match direct AllOnes for 128 and 256-bit vectors
+ if (ISD::isBuildVectorAllOnes(N))
+ return true;
+
+ // Look through a bit convert.
+ if (N->getOpcode() == ISD::BITCAST)
+ N = N->getOperand(0).getNode();
+
+ // Sometimes the operand may come from a insert_subvector building a 256-bit
+ // allones vector
+ if (VT.is256BitVector() &&
+ N->getOpcode() == ISD::INSERT_SUBVECTOR) {
+ SDValue V1 = N->getOperand(0);
+ SDValue V2 = N->getOperand(1);
+
+ if (V1.getOpcode() == ISD::INSERT_SUBVECTOR &&
+ V1.getOperand(0).getOpcode() == ISD::UNDEF &&
+ ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) &&
+ ISD::isBuildVectorAllOnes(V2.getNode()))
+ return true;
+ }
+
+ return false;
+}
+
+// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
+// register. In most cases we actually compare or select YMM-sized registers
+// and mixing the two types creates horrible code. This method optimizes
+// some of the transition sequences.
+static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget *Subtarget) {
+ EVT VT = N->getValueType(0);
+ if (!VT.is256BitVector())
+ return SDValue();
+
+ assert((N->getOpcode() == ISD::ANY_EXTEND ||
+ N->getOpcode() == ISD::ZERO_EXTEND ||
+ N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
+
+ SDValue Narrow = N->getOperand(0);
+ EVT NarrowVT = Narrow->getValueType(0);
+ if (!NarrowVT.is128BitVector())
+ return SDValue();
+
+ if (Narrow->getOpcode() != ISD::XOR &&
+ Narrow->getOpcode() != ISD::AND &&
+ Narrow->getOpcode() != ISD::OR)
+ return SDValue();
+
+ SDValue N0 = Narrow->getOperand(0);
+ SDValue N1 = Narrow->getOperand(1);
+ DebugLoc DL = Narrow->getDebugLoc();
+
+ // The Left side has to be a trunc.
+ if (N0.getOpcode() != ISD::TRUNCATE)
+ return SDValue();
+
+ // The type of the truncated inputs.
+ EVT WideVT = N0->getOperand(0)->getValueType(0);
+ if (WideVT != VT)
+ return SDValue();
+
+ // The right side has to be a 'trunc' or a constant vector.
+ bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE;
+ bool RHSConst = (isSplatVector(N1.getNode()) &&
+ isa<ConstantSDNode>(N1->getOperand(0)));
+ if (!RHSTrunc && !RHSConst)
+ return SDValue();
+
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+ if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), WideVT))
+ return SDValue();
+
+ // Set N0 and N1 to hold the inputs to the new wide operation.
+ N0 = N0->getOperand(0);
+ if (RHSConst) {
+ N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getScalarType(),
+ N1->getOperand(0));
+ SmallVector<SDValue, 8> C(WideVT.getVectorNumElements(), N1);
+ N1 = DAG.getNode(ISD::BUILD_VECTOR, DL, WideVT, &C[0], C.size());
+ } else if (RHSTrunc) {
+ N1 = N1->getOperand(0);
+ }
+
+ // Generate the wide operation.
+ SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, WideVT, N0, N1);
+ unsigned Opcode = N->getOpcode();
+ switch (Opcode) {
+ case ISD::ANY_EXTEND:
+ return Op;
+ case ISD::ZERO_EXTEND: {
+ unsigned InBits = NarrowVT.getScalarType().getSizeInBits();
+ APInt Mask = APInt::getAllOnesValue(InBits);
+ Mask = Mask.zext(VT.getScalarType().getSizeInBits());
+ return DAG.getNode(ISD::AND, DL, VT,
+ Op, DAG.getConstant(Mask, VT));
+ }
+ case ISD::SIGN_EXTEND:
+ return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
+ Op, DAG.getValueType(NarrowVT));
+ default:
+ llvm_unreachable("Unexpected opcode");
+ }
+}
+
+static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget *Subtarget) {
+ EVT VT = N->getValueType(0);
+ if (DCI.isBeforeLegalizeOps())
+ return SDValue();
+
+ SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget);
+ if (R.getNode())
+ return R;
+
+ // Create BLSI, and BLSR instructions
+ // BLSI is X & (-X)
+ // BLSR is X & (X-1)
+ if (Subtarget->hasBMI() && (VT == MVT::i32 || VT == MVT::i64)) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ DebugLoc DL = N->getDebugLoc();
+
+ // Check LHS for neg
+ if (N0.getOpcode() == ISD::SUB && N0.getOperand(1) == N1 &&
+ isZero(N0.getOperand(0)))
+ return DAG.getNode(X86ISD::BLSI, DL, VT, N1);
+
+ // Check RHS for neg
+ if (N1.getOpcode() == ISD::SUB && N1.getOperand(1) == N0 &&
+ isZero(N1.getOperand(0)))
+ return DAG.getNode(X86ISD::BLSI, DL, VT, N0);
+
+ // Check LHS for X-1
+ if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1 &&
+ isAllOnes(N0.getOperand(1)))
+ return DAG.getNode(X86ISD::BLSR, DL, VT, N1);
+
+ // Check RHS for X-1
+ if (N1.getOpcode() == ISD::ADD && N1.getOperand(0) == N0 &&
+ isAllOnes(N1.getOperand(1)))
+ return DAG.getNode(X86ISD::BLSR, DL, VT, N0);
+
+ return SDValue();
+ }
+
+ // Want to form ANDNP nodes:
+ // 1) In the hopes of then easily combining them with OR and AND nodes
+ // to form PBLEND/PSIGN.
+ // 2) To match ANDN packed intrinsics
+ if (VT != MVT::v2i64 && VT != MVT::v4i64)
+ return SDValue();
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ DebugLoc DL = N->getDebugLoc();
+
+ // Check LHS for vnot
+ if (N0.getOpcode() == ISD::XOR &&
+ //ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
+ CanFoldXORWithAllOnes(N0.getOperand(1).getNode()))
+ return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1);
+
+ // Check RHS for vnot
+ if (N1.getOpcode() == ISD::XOR &&
+ //ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
+ CanFoldXORWithAllOnes(N1.getOperand(1).getNode()))
+ return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0);
+
+ return SDValue();
+}
+
+static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget *Subtarget) {
+ EVT VT = N->getValueType(0);
+ if (DCI.isBeforeLegalizeOps())
+ return SDValue();
+
+ SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget);
+ if (R.getNode())
+ return R;
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+
+ // look for psign/blend
+ if (VT == MVT::v2i64 || VT == MVT::v4i64) {
+ if (!Subtarget->hasSSSE3() ||
+ (VT == MVT::v4i64 && !Subtarget->hasInt256()))
+ return SDValue();
+
+ // Canonicalize pandn to RHS
+ if (N0.getOpcode() == X86ISD::ANDNP)
+ std::swap(N0, N1);
+ // or (and (m, y), (pandn m, x))
+ if (N0.getOpcode() == ISD::AND && N1.getOpcode() == X86ISD::ANDNP) {
+ SDValue Mask = N1.getOperand(0);
+ SDValue X = N1.getOperand(1);
+ SDValue Y;
+ if (N0.getOperand(0) == Mask)
+ Y = N0.getOperand(1);
+ if (N0.getOperand(1) == Mask)
+ Y = N0.getOperand(0);
+
+ // Check to see if the mask appeared in both the AND and ANDNP and
+ if (!Y.getNode())
+ return SDValue();
+
+ // Validate that X, Y, and Mask are BIT_CONVERTS, and see through them.
+ // Look through mask bitcast.
+ if (Mask.getOpcode() == ISD::BITCAST)
+ Mask = Mask.getOperand(0);
+ if (X.getOpcode() == ISD::BITCAST)
+ X = X.getOperand(0);
+ if (Y.getOpcode() == ISD::BITCAST)
+ Y = Y.getOperand(0);
+
+ EVT MaskVT = Mask.getValueType();
+
+ // Validate that the Mask operand is a vector sra node.
+ // FIXME: what to do for bytes, since there is a psignb/pblendvb, but
+ // there is no psrai.b
+ unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits();
+ unsigned SraAmt = ~0;
+ if (Mask.getOpcode() == ISD::SRA) {
+ SDValue Amt = Mask.getOperand(1);
+ if (isSplatVector(Amt.getNode())) {
+ SDValue SclrAmt = Amt->getOperand(0);
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt))
+ SraAmt = C->getZExtValue();
+ }
+ } else if (Mask.getOpcode() == X86ISD::VSRAI) {
+ SDValue SraC = Mask.getOperand(1);
+ SraAmt = cast<ConstantSDNode>(SraC)->getZExtValue();
+ }
+ if ((SraAmt + 1) != EltBits)
+ return SDValue();
+
+ DebugLoc DL = N->getDebugLoc();
+
+ // Now we know we at least have a plendvb with the mask val. See if
+ // we can form a psignb/w/d.
+ // psign = x.type == y.type == mask.type && y = sub(0, x);
+ if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X &&
+ ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) &&
+ X.getValueType() == MaskVT && Y.getValueType() == MaskVT) {
+ assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
+ "Unsupported VT for PSIGN");
+ Mask = DAG.getNode(X86ISD::PSIGN, DL, MaskVT, X, Mask.getOperand(0));
+ return DAG.getNode(ISD::BITCAST, DL, VT, Mask);
+ }
+ // PBLENDVB only available on SSE 4.1
+ if (!Subtarget->hasSSE41())
+ return SDValue();
+
+ EVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;
+
+ X = DAG.getNode(ISD::BITCAST, DL, BlendVT, X);
+ Y = DAG.getNode(ISD::BITCAST, DL, BlendVT, Y);
+ Mask = DAG.getNode(ISD::BITCAST, DL, BlendVT, Mask);
+ Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X);
+ return DAG.getNode(ISD::BITCAST, DL, VT, Mask);
+ }
+ }
+
+ if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
+ return SDValue();
+
+ // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
+ if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
+ std::swap(N0, N1);
+ if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
+ return SDValue();
+ if (!N0.hasOneUse() || !N1.hasOneUse())
+ return SDValue();
+
+ SDValue ShAmt0 = N0.getOperand(1);
+ if (ShAmt0.getValueType() != MVT::i8)
+ return SDValue();
+ SDValue ShAmt1 = N1.getOperand(1);
+ if (ShAmt1.getValueType() != MVT::i8)
+ return SDValue();
+ if (ShAmt0.getOpcode() == ISD::TRUNCATE)
+ ShAmt0 = ShAmt0.getOperand(0);
+ if (ShAmt1.getOpcode() == ISD::TRUNCATE)
+ ShAmt1 = ShAmt1.getOperand(0);
+
+ DebugLoc DL = N->getDebugLoc();
+ unsigned Opc = X86ISD::SHLD;
+ SDValue Op0 = N0.getOperand(0);
+ SDValue Op1 = N1.getOperand(0);
+ if (ShAmt0.getOpcode() == ISD::SUB) {
+ Opc = X86ISD::SHRD;
+ std::swap(Op0, Op1);
+ std::swap(ShAmt0, ShAmt1);
+ }
+
+ unsigned Bits = VT.getSizeInBits();
+ if (ShAmt1.getOpcode() == ISD::SUB) {
+ SDValue Sum = ShAmt1.getOperand(0);
+ if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
+ SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
+ if (ShAmt1Op1.getNode()->getOpcode() == ISD::TRUNCATE)
+ ShAmt1Op1 = ShAmt1Op1.getOperand(0);
+ if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
+ return DAG.getNode(Opc, DL, VT,
+ Op0, Op1,
+ DAG.getNode(ISD::TRUNCATE, DL,
+ MVT::i8, ShAmt0));
+ }
+ } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
+ ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
+ if (ShAmt0C &&
+ ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits)
+ return DAG.getNode(Opc, DL, VT,
+ N0.getOperand(0), N1.getOperand(0),
+ DAG.getNode(ISD::TRUNCATE, DL,
+ MVT::i8, ShAmt0));
+ }
+
+ return SDValue();
+}
+
+// Generate NEG and CMOV for integer abs.
+static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
+ EVT VT = N->getValueType(0);
+
+ // Since X86 does not have CMOV for 8-bit integer, we don't convert
+ // 8-bit integer abs to NEG and CMOV.
+ if (VT.isInteger() && VT.getSizeInBits() == 8)
+ return SDValue();
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ DebugLoc DL = N->getDebugLoc();
+
+ // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
+ // and change it to SUB and CMOV.
+ if (VT.isInteger() && N->getOpcode() == ISD::XOR &&
+ N0.getOpcode() == ISD::ADD &&
+ N0.getOperand(1) == N1 &&
+ N1.getOpcode() == ISD::SRA &&
+ N1.getOperand(0) == N0.getOperand(0))
+ if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1)))
+ if (Y1C->getAPIntValue() == VT.getSizeInBits()-1) {
+ // Generate SUB & CMOV.
+ SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
+ DAG.getConstant(0, VT), N0.getOperand(0));
+
+ SDValue Ops[] = { N0.getOperand(0), Neg,
+ DAG.getConstant(X86::COND_GE, MVT::i8),
+ SDValue(Neg.getNode(), 1) };
+ return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue),
+ Ops, array_lengthof(Ops));
+ }
+ return SDValue();
+}
+
+// PerformXorCombine - Attempts to turn XOR nodes into BLSMSK nodes
+static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget *Subtarget) {
+ EVT VT = N->getValueType(0);
+ if (DCI.isBeforeLegalizeOps())
+ return SDValue();
+
+ if (Subtarget->hasCMov()) {
+ SDValue RV = performIntegerAbsCombine(N, DAG);
+ if (RV.getNode())
+ return RV;
+ }
+
+ // Try forming BMI if it is available.
+ if (!Subtarget->hasBMI())
+ return SDValue();
+
+ if (VT != MVT::i32 && VT != MVT::i64)
+ return SDValue();
+
+ assert(Subtarget->hasBMI() && "Creating BLSMSK requires BMI instructions");
+
+ // Create BLSMSK instructions by finding X ^ (X-1)
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ DebugLoc DL = N->getDebugLoc();
+
+ if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1 &&
+ isAllOnes(N0.getOperand(1)))
+ return DAG.getNode(X86ISD::BLSMSK, DL, VT, N1);
+
+ if (N1.getOpcode() == ISD::ADD && N1.getOperand(0) == N0 &&
+ isAllOnes(N1.getOperand(1)))
+ return DAG.getNode(X86ISD::BLSMSK, DL, VT, N0);
+
+ return SDValue();
+}
+
+/// PerformLOADCombine - Do target-specific dag combines on LOAD nodes.
+static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget *Subtarget) {
+ LoadSDNode *Ld = cast<LoadSDNode>(N);
+ EVT RegVT = Ld->getValueType(0);
+ EVT MemVT = Ld->getMemoryVT();
+ DebugLoc dl = Ld->getDebugLoc();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ unsigned RegSz = RegVT.getSizeInBits();
+
+ // On Sandybridge unaligned 256bit loads are inefficient.
+ ISD::LoadExtType Ext = Ld->getExtensionType();
+ unsigned Alignment = Ld->getAlignment();
+ bool IsAligned = Alignment == 0 || Alignment >= MemVT.getSizeInBits()/8;
+ if (RegVT.is256BitVector() && !Subtarget->hasInt256() &&
+ !DCI.isBeforeLegalizeOps() && !IsAligned && Ext == ISD::NON_EXTLOAD) {
+ unsigned NumElems = RegVT.getVectorNumElements();
+ if (NumElems < 2)
+ return SDValue();
+
+ SDValue Ptr = Ld->getBasePtr();
+ SDValue Increment = DAG.getConstant(16, TLI.getPointerTy());
+
+ EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
+ NumElems/2);
+ SDValue Load1 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
+ Ld->getPointerInfo(), Ld->isVolatile(),
+ Ld->isNonTemporal(), Ld->isInvariant(),
+ Alignment);
+ Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
+ SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
+ Ld->getPointerInfo(), Ld->isVolatile(),
+ Ld->isNonTemporal(), Ld->isInvariant(),
+ std::min(16U, Alignment));
+ SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ Load1.getValue(1),
+ Load2.getValue(1));
+
+ SDValue NewVec = DAG.getUNDEF(RegVT);
+ NewVec = Insert128BitVector(NewVec, Load1, 0, DAG, dl);
+ NewVec = Insert128BitVector(NewVec, Load2, NumElems/2, DAG, dl);
+ return DCI.CombineTo(N, NewVec, TF, true);
+ }
+
+ // If this is a vector EXT Load then attempt to optimize it using a
+ // shuffle. If SSSE3 is not available we may emit an illegal shuffle but the
+ // expansion is still better than scalar code.
+ // We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise we'll
+ // emit a shuffle and a arithmetic shift.
+ // TODO: It is possible to support ZExt by zeroing the undef values
+ // during the shuffle phase or after the shuffle.
+ if (RegVT.isVector() && RegVT.isInteger() && Subtarget->hasSSE2() &&
+ (Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)) {
+ assert(MemVT != RegVT && "Cannot extend to the same type");
+ assert(MemVT.isVector() && "Must load a vector from memory");
+
+ unsigned NumElems = RegVT.getVectorNumElements();
+ unsigned MemSz = MemVT.getSizeInBits();
+ assert(RegSz > MemSz && "Register size must be greater than the mem size");
+
+ if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget->hasInt256())
+ return SDValue();
+
+ // All sizes must be a power of two.
+ if (!isPowerOf2_32(RegSz * MemSz * NumElems))
+ return SDValue();
+
+ // Attempt to load the original value using scalar loads.
+ // Find the largest scalar type that divides the total loaded size.
+ MVT SclrLoadTy = MVT::i8;
+ for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE;
+ tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) {
+ MVT Tp = (MVT::SimpleValueType)tp;
+ if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
+ SclrLoadTy = Tp;
+ }
+ }
+
+ // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
+ if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
+ (64 <= MemSz))
+ SclrLoadTy = MVT::f64;
+
+ // Calculate the number of scalar loads that we need to perform
+ // in order to load our vector from memory.
+ unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
+ if (Ext == ISD::SEXTLOAD && NumLoads > 1)
+ return SDValue();
+
+ unsigned loadRegZize = RegSz;
+ if (Ext == ISD::SEXTLOAD && RegSz == 256)
+ loadRegZize /= 2;
+
+ // Represent our vector as a sequence of elements which are the
+ // largest scalar that we can load.
+ EVT LoadUnitVecVT = EVT::getVectorVT(*DAG.getContext(), SclrLoadTy,
+ loadRegZize/SclrLoadTy.getSizeInBits());
+
+ // Represent the data using the same element type that is stored in
+ // memory. In practice, we ''widen'' MemVT.
+ EVT WideVecVT =
+ EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
+ loadRegZize/MemVT.getScalarType().getSizeInBits());
+
+ assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
+ "Invalid vector type");
+
+ // We can't shuffle using an illegal type.
+ if (!TLI.isTypeLegal(WideVecVT))
+ return SDValue();
+
+ SmallVector<SDValue, 8> Chains;
+ SDValue Ptr = Ld->getBasePtr();
+ SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits()/8,
+ TLI.getPointerTy());
+ SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
+
+ for (unsigned i = 0; i < NumLoads; ++i) {
+ // Perform a single load.
+ SDValue ScalarLoad = DAG.getLoad(SclrLoadTy, dl, Ld->getChain(),
+ Ptr, Ld->getPointerInfo(),
+ Ld->isVolatile(), Ld->isNonTemporal(),
+ Ld->isInvariant(), Ld->getAlignment());
+ Chains.push_back(ScalarLoad.getValue(1));
+ // Create the first element type using SCALAR_TO_VECTOR in order to avoid
+ // another round of DAGCombining.
+ if (i == 0)
+ Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
+ else
+ Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
+ ScalarLoad, DAG.getIntPtrConstant(i));
+
+ Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
+ }
+
+ SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Chains[0],
+ Chains.size());
+
+ // Bitcast the loaded value to a vector of the original element type, in
+ // the size of the target vector type.
+ SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Res);
+ unsigned SizeRatio = RegSz/MemSz;
+
+ if (Ext == ISD::SEXTLOAD) {
+ // If we have SSE4.1 we can directly emit a VSEXT node.
+ if (Subtarget->hasSSE41()) {
+ SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec);
+ return DCI.CombineTo(N, Sext, TF, true);
+ }
+
+ // Otherwise we'll shuffle the small elements in the high bits of the
+ // larger type and perform an arithmetic shift. If the shift is not legal
+ // it's better to scalarize.
+ if (!TLI.isOperationLegalOrCustom(ISD::SRA, RegVT))
+ return SDValue();
+
+ // Redistribute the loaded elements into the different locations.
+ SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
+ for (unsigned i = 0; i != NumElems; ++i)
+ ShuffleVec[i*SizeRatio + SizeRatio-1] = i;
+
+ SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
+ DAG.getUNDEF(WideVecVT),
+ &ShuffleVec[0]);
+
+ Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
+
+ // Build the arithmetic shift.
+ unsigned Amt = RegVT.getVectorElementType().getSizeInBits() -
+ MemVT.getVectorElementType().getSizeInBits();
+ Shuff = DAG.getNode(ISD::SRA, dl, RegVT, Shuff,
+ DAG.getConstant(Amt, RegVT));
+
+ return DCI.CombineTo(N, Shuff, TF, true);
+ }
+
+ // Redistribute the loaded elements into the different locations.
+ SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
+ for (unsigned i = 0; i != NumElems; ++i)
+ ShuffleVec[i*SizeRatio] = i;
+
+ SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
+ DAG.getUNDEF(WideVecVT),
+ &ShuffleVec[0]);
+
+ // Bitcast to the requested type.
+ Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
+ // Replace the original load with the new sequence
+ // and return the new chain.
+ return DCI.CombineTo(N, Shuff, TF, true);
+ }
+
+ return SDValue();
+}
+
+/// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
+static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ StoreSDNode *St = cast<StoreSDNode>(N);
+ EVT VT = St->getValue().getValueType();
+ EVT StVT = St->getMemoryVT();
+ DebugLoc dl = St->getDebugLoc();
+ SDValue StoredVal = St->getOperand(1);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+ // If we are saving a concatenation of two XMM registers, perform two stores.
+ // On Sandy Bridge, 256-bit memory operations are executed by two
+ // 128-bit ports. However, on Haswell it is better to issue a single 256-bit
+ // memory operation.
+ unsigned Alignment = St->getAlignment();
+ bool IsAligned = Alignment == 0 || Alignment >= VT.getSizeInBits()/8;
+ if (VT.is256BitVector() && !Subtarget->hasInt256() &&
+ StVT == VT && !IsAligned) {
+ unsigned NumElems = VT.getVectorNumElements();
+ if (NumElems < 2)
+ return SDValue();
+
+ SDValue Value0 = Extract128BitVector(StoredVal, 0, DAG, dl);
+ SDValue Value1 = Extract128BitVector(StoredVal, NumElems/2, DAG, dl);
+
+ SDValue Stride = DAG.getConstant(16, TLI.getPointerTy());
+ SDValue Ptr0 = St->getBasePtr();
+ SDValue Ptr1 = DAG.getNode(ISD::ADD, dl, Ptr0.getValueType(), Ptr0, Stride);
+
+ SDValue Ch0 = DAG.getStore(St->getChain(), dl, Value0, Ptr0,
+ St->getPointerInfo(), St->isVolatile(),
+ St->isNonTemporal(), Alignment);
+ SDValue Ch1 = DAG.getStore(St->getChain(), dl, Value1, Ptr1,
+ St->getPointerInfo(), St->isVolatile(),
+ St->isNonTemporal(),
+ std::min(16U, Alignment));
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
+ }
+
+ // Optimize trunc store (of multiple scalars) to shuffle and store.
+ // First, pack all of the elements in one place. Next, store to memory
+ // in fewer chunks.
+ if (St->isTruncatingStore() && VT.isVector()) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ unsigned NumElems = VT.getVectorNumElements();
+ assert(StVT != VT && "Cannot truncate to the same type");
+ unsigned FromSz = VT.getVectorElementType().getSizeInBits();
+ unsigned ToSz = StVT.getVectorElementType().getSizeInBits();
+
+ // From, To sizes and ElemCount must be pow of two
+ if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
+ // We are going to use the original vector elt for storing.
+ // Accumulated smaller vector elements must be a multiple of the store size.
+ if (0 != (NumElems * FromSz) % ToSz) return SDValue();
+
+ unsigned SizeRatio = FromSz / ToSz;
+
+ assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
+
+ // Create a type on which we perform the shuffle
+ EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
+ StVT.getScalarType(), NumElems*SizeRatio);
+
+ assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
+
+ SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, St->getValue());
+ SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
+ for (unsigned i = 0; i != NumElems; ++i)
+ ShuffleVec[i] = i * SizeRatio;
+
+ // Can't shuffle using an illegal type.
+ if (!TLI.isTypeLegal(WideVecVT))
+ return SDValue();
+
+ SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
+ DAG.getUNDEF(WideVecVT),
+ &ShuffleVec[0]);
+ // At this point all of the data is stored at the bottom of the
+ // register. We now need to save it to mem.
+
+ // Find the largest store unit
+ MVT StoreType = MVT::i8;
+ for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE;
+ tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) {
+ MVT Tp = (MVT::SimpleValueType)tp;
+ if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
+ StoreType = Tp;
+ }
+
+ // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
+ if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
+ (64 <= NumElems * ToSz))
+ StoreType = MVT::f64;
+
+ // Bitcast the original vector into a vector of store-size units
+ EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
+ StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
+ assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
+ SDValue ShuffWide = DAG.getNode(ISD::BITCAST, dl, StoreVecVT, Shuff);
+ SmallVector<SDValue, 8> Chains;
+ SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8,
+ TLI.getPointerTy());
+ SDValue Ptr = St->getBasePtr();
+
+ // Perform one or more big stores into memory.
+ for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) {
+ SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
+ StoreType, ShuffWide,
+ DAG.getIntPtrConstant(i));
+ SDValue Ch = DAG.getStore(St->getChain(), dl, SubVec, Ptr,
+ St->getPointerInfo(), St->isVolatile(),
+ St->isNonTemporal(), St->getAlignment());
+ Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
+ Chains.push_back(Ch);
+ }
+
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Chains[0],
+ Chains.size());
+ }
+
+ // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
+ // the FP state in cases where an emms may be missing.
+ // A preferable solution to the general problem is to figure out the right
+ // places to insert EMMS. This qualifies as a quick hack.
+
+ // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
+ if (VT.getSizeInBits() != 64)
+ return SDValue();
+
+ const Function *F = DAG.getMachineFunction().getFunction();
+ bool NoImplicitFloatOps = F->getAttributes().
+ hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
+ bool F64IsLegal = !DAG.getTarget().Options.UseSoftFloat && !NoImplicitFloatOps
+ && Subtarget->hasSSE2();
+ if ((VT.isVector() ||
+ (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) &&
+ isa<LoadSDNode>(St->getValue()) &&
+ !cast<LoadSDNode>(St->getValue())->isVolatile() &&
+ St->getChain().hasOneUse() && !St->isVolatile()) {
+ SDNode* LdVal = St->getValue().getNode();
+ LoadSDNode *Ld = 0;
+ int TokenFactorIndex = -1;
+ SmallVector<SDValue, 8> Ops;
+ SDNode* ChainVal = St->getChain().getNode();
+ // Must be a store of a load. We currently handle two cases: the load
+ // is a direct child, and it's under an intervening TokenFactor. It is
+ // possible to dig deeper under nested TokenFactors.
+ if (ChainVal == LdVal)
+ Ld = cast<LoadSDNode>(St->getChain());
+ else if (St->getValue().hasOneUse() &&
+ ChainVal->getOpcode() == ISD::TokenFactor) {
+ for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) {
+ if (ChainVal->getOperand(i).getNode() == LdVal) {
+ TokenFactorIndex = i;
+ Ld = cast<LoadSDNode>(St->getValue());
+ } else
+ Ops.push_back(ChainVal->getOperand(i));
+ }
+ }
+
+ if (!Ld || !ISD::isNormalLoad(Ld))
+ return SDValue();
+
+ // If this is not the MMX case, i.e. we are just turning i64 load/store
+ // into f64 load/store, avoid the transformation if there are multiple
+ // uses of the loaded value.
+ if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
+ return SDValue();
+
+ DebugLoc LdDL = Ld->getDebugLoc();
+ DebugLoc StDL = N->getDebugLoc();
+ // If we are a 64-bit capable x86, lower to a single movq load/store pair.
+ // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
+ // pair instead.
+ if (Subtarget->is64Bit() || F64IsLegal) {
+ EVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64;
+ SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
+ Ld->getPointerInfo(), Ld->isVolatile(),
+ Ld->isNonTemporal(), Ld->isInvariant(),
+ Ld->getAlignment());
+ SDValue NewChain = NewLd.getValue(1);
+ if (TokenFactorIndex != -1) {
+ Ops.push_back(NewChain);
+ NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0],
+ Ops.size());
+ }
+ return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
+ St->getPointerInfo(),
+ St->isVolatile(), St->isNonTemporal(),
+ St->getAlignment());
+ }
+
+ // Otherwise, lower to two pairs of 32-bit loads / stores.
+ SDValue LoAddr = Ld->getBasePtr();
+ SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr,
+ DAG.getConstant(4, MVT::i32));
+
+ SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
+ Ld->getPointerInfo(),
+ Ld->isVolatile(), Ld->isNonTemporal(),
+ Ld->isInvariant(), Ld->getAlignment());
+ SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
+ Ld->getPointerInfo().getWithOffset(4),
+ Ld->isVolatile(), Ld->isNonTemporal(),
+ Ld->isInvariant(),
+ MinAlign(Ld->getAlignment(), 4));
+
+ SDValue NewChain = LoLd.getValue(1);
+ if (TokenFactorIndex != -1) {
+ Ops.push_back(LoLd);
+ Ops.push_back(HiLd);
+ NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0],
+ Ops.size());
+ }
+
+ LoAddr = St->getBasePtr();
+ HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr,
+ DAG.getConstant(4, MVT::i32));
+
+ SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr,
+ St->getPointerInfo(),
+ St->isVolatile(), St->isNonTemporal(),
+ St->getAlignment());
+ SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr,
+ St->getPointerInfo().getWithOffset(4),
+ St->isVolatile(),
+ St->isNonTemporal(),
+ MinAlign(St->getAlignment(), 4));
+ return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
+ }
+ return SDValue();
+}
+
+/// isHorizontalBinOp - Return 'true' if this vector operation is "horizontal"
+/// and return the operands for the horizontal operation in LHS and RHS. A
+/// horizontal operation performs the binary operation on successive elements
+/// of its first operand, then on successive elements of its second operand,
+/// returning the resulting values in a vector. For example, if
+/// A = < float a0, float a1, float a2, float a3 >
+/// and
+/// B = < float b0, float b1, float b2, float b3 >
+/// then the result of doing a horizontal operation on A and B is
+/// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
+/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
+/// A horizontal-op B, for some already available A and B, and if so then LHS is
+/// set to A, RHS to B, and the routine returns 'true'.
+/// Note that the binary operation should have the property that if one of the
+/// operands is UNDEF then the result is UNDEF.
+static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
+ // Look for the following pattern: if
+ // A = < float a0, float a1, float a2, float a3 >
+ // B = < float b0, float b1, float b2, float b3 >
+ // and
+ // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
+ // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
+ // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
+ // which is A horizontal-op B.
+
+ // At least one of the operands should be a vector shuffle.
+ if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
+ RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
+ return false;
+
+ EVT VT = LHS.getValueType();
+
+ assert((VT.is128BitVector() || VT.is256BitVector()) &&
+ "Unsupported vector type for horizontal add/sub");
+
+ // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to
+ // operate independently on 128-bit lanes.
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned NumLanes = VT.getSizeInBits()/128;
+ unsigned NumLaneElts = NumElts / NumLanes;
+ assert((NumLaneElts % 2 == 0) &&
+ "Vector type should have an even number of elements in each lane");
+ unsigned HalfLaneElts = NumLaneElts/2;
+
+ // View LHS in the form
+ // LHS = VECTOR_SHUFFLE A, B, LMask
+ // If LHS is not a shuffle then pretend it is the shuffle
+ // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
+ // NOTE: in what follows a default initialized SDValue represents an UNDEF of
+ // type VT.
+ SDValue A, B;
+ SmallVector<int, 16> LMask(NumElts);
+ if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
+ if (LHS.getOperand(0).getOpcode() != ISD::UNDEF)
+ A = LHS.getOperand(0);
+ if (LHS.getOperand(1).getOpcode() != ISD::UNDEF)
+ B = LHS.getOperand(1);
+ ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
+ std::copy(Mask.begin(), Mask.end(), LMask.begin());
+ } else {
+ if (LHS.getOpcode() != ISD::UNDEF)
+ A = LHS;
+ for (unsigned i = 0; i != NumElts; ++i)
+ LMask[i] = i;
+ }
+
+ // Likewise, view RHS in the form
+ // RHS = VECTOR_SHUFFLE C, D, RMask
+ SDValue C, D;
+ SmallVector<int, 16> RMask(NumElts);
+ if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
+ if (RHS.getOperand(0).getOpcode() != ISD::UNDEF)
+ C = RHS.getOperand(0);
+ if (RHS.getOperand(1).getOpcode() != ISD::UNDEF)
+ D = RHS.getOperand(1);
+ ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
+ std::copy(Mask.begin(), Mask.end(), RMask.begin());
+ } else {
+ if (RHS.getOpcode() != ISD::UNDEF)
+ C = RHS;
+ for (unsigned i = 0; i != NumElts; ++i)
+ RMask[i] = i;
+ }
+
+ // Check that the shuffles are both shuffling the same vectors.
+ if (!(A == C && B == D) && !(A == D && B == C))
+ return false;
+
+ // If everything is UNDEF then bail out: it would be better to fold to UNDEF.
+ if (!A.getNode() && !B.getNode())
+ return false;
+
+ // If A and B occur in reverse order in RHS, then "swap" them (which means
+ // rewriting the mask).
+ if (A != C)
+ CommuteVectorShuffleMask(RMask, NumElts);
+
+ // At this point LHS and RHS are equivalent to
+ // LHS = VECTOR_SHUFFLE A, B, LMask
+ // RHS = VECTOR_SHUFFLE A, B, RMask
+ // Check that the masks correspond to performing a horizontal operation.
+ for (unsigned i = 0; i != NumElts; ++i) {
+ int LIdx = LMask[i], RIdx = RMask[i];
+
+ // Ignore any UNDEF components.
+ if (LIdx < 0 || RIdx < 0 ||
+ (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
+ (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
+ continue;
+
+ // Check that successive elements are being operated on. If not, this is
+ // not a horizontal operation.
+ unsigned Src = (i/HalfLaneElts) % 2; // each lane is split between srcs
+ unsigned LaneStart = (i/NumLaneElts) * NumLaneElts;
+ int Index = 2*(i%HalfLaneElts) + NumElts*Src + LaneStart;
+ if (!(LIdx == Index && RIdx == Index + 1) &&
+ !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
+ return false;
+ }
+
+ LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
+ RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
+ return true;
+}
+
+/// PerformFADDCombine - Do target-specific dag combines on floating point adds.
+static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ EVT VT = N->getValueType(0);
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+
+ // Try to synthesize horizontal adds from adds of shuffles.
+ if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
+ (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
+ isHorizontalBinOp(LHS, RHS, true))
+ return DAG.getNode(X86ISD::FHADD, N->getDebugLoc(), VT, LHS, RHS);
+ return SDValue();
+}
+
+/// PerformFSUBCombine - Do target-specific dag combines on floating point subs.
+static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ EVT VT = N->getValueType(0);
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+
+ // Try to synthesize horizontal subs from subs of shuffles.
+ if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
+ (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
+ isHorizontalBinOp(LHS, RHS, false))
+ return DAG.getNode(X86ISD::FHSUB, N->getDebugLoc(), VT, LHS, RHS);
+ return SDValue();
+}
+
+/// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and
+/// X86ISD::FXOR nodes.
+static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) {
+ assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
+ // F[X]OR(0.0, x) -> x
+ // F[X]OR(x, 0.0) -> x
+ if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
+ if (C->getValueAPF().isPosZero())
+ return N->getOperand(1);
+ if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
+ if (C->getValueAPF().isPosZero())
+ return N->getOperand(0);
+ return SDValue();
+}
+
+/// PerformFMinFMaxCombine - Do target-specific dag combines on X86ISD::FMIN and
+/// X86ISD::FMAX nodes.
+static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) {
+ assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
+
+ // Only perform optimizations if UnsafeMath is used.
+ if (!DAG.getTarget().Options.UnsafeFPMath)
+ return SDValue();
+
+ // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
+ // into FMINC and FMAXC, which are Commutative operations.
+ unsigned NewOp = 0;
+ switch (N->getOpcode()) {
+ default: llvm_unreachable("unknown opcode");
+ case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
+ case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
+ }
+
+ return DAG.getNode(NewOp, N->getDebugLoc(), N->getValueType(0),
+ N->getOperand(0), N->getOperand(1));
+}
+
+/// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes.
+static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
+ // FAND(0.0, x) -> 0.0
+ // FAND(x, 0.0) -> 0.0
+ if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
+ if (C->getValueAPF().isPosZero())
+ return N->getOperand(0);
+ if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
+ if (C->getValueAPF().isPosZero())
+ return N->getOperand(1);
+ return SDValue();
+}
+
+static SDValue PerformBTCombine(SDNode *N,
+ SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ // BT ignores high bits in the bit index operand.
+ SDValue Op1 = N->getOperand(1);
+ if (Op1.hasOneUse()) {
+ unsigned BitWidth = Op1.getValueSizeInBits();
+ APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
+ APInt KnownZero, KnownOne;
+ TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
+ !DCI.isBeforeLegalizeOps());
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) ||
+ TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO))
+ DCI.CommitTargetLoweringOpt(TLO);
+ }
+ return SDValue();
+}
+
+static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) {
+ SDValue Op = N->getOperand(0);
+ if (Op.getOpcode() == ISD::BITCAST)
+ Op = Op.getOperand(0);
+ EVT VT = N->getValueType(0), OpVT = Op.getValueType();
+ if (Op.getOpcode() == X86ISD::VZEXT_LOAD &&
+ VT.getVectorElementType().getSizeInBits() ==
+ OpVT.getVectorElementType().getSizeInBits()) {
+ return DAG.getNode(ISD::BITCAST, N->getDebugLoc(), VT, Op);
+ }
+ return SDValue();
+}
+
+static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ EVT VT = N->getValueType(0);
+ if (!VT.isVector())
+ return SDValue();
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
+ DebugLoc dl = N->getDebugLoc();
+
+ // The SIGN_EXTEND_INREG to v4i64 is expensive operation on the
+ // both SSE and AVX2 since there is no sign-extended shift right
+ // operation on a vector with 64-bit elements.
+ //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
+ // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT)))
+ if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
+ N0.getOpcode() == ISD::SIGN_EXTEND)) {
+ SDValue N00 = N0.getOperand(0);
+
+ // EXTLOAD has a better solution on AVX2,
+ // it may be replaced with X86ISD::VSEXT node.
+ if (N00.getOpcode() == ISD::LOAD && Subtarget->hasInt256())
+ if (!ISD::isNormalLoad(N00.getNode()))
+ return SDValue();
+
+ if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
+ SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
+ N00, N1);
+ return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
+ }
+ }
+ return SDValue();
+}
+
+static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget *Subtarget) {
+ if (!DCI.isBeforeLegalizeOps())
+ return SDValue();
+
+ if (!Subtarget->hasFp256())
+ return SDValue();
+
+ EVT VT = N->getValueType(0);
+ if (VT.isVector() && VT.getSizeInBits() == 256) {
+ SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget);
+ if (R.getNode())
+ return R;
+ }
+
+ return SDValue();
+}
+
+static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget* Subtarget) {
+ DebugLoc dl = N->getDebugLoc();
+ EVT VT = N->getValueType(0);
+
+ // Let legalize expand this if it isn't a legal type yet.
+ if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
+ return SDValue();
+
+ EVT ScalarVT = VT.getScalarType();
+ if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
+ (!Subtarget->hasFMA() && !Subtarget->hasFMA4()))
+ return SDValue();
+
+ SDValue A = N->getOperand(0);
+ SDValue B = N->getOperand(1);
+ SDValue C = N->getOperand(2);
+
+ bool NegA = (A.getOpcode() == ISD::FNEG);
+ bool NegB = (B.getOpcode() == ISD::FNEG);
+ bool NegC = (C.getOpcode() == ISD::FNEG);
+
+ // Negative multiplication when NegA xor NegB
+ bool NegMul = (NegA != NegB);
+ if (NegA)
+ A = A.getOperand(0);
+ if (NegB)
+ B = B.getOperand(0);
+ if (NegC)
+ C = C.getOperand(0);
+
+ unsigned Opcode;
+ if (!NegMul)
+ Opcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB;
+ else
+ Opcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;
+
+ return DAG.getNode(Opcode, dl, VT, A, B, C);
+}
+
+static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget *Subtarget) {
+ // (i32 zext (and (i8 x86isd::setcc_carry), 1)) ->
+ // (and (i32 x86isd::setcc_carry), 1)
+ // This eliminates the zext. This transformation is necessary because
+ // ISD::SETCC is always legalized to i8.
+ DebugLoc dl = N->getDebugLoc();
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+
+ if (N0.getOpcode() == ISD::AND &&
+ N0.hasOneUse() &&
+ N0.getOperand(0).hasOneUse()) {
+ SDValue N00 = N0.getOperand(0);
+ if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
+ if (!C || C->getZExtValue() != 1)
+ return SDValue();
+ return DAG.getNode(ISD::AND, dl, VT,
+ DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
+ N00.getOperand(0), N00.getOperand(1)),
+ DAG.getConstant(1, VT));
+ }
+ }
+
+ if (VT.is256BitVector()) {
+ SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget);
+ if (R.getNode())
+ return R;
+ }
+
+ return SDValue();
+}
+
+// Optimize x == -y --> x+y == 0
+// x != -y --> x+y != 0
+static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG) {
+ ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+
+ if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB)
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(LHS.getOperand(0)))
+ if (C->getAPIntValue() == 0 && LHS.hasOneUse()) {
+ SDValue addV = DAG.getNode(ISD::ADD, N->getDebugLoc(),
+ LHS.getValueType(), RHS, LHS.getOperand(1));
+ return DAG.getSetCC(N->getDebugLoc(), N->getValueType(0),
+ addV, DAG.getConstant(0, addV.getValueType()), CC);
+ }
+ if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB)
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS.getOperand(0)))
+ if (C->getAPIntValue() == 0 && RHS.hasOneUse()) {
+ SDValue addV = DAG.getNode(ISD::ADD, N->getDebugLoc(),
+ RHS.getValueType(), LHS, RHS.getOperand(1));
+ return DAG.getSetCC(N->getDebugLoc(), N->getValueType(0),
+ addV, DAG.getConstant(0, addV.getValueType()), CC);
+ }
+ return SDValue();
+}
+
+// Helper function of PerformSETCCCombine. It is to materialize "setb reg"
+// as "sbb reg,reg", since it can be extended without zext and produces
+// an all-ones bit which is more useful than 0/1 in some cases.
+static SDValue MaterializeSETB(DebugLoc DL, SDValue EFLAGS, SelectionDAG &DAG) {
+ return DAG.getNode(ISD::AND, DL, MVT::i8,
+ DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
+ DAG.getConstant(X86::COND_B, MVT::i8), EFLAGS),
+ DAG.getConstant(1, MVT::i8));
+}
+
+// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
+static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget *Subtarget) {
+ DebugLoc DL = N->getDebugLoc();
+ X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
+ SDValue EFLAGS = N->getOperand(1);
+
+ if (CC == X86::COND_A) {
+ // Try to convert COND_A into COND_B in an attempt to facilitate
+ // materializing "setb reg".
+ //
+ // Do not flip "e > c", where "c" is a constant, because Cmp instruction
+ // cannot take an immediate as its first operand.
+ //
+ if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
+ EFLAGS.getValueType().isInteger() &&
+ !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
+ SDValue NewSub = DAG.getNode(X86ISD::SUB, EFLAGS.getDebugLoc(),
+ EFLAGS.getNode()->getVTList(),
+ EFLAGS.getOperand(1), EFLAGS.getOperand(0));
+ SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
+ return MaterializeSETB(DL, NewEFLAGS, DAG);
+ }
+ }
+
+ // Materialize "setb reg" as "sbb reg,reg", since it can be extended without
+ // a zext and produces an all-ones bit which is more useful than 0/1 in some
+ // cases.
+ if (CC == X86::COND_B)
+ return MaterializeSETB(DL, EFLAGS, DAG);
+
+ SDValue Flags;
+
+ Flags = checkBoolTestSetCCCombine(EFLAGS, CC);
+ if (Flags.getNode()) {
+ SDValue Cond = DAG.getConstant(CC, MVT::i8);
+ return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags);
+ }
+
+ return SDValue();
+}
+
+// Optimize branch condition evaluation.
+//
+static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget *Subtarget) {
+ DebugLoc DL = N->getDebugLoc();
+ SDValue Chain = N->getOperand(0);
+ SDValue Dest = N->getOperand(1);
+ SDValue EFLAGS = N->getOperand(3);
+ X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
+
+ SDValue Flags;
+
+ Flags = checkBoolTestSetCCCombine(EFLAGS, CC);
+ if (Flags.getNode()) {
+ SDValue Cond = DAG.getConstant(CC, MVT::i8);
+ return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), Chain, Dest, Cond,
+ Flags);
+ }
+
+ return SDValue();
+}
+
+static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
+ const X86TargetLowering *XTLI) {
+ SDValue Op0 = N->getOperand(0);
+ EVT InVT = Op0->getValueType(0);
+
+ // SINT_TO_FP(v4i8) -> SINT_TO_FP(SEXT(v4i8 to v4i32))
+ if (InVT == MVT::v8i8 || InVT == MVT::v4i8) {
+ DebugLoc dl = N->getDebugLoc();
+ MVT DstVT = InVT == MVT::v4i8 ? MVT::v4i32 : MVT::v8i32;
+ SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
+ return DAG.getNode(ISD::SINT_TO_FP, dl, N->getValueType(0), P);
+ }
+
+ // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
+ // a 32-bit target where SSE doesn't support i64->FP operations.
+ if (Op0.getOpcode() == ISD::LOAD) {
+ LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
+ EVT VT = Ld->getValueType(0);
+ if (!Ld->isVolatile() && !N->getValueType(0).isVector() &&
+ ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
+ !XTLI->getSubtarget()->is64Bit() &&
+ !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
+ SDValue FILDChain = XTLI->BuildFILD(SDValue(N, 0), Ld->getValueType(0),
+ Ld->getChain(), Op0, DAG);
+ DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
+ return FILDChain;
+ }
+ }
+ return SDValue();
+}
+
+// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
+static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG,
+ X86TargetLowering::DAGCombinerInfo &DCI) {
+ // If the LHS and RHS of the ADC node are zero, then it can't overflow and
+ // the result is either zero or one (depending on the input carry bit).
+ // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
+ if (X86::isZeroNode(N->getOperand(0)) &&
+ X86::isZeroNode(N->getOperand(1)) &&
+ // We don't have a good way to replace an EFLAGS use, so only do this when
+ // dead right now.
+ SDValue(N, 1).use_empty()) {
+ DebugLoc DL = N->getDebugLoc();
+ EVT VT = N->getValueType(0);
+ SDValue CarryOut = DAG.getConstant(0, N->getValueType(1));
+ SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
+ DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
+ DAG.getConstant(X86::COND_B,MVT::i8),
+ N->getOperand(2)),
+ DAG.getConstant(1, VT));
+ return DCI.CombineTo(N, Res1, CarryOut);
+ }
+
+ return SDValue();
+}
+
+// fold (add Y, (sete X, 0)) -> adc 0, Y
+// (add Y, (setne X, 0)) -> sbb -1, Y
+// (sub (sete X, 0), Y) -> sbb 0, Y
+// (sub (setne X, 0), Y) -> adc -1, Y
+static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) {
+ DebugLoc DL = N->getDebugLoc();
+
+ // Look through ZExts.
+ SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 1 : 0);
+ if (Ext.getOpcode() != ISD::ZERO_EXTEND || !Ext.hasOneUse())
+ return SDValue();
+
+ SDValue SetCC = Ext.getOperand(0);
+ if (SetCC.getOpcode() != X86ISD::SETCC || !SetCC.hasOneUse())
+ return SDValue();
+
+ X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0);
+ if (CC != X86::COND_E && CC != X86::COND_NE)
+ return SDValue();
+
+ SDValue Cmp = SetCC.getOperand(1);
+ if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
+ !X86::isZeroNode(Cmp.getOperand(1)) ||
+ !Cmp.getOperand(0).getValueType().isInteger())
+ return SDValue();
+
+ SDValue CmpOp0 = Cmp.getOperand(0);
+ SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0,
+ DAG.getConstant(1, CmpOp0.getValueType()));
+
+ SDValue OtherVal = N->getOperand(N->getOpcode() == ISD::SUB ? 0 : 1);
+ if (CC == X86::COND_NE)
+ return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::ADC : X86ISD::SBB,
+ DL, OtherVal.getValueType(), OtherVal,
+ DAG.getConstant(-1ULL, OtherVal.getValueType()), NewCmp);
+ return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::SBB : X86ISD::ADC,
+ DL, OtherVal.getValueType(), OtherVal,
+ DAG.getConstant(0, OtherVal.getValueType()), NewCmp);
+}
+
+/// PerformADDCombine - Do target-specific dag combines on integer adds.
+static SDValue PerformAddCombine(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ EVT VT = N->getValueType(0);
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
+
+ // Try to synthesize horizontal adds from adds of shuffles.
+ if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
+ (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
+ isHorizontalBinOp(Op0, Op1, true))
+ return DAG.getNode(X86ISD::HADD, N->getDebugLoc(), VT, Op0, Op1);
+
+ return OptimizeConditionalInDecrement(N, DAG);
+}
+
+static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
+
+ // X86 can't encode an immediate LHS of a sub. See if we can push the
+ // negation into a preceding instruction.
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
+ // If the RHS of the sub is a XOR with one use and a constant, invert the
+ // immediate. Then add one to the LHS of the sub so we can turn
+ // X-Y -> X+~Y+1, saving one register.
+ if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
+ isa<ConstantSDNode>(Op1.getOperand(1))) {
+ APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
+ EVT VT = Op0.getValueType();
+ SDValue NewXor = DAG.getNode(ISD::XOR, Op1.getDebugLoc(), VT,
+ Op1.getOperand(0),
+ DAG.getConstant(~XorC, VT));
+ return DAG.getNode(ISD::ADD, N->getDebugLoc(), VT, NewXor,
+ DAG.getConstant(C->getAPIntValue()+1, VT));
+ }
+ }
+
+ // Try to synthesize horizontal adds from adds of shuffles.
+ EVT VT = N->getValueType(0);
+ if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
+ (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
+ isHorizontalBinOp(Op0, Op1, true))
+ return DAG.getNode(X86ISD::HSUB, N->getDebugLoc(), VT, Op0, Op1);
+
+ return OptimizeConditionalInDecrement(N, DAG);
+}
+
+/// performVZEXTCombine - Performs build vector combines
+static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget *Subtarget) {
+ // (vzext (bitcast (vzext (x)) -> (vzext x)
+ SDValue In = N->getOperand(0);
+ while (In.getOpcode() == ISD::BITCAST)
+ In = In.getOperand(0);
+
+ if (In.getOpcode() != X86ISD::VZEXT)
+ return SDValue();
+
+ return DAG.getNode(X86ISD::VZEXT, N->getDebugLoc(), N->getValueType(0),
+ In.getOperand(0));
+}
+
+SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+ switch (N->getOpcode()) {
+ default: break;
+ case ISD::EXTRACT_VECTOR_ELT:
+ return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, DCI);
+ case ISD::VSELECT:
+ case ISD::SELECT: return PerformSELECTCombine(N, DAG, DCI, Subtarget);
+ case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI, Subtarget);
+ case ISD::ADD: return PerformAddCombine(N, DAG, Subtarget);
+ case ISD::SUB: return PerformSubCombine(N, DAG, Subtarget);
+ case X86ISD::ADC: return PerformADCCombine(N, DAG, DCI);
+ case ISD::MUL: return PerformMulCombine(N, DAG, DCI);
+ case ISD::SHL:
+ case ISD::SRA:
+ case ISD::SRL: return PerformShiftCombine(N, DAG, DCI, Subtarget);
+ case ISD::AND: return PerformAndCombine(N, DAG, DCI, Subtarget);
+ case ISD::OR: return PerformOrCombine(N, DAG, DCI, Subtarget);
+ case ISD::XOR: return PerformXorCombine(N, DAG, DCI, Subtarget);
+ case ISD::LOAD: return PerformLOADCombine(N, DAG, DCI, Subtarget);
+ case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget);
+ case ISD::SINT_TO_FP: return PerformSINT_TO_FPCombine(N, DAG, this);
+ case ISD::FADD: return PerformFADDCombine(N, DAG, Subtarget);
+ case ISD::FSUB: return PerformFSUBCombine(N, DAG, Subtarget);
+ case X86ISD::FXOR:
+ case X86ISD::FOR: return PerformFORCombine(N, DAG);
+ case X86ISD::FMIN:
+ case X86ISD::FMAX: return PerformFMinFMaxCombine(N, DAG);
+ case X86ISD::FAND: return PerformFANDCombine(N, DAG);
+ case X86ISD::BT: return PerformBTCombine(N, DAG, DCI);
+ case X86ISD::VZEXT_MOVL: return PerformVZEXT_MOVLCombine(N, DAG);
+ case ISD::ANY_EXTEND:
+ case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG, DCI, Subtarget);
+ case ISD::SIGN_EXTEND: return PerformSExtCombine(N, DAG, DCI, Subtarget);
+ case ISD::SIGN_EXTEND_INREG: return PerformSIGN_EXTEND_INREGCombine(N, DAG, Subtarget);
+ case ISD::TRUNCATE: return PerformTruncateCombine(N, DAG,DCI,Subtarget);
+ case ISD::SETCC: return PerformISDSETCCCombine(N, DAG);
+ case X86ISD::SETCC: return PerformSETCCCombine(N, DAG, DCI, Subtarget);
+ case X86ISD::BRCOND: return PerformBrCondCombine(N, DAG, DCI, Subtarget);
+ case X86ISD::VZEXT: return performVZEXTCombine(N, DAG, DCI, Subtarget);
+ case X86ISD::SHUFP: // Handle all target specific shuffles
+ case X86ISD::PALIGNR:
+ case X86ISD::UNPCKH:
+ case X86ISD::UNPCKL:
+ case X86ISD::MOVHLPS:
+ case X86ISD::MOVLHPS:
+ case X86ISD::PSHUFD:
+ case X86ISD::PSHUFHW:
+ case X86ISD::PSHUFLW:
+ case X86ISD::MOVSS:
+ case X86ISD::MOVSD:
+ case X86ISD::VPERMILP:
+ case X86ISD::VPERM2X128:
+ case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget);
+ case ISD::FMA: return PerformFMACombine(N, DAG, Subtarget);
+ }
+
+ return SDValue();
+}
+
+/// isTypeDesirableForOp - Return true if the target has native support for
+/// the specified value type and it is 'desirable' to use the type for the
+/// given node type. e.g. On x86 i16 is legal, but undesirable since i16
+/// instruction encodings are longer and some i16 instructions are slow.
+bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
+ if (!isTypeLegal(VT))
+ return false;
+ if (VT != MVT::i16)
+ return true;
+
+ switch (Opc) {
+ default:
+ return true;
+ case ISD::LOAD:
+ case ISD::SIGN_EXTEND:
+ case ISD::ZERO_EXTEND:
+ case ISD::ANY_EXTEND:
+ case ISD::SHL:
+ case ISD::SRL:
+ case ISD::SUB:
+ case ISD::ADD:
+ case ISD::MUL:
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR:
+ return false;
+ }
+}
+
+/// IsDesirableToPromoteOp - This method query the target whether it is
+/// beneficial for dag combiner to promote the specified node. If true, it
+/// should return the desired promotion type by reference.
+bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
+ EVT VT = Op.getValueType();
+ if (VT != MVT::i16)
+ return false;
+
+ bool Promote = false;
+ bool Commute = false;
+ switch (Op.getOpcode()) {
+ default: break;
+ case ISD::LOAD: {
+ LoadSDNode *LD = cast<LoadSDNode>(Op);
+ // If the non-extending load has a single use and it's not live out, then it
+ // might be folded.
+ if (LD->getExtensionType() == ISD::NON_EXTLOAD /*&&
+ Op.hasOneUse()*/) {
+ for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
+ UE = Op.getNode()->use_end(); UI != UE; ++UI) {
+ // The only case where we'd want to promote LOAD (rather then it being
+ // promoted as an operand is when it's only use is liveout.
+ if (UI->getOpcode() != ISD::CopyToReg)
+ return false;
+ }
+ }
+ Promote = true;
+ break;
+ }
+ case ISD::SIGN_EXTEND:
+ case ISD::ZERO_EXTEND:
+ case ISD::ANY_EXTEND:
+ Promote = true;
+ break;
+ case ISD::SHL:
+ case ISD::SRL: {
+ SDValue N0 = Op.getOperand(0);
+ // Look out for (store (shl (load), x)).
+ if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
+ return false;
+ Promote = true;
+ break;
+ }
+ case ISD::ADD:
+ case ISD::MUL:
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR:
+ Commute = true;
+ // fallthrough
+ case ISD::SUB: {
+ SDValue N0 = Op.getOperand(0);
+ SDValue N1 = Op.getOperand(1);
+ if (!Commute && MayFoldLoad(N1))
+ return false;
+ // Avoid disabling potential load folding opportunities.
+ if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
+ return false;
+ if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
+ return false;
+ Promote = true;
+ }
+ }
+
+ PVT = MVT::i32;
+ return Promote;
+}
+
+//===----------------------------------------------------------------------===//
+// X86 Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+namespace {
+ // Helper to match a string separated by whitespace.
+ bool matchAsmImpl(StringRef s, ArrayRef<const StringRef *> args) {
+ s = s.substr(s.find_first_not_of(" \t")); // Skip leading whitespace.
+
+ for (unsigned i = 0, e = args.size(); i != e; ++i) {
+ StringRef piece(*args[i]);
+ if (!s.startswith(piece)) // Check if the piece matches.
+ return false;
+
+ s = s.substr(piece.size());
+ StringRef::size_type pos = s.find_first_not_of(" \t");
+ if (pos == 0) // We matched a prefix.
+ return false;
+
+ s = s.substr(pos);
+ }
+
+ return s.empty();
+ }
+ const VariadicFunction1<bool, StringRef, StringRef, matchAsmImpl> matchAsm={};
+}
+
+bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
+ InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
+
+ std::string AsmStr = IA->getAsmString();
+
+ IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
+ if (!Ty || Ty->getBitWidth() % 16 != 0)
+ return false;
+
+ // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
+ SmallVector<StringRef, 4> AsmPieces;
+ SplitString(AsmStr, AsmPieces, ";\n");
+
+ switch (AsmPieces.size()) {
+ default: return false;
+ case 1:
+ // FIXME: this should verify that we are targeting a 486 or better. If not,
+ // we will turn this bswap into something that will be lowered to logical
+ // ops instead of emitting the bswap asm. For now, we don't support 486 or
+ // lower so don't worry about this.
+ // bswap $0
+ if (matchAsm(AsmPieces[0], "bswap", "$0") ||
+ matchAsm(AsmPieces[0], "bswapl", "$0") ||
+ matchAsm(AsmPieces[0], "bswapq", "$0") ||
+ matchAsm(AsmPieces[0], "bswap", "${0:q}") ||
+ matchAsm(AsmPieces[0], "bswapl", "${0:q}") ||
+ matchAsm(AsmPieces[0], "bswapq", "${0:q}")) {
+ // No need to check constraints, nothing other than the equivalent of
+ // "=r,0" would be valid here.
+ return IntrinsicLowering::LowerToByteSwap(CI);
+ }
+
+ // rorw $$8, ${0:w} --> llvm.bswap.i16
+ if (CI->getType()->isIntegerTy(16) &&
+ IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
+ (matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") ||
+ matchAsm(AsmPieces[0], "rolw", "$$8,", "${0:w}"))) {
+ AsmPieces.clear();
+ const std::string &ConstraintsStr = IA->getConstraintString();
+ SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
+ array_pod_sort(AsmPieces.begin(), AsmPieces.end());
+ if (AsmPieces.size() == 4 &&
+ AsmPieces[0] == "~{cc}" &&
+ AsmPieces[1] == "~{dirflag}" &&
+ AsmPieces[2] == "~{flags}" &&
+ AsmPieces[3] == "~{fpsr}")
+ return IntrinsicLowering::LowerToByteSwap(CI);
+ }
+ break;
+ case 3:
+ if (CI->getType()->isIntegerTy(32) &&
+ IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
+ matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") &&
+ matchAsm(AsmPieces[1], "rorl", "$$16,", "$0") &&
+ matchAsm(AsmPieces[2], "rorw", "$$8,", "${0:w}")) {
+ AsmPieces.clear();
+ const std::string &ConstraintsStr = IA->getConstraintString();
+ SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
+ array_pod_sort(AsmPieces.begin(), AsmPieces.end());
+ if (AsmPieces.size() == 4 &&
+ AsmPieces[0] == "~{cc}" &&
+ AsmPieces[1] == "~{dirflag}" &&
+ AsmPieces[2] == "~{flags}" &&
+ AsmPieces[3] == "~{fpsr}")
+ return IntrinsicLowering::LowerToByteSwap(CI);
+ }
+
+ if (CI->getType()->isIntegerTy(64)) {
+ InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
+ if (Constraints.size() >= 2 &&
+ Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
+ Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
+ // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
+ if (matchAsm(AsmPieces[0], "bswap", "%eax") &&
+ matchAsm(AsmPieces[1], "bswap", "%edx") &&
+ matchAsm(AsmPieces[2], "xchgl", "%eax,", "%edx"))
+ return IntrinsicLowering::LowerToByteSwap(CI);
+ }
+ }
+ break;
+ }
+ return false;
+}
+
+/// getConstraintType - Given a constraint letter, return the type of
+/// constraint it is for this target.
+X86TargetLowering::ConstraintType
+X86TargetLowering::getConstraintType(const std::string &Constraint) const {
+ if (Constraint.size() == 1) {
+ switch (Constraint[0]) {
+ case 'R':
+ case 'q':
+ case 'Q':
+ case 'f':
+ case 't':
+ case 'u':
+ case 'y':
+ case 'x':
+ case 'Y':
+ case 'l':
+ return C_RegisterClass;
+ case 'a':
+ case 'b':
+ case 'c':
+ case 'd':
+ case 'S':
+ case 'D':
+ case 'A':
+ return C_Register;
+ case 'I':
+ case 'J':
+ case 'K':
+ case 'L':
+ case 'M':
+ case 'N':
+ case 'G':
+ case 'C':
+ case 'e':
+ case 'Z':
+ return C_Other;
+ default:
+ break;
+ }
+ }
+ return TargetLowering::getConstraintType(Constraint);
+}
+
+/// Examine constraint type and operand type and determine a weight value.
+/// This object must already have been set up with the operand type
+/// and the current alternative constraint selected.
+TargetLowering::ConstraintWeight
+ X86TargetLowering::getSingleConstraintMatchWeight(
+ AsmOperandInfo &info, const char *constraint) const {
+ ConstraintWeight weight = CW_Invalid;
+ Value *CallOperandVal = info.CallOperandVal;
+ // If we don't have a value, we can't do a match,
+ // but allow it at the lowest weight.
+ if (CallOperandVal == NULL)
+ return CW_Default;
+ Type *type = CallOperandVal->getType();
+ // Look at the constraint type.
+ switch (*constraint) {
+ default:
+ weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
+ case 'R':
+ case 'q':
+ case 'Q':
+ case 'a':
+ case 'b':
+ case 'c':
+ case 'd':
+ case 'S':
+ case 'D':
+ case 'A':
+ if (CallOperandVal->getType()->isIntegerTy())
+ weight = CW_SpecificReg;
+ break;
+ case 'f':
+ case 't':
+ case 'u':
+ if (type->isFloatingPointTy())
+ weight = CW_SpecificReg;
+ break;
+ case 'y':
+ if (type->isX86_MMXTy() && Subtarget->hasMMX())
+ weight = CW_SpecificReg;
+ break;
+ case 'x':
+ case 'Y':
+ if (((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasSSE1()) ||
+ ((type->getPrimitiveSizeInBits() == 256) && Subtarget->hasFp256()))
+ weight = CW_Register;
+ break;
+ case 'I':
+ if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
+ if (C->getZExtValue() <= 31)
+ weight = CW_Constant;
+ }
+ break;
+ case 'J':
+ if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+ if (C->getZExtValue() <= 63)
+ weight = CW_Constant;
+ }
+ break;
+ case 'K':
+ if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+ if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
+ weight = CW_Constant;
+ }
+ break;
+ case 'L':
+ if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+ if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
+ weight = CW_Constant;
+ }
+ break;
+ case 'M':
+ if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+ if (C->getZExtValue() <= 3)
+ weight = CW_Constant;
+ }
+ break;
+ case 'N':
+ if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+ if (C->getZExtValue() <= 0xff)
+ weight = CW_Constant;
+ }
+ break;
+ case 'G':
+ case 'C':
+ if (dyn_cast<ConstantFP>(CallOperandVal)) {
+ weight = CW_Constant;
+ }
+ break;
+ case 'e':
+ if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+ if ((C->getSExtValue() >= -0x80000000LL) &&
+ (C->getSExtValue() <= 0x7fffffffLL))
+ weight = CW_Constant;
+ }
+ break;
+ case 'Z':
+ if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+ if (C->getZExtValue() <= 0xffffffff)
+ weight = CW_Constant;
+ }
+ break;
+ }
+ return weight;
+}
+
+/// LowerXConstraint - try to replace an X constraint, which matches anything,
+/// with another that has more specific requirements based on the type of the
+/// corresponding operand.
+const char *X86TargetLowering::
+LowerXConstraint(EVT ConstraintVT) const {
+ // FP X constraints get lowered to SSE1/2 registers if available, otherwise
+ // 'f' like normal targets.
+ if (ConstraintVT.isFloatingPoint()) {
+ if (Subtarget->hasSSE2())
+ return "Y";
+ if (Subtarget->hasSSE1())
+ return "x";
+ }
+
+ return TargetLowering::LowerXConstraint(ConstraintVT);
+}
+
+/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
+/// vector. If it is invalid, don't add anything to Ops.
+void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
+ std::string &Constraint,
+ std::vector<SDValue>&Ops,
+ SelectionDAG &DAG) const {
+ SDValue Result(0, 0);
+
+ // Only support length 1 constraints for now.
+ if (Constraint.length() > 1) return;
+
+ char ConstraintLetter = Constraint[0];
+ switch (ConstraintLetter) {
+ default: break;
+ case 'I':
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ if (C->getZExtValue() <= 31) {
+ Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
+ break;
+ }
+ }
+ return;
+ case 'J':
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ if (C->getZExtValue() <= 63) {
+ Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
+ break;
+ }
+ }
+ return;
+ case 'K':
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ if (isInt<8>(C->getSExtValue())) {
+ Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
+ break;
+ }
+ }
+ return;
+ case 'N':
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ if (C->getZExtValue() <= 255) {
+ Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
+ break;
+ }
+ }
+ return;
+ case 'e': {
+ // 32-bit signed value
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
+ C->getSExtValue())) {
+ // Widen to 64 bits here to get it sign extended.
+ Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64);
+ break;
+ }
+ // FIXME gcc accepts some relocatable values here too, but only in certain
+ // memory models; it's complicated.
+ }
+ return;
+ }
+ case 'Z': {
+ // 32-bit unsigned value
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
+ C->getZExtValue())) {
+ Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
+ break;
+ }
+ }
+ // FIXME gcc accepts some relocatable values here too, but only in certain
+ // memory models; it's complicated.
+ return;
+ }
+ case 'i': {
+ // Literal immediates are always ok.
+ if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
+ // Widen to 64 bits here to get it sign extended.
+ Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64);
+ break;
+ }
+
+ // In any sort of PIC mode addresses need to be computed at runtime by
+ // adding in a register or some sort of table lookup. These can't
+ // be used as immediates.
+ if (Subtarget->isPICStyleGOT() || Subtarget->isPICStyleStubPIC())
+ return;
+
+ // If we are in non-pic codegen mode, we allow the address of a global (with
+ // an optional displacement) to be used with 'i'.
+ GlobalAddressSDNode *GA = 0;
+ int64_t Offset = 0;
+
+ // Match either (GA), (GA+C), (GA+C1+C2), etc.
+ while (1) {
+ if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
+ Offset += GA->getOffset();
+ break;
+ } else if (Op.getOpcode() == ISD::ADD) {
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+ Offset += C->getZExtValue();
+ Op = Op.getOperand(0);
+ continue;
+ }
+ } else if (Op.getOpcode() == ISD::SUB) {
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+ Offset += -C->getZExtValue();
+ Op = Op.getOperand(0);
+ continue;
+ }
+ }
+
+ // Otherwise, this isn't something we can handle, reject it.
+ return;
+ }
+
+ const GlobalValue *GV = GA->getGlobal();
+ // If we require an extra load to get this address, as in PIC mode, we
+ // can't accept it.
+ if (isGlobalStubReference(Subtarget->ClassifyGlobalReference(GV,
+ getTargetMachine())))
+ return;
+
+ Result = DAG.getTargetGlobalAddress(GV, Op.getDebugLoc(),
+ GA->getValueType(0), Offset);
+ break;
+ }
+ }
+
+ if (Result.getNode()) {
+ Ops.push_back(Result);
+ return;
+ }
+ return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
+}
+
+std::pair<unsigned, const TargetRegisterClass*>
+X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
+ EVT VT) const {
+ // First, see if this is a constraint that directly corresponds to an LLVM
+ // register class.
+ if (Constraint.size() == 1) {
+ // GCC Constraint Letters
+ switch (Constraint[0]) {
+ default: break;
+ // TODO: Slight differences here in allocation order and leaving
+ // RIP in the class. Do they matter any more here than they do
+ // in the normal allocation?
+ case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
+ if (Subtarget->is64Bit()) {
+ if (VT == MVT::i32 || VT == MVT::f32)
+ return std::make_pair(0U, &X86::GR32RegClass);
+ if (VT == MVT::i16)
+ return std::make_pair(0U, &X86::GR16RegClass);
+ if (VT == MVT::i8 || VT == MVT::i1)
+ return std::make_pair(0U, &X86::GR8RegClass);
+ if (VT == MVT::i64 || VT == MVT::f64)
+ return std::make_pair(0U, &X86::GR64RegClass);
+ break;
+ }
+ // 32-bit fallthrough
+ case 'Q': // Q_REGS
+ if (VT == MVT::i32 || VT == MVT::f32)
+ return std::make_pair(0U, &X86::GR32_ABCDRegClass);
+ if (VT == MVT::i16)
+ return std::make_pair(0U, &X86::GR16_ABCDRegClass);
+ if (VT == MVT::i8 || VT == MVT::i1)
+ return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
+ if (VT == MVT::i64)
+ return std::make_pair(0U, &X86::GR64_ABCDRegClass);
+ break;
+ case 'r': // GENERAL_REGS
+ case 'l': // INDEX_REGS
+ if (VT == MVT::i8 || VT == MVT::i1)
+ return std::make_pair(0U, &X86::GR8RegClass);
+ if (VT == MVT::i16)
+ return std::make_pair(0U, &X86::GR16RegClass);
+ if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget->is64Bit())
+ return std::make_pair(0U, &X86::GR32RegClass);
+ return std::make_pair(0U, &X86::GR64RegClass);
+ case 'R': // LEGACY_REGS
+ if (VT == MVT::i8 || VT == MVT::i1)
+ return std::make_pair(0U, &X86::GR8_NOREXRegClass);
+ if (VT == MVT::i16)
+ return std::make_pair(0U, &X86::GR16_NOREXRegClass);
+ if (VT == MVT::i32 || !Subtarget->is64Bit())
+ return std::make_pair(0U, &X86::GR32_NOREXRegClass);
+ return std::make_pair(0U, &X86::GR64_NOREXRegClass);
+ case 'f': // FP Stack registers.
+ // If SSE is enabled for this VT, use f80 to ensure the isel moves the
+ // value to the correct fpstack register class.
+ if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
+ return std::make_pair(0U, &X86::RFP32RegClass);
+ if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
+ return std::make_pair(0U, &X86::RFP64RegClass);
+ return std::make_pair(0U, &X86::RFP80RegClass);
+ case 'y': // MMX_REGS if MMX allowed.
+ if (!Subtarget->hasMMX()) break;
+ return std::make_pair(0U, &X86::VR64RegClass);
+ case 'Y': // SSE_REGS if SSE2 allowed
+ if (!Subtarget->hasSSE2()) break;
+ // FALL THROUGH.
+ case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
+ if (!Subtarget->hasSSE1()) break;
+
+ switch (VT.getSimpleVT().SimpleTy) {
+ default: break;
+ // Scalar SSE types.
+ case MVT::f32:
+ case MVT::i32:
+ return std::make_pair(0U, &X86::FR32RegClass);
+ case MVT::f64:
+ case MVT::i64:
+ return std::make_pair(0U, &X86::FR64RegClass);
+ // Vector types.
+ case MVT::v16i8:
+ case MVT::v8i16:
+ case MVT::v4i32:
+ case MVT::v2i64:
+ case MVT::v4f32:
+ case MVT::v2f64:
+ return std::make_pair(0U, &X86::VR128RegClass);
+ // AVX types.
+ case MVT::v32i8:
+ case MVT::v16i16:
+ case MVT::v8i32:
+ case MVT::v4i64:
+ case MVT::v8f32:
+ case MVT::v4f64:
+ return std::make_pair(0U, &X86::VR256RegClass);
+ }
+ break;
+ }
+ }
+
+ // Use the default implementation in TargetLowering to convert the register
+ // constraint into a member of a register class.
+ std::pair<unsigned, const TargetRegisterClass*> Res;
+ Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
+
+ // Not found as a standard register?
+ if (Res.second == 0) {
+ // Map st(0) -> st(7) -> ST0
+ if (Constraint.size() == 7 && Constraint[0] == '{' &&
+ tolower(Constraint[1]) == 's' &&
+ tolower(Constraint[2]) == 't' &&
+ Constraint[3] == '(' &&
+ (Constraint[4] >= '0' && Constraint[4] <= '7') &&
+ Constraint[5] == ')' &&
+ Constraint[6] == '}') {
+
+ Res.first = X86::ST0+Constraint[4]-'0';
+ Res.second = &X86::RFP80RegClass;
+ return Res;
+ }
+
+ // GCC allows "st(0)" to be called just plain "st".
+ if (StringRef("{st}").equals_lower(Constraint)) {
+ Res.first = X86::ST0;
+ Res.second = &X86::RFP80RegClass;
+ return Res;
+ }
+
+ // flags -> EFLAGS
+ if (StringRef("{flags}").equals_lower(Constraint)) {
+ Res.first = X86::EFLAGS;
+ Res.second = &X86::CCRRegClass;
+ return Res;
+ }
+
+ // 'A' means EAX + EDX.
+ if (Constraint == "A") {
+ Res.first = X86::EAX;
+ Res.second = &X86::GR32_ADRegClass;
+ return Res;
+ }
+ return Res;
+ }
+
+ // Otherwise, check to see if this is a register class of the wrong value
+ // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
+ // turn into {ax},{dx}.
+ if (Res.second->hasType(VT))
+ return Res; // Correct type already, nothing to do.
+
+ // All of the single-register GCC register classes map their values onto
+ // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp". If we
+ // really want an 8-bit or 32-bit register, map to the appropriate register
+ // class and return the appropriate register.
+ if (Res.second == &X86::GR16RegClass) {
+ if (VT == MVT::i8 || VT == MVT::i1) {
+ unsigned DestReg = 0;
+ switch (Res.first) {
+ default: break;
+ case X86::AX: DestReg = X86::AL; break;
+ case X86::DX: DestReg = X86::DL; break;
+ case X86::CX: DestReg = X86::CL; break;
+ case X86::BX: DestReg = X86::BL; break;
+ }
+ if (DestReg) {
+ Res.first = DestReg;
+ Res.second = &X86::GR8RegClass;
+ }
+ } else if (VT == MVT::i32 || VT == MVT::f32) {
+ unsigned DestReg = 0;
+ switch (Res.first) {
+ default: break;
+ case X86::AX: DestReg = X86::EAX; break;
+ case X86::DX: DestReg = X86::EDX; break;
+ case X86::CX: DestReg = X86::ECX; break;
+ case X86::BX: DestReg = X86::EBX; break;
+ case X86::SI: DestReg = X86::ESI; break;
+ case X86::DI: DestReg = X86::EDI; break;
+ case X86::BP: DestReg = X86::EBP; break;
+ case X86::SP: DestReg = X86::ESP; break;
+ }
+ if (DestReg) {
+ Res.first = DestReg;
+ Res.second = &X86::GR32RegClass;
+ }
+ } else if (VT == MVT::i64 || VT == MVT::f64) {
+ unsigned DestReg = 0;
+ switch (Res.first) {
+ default: break;
+ case X86::AX: DestReg = X86::RAX; break;
+ case X86::DX: DestReg = X86::RDX; break;
+ case X86::CX: DestReg = X86::RCX; break;
+ case X86::BX: DestReg = X86::RBX; break;
+ case X86::SI: DestReg = X86::RSI; break;
+ case X86::DI: DestReg = X86::RDI; break;
+ case X86::BP: DestReg = X86::RBP; break;
+ case X86::SP: DestReg = X86::RSP; break;
+ }
+ if (DestReg) {
+ Res.first = DestReg;
+ Res.second = &X86::GR64RegClass;
+ }
+ }
+ } else if (Res.second == &X86::FR32RegClass ||
+ Res.second == &X86::FR64RegClass ||
+ Res.second == &X86::VR128RegClass) {
+ // Handle references to XMM physical registers that got mapped into the
+ // wrong class. This can happen with constraints like {xmm0} where the
+ // target independent register mapper will just pick the first match it can
+ // find, ignoring the required type.
+
+ if (VT == MVT::f32 || VT == MVT::i32)
+ Res.second = &X86::FR32RegClass;
+ else if (VT == MVT::f64 || VT == MVT::i64)
+ Res.second = &X86::FR64RegClass;
+ else if (X86::VR128RegClass.hasType(VT))
+ Res.second = &X86::VR128RegClass;
+ else if (X86::VR256RegClass.hasType(VT))
+ Res.second = &X86::VR256RegClass;
+ }
+
+ return Res;
+}
diff --git a/contrib/llvm/lib/Target/X86/X86ISelLowering.h b/contrib/llvm/lib/Target/X86/X86ISelLowering.h
new file mode 100644
index 0000000..2727e22
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86ISelLowering.h
@@ -0,0 +1,957 @@
+//===-- X86ISelLowering.h - X86 DAG Lowering Interface ----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that X86 uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86ISELLOWERING_H
+#define X86ISELLOWERING_H
+
+#include "X86MachineFunctionInfo.h"
+#include "X86RegisterInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/FastISel.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetOptions.h"
+
+namespace llvm {
+ namespace X86ISD {
+ // X86 Specific DAG Nodes
+ enum NodeType {
+ // Start the numbering where the builtin ops leave off.
+ FIRST_NUMBER = ISD::BUILTIN_OP_END,
+
+ /// BSF - Bit scan forward.
+ /// BSR - Bit scan reverse.
+ BSF,
+ BSR,
+
+ /// SHLD, SHRD - Double shift instructions. These correspond to
+ /// X86::SHLDxx and X86::SHRDxx instructions.
+ SHLD,
+ SHRD,
+
+ /// FAND - Bitwise logical AND of floating point values. This corresponds
+ /// to X86::ANDPS or X86::ANDPD.
+ FAND,
+
+ /// FOR - Bitwise logical OR of floating point values. This corresponds
+ /// to X86::ORPS or X86::ORPD.
+ FOR,
+
+ /// FXOR - Bitwise logical XOR of floating point values. This corresponds
+ /// to X86::XORPS or X86::XORPD.
+ FXOR,
+
+ /// FSRL - Bitwise logical right shift of floating point values. These
+ /// corresponds to X86::PSRLDQ.
+ FSRL,
+
+ /// CALL - These operations represent an abstract X86 call
+ /// instruction, which includes a bunch of information. In particular the
+ /// operands of these node are:
+ ///
+ /// #0 - The incoming token chain
+ /// #1 - The callee
+ /// #2 - The number of arg bytes the caller pushes on the stack.
+ /// #3 - The number of arg bytes the callee pops off the stack.
+ /// #4 - The value to pass in AL/AX/EAX (optional)
+ /// #5 - The value to pass in DL/DX/EDX (optional)
+ ///
+ /// The result values of these nodes are:
+ ///
+ /// #0 - The outgoing token chain
+ /// #1 - The first register result value (optional)
+ /// #2 - The second register result value (optional)
+ ///
+ CALL,
+
+ /// RDTSC_DAG - This operation implements the lowering for
+ /// readcyclecounter
+ RDTSC_DAG,
+
+ /// X86 compare and logical compare instructions.
+ CMP, COMI, UCOMI,
+
+ /// X86 bit-test instructions.
+ BT,
+
+ /// X86 SetCC. Operand 0 is condition code, and operand 1 is the EFLAGS
+ /// operand, usually produced by a CMP instruction.
+ SETCC,
+
+ // Same as SETCC except it's materialized with a sbb and the value is all
+ // one's or all zero's.
+ SETCC_CARRY, // R = carry_bit ? ~0 : 0
+
+ /// X86 FP SETCC, implemented with CMP{cc}SS/CMP{cc}SD.
+ /// Operands are two FP values to compare; result is a mask of
+ /// 0s or 1s. Generally DTRT for C/C++ with NaNs.
+ FSETCCss, FSETCCsd,
+
+ /// X86 MOVMSK{pd|ps}, extracts sign bits of two or four FP values,
+ /// result in an integer GPR. Needs masking for scalar result.
+ FGETSIGNx86,
+
+ /// X86 conditional moves. Operand 0 and operand 1 are the two values
+ /// to select from. Operand 2 is the condition code, and operand 3 is the
+ /// flag operand produced by a CMP or TEST instruction. It also writes a
+ /// flag result.
+ CMOV,
+
+ /// X86 conditional branches. Operand 0 is the chain operand, operand 1
+ /// is the block to branch if condition is true, operand 2 is the
+ /// condition code, and operand 3 is the flag operand produced by a CMP
+ /// or TEST instruction.
+ BRCOND,
+
+ /// Return with a flag operand. Operand 0 is the chain operand, operand
+ /// 1 is the number of bytes of stack to pop.
+ RET_FLAG,
+
+ /// REP_STOS - Repeat fill, corresponds to X86::REP_STOSx.
+ REP_STOS,
+
+ /// REP_MOVS - Repeat move, corresponds to X86::REP_MOVSx.
+ REP_MOVS,
+
+ /// GlobalBaseReg - On Darwin, this node represents the result of the popl
+ /// at function entry, used for PIC code.
+ GlobalBaseReg,
+
+ /// Wrapper - A wrapper node for TargetConstantPool,
+ /// TargetExternalSymbol, and TargetGlobalAddress.
+ Wrapper,
+
+ /// WrapperRIP - Special wrapper used under X86-64 PIC mode for RIP
+ /// relative displacements.
+ WrapperRIP,
+
+ /// MOVDQ2Q - Copies a 64-bit value from the low word of an XMM vector
+ /// to an MMX vector. If you think this is too close to the previous
+ /// mnemonic, so do I; blame Intel.
+ MOVDQ2Q,
+
+ /// MMX_MOVD2W - Copies a 32-bit value from the low word of a MMX
+ /// vector to a GPR.
+ MMX_MOVD2W,
+
+ /// PEXTRB - Extract an 8-bit value from a vector and zero extend it to
+ /// i32, corresponds to X86::PEXTRB.
+ PEXTRB,
+
+ /// PEXTRW - Extract a 16-bit value from a vector and zero extend it to
+ /// i32, corresponds to X86::PEXTRW.
+ PEXTRW,
+
+ /// INSERTPS - Insert any element of a 4 x float vector into any element
+ /// of a destination 4 x floatvector.
+ INSERTPS,
+
+ /// PINSRB - Insert the lower 8-bits of a 32-bit value to a vector,
+ /// corresponds to X86::PINSRB.
+ PINSRB,
+
+ /// PINSRW - Insert the lower 16-bits of a 32-bit value to a vector,
+ /// corresponds to X86::PINSRW.
+ PINSRW, MMX_PINSRW,
+
+ /// PSHUFB - Shuffle 16 8-bit values within a vector.
+ PSHUFB,
+
+ /// ANDNP - Bitwise Logical AND NOT of Packed FP values.
+ ANDNP,
+
+ /// PSIGN - Copy integer sign.
+ PSIGN,
+
+ /// BLENDV - Blend where the selector is a register.
+ BLENDV,
+
+ /// BLENDI - Blend where the selector is an immediate.
+ BLENDI,
+
+ // SUBUS - Integer sub with unsigned saturation.
+ SUBUS,
+
+ /// HADD - Integer horizontal add.
+ HADD,
+
+ /// HSUB - Integer horizontal sub.
+ HSUB,
+
+ /// FHADD - Floating point horizontal add.
+ FHADD,
+
+ /// FHSUB - Floating point horizontal sub.
+ FHSUB,
+
+ /// UMAX, UMIN - Unsigned integer max and min.
+ UMAX, UMIN,
+
+ /// SMAX, SMIN - Signed integer max and min.
+ SMAX, SMIN,
+
+ /// FMAX, FMIN - Floating point max and min.
+ ///
+ FMAX, FMIN,
+
+ /// FMAXC, FMINC - Commutative FMIN and FMAX.
+ FMAXC, FMINC,
+
+ /// FRSQRT, FRCP - Floating point reciprocal-sqrt and reciprocal
+ /// approximation. Note that these typically require refinement
+ /// in order to obtain suitable precision.
+ FRSQRT, FRCP,
+
+ // TLSADDR - Thread Local Storage.
+ TLSADDR,
+
+ // TLSBASEADDR - Thread Local Storage. A call to get the start address
+ // of the TLS block for the current module.
+ TLSBASEADDR,
+
+ // TLSCALL - Thread Local Storage. When calling to an OS provided
+ // thunk at the address from an earlier relocation.
+ TLSCALL,
+
+ // EH_RETURN - Exception Handling helpers.
+ EH_RETURN,
+
+ // EH_SJLJ_SETJMP - SjLj exception handling setjmp.
+ EH_SJLJ_SETJMP,
+
+ // EH_SJLJ_LONGJMP - SjLj exception handling longjmp.
+ EH_SJLJ_LONGJMP,
+
+ /// TC_RETURN - Tail call return. See X86TargetLowering::LowerCall for
+ /// the list of operands.
+ TC_RETURN,
+
+ // VZEXT_MOVL - Vector move low and zero extend.
+ VZEXT_MOVL,
+
+ // VSEXT_MOVL - Vector move low and sign extend.
+ VSEXT_MOVL,
+
+ // VZEXT - Vector integer zero-extend.
+ VZEXT,
+
+ // VSEXT - Vector integer signed-extend.
+ VSEXT,
+
+ // VFPEXT - Vector FP extend.
+ VFPEXT,
+
+ // VFPROUND - Vector FP round.
+ VFPROUND,
+
+ // VSHL, VSRL - 128-bit vector logical left / right shift
+ VSHLDQ, VSRLDQ,
+
+ // VSHL, VSRL, VSRA - Vector shift elements
+ VSHL, VSRL, VSRA,
+
+ // VSHLI, VSRLI, VSRAI - Vector shift elements by immediate
+ VSHLI, VSRLI, VSRAI,
+
+ // CMPP - Vector packed double/float comparison.
+ CMPP,
+
+ // PCMP* - Vector integer comparisons.
+ PCMPEQ, PCMPGT,
+
+ // ADD, SUB, SMUL, etc. - Arithmetic operations with FLAGS results.
+ ADD, SUB, ADC, SBB, SMUL,
+ INC, DEC, OR, XOR, AND,
+
+ BLSI, // BLSI - Extract lowest set isolated bit
+ BLSMSK, // BLSMSK - Get mask up to lowest set bit
+ BLSR, // BLSR - Reset lowest set bit
+
+ UMUL, // LOW, HI, FLAGS = umul LHS, RHS
+
+ // MUL_IMM - X86 specific multiply by immediate.
+ MUL_IMM,
+
+ // PTEST - Vector bitwise comparisons
+ PTEST,
+
+ // TESTP - Vector packed fp sign bitwise comparisons
+ TESTP,
+
+ // Several flavors of instructions with vector shuffle behaviors.
+ PALIGNR,
+ PSHUFD,
+ PSHUFHW,
+ PSHUFLW,
+ SHUFP,
+ MOVDDUP,
+ MOVSHDUP,
+ MOVSLDUP,
+ MOVLHPS,
+ MOVLHPD,
+ MOVHLPS,
+ MOVLPS,
+ MOVLPD,
+ MOVSD,
+ MOVSS,
+ UNPCKL,
+ UNPCKH,
+ VPERMILP,
+ VPERMV,
+ VPERMI,
+ VPERM2X128,
+ VBROADCAST,
+
+ // PMULUDQ - Vector multiply packed unsigned doubleword integers
+ PMULUDQ,
+
+ // FMA nodes
+ FMADD,
+ FNMADD,
+ FMSUB,
+ FNMSUB,
+ FMADDSUB,
+ FMSUBADD,
+
+ // VASTART_SAVE_XMM_REGS - Save xmm argument registers to the stack,
+ // according to %al. An operator is needed so that this can be expanded
+ // with control flow.
+ VASTART_SAVE_XMM_REGS,
+
+ // WIN_ALLOCA - Windows's _chkstk call to do stack probing.
+ WIN_ALLOCA,
+
+ // SEG_ALLOCA - For allocating variable amounts of stack space when using
+ // segmented stacks. Check if the current stacklet has enough space, and
+ // falls back to heap allocation if not.
+ SEG_ALLOCA,
+
+ // WIN_FTOL - Windows's _ftol2 runtime routine to do fptoui.
+ WIN_FTOL,
+
+ // Memory barrier
+ MEMBARRIER,
+ MFENCE,
+ SFENCE,
+ LFENCE,
+
+ // FNSTSW16r - Store FP status word into i16 register.
+ FNSTSW16r,
+
+ // SAHF - Store contents of %ah into %eflags.
+ SAHF,
+
+ // RDRAND - Get a random integer and indicate whether it is valid in CF.
+ RDRAND,
+
+ // RDSEED - Get a NIST SP800-90B & C compliant random integer and
+ // indicate whether it is valid in CF.
+ RDSEED,
+
+ // PCMP*STRI
+ PCMPISTRI,
+ PCMPESTRI,
+
+ // XTEST - Test if in transactional execution.
+ XTEST,
+
+ // ATOMADD64_DAG, ATOMSUB64_DAG, ATOMOR64_DAG, ATOMAND64_DAG,
+ // ATOMXOR64_DAG, ATOMNAND64_DAG, ATOMSWAP64_DAG -
+ // Atomic 64-bit binary operations.
+ ATOMADD64_DAG = ISD::FIRST_TARGET_MEMORY_OPCODE,
+ ATOMSUB64_DAG,
+ ATOMOR64_DAG,
+ ATOMXOR64_DAG,
+ ATOMAND64_DAG,
+ ATOMNAND64_DAG,
+ ATOMMAX64_DAG,
+ ATOMMIN64_DAG,
+ ATOMUMAX64_DAG,
+ ATOMUMIN64_DAG,
+ ATOMSWAP64_DAG,
+
+ // LCMPXCHG_DAG, LCMPXCHG8_DAG, LCMPXCHG16_DAG - Compare and swap.
+ LCMPXCHG_DAG,
+ LCMPXCHG8_DAG,
+ LCMPXCHG16_DAG,
+
+ // VZEXT_LOAD - Load, scalar_to_vector, and zero extend.
+ VZEXT_LOAD,
+
+ // FNSTCW16m - Store FP control world into i16 memory.
+ FNSTCW16m,
+
+ /// FP_TO_INT*_IN_MEM - This instruction implements FP_TO_SINT with the
+ /// integer destination in memory and a FP reg source. This corresponds
+ /// to the X86::FIST*m instructions and the rounding mode change stuff. It
+ /// has two inputs (token chain and address) and two outputs (int value
+ /// and token chain).
+ FP_TO_INT16_IN_MEM,
+ FP_TO_INT32_IN_MEM,
+ FP_TO_INT64_IN_MEM,
+
+ /// FILD, FILD_FLAG - This instruction implements SINT_TO_FP with the
+ /// integer source in memory and FP reg result. This corresponds to the
+ /// X86::FILD*m instructions. It has three inputs (token chain, address,
+ /// and source type) and two outputs (FP value and token chain). FILD_FLAG
+ /// also produces a flag).
+ FILD,
+ FILD_FLAG,
+
+ /// FLD - This instruction implements an extending load to FP stack slots.
+ /// This corresponds to the X86::FLD32m / X86::FLD64m. It takes a chain
+ /// operand, ptr to load from, and a ValueType node indicating the type
+ /// to load to.
+ FLD,
+
+ /// FST - This instruction implements a truncating store to FP stack
+ /// slots. This corresponds to the X86::FST32m / X86::FST64m. It takes a
+ /// chain operand, value to store, address, and a ValueType to store it
+ /// as.
+ FST,
+
+ /// VAARG_64 - This instruction grabs the address of the next argument
+ /// from a va_list. (reads and modifies the va_list in memory)
+ VAARG_64
+
+ // WARNING: Do not add anything in the end unless you want the node to
+ // have memop! In fact, starting from ATOMADD64_DAG all opcodes will be
+ // thought as target memory ops!
+ };
+ }
+
+ /// Define some predicates that are used for node matching.
+ namespace X86 {
+ /// isVEXTRACTF128Index - Return true if the specified
+ /// EXTRACT_SUBVECTOR operand specifies a vector extract that is
+ /// suitable for input to VEXTRACTF128.
+ bool isVEXTRACTF128Index(SDNode *N);
+
+ /// isVINSERTF128Index - Return true if the specified
+ /// INSERT_SUBVECTOR operand specifies a subvector insert that is
+ /// suitable for input to VINSERTF128.
+ bool isVINSERTF128Index(SDNode *N);
+
+ /// getExtractVEXTRACTF128Immediate - Return the appropriate
+ /// immediate to extract the specified EXTRACT_SUBVECTOR index
+ /// with VEXTRACTF128 instructions.
+ unsigned getExtractVEXTRACTF128Immediate(SDNode *N);
+
+ /// getInsertVINSERTF128Immediate - Return the appropriate
+ /// immediate to insert at the specified INSERT_SUBVECTOR index
+ /// with VINSERTF128 instructions.
+ unsigned getInsertVINSERTF128Immediate(SDNode *N);
+
+ /// isZeroNode - Returns true if Elt is a constant zero or a floating point
+ /// constant +0.0.
+ bool isZeroNode(SDValue Elt);
+
+ /// isOffsetSuitableForCodeModel - Returns true of the given offset can be
+ /// fit into displacement field of the instruction.
+ bool isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
+ bool hasSymbolicDisplacement = true);
+
+
+ /// isCalleePop - Determines whether the callee is required to pop its
+ /// own arguments. Callee pop is necessary to support tail calls.
+ bool isCalleePop(CallingConv::ID CallingConv,
+ bool is64Bit, bool IsVarArg, bool TailCallOpt);
+ }
+
+ //===--------------------------------------------------------------------===//
+ // X86TargetLowering - X86 Implementation of the TargetLowering interface
+ class X86TargetLowering : public TargetLowering {
+ public:
+ explicit X86TargetLowering(X86TargetMachine &TM);
+
+ virtual unsigned getJumpTableEncoding() const;
+
+ virtual MVT getScalarShiftAmountTy(EVT LHSTy) const { return MVT::i8; }
+
+ virtual const MCExpr *
+ LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
+ const MachineBasicBlock *MBB, unsigned uid,
+ MCContext &Ctx) const;
+
+ /// getPICJumpTableRelocaBase - Returns relocation base for the given PIC
+ /// jumptable.
+ virtual SDValue getPICJumpTableRelocBase(SDValue Table,
+ SelectionDAG &DAG) const;
+ virtual const MCExpr *
+ getPICJumpTableRelocBaseExpr(const MachineFunction *MF,
+ unsigned JTI, MCContext &Ctx) const;
+
+ /// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
+ /// function arguments in the caller parameter area. For X86, aggregates
+ /// that contains are placed at 16-byte boundaries while the rest are at
+ /// 4-byte boundaries.
+ virtual unsigned getByValTypeAlignment(Type *Ty) const;
+
+ /// getOptimalMemOpType - Returns the target specific optimal type for load
+ /// and store operations as a result of memset, memcpy, and memmove
+ /// lowering. If DstAlign is zero that means it's safe to destination
+ /// alignment can satisfy any constraint. Similarly if SrcAlign is zero it
+ /// means there isn't a need to check it against alignment requirement,
+ /// probably because the source does not need to be loaded. If 'IsMemset' is
+ /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
+ /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
+ /// source is constant so it does not need to be loaded.
+ /// It returns EVT::Other if the type should be determined using generic
+ /// target-independent logic.
+ virtual EVT
+ getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,
+ bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
+ MachineFunction &MF) const;
+
+ /// isSafeMemOpType - Returns true if it's safe to use load / store of the
+ /// specified type to expand memcpy / memset inline. This is mostly true
+ /// for all types except for some special cases. For example, on X86
+ /// targets without SSE2 f64 load / store are done with fldl / fstpl which
+ /// also does type conversion. Note the specified type doesn't have to be
+ /// legal as the hook is used before type legalization.
+ virtual bool isSafeMemOpType(MVT VT) const;
+
+ /// allowsUnalignedMemoryAccesses - Returns true if the target allows
+ /// unaligned memory accesses. of the specified type. Returns whether it
+ /// is "fast" by reference in the second argument.
+ virtual bool allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const;
+
+ /// LowerOperation - Provide custom lowering hooks for some operations.
+ ///
+ virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;
+
+ /// ReplaceNodeResults - Replace the results of node with an illegal result
+ /// type with new values built out of custom code.
+ ///
+ virtual void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results,
+ SelectionDAG &DAG) const;
+
+
+ virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+
+ /// isTypeDesirableForOp - Return true if the target has native support for
+ /// the specified value type and it is 'desirable' to use the type for the
+ /// given node type. e.g. On x86 i16 is legal, but undesirable since i16
+ /// instruction encodings are longer and some i16 instructions are slow.
+ virtual bool isTypeDesirableForOp(unsigned Opc, EVT VT) const;
+
+ /// isTypeDesirable - Return true if the target has native support for the
+ /// specified value type and it is 'desirable' to use the type. e.g. On x86
+ /// i16 is legal, but undesirable since i16 instruction encodings are longer
+ /// and some i16 instructions are slow.
+ virtual bool IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const;
+
+ virtual MachineBasicBlock *
+ EmitInstrWithCustomInserter(MachineInstr *MI,
+ MachineBasicBlock *MBB) const;
+
+
+ /// getTargetNodeName - This method returns the name of a target specific
+ /// DAG node.
+ virtual const char *getTargetNodeName(unsigned Opcode) const;
+
+ /// getSetCCResultType - Return the value type to use for ISD::SETCC.
+ virtual EVT getSetCCResultType(EVT VT) const;
+
+ /// computeMaskedBitsForTargetNode - Determine which of the bits specified
+ /// in Mask are known to be either zero or one and return them in the
+ /// KnownZero/KnownOne bitsets.
+ virtual void computeMaskedBitsForTargetNode(const SDValue Op,
+ APInt &KnownZero,
+ APInt &KnownOne,
+ const SelectionDAG &DAG,
+ unsigned Depth = 0) const;
+
+ // ComputeNumSignBitsForTargetNode - Determine the number of bits in the
+ // operation that are sign bits.
+ virtual unsigned ComputeNumSignBitsForTargetNode(SDValue Op,
+ unsigned Depth) const;
+
+ virtual bool
+ isGAPlusOffset(SDNode *N, const GlobalValue* &GA, int64_t &Offset) const;
+
+ SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const;
+
+ virtual bool ExpandInlineAsm(CallInst *CI) const;
+
+ ConstraintType getConstraintType(const std::string &Constraint) const;
+
+ /// Examine constraint string and operand type and determine a weight value.
+ /// The operand object must already have been set up with the operand type.
+ virtual ConstraintWeight getSingleConstraintMatchWeight(
+ AsmOperandInfo &info, const char *constraint) const;
+
+ virtual const char *LowerXConstraint(EVT ConstraintVT) const;
+
+ /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
+ /// vector. If it is invalid, don't add anything to Ops. If hasMemory is
+ /// true it means one of the asm constraint of the inline asm instruction
+ /// being processed is 'm'.
+ virtual void LowerAsmOperandForConstraint(SDValue Op,
+ std::string &Constraint,
+ std::vector<SDValue> &Ops,
+ SelectionDAG &DAG) const;
+
+ /// getRegForInlineAsmConstraint - Given a physical register constraint
+ /// (e.g. {edx}), return the register number and the register class for the
+ /// register. This should only be used for C_Register constraints. On
+ /// error, this returns a register number of 0.
+ std::pair<unsigned, const TargetRegisterClass*>
+ getRegForInlineAsmConstraint(const std::string &Constraint,
+ EVT VT) const;
+
+ /// isLegalAddressingMode - Return true if the addressing mode represented
+ /// by AM is legal for this target, for a load/store of the specified type.
+ virtual bool isLegalAddressingMode(const AddrMode &AM, Type *Ty)const;
+
+ /// isLegalICmpImmediate - Return true if the specified immediate is legal
+ /// icmp immediate, that is the target has icmp instructions which can
+ /// compare a register against the immediate without having to materialize
+ /// the immediate into a register.
+ virtual bool isLegalICmpImmediate(int64_t Imm) const;
+
+ /// isLegalAddImmediate - Return true if the specified immediate is legal
+ /// add immediate, that is the target has add instructions which can
+ /// add a register and the immediate without having to materialize
+ /// the immediate into a register.
+ virtual bool isLegalAddImmediate(int64_t Imm) const;
+
+ /// isTruncateFree - Return true if it's free to truncate a value of
+ /// type Ty1 to type Ty2. e.g. On x86 it's free to truncate a i32 value in
+ /// register EAX to i16 by referencing its sub-register AX.
+ virtual bool isTruncateFree(Type *Ty1, Type *Ty2) const;
+ virtual bool isTruncateFree(EVT VT1, EVT VT2) const;
+
+ /// isZExtFree - Return true if any actual instruction that defines a
+ /// value of type Ty1 implicit zero-extends the value to Ty2 in the result
+ /// register. This does not necessarily include registers defined in
+ /// unknown ways, such as incoming arguments, or copies from unknown
+ /// virtual registers. Also, if isTruncateFree(Ty2, Ty1) is true, this
+ /// does not necessarily apply to truncate instructions. e.g. on x86-64,
+ /// all instructions that define 32-bit values implicit zero-extend the
+ /// result out to 64 bits.
+ virtual bool isZExtFree(Type *Ty1, Type *Ty2) const;
+ virtual bool isZExtFree(EVT VT1, EVT VT2) const;
+ virtual bool isZExtFree(SDValue Val, EVT VT2) const;
+
+ /// isFMAFasterThanMulAndAdd - Return true if an FMA operation is faster than
+ /// a pair of mul and add instructions. fmuladd intrinsics will be expanded to
+ /// FMAs when this method returns true (and FMAs are legal), otherwise fmuladd
+ /// is expanded to mul + add.
+ virtual bool isFMAFasterThanMulAndAdd(EVT) const { return true; }
+
+ /// isNarrowingProfitable - Return true if it's profitable to narrow
+ /// operations of type VT1 to VT2. e.g. on x86, it's profitable to narrow
+ /// from i32 to i8 but not from i32 to i16.
+ virtual bool isNarrowingProfitable(EVT VT1, EVT VT2) const;
+
+ /// isFPImmLegal - Returns true if the target can instruction select the
+ /// specified FP immediate natively. If false, the legalizer will
+ /// materialize the FP immediate as a load from a constant pool.
+ virtual bool isFPImmLegal(const APFloat &Imm, EVT VT) const;
+
+ /// isShuffleMaskLegal - Targets can use this to indicate that they only
+ /// support *some* VECTOR_SHUFFLE operations, those with specific masks.
+ /// By default, if a target supports the VECTOR_SHUFFLE node, all mask
+ /// values are assumed to be legal.
+ virtual bool isShuffleMaskLegal(const SmallVectorImpl<int> &Mask,
+ EVT VT) const;
+
+ /// isVectorClearMaskLegal - Similar to isShuffleMaskLegal. This is
+ /// used by Targets can use this to indicate if there is a suitable
+ /// VECTOR_SHUFFLE that can be used to replace a VAND with a constant
+ /// pool entry.
+ virtual bool isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
+ EVT VT) const;
+
+ /// ShouldShrinkFPConstant - If true, then instruction selection should
+ /// seek to shrink the FP constant of the specified type to a smaller type
+ /// in order to save space and / or reduce runtime.
+ virtual bool ShouldShrinkFPConstant(EVT VT) const {
+ // Don't shrink FP constpool if SSE2 is available since cvtss2sd is more
+ // expensive than a straight movsd. On the other hand, it's important to
+ // shrink long double fp constant since fldt is very slow.
+ return !X86ScalarSSEf64 || VT == MVT::f80;
+ }
+
+ const X86Subtarget* getSubtarget() const {
+ return Subtarget;
+ }
+
+ /// isScalarFPTypeInSSEReg - Return true if the specified scalar FP type is
+ /// computed in an SSE register, not on the X87 floating point stack.
+ bool isScalarFPTypeInSSEReg(EVT VT) const {
+ return (VT == MVT::f64 && X86ScalarSSEf64) || // f64 is when SSE2
+ (VT == MVT::f32 && X86ScalarSSEf32); // f32 is when SSE1
+ }
+
+ /// isTargetFTOL - Return true if the target uses the MSVC _ftol2 routine
+ /// for fptoui.
+ bool isTargetFTOL() const {
+ return Subtarget->isTargetWindows() && !Subtarget->is64Bit();
+ }
+
+ /// isIntegerTypeFTOL - Return true if the MSVC _ftol2 routine should be
+ /// used for fptoui to the given type.
+ bool isIntegerTypeFTOL(EVT VT) const {
+ return isTargetFTOL() && VT == MVT::i64;
+ }
+
+ /// createFastISel - This method returns a target specific FastISel object,
+ /// or null if the target does not support "fast" ISel.
+ virtual FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
+ const TargetLibraryInfo *libInfo) const;
+
+ /// getStackCookieLocation - Return true if the target stores stack
+ /// protector cookies at a fixed offset in some non-standard address
+ /// space, and populates the address space and offset as
+ /// appropriate.
+ virtual bool getStackCookieLocation(unsigned &AddressSpace, unsigned &Offset) const;
+
+ SDValue BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, SDValue StackSlot,
+ SelectionDAG &DAG) const;
+
+ /// \brief Reset the operation actions based on target options.
+ virtual void resetOperationActions();
+
+ protected:
+ std::pair<const TargetRegisterClass*, uint8_t>
+ findRepresentativeClass(MVT VT) const;
+
+ private:
+ /// Subtarget - Keep a pointer to the X86Subtarget around so that we can
+ /// make the right decision when generating code for different targets.
+ const X86Subtarget *Subtarget;
+ const X86RegisterInfo *RegInfo;
+ const DataLayout *TD;
+
+ /// Used to store the TargetOptions so that we don't waste time resetting
+ /// the operation actions unless we have to.
+ TargetOptions TO;
+
+ /// X86ScalarSSEf32, X86ScalarSSEf64 - Select between SSE or x87
+ /// floating point ops.
+ /// When SSE is available, use it for f32 operations.
+ /// When SSE2 is available, use it for f64 operations.
+ bool X86ScalarSSEf32;
+ bool X86ScalarSSEf64;
+
+ /// LegalFPImmediates - A list of legal fp immediates.
+ std::vector<APFloat> LegalFPImmediates;
+
+ /// addLegalFPImmediate - Indicate that this x86 target can instruction
+ /// select the specified FP immediate natively.
+ void addLegalFPImmediate(const APFloat& Imm) {
+ LegalFPImmediates.push_back(Imm);
+ }
+
+ SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const;
+ SDValue LowerMemArgument(SDValue Chain,
+ CallingConv::ID CallConv,
+ const SmallVectorImpl<ISD::InputArg> &ArgInfo,
+ DebugLoc dl, SelectionDAG &DAG,
+ const CCValAssign &VA, MachineFrameInfo *MFI,
+ unsigned i) const;
+ SDValue LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg,
+ DebugLoc dl, SelectionDAG &DAG,
+ const CCValAssign &VA,
+ ISD::ArgFlagsTy Flags) const;
+
+ // Call lowering helpers.
+
+ /// IsEligibleForTailCallOptimization - Check whether the call is eligible
+ /// for tail call optimization. Targets which want to do tail call
+ /// optimization should implement this function.
+ bool IsEligibleForTailCallOptimization(SDValue Callee,
+ CallingConv::ID CalleeCC,
+ bool isVarArg,
+ bool isCalleeStructRet,
+ bool isCallerStructRet,
+ Type *RetTy,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ SelectionDAG& DAG) const;
+ bool IsCalleePop(bool isVarArg, CallingConv::ID CallConv) const;
+ SDValue EmitTailCallLoadRetAddr(SelectionDAG &DAG, SDValue &OutRetAddr,
+ SDValue Chain, bool IsTailCall, bool Is64Bit,
+ int FPDiff, DebugLoc dl) const;
+
+ unsigned GetAlignedArgumentStackSize(unsigned StackSize,
+ SelectionDAG &DAG) const;
+
+ std::pair<SDValue,SDValue> FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
+ bool isSigned,
+ bool isReplace) const;
+
+ SDValue LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl,
+ SelectionDAG &DAG) const;
+ SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl,
+ int64_t Offset, SelectionDAG &DAG) const;
+ SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerBITCAST(SDValue op, SelectionDAG &DAG) const;
+ SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerZERO_EXTEND(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSIGN_EXTEND(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerANY_EXTEND(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFABS(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFNEG(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerToBT(SDValue And, ISD::CondCode CC,
+ DebugLoc dl, SelectionDAG &DAG) const;
+ SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerMEMSET(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFRAME_TO_ARGS_OFFSET(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerShift(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const;
+
+ // Utility functions to help LowerVECTOR_SHUFFLE & LowerBUILD_VECTOR
+ SDValue LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const;
+ SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const;
+ SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) const;
+
+ SDValue LowerVectorAllZeroTest(SDValue Op, SelectionDAG &DAG) const;
+
+ SDValue LowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const;
+
+ virtual SDValue
+ LowerFormalArguments(SDValue Chain,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const;
+ virtual SDValue
+ LowerCall(CallLoweringInfo &CLI,
+ SmallVectorImpl<SDValue> &InVals) const;
+
+ virtual SDValue
+ LowerReturn(SDValue Chain,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ DebugLoc dl, SelectionDAG &DAG) const;
+
+ virtual bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const;
+
+ virtual bool mayBeEmittedAsTailCall(CallInst *CI) const;
+
+ virtual MVT
+ getTypeForExtArgOrReturn(MVT VT, ISD::NodeType ExtendKind) const;
+
+ virtual bool
+ CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ LLVMContext &Context) const;
+
+ /// Utility function to emit atomic-load-arith operations (and, or, xor,
+ /// nand, max, min, umax, umin). It takes the corresponding instruction to
+ /// expand, the associated machine basic block, and the associated X86
+ /// opcodes for reg/reg.
+ MachineBasicBlock *EmitAtomicLoadArith(MachineInstr *MI,
+ MachineBasicBlock *MBB) const;
+
+ /// Utility function to emit atomic-load-arith operations (and, or, xor,
+ /// nand, add, sub, swap) for 64-bit operands on 32-bit target.
+ MachineBasicBlock *EmitAtomicLoadArith6432(MachineInstr *MI,
+ MachineBasicBlock *MBB) const;
+
+ // Utility function to emit the low-level va_arg code for X86-64.
+ MachineBasicBlock *EmitVAARG64WithCustomInserter(
+ MachineInstr *MI,
+ MachineBasicBlock *MBB) const;
+
+ /// Utility function to emit the xmm reg save portion of va_start.
+ MachineBasicBlock *EmitVAStartSaveXMMRegsWithCustomInserter(
+ MachineInstr *BInstr,
+ MachineBasicBlock *BB) const;
+
+ MachineBasicBlock *EmitLoweredSelect(MachineInstr *I,
+ MachineBasicBlock *BB) const;
+
+ MachineBasicBlock *EmitLoweredWinAlloca(MachineInstr *MI,
+ MachineBasicBlock *BB) const;
+
+ MachineBasicBlock *EmitLoweredSegAlloca(MachineInstr *MI,
+ MachineBasicBlock *BB,
+ bool Is64Bit) const;
+
+ MachineBasicBlock *EmitLoweredTLSCall(MachineInstr *MI,
+ MachineBasicBlock *BB) const;
+
+ MachineBasicBlock *emitLoweredTLSAddr(MachineInstr *MI,
+ MachineBasicBlock *BB) const;
+
+ MachineBasicBlock *emitEHSjLjSetJmp(MachineInstr *MI,
+ MachineBasicBlock *MBB) const;
+
+ MachineBasicBlock *emitEHSjLjLongJmp(MachineInstr *MI,
+ MachineBasicBlock *MBB) const;
+
+ /// Emit nodes that will be selected as "test Op0,Op0", or something
+ /// equivalent, for use with the given x86 condition code.
+ SDValue EmitTest(SDValue Op0, unsigned X86CC, SelectionDAG &DAG) const;
+
+ /// Emit nodes that will be selected as "cmp Op0,Op1", or something
+ /// equivalent, for use with the given x86 condition code.
+ SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
+ SelectionDAG &DAG) const;
+
+ /// Convert a comparison if required by the subtarget.
+ SDValue ConvertCmpIfNecessary(SDValue Cmp, SelectionDAG &DAG) const;
+ };
+
+ namespace X86 {
+ FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
+ const TargetLibraryInfo *libInfo);
+ }
+}
+
+#endif // X86ISELLOWERING_H
diff --git a/contrib/llvm/lib/Target/X86/X86Instr3DNow.td b/contrib/llvm/lib/Target/X86/X86Instr3DNow.td
new file mode 100644
index 0000000..ba1aede
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86Instr3DNow.td
@@ -0,0 +1,103 @@
+//===-- X86Instr3DNow.td - The 3DNow! Instruction Set ------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the 3DNow! instruction set, which extends MMX to support
+// floating point and also adds a few more random instructions for good measure.
+//
+//===----------------------------------------------------------------------===//
+
+class I3DNow<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pat>
+ : I<o, F, outs, ins, asm, pat>, TB, Requires<[Has3DNow]> {
+}
+
+class I3DNow_binop<bits<8> o, Format F, dag ins, string Mnemonic, list<dag> pat>
+ : I3DNow<o, F, (outs VR64:$dst), ins,
+ !strconcat(Mnemonic, "\t{$src2, $dst|$dst, $src2}"), pat>,
+ Has3DNow0F0FOpcode {
+ // FIXME: The disassembler doesn't support Has3DNow0F0FOpcode yet.
+ let isAsmParserOnly = 1;
+ let Constraints = "$src1 = $dst";
+}
+
+class I3DNow_conv<bits<8> o, Format F, dag ins, string Mnemonic, list<dag> pat>
+ : I3DNow<o, F, (outs VR64:$dst), ins,
+ !strconcat(Mnemonic, "\t{$src, $dst|$dst, $src}"), pat>,
+ Has3DNow0F0FOpcode {
+ // FIXME: The disassembler doesn't support Has3DNow0F0FOpcode yet.
+ let isAsmParserOnly = 1;
+}
+
+multiclass I3DNow_binop_rm<bits<8> opc, string Mn> {
+ def rr : I3DNow_binop<opc, MRMSrcReg, (ins VR64:$src1, VR64:$src2), Mn, []>;
+ def rm : I3DNow_binop<opc, MRMSrcMem, (ins VR64:$src1, i64mem:$src2), Mn, []>;
+}
+
+multiclass I3DNow_binop_rm_int<bits<8> opc, string Mn, string Ver = ""> {
+ def rr : I3DNow_binop<opc, MRMSrcReg, (ins VR64:$src1, VR64:$src2), Mn,
+ [(set VR64:$dst, (!cast<Intrinsic>(
+ !strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src1, VR64:$src2))]>;
+ def rm : I3DNow_binop<opc, MRMSrcMem, (ins VR64:$src1, i64mem:$src2), Mn,
+ [(set VR64:$dst, (!cast<Intrinsic>(
+ !strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src1,
+ (bitconvert (load_mmx addr:$src2))))]>;
+}
+
+multiclass I3DNow_conv_rm<bits<8> opc, string Mn> {
+ def rr : I3DNow_conv<opc, MRMSrcReg, (ins VR64:$src1), Mn, []>;
+ def rm : I3DNow_conv<opc, MRMSrcMem, (ins i64mem:$src1), Mn, []>;
+}
+
+multiclass I3DNow_conv_rm_int<bits<8> opc, string Mn, string Ver = ""> {
+ def rr : I3DNow_conv<opc, MRMSrcReg, (ins VR64:$src), Mn,
+ [(set VR64:$dst, (!cast<Intrinsic>(
+ !strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src))]>;
+ def rm : I3DNow_conv<opc, MRMSrcMem, (ins i64mem:$src), Mn,
+ [(set VR64:$dst, (!cast<Intrinsic>(
+ !strconcat("int_x86_3dnow", Ver, "_", Mn))
+ (bitconvert (load_mmx addr:$src))))]>;
+}
+
+defm PAVGUSB : I3DNow_binop_rm_int<0xBF, "pavgusb">;
+defm PF2ID : I3DNow_conv_rm_int<0x1D, "pf2id">;
+defm PFACC : I3DNow_binop_rm_int<0xAE, "pfacc">;
+defm PFADD : I3DNow_binop_rm_int<0x9E, "pfadd">;
+defm PFCMPEQ : I3DNow_binop_rm_int<0xB0, "pfcmpeq">;
+defm PFCMPGE : I3DNow_binop_rm_int<0x90, "pfcmpge">;
+defm PFCMPGT : I3DNow_binop_rm_int<0xA0, "pfcmpgt">;
+defm PFMAX : I3DNow_binop_rm_int<0xA4, "pfmax">;
+defm PFMIN : I3DNow_binop_rm_int<0x94, "pfmin">;
+defm PFMUL : I3DNow_binop_rm_int<0xB4, "pfmul">;
+defm PFRCP : I3DNow_conv_rm_int<0x96, "pfrcp">;
+defm PFRCPIT1 : I3DNow_binop_rm_int<0xA6, "pfrcpit1">;
+defm PFRCPIT2 : I3DNow_binop_rm_int<0xB6, "pfrcpit2">;
+defm PFRSQIT1 : I3DNow_binop_rm_int<0xA7, "pfrsqit1">;
+defm PFRSQRT : I3DNow_conv_rm_int<0x97, "pfrsqrt">;
+defm PFSUB : I3DNow_binop_rm_int<0x9A, "pfsub">;
+defm PFSUBR : I3DNow_binop_rm_int<0xAA, "pfsubr">;
+defm PI2FD : I3DNow_conv_rm_int<0x0D, "pi2fd">;
+defm PMULHRW : I3DNow_binop_rm_int<0xB7, "pmulhrw">;
+
+
+def FEMMS : I3DNow<0x0E, RawFrm, (outs), (ins), "femms",
+ [(int_x86_mmx_femms)]>;
+
+def PREFETCH : I3DNow<0x0D, MRM0m, (outs), (ins i8mem:$addr),
+ "prefetch\t$addr",
+ [(prefetch addr:$addr, (i32 0), imm, (i32 1))]>;
+
+def PREFETCHW : I<0x0D, MRM1m, (outs), (ins i8mem:$addr), "prefetchw\t$addr",
+ [(prefetch addr:$addr, (i32 1), (i32 3), (i32 1))]>, TB,
+ Requires<[HasPrefetchW]>;
+
+// "3DNowA" instructions
+defm PF2IW : I3DNow_conv_rm_int<0x1C, "pf2iw", "a">;
+defm PI2FW : I3DNow_conv_rm_int<0x0C, "pi2fw", "a">;
+defm PFNACC : I3DNow_binop_rm_int<0x8A, "pfnacc", "a">;
+defm PFPNACC : I3DNow_binop_rm_int<0x8E, "pfpnacc", "a">;
+defm PSWAPD : I3DNow_conv_rm_int<0xBB, "pswapd", "a">;
diff --git a/contrib/llvm/lib/Target/X86/X86InstrArithmetic.td b/contrib/llvm/lib/Target/X86/X86InstrArithmetic.td
new file mode 100644
index 0000000..225e972
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrArithmetic.td
@@ -0,0 +1,1361 @@
+//===-- X86InstrArithmetic.td - Integer Arithmetic Instrs --*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the integer arithmetic instructions in the X86
+// architecture.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// LEA - Load Effective Address
+let SchedRW = [WriteLEA] in {
+let neverHasSideEffects = 1 in
+def LEA16r : I<0x8D, MRMSrcMem,
+ (outs GR16:$dst), (ins i32mem:$src),
+ "lea{w}\t{$src|$dst}, {$dst|$src}", [], IIC_LEA_16>, OpSize;
+let isReMaterializable = 1 in
+def LEA32r : I<0x8D, MRMSrcMem,
+ (outs GR32:$dst), (ins i32mem:$src),
+ "lea{l}\t{$src|$dst}, {$dst|$src}",
+ [(set GR32:$dst, lea32addr:$src)], IIC_LEA>,
+ Requires<[In32BitMode]>;
+
+def LEA64_32r : I<0x8D, MRMSrcMem,
+ (outs GR32:$dst), (ins lea64_32mem:$src),
+ "lea{l}\t{$src|$dst}, {$dst|$src}",
+ [(set GR32:$dst, lea64_32addr:$src)], IIC_LEA>,
+ Requires<[In64BitMode]>;
+
+let isReMaterializable = 1 in
+def LEA64r : RI<0x8D, MRMSrcMem, (outs GR64:$dst), (ins lea64mem:$src),
+ "lea{q}\t{$src|$dst}, {$dst|$src}",
+ [(set GR64:$dst, lea64addr:$src)], IIC_LEA>;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Fixed-Register Multiplication and Division Instructions.
+//
+
+// SchedModel info for instruction that loads one value and gets the second
+// (and possibly third) value from a register.
+// This is used for instructions that put the memory operands before other
+// uses.
+class SchedLoadReg<SchedWrite SW> : Sched<[SW,
+ // Memory operand.
+ ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault,
+ // Register reads (implicit or explicit).
+ ReadAfterLd, ReadAfterLd]>;
+
+// Extra precision multiplication
+
+// AL is really implied by AX, but the registers in Defs must match the
+// SDNode results (i8, i32).
+// AL,AH = AL*GR8
+let Defs = [AL,EFLAGS,AX], Uses = [AL] in
+def MUL8r : I<0xF6, MRM4r, (outs), (ins GR8:$src), "mul{b}\t$src",
+ // FIXME: Used for 8-bit mul, ignore result upper 8 bits.
+ // This probably ought to be moved to a def : Pat<> if the
+ // syntax can be accepted.
+ [(set AL, (mul AL, GR8:$src)),
+ (implicit EFLAGS)], IIC_MUL8>, Sched<[WriteIMul]>;
+// AX,DX = AX*GR16
+let Defs = [AX,DX,EFLAGS], Uses = [AX], neverHasSideEffects = 1 in
+def MUL16r : I<0xF7, MRM4r, (outs), (ins GR16:$src),
+ "mul{w}\t$src",
+ [], IIC_MUL16_REG>, OpSize, Sched<[WriteIMul]>;
+// EAX,EDX = EAX*GR32
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX], neverHasSideEffects = 1 in
+def MUL32r : I<0xF7, MRM4r, (outs), (ins GR32:$src),
+ "mul{l}\t$src",
+ [/*(set EAX, EDX, EFLAGS, (X86umul_flag EAX, GR32:$src))*/],
+ IIC_MUL32_REG>, Sched<[WriteIMul]>;
+// RAX,RDX = RAX*GR64
+let Defs = [RAX,RDX,EFLAGS], Uses = [RAX], neverHasSideEffects = 1 in
+def MUL64r : RI<0xF7, MRM4r, (outs), (ins GR64:$src),
+ "mul{q}\t$src",
+ [/*(set RAX, RDX, EFLAGS, (X86umul_flag RAX, GR64:$src))*/],
+ IIC_MUL64>, Sched<[WriteIMul]>;
+// AL,AH = AL*[mem8]
+let Defs = [AL,EFLAGS,AX], Uses = [AL] in
+def MUL8m : I<0xF6, MRM4m, (outs), (ins i8mem :$src),
+ "mul{b}\t$src",
+ // FIXME: Used for 8-bit mul, ignore result upper 8 bits.
+ // This probably ought to be moved to a def : Pat<> if the
+ // syntax can be accepted.
+ [(set AL, (mul AL, (loadi8 addr:$src))),
+ (implicit EFLAGS)], IIC_MUL8>, SchedLoadReg<WriteIMulLd>;
+// AX,DX = AX*[mem16]
+let mayLoad = 1, neverHasSideEffects = 1 in {
+let Defs = [AX,DX,EFLAGS], Uses = [AX] in
+def MUL16m : I<0xF7, MRM4m, (outs), (ins i16mem:$src),
+ "mul{w}\t$src",
+ [], IIC_MUL16_MEM>, OpSize, SchedLoadReg<WriteIMulLd>;
+// EAX,EDX = EAX*[mem32]
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in
+def MUL32m : I<0xF7, MRM4m, (outs), (ins i32mem:$src),
+ "mul{l}\t$src",
+ [], IIC_MUL32_MEM>, SchedLoadReg<WriteIMulLd>;
+// RAX,RDX = RAX*[mem64]
+let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in
+def MUL64m : RI<0xF7, MRM4m, (outs), (ins i64mem:$src),
+ "mul{q}\t$src", [], IIC_MUL64>, SchedLoadReg<WriteIMulLd>;
+}
+
+let neverHasSideEffects = 1 in {
+// AL,AH = AL*GR8
+let Defs = [AL,EFLAGS,AX], Uses = [AL] in
+def IMUL8r : I<0xF6, MRM5r, (outs), (ins GR8:$src), "imul{b}\t$src", [],
+ IIC_IMUL8>, Sched<[WriteIMul]>;
+// AX,DX = AX*GR16
+let Defs = [AX,DX,EFLAGS], Uses = [AX] in
+def IMUL16r : I<0xF7, MRM5r, (outs), (ins GR16:$src), "imul{w}\t$src", [],
+ IIC_IMUL16_RR>, OpSize, Sched<[WriteIMul]>;
+// EAX,EDX = EAX*GR32
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in
+def IMUL32r : I<0xF7, MRM5r, (outs), (ins GR32:$src), "imul{l}\t$src", [],
+ IIC_IMUL32_RR>, Sched<[WriteIMul]>;
+// RAX,RDX = RAX*GR64
+let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in
+def IMUL64r : RI<0xF7, MRM5r, (outs), (ins GR64:$src), "imul{q}\t$src", [],
+ IIC_IMUL64_RR>, Sched<[WriteIMul]>;
+
+let mayLoad = 1 in {
+// AL,AH = AL*[mem8]
+let Defs = [AL,EFLAGS,AX], Uses = [AL] in
+def IMUL8m : I<0xF6, MRM5m, (outs), (ins i8mem :$src),
+ "imul{b}\t$src", [], IIC_IMUL8>, SchedLoadReg<WriteIMulLd>;
+// AX,DX = AX*[mem16]
+let Defs = [AX,DX,EFLAGS], Uses = [AX] in
+def IMUL16m : I<0xF7, MRM5m, (outs), (ins i16mem:$src),
+ "imul{w}\t$src", [], IIC_IMUL16_MEM>, OpSize,
+ SchedLoadReg<WriteIMulLd>;
+// EAX,EDX = EAX*[mem32]
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in
+def IMUL32m : I<0xF7, MRM5m, (outs), (ins i32mem:$src),
+ "imul{l}\t$src", [], IIC_IMUL32_MEM>, SchedLoadReg<WriteIMulLd>;
+// RAX,RDX = RAX*[mem64]
+let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in
+def IMUL64m : RI<0xF7, MRM5m, (outs), (ins i64mem:$src),
+ "imul{q}\t$src", [], IIC_IMUL64>, SchedLoadReg<WriteIMulLd>;
+}
+} // neverHasSideEffects
+
+
+let Defs = [EFLAGS] in {
+let Constraints = "$src1 = $dst" in {
+
+let isCommutable = 1, SchedRW = [WriteIMul] in {
+// X = IMUL Y, Z --> X = IMUL Z, Y
+// Register-Register Signed Integer Multiply
+def IMUL16rr : I<0xAF, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src1,GR16:$src2),
+ "imul{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, EFLAGS,
+ (X86smul_flag GR16:$src1, GR16:$src2))], IIC_IMUL16_RR>,
+ TB, OpSize;
+def IMUL32rr : I<0xAF, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src1,GR32:$src2),
+ "imul{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, EFLAGS,
+ (X86smul_flag GR32:$src1, GR32:$src2))], IIC_IMUL32_RR>,
+ TB;
+def IMUL64rr : RI<0xAF, MRMSrcReg, (outs GR64:$dst),
+ (ins GR64:$src1, GR64:$src2),
+ "imul{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, EFLAGS,
+ (X86smul_flag GR64:$src1, GR64:$src2))], IIC_IMUL64_RR>,
+ TB;
+} // isCommutable, SchedRW
+
+// Register-Memory Signed Integer Multiply
+let SchedRW = [WriteIMulLd, ReadAfterLd] in {
+def IMUL16rm : I<0xAF, MRMSrcMem, (outs GR16:$dst),
+ (ins GR16:$src1, i16mem:$src2),
+ "imul{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, EFLAGS,
+ (X86smul_flag GR16:$src1, (load addr:$src2)))],
+ IIC_IMUL16_RM>,
+ TB, OpSize;
+def IMUL32rm : I<0xAF, MRMSrcMem, (outs GR32:$dst),
+ (ins GR32:$src1, i32mem:$src2),
+ "imul{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, EFLAGS,
+ (X86smul_flag GR32:$src1, (load addr:$src2)))],
+ IIC_IMUL32_RM>,
+ TB;
+def IMUL64rm : RI<0xAF, MRMSrcMem, (outs GR64:$dst),
+ (ins GR64:$src1, i64mem:$src2),
+ "imul{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, EFLAGS,
+ (X86smul_flag GR64:$src1, (load addr:$src2)))],
+ IIC_IMUL64_RM>,
+ TB;
+} // SchedRW
+} // Constraints = "$src1 = $dst"
+
+} // Defs = [EFLAGS]
+
+// Surprisingly enough, these are not two address instructions!
+let Defs = [EFLAGS] in {
+let SchedRW = [WriteIMul] in {
+// Register-Integer Signed Integer Multiply
+def IMUL16rri : Ii16<0x69, MRMSrcReg, // GR16 = GR16*I16
+ (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2),
+ "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR16:$dst, EFLAGS,
+ (X86smul_flag GR16:$src1, imm:$src2))],
+ IIC_IMUL16_RRI>, OpSize;
+def IMUL16rri8 : Ii8<0x6B, MRMSrcReg, // GR16 = GR16*I8
+ (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2),
+ "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR16:$dst, EFLAGS,
+ (X86smul_flag GR16:$src1, i16immSExt8:$src2))],
+ IIC_IMUL16_RRI>,
+ OpSize;
+def IMUL32rri : Ii32<0x69, MRMSrcReg, // GR32 = GR32*I32
+ (outs GR32:$dst), (ins GR32:$src1, i32imm:$src2),
+ "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR32:$dst, EFLAGS,
+ (X86smul_flag GR32:$src1, imm:$src2))],
+ IIC_IMUL32_RRI>;
+def IMUL32rri8 : Ii8<0x6B, MRMSrcReg, // GR32 = GR32*I8
+ (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2),
+ "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR32:$dst, EFLAGS,
+ (X86smul_flag GR32:$src1, i32immSExt8:$src2))],
+ IIC_IMUL32_RRI>;
+def IMUL64rri32 : RIi32<0x69, MRMSrcReg, // GR64 = GR64*I32
+ (outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2),
+ "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR64:$dst, EFLAGS,
+ (X86smul_flag GR64:$src1, i64immSExt32:$src2))],
+ IIC_IMUL64_RRI>;
+def IMUL64rri8 : RIi8<0x6B, MRMSrcReg, // GR64 = GR64*I8
+ (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2),
+ "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR64:$dst, EFLAGS,
+ (X86smul_flag GR64:$src1, i64immSExt8:$src2))],
+ IIC_IMUL64_RRI>;
+} // SchedRW
+
+// Memory-Integer Signed Integer Multiply
+let SchedRW = [WriteIMulLd] in {
+def IMUL16rmi : Ii16<0x69, MRMSrcMem, // GR16 = [mem16]*I16
+ (outs GR16:$dst), (ins i16mem:$src1, i16imm:$src2),
+ "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR16:$dst, EFLAGS,
+ (X86smul_flag (load addr:$src1), imm:$src2))],
+ IIC_IMUL16_RMI>,
+ OpSize;
+def IMUL16rmi8 : Ii8<0x6B, MRMSrcMem, // GR16 = [mem16]*I8
+ (outs GR16:$dst), (ins i16mem:$src1, i16i8imm :$src2),
+ "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR16:$dst, EFLAGS,
+ (X86smul_flag (load addr:$src1),
+ i16immSExt8:$src2))], IIC_IMUL16_RMI>,
+ OpSize;
+def IMUL32rmi : Ii32<0x69, MRMSrcMem, // GR32 = [mem32]*I32
+ (outs GR32:$dst), (ins i32mem:$src1, i32imm:$src2),
+ "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR32:$dst, EFLAGS,
+ (X86smul_flag (load addr:$src1), imm:$src2))],
+ IIC_IMUL32_RMI>;
+def IMUL32rmi8 : Ii8<0x6B, MRMSrcMem, // GR32 = [mem32]*I8
+ (outs GR32:$dst), (ins i32mem:$src1, i32i8imm: $src2),
+ "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR32:$dst, EFLAGS,
+ (X86smul_flag (load addr:$src1),
+ i32immSExt8:$src2))],
+ IIC_IMUL32_RMI>;
+def IMUL64rmi32 : RIi32<0x69, MRMSrcMem, // GR64 = [mem64]*I32
+ (outs GR64:$dst), (ins i64mem:$src1, i64i32imm:$src2),
+ "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR64:$dst, EFLAGS,
+ (X86smul_flag (load addr:$src1),
+ i64immSExt32:$src2))],
+ IIC_IMUL64_RMI>;
+def IMUL64rmi8 : RIi8<0x6B, MRMSrcMem, // GR64 = [mem64]*I8
+ (outs GR64:$dst), (ins i64mem:$src1, i64i8imm: $src2),
+ "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR64:$dst, EFLAGS,
+ (X86smul_flag (load addr:$src1),
+ i64immSExt8:$src2))],
+ IIC_IMUL64_RMI>;
+} // SchedRW
+} // Defs = [EFLAGS]
+
+
+
+
+// unsigned division/remainder
+let hasSideEffects = 1 in { // so that we don't speculatively execute
+let SchedRW = [WriteIDiv] in {
+let Defs = [AL,EFLAGS,AX], Uses = [AX] in
+def DIV8r : I<0xF6, MRM6r, (outs), (ins GR8:$src), // AX/r8 = AL,AH
+ "div{b}\t$src", [], IIC_DIV8_REG>;
+let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in
+def DIV16r : I<0xF7, MRM6r, (outs), (ins GR16:$src), // DX:AX/r16 = AX,DX
+ "div{w}\t$src", [], IIC_DIV16>, OpSize;
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in
+def DIV32r : I<0xF7, MRM6r, (outs), (ins GR32:$src), // EDX:EAX/r32 = EAX,EDX
+ "div{l}\t$src", [], IIC_DIV32>;
+// RDX:RAX/r64 = RAX,RDX
+let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in
+def DIV64r : RI<0xF7, MRM6r, (outs), (ins GR64:$src),
+ "div{q}\t$src", [], IIC_DIV64>;
+} // SchedRW
+
+let mayLoad = 1 in {
+let Defs = [AL,EFLAGS,AX], Uses = [AX] in
+def DIV8m : I<0xF6, MRM6m, (outs), (ins i8mem:$src), // AX/[mem8] = AL,AH
+ "div{b}\t$src", [], IIC_DIV8_MEM>,
+ SchedLoadReg<WriteIDivLd>;
+let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in
+def DIV16m : I<0xF7, MRM6m, (outs), (ins i16mem:$src), // DX:AX/[mem16] = AX,DX
+ "div{w}\t$src", [], IIC_DIV16>, OpSize,
+ SchedLoadReg<WriteIDivLd>;
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in // EDX:EAX/[mem32] = EAX,EDX
+def DIV32m : I<0xF7, MRM6m, (outs), (ins i32mem:$src),
+ "div{l}\t$src", [], IIC_DIV32>,
+ SchedLoadReg<WriteIDivLd>;
+// RDX:RAX/[mem64] = RAX,RDX
+let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in
+def DIV64m : RI<0xF7, MRM6m, (outs), (ins i64mem:$src),
+ "div{q}\t$src", [], IIC_DIV64>,
+ SchedLoadReg<WriteIDivLd>;
+}
+
+// Signed division/remainder.
+let SchedRW = [WriteIDiv] in {
+let Defs = [AL,EFLAGS,AX], Uses = [AX] in
+def IDIV8r : I<0xF6, MRM7r, (outs), (ins GR8:$src), // AX/r8 = AL,AH
+ "idiv{b}\t$src", [], IIC_IDIV8>;
+let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in
+def IDIV16r: I<0xF7, MRM7r, (outs), (ins GR16:$src), // DX:AX/r16 = AX,DX
+ "idiv{w}\t$src", [], IIC_IDIV16>, OpSize;
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in
+def IDIV32r: I<0xF7, MRM7r, (outs), (ins GR32:$src), // EDX:EAX/r32 = EAX,EDX
+ "idiv{l}\t$src", [], IIC_IDIV32>;
+// RDX:RAX/r64 = RAX,RDX
+let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in
+def IDIV64r: RI<0xF7, MRM7r, (outs), (ins GR64:$src),
+ "idiv{q}\t$src", [], IIC_IDIV64>;
+} // SchedRW
+
+let mayLoad = 1 in {
+let Defs = [AL,EFLAGS,AX], Uses = [AX] in
+def IDIV8m : I<0xF6, MRM7m, (outs), (ins i8mem:$src), // AX/[mem8] = AL,AH
+ "idiv{b}\t$src", [], IIC_IDIV8>,
+ SchedLoadReg<WriteIDivLd>;
+let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in
+def IDIV16m: I<0xF7, MRM7m, (outs), (ins i16mem:$src), // DX:AX/[mem16] = AX,DX
+ "idiv{w}\t$src", [], IIC_IDIV16>, OpSize,
+ SchedLoadReg<WriteIDivLd>;
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in // EDX:EAX/[mem32] = EAX,EDX
+def IDIV32m: I<0xF7, MRM7m, (outs), (ins i32mem:$src),
+ "idiv{l}\t$src", [], IIC_IDIV32>,
+ SchedLoadReg<WriteIDivLd>;
+let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in // RDX:RAX/[mem64] = RAX,RDX
+def IDIV64m: RI<0xF7, MRM7m, (outs), (ins i64mem:$src),
+ "idiv{q}\t$src", [], IIC_IDIV64>,
+ SchedLoadReg<WriteIDivLd>;
+}
+} // hasSideEffects = 0
+
+//===----------------------------------------------------------------------===//
+// Two address Instructions.
+//
+
+// unary instructions
+let CodeSize = 2 in {
+let Defs = [EFLAGS] in {
+let Constraints = "$src1 = $dst", SchedRW = [WriteALU] in {
+def NEG8r : I<0xF6, MRM3r, (outs GR8 :$dst), (ins GR8 :$src1),
+ "neg{b}\t$dst",
+ [(set GR8:$dst, (ineg GR8:$src1)),
+ (implicit EFLAGS)], IIC_UNARY_REG>;
+def NEG16r : I<0xF7, MRM3r, (outs GR16:$dst), (ins GR16:$src1),
+ "neg{w}\t$dst",
+ [(set GR16:$dst, (ineg GR16:$src1)),
+ (implicit EFLAGS)], IIC_UNARY_REG>, OpSize;
+def NEG32r : I<0xF7, MRM3r, (outs GR32:$dst), (ins GR32:$src1),
+ "neg{l}\t$dst",
+ [(set GR32:$dst, (ineg GR32:$src1)),
+ (implicit EFLAGS)], IIC_UNARY_REG>;
+def NEG64r : RI<0xF7, MRM3r, (outs GR64:$dst), (ins GR64:$src1), "neg{q}\t$dst",
+ [(set GR64:$dst, (ineg GR64:$src1)),
+ (implicit EFLAGS)], IIC_UNARY_REG>;
+} // Constraints = "$src1 = $dst", SchedRW
+
+// Read-modify-write negate.
+let SchedRW = [WriteALULd, WriteRMW] in {
+def NEG8m : I<0xF6, MRM3m, (outs), (ins i8mem :$dst),
+ "neg{b}\t$dst",
+ [(store (ineg (loadi8 addr:$dst)), addr:$dst),
+ (implicit EFLAGS)], IIC_UNARY_MEM>;
+def NEG16m : I<0xF7, MRM3m, (outs), (ins i16mem:$dst),
+ "neg{w}\t$dst",
+ [(store (ineg (loadi16 addr:$dst)), addr:$dst),
+ (implicit EFLAGS)], IIC_UNARY_MEM>, OpSize;
+def NEG32m : I<0xF7, MRM3m, (outs), (ins i32mem:$dst),
+ "neg{l}\t$dst",
+ [(store (ineg (loadi32 addr:$dst)), addr:$dst),
+ (implicit EFLAGS)], IIC_UNARY_MEM>;
+def NEG64m : RI<0xF7, MRM3m, (outs), (ins i64mem:$dst), "neg{q}\t$dst",
+ [(store (ineg (loadi64 addr:$dst)), addr:$dst),
+ (implicit EFLAGS)], IIC_UNARY_MEM>;
+} // SchedRW
+} // Defs = [EFLAGS]
+
+
+// Note: NOT does not set EFLAGS!
+
+let Constraints = "$src1 = $dst", SchedRW = [WriteALU] in {
+// Match xor -1 to not. Favors these over a move imm + xor to save code size.
+let AddedComplexity = 15 in {
+def NOT8r : I<0xF6, MRM2r, (outs GR8 :$dst), (ins GR8 :$src1),
+ "not{b}\t$dst",
+ [(set GR8:$dst, (not GR8:$src1))], IIC_UNARY_REG>;
+def NOT16r : I<0xF7, MRM2r, (outs GR16:$dst), (ins GR16:$src1),
+ "not{w}\t$dst",
+ [(set GR16:$dst, (not GR16:$src1))], IIC_UNARY_REG>, OpSize;
+def NOT32r : I<0xF7, MRM2r, (outs GR32:$dst), (ins GR32:$src1),
+ "not{l}\t$dst",
+ [(set GR32:$dst, (not GR32:$src1))], IIC_UNARY_REG>;
+def NOT64r : RI<0xF7, MRM2r, (outs GR64:$dst), (ins GR64:$src1), "not{q}\t$dst",
+ [(set GR64:$dst, (not GR64:$src1))], IIC_UNARY_REG>;
+}
+} // Constraints = "$src1 = $dst", SchedRW
+
+let SchedRW = [WriteALULd, WriteRMW] in {
+def NOT8m : I<0xF6, MRM2m, (outs), (ins i8mem :$dst),
+ "not{b}\t$dst",
+ [(store (not (loadi8 addr:$dst)), addr:$dst)], IIC_UNARY_MEM>;
+def NOT16m : I<0xF7, MRM2m, (outs), (ins i16mem:$dst),
+ "not{w}\t$dst",
+ [(store (not (loadi16 addr:$dst)), addr:$dst)], IIC_UNARY_MEM>,
+ OpSize;
+def NOT32m : I<0xF7, MRM2m, (outs), (ins i32mem:$dst),
+ "not{l}\t$dst",
+ [(store (not (loadi32 addr:$dst)), addr:$dst)], IIC_UNARY_MEM>;
+def NOT64m : RI<0xF7, MRM2m, (outs), (ins i64mem:$dst), "not{q}\t$dst",
+ [(store (not (loadi64 addr:$dst)), addr:$dst)], IIC_UNARY_MEM>;
+} // SchedRW
+} // CodeSize
+
+// TODO: inc/dec is slow for P4, but fast for Pentium-M.
+let Defs = [EFLAGS] in {
+let Constraints = "$src1 = $dst", SchedRW = [WriteALU] in {
+let CodeSize = 2 in
+def INC8r : I<0xFE, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1),
+ "inc{b}\t$dst",
+ [(set GR8:$dst, EFLAGS, (X86inc_flag GR8:$src1))],
+ IIC_UNARY_REG>;
+
+let isConvertibleToThreeAddress = 1, CodeSize = 1 in { // Can xform into LEA.
+def INC16r : I<0x40, AddRegFrm, (outs GR16:$dst), (ins GR16:$src1),
+ "inc{w}\t$dst",
+ [(set GR16:$dst, EFLAGS, (X86inc_flag GR16:$src1))], IIC_UNARY_REG>,
+ OpSize, Requires<[In32BitMode]>;
+def INC32r : I<0x40, AddRegFrm, (outs GR32:$dst), (ins GR32:$src1),
+ "inc{l}\t$dst",
+ [(set GR32:$dst, EFLAGS, (X86inc_flag GR32:$src1))],
+ IIC_UNARY_REG>,
+ Requires<[In32BitMode]>;
+def INC64r : RI<0xFF, MRM0r, (outs GR64:$dst), (ins GR64:$src1), "inc{q}\t$dst",
+ [(set GR64:$dst, EFLAGS, (X86inc_flag GR64:$src1))],
+ IIC_UNARY_REG>;
+} // isConvertibleToThreeAddress = 1, CodeSize = 1
+
+
+// In 64-bit mode, single byte INC and DEC cannot be encoded.
+let isConvertibleToThreeAddress = 1, CodeSize = 2 in {
+// Can transform into LEA.
+def INC64_16r : I<0xFF, MRM0r, (outs GR16:$dst), (ins GR16:$src1),
+ "inc{w}\t$dst",
+ [(set GR16:$dst, EFLAGS, (X86inc_flag GR16:$src1))],
+ IIC_UNARY_REG>,
+ OpSize, Requires<[In64BitMode]>;
+def INC64_32r : I<0xFF, MRM0r, (outs GR32:$dst), (ins GR32:$src1),
+ "inc{l}\t$dst",
+ [(set GR32:$dst, EFLAGS, (X86inc_flag GR32:$src1))],
+ IIC_UNARY_REG>,
+ Requires<[In64BitMode]>;
+def DEC64_16r : I<0xFF, MRM1r, (outs GR16:$dst), (ins GR16:$src1),
+ "dec{w}\t$dst",
+ [(set GR16:$dst, EFLAGS, (X86dec_flag GR16:$src1))],
+ IIC_UNARY_REG>,
+ OpSize, Requires<[In64BitMode]>;
+def DEC64_32r : I<0xFF, MRM1r, (outs GR32:$dst), (ins GR32:$src1),
+ "dec{l}\t$dst",
+ [(set GR32:$dst, EFLAGS, (X86dec_flag GR32:$src1))],
+ IIC_UNARY_REG>,
+ Requires<[In64BitMode]>;
+} // isConvertibleToThreeAddress = 1, CodeSize = 2
+
+} // Constraints = "$src1 = $dst", SchedRW
+
+let CodeSize = 2, SchedRW = [WriteALULd, WriteRMW] in {
+ def INC8m : I<0xFE, MRM0m, (outs), (ins i8mem :$dst), "inc{b}\t$dst",
+ [(store (add (loadi8 addr:$dst), 1), addr:$dst),
+ (implicit EFLAGS)], IIC_UNARY_MEM>;
+ def INC16m : I<0xFF, MRM0m, (outs), (ins i16mem:$dst), "inc{w}\t$dst",
+ [(store (add (loadi16 addr:$dst), 1), addr:$dst),
+ (implicit EFLAGS)], IIC_UNARY_MEM>,
+ OpSize, Requires<[In32BitMode]>;
+ def INC32m : I<0xFF, MRM0m, (outs), (ins i32mem:$dst), "inc{l}\t$dst",
+ [(store (add (loadi32 addr:$dst), 1), addr:$dst),
+ (implicit EFLAGS)], IIC_UNARY_MEM>,
+ Requires<[In32BitMode]>;
+ def INC64m : RI<0xFF, MRM0m, (outs), (ins i64mem:$dst), "inc{q}\t$dst",
+ [(store (add (loadi64 addr:$dst), 1), addr:$dst),
+ (implicit EFLAGS)], IIC_UNARY_MEM>;
+
+// These are duplicates of their 32-bit counterparts. Only needed so X86 knows
+// how to unfold them.
+// FIXME: What is this for??
+def INC64_16m : I<0xFF, MRM0m, (outs), (ins i16mem:$dst), "inc{w}\t$dst",
+ [(store (add (loadi16 addr:$dst), 1), addr:$dst),
+ (implicit EFLAGS)], IIC_UNARY_MEM>,
+ OpSize, Requires<[In64BitMode]>;
+def INC64_32m : I<0xFF, MRM0m, (outs), (ins i32mem:$dst), "inc{l}\t$dst",
+ [(store (add (loadi32 addr:$dst), 1), addr:$dst),
+ (implicit EFLAGS)], IIC_UNARY_MEM>,
+ Requires<[In64BitMode]>;
+def DEC64_16m : I<0xFF, MRM1m, (outs), (ins i16mem:$dst), "dec{w}\t$dst",
+ [(store (add (loadi16 addr:$dst), -1), addr:$dst),
+ (implicit EFLAGS)], IIC_UNARY_MEM>,
+ OpSize, Requires<[In64BitMode]>;
+def DEC64_32m : I<0xFF, MRM1m, (outs), (ins i32mem:$dst), "dec{l}\t$dst",
+ [(store (add (loadi32 addr:$dst), -1), addr:$dst),
+ (implicit EFLAGS)], IIC_UNARY_MEM>,
+ Requires<[In64BitMode]>;
+} // CodeSize = 2, SchedRW
+
+let Constraints = "$src1 = $dst", SchedRW = [WriteALU] in {
+let CodeSize = 2 in
+def DEC8r : I<0xFE, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1),
+ "dec{b}\t$dst",
+ [(set GR8:$dst, EFLAGS, (X86dec_flag GR8:$src1))],
+ IIC_UNARY_REG>;
+let isConvertibleToThreeAddress = 1, CodeSize = 1 in { // Can xform into LEA.
+def DEC16r : I<0x48, AddRegFrm, (outs GR16:$dst), (ins GR16:$src1),
+ "dec{w}\t$dst",
+ [(set GR16:$dst, EFLAGS, (X86dec_flag GR16:$src1))],
+ IIC_UNARY_REG>,
+ OpSize, Requires<[In32BitMode]>;
+def DEC32r : I<0x48, AddRegFrm, (outs GR32:$dst), (ins GR32:$src1),
+ "dec{l}\t$dst",
+ [(set GR32:$dst, EFLAGS, (X86dec_flag GR32:$src1))],
+ IIC_UNARY_REG>,
+ Requires<[In32BitMode]>;
+def DEC64r : RI<0xFF, MRM1r, (outs GR64:$dst), (ins GR64:$src1), "dec{q}\t$dst",
+ [(set GR64:$dst, EFLAGS, (X86dec_flag GR64:$src1))],
+ IIC_UNARY_REG>;
+} // CodeSize = 2
+} // Constraints = "$src1 = $dst", SchedRW
+
+
+let CodeSize = 2, SchedRW = [WriteALULd, WriteRMW] in {
+ def DEC8m : I<0xFE, MRM1m, (outs), (ins i8mem :$dst), "dec{b}\t$dst",
+ [(store (add (loadi8 addr:$dst), -1), addr:$dst),
+ (implicit EFLAGS)], IIC_UNARY_MEM>;
+ def DEC16m : I<0xFF, MRM1m, (outs), (ins i16mem:$dst), "dec{w}\t$dst",
+ [(store (add (loadi16 addr:$dst), -1), addr:$dst),
+ (implicit EFLAGS)], IIC_UNARY_MEM>,
+ OpSize, Requires<[In32BitMode]>;
+ def DEC32m : I<0xFF, MRM1m, (outs), (ins i32mem:$dst), "dec{l}\t$dst",
+ [(store (add (loadi32 addr:$dst), -1), addr:$dst),
+ (implicit EFLAGS)], IIC_UNARY_MEM>,
+ Requires<[In32BitMode]>;
+ def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
+ [(store (add (loadi64 addr:$dst), -1), addr:$dst),
+ (implicit EFLAGS)], IIC_UNARY_MEM>;
+} // CodeSize = 2, SchedRW
+} // Defs = [EFLAGS]
+
+
+/// X86TypeInfo - This is a bunch of information that describes relevant X86
+/// information about value types. For example, it can tell you what the
+/// register class and preferred load to use.
+class X86TypeInfo<ValueType vt, string instrsuffix, RegisterClass regclass,
+ PatFrag loadnode, X86MemOperand memoperand, ImmType immkind,
+ Operand immoperand, SDPatternOperator immoperator,
+ Operand imm8operand, SDPatternOperator imm8operator,
+ bit hasOddOpcode, bit hasOpSizePrefix, bit hasREX_WPrefix> {
+ /// VT - This is the value type itself.
+ ValueType VT = vt;
+
+ /// InstrSuffix - This is the suffix used on instructions with this type. For
+ /// example, i8 -> "b", i16 -> "w", i32 -> "l", i64 -> "q".
+ string InstrSuffix = instrsuffix;
+
+ /// RegClass - This is the register class associated with this type. For
+ /// example, i8 -> GR8, i16 -> GR16, i32 -> GR32, i64 -> GR64.
+ RegisterClass RegClass = regclass;
+
+ /// LoadNode - This is the load node associated with this type. For
+ /// example, i8 -> loadi8, i16 -> loadi16, i32 -> loadi32, i64 -> loadi64.
+ PatFrag LoadNode = loadnode;
+
+ /// MemOperand - This is the memory operand associated with this type. For
+ /// example, i8 -> i8mem, i16 -> i16mem, i32 -> i32mem, i64 -> i64mem.
+ X86MemOperand MemOperand = memoperand;
+
+ /// ImmEncoding - This is the encoding of an immediate of this type. For
+ /// example, i8 -> Imm8, i16 -> Imm16, i32 -> Imm32. Note that i64 -> Imm32
+ /// since the immediate fields of i64 instructions is a 32-bit sign extended
+ /// value.
+ ImmType ImmEncoding = immkind;
+
+ /// ImmOperand - This is the operand kind of an immediate of this type. For
+ /// example, i8 -> i8imm, i16 -> i16imm, i32 -> i32imm. Note that i64 ->
+ /// i64i32imm since the immediate fields of i64 instructions is a 32-bit sign
+ /// extended value.
+ Operand ImmOperand = immoperand;
+
+ /// ImmOperator - This is the operator that should be used to match an
+ /// immediate of this kind in a pattern (e.g. imm, or i64immSExt32).
+ SDPatternOperator ImmOperator = immoperator;
+
+ /// Imm8Operand - This is the operand kind to use for an imm8 of this type.
+ /// For example, i8 -> <invalid>, i16 -> i16i8imm, i32 -> i32i8imm. This is
+ /// only used for instructions that have a sign-extended imm8 field form.
+ Operand Imm8Operand = imm8operand;
+
+ /// Imm8Operator - This is the operator that should be used to match an 8-bit
+ /// sign extended immediate of this kind in a pattern (e.g. imm16immSExt8).
+ SDPatternOperator Imm8Operator = imm8operator;
+
+ /// HasOddOpcode - This bit is true if the instruction should have an odd (as
+ /// opposed to even) opcode. Operations on i8 are usually even, operations on
+ /// other datatypes are odd.
+ bit HasOddOpcode = hasOddOpcode;
+
+ /// HasOpSizePrefix - This bit is set to true if the instruction should have
+ /// the 0x66 operand size prefix. This is set for i16 types.
+ bit HasOpSizePrefix = hasOpSizePrefix;
+
+ /// HasREX_WPrefix - This bit is set to true if the instruction should have
+ /// the 0x40 REX prefix. This is set for i64 types.
+ bit HasREX_WPrefix = hasREX_WPrefix;
+}
+
+def invalid_node : SDNode<"<<invalid_node>>", SDTIntLeaf,[],"<<invalid_node>>">;
+
+
+def Xi8 : X86TypeInfo<i8 , "b", GR8 , loadi8 , i8mem ,
+ Imm8 , i8imm , imm, i8imm , invalid_node,
+ 0, 0, 0>;
+def Xi16 : X86TypeInfo<i16, "w", GR16, loadi16, i16mem,
+ Imm16, i16imm, imm, i16i8imm, i16immSExt8,
+ 1, 1, 0>;
+def Xi32 : X86TypeInfo<i32, "l", GR32, loadi32, i32mem,
+ Imm32, i32imm, imm, i32i8imm, i32immSExt8,
+ 1, 0, 0>;
+def Xi64 : X86TypeInfo<i64, "q", GR64, loadi64, i64mem,
+ Imm32, i64i32imm, i64immSExt32, i64i8imm, i64immSExt8,
+ 1, 0, 1>;
+
+/// ITy - This instruction base class takes the type info for the instruction.
+/// Using this, it:
+/// 1. Concatenates together the instruction mnemonic with the appropriate
+/// suffix letter, a tab, and the arguments.
+/// 2. Infers whether the instruction should have a 0x66 prefix byte.
+/// 3. Infers whether the instruction should have a 0x40 REX_W prefix.
+/// 4. Infers whether the low bit of the opcode should be 0 (for i8 operations)
+/// or 1 (for i16,i32,i64 operations).
+class ITy<bits<8> opcode, Format f, X86TypeInfo typeinfo, dag outs, dag ins,
+ string mnemonic, string args, list<dag> pattern,
+ InstrItinClass itin = IIC_BIN_NONMEM>
+ : I<{opcode{7}, opcode{6}, opcode{5}, opcode{4},
+ opcode{3}, opcode{2}, opcode{1}, typeinfo.HasOddOpcode },
+ f, outs, ins,
+ !strconcat(mnemonic, "{", typeinfo.InstrSuffix, "}\t", args), pattern,
+ itin> {
+
+ // Infer instruction prefixes from type info.
+ let hasOpSizePrefix = typeinfo.HasOpSizePrefix;
+ let hasREX_WPrefix = typeinfo.HasREX_WPrefix;
+}
+
+// BinOpRR - Instructions like "add reg, reg, reg".
+class BinOpRR<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ dag outlist, list<dag> pattern, InstrItinClass itin,
+ Format f = MRMDestReg>
+ : ITy<opcode, f, typeinfo, outlist,
+ (ins typeinfo.RegClass:$src1, typeinfo.RegClass:$src2),
+ mnemonic, "{$src2, $src1|$src1, $src2}", pattern, itin>,
+ Sched<[WriteALU]>;
+
+// BinOpRR_R - Instructions like "add reg, reg, reg", where the pattern has
+// just a regclass (no eflags) as a result.
+class BinOpRR_R<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ SDNode opnode>
+ : BinOpRR<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst),
+ [(set typeinfo.RegClass:$dst,
+ (opnode typeinfo.RegClass:$src1, typeinfo.RegClass:$src2))],
+ IIC_BIN_NONMEM>;
+
+// BinOpRR_F - Instructions like "cmp reg, Reg", where the pattern has
+// just a EFLAGS as a result.
+class BinOpRR_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ SDPatternOperator opnode, Format f = MRMDestReg>
+ : BinOpRR<opcode, mnemonic, typeinfo, (outs),
+ [(set EFLAGS,
+ (opnode typeinfo.RegClass:$src1, typeinfo.RegClass:$src2))],
+ IIC_BIN_NONMEM, f>;
+
+// BinOpRR_RF - Instructions like "add reg, reg, reg", where the pattern has
+// both a regclass and EFLAGS as a result.
+class BinOpRR_RF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ SDNode opnode>
+ : BinOpRR<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst),
+ [(set typeinfo.RegClass:$dst, EFLAGS,
+ (opnode typeinfo.RegClass:$src1, typeinfo.RegClass:$src2))],
+ IIC_BIN_NONMEM>;
+
+// BinOpRR_RFF - Instructions like "adc reg, reg, reg", where the pattern has
+// both a regclass and EFLAGS as a result, and has EFLAGS as input.
+class BinOpRR_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ SDNode opnode>
+ : BinOpRR<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst),
+ [(set typeinfo.RegClass:$dst, EFLAGS,
+ (opnode typeinfo.RegClass:$src1, typeinfo.RegClass:$src2,
+ EFLAGS))], IIC_BIN_NONMEM>;
+
+// BinOpRR_Rev - Instructions like "add reg, reg, reg" (reversed encoding).
+class BinOpRR_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo>
+ : ITy<opcode, MRMSrcReg, typeinfo,
+ (outs typeinfo.RegClass:$dst),
+ (ins typeinfo.RegClass:$src1, typeinfo.RegClass:$src2),
+ mnemonic, "{$src2, $dst|$dst, $src2}", [], IIC_BIN_NONMEM>,
+ Sched<[WriteALU]> {
+ // The disassembler should know about this, but not the asmparser.
+ let isCodeGenOnly = 1;
+ let hasSideEffects = 0;
+}
+
+// BinOpRR_F_Rev - Instructions like "cmp reg, reg" (reversed encoding).
+class BinOpRR_F_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo>
+ : ITy<opcode, MRMSrcReg, typeinfo, (outs),
+ (ins typeinfo.RegClass:$src1, typeinfo.RegClass:$src2),
+ mnemonic, "{$src2, $src1|$src1, $src2}", [], IIC_BIN_NONMEM>,
+ Sched<[WriteALU]> {
+ // The disassembler should know about this, but not the asmparser.
+ let isCodeGenOnly = 1;
+ let hasSideEffects = 0;
+}
+
+// BinOpRM - Instructions like "add reg, reg, [mem]".
+class BinOpRM<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ dag outlist, list<dag> pattern>
+ : ITy<opcode, MRMSrcMem, typeinfo, outlist,
+ (ins typeinfo.RegClass:$src1, typeinfo.MemOperand:$src2),
+ mnemonic, "{$src2, $src1|$src1, $src2}", pattern, IIC_BIN_NONMEM>,
+ Sched<[WriteALULd, ReadAfterLd]>;
+
+// BinOpRM_R - Instructions like "add reg, reg, [mem]".
+class BinOpRM_R<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ SDNode opnode>
+ : BinOpRM<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst),
+ [(set typeinfo.RegClass:$dst,
+ (opnode typeinfo.RegClass:$src1, (typeinfo.LoadNode addr:$src2)))]>;
+
+// BinOpRM_F - Instructions like "cmp reg, [mem]".
+class BinOpRM_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ SDPatternOperator opnode>
+ : BinOpRM<opcode, mnemonic, typeinfo, (outs),
+ [(set EFLAGS,
+ (opnode typeinfo.RegClass:$src1, (typeinfo.LoadNode addr:$src2)))]>;
+
+// BinOpRM_RF - Instructions like "add reg, reg, [mem]".
+class BinOpRM_RF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ SDNode opnode>
+ : BinOpRM<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst),
+ [(set typeinfo.RegClass:$dst, EFLAGS,
+ (opnode typeinfo.RegClass:$src1, (typeinfo.LoadNode addr:$src2)))]>;
+
+// BinOpRM_RFF - Instructions like "adc reg, reg, [mem]".
+class BinOpRM_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ SDNode opnode>
+ : BinOpRM<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst),
+ [(set typeinfo.RegClass:$dst, EFLAGS,
+ (opnode typeinfo.RegClass:$src1, (typeinfo.LoadNode addr:$src2),
+ EFLAGS))]>;
+
+// BinOpRI - Instructions like "add reg, reg, imm".
+class BinOpRI<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ Format f, dag outlist, list<dag> pattern>
+ : ITy<opcode, f, typeinfo, outlist,
+ (ins typeinfo.RegClass:$src1, typeinfo.ImmOperand:$src2),
+ mnemonic, "{$src2, $src1|$src1, $src2}", pattern, IIC_BIN_NONMEM>,
+ Sched<[WriteALU]> {
+ let ImmT = typeinfo.ImmEncoding;
+}
+
+// BinOpRI_R - Instructions like "add reg, reg, imm".
+class BinOpRI_R<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ SDNode opnode, Format f>
+ : BinOpRI<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst),
+ [(set typeinfo.RegClass:$dst,
+ (opnode typeinfo.RegClass:$src1, typeinfo.ImmOperator:$src2))]>;
+
+// BinOpRI_F - Instructions like "cmp reg, imm".
+class BinOpRI_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ SDPatternOperator opnode, Format f>
+ : BinOpRI<opcode, mnemonic, typeinfo, f, (outs),
+ [(set EFLAGS,
+ (opnode typeinfo.RegClass:$src1, typeinfo.ImmOperator:$src2))]>;
+
+// BinOpRI_RF - Instructions like "add reg, reg, imm".
+class BinOpRI_RF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ SDNode opnode, Format f>
+ : BinOpRI<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst),
+ [(set typeinfo.RegClass:$dst, EFLAGS,
+ (opnode typeinfo.RegClass:$src1, typeinfo.ImmOperator:$src2))]>;
+// BinOpRI_RFF - Instructions like "adc reg, reg, imm".
+class BinOpRI_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ SDNode opnode, Format f>
+ : BinOpRI<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst),
+ [(set typeinfo.RegClass:$dst, EFLAGS,
+ (opnode typeinfo.RegClass:$src1, typeinfo.ImmOperator:$src2,
+ EFLAGS))]>;
+
+// BinOpRI8 - Instructions like "add reg, reg, imm8".
+class BinOpRI8<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ Format f, dag outlist, list<dag> pattern>
+ : ITy<opcode, f, typeinfo, outlist,
+ (ins typeinfo.RegClass:$src1, typeinfo.Imm8Operand:$src2),
+ mnemonic, "{$src2, $src1|$src1, $src2}", pattern, IIC_BIN_NONMEM>,
+ Sched<[WriteALU]> {
+ let ImmT = Imm8; // Always 8-bit immediate.
+}
+
+// BinOpRI8_R - Instructions like "add reg, reg, imm8".
+class BinOpRI8_R<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ SDNode opnode, Format f>
+ : BinOpRI8<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst),
+ [(set typeinfo.RegClass:$dst,
+ (opnode typeinfo.RegClass:$src1, typeinfo.Imm8Operator:$src2))]>;
+
+// BinOpRI8_F - Instructions like "cmp reg, imm8".
+class BinOpRI8_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ SDNode opnode, Format f>
+ : BinOpRI8<opcode, mnemonic, typeinfo, f, (outs),
+ [(set EFLAGS,
+ (opnode typeinfo.RegClass:$src1, typeinfo.Imm8Operator:$src2))]>;
+
+// BinOpRI8_RF - Instructions like "add reg, reg, imm8".
+class BinOpRI8_RF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ SDNode opnode, Format f>
+ : BinOpRI8<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst),
+ [(set typeinfo.RegClass:$dst, EFLAGS,
+ (opnode typeinfo.RegClass:$src1, typeinfo.Imm8Operator:$src2))]>;
+
+// BinOpRI8_RFF - Instructions like "adc reg, reg, imm8".
+class BinOpRI8_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ SDNode opnode, Format f>
+ : BinOpRI8<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst),
+ [(set typeinfo.RegClass:$dst, EFLAGS,
+ (opnode typeinfo.RegClass:$src1, typeinfo.Imm8Operator:$src2,
+ EFLAGS))]>;
+
+// BinOpMR - Instructions like "add [mem], reg".
+class BinOpMR<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ list<dag> pattern>
+ : ITy<opcode, MRMDestMem, typeinfo,
+ (outs), (ins typeinfo.MemOperand:$dst, typeinfo.RegClass:$src),
+ mnemonic, "{$src, $dst|$dst, $src}", pattern, IIC_BIN_MEM>,
+ Sched<[WriteALULd, WriteRMW]>;
+
+// BinOpMR_RMW - Instructions like "add [mem], reg".
+class BinOpMR_RMW<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ SDNode opnode>
+ : BinOpMR<opcode, mnemonic, typeinfo,
+ [(store (opnode (load addr:$dst), typeinfo.RegClass:$src), addr:$dst),
+ (implicit EFLAGS)]>;
+
+// BinOpMR_RMW_FF - Instructions like "adc [mem], reg".
+class BinOpMR_RMW_FF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ SDNode opnode>
+ : BinOpMR<opcode, mnemonic, typeinfo,
+ [(store (opnode (load addr:$dst), typeinfo.RegClass:$src, EFLAGS),
+ addr:$dst),
+ (implicit EFLAGS)]>;
+
+// BinOpMR_F - Instructions like "cmp [mem], reg".
+class BinOpMR_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ SDNode opnode>
+ : BinOpMR<opcode, mnemonic, typeinfo,
+ [(set EFLAGS, (opnode (load addr:$dst), typeinfo.RegClass:$src))]>;
+
+// BinOpMI - Instructions like "add [mem], imm".
+class BinOpMI<string mnemonic, X86TypeInfo typeinfo,
+ Format f, list<dag> pattern, bits<8> opcode = 0x80>
+ : ITy<opcode, f, typeinfo,
+ (outs), (ins typeinfo.MemOperand:$dst, typeinfo.ImmOperand:$src),
+ mnemonic, "{$src, $dst|$dst, $src}", pattern, IIC_BIN_MEM>,
+ Sched<[WriteALULd, WriteRMW]> {
+ let ImmT = typeinfo.ImmEncoding;
+}
+
+// BinOpMI_RMW - Instructions like "add [mem], imm".
+class BinOpMI_RMW<string mnemonic, X86TypeInfo typeinfo,
+ SDNode opnode, Format f>
+ : BinOpMI<mnemonic, typeinfo, f,
+ [(store (opnode (typeinfo.VT (load addr:$dst)),
+ typeinfo.ImmOperator:$src), addr:$dst),
+ (implicit EFLAGS)]>;
+// BinOpMI_RMW_FF - Instructions like "adc [mem], imm".
+class BinOpMI_RMW_FF<string mnemonic, X86TypeInfo typeinfo,
+ SDNode opnode, Format f>
+ : BinOpMI<mnemonic, typeinfo, f,
+ [(store (opnode (typeinfo.VT (load addr:$dst)),
+ typeinfo.ImmOperator:$src, EFLAGS), addr:$dst),
+ (implicit EFLAGS)]>;
+
+// BinOpMI_F - Instructions like "cmp [mem], imm".
+class BinOpMI_F<string mnemonic, X86TypeInfo typeinfo,
+ SDPatternOperator opnode, Format f, bits<8> opcode = 0x80>
+ : BinOpMI<mnemonic, typeinfo, f,
+ [(set EFLAGS, (opnode (typeinfo.VT (load addr:$dst)),
+ typeinfo.ImmOperator:$src))],
+ opcode>;
+
+// BinOpMI8 - Instructions like "add [mem], imm8".
+class BinOpMI8<string mnemonic, X86TypeInfo typeinfo,
+ Format f, list<dag> pattern>
+ : ITy<0x82, f, typeinfo,
+ (outs), (ins typeinfo.MemOperand:$dst, typeinfo.Imm8Operand:$src),
+ mnemonic, "{$src, $dst|$dst, $src}", pattern, IIC_BIN_MEM>,
+ Sched<[WriteALULd, WriteRMW]> {
+ let ImmT = Imm8; // Always 8-bit immediate.
+}
+
+// BinOpMI8_RMW - Instructions like "add [mem], imm8".
+class BinOpMI8_RMW<string mnemonic, X86TypeInfo typeinfo,
+ SDNode opnode, Format f>
+ : BinOpMI8<mnemonic, typeinfo, f,
+ [(store (opnode (load addr:$dst),
+ typeinfo.Imm8Operator:$src), addr:$dst),
+ (implicit EFLAGS)]>;
+
+// BinOpMI8_RMW_FF - Instructions like "adc [mem], imm8".
+class BinOpMI8_RMW_FF<string mnemonic, X86TypeInfo typeinfo,
+ SDNode opnode, Format f>
+ : BinOpMI8<mnemonic, typeinfo, f,
+ [(store (opnode (load addr:$dst),
+ typeinfo.Imm8Operator:$src, EFLAGS), addr:$dst),
+ (implicit EFLAGS)]>;
+
+// BinOpMI8_F - Instructions like "cmp [mem], imm8".
+class BinOpMI8_F<string mnemonic, X86TypeInfo typeinfo,
+ SDNode opnode, Format f>
+ : BinOpMI8<mnemonic, typeinfo, f,
+ [(set EFLAGS, (opnode (load addr:$dst),
+ typeinfo.Imm8Operator:$src))]>;
+
+// BinOpAI - Instructions like "add %eax, %eax, imm".
+class BinOpAI<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ Register areg, string operands>
+ : ITy<opcode, RawFrm, typeinfo,
+ (outs), (ins typeinfo.ImmOperand:$src),
+ mnemonic, operands, []>, Sched<[WriteALU]> {
+ let ImmT = typeinfo.ImmEncoding;
+ let Uses = [areg];
+ let Defs = [areg];
+ let hasSideEffects = 0;
+}
+
+/// ArithBinOp_RF - This is an arithmetic binary operator where the pattern is
+/// defined with "(set GPR:$dst, EFLAGS, (...".
+///
+/// It would be nice to get rid of the second and third argument here, but
+/// tblgen can't handle dependent type references aggressively enough: PR8330
+multiclass ArithBinOp_RF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
+ string mnemonic, Format RegMRM, Format MemMRM,
+ SDNode opnodeflag, SDNode opnode,
+ bit CommutableRR, bit ConvertibleToThreeAddress> {
+ let Defs = [EFLAGS] in {
+ let Constraints = "$src1 = $dst" in {
+ let isCommutable = CommutableRR,
+ isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
+ def NAME#8rr : BinOpRR_RF<BaseOpc, mnemonic, Xi8 , opnodeflag>;
+ def NAME#16rr : BinOpRR_RF<BaseOpc, mnemonic, Xi16, opnodeflag>;
+ def NAME#32rr : BinOpRR_RF<BaseOpc, mnemonic, Xi32, opnodeflag>;
+ def NAME#64rr : BinOpRR_RF<BaseOpc, mnemonic, Xi64, opnodeflag>;
+ } // isCommutable
+
+ def NAME#8rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi8>;
+ def NAME#16rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi16>;
+ def NAME#32rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi32>;
+ def NAME#64rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi64>;
+
+ def NAME#8rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi8 , opnodeflag>;
+ def NAME#16rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi16, opnodeflag>;
+ def NAME#32rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi32, opnodeflag>;
+ def NAME#64rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi64, opnodeflag>;
+
+ let isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
+ // NOTE: These are order specific, we want the ri8 forms to be listed
+ // first so that they are slightly preferred to the ri forms.
+ def NAME#16ri8 : BinOpRI8_RF<0x82, mnemonic, Xi16, opnodeflag, RegMRM>;
+ def NAME#32ri8 : BinOpRI8_RF<0x82, mnemonic, Xi32, opnodeflag, RegMRM>;
+ def NAME#64ri8 : BinOpRI8_RF<0x82, mnemonic, Xi64, opnodeflag, RegMRM>;
+
+ def NAME#8ri : BinOpRI_RF<0x80, mnemonic, Xi8 , opnodeflag, RegMRM>;
+ def NAME#16ri : BinOpRI_RF<0x80, mnemonic, Xi16, opnodeflag, RegMRM>;
+ def NAME#32ri : BinOpRI_RF<0x80, mnemonic, Xi32, opnodeflag, RegMRM>;
+ def NAME#64ri32: BinOpRI_RF<0x80, mnemonic, Xi64, opnodeflag, RegMRM>;
+ }
+ } // Constraints = "$src1 = $dst"
+
+ def NAME#8mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi8 , opnode>;
+ def NAME#16mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi16, opnode>;
+ def NAME#32mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi32, opnode>;
+ def NAME#64mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi64, opnode>;
+
+ // NOTE: These are order specific, we want the mi8 forms to be listed
+ // first so that they are slightly preferred to the mi forms.
+ def NAME#16mi8 : BinOpMI8_RMW<mnemonic, Xi16, opnode, MemMRM>;
+ def NAME#32mi8 : BinOpMI8_RMW<mnemonic, Xi32, opnode, MemMRM>;
+ def NAME#64mi8 : BinOpMI8_RMW<mnemonic, Xi64, opnode, MemMRM>;
+
+ def NAME#8mi : BinOpMI_RMW<mnemonic, Xi8 , opnode, MemMRM>;
+ def NAME#16mi : BinOpMI_RMW<mnemonic, Xi16, opnode, MemMRM>;
+ def NAME#32mi : BinOpMI_RMW<mnemonic, Xi32, opnode, MemMRM>;
+ def NAME#64mi32 : BinOpMI_RMW<mnemonic, Xi64, opnode, MemMRM>;
+
+ def NAME#8i8 : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL,
+ "{$src, %al|AL, $src}">;
+ def NAME#16i16 : BinOpAI<BaseOpc4, mnemonic, Xi16, AX,
+ "{$src, %ax|AX, $src}">;
+ def NAME#32i32 : BinOpAI<BaseOpc4, mnemonic, Xi32, EAX,
+ "{$src, %eax|EAX, $src}">;
+ def NAME#64i32 : BinOpAI<BaseOpc4, mnemonic, Xi64, RAX,
+ "{$src, %rax|RAX, $src}">;
+ }
+}
+
+/// ArithBinOp_RFF - This is an arithmetic binary operator where the pattern is
+/// defined with "(set GPR:$dst, EFLAGS, (node LHS, RHS, EFLAGS))" like ADC and
+/// SBB.
+///
+/// It would be nice to get rid of the second and third argument here, but
+/// tblgen can't handle dependent type references aggressively enough: PR8330
+multiclass ArithBinOp_RFF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
+ string mnemonic, Format RegMRM, Format MemMRM,
+ SDNode opnode, bit CommutableRR,
+ bit ConvertibleToThreeAddress> {
+ let Defs = [EFLAGS] in {
+ let Constraints = "$src1 = $dst" in {
+ let isCommutable = CommutableRR,
+ isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
+ def NAME#8rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi8 , opnode>;
+ def NAME#16rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi16, opnode>;
+ def NAME#32rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi32, opnode>;
+ def NAME#64rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi64, opnode>;
+ } // isCommutable
+
+ def NAME#8rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi8>;
+ def NAME#16rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi16>;
+ def NAME#32rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi32>;
+ def NAME#64rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi64>;
+
+ def NAME#8rm : BinOpRM_RFF<BaseOpc2, mnemonic, Xi8 , opnode>;
+ def NAME#16rm : BinOpRM_RFF<BaseOpc2, mnemonic, Xi16, opnode>;
+ def NAME#32rm : BinOpRM_RFF<BaseOpc2, mnemonic, Xi32, opnode>;
+ def NAME#64rm : BinOpRM_RFF<BaseOpc2, mnemonic, Xi64, opnode>;
+
+ let isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
+ // NOTE: These are order specific, we want the ri8 forms to be listed
+ // first so that they are slightly preferred to the ri forms.
+ def NAME#16ri8 : BinOpRI8_RFF<0x82, mnemonic, Xi16, opnode, RegMRM>;
+ def NAME#32ri8 : BinOpRI8_RFF<0x82, mnemonic, Xi32, opnode, RegMRM>;
+ def NAME#64ri8 : BinOpRI8_RFF<0x82, mnemonic, Xi64, opnode, RegMRM>;
+
+ def NAME#8ri : BinOpRI_RFF<0x80, mnemonic, Xi8 , opnode, RegMRM>;
+ def NAME#16ri : BinOpRI_RFF<0x80, mnemonic, Xi16, opnode, RegMRM>;
+ def NAME#32ri : BinOpRI_RFF<0x80, mnemonic, Xi32, opnode, RegMRM>;
+ def NAME#64ri32: BinOpRI_RFF<0x80, mnemonic, Xi64, opnode, RegMRM>;
+ }
+ } // Constraints = "$src1 = $dst"
+
+ def NAME#8mr : BinOpMR_RMW_FF<BaseOpc, mnemonic, Xi8 , opnode>;
+ def NAME#16mr : BinOpMR_RMW_FF<BaseOpc, mnemonic, Xi16, opnode>;
+ def NAME#32mr : BinOpMR_RMW_FF<BaseOpc, mnemonic, Xi32, opnode>;
+ def NAME#64mr : BinOpMR_RMW_FF<BaseOpc, mnemonic, Xi64, opnode>;
+
+ // NOTE: These are order specific, we want the mi8 forms to be listed
+ // first so that they are slightly preferred to the mi forms.
+ def NAME#16mi8 : BinOpMI8_RMW_FF<mnemonic, Xi16, opnode, MemMRM>;
+ def NAME#32mi8 : BinOpMI8_RMW_FF<mnemonic, Xi32, opnode, MemMRM>;
+ def NAME#64mi8 : BinOpMI8_RMW_FF<mnemonic, Xi64, opnode, MemMRM>;
+
+ def NAME#8mi : BinOpMI_RMW_FF<mnemonic, Xi8 , opnode, MemMRM>;
+ def NAME#16mi : BinOpMI_RMW_FF<mnemonic, Xi16, opnode, MemMRM>;
+ def NAME#32mi : BinOpMI_RMW_FF<mnemonic, Xi32, opnode, MemMRM>;
+ def NAME#64mi32 : BinOpMI_RMW_FF<mnemonic, Xi64, opnode, MemMRM>;
+
+ def NAME#8i8 : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL,
+ "{$src, %al|AL, $src}">;
+ def NAME#16i16 : BinOpAI<BaseOpc4, mnemonic, Xi16, AX,
+ "{$src, %ax|AX, $src}">;
+ def NAME#32i32 : BinOpAI<BaseOpc4, mnemonic, Xi32, EAX,
+ "{$src, %eax|EAX, $src}">;
+ def NAME#64i32 : BinOpAI<BaseOpc4, mnemonic, Xi64, RAX,
+ "{$src, %rax|RAX, $src}">;
+ }
+}
+
+/// ArithBinOp_F - This is an arithmetic binary operator where the pattern is
+/// defined with "(set EFLAGS, (...". It would be really nice to find a way
+/// to factor this with the other ArithBinOp_*.
+///
+multiclass ArithBinOp_F<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
+ string mnemonic, Format RegMRM, Format MemMRM,
+ SDNode opnode,
+ bit CommutableRR, bit ConvertibleToThreeAddress> {
+ let Defs = [EFLAGS] in {
+ let isCommutable = CommutableRR,
+ isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
+ def NAME#8rr : BinOpRR_F<BaseOpc, mnemonic, Xi8 , opnode>;
+ def NAME#16rr : BinOpRR_F<BaseOpc, mnemonic, Xi16, opnode>;
+ def NAME#32rr : BinOpRR_F<BaseOpc, mnemonic, Xi32, opnode>;
+ def NAME#64rr : BinOpRR_F<BaseOpc, mnemonic, Xi64, opnode>;
+ } // isCommutable
+
+ def NAME#8rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi8>;
+ def NAME#16rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi16>;
+ def NAME#32rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi32>;
+ def NAME#64rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi64>;
+
+ def NAME#8rm : BinOpRM_F<BaseOpc2, mnemonic, Xi8 , opnode>;
+ def NAME#16rm : BinOpRM_F<BaseOpc2, mnemonic, Xi16, opnode>;
+ def NAME#32rm : BinOpRM_F<BaseOpc2, mnemonic, Xi32, opnode>;
+ def NAME#64rm : BinOpRM_F<BaseOpc2, mnemonic, Xi64, opnode>;
+
+ let isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
+ // NOTE: These are order specific, we want the ri8 forms to be listed
+ // first so that they are slightly preferred to the ri forms.
+ def NAME#16ri8 : BinOpRI8_F<0x82, mnemonic, Xi16, opnode, RegMRM>;
+ def NAME#32ri8 : BinOpRI8_F<0x82, mnemonic, Xi32, opnode, RegMRM>;
+ def NAME#64ri8 : BinOpRI8_F<0x82, mnemonic, Xi64, opnode, RegMRM>;
+
+ def NAME#8ri : BinOpRI_F<0x80, mnemonic, Xi8 , opnode, RegMRM>;
+ def NAME#16ri : BinOpRI_F<0x80, mnemonic, Xi16, opnode, RegMRM>;
+ def NAME#32ri : BinOpRI_F<0x80, mnemonic, Xi32, opnode, RegMRM>;
+ def NAME#64ri32: BinOpRI_F<0x80, mnemonic, Xi64, opnode, RegMRM>;
+ }
+
+ def NAME#8mr : BinOpMR_F<BaseOpc, mnemonic, Xi8 , opnode>;
+ def NAME#16mr : BinOpMR_F<BaseOpc, mnemonic, Xi16, opnode>;
+ def NAME#32mr : BinOpMR_F<BaseOpc, mnemonic, Xi32, opnode>;
+ def NAME#64mr : BinOpMR_F<BaseOpc, mnemonic, Xi64, opnode>;
+
+ // NOTE: These are order specific, we want the mi8 forms to be listed
+ // first so that they are slightly preferred to the mi forms.
+ def NAME#16mi8 : BinOpMI8_F<mnemonic, Xi16, opnode, MemMRM>;
+ def NAME#32mi8 : BinOpMI8_F<mnemonic, Xi32, opnode, MemMRM>;
+ def NAME#64mi8 : BinOpMI8_F<mnemonic, Xi64, opnode, MemMRM>;
+
+ def NAME#8mi : BinOpMI_F<mnemonic, Xi8 , opnode, MemMRM>;
+ def NAME#16mi : BinOpMI_F<mnemonic, Xi16, opnode, MemMRM>;
+ def NAME#32mi : BinOpMI_F<mnemonic, Xi32, opnode, MemMRM>;
+ def NAME#64mi32 : BinOpMI_F<mnemonic, Xi64, opnode, MemMRM>;
+
+ def NAME#8i8 : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL,
+ "{$src, %al|AL, $src}">;
+ def NAME#16i16 : BinOpAI<BaseOpc4, mnemonic, Xi16, AX,
+ "{$src, %ax|AX, $src}">;
+ def NAME#32i32 : BinOpAI<BaseOpc4, mnemonic, Xi32, EAX,
+ "{$src, %eax|EAX, $src}">;
+ def NAME#64i32 : BinOpAI<BaseOpc4, mnemonic, Xi64, RAX,
+ "{$src, %rax|RAX, $src}">;
+ }
+}
+
+
+defm AND : ArithBinOp_RF<0x20, 0x22, 0x24, "and", MRM4r, MRM4m,
+ X86and_flag, and, 1, 0>;
+defm OR : ArithBinOp_RF<0x08, 0x0A, 0x0C, "or", MRM1r, MRM1m,
+ X86or_flag, or, 1, 0>;
+defm XOR : ArithBinOp_RF<0x30, 0x32, 0x34, "xor", MRM6r, MRM6m,
+ X86xor_flag, xor, 1, 0>;
+defm ADD : ArithBinOp_RF<0x00, 0x02, 0x04, "add", MRM0r, MRM0m,
+ X86add_flag, add, 1, 1>;
+let isCompare = 1 in {
+defm SUB : ArithBinOp_RF<0x28, 0x2A, 0x2C, "sub", MRM5r, MRM5m,
+ X86sub_flag, sub, 0, 0>;
+}
+
+// Arithmetic.
+let Uses = [EFLAGS] in {
+ defm ADC : ArithBinOp_RFF<0x10, 0x12, 0x14, "adc", MRM2r, MRM2m, X86adc_flag,
+ 1, 0>;
+ defm SBB : ArithBinOp_RFF<0x18, 0x1A, 0x1C, "sbb", MRM3r, MRM3m, X86sbb_flag,
+ 0, 0>;
+}
+
+let isCompare = 1 in {
+defm CMP : ArithBinOp_F<0x38, 0x3A, 0x3C, "cmp", MRM7r, MRM7m, X86cmp, 0, 0>;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Semantically, test instructions are similar like AND, except they don't
+// generate a result. From an encoding perspective, they are very different:
+// they don't have all the usual imm8 and REV forms, and are encoded into a
+// different space.
+def X86testpat : PatFrag<(ops node:$lhs, node:$rhs),
+ (X86cmp (and_su node:$lhs, node:$rhs), 0)>;
+
+let isCompare = 1, Defs = [EFLAGS] in {
+ let isCommutable = 1 in {
+ def TEST8rr : BinOpRR_F<0x84, "test", Xi8 , X86testpat, MRMSrcReg>;
+ def TEST16rr : BinOpRR_F<0x84, "test", Xi16, X86testpat, MRMSrcReg>;
+ def TEST32rr : BinOpRR_F<0x84, "test", Xi32, X86testpat, MRMSrcReg>;
+ def TEST64rr : BinOpRR_F<0x84, "test", Xi64, X86testpat, MRMSrcReg>;
+ } // isCommutable
+
+ def TEST8rm : BinOpRM_F<0x84, "test", Xi8 , X86testpat>;
+ def TEST16rm : BinOpRM_F<0x84, "test", Xi16, X86testpat>;
+ def TEST32rm : BinOpRM_F<0x84, "test", Xi32, X86testpat>;
+ def TEST64rm : BinOpRM_F<0x84, "test", Xi64, X86testpat>;
+
+ def TEST8ri : BinOpRI_F<0xF6, "test", Xi8 , X86testpat, MRM0r>;
+ def TEST16ri : BinOpRI_F<0xF6, "test", Xi16, X86testpat, MRM0r>;
+ def TEST32ri : BinOpRI_F<0xF6, "test", Xi32, X86testpat, MRM0r>;
+ def TEST64ri32 : BinOpRI_F<0xF6, "test", Xi64, X86testpat, MRM0r>;
+
+ def TEST8mi : BinOpMI_F<"test", Xi8 , X86testpat, MRM0m, 0xF6>;
+ def TEST16mi : BinOpMI_F<"test", Xi16, X86testpat, MRM0m, 0xF6>;
+ def TEST32mi : BinOpMI_F<"test", Xi32, X86testpat, MRM0m, 0xF6>;
+ def TEST64mi32 : BinOpMI_F<"test", Xi64, X86testpat, MRM0m, 0xF6>;
+
+ def TEST8i8 : BinOpAI<0xA8, "test", Xi8 , AL,
+ "{$src, %al|AL, $src}">;
+ def TEST16i16 : BinOpAI<0xA8, "test", Xi16, AX,
+ "{$src, %ax|AX, $src}">;
+ def TEST32i32 : BinOpAI<0xA8, "test", Xi32, EAX,
+ "{$src, %eax|EAX, $src}">;
+ def TEST64i32 : BinOpAI<0xA8, "test", Xi64, RAX,
+ "{$src, %rax|RAX, $src}">;
+
+ // When testing the result of EXTRACT_SUBREG sub_8bit_hi, make sure the
+ // register class is constrained to GR8_NOREX.
+ let isPseudo = 1 in
+ def TEST8ri_NOREX : I<0, Pseudo, (outs), (ins GR8_NOREX:$src, i8imm:$mask),
+ "", [], IIC_BIN_NONMEM>, Sched<[WriteALU]>;
+}
+
+//===----------------------------------------------------------------------===//
+// ANDN Instruction
+//
+multiclass bmi_andn<string mnemonic, RegisterClass RC, X86MemOperand x86memop,
+ PatFrag ld_frag> {
+ def rr : I<0xF2, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+ !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst, EFLAGS, (X86and_flag (not RC:$src1), RC:$src2))],
+ IIC_BIN_NONMEM>, Sched<[WriteALU]>;
+ def rm : I<0xF2, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+ !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst, EFLAGS,
+ (X86and_flag (not RC:$src1), (ld_frag addr:$src2)))], IIC_BIN_MEM>,
+ Sched<[WriteALULd, ReadAfterLd]>;
+}
+
+let Predicates = [HasBMI], Defs = [EFLAGS] in {
+ defm ANDN32 : bmi_andn<"andn{l}", GR32, i32mem, loadi32>, T8, VEX_4V;
+ defm ANDN64 : bmi_andn<"andn{q}", GR64, i64mem, loadi64>, T8, VEX_4V, VEX_W;
+}
+
+let Predicates = [HasBMI] in {
+ def : Pat<(and (not GR32:$src1), GR32:$src2),
+ (ANDN32rr GR32:$src1, GR32:$src2)>;
+ def : Pat<(and (not GR64:$src1), GR64:$src2),
+ (ANDN64rr GR64:$src1, GR64:$src2)>;
+ def : Pat<(and (not GR32:$src1), (loadi32 addr:$src2)),
+ (ANDN32rm GR32:$src1, addr:$src2)>;
+ def : Pat<(and (not GR64:$src1), (loadi64 addr:$src2)),
+ (ANDN64rm GR64:$src1, addr:$src2)>;
+}
+
+//===----------------------------------------------------------------------===//
+// MULX Instruction
+//
+multiclass bmi_mulx<string mnemonic, RegisterClass RC, X86MemOperand x86memop> {
+let neverHasSideEffects = 1 in {
+ let isCommutable = 1 in
+ def rr : I<0xF6, MRMSrcReg, (outs RC:$dst1, RC:$dst2), (ins RC:$src),
+ !strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"),
+ [], IIC_MUL8>, T8XD, VEX_4V, Sched<[WriteIMul]>;
+
+ let mayLoad = 1 in
+ def rm : I<0xF6, MRMSrcMem, (outs RC:$dst1, RC:$dst2), (ins x86memop:$src),
+ !strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"),
+ [], IIC_MUL8>, T8XD, VEX_4V, Sched<[WriteIMulLd]>;
+}
+}
+
+let Predicates = [HasBMI2] in {
+ let Uses = [EDX] in
+ defm MULX32 : bmi_mulx<"mulx{l}", GR32, i32mem>;
+ let Uses = [RDX] in
+ defm MULX64 : bmi_mulx<"mulx{q}", GR64, i64mem>, VEX_W;
+}
+
+//===----------------------------------------------------------------------===//
+// ADCX Instruction
+//
+let hasSideEffects = 0, Predicates = [HasADX], Defs = [EFLAGS] in {
+ let SchedRW = [WriteALU] in {
+ def ADCX32rr : I<0xF6, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+ "adcx{l}\t{$src, $dst|$dst, $src}",
+ [], IIC_BIN_NONMEM>, T8, OpSize;
+
+ def ADCX64rr : I<0xF6, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+ "adcx{q}\t{$src, $dst|$dst, $src}",
+ [], IIC_BIN_NONMEM>, T8, OpSize, REX_W, Requires<[In64BitMode]>;
+ } // SchedRW
+
+ let mayLoad = 1, SchedRW = [WriteALULd] in {
+ def ADCX32rm : I<0xF6, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+ "adcx{l}\t{$src, $dst|$dst, $src}",
+ [], IIC_BIN_MEM>, T8, OpSize;
+
+ def ADCX64rm : I<0xF6, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+ "adcx{q}\t{$src, $dst|$dst, $src}",
+ [], IIC_BIN_MEM>, T8, OpSize, REX_W, Requires<[In64BitMode]>;
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// ADOX Instruction
+//
+let hasSideEffects = 0, Predicates = [HasADX], Defs = [EFLAGS] in {
+ let SchedRW = [WriteALU] in {
+ def ADOX32rr : I<0xF6, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+ "adox{l}\t{$src, $dst|$dst, $src}",
+ [], IIC_BIN_NONMEM>, T8XS;
+
+ def ADOX64rr : I<0xF6, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+ "adox{q}\t{$src, $dst|$dst, $src}",
+ [], IIC_BIN_NONMEM>, T8XS, REX_W, Requires<[In64BitMode]>;
+ } // SchedRW
+
+ let mayLoad = 1, SchedRW = [WriteALULd] in {
+ def ADOX32rm : I<0xF6, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+ "adox{l}\t{$src, $dst|$dst, $src}",
+ [], IIC_BIN_MEM>, T8XS;
+
+ def ADOX64rm : I<0xF6, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+ "adox{q}\t{$src, $dst|$dst, $src}",
+ [], IIC_BIN_MEM>, T8XS, REX_W, Requires<[In64BitMode]>;
+ }
+}
diff --git a/contrib/llvm/lib/Target/X86/X86InstrBuilder.h b/contrib/llvm/lib/Target/X86/X86InstrBuilder.h
new file mode 100644
index 0000000..aaef4a4
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrBuilder.h
@@ -0,0 +1,183 @@
+//===-- X86InstrBuilder.h - Functions to aid building x86 insts -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file exposes functions that may be used with BuildMI from the
+// MachineInstrBuilder.h file to handle X86'isms in a clean way.
+//
+// The BuildMem function may be used with the BuildMI function to add entire
+// memory references in a single, typed, function call. X86 memory references
+// can be very complex expressions (described in the README), so wrapping them
+// up behind an easier to use interface makes sense. Descriptions of the
+// functions are included below.
+//
+// For reference, the order of operands for memory references is:
+// (Operand), Base, Scale, Index, Displacement.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86INSTRBUILDER_H
+#define X86INSTRBUILDER_H
+
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+
+namespace llvm {
+
+/// X86AddressMode - This struct holds a generalized full x86 address mode.
+/// The base register can be a frame index, which will eventually be replaced
+/// with BP or SP and Disp being offsetted accordingly. The displacement may
+/// also include the offset of a global value.
+struct X86AddressMode {
+ enum {
+ RegBase,
+ FrameIndexBase
+ } BaseType;
+
+ union {
+ unsigned Reg;
+ int FrameIndex;
+ } Base;
+
+ unsigned Scale;
+ unsigned IndexReg;
+ int Disp;
+ const GlobalValue *GV;
+ unsigned GVOpFlags;
+
+ X86AddressMode()
+ : BaseType(RegBase), Scale(1), IndexReg(0), Disp(0), GV(0), GVOpFlags(0) {
+ Base.Reg = 0;
+ }
+
+
+ void getFullAddress(SmallVectorImpl<MachineOperand> &MO) {
+ assert(Scale == 1 || Scale == 2 || Scale == 4 || Scale == 8);
+
+ if (BaseType == X86AddressMode::RegBase)
+ MO.push_back(MachineOperand::CreateReg(Base.Reg, false, false,
+ false, false, false, 0, false));
+ else {
+ assert(BaseType == X86AddressMode::FrameIndexBase);
+ MO.push_back(MachineOperand::CreateFI(Base.FrameIndex));
+ }
+
+ MO.push_back(MachineOperand::CreateImm(Scale));
+ MO.push_back(MachineOperand::CreateReg(IndexReg, false, false,
+ false, false, false, 0, false));
+
+ if (GV)
+ MO.push_back(MachineOperand::CreateGA(GV, Disp, GVOpFlags));
+ else
+ MO.push_back(MachineOperand::CreateImm(Disp));
+
+ MO.push_back(MachineOperand::CreateReg(0, false, false,
+ false, false, false, 0, false));
+ }
+};
+
+/// addDirectMem - This function is used to add a direct memory reference to the
+/// current instruction -- that is, a dereference of an address in a register,
+/// with no scale, index or displacement. An example is: DWORD PTR [EAX].
+///
+static inline const MachineInstrBuilder &
+addDirectMem(const MachineInstrBuilder &MIB, unsigned Reg) {
+ // Because memory references are always represented with five
+ // values, this adds: Reg, 1, NoReg, 0, NoReg to the instruction.
+ return MIB.addReg(Reg).addImm(1).addReg(0).addImm(0).addReg(0);
+}
+
+
+static inline const MachineInstrBuilder &
+addOffset(const MachineInstrBuilder &MIB, int Offset) {
+ return MIB.addImm(1).addReg(0).addImm(Offset).addReg(0);
+}
+
+/// addRegOffset - This function is used to add a memory reference of the form
+/// [Reg + Offset], i.e., one with no scale or index, but with a
+/// displacement. An example is: DWORD PTR [EAX + 4].
+///
+static inline const MachineInstrBuilder &
+addRegOffset(const MachineInstrBuilder &MIB,
+ unsigned Reg, bool isKill, int Offset) {
+ return addOffset(MIB.addReg(Reg, getKillRegState(isKill)), Offset);
+}
+
+/// addRegReg - This function is used to add a memory reference of the form:
+/// [Reg + Reg].
+static inline const MachineInstrBuilder &addRegReg(const MachineInstrBuilder &MIB,
+ unsigned Reg1, bool isKill1,
+ unsigned Reg2, bool isKill2) {
+ return MIB.addReg(Reg1, getKillRegState(isKill1)).addImm(1)
+ .addReg(Reg2, getKillRegState(isKill2)).addImm(0).addReg(0);
+}
+
+static inline const MachineInstrBuilder &
+addFullAddress(const MachineInstrBuilder &MIB,
+ const X86AddressMode &AM) {
+ assert(AM.Scale == 1 || AM.Scale == 2 || AM.Scale == 4 || AM.Scale == 8);
+
+ if (AM.BaseType == X86AddressMode::RegBase)
+ MIB.addReg(AM.Base.Reg);
+ else {
+ assert(AM.BaseType == X86AddressMode::FrameIndexBase);
+ MIB.addFrameIndex(AM.Base.FrameIndex);
+ }
+
+ MIB.addImm(AM.Scale).addReg(AM.IndexReg);
+ if (AM.GV)
+ MIB.addGlobalAddress(AM.GV, AM.Disp, AM.GVOpFlags);
+ else
+ MIB.addImm(AM.Disp);
+
+ return MIB.addReg(0);
+}
+
+/// addFrameReference - This function is used to add a reference to the base of
+/// an abstract object on the stack frame of the current function. This
+/// reference has base register as the FrameIndex offset until it is resolved.
+/// This allows a constant offset to be specified as well...
+///
+static inline const MachineInstrBuilder &
+addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset = 0) {
+ MachineInstr *MI = MIB;
+ MachineFunction &MF = *MI->getParent()->getParent();
+ MachineFrameInfo &MFI = *MF.getFrameInfo();
+ const MCInstrDesc &MCID = MI->getDesc();
+ unsigned Flags = 0;
+ if (MCID.mayLoad())
+ Flags |= MachineMemOperand::MOLoad;
+ if (MCID.mayStore())
+ Flags |= MachineMemOperand::MOStore;
+ MachineMemOperand *MMO =
+ MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FI, Offset),
+ Flags, MFI.getObjectSize(FI),
+ MFI.getObjectAlignment(FI));
+ return addOffset(MIB.addFrameIndex(FI), Offset)
+ .addMemOperand(MMO);
+}
+
+/// addConstantPoolReference - This function is used to add a reference to the
+/// base of a constant value spilled to the per-function constant pool. The
+/// reference uses the abstract ConstantPoolIndex which is retained until
+/// either machine code emission or assembly output. In PIC mode on x86-32,
+/// the GlobalBaseReg parameter can be used to make this a
+/// GlobalBaseReg-relative reference.
+///
+static inline const MachineInstrBuilder &
+addConstantPoolReference(const MachineInstrBuilder &MIB, unsigned CPI,
+ unsigned GlobalBaseReg, unsigned char OpFlags) {
+ //FIXME: factor this
+ return MIB.addReg(GlobalBaseReg).addImm(1).addReg(0)
+ .addConstantPoolIndex(CPI, 0, OpFlags).addReg(0);
+}
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86InstrCMovSetCC.td b/contrib/llvm/lib/Target/X86/X86InstrCMovSetCC.td
new file mode 100644
index 0000000..a967a4d
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrCMovSetCC.td
@@ -0,0 +1,111 @@
+//===-- X86InstrCMovSetCC.td - Conditional Move and SetCC --*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 conditional move and set on condition
+// instructions.
+//
+//===----------------------------------------------------------------------===//
+
+
+// SetCC instructions.
+multiclass CMOV<bits<8> opc, string Mnemonic, PatLeaf CondNode> {
+ let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst",
+ isCommutable = 1, SchedRW = [WriteALU] in {
+ def NAME#16rr
+ : I<opc, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+ !strconcat(Mnemonic, "{w}\t{$src2, $dst|$dst, $src2}"),
+ [(set GR16:$dst,
+ (X86cmov GR16:$src1, GR16:$src2, CondNode, EFLAGS))],
+ IIC_CMOV16_RR>,TB,OpSize;
+ def NAME#32rr
+ : I<opc, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+ !strconcat(Mnemonic, "{l}\t{$src2, $dst|$dst, $src2}"),
+ [(set GR32:$dst,
+ (X86cmov GR32:$src1, GR32:$src2, CondNode, EFLAGS))],
+ IIC_CMOV32_RR>, TB;
+ def NAME#64rr
+ :RI<opc, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ !strconcat(Mnemonic, "{q}\t{$src2, $dst|$dst, $src2}"),
+ [(set GR64:$dst,
+ (X86cmov GR64:$src1, GR64:$src2, CondNode, EFLAGS))],
+ IIC_CMOV32_RR>, TB;
+ }
+
+ let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst",
+ SchedRW = [WriteALULd, ReadAfterLd] in {
+ def NAME#16rm
+ : I<opc, MRMSrcMem, (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
+ !strconcat(Mnemonic, "{w}\t{$src2, $dst|$dst, $src2}"),
+ [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+ CondNode, EFLAGS))], IIC_CMOV16_RM>,
+ TB, OpSize;
+ def NAME#32rm
+ : I<opc, MRMSrcMem, (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+ !strconcat(Mnemonic, "{l}\t{$src2, $dst|$dst, $src2}"),
+ [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+ CondNode, EFLAGS))], IIC_CMOV32_RM>, TB;
+ def NAME#64rm
+ :RI<opc, MRMSrcMem, (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
+ !strconcat(Mnemonic, "{q}\t{$src2, $dst|$dst, $src2}"),
+ [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+ CondNode, EFLAGS))], IIC_CMOV32_RM>, TB;
+ } // Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst"
+} // end multiclass
+
+
+// Conditional Moves.
+defm CMOVO : CMOV<0x40, "cmovo" , X86_COND_O>;
+defm CMOVNO : CMOV<0x41, "cmovno", X86_COND_NO>;
+defm CMOVB : CMOV<0x42, "cmovb" , X86_COND_B>;
+defm CMOVAE : CMOV<0x43, "cmovae", X86_COND_AE>;
+defm CMOVE : CMOV<0x44, "cmove" , X86_COND_E>;
+defm CMOVNE : CMOV<0x45, "cmovne", X86_COND_NE>;
+defm CMOVBE : CMOV<0x46, "cmovbe", X86_COND_BE>;
+defm CMOVA : CMOV<0x47, "cmova" , X86_COND_A>;
+defm CMOVS : CMOV<0x48, "cmovs" , X86_COND_S>;
+defm CMOVNS : CMOV<0x49, "cmovns", X86_COND_NS>;
+defm CMOVP : CMOV<0x4A, "cmovp" , X86_COND_P>;
+defm CMOVNP : CMOV<0x4B, "cmovnp", X86_COND_NP>;
+defm CMOVL : CMOV<0x4C, "cmovl" , X86_COND_L>;
+defm CMOVGE : CMOV<0x4D, "cmovge", X86_COND_GE>;
+defm CMOVLE : CMOV<0x4E, "cmovle", X86_COND_LE>;
+defm CMOVG : CMOV<0x4F, "cmovg" , X86_COND_G>;
+
+
+// SetCC instructions.
+multiclass SETCC<bits<8> opc, string Mnemonic, PatLeaf OpNode> {
+ let Uses = [EFLAGS] in {
+ def r : I<opc, MRM0r, (outs GR8:$dst), (ins),
+ !strconcat(Mnemonic, "\t$dst"),
+ [(set GR8:$dst, (X86setcc OpNode, EFLAGS))],
+ IIC_SET_R>, TB, Sched<[WriteALU]>;
+ def m : I<opc, MRM0m, (outs), (ins i8mem:$dst),
+ !strconcat(Mnemonic, "\t$dst"),
+ [(store (X86setcc OpNode, EFLAGS), addr:$dst)],
+ IIC_SET_M>, TB, Sched<[WriteALU, WriteStore]>;
+ } // Uses = [EFLAGS]
+}
+
+defm SETO : SETCC<0x90, "seto", X86_COND_O>; // is overflow bit set
+defm SETNO : SETCC<0x91, "setno", X86_COND_NO>; // is overflow bit not set
+defm SETB : SETCC<0x92, "setb", X86_COND_B>; // unsigned less than
+defm SETAE : SETCC<0x93, "setae", X86_COND_AE>; // unsigned greater or equal
+defm SETE : SETCC<0x94, "sete", X86_COND_E>; // equal to
+defm SETNE : SETCC<0x95, "setne", X86_COND_NE>; // not equal to
+defm SETBE : SETCC<0x96, "setbe", X86_COND_BE>; // unsigned less than or equal
+defm SETA : SETCC<0x97, "seta", X86_COND_A>; // unsigned greater than
+defm SETS : SETCC<0x98, "sets", X86_COND_S>; // is signed bit set
+defm SETNS : SETCC<0x99, "setns", X86_COND_NS>; // is not signed
+defm SETP : SETCC<0x9A, "setp", X86_COND_P>; // is parity bit set
+defm SETNP : SETCC<0x9B, "setnp", X86_COND_NP>; // is parity bit not set
+defm SETL : SETCC<0x9C, "setl", X86_COND_L>; // signed less than
+defm SETGE : SETCC<0x9D, "setge", X86_COND_GE>; // signed greater or equal
+defm SETLE : SETCC<0x9E, "setle", X86_COND_LE>; // signed less than or equal
+defm SETG : SETCC<0x9F, "setg", X86_COND_G>; // signed greater than
+
diff --git a/contrib/llvm/lib/Target/X86/X86InstrCompiler.td b/contrib/llvm/lib/Target/X86/X86InstrCompiler.td
new file mode 100644
index 0000000..d9ff0c6
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -0,0 +1,1839 @@
+//===- X86InstrCompiler.td - Compiler Pseudos and Patterns -*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the various pseudo instructions used by the compiler,
+// as well as Pat patterns used during instruction selection.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Pattern Matching Support
+
+def GetLo32XForm : SDNodeXForm<imm, [{
+ // Transformation function: get the low 32 bits.
+ return getI32Imm((unsigned)N->getZExtValue());
+}]>;
+
+def GetLo8XForm : SDNodeXForm<imm, [{
+ // Transformation function: get the low 8 bits.
+ return getI8Imm((uint8_t)N->getZExtValue());
+}]>;
+
+
+//===----------------------------------------------------------------------===//
+// Random Pseudo Instructions.
+
+// PIC base construction. This expands to code that looks like this:
+// call $next_inst
+// popl %destreg"
+let neverHasSideEffects = 1, isNotDuplicable = 1, Uses = [ESP] in
+ def MOVPC32r : Ii32<0xE8, Pseudo, (outs GR32:$reg), (ins i32imm:$label),
+ "", []>;
+
+
+// ADJCALLSTACKDOWN/UP implicitly use/def ESP because they may be expanded into
+// a stack adjustment and the codegen must know that they may modify the stack
+// pointer before prolog-epilog rewriting occurs.
+// Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
+// sub / add which can clobber EFLAGS.
+let Defs = [ESP, EFLAGS], Uses = [ESP] in {
+def ADJCALLSTACKDOWN32 : I<0, Pseudo, (outs), (ins i32imm:$amt),
+ "#ADJCALLSTACKDOWN",
+ [(X86callseq_start timm:$amt)]>,
+ Requires<[In32BitMode]>;
+def ADJCALLSTACKUP32 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
+ "#ADJCALLSTACKUP",
+ [(X86callseq_end timm:$amt1, timm:$amt2)]>,
+ Requires<[In32BitMode]>;
+}
+
+// ADJCALLSTACKDOWN/UP implicitly use/def RSP because they may be expanded into
+// a stack adjustment and the codegen must know that they may modify the stack
+// pointer before prolog-epilog rewriting occurs.
+// Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
+// sub / add which can clobber EFLAGS.
+let Defs = [RSP, EFLAGS], Uses = [RSP] in {
+def ADJCALLSTACKDOWN64 : I<0, Pseudo, (outs), (ins i32imm:$amt),
+ "#ADJCALLSTACKDOWN",
+ [(X86callseq_start timm:$amt)]>,
+ Requires<[In64BitMode]>;
+def ADJCALLSTACKUP64 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
+ "#ADJCALLSTACKUP",
+ [(X86callseq_end timm:$amt1, timm:$amt2)]>,
+ Requires<[In64BitMode]>;
+}
+
+
+
+// x86-64 va_start lowering magic.
+let usesCustomInserter = 1 in {
+def VASTART_SAVE_XMM_REGS : I<0, Pseudo,
+ (outs),
+ (ins GR8:$al,
+ i64imm:$regsavefi, i64imm:$offset,
+ variable_ops),
+ "#VASTART_SAVE_XMM_REGS $al, $regsavefi, $offset",
+ [(X86vastart_save_xmm_regs GR8:$al,
+ imm:$regsavefi,
+ imm:$offset)]>;
+
+// The VAARG_64 pseudo-instruction takes the address of the va_list,
+// and places the address of the next argument into a register.
+let Defs = [EFLAGS] in
+def VAARG_64 : I<0, Pseudo,
+ (outs GR64:$dst),
+ (ins i8mem:$ap, i32imm:$size, i8imm:$mode, i32imm:$align),
+ "#VAARG_64 $dst, $ap, $size, $mode, $align",
+ [(set GR64:$dst,
+ (X86vaarg64 addr:$ap, imm:$size, imm:$mode, imm:$align)),
+ (implicit EFLAGS)]>;
+
+// Dynamic stack allocation yields a _chkstk or _alloca call for all Windows
+// targets. These calls are needed to probe the stack when allocating more than
+// 4k bytes in one go. Touching the stack at 4K increments is necessary to
+// ensure that the guard pages used by the OS virtual memory manager are
+// allocated in correct sequence.
+// The main point of having separate instruction are extra unmodelled effects
+// (compared to ordinary calls) like stack pointer change.
+
+let Defs = [EAX, ESP, EFLAGS], Uses = [ESP] in
+ def WIN_ALLOCA : I<0, Pseudo, (outs), (ins),
+ "# dynamic stack allocation",
+ [(X86WinAlloca)]>;
+
+// When using segmented stacks these are lowered into instructions which first
+// check if the current stacklet has enough free memory. If it does, memory is
+// allocated by bumping the stack pointer. Otherwise memory is allocated from
+// the heap.
+
+let Defs = [EAX, ESP, EFLAGS], Uses = [ESP] in
+def SEG_ALLOCA_32 : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$size),
+ "# variable sized alloca for segmented stacks",
+ [(set GR32:$dst,
+ (X86SegAlloca GR32:$size))]>,
+ Requires<[In32BitMode]>;
+
+let Defs = [RAX, RSP, EFLAGS], Uses = [RSP] in
+def SEG_ALLOCA_64 : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$size),
+ "# variable sized alloca for segmented stacks",
+ [(set GR64:$dst,
+ (X86SegAlloca GR64:$size))]>,
+ Requires<[In64BitMode]>;
+}
+
+// The MSVC runtime contains an _ftol2 routine for converting floating-point
+// to integer values. It has a strange calling convention: the input is
+// popped from the x87 stack, and the return value is given in EDX:EAX. No
+// other registers (aside from flags) are touched.
+// Microsoft toolchains do not support 80-bit precision, so a WIN_FTOL_80
+// variant is unnecessary.
+
+let Defs = [EAX, EDX, EFLAGS], FPForm = SpecialFP in {
+ def WIN_FTOL_32 : I<0, Pseudo, (outs), (ins RFP32:$src),
+ "# win32 fptoui",
+ [(X86WinFTOL RFP32:$src)]>,
+ Requires<[In32BitMode]>;
+
+ def WIN_FTOL_64 : I<0, Pseudo, (outs), (ins RFP64:$src),
+ "# win32 fptoui",
+ [(X86WinFTOL RFP64:$src)]>,
+ Requires<[In32BitMode]>;
+}
+
+//===----------------------------------------------------------------------===//
+// EH Pseudo Instructions
+//
+let SchedRW = [WriteSystem] in {
+let isTerminator = 1, isReturn = 1, isBarrier = 1,
+ hasCtrlDep = 1, isCodeGenOnly = 1 in {
+def EH_RETURN : I<0xC3, RawFrm, (outs), (ins GR32:$addr),
+ "ret\t#eh_return, addr: $addr",
+ [(X86ehret GR32:$addr)], IIC_RET>, Sched<[WriteJumpLd]>;
+
+}
+
+let isTerminator = 1, isReturn = 1, isBarrier = 1,
+ hasCtrlDep = 1, isCodeGenOnly = 1 in {
+def EH_RETURN64 : I<0xC3, RawFrm, (outs), (ins GR64:$addr),
+ "ret\t#eh_return, addr: $addr",
+ [(X86ehret GR64:$addr)], IIC_RET>, Sched<[WriteJumpLd]>;
+
+}
+
+let hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1,
+ usesCustomInserter = 1 in {
+ def EH_SjLj_SetJmp32 : I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$buf),
+ "#EH_SJLJ_SETJMP32",
+ [(set GR32:$dst, (X86eh_sjlj_setjmp addr:$buf))]>,
+ Requires<[In32BitMode]>;
+ def EH_SjLj_SetJmp64 : I<0, Pseudo, (outs GR32:$dst), (ins i64mem:$buf),
+ "#EH_SJLJ_SETJMP64",
+ [(set GR32:$dst, (X86eh_sjlj_setjmp addr:$buf))]>,
+ Requires<[In64BitMode]>;
+ let isTerminator = 1 in {
+ def EH_SjLj_LongJmp32 : I<0, Pseudo, (outs), (ins i32mem:$buf),
+ "#EH_SJLJ_LONGJMP32",
+ [(X86eh_sjlj_longjmp addr:$buf)]>,
+ Requires<[In32BitMode]>;
+ def EH_SjLj_LongJmp64 : I<0, Pseudo, (outs), (ins i64mem:$buf),
+ "#EH_SJLJ_LONGJMP64",
+ [(X86eh_sjlj_longjmp addr:$buf)]>,
+ Requires<[In64BitMode]>;
+ }
+}
+} // SchedRW
+
+let isBranch = 1, isTerminator = 1, isCodeGenOnly = 1 in {
+ def EH_SjLj_Setup : I<0, Pseudo, (outs), (ins brtarget:$dst),
+ "#EH_SjLj_Setup\t$dst", []>;
+}
+
+//===----------------------------------------------------------------------===//
+// Pseudo instructions used by segmented stacks.
+//
+
+// This is lowered into a RET instruction by MCInstLower. We need
+// this so that we don't have to have a MachineBasicBlock which ends
+// with a RET and also has successors.
+let isPseudo = 1 in {
+def MORESTACK_RET: I<0, Pseudo, (outs), (ins),
+ "", []>;
+
+// This instruction is lowered to a RET followed by a MOV. The two
+// instructions are not generated on a higher level since then the
+// verifier sees a MachineBasicBlock ending with a non-terminator.
+def MORESTACK_RET_RESTORE_R10 : I<0, Pseudo, (outs), (ins),
+ "", []>;
+}
+
+//===----------------------------------------------------------------------===//
+// Alias Instructions
+//===----------------------------------------------------------------------===//
+
+// Alias instructions that map movr0 to xor.
+// FIXME: remove when we can teach regalloc that xor reg, reg is ok.
+// FIXME: Set encoding to pseudo.
+let Defs = [EFLAGS], isReMaterializable = 1, isAsCheapAsAMove = 1,
+ isCodeGenOnly = 1 in {
+def MOV8r0 : I<0x30, MRMInitReg, (outs GR8 :$dst), (ins), "",
+ [(set GR8:$dst, 0)], IIC_ALU_NONMEM>, Sched<[WriteZero]>;
+
+// We want to rewrite MOV16r0 in terms of MOV32r0, because it's a smaller
+// encoding and avoids a partial-register update sometimes, but doing so
+// at isel time interferes with rematerialization in the current register
+// allocator. For now, this is rewritten when the instruction is lowered
+// to an MCInst.
+def MOV16r0 : I<0x31, MRMInitReg, (outs GR16:$dst), (ins),
+ "",
+ [(set GR16:$dst, 0)], IIC_ALU_NONMEM>, OpSize,
+ Sched<[WriteZero]>;
+
+// FIXME: Set encoding to pseudo.
+def MOV32r0 : I<0x31, MRMInitReg, (outs GR32:$dst), (ins), "",
+ [(set GR32:$dst, 0)], IIC_ALU_NONMEM>, Sched<[WriteZero]>;
+}
+
+// We want to rewrite MOV64r0 in terms of MOV32r0, because it's sometimes a
+// smaller encoding, but doing so at isel time interferes with rematerialization
+// in the current register allocator. For now, this is rewritten when the
+// instruction is lowered to an MCInst.
+// FIXME: AddedComplexity gives this a higher priority than MOV64ri32. Remove
+// when we have a better way to specify isel priority.
+let Defs = [EFLAGS], isCodeGenOnly=1,
+ AddedComplexity = 1, isReMaterializable = 1, isAsCheapAsAMove = 1 in
+def MOV64r0 : I<0x31, MRMInitReg, (outs GR64:$dst), (ins), "",
+ [(set GR64:$dst, 0)], IIC_ALU_NONMEM>, Sched<[WriteZero]>;
+
+// Materialize i64 constant where top 32-bits are zero. This could theoretically
+// use MOV32ri with a SUBREG_TO_REG to represent the zero-extension, however
+// that would make it more difficult to rematerialize.
+let AddedComplexity = 1, isReMaterializable = 1, isAsCheapAsAMove = 1,
+ isCodeGenOnly = 1 in
+def MOV64ri64i32 : Ii32<0xB8, AddRegFrm, (outs GR64:$dst), (ins i64i32imm:$src),
+ "", [(set GR64:$dst, i64immZExt32:$src)],
+ IIC_ALU_NONMEM>, Sched<[WriteALU]>;
+
+// Use sbb to materialize carry bit.
+let Uses = [EFLAGS], Defs = [EFLAGS], isPseudo = 1, SchedRW = [WriteALU] in {
+// FIXME: These are pseudo ops that should be replaced with Pat<> patterns.
+// However, Pat<> can't replicate the destination reg into the inputs of the
+// result.
+def SETB_C8r : I<0, Pseudo, (outs GR8:$dst), (ins), "",
+ [(set GR8:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>;
+def SETB_C16r : I<0, Pseudo, (outs GR16:$dst), (ins), "",
+ [(set GR16:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>;
+def SETB_C32r : I<0, Pseudo, (outs GR32:$dst), (ins), "",
+ [(set GR32:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>;
+def SETB_C64r : I<0, Pseudo, (outs GR64:$dst), (ins), "",
+ [(set GR64:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>;
+} // isCodeGenOnly
+
+
+def : Pat<(i16 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
+ (SETB_C16r)>;
+def : Pat<(i32 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
+ (SETB_C32r)>;
+def : Pat<(i64 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
+ (SETB_C64r)>;
+
+def : Pat<(i16 (sext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
+ (SETB_C16r)>;
+def : Pat<(i32 (sext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
+ (SETB_C32r)>;
+def : Pat<(i64 (sext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
+ (SETB_C64r)>;
+
+// We canonicalize 'setb' to "(and (sbb reg,reg), 1)" on the hope that the and
+// will be eliminated and that the sbb can be extended up to a wider type. When
+// this happens, it is great. However, if we are left with an 8-bit sbb and an
+// and, we might as well just match it as a setb.
+def : Pat<(and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1),
+ (SETBr)>;
+
+// (add OP, SETB) -> (adc OP, 0)
+def : Pat<(add (and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1), GR8:$op),
+ (ADC8ri GR8:$op, 0)>;
+def : Pat<(add (and (i32 (X86setcc_c X86_COND_B, EFLAGS)), 1), GR32:$op),
+ (ADC32ri8 GR32:$op, 0)>;
+def : Pat<(add (and (i64 (X86setcc_c X86_COND_B, EFLAGS)), 1), GR64:$op),
+ (ADC64ri8 GR64:$op, 0)>;
+
+// (sub OP, SETB) -> (sbb OP, 0)
+def : Pat<(sub GR8:$op, (and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1)),
+ (SBB8ri GR8:$op, 0)>;
+def : Pat<(sub GR32:$op, (and (i32 (X86setcc_c X86_COND_B, EFLAGS)), 1)),
+ (SBB32ri8 GR32:$op, 0)>;
+def : Pat<(sub GR64:$op, (and (i64 (X86setcc_c X86_COND_B, EFLAGS)), 1)),
+ (SBB64ri8 GR64:$op, 0)>;
+
+// (sub OP, SETCC_CARRY) -> (adc OP, 0)
+def : Pat<(sub GR8:$op, (i8 (X86setcc_c X86_COND_B, EFLAGS))),
+ (ADC8ri GR8:$op, 0)>;
+def : Pat<(sub GR32:$op, (i32 (X86setcc_c X86_COND_B, EFLAGS))),
+ (ADC32ri8 GR32:$op, 0)>;
+def : Pat<(sub GR64:$op, (i64 (X86setcc_c X86_COND_B, EFLAGS))),
+ (ADC64ri8 GR64:$op, 0)>;
+
+//===----------------------------------------------------------------------===//
+// String Pseudo Instructions
+//
+let SchedRW = [WriteMicrocoded] in {
+let Defs = [ECX,EDI,ESI], Uses = [ECX,EDI,ESI], isCodeGenOnly = 1 in {
+def REP_MOVSB_32 : I<0xA4, RawFrm, (outs), (ins), "{rep;movsb|rep movsb}",
+ [(X86rep_movs i8)], IIC_REP_MOVS>, REP,
+ Requires<[In32BitMode]>;
+def REP_MOVSW_32 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsw|rep movsw}",
+ [(X86rep_movs i16)], IIC_REP_MOVS>, REP, OpSize,
+ Requires<[In32BitMode]>;
+def REP_MOVSD_32 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsl|rep movsd}",
+ [(X86rep_movs i32)], IIC_REP_MOVS>, REP,
+ Requires<[In32BitMode]>;
+}
+
+let Defs = [RCX,RDI,RSI], Uses = [RCX,RDI,RSI], isCodeGenOnly = 1 in {
+def REP_MOVSB_64 : I<0xA4, RawFrm, (outs), (ins), "{rep;movsb|rep movsb}",
+ [(X86rep_movs i8)], IIC_REP_MOVS>, REP,
+ Requires<[In64BitMode]>;
+def REP_MOVSW_64 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsw|rep movsw}",
+ [(X86rep_movs i16)], IIC_REP_MOVS>, REP, OpSize,
+ Requires<[In64BitMode]>;
+def REP_MOVSD_64 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsl|rep movsd}",
+ [(X86rep_movs i32)], IIC_REP_MOVS>, REP,
+ Requires<[In64BitMode]>;
+def REP_MOVSQ_64 : RI<0xA5, RawFrm, (outs), (ins), "{rep;movsq|rep movsq}",
+ [(X86rep_movs i64)], IIC_REP_MOVS>, REP,
+ Requires<[In64BitMode]>;
+}
+
+// FIXME: Should use "(X86rep_stos AL)" as the pattern.
+let Defs = [ECX,EDI], isCodeGenOnly = 1 in {
+ let Uses = [AL,ECX,EDI] in
+ def REP_STOSB_32 : I<0xAA, RawFrm, (outs), (ins), "{rep;stosb|rep stosb}",
+ [(X86rep_stos i8)], IIC_REP_STOS>, REP,
+ Requires<[In32BitMode]>;
+ let Uses = [AX,ECX,EDI] in
+ def REP_STOSW_32 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosw|rep stosw}",
+ [(X86rep_stos i16)], IIC_REP_STOS>, REP, OpSize,
+ Requires<[In32BitMode]>;
+ let Uses = [EAX,ECX,EDI] in
+ def REP_STOSD_32 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosl|rep stosd}",
+ [(X86rep_stos i32)], IIC_REP_STOS>, REP,
+ Requires<[In32BitMode]>;
+}
+
+let Defs = [RCX,RDI], isCodeGenOnly = 1 in {
+ let Uses = [AL,RCX,RDI] in
+ def REP_STOSB_64 : I<0xAA, RawFrm, (outs), (ins), "{rep;stosb|rep stosb}",
+ [(X86rep_stos i8)], IIC_REP_STOS>, REP,
+ Requires<[In64BitMode]>;
+ let Uses = [AX,RCX,RDI] in
+ def REP_STOSW_64 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosw|rep stosw}",
+ [(X86rep_stos i16)], IIC_REP_STOS>, REP, OpSize,
+ Requires<[In64BitMode]>;
+ let Uses = [RAX,RCX,RDI] in
+ def REP_STOSD_64 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosl|rep stosd}",
+ [(X86rep_stos i32)], IIC_REP_STOS>, REP,
+ Requires<[In64BitMode]>;
+
+ let Uses = [RAX,RCX,RDI] in
+ def REP_STOSQ_64 : RI<0xAB, RawFrm, (outs), (ins), "{rep;stosq|rep stosq}",
+ [(X86rep_stos i64)], IIC_REP_STOS>, REP,
+ Requires<[In64BitMode]>;
+}
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Thread Local Storage Instructions
+//
+
+// ELF TLS Support
+// All calls clobber the non-callee saved registers. ESP is marked as
+// a use to prevent stack-pointer assignments that appear immediately
+// before calls from potentially appearing dead.
+let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0,
+ MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
+ XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
+ XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS],
+ Uses = [ESP] in {
+def TLS_addr32 : I<0, Pseudo, (outs), (ins i32mem:$sym),
+ "# TLS_addr32",
+ [(X86tlsaddr tls32addr:$sym)]>,
+ Requires<[In32BitMode]>;
+def TLS_base_addr32 : I<0, Pseudo, (outs), (ins i32mem:$sym),
+ "# TLS_base_addr32",
+ [(X86tlsbaseaddr tls32baseaddr:$sym)]>,
+ Requires<[In32BitMode]>;
+}
+
+// All calls clobber the non-callee saved registers. RSP is marked as
+// a use to prevent stack-pointer assignments that appear immediately
+// before calls from potentially appearing dead.
+let Defs = [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11,
+ FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0, ST1,
+ MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
+ XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
+ XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS],
+ Uses = [RSP] in {
+def TLS_addr64 : I<0, Pseudo, (outs), (ins i64mem:$sym),
+ "# TLS_addr64",
+ [(X86tlsaddr tls64addr:$sym)]>,
+ Requires<[In64BitMode]>;
+def TLS_base_addr64 : I<0, Pseudo, (outs), (ins i64mem:$sym),
+ "# TLS_base_addr64",
+ [(X86tlsbaseaddr tls64baseaddr:$sym)]>,
+ Requires<[In64BitMode]>;
+}
+
+// Darwin TLS Support
+// For i386, the address of the thunk is passed on the stack, on return the
+// address of the variable is in %eax. %ecx is trashed during the function
+// call. All other registers are preserved.
+let Defs = [EAX, ECX, EFLAGS],
+ Uses = [ESP],
+ usesCustomInserter = 1 in
+def TLSCall_32 : I<0, Pseudo, (outs), (ins i32mem:$sym),
+ "# TLSCall_32",
+ [(X86TLSCall addr:$sym)]>,
+ Requires<[In32BitMode]>;
+
+// For x86_64, the address of the thunk is passed in %rdi, on return
+// the address of the variable is in %rax. All other registers are preserved.
+let Defs = [RAX, EFLAGS],
+ Uses = [RSP, RDI],
+ usesCustomInserter = 1 in
+def TLSCall_64 : I<0, Pseudo, (outs), (ins i64mem:$sym),
+ "# TLSCall_64",
+ [(X86TLSCall addr:$sym)]>,
+ Requires<[In64BitMode]>;
+
+
+//===----------------------------------------------------------------------===//
+// Conditional Move Pseudo Instructions
+
+// X86 doesn't have 8-bit conditional moves. Use a customInserter to
+// emit control flow. An alternative to this is to mark i8 SELECT as Promote,
+// however that requires promoting the operands, and can induce additional
+// i8 register pressure.
+let usesCustomInserter = 1, Uses = [EFLAGS] in {
+def CMOV_GR8 : I<0, Pseudo,
+ (outs GR8:$dst), (ins GR8:$src1, GR8:$src2, i8imm:$cond),
+ "#CMOV_GR8 PSEUDO!",
+ [(set GR8:$dst, (X86cmov GR8:$src1, GR8:$src2,
+ imm:$cond, EFLAGS))]>;
+
+let Predicates = [NoCMov] in {
+def CMOV_GR32 : I<0, Pseudo,
+ (outs GR32:$dst), (ins GR32:$src1, GR32:$src2, i8imm:$cond),
+ "#CMOV_GR32* PSEUDO!",
+ [(set GR32:$dst,
+ (X86cmov GR32:$src1, GR32:$src2, imm:$cond, EFLAGS))]>;
+def CMOV_GR16 : I<0, Pseudo,
+ (outs GR16:$dst), (ins GR16:$src1, GR16:$src2, i8imm:$cond),
+ "#CMOV_GR16* PSEUDO!",
+ [(set GR16:$dst,
+ (X86cmov GR16:$src1, GR16:$src2, imm:$cond, EFLAGS))]>;
+} // Predicates = [NoCMov]
+
+// fcmov doesn't handle all possible EFLAGS, provide a fallback if there is no
+// SSE1.
+let Predicates = [FPStackf32] in
+def CMOV_RFP32 : I<0, Pseudo,
+ (outs RFP32:$dst),
+ (ins RFP32:$src1, RFP32:$src2, i8imm:$cond),
+ "#CMOV_RFP32 PSEUDO!",
+ [(set RFP32:$dst,
+ (X86cmov RFP32:$src1, RFP32:$src2, imm:$cond,
+ EFLAGS))]>;
+// fcmov doesn't handle all possible EFLAGS, provide a fallback if there is no
+// SSE2.
+let Predicates = [FPStackf64] in
+def CMOV_RFP64 : I<0, Pseudo,
+ (outs RFP64:$dst),
+ (ins RFP64:$src1, RFP64:$src2, i8imm:$cond),
+ "#CMOV_RFP64 PSEUDO!",
+ [(set RFP64:$dst,
+ (X86cmov RFP64:$src1, RFP64:$src2, imm:$cond,
+ EFLAGS))]>;
+def CMOV_RFP80 : I<0, Pseudo,
+ (outs RFP80:$dst),
+ (ins RFP80:$src1, RFP80:$src2, i8imm:$cond),
+ "#CMOV_RFP80 PSEUDO!",
+ [(set RFP80:$dst,
+ (X86cmov RFP80:$src1, RFP80:$src2, imm:$cond,
+ EFLAGS))]>;
+} // UsesCustomInserter = 1, Uses = [EFLAGS]
+
+
+//===----------------------------------------------------------------------===//
+// Atomic Instruction Pseudo Instructions
+//===----------------------------------------------------------------------===//
+
+// Pseudo atomic instructions
+
+multiclass PSEUDO_ATOMIC_LOAD_BINOP<string mnemonic> {
+ let usesCustomInserter = 1, mayLoad = 1, mayStore = 1 in {
+ let Defs = [EFLAGS, AL] in
+ def NAME#8 : I<0, Pseudo, (outs GR8:$dst),
+ (ins i8mem:$ptr, GR8:$val),
+ !strconcat(mnemonic, "8 PSEUDO!"), []>;
+ let Defs = [EFLAGS, AX] in
+ def NAME#16 : I<0, Pseudo,(outs GR16:$dst),
+ (ins i16mem:$ptr, GR16:$val),
+ !strconcat(mnemonic, "16 PSEUDO!"), []>;
+ let Defs = [EFLAGS, EAX] in
+ def NAME#32 : I<0, Pseudo, (outs GR32:$dst),
+ (ins i32mem:$ptr, GR32:$val),
+ !strconcat(mnemonic, "32 PSEUDO!"), []>;
+ let Defs = [EFLAGS, RAX] in
+ def NAME#64 : I<0, Pseudo, (outs GR64:$dst),
+ (ins i64mem:$ptr, GR64:$val),
+ !strconcat(mnemonic, "64 PSEUDO!"), []>;
+ }
+}
+
+multiclass PSEUDO_ATOMIC_LOAD_BINOP_PATS<string name, string frag> {
+ def : Pat<(!cast<PatFrag>(frag # "_8") addr:$ptr, GR8:$val),
+ (!cast<Instruction>(name # "8") addr:$ptr, GR8:$val)>;
+ def : Pat<(!cast<PatFrag>(frag # "_16") addr:$ptr, GR16:$val),
+ (!cast<Instruction>(name # "16") addr:$ptr, GR16:$val)>;
+ def : Pat<(!cast<PatFrag>(frag # "_32") addr:$ptr, GR32:$val),
+ (!cast<Instruction>(name # "32") addr:$ptr, GR32:$val)>;
+ def : Pat<(!cast<PatFrag>(frag # "_64") addr:$ptr, GR64:$val),
+ (!cast<Instruction>(name # "64") addr:$ptr, GR64:$val)>;
+}
+
+// Atomic exchange, and, or, xor
+defm ATOMAND : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMAND">;
+defm ATOMOR : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMOR">;
+defm ATOMXOR : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMXOR">;
+defm ATOMNAND : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMNAND">;
+defm ATOMMAX : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMMAX">;
+defm ATOMMIN : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMMIN">;
+defm ATOMUMAX : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMUMAX">;
+defm ATOMUMIN : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMUMIN">;
+
+defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMAND", "atomic_load_and">;
+defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMOR", "atomic_load_or">;
+defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMXOR", "atomic_load_xor">;
+defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMNAND", "atomic_load_nand">;
+defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMMAX", "atomic_load_max">;
+defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMMIN", "atomic_load_min">;
+defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMUMAX", "atomic_load_umax">;
+defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMUMIN", "atomic_load_umin">;
+
+multiclass PSEUDO_ATOMIC_LOAD_BINOP6432<string mnemonic> {
+ let usesCustomInserter = 1, Defs = [EFLAGS, EAX, EDX],
+ mayLoad = 1, mayStore = 1, hasSideEffects = 0 in
+ def NAME#6432 : I<0, Pseudo, (outs GR32:$dst1, GR32:$dst2),
+ (ins i64mem:$ptr, GR32:$val1, GR32:$val2),
+ !strconcat(mnemonic, "6432 PSEUDO!"), []>;
+}
+
+defm ATOMAND : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMAND">;
+defm ATOMOR : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMOR">;
+defm ATOMXOR : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMXOR">;
+defm ATOMNAND : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMNAND">;
+defm ATOMADD : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMADD">;
+defm ATOMSUB : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMSUB">;
+defm ATOMMAX : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMMAX">;
+defm ATOMMIN : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMMIN">;
+defm ATOMUMAX : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMUMAX">;
+defm ATOMUMIN : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMUMIN">;
+defm ATOMSWAP : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMSWAP">;
+
+//===----------------------------------------------------------------------===//
+// Normal-Instructions-With-Lock-Prefix Pseudo Instructions
+//===----------------------------------------------------------------------===//
+
+// FIXME: Use normal instructions and add lock prefix dynamically.
+
+// Memory barriers
+
+// TODO: Get this to fold the constant into the instruction.
+let isCodeGenOnly = 1, Defs = [EFLAGS] in
+def OR32mrLocked : I<0x09, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$zero),
+ "or{l}\t{$zero, $dst|$dst, $zero}",
+ [], IIC_ALU_MEM>, Requires<[In32BitMode]>, LOCK,
+ Sched<[WriteALULd, WriteRMW]>;
+
+let hasSideEffects = 1 in
+def Int_MemBarrier : I<0, Pseudo, (outs), (ins),
+ "#MEMBARRIER",
+ [(X86MemBarrier)]>, Sched<[WriteLoad]>;
+
+// RegOpc corresponds to the mr version of the instruction
+// ImmOpc corresponds to the mi version of the instruction
+// ImmOpc8 corresponds to the mi8 version of the instruction
+// ImmMod corresponds to the instruction format of the mi and mi8 versions
+multiclass LOCK_ArithBinOp<bits<8> RegOpc, bits<8> ImmOpc, bits<8> ImmOpc8,
+ Format ImmMod, string mnemonic> {
+let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1,
+ SchedRW = [WriteALULd, WriteRMW] in {
+
+def NAME#8mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
+ RegOpc{3}, RegOpc{2}, RegOpc{1}, 0 },
+ MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src2),
+ !strconcat(mnemonic, "{b}\t",
+ "{$src2, $dst|$dst, $src2}"),
+ [], IIC_ALU_NONMEM>, LOCK;
+def NAME#16mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
+ RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 },
+ MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
+ !strconcat(mnemonic, "{w}\t",
+ "{$src2, $dst|$dst, $src2}"),
+ [], IIC_ALU_NONMEM>, OpSize, LOCK;
+def NAME#32mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
+ RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 },
+ MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
+ !strconcat(mnemonic, "{l}\t",
+ "{$src2, $dst|$dst, $src2}"),
+ [], IIC_ALU_NONMEM>, LOCK;
+def NAME#64mr : RI<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
+ RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 },
+ MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
+ !strconcat(mnemonic, "{q}\t",
+ "{$src2, $dst|$dst, $src2}"),
+ [], IIC_ALU_NONMEM>, LOCK;
+
+def NAME#8mi : Ii8<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
+ ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 0 },
+ ImmMod, (outs), (ins i8mem :$dst, i8imm :$src2),
+ !strconcat(mnemonic, "{b}\t",
+ "{$src2, $dst|$dst, $src2}"),
+ [], IIC_ALU_MEM>, LOCK;
+
+def NAME#16mi : Ii16<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
+ ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 },
+ ImmMod, (outs), (ins i16mem :$dst, i16imm :$src2),
+ !strconcat(mnemonic, "{w}\t",
+ "{$src2, $dst|$dst, $src2}"),
+ [], IIC_ALU_MEM>, OpSize, LOCK;
+
+def NAME#32mi : Ii32<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
+ ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 },
+ ImmMod, (outs), (ins i32mem :$dst, i32imm :$src2),
+ !strconcat(mnemonic, "{l}\t",
+ "{$src2, $dst|$dst, $src2}"),
+ [], IIC_ALU_MEM>, LOCK;
+
+def NAME#64mi32 : RIi32<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
+ ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 },
+ ImmMod, (outs), (ins i64mem :$dst, i64i32imm :$src2),
+ !strconcat(mnemonic, "{q}\t",
+ "{$src2, $dst|$dst, $src2}"),
+ [], IIC_ALU_MEM>, LOCK;
+
+def NAME#16mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
+ ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
+ ImmMod, (outs), (ins i16mem :$dst, i16i8imm :$src2),
+ !strconcat(mnemonic, "{w}\t",
+ "{$src2, $dst|$dst, $src2}"),
+ [], IIC_ALU_MEM>, OpSize, LOCK;
+def NAME#32mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
+ ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
+ ImmMod, (outs), (ins i32mem :$dst, i32i8imm :$src2),
+ !strconcat(mnemonic, "{l}\t",
+ "{$src2, $dst|$dst, $src2}"),
+ [], IIC_ALU_MEM>, LOCK;
+def NAME#64mi8 : RIi8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
+ ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
+ ImmMod, (outs), (ins i64mem :$dst, i64i8imm :$src2),
+ !strconcat(mnemonic, "{q}\t",
+ "{$src2, $dst|$dst, $src2}"),
+ [], IIC_ALU_MEM>, LOCK;
+
+}
+
+}
+
+defm LOCK_ADD : LOCK_ArithBinOp<0x00, 0x80, 0x83, MRM0m, "add">;
+defm LOCK_SUB : LOCK_ArithBinOp<0x28, 0x80, 0x83, MRM5m, "sub">;
+defm LOCK_OR : LOCK_ArithBinOp<0x08, 0x80, 0x83, MRM1m, "or">;
+defm LOCK_AND : LOCK_ArithBinOp<0x20, 0x80, 0x83, MRM4m, "and">;
+defm LOCK_XOR : LOCK_ArithBinOp<0x30, 0x80, 0x83, MRM6m, "xor">;
+
+// Optimized codegen when the non-memory output is not used.
+multiclass LOCK_ArithUnOp<bits<8> Opc8, bits<8> Opc, Format Form,
+ string mnemonic> {
+let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1,
+ SchedRW = [WriteALULd, WriteRMW] in {
+
+def NAME#8m : I<Opc8, Form, (outs), (ins i8mem :$dst),
+ !strconcat(mnemonic, "{b}\t$dst"),
+ [], IIC_UNARY_MEM>, LOCK;
+def NAME#16m : I<Opc, Form, (outs), (ins i16mem:$dst),
+ !strconcat(mnemonic, "{w}\t$dst"),
+ [], IIC_UNARY_MEM>, OpSize, LOCK;
+def NAME#32m : I<Opc, Form, (outs), (ins i32mem:$dst),
+ !strconcat(mnemonic, "{l}\t$dst"),
+ [], IIC_UNARY_MEM>, LOCK;
+def NAME#64m : RI<Opc, Form, (outs), (ins i64mem:$dst),
+ !strconcat(mnemonic, "{q}\t$dst"),
+ [], IIC_UNARY_MEM>, LOCK;
+}
+}
+
+defm LOCK_INC : LOCK_ArithUnOp<0xFE, 0xFF, MRM0m, "inc">;
+defm LOCK_DEC : LOCK_ArithUnOp<0xFE, 0xFF, MRM1m, "dec">;
+
+// Atomic compare and swap.
+multiclass LCMPXCHG_UnOp<bits<8> Opc, Format Form, string mnemonic,
+ SDPatternOperator frag, X86MemOperand x86memop,
+ InstrItinClass itin> {
+let isCodeGenOnly = 1 in {
+ def NAME : I<Opc, Form, (outs), (ins x86memop:$ptr),
+ !strconcat(mnemonic, "\t$ptr"),
+ [(frag addr:$ptr)], itin>, TB, LOCK;
+}
+}
+
+multiclass LCMPXCHG_BinOp<bits<8> Opc8, bits<8> Opc, Format Form,
+ string mnemonic, SDPatternOperator frag,
+ InstrItinClass itin8, InstrItinClass itin> {
+let isCodeGenOnly = 1, SchedRW = [WriteALULd, WriteRMW] in {
+ let Defs = [AL, EFLAGS], Uses = [AL] in
+ def NAME#8 : I<Opc8, Form, (outs), (ins i8mem:$ptr, GR8:$swap),
+ !strconcat(mnemonic, "{b}\t{$swap, $ptr|$ptr, $swap}"),
+ [(frag addr:$ptr, GR8:$swap, 1)], itin8>, TB, LOCK;
+ let Defs = [AX, EFLAGS], Uses = [AX] in
+ def NAME#16 : I<Opc, Form, (outs), (ins i16mem:$ptr, GR16:$swap),
+ !strconcat(mnemonic, "{w}\t{$swap, $ptr|$ptr, $swap}"),
+ [(frag addr:$ptr, GR16:$swap, 2)], itin>, TB, OpSize, LOCK;
+ let Defs = [EAX, EFLAGS], Uses = [EAX] in
+ def NAME#32 : I<Opc, Form, (outs), (ins i32mem:$ptr, GR32:$swap),
+ !strconcat(mnemonic, "{l}\t{$swap, $ptr|$ptr, $swap}"),
+ [(frag addr:$ptr, GR32:$swap, 4)], itin>, TB, LOCK;
+ let Defs = [RAX, EFLAGS], Uses = [RAX] in
+ def NAME#64 : RI<Opc, Form, (outs), (ins i64mem:$ptr, GR64:$swap),
+ !strconcat(mnemonic, "{q}\t{$swap, $ptr|$ptr, $swap}"),
+ [(frag addr:$ptr, GR64:$swap, 8)], itin>, TB, LOCK;
+}
+}
+
+let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX],
+ SchedRW = [WriteALULd, WriteRMW] in {
+defm LCMPXCHG8B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg8b",
+ X86cas8, i64mem,
+ IIC_CMPX_LOCK_8B>;
+}
+
+let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX],
+ Predicates = [HasCmpxchg16b], SchedRW = [WriteALULd, WriteRMW] in {
+defm LCMPXCHG16B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg16b",
+ X86cas16, i128mem,
+ IIC_CMPX_LOCK_16B>, REX_W;
+}
+
+defm LCMPXCHG : LCMPXCHG_BinOp<0xB0, 0xB1, MRMDestMem, "cmpxchg",
+ X86cas, IIC_CMPX_LOCK_8, IIC_CMPX_LOCK>;
+
+// Atomic exchange and add
+multiclass ATOMIC_LOAD_BINOP<bits<8> opc8, bits<8> opc, string mnemonic,
+ string frag,
+ InstrItinClass itin8, InstrItinClass itin> {
+ let Constraints = "$val = $dst", Defs = [EFLAGS], isCodeGenOnly = 1,
+ SchedRW = [WriteALULd, WriteRMW] in {
+ def NAME#8 : I<opc8, MRMSrcMem, (outs GR8:$dst),
+ (ins GR8:$val, i8mem:$ptr),
+ !strconcat(mnemonic, "{b}\t{$val, $ptr|$ptr, $val}"),
+ [(set GR8:$dst,
+ (!cast<PatFrag>(frag # "_8") addr:$ptr, GR8:$val))],
+ itin8>;
+ def NAME#16 : I<opc, MRMSrcMem, (outs GR16:$dst),
+ (ins GR16:$val, i16mem:$ptr),
+ !strconcat(mnemonic, "{w}\t{$val, $ptr|$ptr, $val}"),
+ [(set
+ GR16:$dst,
+ (!cast<PatFrag>(frag # "_16") addr:$ptr, GR16:$val))],
+ itin>, OpSize;
+ def NAME#32 : I<opc, MRMSrcMem, (outs GR32:$dst),
+ (ins GR32:$val, i32mem:$ptr),
+ !strconcat(mnemonic, "{l}\t{$val, $ptr|$ptr, $val}"),
+ [(set
+ GR32:$dst,
+ (!cast<PatFrag>(frag # "_32") addr:$ptr, GR32:$val))],
+ itin>;
+ def NAME#64 : RI<opc, MRMSrcMem, (outs GR64:$dst),
+ (ins GR64:$val, i64mem:$ptr),
+ !strconcat(mnemonic, "{q}\t{$val, $ptr|$ptr, $val}"),
+ [(set
+ GR64:$dst,
+ (!cast<PatFrag>(frag # "_64") addr:$ptr, GR64:$val))],
+ itin>;
+ }
+}
+
+defm LXADD : ATOMIC_LOAD_BINOP<0xc0, 0xc1, "xadd", "atomic_load_add",
+ IIC_XADD_LOCK_MEM8, IIC_XADD_LOCK_MEM>,
+ TB, LOCK;
+
+def ACQUIRE_MOV8rm : I<0, Pseudo, (outs GR8 :$dst), (ins i8mem :$src),
+ "#ACQUIRE_MOV PSEUDO!",
+ [(set GR8:$dst, (atomic_load_8 addr:$src))]>;
+def ACQUIRE_MOV16rm : I<0, Pseudo, (outs GR16:$dst), (ins i16mem:$src),
+ "#ACQUIRE_MOV PSEUDO!",
+ [(set GR16:$dst, (atomic_load_16 addr:$src))]>;
+def ACQUIRE_MOV32rm : I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$src),
+ "#ACQUIRE_MOV PSEUDO!",
+ [(set GR32:$dst, (atomic_load_32 addr:$src))]>;
+def ACQUIRE_MOV64rm : I<0, Pseudo, (outs GR64:$dst), (ins i64mem:$src),
+ "#ACQUIRE_MOV PSEUDO!",
+ [(set GR64:$dst, (atomic_load_64 addr:$src))]>;
+
+def RELEASE_MOV8mr : I<0, Pseudo, (outs), (ins i8mem :$dst, GR8 :$src),
+ "#RELEASE_MOV PSEUDO!",
+ [(atomic_store_8 addr:$dst, GR8 :$src)]>;
+def RELEASE_MOV16mr : I<0, Pseudo, (outs), (ins i16mem:$dst, GR16:$src),
+ "#RELEASE_MOV PSEUDO!",
+ [(atomic_store_16 addr:$dst, GR16:$src)]>;
+def RELEASE_MOV32mr : I<0, Pseudo, (outs), (ins i32mem:$dst, GR32:$src),
+ "#RELEASE_MOV PSEUDO!",
+ [(atomic_store_32 addr:$dst, GR32:$src)]>;
+def RELEASE_MOV64mr : I<0, Pseudo, (outs), (ins i64mem:$dst, GR64:$src),
+ "#RELEASE_MOV PSEUDO!",
+ [(atomic_store_64 addr:$dst, GR64:$src)]>;
+
+//===----------------------------------------------------------------------===//
+// Conditional Move Pseudo Instructions.
+//===----------------------------------------------------------------------===//
+
+
+// CMOV* - Used to implement the SSE SELECT DAG operation. Expanded after
+// instruction selection into a branch sequence.
+let Uses = [EFLAGS], usesCustomInserter = 1 in {
+ def CMOV_FR32 : I<0, Pseudo,
+ (outs FR32:$dst), (ins FR32:$t, FR32:$f, i8imm:$cond),
+ "#CMOV_FR32 PSEUDO!",
+ [(set FR32:$dst, (X86cmov FR32:$t, FR32:$f, imm:$cond,
+ EFLAGS))]>;
+ def CMOV_FR64 : I<0, Pseudo,
+ (outs FR64:$dst), (ins FR64:$t, FR64:$f, i8imm:$cond),
+ "#CMOV_FR64 PSEUDO!",
+ [(set FR64:$dst, (X86cmov FR64:$t, FR64:$f, imm:$cond,
+ EFLAGS))]>;
+ def CMOV_V4F32 : I<0, Pseudo,
+ (outs VR128:$dst), (ins VR128:$t, VR128:$f, i8imm:$cond),
+ "#CMOV_V4F32 PSEUDO!",
+ [(set VR128:$dst,
+ (v4f32 (X86cmov VR128:$t, VR128:$f, imm:$cond,
+ EFLAGS)))]>;
+ def CMOV_V2F64 : I<0, Pseudo,
+ (outs VR128:$dst), (ins VR128:$t, VR128:$f, i8imm:$cond),
+ "#CMOV_V2F64 PSEUDO!",
+ [(set VR128:$dst,
+ (v2f64 (X86cmov VR128:$t, VR128:$f, imm:$cond,
+ EFLAGS)))]>;
+ def CMOV_V2I64 : I<0, Pseudo,
+ (outs VR128:$dst), (ins VR128:$t, VR128:$f, i8imm:$cond),
+ "#CMOV_V2I64 PSEUDO!",
+ [(set VR128:$dst,
+ (v2i64 (X86cmov VR128:$t, VR128:$f, imm:$cond,
+ EFLAGS)))]>;
+ def CMOV_V8F32 : I<0, Pseudo,
+ (outs VR256:$dst), (ins VR256:$t, VR256:$f, i8imm:$cond),
+ "#CMOV_V8F32 PSEUDO!",
+ [(set VR256:$dst,
+ (v8f32 (X86cmov VR256:$t, VR256:$f, imm:$cond,
+ EFLAGS)))]>;
+ def CMOV_V4F64 : I<0, Pseudo,
+ (outs VR256:$dst), (ins VR256:$t, VR256:$f, i8imm:$cond),
+ "#CMOV_V4F64 PSEUDO!",
+ [(set VR256:$dst,
+ (v4f64 (X86cmov VR256:$t, VR256:$f, imm:$cond,
+ EFLAGS)))]>;
+ def CMOV_V4I64 : I<0, Pseudo,
+ (outs VR256:$dst), (ins VR256:$t, VR256:$f, i8imm:$cond),
+ "#CMOV_V4I64 PSEUDO!",
+ [(set VR256:$dst,
+ (v4i64 (X86cmov VR256:$t, VR256:$f, imm:$cond,
+ EFLAGS)))]>;
+}
+
+
+//===----------------------------------------------------------------------===//
+// DAG Pattern Matching Rules
+//===----------------------------------------------------------------------===//
+
+// ConstantPool GlobalAddress, ExternalSymbol, and JumpTable
+def : Pat<(i32 (X86Wrapper tconstpool :$dst)), (MOV32ri tconstpool :$dst)>;
+def : Pat<(i32 (X86Wrapper tjumptable :$dst)), (MOV32ri tjumptable :$dst)>;
+def : Pat<(i32 (X86Wrapper tglobaltlsaddr:$dst)),(MOV32ri tglobaltlsaddr:$dst)>;
+def : Pat<(i32 (X86Wrapper tglobaladdr :$dst)), (MOV32ri tglobaladdr :$dst)>;
+def : Pat<(i32 (X86Wrapper texternalsym:$dst)), (MOV32ri texternalsym:$dst)>;
+def : Pat<(i32 (X86Wrapper tblockaddress:$dst)), (MOV32ri tblockaddress:$dst)>;
+
+def : Pat<(add GR32:$src1, (X86Wrapper tconstpool:$src2)),
+ (ADD32ri GR32:$src1, tconstpool:$src2)>;
+def : Pat<(add GR32:$src1, (X86Wrapper tjumptable:$src2)),
+ (ADD32ri GR32:$src1, tjumptable:$src2)>;
+def : Pat<(add GR32:$src1, (X86Wrapper tglobaladdr :$src2)),
+ (ADD32ri GR32:$src1, tglobaladdr:$src2)>;
+def : Pat<(add GR32:$src1, (X86Wrapper texternalsym:$src2)),
+ (ADD32ri GR32:$src1, texternalsym:$src2)>;
+def : Pat<(add GR32:$src1, (X86Wrapper tblockaddress:$src2)),
+ (ADD32ri GR32:$src1, tblockaddress:$src2)>;
+
+def : Pat<(store (i32 (X86Wrapper tglobaladdr:$src)), addr:$dst),
+ (MOV32mi addr:$dst, tglobaladdr:$src)>;
+def : Pat<(store (i32 (X86Wrapper texternalsym:$src)), addr:$dst),
+ (MOV32mi addr:$dst, texternalsym:$src)>;
+def : Pat<(store (i32 (X86Wrapper tblockaddress:$src)), addr:$dst),
+ (MOV32mi addr:$dst, tblockaddress:$src)>;
+
+
+
+// ConstantPool GlobalAddress, ExternalSymbol, and JumpTable when not in small
+// code model mode, should use 'movabs'. FIXME: This is really a hack, the
+// 'movabs' predicate should handle this sort of thing.
+def : Pat<(i64 (X86Wrapper tconstpool :$dst)),
+ (MOV64ri tconstpool :$dst)>, Requires<[FarData]>;
+def : Pat<(i64 (X86Wrapper tjumptable :$dst)),
+ (MOV64ri tjumptable :$dst)>, Requires<[FarData]>;
+def : Pat<(i64 (X86Wrapper tglobaladdr :$dst)),
+ (MOV64ri tglobaladdr :$dst)>, Requires<[FarData]>;
+def : Pat<(i64 (X86Wrapper texternalsym:$dst)),
+ (MOV64ri texternalsym:$dst)>, Requires<[FarData]>;
+def : Pat<(i64 (X86Wrapper tblockaddress:$dst)),
+ (MOV64ri tblockaddress:$dst)>, Requires<[FarData]>;
+
+// In static codegen with small code model, we can get the address of a label
+// into a register with 'movl'. FIXME: This is a hack, the 'imm' predicate of
+// the MOV64ri64i32 should accept these.
+def : Pat<(i64 (X86Wrapper tconstpool :$dst)),
+ (MOV64ri64i32 tconstpool :$dst)>, Requires<[SmallCode]>;
+def : Pat<(i64 (X86Wrapper tjumptable :$dst)),
+ (MOV64ri64i32 tjumptable :$dst)>, Requires<[SmallCode]>;
+def : Pat<(i64 (X86Wrapper tglobaladdr :$dst)),
+ (MOV64ri64i32 tglobaladdr :$dst)>, Requires<[SmallCode]>;
+def : Pat<(i64 (X86Wrapper texternalsym:$dst)),
+ (MOV64ri64i32 texternalsym:$dst)>, Requires<[SmallCode]>;
+def : Pat<(i64 (X86Wrapper tblockaddress:$dst)),
+ (MOV64ri64i32 tblockaddress:$dst)>, Requires<[SmallCode]>;
+
+// In kernel code model, we can get the address of a label
+// into a register with 'movq'. FIXME: This is a hack, the 'imm' predicate of
+// the MOV64ri32 should accept these.
+def : Pat<(i64 (X86Wrapper tconstpool :$dst)),
+ (MOV64ri32 tconstpool :$dst)>, Requires<[KernelCode]>;
+def : Pat<(i64 (X86Wrapper tjumptable :$dst)),
+ (MOV64ri32 tjumptable :$dst)>, Requires<[KernelCode]>;
+def : Pat<(i64 (X86Wrapper tglobaladdr :$dst)),
+ (MOV64ri32 tglobaladdr :$dst)>, Requires<[KernelCode]>;
+def : Pat<(i64 (X86Wrapper texternalsym:$dst)),
+ (MOV64ri32 texternalsym:$dst)>, Requires<[KernelCode]>;
+def : Pat<(i64 (X86Wrapper tblockaddress:$dst)),
+ (MOV64ri32 tblockaddress:$dst)>, Requires<[KernelCode]>;
+
+// If we have small model and -static mode, it is safe to store global addresses
+// directly as immediates. FIXME: This is really a hack, the 'imm' predicate
+// for MOV64mi32 should handle this sort of thing.
+def : Pat<(store (i64 (X86Wrapper tconstpool:$src)), addr:$dst),
+ (MOV64mi32 addr:$dst, tconstpool:$src)>,
+ Requires<[NearData, IsStatic]>;
+def : Pat<(store (i64 (X86Wrapper tjumptable:$src)), addr:$dst),
+ (MOV64mi32 addr:$dst, tjumptable:$src)>,
+ Requires<[NearData, IsStatic]>;
+def : Pat<(store (i64 (X86Wrapper tglobaladdr:$src)), addr:$dst),
+ (MOV64mi32 addr:$dst, tglobaladdr:$src)>,
+ Requires<[NearData, IsStatic]>;
+def : Pat<(store (i64 (X86Wrapper texternalsym:$src)), addr:$dst),
+ (MOV64mi32 addr:$dst, texternalsym:$src)>,
+ Requires<[NearData, IsStatic]>;
+def : Pat<(store (i64 (X86Wrapper tblockaddress:$src)), addr:$dst),
+ (MOV64mi32 addr:$dst, tblockaddress:$src)>,
+ Requires<[NearData, IsStatic]>;
+
+
+
+// Calls
+
+// tls has some funny stuff here...
+// This corresponds to movabs $foo@tpoff, %rax
+def : Pat<(i64 (X86Wrapper tglobaltlsaddr :$dst)),
+ (MOV64ri tglobaltlsaddr :$dst)>;
+// This corresponds to add $foo@tpoff, %rax
+def : Pat<(add GR64:$src1, (X86Wrapper tglobaltlsaddr :$dst)),
+ (ADD64ri32 GR64:$src1, tglobaltlsaddr :$dst)>;
+
+
+// Direct PC relative function call for small code model. 32-bit displacement
+// sign extended to 64-bit.
+def : Pat<(X86call (i64 tglobaladdr:$dst)),
+ (CALL64pcrel32 tglobaladdr:$dst)>;
+def : Pat<(X86call (i64 texternalsym:$dst)),
+ (CALL64pcrel32 texternalsym:$dst)>;
+
+// Tailcall stuff. The TCRETURN instructions execute after the epilog, so they
+// can never use callee-saved registers. That is the purpose of the GR64_TC
+// register classes.
+//
+// The only volatile register that is never used by the calling convention is
+// %r11. This happens when calling a vararg function with 6 arguments.
+//
+// Match an X86tcret that uses less than 7 volatile registers.
+def X86tcret_6regs : PatFrag<(ops node:$ptr, node:$off),
+ (X86tcret node:$ptr, node:$off), [{
+ // X86tcret args: (*chain, ptr, imm, regs..., glue)
+ unsigned NumRegs = 0;
+ for (unsigned i = 3, e = N->getNumOperands(); i != e; ++i)
+ if (isa<RegisterSDNode>(N->getOperand(i)) && ++NumRegs > 6)
+ return false;
+ return true;
+}]>;
+
+def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off),
+ (TCRETURNri ptr_rc_tailcall:$dst, imm:$off)>,
+ Requires<[In32BitMode]>;
+
+// FIXME: This is disabled for 32-bit PIC mode because the global base
+// register which is part of the address mode may be assigned a
+// callee-saved register.
+def : Pat<(X86tcret (load addr:$dst), imm:$off),
+ (TCRETURNmi addr:$dst, imm:$off)>,
+ Requires<[In32BitMode, IsNotPIC]>;
+
+def : Pat<(X86tcret (i32 tglobaladdr:$dst), imm:$off),
+ (TCRETURNdi texternalsym:$dst, imm:$off)>,
+ Requires<[In32BitMode]>;
+
+def : Pat<(X86tcret (i32 texternalsym:$dst), imm:$off),
+ (TCRETURNdi texternalsym:$dst, imm:$off)>,
+ Requires<[In32BitMode]>;
+
+def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off),
+ (TCRETURNri64 ptr_rc_tailcall:$dst, imm:$off)>,
+ Requires<[In64BitMode]>;
+
+// Don't fold loads into X86tcret requiring more than 6 regs.
+// There wouldn't be enough scratch registers for base+index.
+def : Pat<(X86tcret_6regs (load addr:$dst), imm:$off),
+ (TCRETURNmi64 addr:$dst, imm:$off)>,
+ Requires<[In64BitMode]>;
+
+def : Pat<(X86tcret (i64 tglobaladdr:$dst), imm:$off),
+ (TCRETURNdi64 tglobaladdr:$dst, imm:$off)>,
+ Requires<[In64BitMode]>;
+
+def : Pat<(X86tcret (i64 texternalsym:$dst), imm:$off),
+ (TCRETURNdi64 texternalsym:$dst, imm:$off)>,
+ Requires<[In64BitMode]>;
+
+// Normal calls, with various flavors of addresses.
+def : Pat<(X86call (i32 tglobaladdr:$dst)),
+ (CALLpcrel32 tglobaladdr:$dst)>;
+def : Pat<(X86call (i32 texternalsym:$dst)),
+ (CALLpcrel32 texternalsym:$dst)>;
+def : Pat<(X86call (i32 imm:$dst)),
+ (CALLpcrel32 imm:$dst)>, Requires<[CallImmAddr]>;
+
+// Comparisons.
+
+// TEST R,R is smaller than CMP R,0
+def : Pat<(X86cmp GR8:$src1, 0),
+ (TEST8rr GR8:$src1, GR8:$src1)>;
+def : Pat<(X86cmp GR16:$src1, 0),
+ (TEST16rr GR16:$src1, GR16:$src1)>;
+def : Pat<(X86cmp GR32:$src1, 0),
+ (TEST32rr GR32:$src1, GR32:$src1)>;
+def : Pat<(X86cmp GR64:$src1, 0),
+ (TEST64rr GR64:$src1, GR64:$src1)>;
+
+// Conditional moves with folded loads with operands swapped and conditions
+// inverted.
+multiclass CMOVmr<PatLeaf InvertedCond, Instruction Inst16, Instruction Inst32,
+ Instruction Inst64> {
+ let Predicates = [HasCMov] in {
+ def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, InvertedCond, EFLAGS),
+ (Inst16 GR16:$src2, addr:$src1)>;
+ def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, InvertedCond, EFLAGS),
+ (Inst32 GR32:$src2, addr:$src1)>;
+ def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, InvertedCond, EFLAGS),
+ (Inst64 GR64:$src2, addr:$src1)>;
+ }
+}
+
+defm : CMOVmr<X86_COND_B , CMOVAE16rm, CMOVAE32rm, CMOVAE64rm>;
+defm : CMOVmr<X86_COND_AE, CMOVB16rm , CMOVB32rm , CMOVB64rm>;
+defm : CMOVmr<X86_COND_E , CMOVNE16rm, CMOVNE32rm, CMOVNE64rm>;
+defm : CMOVmr<X86_COND_NE, CMOVE16rm , CMOVE32rm , CMOVE64rm>;
+defm : CMOVmr<X86_COND_BE, CMOVA16rm , CMOVA32rm , CMOVA64rm>;
+defm : CMOVmr<X86_COND_A , CMOVBE16rm, CMOVBE32rm, CMOVBE64rm>;
+defm : CMOVmr<X86_COND_L , CMOVGE16rm, CMOVGE32rm, CMOVGE64rm>;
+defm : CMOVmr<X86_COND_GE, CMOVL16rm , CMOVL32rm , CMOVL64rm>;
+defm : CMOVmr<X86_COND_LE, CMOVG16rm , CMOVG32rm , CMOVG64rm>;
+defm : CMOVmr<X86_COND_G , CMOVLE16rm, CMOVLE32rm, CMOVLE64rm>;
+defm : CMOVmr<X86_COND_P , CMOVNP16rm, CMOVNP32rm, CMOVNP64rm>;
+defm : CMOVmr<X86_COND_NP, CMOVP16rm , CMOVP32rm , CMOVP64rm>;
+defm : CMOVmr<X86_COND_S , CMOVNS16rm, CMOVNS32rm, CMOVNS64rm>;
+defm : CMOVmr<X86_COND_NS, CMOVS16rm , CMOVS32rm , CMOVS64rm>;
+defm : CMOVmr<X86_COND_O , CMOVNO16rm, CMOVNO32rm, CMOVNO64rm>;
+defm : CMOVmr<X86_COND_NO, CMOVO16rm , CMOVO32rm , CMOVO64rm>;
+
+// zextload bool -> zextload byte
+def : Pat<(zextloadi8i1 addr:$src), (MOV8rm addr:$src)>;
+def : Pat<(zextloadi16i1 addr:$src), (MOVZX16rm8 addr:$src)>;
+def : Pat<(zextloadi32i1 addr:$src), (MOVZX32rm8 addr:$src)>;
+def : Pat<(zextloadi64i1 addr:$src), (MOVZX64rm8 addr:$src)>;
+
+// extload bool -> extload byte
+// When extloading from 16-bit and smaller memory locations into 64-bit
+// registers, use zero-extending loads so that the entire 64-bit register is
+// defined, avoiding partial-register updates.
+
+def : Pat<(extloadi8i1 addr:$src), (MOV8rm addr:$src)>;
+def : Pat<(extloadi16i1 addr:$src), (MOVZX16rm8 addr:$src)>;
+def : Pat<(extloadi32i1 addr:$src), (MOVZX32rm8 addr:$src)>;
+def : Pat<(extloadi16i8 addr:$src), (MOVZX16rm8 addr:$src)>;
+def : Pat<(extloadi32i8 addr:$src), (MOVZX32rm8 addr:$src)>;
+def : Pat<(extloadi32i16 addr:$src), (MOVZX32rm16 addr:$src)>;
+
+def : Pat<(extloadi64i1 addr:$src), (MOVZX64rm8 addr:$src)>;
+def : Pat<(extloadi64i8 addr:$src), (MOVZX64rm8 addr:$src)>;
+def : Pat<(extloadi64i16 addr:$src), (MOVZX64rm16 addr:$src)>;
+// For other extloads, use subregs, since the high contents of the register are
+// defined after an extload.
+def : Pat<(extloadi64i32 addr:$src),
+ (SUBREG_TO_REG (i64 0), (MOV32rm addr:$src),
+ sub_32bit)>;
+
+// anyext. Define these to do an explicit zero-extend to
+// avoid partial-register updates.
+def : Pat<(i16 (anyext GR8 :$src)), (EXTRACT_SUBREG
+ (MOVZX32rr8 GR8 :$src), sub_16bit)>;
+def : Pat<(i32 (anyext GR8 :$src)), (MOVZX32rr8 GR8 :$src)>;
+
+// Except for i16 -> i32 since isel expect i16 ops to be promoted to i32.
+def : Pat<(i32 (anyext GR16:$src)),
+ (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR16:$src, sub_16bit)>;
+
+def : Pat<(i64 (anyext GR8 :$src)), (MOVZX64rr8 GR8 :$src)>;
+def : Pat<(i64 (anyext GR16:$src)), (MOVZX64rr16 GR16 :$src)>;
+def : Pat<(i64 (anyext GR32:$src)),
+ (SUBREG_TO_REG (i64 0), GR32:$src, sub_32bit)>;
+
+
+// Any instruction that defines a 32-bit result leaves the high half of the
+// register. Truncate can be lowered to EXTRACT_SUBREG. CopyFromReg may
+// be copying from a truncate. And x86's cmov doesn't do anything if the
+// condition is false. But any other 32-bit operation will zero-extend
+// up to 64 bits.
+def def32 : PatLeaf<(i32 GR32:$src), [{
+ return N->getOpcode() != ISD::TRUNCATE &&
+ N->getOpcode() != TargetOpcode::EXTRACT_SUBREG &&
+ N->getOpcode() != ISD::CopyFromReg &&
+ N->getOpcode() != X86ISD::CMOV;
+}]>;
+
+// In the case of a 32-bit def that is known to implicitly zero-extend,
+// we can use a SUBREG_TO_REG.
+def : Pat<(i64 (zext def32:$src)),
+ (SUBREG_TO_REG (i64 0), GR32:$src, sub_32bit)>;
+
+//===----------------------------------------------------------------------===//
+// Pattern match OR as ADD
+//===----------------------------------------------------------------------===//
+
+// If safe, we prefer to pattern match OR as ADD at isel time. ADD can be
+// 3-addressified into an LEA instruction to avoid copies. However, we also
+// want to finally emit these instructions as an or at the end of the code
+// generator to make the generated code easier to read. To do this, we select
+// into "disjoint bits" pseudo ops.
+
+// Treat an 'or' node is as an 'add' if the or'ed bits are known to be zero.
+def or_is_add : PatFrag<(ops node:$lhs, node:$rhs), (or node:$lhs, node:$rhs),[{
+ if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N->getOperand(1)))
+ return CurDAG->MaskedValueIsZero(N->getOperand(0), CN->getAPIntValue());
+
+ APInt KnownZero0, KnownOne0;
+ CurDAG->ComputeMaskedBits(N->getOperand(0), KnownZero0, KnownOne0, 0);
+ APInt KnownZero1, KnownOne1;
+ CurDAG->ComputeMaskedBits(N->getOperand(1), KnownZero1, KnownOne1, 0);
+ return (~KnownZero0 & ~KnownZero1) == 0;
+}]>;
+
+
+// (or x1, x2) -> (add x1, x2) if two operands are known not to share bits.
+// Try this before the selecting to OR.
+let AddedComplexity = 5, SchedRW = [WriteALU] in {
+
+let isConvertibleToThreeAddress = 1,
+ Constraints = "$src1 = $dst", Defs = [EFLAGS] in {
+let isCommutable = 1 in {
+def ADD16rr_DB : I<0, Pseudo, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+ "", // orw/addw REG, REG
+ [(set GR16:$dst, (or_is_add GR16:$src1, GR16:$src2))]>;
+def ADD32rr_DB : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+ "", // orl/addl REG, REG
+ [(set GR32:$dst, (or_is_add GR32:$src1, GR32:$src2))]>;
+def ADD64rr_DB : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "", // orq/addq REG, REG
+ [(set GR64:$dst, (or_is_add GR64:$src1, GR64:$src2))]>;
+} // isCommutable
+
+// NOTE: These are order specific, we want the ri8 forms to be listed
+// first so that they are slightly preferred to the ri forms.
+
+def ADD16ri8_DB : I<0, Pseudo,
+ (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2),
+ "", // orw/addw REG, imm8
+ [(set GR16:$dst,(or_is_add GR16:$src1,i16immSExt8:$src2))]>;
+def ADD16ri_DB : I<0, Pseudo, (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2),
+ "", // orw/addw REG, imm
+ [(set GR16:$dst, (or_is_add GR16:$src1, imm:$src2))]>;
+
+def ADD32ri8_DB : I<0, Pseudo,
+ (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2),
+ "", // orl/addl REG, imm8
+ [(set GR32:$dst,(or_is_add GR32:$src1,i32immSExt8:$src2))]>;
+def ADD32ri_DB : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$src1, i32imm:$src2),
+ "", // orl/addl REG, imm
+ [(set GR32:$dst, (or_is_add GR32:$src1, imm:$src2))]>;
+
+
+def ADD64ri8_DB : I<0, Pseudo,
+ (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2),
+ "", // orq/addq REG, imm8
+ [(set GR64:$dst, (or_is_add GR64:$src1,
+ i64immSExt8:$src2))]>;
+def ADD64ri32_DB : I<0, Pseudo,
+ (outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2),
+ "", // orq/addq REG, imm
+ [(set GR64:$dst, (or_is_add GR64:$src1,
+ i64immSExt32:$src2))]>;
+}
+} // AddedComplexity, SchedRW
+
+
+//===----------------------------------------------------------------------===//
+// Some peepholes
+//===----------------------------------------------------------------------===//
+
+// Odd encoding trick: -128 fits into an 8-bit immediate field while
+// +128 doesn't, so in this special case use a sub instead of an add.
+def : Pat<(add GR16:$src1, 128),
+ (SUB16ri8 GR16:$src1, -128)>;
+def : Pat<(store (add (loadi16 addr:$dst), 128), addr:$dst),
+ (SUB16mi8 addr:$dst, -128)>;
+
+def : Pat<(add GR32:$src1, 128),
+ (SUB32ri8 GR32:$src1, -128)>;
+def : Pat<(store (add (loadi32 addr:$dst), 128), addr:$dst),
+ (SUB32mi8 addr:$dst, -128)>;
+
+def : Pat<(add GR64:$src1, 128),
+ (SUB64ri8 GR64:$src1, -128)>;
+def : Pat<(store (add (loadi64 addr:$dst), 128), addr:$dst),
+ (SUB64mi8 addr:$dst, -128)>;
+
+// The same trick applies for 32-bit immediate fields in 64-bit
+// instructions.
+def : Pat<(add GR64:$src1, 0x0000000080000000),
+ (SUB64ri32 GR64:$src1, 0xffffffff80000000)>;
+def : Pat<(store (add (loadi64 addr:$dst), 0x00000000800000000), addr:$dst),
+ (SUB64mi32 addr:$dst, 0xffffffff80000000)>;
+
+// To avoid needing to materialize an immediate in a register, use a 32-bit and
+// with implicit zero-extension instead of a 64-bit and if the immediate has at
+// least 32 bits of leading zeros. If in addition the last 32 bits can be
+// represented with a sign extension of a 8 bit constant, use that.
+
+def : Pat<(and GR64:$src, i64immZExt32SExt8:$imm),
+ (SUBREG_TO_REG
+ (i64 0),
+ (AND32ri8
+ (EXTRACT_SUBREG GR64:$src, sub_32bit),
+ (i32 (GetLo8XForm imm:$imm))),
+ sub_32bit)>;
+
+def : Pat<(and GR64:$src, i64immZExt32:$imm),
+ (SUBREG_TO_REG
+ (i64 0),
+ (AND32ri
+ (EXTRACT_SUBREG GR64:$src, sub_32bit),
+ (i32 (GetLo32XForm imm:$imm))),
+ sub_32bit)>;
+
+
+// r & (2^16-1) ==> movz
+def : Pat<(and GR32:$src1, 0xffff),
+ (MOVZX32rr16 (EXTRACT_SUBREG GR32:$src1, sub_16bit))>;
+// r & (2^8-1) ==> movz
+def : Pat<(and GR32:$src1, 0xff),
+ (MOVZX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src1,
+ GR32_ABCD)),
+ sub_8bit))>,
+ Requires<[In32BitMode]>;
+// r & (2^8-1) ==> movz
+def : Pat<(and GR16:$src1, 0xff),
+ (EXTRACT_SUBREG (MOVZX32rr8 (EXTRACT_SUBREG
+ (i16 (COPY_TO_REGCLASS GR16:$src1, GR16_ABCD)), sub_8bit)),
+ sub_16bit)>,
+ Requires<[In32BitMode]>;
+
+// r & (2^32-1) ==> movz
+def : Pat<(and GR64:$src, 0x00000000FFFFFFFF),
+ (MOVZX64rr32 (EXTRACT_SUBREG GR64:$src, sub_32bit))>;
+// r & (2^16-1) ==> movz
+def : Pat<(and GR64:$src, 0xffff),
+ (MOVZX64rr16 (i16 (EXTRACT_SUBREG GR64:$src, sub_16bit)))>;
+// r & (2^8-1) ==> movz
+def : Pat<(and GR64:$src, 0xff),
+ (MOVZX64rr8 (i8 (EXTRACT_SUBREG GR64:$src, sub_8bit)))>;
+// r & (2^8-1) ==> movz
+def : Pat<(and GR32:$src1, 0xff),
+ (MOVZX32rr8 (EXTRACT_SUBREG GR32:$src1, sub_8bit))>,
+ Requires<[In64BitMode]>;
+// r & (2^8-1) ==> movz
+def : Pat<(and GR16:$src1, 0xff),
+ (EXTRACT_SUBREG (MOVZX32rr8 (i8
+ (EXTRACT_SUBREG GR16:$src1, sub_8bit))), sub_16bit)>,
+ Requires<[In64BitMode]>;
+
+
+// sext_inreg patterns
+def : Pat<(sext_inreg GR32:$src, i16),
+ (MOVSX32rr16 (EXTRACT_SUBREG GR32:$src, sub_16bit))>;
+def : Pat<(sext_inreg GR32:$src, i8),
+ (MOVSX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src,
+ GR32_ABCD)),
+ sub_8bit))>,
+ Requires<[In32BitMode]>;
+
+def : Pat<(sext_inreg GR16:$src, i8),
+ (EXTRACT_SUBREG (i32 (MOVSX32rr8 (EXTRACT_SUBREG
+ (i32 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), sub_8bit))),
+ sub_16bit)>,
+ Requires<[In32BitMode]>;
+
+def : Pat<(sext_inreg GR64:$src, i32),
+ (MOVSX64rr32 (EXTRACT_SUBREG GR64:$src, sub_32bit))>;
+def : Pat<(sext_inreg GR64:$src, i16),
+ (MOVSX64rr16 (EXTRACT_SUBREG GR64:$src, sub_16bit))>;
+def : Pat<(sext_inreg GR64:$src, i8),
+ (MOVSX64rr8 (EXTRACT_SUBREG GR64:$src, sub_8bit))>;
+def : Pat<(sext_inreg GR32:$src, i8),
+ (MOVSX32rr8 (EXTRACT_SUBREG GR32:$src, sub_8bit))>,
+ Requires<[In64BitMode]>;
+def : Pat<(sext_inreg GR16:$src, i8),
+ (EXTRACT_SUBREG (MOVSX32rr8
+ (EXTRACT_SUBREG GR16:$src, sub_8bit)), sub_16bit)>,
+ Requires<[In64BitMode]>;
+
+// sext, sext_load, zext, zext_load
+def: Pat<(i16 (sext GR8:$src)),
+ (EXTRACT_SUBREG (MOVSX32rr8 GR8:$src), sub_16bit)>;
+def: Pat<(sextloadi16i8 addr:$src),
+ (EXTRACT_SUBREG (MOVSX32rm8 addr:$src), sub_16bit)>;
+def: Pat<(i16 (zext GR8:$src)),
+ (EXTRACT_SUBREG (MOVZX32rr8 GR8:$src), sub_16bit)>;
+def: Pat<(zextloadi16i8 addr:$src),
+ (EXTRACT_SUBREG (MOVZX32rm8 addr:$src), sub_16bit)>;
+
+// trunc patterns
+def : Pat<(i16 (trunc GR32:$src)),
+ (EXTRACT_SUBREG GR32:$src, sub_16bit)>;
+def : Pat<(i8 (trunc GR32:$src)),
+ (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)),
+ sub_8bit)>,
+ Requires<[In32BitMode]>;
+def : Pat<(i8 (trunc GR16:$src)),
+ (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
+ sub_8bit)>,
+ Requires<[In32BitMode]>;
+def : Pat<(i32 (trunc GR64:$src)),
+ (EXTRACT_SUBREG GR64:$src, sub_32bit)>;
+def : Pat<(i16 (trunc GR64:$src)),
+ (EXTRACT_SUBREG GR64:$src, sub_16bit)>;
+def : Pat<(i8 (trunc GR64:$src)),
+ (EXTRACT_SUBREG GR64:$src, sub_8bit)>;
+def : Pat<(i8 (trunc GR32:$src)),
+ (EXTRACT_SUBREG GR32:$src, sub_8bit)>,
+ Requires<[In64BitMode]>;
+def : Pat<(i8 (trunc GR16:$src)),
+ (EXTRACT_SUBREG GR16:$src, sub_8bit)>,
+ Requires<[In64BitMode]>;
+
+// h-register tricks
+def : Pat<(i8 (trunc (srl_su GR16:$src, (i8 8)))),
+ (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
+ sub_8bit_hi)>,
+ Requires<[In32BitMode]>;
+def : Pat<(i8 (trunc (srl_su GR32:$src, (i8 8)))),
+ (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)),
+ sub_8bit_hi)>,
+ Requires<[In32BitMode]>;
+def : Pat<(srl GR16:$src, (i8 8)),
+ (EXTRACT_SUBREG
+ (MOVZX32rr8
+ (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
+ sub_8bit_hi)),
+ sub_16bit)>,
+ Requires<[In32BitMode]>;
+def : Pat<(i32 (zext (srl_su GR16:$src, (i8 8)))),
+ (MOVZX32rr8 (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src,
+ GR16_ABCD)),
+ sub_8bit_hi))>,
+ Requires<[In32BitMode]>;
+def : Pat<(i32 (anyext (srl_su GR16:$src, (i8 8)))),
+ (MOVZX32rr8 (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src,
+ GR16_ABCD)),
+ sub_8bit_hi))>,
+ Requires<[In32BitMode]>;
+def : Pat<(and (srl_su GR32:$src, (i8 8)), (i32 255)),
+ (MOVZX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src,
+ GR32_ABCD)),
+ sub_8bit_hi))>,
+ Requires<[In32BitMode]>;
+def : Pat<(srl (and_su GR32:$src, 0xff00), (i8 8)),
+ (MOVZX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src,
+ GR32_ABCD)),
+ sub_8bit_hi))>,
+ Requires<[In32BitMode]>;
+
+// h-register tricks.
+// For now, be conservative on x86-64 and use an h-register extract only if the
+// value is immediately zero-extended or stored, which are somewhat common
+// cases. This uses a bunch of code to prevent a register requiring a REX prefix
+// from being allocated in the same instruction as the h register, as there's
+// currently no way to describe this requirement to the register allocator.
+
+// h-register extract and zero-extend.
+def : Pat<(and (srl_su GR64:$src, (i8 8)), (i64 255)),
+ (SUBREG_TO_REG
+ (i64 0),
+ (MOVZX32_NOREXrr8
+ (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS GR64:$src, GR64_ABCD)),
+ sub_8bit_hi)),
+ sub_32bit)>;
+def : Pat<(and (srl_su GR32:$src, (i8 8)), (i32 255)),
+ (MOVZX32_NOREXrr8
+ (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)),
+ sub_8bit_hi))>,
+ Requires<[In64BitMode]>;
+def : Pat<(srl (and_su GR32:$src, 0xff00), (i8 8)),
+ (MOVZX32_NOREXrr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src,
+ GR32_ABCD)),
+ sub_8bit_hi))>,
+ Requires<[In64BitMode]>;
+def : Pat<(srl GR16:$src, (i8 8)),
+ (EXTRACT_SUBREG
+ (MOVZX32_NOREXrr8
+ (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
+ sub_8bit_hi)),
+ sub_16bit)>,
+ Requires<[In64BitMode]>;
+def : Pat<(i32 (zext (srl_su GR16:$src, (i8 8)))),
+ (MOVZX32_NOREXrr8
+ (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
+ sub_8bit_hi))>,
+ Requires<[In64BitMode]>;
+def : Pat<(i32 (anyext (srl_su GR16:$src, (i8 8)))),
+ (MOVZX32_NOREXrr8
+ (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
+ sub_8bit_hi))>,
+ Requires<[In64BitMode]>;
+def : Pat<(i64 (zext (srl_su GR16:$src, (i8 8)))),
+ (SUBREG_TO_REG
+ (i64 0),
+ (MOVZX32_NOREXrr8
+ (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
+ sub_8bit_hi)),
+ sub_32bit)>;
+def : Pat<(i64 (anyext (srl_su GR16:$src, (i8 8)))),
+ (SUBREG_TO_REG
+ (i64 0),
+ (MOVZX32_NOREXrr8
+ (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
+ sub_8bit_hi)),
+ sub_32bit)>;
+
+// h-register extract and store.
+def : Pat<(store (i8 (trunc_su (srl_su GR64:$src, (i8 8)))), addr:$dst),
+ (MOV8mr_NOREX
+ addr:$dst,
+ (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS GR64:$src, GR64_ABCD)),
+ sub_8bit_hi))>;
+def : Pat<(store (i8 (trunc_su (srl_su GR32:$src, (i8 8)))), addr:$dst),
+ (MOV8mr_NOREX
+ addr:$dst,
+ (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)),
+ sub_8bit_hi))>,
+ Requires<[In64BitMode]>;
+def : Pat<(store (i8 (trunc_su (srl_su GR16:$src, (i8 8)))), addr:$dst),
+ (MOV8mr_NOREX
+ addr:$dst,
+ (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
+ sub_8bit_hi))>,
+ Requires<[In64BitMode]>;
+
+
+// (shl x, 1) ==> (add x, x)
+// Note that if x is undef (immediate or otherwise), we could theoretically
+// end up with the two uses of x getting different values, producing a result
+// where the least significant bit is not 0. However, the probability of this
+// happening is considered low enough that this is officially not a
+// "real problem".
+def : Pat<(shl GR8 :$src1, (i8 1)), (ADD8rr GR8 :$src1, GR8 :$src1)>;
+def : Pat<(shl GR16:$src1, (i8 1)), (ADD16rr GR16:$src1, GR16:$src1)>;
+def : Pat<(shl GR32:$src1, (i8 1)), (ADD32rr GR32:$src1, GR32:$src1)>;
+def : Pat<(shl GR64:$src1, (i8 1)), (ADD64rr GR64:$src1, GR64:$src1)>;
+
+// Helper imms that check if a mask doesn't change significant shift bits.
+def immShift32 : ImmLeaf<i8, [{ return CountTrailingOnes_32(Imm) >= 5; }]>;
+def immShift64 : ImmLeaf<i8, [{ return CountTrailingOnes_32(Imm) >= 6; }]>;
+
+// (shl x (and y, 31)) ==> (shl x, y)
+def : Pat<(shl GR8:$src1, (and CL, immShift32)),
+ (SHL8rCL GR8:$src1)>;
+def : Pat<(shl GR16:$src1, (and CL, immShift32)),
+ (SHL16rCL GR16:$src1)>;
+def : Pat<(shl GR32:$src1, (and CL, immShift32)),
+ (SHL32rCL GR32:$src1)>;
+def : Pat<(store (shl (loadi8 addr:$dst), (and CL, immShift32)), addr:$dst),
+ (SHL8mCL addr:$dst)>;
+def : Pat<(store (shl (loadi16 addr:$dst), (and CL, immShift32)), addr:$dst),
+ (SHL16mCL addr:$dst)>;
+def : Pat<(store (shl (loadi32 addr:$dst), (and CL, immShift32)), addr:$dst),
+ (SHL32mCL addr:$dst)>;
+
+def : Pat<(srl GR8:$src1, (and CL, immShift32)),
+ (SHR8rCL GR8:$src1)>;
+def : Pat<(srl GR16:$src1, (and CL, immShift32)),
+ (SHR16rCL GR16:$src1)>;
+def : Pat<(srl GR32:$src1, (and CL, immShift32)),
+ (SHR32rCL GR32:$src1)>;
+def : Pat<(store (srl (loadi8 addr:$dst), (and CL, immShift32)), addr:$dst),
+ (SHR8mCL addr:$dst)>;
+def : Pat<(store (srl (loadi16 addr:$dst), (and CL, immShift32)), addr:$dst),
+ (SHR16mCL addr:$dst)>;
+def : Pat<(store (srl (loadi32 addr:$dst), (and CL, immShift32)), addr:$dst),
+ (SHR32mCL addr:$dst)>;
+
+def : Pat<(sra GR8:$src1, (and CL, immShift32)),
+ (SAR8rCL GR8:$src1)>;
+def : Pat<(sra GR16:$src1, (and CL, immShift32)),
+ (SAR16rCL GR16:$src1)>;
+def : Pat<(sra GR32:$src1, (and CL, immShift32)),
+ (SAR32rCL GR32:$src1)>;
+def : Pat<(store (sra (loadi8 addr:$dst), (and CL, immShift32)), addr:$dst),
+ (SAR8mCL addr:$dst)>;
+def : Pat<(store (sra (loadi16 addr:$dst), (and CL, immShift32)), addr:$dst),
+ (SAR16mCL addr:$dst)>;
+def : Pat<(store (sra (loadi32 addr:$dst), (and CL, immShift32)), addr:$dst),
+ (SAR32mCL addr:$dst)>;
+
+// (shl x (and y, 63)) ==> (shl x, y)
+def : Pat<(shl GR64:$src1, (and CL, immShift64)),
+ (SHL64rCL GR64:$src1)>;
+def : Pat<(store (shl (loadi64 addr:$dst), (and CL, 63)), addr:$dst),
+ (SHL64mCL addr:$dst)>;
+
+def : Pat<(srl GR64:$src1, (and CL, immShift64)),
+ (SHR64rCL GR64:$src1)>;
+def : Pat<(store (srl (loadi64 addr:$dst), (and CL, 63)), addr:$dst),
+ (SHR64mCL addr:$dst)>;
+
+def : Pat<(sra GR64:$src1, (and CL, immShift64)),
+ (SAR64rCL GR64:$src1)>;
+def : Pat<(store (sra (loadi64 addr:$dst), (and CL, 63)), addr:$dst),
+ (SAR64mCL addr:$dst)>;
+
+
+// (anyext (setcc_carry)) -> (setcc_carry)
+def : Pat<(i16 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
+ (SETB_C16r)>;
+def : Pat<(i32 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
+ (SETB_C32r)>;
+def : Pat<(i32 (anyext (i16 (X86setcc_c X86_COND_B, EFLAGS)))),
+ (SETB_C32r)>;
+
+
+
+
+//===----------------------------------------------------------------------===//
+// EFLAGS-defining Patterns
+//===----------------------------------------------------------------------===//
+
+// add reg, reg
+def : Pat<(add GR8 :$src1, GR8 :$src2), (ADD8rr GR8 :$src1, GR8 :$src2)>;
+def : Pat<(add GR16:$src1, GR16:$src2), (ADD16rr GR16:$src1, GR16:$src2)>;
+def : Pat<(add GR32:$src1, GR32:$src2), (ADD32rr GR32:$src1, GR32:$src2)>;
+
+// add reg, mem
+def : Pat<(add GR8:$src1, (loadi8 addr:$src2)),
+ (ADD8rm GR8:$src1, addr:$src2)>;
+def : Pat<(add GR16:$src1, (loadi16 addr:$src2)),
+ (ADD16rm GR16:$src1, addr:$src2)>;
+def : Pat<(add GR32:$src1, (loadi32 addr:$src2)),
+ (ADD32rm GR32:$src1, addr:$src2)>;
+
+// add reg, imm
+def : Pat<(add GR8 :$src1, imm:$src2), (ADD8ri GR8:$src1 , imm:$src2)>;
+def : Pat<(add GR16:$src1, imm:$src2), (ADD16ri GR16:$src1, imm:$src2)>;
+def : Pat<(add GR32:$src1, imm:$src2), (ADD32ri GR32:$src1, imm:$src2)>;
+def : Pat<(add GR16:$src1, i16immSExt8:$src2),
+ (ADD16ri8 GR16:$src1, i16immSExt8:$src2)>;
+def : Pat<(add GR32:$src1, i32immSExt8:$src2),
+ (ADD32ri8 GR32:$src1, i32immSExt8:$src2)>;
+
+// sub reg, reg
+def : Pat<(sub GR8 :$src1, GR8 :$src2), (SUB8rr GR8 :$src1, GR8 :$src2)>;
+def : Pat<(sub GR16:$src1, GR16:$src2), (SUB16rr GR16:$src1, GR16:$src2)>;
+def : Pat<(sub GR32:$src1, GR32:$src2), (SUB32rr GR32:$src1, GR32:$src2)>;
+
+// sub reg, mem
+def : Pat<(sub GR8:$src1, (loadi8 addr:$src2)),
+ (SUB8rm GR8:$src1, addr:$src2)>;
+def : Pat<(sub GR16:$src1, (loadi16 addr:$src2)),
+ (SUB16rm GR16:$src1, addr:$src2)>;
+def : Pat<(sub GR32:$src1, (loadi32 addr:$src2)),
+ (SUB32rm GR32:$src1, addr:$src2)>;
+
+// sub reg, imm
+def : Pat<(sub GR8:$src1, imm:$src2),
+ (SUB8ri GR8:$src1, imm:$src2)>;
+def : Pat<(sub GR16:$src1, imm:$src2),
+ (SUB16ri GR16:$src1, imm:$src2)>;
+def : Pat<(sub GR32:$src1, imm:$src2),
+ (SUB32ri GR32:$src1, imm:$src2)>;
+def : Pat<(sub GR16:$src1, i16immSExt8:$src2),
+ (SUB16ri8 GR16:$src1, i16immSExt8:$src2)>;
+def : Pat<(sub GR32:$src1, i32immSExt8:$src2),
+ (SUB32ri8 GR32:$src1, i32immSExt8:$src2)>;
+
+// sub 0, reg
+def : Pat<(X86sub_flag 0, GR8 :$src), (NEG8r GR8 :$src)>;
+def : Pat<(X86sub_flag 0, GR16:$src), (NEG16r GR16:$src)>;
+def : Pat<(X86sub_flag 0, GR32:$src), (NEG32r GR32:$src)>;
+def : Pat<(X86sub_flag 0, GR64:$src), (NEG64r GR64:$src)>;
+
+// mul reg, reg
+def : Pat<(mul GR16:$src1, GR16:$src2),
+ (IMUL16rr GR16:$src1, GR16:$src2)>;
+def : Pat<(mul GR32:$src1, GR32:$src2),
+ (IMUL32rr GR32:$src1, GR32:$src2)>;
+
+// mul reg, mem
+def : Pat<(mul GR16:$src1, (loadi16 addr:$src2)),
+ (IMUL16rm GR16:$src1, addr:$src2)>;
+def : Pat<(mul GR32:$src1, (loadi32 addr:$src2)),
+ (IMUL32rm GR32:$src1, addr:$src2)>;
+
+// mul reg, imm
+def : Pat<(mul GR16:$src1, imm:$src2),
+ (IMUL16rri GR16:$src1, imm:$src2)>;
+def : Pat<(mul GR32:$src1, imm:$src2),
+ (IMUL32rri GR32:$src1, imm:$src2)>;
+def : Pat<(mul GR16:$src1, i16immSExt8:$src2),
+ (IMUL16rri8 GR16:$src1, i16immSExt8:$src2)>;
+def : Pat<(mul GR32:$src1, i32immSExt8:$src2),
+ (IMUL32rri8 GR32:$src1, i32immSExt8:$src2)>;
+
+// reg = mul mem, imm
+def : Pat<(mul (loadi16 addr:$src1), imm:$src2),
+ (IMUL16rmi addr:$src1, imm:$src2)>;
+def : Pat<(mul (loadi32 addr:$src1), imm:$src2),
+ (IMUL32rmi addr:$src1, imm:$src2)>;
+def : Pat<(mul (loadi16 addr:$src1), i16immSExt8:$src2),
+ (IMUL16rmi8 addr:$src1, i16immSExt8:$src2)>;
+def : Pat<(mul (loadi32 addr:$src1), i32immSExt8:$src2),
+ (IMUL32rmi8 addr:$src1, i32immSExt8:$src2)>;
+
+// Patterns for nodes that do not produce flags, for instructions that do.
+
+// addition
+def : Pat<(add GR64:$src1, GR64:$src2),
+ (ADD64rr GR64:$src1, GR64:$src2)>;
+def : Pat<(add GR64:$src1, i64immSExt8:$src2),
+ (ADD64ri8 GR64:$src1, i64immSExt8:$src2)>;
+def : Pat<(add GR64:$src1, i64immSExt32:$src2),
+ (ADD64ri32 GR64:$src1, i64immSExt32:$src2)>;
+def : Pat<(add GR64:$src1, (loadi64 addr:$src2)),
+ (ADD64rm GR64:$src1, addr:$src2)>;
+
+// subtraction
+def : Pat<(sub GR64:$src1, GR64:$src2),
+ (SUB64rr GR64:$src1, GR64:$src2)>;
+def : Pat<(sub GR64:$src1, (loadi64 addr:$src2)),
+ (SUB64rm GR64:$src1, addr:$src2)>;
+def : Pat<(sub GR64:$src1, i64immSExt8:$src2),
+ (SUB64ri8 GR64:$src1, i64immSExt8:$src2)>;
+def : Pat<(sub GR64:$src1, i64immSExt32:$src2),
+ (SUB64ri32 GR64:$src1, i64immSExt32:$src2)>;
+
+// Multiply
+def : Pat<(mul GR64:$src1, GR64:$src2),
+ (IMUL64rr GR64:$src1, GR64:$src2)>;
+def : Pat<(mul GR64:$src1, (loadi64 addr:$src2)),
+ (IMUL64rm GR64:$src1, addr:$src2)>;
+def : Pat<(mul GR64:$src1, i64immSExt8:$src2),
+ (IMUL64rri8 GR64:$src1, i64immSExt8:$src2)>;
+def : Pat<(mul GR64:$src1, i64immSExt32:$src2),
+ (IMUL64rri32 GR64:$src1, i64immSExt32:$src2)>;
+def : Pat<(mul (loadi64 addr:$src1), i64immSExt8:$src2),
+ (IMUL64rmi8 addr:$src1, i64immSExt8:$src2)>;
+def : Pat<(mul (loadi64 addr:$src1), i64immSExt32:$src2),
+ (IMUL64rmi32 addr:$src1, i64immSExt32:$src2)>;
+
+// Increment reg.
+def : Pat<(add GR8 :$src, 1), (INC8r GR8 :$src)>;
+def : Pat<(add GR16:$src, 1), (INC16r GR16:$src)>, Requires<[In32BitMode]>;
+def : Pat<(add GR16:$src, 1), (INC64_16r GR16:$src)>, Requires<[In64BitMode]>;
+def : Pat<(add GR32:$src, 1), (INC32r GR32:$src)>, Requires<[In32BitMode]>;
+def : Pat<(add GR32:$src, 1), (INC64_32r GR32:$src)>, Requires<[In64BitMode]>;
+def : Pat<(add GR64:$src, 1), (INC64r GR64:$src)>;
+
+// Decrement reg.
+def : Pat<(add GR8 :$src, -1), (DEC8r GR8 :$src)>;
+def : Pat<(add GR16:$src, -1), (DEC16r GR16:$src)>, Requires<[In32BitMode]>;
+def : Pat<(add GR16:$src, -1), (DEC64_16r GR16:$src)>, Requires<[In64BitMode]>;
+def : Pat<(add GR32:$src, -1), (DEC32r GR32:$src)>, Requires<[In32BitMode]>;
+def : Pat<(add GR32:$src, -1), (DEC64_32r GR32:$src)>, Requires<[In64BitMode]>;
+def : Pat<(add GR64:$src, -1), (DEC64r GR64:$src)>;
+
+// or reg/reg.
+def : Pat<(or GR8 :$src1, GR8 :$src2), (OR8rr GR8 :$src1, GR8 :$src2)>;
+def : Pat<(or GR16:$src1, GR16:$src2), (OR16rr GR16:$src1, GR16:$src2)>;
+def : Pat<(or GR32:$src1, GR32:$src2), (OR32rr GR32:$src1, GR32:$src2)>;
+def : Pat<(or GR64:$src1, GR64:$src2), (OR64rr GR64:$src1, GR64:$src2)>;
+
+// or reg/mem
+def : Pat<(or GR8:$src1, (loadi8 addr:$src2)),
+ (OR8rm GR8:$src1, addr:$src2)>;
+def : Pat<(or GR16:$src1, (loadi16 addr:$src2)),
+ (OR16rm GR16:$src1, addr:$src2)>;
+def : Pat<(or GR32:$src1, (loadi32 addr:$src2)),
+ (OR32rm GR32:$src1, addr:$src2)>;
+def : Pat<(or GR64:$src1, (loadi64 addr:$src2)),
+ (OR64rm GR64:$src1, addr:$src2)>;
+
+// or reg/imm
+def : Pat<(or GR8:$src1 , imm:$src2), (OR8ri GR8 :$src1, imm:$src2)>;
+def : Pat<(or GR16:$src1, imm:$src2), (OR16ri GR16:$src1, imm:$src2)>;
+def : Pat<(or GR32:$src1, imm:$src2), (OR32ri GR32:$src1, imm:$src2)>;
+def : Pat<(or GR16:$src1, i16immSExt8:$src2),
+ (OR16ri8 GR16:$src1, i16immSExt8:$src2)>;
+def : Pat<(or GR32:$src1, i32immSExt8:$src2),
+ (OR32ri8 GR32:$src1, i32immSExt8:$src2)>;
+def : Pat<(or GR64:$src1, i64immSExt8:$src2),
+ (OR64ri8 GR64:$src1, i64immSExt8:$src2)>;
+def : Pat<(or GR64:$src1, i64immSExt32:$src2),
+ (OR64ri32 GR64:$src1, i64immSExt32:$src2)>;
+
+// xor reg/reg
+def : Pat<(xor GR8 :$src1, GR8 :$src2), (XOR8rr GR8 :$src1, GR8 :$src2)>;
+def : Pat<(xor GR16:$src1, GR16:$src2), (XOR16rr GR16:$src1, GR16:$src2)>;
+def : Pat<(xor GR32:$src1, GR32:$src2), (XOR32rr GR32:$src1, GR32:$src2)>;
+def : Pat<(xor GR64:$src1, GR64:$src2), (XOR64rr GR64:$src1, GR64:$src2)>;
+
+// xor reg/mem
+def : Pat<(xor GR8:$src1, (loadi8 addr:$src2)),
+ (XOR8rm GR8:$src1, addr:$src2)>;
+def : Pat<(xor GR16:$src1, (loadi16 addr:$src2)),
+ (XOR16rm GR16:$src1, addr:$src2)>;
+def : Pat<(xor GR32:$src1, (loadi32 addr:$src2)),
+ (XOR32rm GR32:$src1, addr:$src2)>;
+def : Pat<(xor GR64:$src1, (loadi64 addr:$src2)),
+ (XOR64rm GR64:$src1, addr:$src2)>;
+
+// xor reg/imm
+def : Pat<(xor GR8:$src1, imm:$src2),
+ (XOR8ri GR8:$src1, imm:$src2)>;
+def : Pat<(xor GR16:$src1, imm:$src2),
+ (XOR16ri GR16:$src1, imm:$src2)>;
+def : Pat<(xor GR32:$src1, imm:$src2),
+ (XOR32ri GR32:$src1, imm:$src2)>;
+def : Pat<(xor GR16:$src1, i16immSExt8:$src2),
+ (XOR16ri8 GR16:$src1, i16immSExt8:$src2)>;
+def : Pat<(xor GR32:$src1, i32immSExt8:$src2),
+ (XOR32ri8 GR32:$src1, i32immSExt8:$src2)>;
+def : Pat<(xor GR64:$src1, i64immSExt8:$src2),
+ (XOR64ri8 GR64:$src1, i64immSExt8:$src2)>;
+def : Pat<(xor GR64:$src1, i64immSExt32:$src2),
+ (XOR64ri32 GR64:$src1, i64immSExt32:$src2)>;
+
+// and reg/reg
+def : Pat<(and GR8 :$src1, GR8 :$src2), (AND8rr GR8 :$src1, GR8 :$src2)>;
+def : Pat<(and GR16:$src1, GR16:$src2), (AND16rr GR16:$src1, GR16:$src2)>;
+def : Pat<(and GR32:$src1, GR32:$src2), (AND32rr GR32:$src1, GR32:$src2)>;
+def : Pat<(and GR64:$src1, GR64:$src2), (AND64rr GR64:$src1, GR64:$src2)>;
+
+// and reg/mem
+def : Pat<(and GR8:$src1, (loadi8 addr:$src2)),
+ (AND8rm GR8:$src1, addr:$src2)>;
+def : Pat<(and GR16:$src1, (loadi16 addr:$src2)),
+ (AND16rm GR16:$src1, addr:$src2)>;
+def : Pat<(and GR32:$src1, (loadi32 addr:$src2)),
+ (AND32rm GR32:$src1, addr:$src2)>;
+def : Pat<(and GR64:$src1, (loadi64 addr:$src2)),
+ (AND64rm GR64:$src1, addr:$src2)>;
+
+// and reg/imm
+def : Pat<(and GR8:$src1, imm:$src2),
+ (AND8ri GR8:$src1, imm:$src2)>;
+def : Pat<(and GR16:$src1, imm:$src2),
+ (AND16ri GR16:$src1, imm:$src2)>;
+def : Pat<(and GR32:$src1, imm:$src2),
+ (AND32ri GR32:$src1, imm:$src2)>;
+def : Pat<(and GR16:$src1, i16immSExt8:$src2),
+ (AND16ri8 GR16:$src1, i16immSExt8:$src2)>;
+def : Pat<(and GR32:$src1, i32immSExt8:$src2),
+ (AND32ri8 GR32:$src1, i32immSExt8:$src2)>;
+def : Pat<(and GR64:$src1, i64immSExt8:$src2),
+ (AND64ri8 GR64:$src1, i64immSExt8:$src2)>;
+def : Pat<(and GR64:$src1, i64immSExt32:$src2),
+ (AND64ri32 GR64:$src1, i64immSExt32:$src2)>;
+
+// Bit scan instruction patterns to match explicit zero-undef behavior.
+def : Pat<(cttz_zero_undef GR16:$src), (BSF16rr GR16:$src)>;
+def : Pat<(cttz_zero_undef GR32:$src), (BSF32rr GR32:$src)>;
+def : Pat<(cttz_zero_undef GR64:$src), (BSF64rr GR64:$src)>;
+def : Pat<(cttz_zero_undef (loadi16 addr:$src)), (BSF16rm addr:$src)>;
+def : Pat<(cttz_zero_undef (loadi32 addr:$src)), (BSF32rm addr:$src)>;
+def : Pat<(cttz_zero_undef (loadi64 addr:$src)), (BSF64rm addr:$src)>;
diff --git a/contrib/llvm/lib/Target/X86/X86InstrControl.td b/contrib/llvm/lib/Target/X86/X86InstrControl.td
new file mode 100644
index 0000000..0e69651
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrControl.td
@@ -0,0 +1,287 @@
+//===-- X86InstrControl.td - Control Flow Instructions -----*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 jump, return, call, and related instructions.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Control Flow Instructions.
+//
+
+// Return instructions.
+//
+// The X86retflag return instructions are variadic because we may add ST0 and
+// ST1 arguments when returning values on the x87 stack.
+let isTerminator = 1, isReturn = 1, isBarrier = 1,
+ hasCtrlDep = 1, FPForm = SpecialFP, SchedRW = [WriteJumpLd] in {
+ def RET : I <0xC3, RawFrm, (outs), (ins variable_ops),
+ "ret",
+ [(X86retflag 0)], IIC_RET>;
+ def RETW : I <0xC3, RawFrm, (outs), (ins),
+ "ret{w}",
+ [], IIC_RET>, OpSize;
+ def RETI : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt, variable_ops),
+ "ret\t$amt",
+ [(X86retflag timm:$amt)], IIC_RET_IMM>;
+ def RETIW : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt),
+ "ret{w}\t$amt",
+ [], IIC_RET_IMM>, OpSize;
+ def LRETL : I <0xCB, RawFrm, (outs), (ins),
+ "{l}ret{l|f}", [], IIC_RET>;
+ def LRETW : I <0xCB, RawFrm, (outs), (ins),
+ "{l}ret{w|f}", [], IIC_RET>, OpSize;
+ def LRETQ : RI <0xCB, RawFrm, (outs), (ins),
+ "{l}ret{q|f}", [], IIC_RET>;
+ def LRETI : Ii16<0xCA, RawFrm, (outs), (ins i16imm:$amt),
+ "{l}ret{l|f}\t$amt", [], IIC_RET>;
+ def LRETIW : Ii16<0xCA, RawFrm, (outs), (ins i16imm:$amt),
+ "{l}ret{w|f}\t$amt", [], IIC_RET>, OpSize;
+}
+
+// Unconditional branches.
+let isBarrier = 1, isBranch = 1, isTerminator = 1, SchedRW = [WriteJump] in {
+ def JMP_4 : Ii32PCRel<0xE9, RawFrm, (outs), (ins brtarget:$dst),
+ "jmp\t$dst", [(br bb:$dst)], IIC_JMP_REL>;
+ def JMP_1 : Ii8PCRel<0xEB, RawFrm, (outs), (ins brtarget8:$dst),
+ "jmp\t$dst", [], IIC_JMP_REL>;
+ // FIXME : Intel syntax for JMP64pcrel32 such that it is not ambiguious
+ // with JMP_1.
+ def JMP64pcrel32 : I<0xE9, RawFrm, (outs), (ins brtarget:$dst),
+ "jmpq\t$dst", [], IIC_JMP_REL>;
+}
+
+// Conditional Branches.
+let isBranch = 1, isTerminator = 1, Uses = [EFLAGS], SchedRW = [WriteJump] in {
+ multiclass ICBr<bits<8> opc1, bits<8> opc4, string asm, PatFrag Cond> {
+ def _1 : Ii8PCRel <opc1, RawFrm, (outs), (ins brtarget8:$dst), asm, [],
+ IIC_Jcc>;
+ def _4 : Ii32PCRel<opc4, RawFrm, (outs), (ins brtarget:$dst), asm,
+ [(X86brcond bb:$dst, Cond, EFLAGS)], IIC_Jcc>, TB;
+ }
+}
+
+defm JO : ICBr<0x70, 0x80, "jo\t$dst" , X86_COND_O>;
+defm JNO : ICBr<0x71, 0x81, "jno\t$dst" , X86_COND_NO>;
+defm JB : ICBr<0x72, 0x82, "jb\t$dst" , X86_COND_B>;
+defm JAE : ICBr<0x73, 0x83, "jae\t$dst", X86_COND_AE>;
+defm JE : ICBr<0x74, 0x84, "je\t$dst" , X86_COND_E>;
+defm JNE : ICBr<0x75, 0x85, "jne\t$dst", X86_COND_NE>;
+defm JBE : ICBr<0x76, 0x86, "jbe\t$dst", X86_COND_BE>;
+defm JA : ICBr<0x77, 0x87, "ja\t$dst" , X86_COND_A>;
+defm JS : ICBr<0x78, 0x88, "js\t$dst" , X86_COND_S>;
+defm JNS : ICBr<0x79, 0x89, "jns\t$dst", X86_COND_NS>;
+defm JP : ICBr<0x7A, 0x8A, "jp\t$dst" , X86_COND_P>;
+defm JNP : ICBr<0x7B, 0x8B, "jnp\t$dst", X86_COND_NP>;
+defm JL : ICBr<0x7C, 0x8C, "jl\t$dst" , X86_COND_L>;
+defm JGE : ICBr<0x7D, 0x8D, "jge\t$dst", X86_COND_GE>;
+defm JLE : ICBr<0x7E, 0x8E, "jle\t$dst", X86_COND_LE>;
+defm JG : ICBr<0x7F, 0x8F, "jg\t$dst" , X86_COND_G>;
+
+// jcx/jecx/jrcx instructions.
+let isBranch = 1, isTerminator = 1, SchedRW = [WriteJump] in {
+ // These are the 32-bit versions of this instruction for the asmparser. In
+ // 32-bit mode, the address size prefix is jcxz and the unprefixed version is
+ // jecxz.
+ let Uses = [CX] in
+ def JCXZ : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst),
+ "jcxz\t$dst", [], IIC_JCXZ>, AdSize, Requires<[In32BitMode]>;
+ let Uses = [ECX] in
+ def JECXZ_32 : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst),
+ "jecxz\t$dst", [], IIC_JCXZ>, Requires<[In32BitMode]>;
+
+ // J*CXZ instruction: 64-bit versions of this instruction for the asmparser.
+ // In 64-bit mode, the address size prefix is jecxz and the unprefixed version
+ // is jrcxz.
+ let Uses = [ECX] in
+ def JECXZ_64 : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst),
+ "jecxz\t$dst", [], IIC_JCXZ>, AdSize, Requires<[In64BitMode]>;
+ let Uses = [RCX] in
+ def JRCXZ : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst),
+ "jrcxz\t$dst", [], IIC_JCXZ>, Requires<[In64BitMode]>;
+}
+
+// Indirect branches
+let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
+ def JMP32r : I<0xFF, MRM4r, (outs), (ins GR32:$dst), "jmp{l}\t{*}$dst",
+ [(brind GR32:$dst)], IIC_JMP_REG>, Requires<[In32BitMode]>,
+ Sched<[WriteJump]>;
+ def JMP32m : I<0xFF, MRM4m, (outs), (ins i32mem:$dst), "jmp{l}\t{*}$dst",
+ [(brind (loadi32 addr:$dst))], IIC_JMP_MEM>,
+ Requires<[In32BitMode]>, Sched<[WriteJumpLd]>;
+
+ def JMP64r : I<0xFF, MRM4r, (outs), (ins GR64:$dst), "jmp{q}\t{*}$dst",
+ [(brind GR64:$dst)], IIC_JMP_REG>, Requires<[In64BitMode]>,
+ Sched<[WriteJump]>;
+ def JMP64m : I<0xFF, MRM4m, (outs), (ins i64mem:$dst), "jmp{q}\t{*}$dst",
+ [(brind (loadi64 addr:$dst))], IIC_JMP_MEM>,
+ Requires<[In64BitMode]>, Sched<[WriteJumpLd]>;
+
+ def FARJMP16i : Iseg16<0xEA, RawFrmImm16, (outs),
+ (ins i16imm:$off, i16imm:$seg),
+ "ljmp{w}\t{$seg, $off|$off, $seg}", [],
+ IIC_JMP_FAR_PTR>, OpSize, Sched<[WriteJump]>;
+ def FARJMP32i : Iseg32<0xEA, RawFrmImm16, (outs),
+ (ins i32imm:$off, i16imm:$seg),
+ "ljmp{l}\t{$seg, $off|$off, $seg}", [],
+ IIC_JMP_FAR_PTR>, Sched<[WriteJump]>;
+ def FARJMP64 : RI<0xFF, MRM5m, (outs), (ins opaque80mem:$dst),
+ "ljmp{q}\t{*}$dst", [], IIC_JMP_FAR_MEM>,
+ Sched<[WriteJump]>;
+
+ def FARJMP16m : I<0xFF, MRM5m, (outs), (ins opaque32mem:$dst),
+ "ljmp{w}\t{*}$dst", [], IIC_JMP_FAR_MEM>, OpSize,
+ Sched<[WriteJumpLd]>;
+ def FARJMP32m : I<0xFF, MRM5m, (outs), (ins opaque48mem:$dst),
+ "ljmp{l}\t{*}$dst", [], IIC_JMP_FAR_MEM>,
+ Sched<[WriteJumpLd]>;
+}
+
+
+// Loop instructions
+let SchedRW = [WriteJump] in {
+def LOOP : Ii8PCRel<0xE2, RawFrm, (outs), (ins brtarget8:$dst), "loop\t$dst", [], IIC_LOOP>;
+def LOOPE : Ii8PCRel<0xE1, RawFrm, (outs), (ins brtarget8:$dst), "loope\t$dst", [], IIC_LOOPE>;
+def LOOPNE : Ii8PCRel<0xE0, RawFrm, (outs), (ins brtarget8:$dst), "loopne\t$dst", [], IIC_LOOPNE>;
+}
+
+//===----------------------------------------------------------------------===//
+// Call Instructions...
+//
+let isCall = 1 in
+ // All calls clobber the non-callee saved registers. ESP is marked as
+ // a use to prevent stack-pointer assignments that appear immediately
+ // before calls from potentially appearing dead. Uses for argument
+ // registers are added manually.
+ let Uses = [ESP] in {
+ def CALLpcrel32 : Ii32PCRel<0xE8, RawFrm,
+ (outs), (ins i32imm_pcrel:$dst),
+ "call{l}\t$dst", [], IIC_CALL_RI>,
+ Requires<[In32BitMode]>, Sched<[WriteJump]>;
+ def CALL32r : I<0xFF, MRM2r, (outs), (ins GR32:$dst),
+ "call{l}\t{*}$dst", [(X86call GR32:$dst)], IIC_CALL_RI>,
+ Requires<[In32BitMode]>, Sched<[WriteJump]>;
+ def CALL32m : I<0xFF, MRM2m, (outs), (ins i32mem:$dst),
+ "call{l}\t{*}$dst", [(X86call (loadi32 addr:$dst))],
+ IIC_CALL_MEM>,
+ Requires<[In32BitMode,FavorMemIndirectCall]>,
+ Sched<[WriteJumpLd]>;
+
+ def FARCALL16i : Iseg16<0x9A, RawFrmImm16, (outs),
+ (ins i16imm:$off, i16imm:$seg),
+ "lcall{w}\t{$seg, $off|$off, $seg}", [],
+ IIC_CALL_FAR_PTR>, OpSize, Sched<[WriteJump]>;
+ def FARCALL32i : Iseg32<0x9A, RawFrmImm16, (outs),
+ (ins i32imm:$off, i16imm:$seg),
+ "lcall{l}\t{$seg, $off|$off, $seg}", [],
+ IIC_CALL_FAR_PTR>, Sched<[WriteJump]>;
+
+ def FARCALL16m : I<0xFF, MRM3m, (outs), (ins opaque32mem:$dst),
+ "lcall{w}\t{*}$dst", [], IIC_CALL_FAR_MEM>, OpSize,
+ Sched<[WriteJumpLd]>;
+ def FARCALL32m : I<0xFF, MRM3m, (outs), (ins opaque48mem:$dst),
+ "lcall{l}\t{*}$dst", [], IIC_CALL_FAR_MEM>,
+ Sched<[WriteJumpLd]>;
+
+ // callw for 16 bit code for the assembler.
+ let isAsmParserOnly = 1 in
+ def CALLpcrel16 : Ii16PCRel<0xE8, RawFrm,
+ (outs), (ins i16imm_pcrel:$dst),
+ "callw\t$dst", []>, OpSize;
+ }
+
+
+// Tail call stuff.
+
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
+ isCodeGenOnly = 1, SchedRW = [WriteJumpLd] in
+ let Uses = [ESP] in {
+ def TCRETURNdi : PseudoI<(outs),
+ (ins i32imm_pcrel:$dst, i32imm:$offset), []>;
+ def TCRETURNri : PseudoI<(outs),
+ (ins ptr_rc_tailcall:$dst, i32imm:$offset), []>;
+ let mayLoad = 1 in
+ def TCRETURNmi : PseudoI<(outs),
+ (ins i32mem_TC:$dst, i32imm:$offset), []>;
+
+ // FIXME: The should be pseudo instructions that are lowered when going to
+ // mcinst.
+ def TAILJMPd : Ii32PCRel<0xE9, RawFrm, (outs),
+ (ins i32imm_pcrel:$dst),
+ "jmp\t$dst # TAILCALL",
+ [], IIC_JMP_REL>;
+ def TAILJMPr : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst),
+ "", [], IIC_JMP_REG>; // FIXME: Remove encoding when JIT is dead.
+ let mayLoad = 1 in
+ def TAILJMPm : I<0xFF, MRM4m, (outs), (ins i32mem_TC:$dst),
+ "jmp{l}\t{*}$dst # TAILCALL", [], IIC_JMP_MEM>;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Call Instructions...
+//
+
+// RSP is marked as a use to prevent stack-pointer assignments that appear
+// immediately before calls from potentially appearing dead. Uses for argument
+// registers are added manually.
+let isCall = 1, Uses = [RSP], SchedRW = [WriteJump] in {
+ // NOTE: this pattern doesn't match "X86call imm", because we do not know
+ // that the offset between an arbitrary immediate and the call will fit in
+ // the 32-bit pcrel field that we have.
+ def CALL64pcrel32 : Ii32PCRel<0xE8, RawFrm,
+ (outs), (ins i64i32imm_pcrel:$dst),
+ "call{q}\t$dst", [], IIC_CALL_RI>,
+ Requires<[In64BitMode]>;
+ def CALL64r : I<0xFF, MRM2r, (outs), (ins GR64:$dst),
+ "call{q}\t{*}$dst", [(X86call GR64:$dst)],
+ IIC_CALL_RI>,
+ Requires<[In64BitMode]>;
+ def CALL64m : I<0xFF, MRM2m, (outs), (ins i64mem:$dst),
+ "call{q}\t{*}$dst", [(X86call (loadi64 addr:$dst))],
+ IIC_CALL_MEM>,
+ Requires<[In64BitMode,FavorMemIndirectCall]>;
+
+ def FARCALL64 : RI<0xFF, MRM3m, (outs), (ins opaque80mem:$dst),
+ "lcall{q}\t{*}$dst", [], IIC_CALL_FAR_MEM>;
+}
+
+let isCall = 1, isCodeGenOnly = 1 in
+ // __chkstk(MSVC): clobber R10, R11 and EFLAGS.
+ // ___chkstk(Mingw64): clobber R10, R11, RAX and EFLAGS, and update RSP.
+ let Defs = [RAX, R10, R11, RSP, EFLAGS],
+ Uses = [RSP] in {
+ def W64ALLOCA : Ii32PCRel<0xE8, RawFrm,
+ (outs), (ins i64i32imm_pcrel:$dst),
+ "call{q}\t$dst", [], IIC_CALL_RI>,
+ Requires<[IsWin64]>, Sched<[WriteJump]>;
+ }
+
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
+ isCodeGenOnly = 1, Uses = [RSP], usesCustomInserter = 1,
+ SchedRW = [WriteJump] in {
+ def TCRETURNdi64 : PseudoI<(outs),
+ (ins i64i32imm_pcrel:$dst, i32imm:$offset),
+ []>;
+ def TCRETURNri64 : PseudoI<(outs),
+ (ins ptr_rc_tailcall:$dst, i32imm:$offset), []>;
+ let mayLoad = 1 in
+ def TCRETURNmi64 : PseudoI<(outs),
+ (ins i64mem_TC:$dst, i32imm:$offset), []>;
+
+ def TAILJMPd64 : Ii32PCRel<0xE9, RawFrm, (outs),
+ (ins i64i32imm_pcrel:$dst),
+ "jmp\t$dst # TAILCALL", [], IIC_JMP_REL>;
+ def TAILJMPr64 : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst),
+ "jmp{q}\t{*}$dst # TAILCALL", [], IIC_JMP_MEM>;
+
+ let mayLoad = 1 in
+ def TAILJMPm64 : I<0xFF, MRM4m, (outs), (ins i64mem_TC:$dst),
+ "jmp{q}\t{*}$dst # TAILCALL", [], IIC_JMP_MEM>;
+}
diff --git a/contrib/llvm/lib/Target/X86/X86InstrExtension.td b/contrib/llvm/lib/Target/X86/X86InstrExtension.td
new file mode 100644
index 0000000..6dc7175
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrExtension.td
@@ -0,0 +1,186 @@
+//===-- X86InstrExtension.td - Sign and Zero Extensions ----*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the sign and zero extension operations.
+//
+//===----------------------------------------------------------------------===//
+
+let neverHasSideEffects = 1 in {
+ let Defs = [AX], Uses = [AL] in
+ def CBW : I<0x98, RawFrm, (outs), (ins),
+ "{cbtw|cbw}", []>, OpSize; // AX = signext(AL)
+ let Defs = [EAX], Uses = [AX] in
+ def CWDE : I<0x98, RawFrm, (outs), (ins),
+ "{cwtl|cwde}", []>; // EAX = signext(AX)
+
+ let Defs = [AX,DX], Uses = [AX] in
+ def CWD : I<0x99, RawFrm, (outs), (ins),
+ "{cwtd|cwd}", []>, OpSize; // DX:AX = signext(AX)
+ let Defs = [EAX,EDX], Uses = [EAX] in
+ def CDQ : I<0x99, RawFrm, (outs), (ins),
+ "{cltd|cdq}", []>; // EDX:EAX = signext(EAX)
+
+
+ let Defs = [RAX], Uses = [EAX] in
+ def CDQE : RI<0x98, RawFrm, (outs), (ins),
+ "{cltq|cdqe}", []>; // RAX = signext(EAX)
+
+ let Defs = [RAX,RDX], Uses = [RAX] in
+ def CQO : RI<0x99, RawFrm, (outs), (ins),
+ "{cqto|cqo}", []>; // RDX:RAX = signext(RAX)
+}
+
+
+
+// Sign/Zero extenders
+let neverHasSideEffects = 1 in {
+def MOVSX16rr8 : I<0xBE, MRMSrcReg, (outs GR16:$dst), (ins GR8:$src),
+ "movs{bw|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVSX_R16_R8>,
+ TB, OpSize, Sched<[WriteALU]>;
+let mayLoad = 1 in
+def MOVSX16rm8 : I<0xBE, MRMSrcMem, (outs GR16:$dst), (ins i8mem:$src),
+ "movs{bw|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVSX_R16_M8>,
+ TB, OpSize, Sched<[WriteALULd]>;
+} // neverHasSideEffects = 1
+def MOVSX32rr8 : I<0xBE, MRMSrcReg, (outs GR32:$dst), (ins GR8:$src),
+ "movs{bl|x}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (sext GR8:$src))], IIC_MOVSX>, TB,
+ Sched<[WriteALU]>;
+def MOVSX32rm8 : I<0xBE, MRMSrcMem, (outs GR32:$dst), (ins i8mem :$src),
+ "movs{bl|x}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (sextloadi32i8 addr:$src))], IIC_MOVSX>, TB,
+ Sched<[WriteALULd]>;
+def MOVSX32rr16: I<0xBF, MRMSrcReg, (outs GR32:$dst), (ins GR16:$src),
+ "movs{wl|x}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (sext GR16:$src))], IIC_MOVSX>, TB,
+ Sched<[WriteALU]>;
+def MOVSX32rm16: I<0xBF, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src),
+ "movs{wl|x}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (sextloadi32i16 addr:$src))], IIC_MOVSX>,
+ TB, Sched<[WriteALULd]>;
+
+let neverHasSideEffects = 1 in {
+def MOVZX16rr8 : I<0xB6, MRMSrcReg, (outs GR16:$dst), (ins GR8:$src),
+ "movz{bw|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX_R16_R8>,
+ TB, OpSize, Sched<[WriteALU]>;
+let mayLoad = 1 in
+def MOVZX16rm8 : I<0xB6, MRMSrcMem, (outs GR16:$dst), (ins i8mem:$src),
+ "movz{bw|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX_R16_M8>,
+ TB, OpSize, Sched<[WriteALULd]>;
+} // neverHasSideEffects = 1
+def MOVZX32rr8 : I<0xB6, MRMSrcReg, (outs GR32:$dst), (ins GR8 :$src),
+ "movz{bl|x}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (zext GR8:$src))], IIC_MOVZX>, TB,
+ Sched<[WriteALU]>;
+def MOVZX32rm8 : I<0xB6, MRMSrcMem, (outs GR32:$dst), (ins i8mem :$src),
+ "movz{bl|x}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (zextloadi32i8 addr:$src))], IIC_MOVZX>, TB,
+ Sched<[WriteALULd]>;
+def MOVZX32rr16: I<0xB7, MRMSrcReg, (outs GR32:$dst), (ins GR16:$src),
+ "movz{wl|x}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (zext GR16:$src))], IIC_MOVZX>, TB,
+ Sched<[WriteALU]>;
+def MOVZX32rm16: I<0xB7, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src),
+ "movz{wl|x}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (zextloadi32i16 addr:$src))], IIC_MOVZX>,
+ TB, Sched<[WriteALULd]>;
+
+// These are the same as the regular MOVZX32rr8 and MOVZX32rm8
+// except that they use GR32_NOREX for the output operand register class
+// instead of GR32. This allows them to operate on h registers on x86-64.
+let neverHasSideEffects = 1, isCodeGenOnly = 1 in {
+def MOVZX32_NOREXrr8 : I<0xB6, MRMSrcReg,
+ (outs GR32_NOREX:$dst), (ins GR8_NOREX:$src),
+ "movz{bl|x}\t{$src, $dst|$dst, $src}",
+ [], IIC_MOVZX>, TB, Sched<[WriteALU]>;
+let mayLoad = 1 in
+def MOVZX32_NOREXrm8 : I<0xB6, MRMSrcMem,
+ (outs GR32_NOREX:$dst), (ins i8mem_NOREX:$src),
+ "movz{bl|x}\t{$src, $dst|$dst, $src}",
+ [], IIC_MOVZX>, TB, Sched<[WriteALULd]>;
+}
+
+// MOVSX64rr8 always has a REX prefix and it has an 8-bit register
+// operand, which makes it a rare instruction with an 8-bit register
+// operand that can never access an h register. If support for h registers
+// were generalized, this would require a special register class.
+def MOVSX64rr8 : RI<0xBE, MRMSrcReg, (outs GR64:$dst), (ins GR8 :$src),
+ "movs{bq|x}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (sext GR8:$src))], IIC_MOVSX>, TB,
+ Sched<[WriteALU]>;
+def MOVSX64rm8 : RI<0xBE, MRMSrcMem, (outs GR64:$dst), (ins i8mem :$src),
+ "movs{bq|x}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (sextloadi64i8 addr:$src))], IIC_MOVSX>,
+ TB, Sched<[WriteALULd]>;
+def MOVSX64rr16: RI<0xBF, MRMSrcReg, (outs GR64:$dst), (ins GR16:$src),
+ "movs{wq|x}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (sext GR16:$src))], IIC_MOVSX>, TB,
+ Sched<[WriteALU]>;
+def MOVSX64rm16: RI<0xBF, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src),
+ "movs{wq|x}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (sextloadi64i16 addr:$src))], IIC_MOVSX>,
+ TB, Sched<[WriteALULd]>;
+def MOVSX64rr32: RI<0x63, MRMSrcReg, (outs GR64:$dst), (ins GR32:$src),
+ "movs{lq|xd}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (sext GR32:$src))], IIC_MOVSX>,
+ Sched<[WriteALU]>;
+def MOVSX64rm32: RI<0x63, MRMSrcMem, (outs GR64:$dst), (ins i32mem:$src),
+ "movs{lq|xd}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (sextloadi64i32 addr:$src))], IIC_MOVSX>,
+ Sched<[WriteALULd]>;
+
+// movzbq and movzwq encodings for the disassembler
+def MOVZX64rr8_Q : RI<0xB6, MRMSrcReg, (outs GR64:$dst), (ins GR8:$src),
+ "movz{bq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>,
+ TB, Sched<[WriteALU]>;
+def MOVZX64rm8_Q : RI<0xB6, MRMSrcMem, (outs GR64:$dst), (ins i8mem:$src),
+ "movz{bq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>,
+ TB, Sched<[WriteALULd]>;
+def MOVZX64rr16_Q : RI<0xB7, MRMSrcReg, (outs GR64:$dst), (ins GR16:$src),
+ "movz{wq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>,
+ TB, Sched<[WriteALU]>;
+def MOVZX64rm16_Q : RI<0xB7, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src),
+ "movz{wq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>,
+ TB, Sched<[WriteALULd]>;
+
+// FIXME: These should be Pat patterns.
+let isCodeGenOnly = 1 in {
+
+// Use movzbl instead of movzbq when the destination is a register; it's
+// equivalent due to implicit zero-extending, and it has a smaller encoding.
+def MOVZX64rr8 : I<0xB6, MRMSrcReg, (outs GR64:$dst), (ins GR8 :$src),
+ "", [(set GR64:$dst, (zext GR8:$src))], IIC_MOVZX>, TB,
+ Sched<[WriteALU]>;
+def MOVZX64rm8 : I<0xB6, MRMSrcMem, (outs GR64:$dst), (ins i8mem :$src),
+ "", [(set GR64:$dst, (zextloadi64i8 addr:$src))], IIC_MOVZX>,
+ TB, Sched<[WriteALULd]>;
+// Use movzwl instead of movzwq when the destination is a register; it's
+// equivalent due to implicit zero-extending, and it has a smaller encoding.
+def MOVZX64rr16: I<0xB7, MRMSrcReg, (outs GR64:$dst), (ins GR16:$src),
+ "", [(set GR64:$dst, (zext GR16:$src))], IIC_MOVZX>, TB,
+ Sched<[WriteALU]>;
+def MOVZX64rm16: I<0xB7, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src),
+ "", [(set GR64:$dst, (zextloadi64i16 addr:$src))],
+ IIC_MOVZX>, TB, Sched<[WriteALULd]>;
+
+// There's no movzlq instruction, but movl can be used for this purpose, using
+// implicit zero-extension. The preferred way to do 32-bit-to-64-bit zero
+// extension on x86-64 is to use a SUBREG_TO_REG to utilize implicit
+// zero-extension, however this isn't possible when the 32-bit value is
+// defined by a truncate or is copied from something where the high bits aren't
+// necessarily all zero. In such cases, we fall back to these explicit zext
+// instructions.
+def MOVZX64rr32 : I<0x89, MRMDestReg, (outs GR64:$dst), (ins GR32:$src),
+ "", [(set GR64:$dst, (zext GR32:$src))], IIC_MOVZX>,
+ Sched<[WriteALU]>;
+def MOVZX64rm32 : I<0x8B, MRMSrcMem, (outs GR64:$dst), (ins i32mem:$src),
+ "", [(set GR64:$dst, (zextloadi64i32 addr:$src))],
+ IIC_MOVZX>, Sched<[WriteALULd]>;
+}
+
diff --git a/contrib/llvm/lib/Target/X86/X86InstrFMA.td b/contrib/llvm/lib/Target/X86/X86InstrFMA.td
new file mode 100644
index 0000000..7759a8a2
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrFMA.td
@@ -0,0 +1,368 @@
+//===-- X86InstrFMA.td - FMA Instruction Set ---------------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes FMA (Fused Multiply-Add) instructions.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// FMA3 - Intel 3 operand Fused Multiply-Add instructions
+//===----------------------------------------------------------------------===//
+
+let Constraints = "$src1 = $dst" in {
+multiclass fma3p_rm<bits<8> opc, string OpcodeStr,
+ PatFrag MemFrag128, PatFrag MemFrag256,
+ ValueType OpVT128, ValueType OpVT256,
+ SDPatternOperator Op = null_frag> {
+ let isCommutable = 1 in
+ def r : FMA3<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, VR128:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR128:$dst, (OpVT128 (Op VR128:$src2,
+ VR128:$src1, VR128:$src3)))]>;
+
+ let mayLoad = 1 in
+ def m : FMA3<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, f128mem:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR128:$dst, (OpVT128 (Op VR128:$src2, VR128:$src1,
+ (MemFrag128 addr:$src3))))]>;
+
+ let isCommutable = 1 in
+ def rY : FMA3<opc, MRMSrcReg, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2, VR256:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR256:$dst, (OpVT256 (Op VR256:$src2, VR256:$src1,
+ VR256:$src3)))]>, VEX_L;
+
+ let mayLoad = 1 in
+ def mY : FMA3<opc, MRMSrcMem, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2, f256mem:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR256:$dst,
+ (OpVT256 (Op VR256:$src2, VR256:$src1,
+ (MemFrag256 addr:$src3))))]>, VEX_L;
+}
+} // Constraints = "$src1 = $dst"
+
+multiclass fma3p_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
+ string OpcodeStr, string PackTy,
+ PatFrag MemFrag128, PatFrag MemFrag256,
+ SDNode Op, ValueType OpTy128, ValueType OpTy256> {
+ defm r213 : fma3p_rm<opc213,
+ !strconcat(OpcodeStr, "213", PackTy),
+ MemFrag128, MemFrag256, OpTy128, OpTy256, Op>;
+let neverHasSideEffects = 1 in {
+ defm r132 : fma3p_rm<opc132,
+ !strconcat(OpcodeStr, "132", PackTy),
+ MemFrag128, MemFrag256, OpTy128, OpTy256>;
+ defm r231 : fma3p_rm<opc231,
+ !strconcat(OpcodeStr, "231", PackTy),
+ MemFrag128, MemFrag256, OpTy128, OpTy256>;
+} // neverHasSideEffects = 1
+}
+
+// Fused Multiply-Add
+let ExeDomain = SSEPackedSingle in {
+ defm VFMADDPS : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "ps", memopv4f32,
+ memopv8f32, X86Fmadd, v4f32, v8f32>;
+ defm VFMSUBPS : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "ps", memopv4f32,
+ memopv8f32, X86Fmsub, v4f32, v8f32>;
+ defm VFMADDSUBPS : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "ps",
+ memopv4f32, memopv8f32, X86Fmaddsub,
+ v4f32, v8f32>;
+ defm VFMSUBADDPS : fma3p_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "ps",
+ memopv4f32, memopv8f32, X86Fmsubadd,
+ v4f32, v8f32>;
+}
+
+let ExeDomain = SSEPackedDouble in {
+ defm VFMADDPD : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "pd", memopv2f64,
+ memopv4f64, X86Fmadd, v2f64, v4f64>, VEX_W;
+ defm VFMSUBPD : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "pd", memopv2f64,
+ memopv4f64, X86Fmsub, v2f64, v4f64>, VEX_W;
+ defm VFMADDSUBPD : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "pd",
+ memopv2f64, memopv4f64, X86Fmaddsub,
+ v2f64, v4f64>, VEX_W;
+ defm VFMSUBADDPD : fma3p_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "pd",
+ memopv2f64, memopv4f64, X86Fmsubadd,
+ v2f64, v4f64>, VEX_W;
+}
+
+// Fused Negative Multiply-Add
+let ExeDomain = SSEPackedSingle in {
+ defm VFNMADDPS : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "ps", memopv4f32,
+ memopv8f32, X86Fnmadd, v4f32, v8f32>;
+ defm VFNMSUBPS : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "ps", memopv4f32,
+ memopv8f32, X86Fnmsub, v4f32, v8f32>;
+}
+let ExeDomain = SSEPackedDouble in {
+ defm VFNMADDPD : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "pd", memopv2f64,
+ memopv4f64, X86Fnmadd, v2f64, v4f64>, VEX_W;
+ defm VFNMSUBPD : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "pd",
+ memopv2f64, memopv4f64, X86Fnmsub, v2f64,
+ v4f64>, VEX_W;
+}
+
+let Constraints = "$src1 = $dst" in {
+multiclass fma3s_rm<bits<8> opc, string OpcodeStr, X86MemOperand x86memop,
+ RegisterClass RC, ValueType OpVT, PatFrag mem_frag,
+ SDPatternOperator OpNode = null_frag> {
+ let isCommutable = 1 in
+ def r : FMA3<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, RC:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set RC:$dst,
+ (OpVT (OpNode RC:$src2, RC:$src1, RC:$src3)))]>;
+ let mayLoad = 1 in
+ def m : FMA3<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, x86memop:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set RC:$dst,
+ (OpVT (OpNode RC:$src2, RC:$src1,
+ (mem_frag addr:$src3))))]>;
+}
+
+multiclass fma3s_rm_int<bits<8> opc, string OpcodeStr, Operand memop,
+ ComplexPattern mem_cpat, Intrinsic IntId,
+ RegisterClass RC> {
+ let isCommutable = 1 in
+ def r_Int : FMA3<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, VR128:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR128:$dst, (IntId VR128:$src2, VR128:$src1,
+ VR128:$src3))]>;
+ def m_Int : FMA3<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, memop:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR128:$dst,
+ (IntId VR128:$src2, VR128:$src1, mem_cpat:$src3))]>;
+}
+} // Constraints = "$src1 = $dst"
+
+multiclass fma3s_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
+ string OpStr, string PackTy, Intrinsic Int,
+ SDNode OpNode, RegisterClass RC, ValueType OpVT,
+ X86MemOperand x86memop, Operand memop, PatFrag mem_frag,
+ ComplexPattern mem_cpat> {
+let neverHasSideEffects = 1 in {
+ defm r132 : fma3s_rm<opc132, !strconcat(OpStr, "132", PackTy),
+ x86memop, RC, OpVT, mem_frag>;
+ defm r231 : fma3s_rm<opc231, !strconcat(OpStr, "231", PackTy),
+ x86memop, RC, OpVT, mem_frag>;
+}
+
+defm r213 : fma3s_rm<opc213, !strconcat(OpStr, "213", PackTy),
+ x86memop, RC, OpVT, mem_frag, OpNode>,
+ fma3s_rm_int<opc213, !strconcat(OpStr, "213", PackTy),
+ memop, mem_cpat, Int, RC>;
+}
+
+multiclass fma3s<bits<8> opc132, bits<8> opc213, bits<8> opc231,
+ string OpStr, Intrinsic IntF32, Intrinsic IntF64,
+ SDNode OpNode> {
+ defm SS : fma3s_forms<opc132, opc213, opc231, OpStr, "ss", IntF32, OpNode,
+ FR32, f32, f32mem, ssmem, loadf32, sse_load_f32>;
+ defm SD : fma3s_forms<opc132, opc213, opc231, OpStr, "sd", IntF64, OpNode,
+ FR64, f64, f64mem, sdmem, loadf64, sse_load_f64>, VEX_W;
+}
+
+defm VFMADD : fma3s<0x99, 0xA9, 0xB9, "vfmadd", int_x86_fma_vfmadd_ss,
+ int_x86_fma_vfmadd_sd, X86Fmadd>, VEX_LIG;
+defm VFMSUB : fma3s<0x9B, 0xAB, 0xBB, "vfmsub", int_x86_fma_vfmsub_ss,
+ int_x86_fma_vfmsub_sd, X86Fmsub>, VEX_LIG;
+
+defm VFNMADD : fma3s<0x9D, 0xAD, 0xBD, "vfnmadd", int_x86_fma_vfnmadd_ss,
+ int_x86_fma_vfnmadd_sd, X86Fnmadd>, VEX_LIG;
+defm VFNMSUB : fma3s<0x9F, 0xAF, 0xBF, "vfnmsub", int_x86_fma_vfnmsub_ss,
+ int_x86_fma_vfnmsub_sd, X86Fnmsub>, VEX_LIG;
+
+
+//===----------------------------------------------------------------------===//
+// FMA4 - AMD 4 operand Fused Multiply-Add instructions
+//===----------------------------------------------------------------------===//
+
+
+multiclass fma4s<bits<8> opc, string OpcodeStr, RegisterClass RC,
+ X86MemOperand x86memop, ValueType OpVT, SDNode OpNode,
+ PatFrag mem_frag> {
+ let isCommutable = 1 in
+ def rr : FMA4<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, RC:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set RC:$dst,
+ (OpVT (OpNode RC:$src1, RC:$src2, RC:$src3)))]>, VEX_W, MemOp4;
+ def rm : FMA4<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, x86memop:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set RC:$dst, (OpNode RC:$src1, RC:$src2,
+ (mem_frag addr:$src3)))]>, VEX_W, MemOp4;
+ def mr : FMA4<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop:$src2, RC:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set RC:$dst,
+ (OpNode RC:$src1, (mem_frag addr:$src2), RC:$src3))]>;
+// For disassembler
+let isCodeGenOnly = 1, hasSideEffects = 0 in
+ def rr_REV : FMA4<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, RC:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>;
+}
+
+multiclass fma4s_int<bits<8> opc, string OpcodeStr, Operand memop,
+ ComplexPattern mem_cpat, Intrinsic Int> {
+ let isCommutable = 1 in
+ def rr_Int : FMA4<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, VR128:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR128:$dst,
+ (Int VR128:$src1, VR128:$src2, VR128:$src3))]>, VEX_W, MemOp4;
+ def rm_Int : FMA4<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, memop:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR128:$dst, (Int VR128:$src1, VR128:$src2,
+ mem_cpat:$src3))]>, VEX_W, MemOp4;
+ def mr_Int : FMA4<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, memop:$src2, VR128:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR128:$dst,
+ (Int VR128:$src1, mem_cpat:$src2, VR128:$src3))]>;
+}
+
+multiclass fma4p<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ ValueType OpVT128, ValueType OpVT256,
+ PatFrag ld_frag128, PatFrag ld_frag256> {
+ let isCommutable = 1 in
+ def rr : FMA4<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, VR128:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR128:$dst,
+ (OpVT128 (OpNode VR128:$src1, VR128:$src2, VR128:$src3)))]>,
+ VEX_W, MemOp4;
+ def rm : FMA4<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, f128mem:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR128:$dst, (OpNode VR128:$src1, VR128:$src2,
+ (ld_frag128 addr:$src3)))]>, VEX_W, MemOp4;
+ def mr : FMA4<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, f128mem:$src2, VR128:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR128:$dst,
+ (OpNode VR128:$src1, (ld_frag128 addr:$src2), VR128:$src3))]>;
+ let isCommutable = 1 in
+ def rrY : FMA4<opc, MRMSrcReg, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2, VR256:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR256:$dst,
+ (OpVT256 (OpNode VR256:$src1, VR256:$src2, VR256:$src3)))]>,
+ VEX_W, MemOp4, VEX_L;
+ def rmY : FMA4<opc, MRMSrcMem, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2, f256mem:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR256:$dst, (OpNode VR256:$src1, VR256:$src2,
+ (ld_frag256 addr:$src3)))]>, VEX_W, MemOp4, VEX_L;
+ def mrY : FMA4<opc, MRMSrcMem, (outs VR256:$dst),
+ (ins VR256:$src1, f256mem:$src2, VR256:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR256:$dst, (OpNode VR256:$src1,
+ (ld_frag256 addr:$src2), VR256:$src3))]>, VEX_L;
+// For disassembler
+let isCodeGenOnly = 1, hasSideEffects = 0 in {
+ def rr_REV : FMA4<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, VR128:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>;
+ def rrY_REV : FMA4<opc, MRMSrcReg, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2, VR256:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>,
+ VEX_L;
+} // isCodeGenOnly = 1
+}
+
+defm VFMADDSS4 : fma4s<0x6A, "vfmaddss", FR32, f32mem, f32, X86Fmadd, loadf32>,
+ fma4s_int<0x6A, "vfmaddss", ssmem, sse_load_f32,
+ int_x86_fma_vfmadd_ss>;
+defm VFMADDSD4 : fma4s<0x6B, "vfmaddsd", FR64, f64mem, f64, X86Fmadd, loadf64>,
+ fma4s_int<0x6B, "vfmaddsd", sdmem, sse_load_f64,
+ int_x86_fma_vfmadd_sd>;
+defm VFMSUBSS4 : fma4s<0x6E, "vfmsubss", FR32, f32mem, f32, X86Fmsub, loadf32>,
+ fma4s_int<0x6E, "vfmsubss", ssmem, sse_load_f32,
+ int_x86_fma_vfmsub_ss>;
+defm VFMSUBSD4 : fma4s<0x6F, "vfmsubsd", FR64, f64mem, f64, X86Fmsub, loadf64>,
+ fma4s_int<0x6F, "vfmsubsd", sdmem, sse_load_f64,
+ int_x86_fma_vfmsub_sd>;
+defm VFNMADDSS4 : fma4s<0x7A, "vfnmaddss", FR32, f32mem, f32,
+ X86Fnmadd, loadf32>,
+ fma4s_int<0x7A, "vfnmaddss", ssmem, sse_load_f32,
+ int_x86_fma_vfnmadd_ss>;
+defm VFNMADDSD4 : fma4s<0x7B, "vfnmaddsd", FR64, f64mem, f64,
+ X86Fnmadd, loadf64>,
+ fma4s_int<0x7B, "vfnmaddsd", sdmem, sse_load_f64,
+ int_x86_fma_vfnmadd_sd>;
+defm VFNMSUBSS4 : fma4s<0x7E, "vfnmsubss", FR32, f32mem, f32,
+ X86Fnmsub, loadf32>,
+ fma4s_int<0x7E, "vfnmsubss", ssmem, sse_load_f32,
+ int_x86_fma_vfnmsub_ss>;
+defm VFNMSUBSD4 : fma4s<0x7F, "vfnmsubsd", FR64, f64mem, f64,
+ X86Fnmsub, loadf64>,
+ fma4s_int<0x7F, "vfnmsubsd", sdmem, sse_load_f64,
+ int_x86_fma_vfnmsub_sd>;
+
+let ExeDomain = SSEPackedSingle in {
+ defm VFMADDPS4 : fma4p<0x68, "vfmaddps", X86Fmadd, v4f32, v8f32,
+ memopv4f32, memopv8f32>;
+ defm VFMSUBPS4 : fma4p<0x6C, "vfmsubps", X86Fmsub, v4f32, v8f32,
+ memopv4f32, memopv8f32>;
+ defm VFNMADDPS4 : fma4p<0x78, "vfnmaddps", X86Fnmadd, v4f32, v8f32,
+ memopv4f32, memopv8f32>;
+ defm VFNMSUBPS4 : fma4p<0x7C, "vfnmsubps", X86Fnmsub, v4f32, v8f32,
+ memopv4f32, memopv8f32>;
+ defm VFMADDSUBPS4 : fma4p<0x5C, "vfmaddsubps", X86Fmaddsub, v4f32, v8f32,
+ memopv4f32, memopv8f32>;
+ defm VFMSUBADDPS4 : fma4p<0x5E, "vfmsubaddps", X86Fmsubadd, v4f32, v8f32,
+ memopv4f32, memopv8f32>;
+}
+
+let ExeDomain = SSEPackedDouble in {
+ defm VFMADDPD4 : fma4p<0x69, "vfmaddpd", X86Fmadd, v2f64, v4f64,
+ memopv2f64, memopv4f64>;
+ defm VFMSUBPD4 : fma4p<0x6D, "vfmsubpd", X86Fmsub, v2f64, v4f64,
+ memopv2f64, memopv4f64>;
+ defm VFNMADDPD4 : fma4p<0x79, "vfnmaddpd", X86Fnmadd, v2f64, v4f64,
+ memopv2f64, memopv4f64>;
+ defm VFNMSUBPD4 : fma4p<0x7D, "vfnmsubpd", X86Fnmsub, v2f64, v4f64,
+ memopv2f64, memopv4f64>;
+ defm VFMADDSUBPD4 : fma4p<0x5D, "vfmaddsubpd", X86Fmaddsub, v2f64, v4f64,
+ memopv2f64, memopv4f64>;
+ defm VFMSUBADDPD4 : fma4p<0x5F, "vfmsubaddpd", X86Fmsubadd, v2f64, v4f64,
+ memopv2f64, memopv4f64>;
+}
+
diff --git a/contrib/llvm/lib/Target/X86/X86InstrFPStack.td b/contrib/llvm/lib/Target/X86/X86InstrFPStack.td
new file mode 100644
index 0000000..2224a08
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrFPStack.td
@@ -0,0 +1,692 @@
+//===- X86InstrFPStack.td - FPU Instruction Set ------------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 x87 FPU instruction set, defining the
+// instructions, and properties of the instructions which are needed for code
+// generation, machine code emission, and analysis.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// FPStack specific DAG Nodes.
+//===----------------------------------------------------------------------===//
+
+def SDTX86FpGet2 : SDTypeProfile<2, 0, [SDTCisVT<0, f80>,
+ SDTCisVT<1, f80>]>;
+def SDTX86Fld : SDTypeProfile<1, 2, [SDTCisFP<0>,
+ SDTCisPtrTy<1>,
+ SDTCisVT<2, OtherVT>]>;
+def SDTX86Fst : SDTypeProfile<0, 3, [SDTCisFP<0>,
+ SDTCisPtrTy<1>,
+ SDTCisVT<2, OtherVT>]>;
+def SDTX86Fild : SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisPtrTy<1>,
+ SDTCisVT<2, OtherVT>]>;
+def SDTX86Fnstsw : SDTypeProfile<1, 1, [SDTCisVT<0, i16>, SDTCisVT<1, i16>]>;
+def SDTX86FpToIMem : SDTypeProfile<0, 2, [SDTCisFP<0>, SDTCisPtrTy<1>]>;
+
+def SDTX86CwdStore : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
+
+def X86fld : SDNode<"X86ISD::FLD", SDTX86Fld,
+ [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def X86fst : SDNode<"X86ISD::FST", SDTX86Fst,
+ [SDNPHasChain, SDNPInGlue, SDNPMayStore,
+ SDNPMemOperand]>;
+def X86fild : SDNode<"X86ISD::FILD", SDTX86Fild,
+ [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def X86fildflag : SDNode<"X86ISD::FILD_FLAG", SDTX86Fild,
+ [SDNPHasChain, SDNPOutGlue, SDNPMayLoad,
+ SDNPMemOperand]>;
+def X86fp_stsw : SDNode<"X86ISD::FNSTSW16r", SDTX86Fnstsw>;
+def X86fp_to_i16mem : SDNode<"X86ISD::FP_TO_INT16_IN_MEM", SDTX86FpToIMem,
+ [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def X86fp_to_i32mem : SDNode<"X86ISD::FP_TO_INT32_IN_MEM", SDTX86FpToIMem,
+ [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def X86fp_to_i64mem : SDNode<"X86ISD::FP_TO_INT64_IN_MEM", SDTX86FpToIMem,
+ [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def X86fp_cwd_get16 : SDNode<"X86ISD::FNSTCW16m", SDTX86CwdStore,
+ [SDNPHasChain, SDNPMayStore, SDNPSideEffect,
+ SDNPMemOperand]>;
+
+//===----------------------------------------------------------------------===//
+// FPStack pattern fragments
+//===----------------------------------------------------------------------===//
+
+def fpimm0 : PatLeaf<(fpimm), [{
+ return N->isExactlyValue(+0.0);
+}]>;
+
+def fpimmneg0 : PatLeaf<(fpimm), [{
+ return N->isExactlyValue(-0.0);
+}]>;
+
+def fpimm1 : PatLeaf<(fpimm), [{
+ return N->isExactlyValue(+1.0);
+}]>;
+
+def fpimmneg1 : PatLeaf<(fpimm), [{
+ return N->isExactlyValue(-1.0);
+}]>;
+
+// Some 'special' instructions
+let usesCustomInserter = 1 in { // Expanded after instruction selection.
+ def FP32_TO_INT16_IN_MEM : PseudoI<(outs), (ins i16mem:$dst, RFP32:$src),
+ [(X86fp_to_i16mem RFP32:$src, addr:$dst)]>;
+ def FP32_TO_INT32_IN_MEM : PseudoI<(outs), (ins i32mem:$dst, RFP32:$src),
+ [(X86fp_to_i32mem RFP32:$src, addr:$dst)]>;
+ def FP32_TO_INT64_IN_MEM : PseudoI<(outs), (ins i64mem:$dst, RFP32:$src),
+ [(X86fp_to_i64mem RFP32:$src, addr:$dst)]>;
+ def FP64_TO_INT16_IN_MEM : PseudoI<(outs), (ins i16mem:$dst, RFP64:$src),
+ [(X86fp_to_i16mem RFP64:$src, addr:$dst)]>;
+ def FP64_TO_INT32_IN_MEM : PseudoI<(outs), (ins i32mem:$dst, RFP64:$src),
+ [(X86fp_to_i32mem RFP64:$src, addr:$dst)]>;
+ def FP64_TO_INT64_IN_MEM : PseudoI<(outs), (ins i64mem:$dst, RFP64:$src),
+ [(X86fp_to_i64mem RFP64:$src, addr:$dst)]>;
+ def FP80_TO_INT16_IN_MEM : PseudoI<(outs), (ins i16mem:$dst, RFP80:$src),
+ [(X86fp_to_i16mem RFP80:$src, addr:$dst)]>;
+ def FP80_TO_INT32_IN_MEM : PseudoI<(outs), (ins i32mem:$dst, RFP80:$src),
+ [(X86fp_to_i32mem RFP80:$src, addr:$dst)]>;
+ def FP80_TO_INT64_IN_MEM : PseudoI<(outs), (ins i64mem:$dst, RFP80:$src),
+ [(X86fp_to_i64mem RFP80:$src, addr:$dst)]>;
+}
+
+// All FP Stack operations are represented with four instructions here. The
+// first three instructions, generated by the instruction selector, use "RFP32"
+// "RFP64" or "RFP80" registers: traditional register files to reference 32-bit,
+// 64-bit or 80-bit floating point values. These sizes apply to the values,
+// not the registers, which are always 80 bits; RFP32, RFP64 and RFP80 can be
+// copied to each other without losing information. These instructions are all
+// pseudo instructions and use the "_Fp" suffix.
+// In some cases there are additional variants with a mixture of different
+// register sizes.
+// The second instruction is defined with FPI, which is the actual instruction
+// emitted by the assembler. These use "RST" registers, although frequently
+// the actual register(s) used are implicit. These are always 80 bits.
+// The FP stackifier pass converts one to the other after register allocation
+// occurs.
+//
+// Note that the FpI instruction should have instruction selection info (e.g.
+// a pattern) and the FPI instruction should have emission info (e.g. opcode
+// encoding and asm printing info).
+
+// Pseudo Instruction for FP stack return values.
+def FpPOP_RETVAL : FpI_<(outs RFP80:$dst), (ins), SpecialFP, []>;
+
+// FpIf32, FpIf64 - Floating Point Pseudo Instruction template.
+// f32 instructions can use SSE1 and are predicated on FPStackf32 == !SSE1.
+// f64 instructions can use SSE2 and are predicated on FPStackf64 == !SSE2.
+// f80 instructions cannot use SSE and use neither of these.
+class FpIf32<dag outs, dag ins, FPFormat fp, list<dag> pattern> :
+ FpI_<outs, ins, fp, pattern>, Requires<[FPStackf32]>;
+class FpIf64<dag outs, dag ins, FPFormat fp, list<dag> pattern> :
+ FpI_<outs, ins, fp, pattern>, Requires<[FPStackf64]>;
+
+// Factoring for arithmetic.
+multiclass FPBinary_rr<SDNode OpNode> {
+// Register op register -> register
+// These are separated out because they have no reversed form.
+def _Fp32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, RFP32:$src2), TwoArgFP,
+ [(set RFP32:$dst, (OpNode RFP32:$src1, RFP32:$src2))]>;
+def _Fp64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, RFP64:$src2), TwoArgFP,
+ [(set RFP64:$dst, (OpNode RFP64:$src1, RFP64:$src2))]>;
+def _Fp80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, RFP80:$src2), TwoArgFP,
+ [(set RFP80:$dst, (OpNode RFP80:$src1, RFP80:$src2))]>;
+}
+// The FopST0 series are not included here because of the irregularities
+// in where the 'r' goes in assembly output.
+// These instructions cannot address 80-bit memory.
+multiclass FPBinary<SDNode OpNode, Format fp, string asmstring> {
+// ST(0) = ST(0) + [mem]
+def _Fp32m : FpIf32<(outs RFP32:$dst),
+ (ins RFP32:$src1, f32mem:$src2), OneArgFPRW,
+ [(set RFP32:$dst,
+ (OpNode RFP32:$src1, (loadf32 addr:$src2)))]>;
+def _Fp64m : FpIf64<(outs RFP64:$dst),
+ (ins RFP64:$src1, f64mem:$src2), OneArgFPRW,
+ [(set RFP64:$dst,
+ (OpNode RFP64:$src1, (loadf64 addr:$src2)))]>;
+def _Fp64m32: FpIf64<(outs RFP64:$dst),
+ (ins RFP64:$src1, f32mem:$src2), OneArgFPRW,
+ [(set RFP64:$dst,
+ (OpNode RFP64:$src1, (f64 (extloadf32 addr:$src2))))]>;
+def _Fp80m32: FpI_<(outs RFP80:$dst),
+ (ins RFP80:$src1, f32mem:$src2), OneArgFPRW,
+ [(set RFP80:$dst,
+ (OpNode RFP80:$src1, (f80 (extloadf32 addr:$src2))))]>;
+def _Fp80m64: FpI_<(outs RFP80:$dst),
+ (ins RFP80:$src1, f64mem:$src2), OneArgFPRW,
+ [(set RFP80:$dst,
+ (OpNode RFP80:$src1, (f80 (extloadf64 addr:$src2))))]>;
+def _F32m : FPI<0xD8, fp, (outs), (ins f32mem:$src),
+ !strconcat("f", asmstring, "{s}\t$src")> {
+ let mayLoad = 1;
+}
+def _F64m : FPI<0xDC, fp, (outs), (ins f64mem:$src),
+ !strconcat("f", asmstring, "{l}\t$src")> {
+ let mayLoad = 1;
+}
+// ST(0) = ST(0) + [memint]
+def _FpI16m32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, i16mem:$src2),
+ OneArgFPRW,
+ [(set RFP32:$dst, (OpNode RFP32:$src1,
+ (X86fild addr:$src2, i16)))]>;
+def _FpI32m32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, i32mem:$src2),
+ OneArgFPRW,
+ [(set RFP32:$dst, (OpNode RFP32:$src1,
+ (X86fild addr:$src2, i32)))]>;
+def _FpI16m64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, i16mem:$src2),
+ OneArgFPRW,
+ [(set RFP64:$dst, (OpNode RFP64:$src1,
+ (X86fild addr:$src2, i16)))]>;
+def _FpI32m64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, i32mem:$src2),
+ OneArgFPRW,
+ [(set RFP64:$dst, (OpNode RFP64:$src1,
+ (X86fild addr:$src2, i32)))]>;
+def _FpI16m80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, i16mem:$src2),
+ OneArgFPRW,
+ [(set RFP80:$dst, (OpNode RFP80:$src1,
+ (X86fild addr:$src2, i16)))]>;
+def _FpI32m80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, i32mem:$src2),
+ OneArgFPRW,
+ [(set RFP80:$dst, (OpNode RFP80:$src1,
+ (X86fild addr:$src2, i32)))]>;
+def _FI16m : FPI<0xDE, fp, (outs), (ins i16mem:$src),
+ !strconcat("fi", asmstring, "{s}\t$src")> {
+ let mayLoad = 1;
+}
+def _FI32m : FPI<0xDA, fp, (outs), (ins i32mem:$src),
+ !strconcat("fi", asmstring, "{l}\t$src")> {
+ let mayLoad = 1;
+}
+}
+
+let Defs = [FPSW] in {
+defm ADD : FPBinary_rr<fadd>;
+defm SUB : FPBinary_rr<fsub>;
+defm MUL : FPBinary_rr<fmul>;
+defm DIV : FPBinary_rr<fdiv>;
+defm ADD : FPBinary<fadd, MRM0m, "add">;
+defm SUB : FPBinary<fsub, MRM4m, "sub">;
+defm SUBR: FPBinary<fsub ,MRM5m, "subr">;
+defm MUL : FPBinary<fmul, MRM1m, "mul">;
+defm DIV : FPBinary<fdiv, MRM6m, "div">;
+defm DIVR: FPBinary<fdiv, MRM7m, "divr">;
+}
+
+class FPST0rInst<bits<8> o, string asm>
+ : FPI<o, AddRegFrm, (outs), (ins RST:$op), asm>, D8;
+class FPrST0Inst<bits<8> o, string asm>
+ : FPI<o, AddRegFrm, (outs), (ins RST:$op), asm>, DC;
+class FPrST0PInst<bits<8> o, string asm>
+ : FPI<o, AddRegFrm, (outs), (ins RST:$op), asm>, DE;
+
+// NOTE: GAS and apparently all other AT&T style assemblers have a broken notion
+// of some of the 'reverse' forms of the fsub and fdiv instructions. As such,
+// we have to put some 'r's in and take them out of weird places.
+def ADD_FST0r : FPST0rInst <0xC0, "fadd\t$op">;
+def ADD_FrST0 : FPrST0Inst <0xC0, "fadd\t{%st(0), $op|$op, ST(0)}">;
+def ADD_FPrST0 : FPrST0PInst<0xC0, "faddp\t$op">;
+def SUBR_FST0r : FPST0rInst <0xE8, "fsubr\t$op">;
+def SUB_FrST0 : FPrST0Inst <0xE8, "fsub{r}\t{%st(0), $op|$op, ST(0)}">;
+def SUB_FPrST0 : FPrST0PInst<0xE8, "fsub{r}p\t$op">;
+def SUB_FST0r : FPST0rInst <0xE0, "fsub\t$op">;
+def SUBR_FrST0 : FPrST0Inst <0xE0, "fsub{|r}\t{%st(0), $op|$op, ST(0)}">;
+def SUBR_FPrST0 : FPrST0PInst<0xE0, "fsub{|r}p\t$op">;
+def MUL_FST0r : FPST0rInst <0xC8, "fmul\t$op">;
+def MUL_FrST0 : FPrST0Inst <0xC8, "fmul\t{%st(0), $op|$op, ST(0)}">;
+def MUL_FPrST0 : FPrST0PInst<0xC8, "fmulp\t$op">;
+def DIVR_FST0r : FPST0rInst <0xF8, "fdivr\t$op">;
+def DIV_FrST0 : FPrST0Inst <0xF8, "fdiv{r}\t{%st(0), $op|$op, ST(0)}">;
+def DIV_FPrST0 : FPrST0PInst<0xF8, "fdiv{r}p\t$op">;
+def DIV_FST0r : FPST0rInst <0xF0, "fdiv\t$op">;
+def DIVR_FrST0 : FPrST0Inst <0xF0, "fdiv{|r}\t{%st(0), $op|$op, ST(0)}">;
+def DIVR_FPrST0 : FPrST0PInst<0xF0, "fdiv{|r}p\t$op">;
+
+def COM_FST0r : FPST0rInst <0xD0, "fcom\t$op">;
+def COMP_FST0r : FPST0rInst <0xD8, "fcomp\t$op">;
+
+// Unary operations.
+multiclass FPUnary<SDNode OpNode, bits<8> opcode, string asmstring> {
+def _Fp32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src), OneArgFPRW,
+ [(set RFP32:$dst, (OpNode RFP32:$src))]>;
+def _Fp64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src), OneArgFPRW,
+ [(set RFP64:$dst, (OpNode RFP64:$src))]>;
+def _Fp80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src), OneArgFPRW,
+ [(set RFP80:$dst, (OpNode RFP80:$src))]>;
+def _F : FPI<opcode, RawFrm, (outs), (ins), asmstring>, D9;
+}
+
+let Defs = [FPSW] in {
+defm CHS : FPUnary<fneg, 0xE0, "fchs">;
+defm ABS : FPUnary<fabs, 0xE1, "fabs">;
+defm SQRT: FPUnary<fsqrt,0xFA, "fsqrt">;
+defm SIN : FPUnary<fsin, 0xFE, "fsin">;
+defm COS : FPUnary<fcos, 0xFF, "fcos">;
+
+let neverHasSideEffects = 1 in {
+def TST_Fp32 : FpIf32<(outs), (ins RFP32:$src), OneArgFP, []>;
+def TST_Fp64 : FpIf64<(outs), (ins RFP64:$src), OneArgFP, []>;
+def TST_Fp80 : FpI_<(outs), (ins RFP80:$src), OneArgFP, []>;
+}
+def TST_F : FPI<0xE4, RawFrm, (outs), (ins), "ftst">, D9;
+} // Defs = [FPSW]
+
+// Versions of FP instructions that take a single memory operand. Added for the
+// disassembler; remove as they are included with patterns elsewhere.
+def FCOM32m : FPI<0xD8, MRM2m, (outs), (ins f32mem:$src), "fcom{s}\t$src">;
+def FCOMP32m : FPI<0xD8, MRM3m, (outs), (ins f32mem:$src), "fcomp{s}\t$src">;
+
+def FLDENVm : FPI<0xD9, MRM4m, (outs), (ins f32mem:$src), "fldenv\t$src">;
+def FSTENVm : FPI<0xD9, MRM6m, (outs f32mem:$dst), (ins), "fnstenv\t$dst">;
+
+def FICOM32m : FPI<0xDA, MRM2m, (outs), (ins i32mem:$src), "ficom{l}\t$src">;
+def FICOMP32m: FPI<0xDA, MRM3m, (outs), (ins i32mem:$src), "ficomp{l}\t$src">;
+
+def FCOM64m : FPI<0xDC, MRM2m, (outs), (ins f64mem:$src), "fcom{l}\t$src">;
+def FCOMP64m : FPI<0xDC, MRM3m, (outs), (ins f64mem:$src), "fcomp{l}\t$src">;
+
+def FRSTORm : FPI<0xDD, MRM4m, (outs f32mem:$dst), (ins), "frstor\t$dst">;
+def FSAVEm : FPI<0xDD, MRM6m, (outs f32mem:$dst), (ins), "fnsave\t$dst">;
+def FNSTSWm : FPI<0xDD, MRM7m, (outs f32mem:$dst), (ins), "fnstsw\t$dst">;
+
+def FICOM16m : FPI<0xDE, MRM2m, (outs), (ins i16mem:$src), "ficom{s}\t$src">;
+def FICOMP16m: FPI<0xDE, MRM3m, (outs), (ins i16mem:$src), "ficomp{s}\t$src">;
+
+def FBLDm : FPI<0xDF, MRM4m, (outs), (ins f32mem:$src), "fbld\t$src">;
+def FBSTPm : FPI<0xDF, MRM6m, (outs f32mem:$dst), (ins), "fbstp\t$dst">;
+
+// Floating point cmovs.
+class FpIf32CMov<dag outs, dag ins, FPFormat fp, list<dag> pattern> :
+ FpI_<outs, ins, fp, pattern>, Requires<[FPStackf32, HasCMov]>;
+class FpIf64CMov<dag outs, dag ins, FPFormat fp, list<dag> pattern> :
+ FpI_<outs, ins, fp, pattern>, Requires<[FPStackf64, HasCMov]>;
+
+multiclass FPCMov<PatLeaf cc> {
+ def _Fp32 : FpIf32CMov<(outs RFP32:$dst), (ins RFP32:$src1, RFP32:$src2),
+ CondMovFP,
+ [(set RFP32:$dst, (X86cmov RFP32:$src1, RFP32:$src2,
+ cc, EFLAGS))]>;
+ def _Fp64 : FpIf64CMov<(outs RFP64:$dst), (ins RFP64:$src1, RFP64:$src2),
+ CondMovFP,
+ [(set RFP64:$dst, (X86cmov RFP64:$src1, RFP64:$src2,
+ cc, EFLAGS))]>;
+ def _Fp80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, RFP80:$src2),
+ CondMovFP,
+ [(set RFP80:$dst, (X86cmov RFP80:$src1, RFP80:$src2,
+ cc, EFLAGS))]>,
+ Requires<[HasCMov]>;
+}
+
+let Defs = [FPSW] in {
+let Uses = [EFLAGS], Constraints = "$src1 = $dst" in {
+defm CMOVB : FPCMov<X86_COND_B>;
+defm CMOVBE : FPCMov<X86_COND_BE>;
+defm CMOVE : FPCMov<X86_COND_E>;
+defm CMOVP : FPCMov<X86_COND_P>;
+defm CMOVNB : FPCMov<X86_COND_AE>;
+defm CMOVNBE: FPCMov<X86_COND_A>;
+defm CMOVNE : FPCMov<X86_COND_NE>;
+defm CMOVNP : FPCMov<X86_COND_NP>;
+} // Uses = [EFLAGS], Constraints = "$src1 = $dst"
+
+let Predicates = [HasCMov] in {
+// These are not factored because there's no clean way to pass DA/DB.
+def CMOVB_F : FPI<0xC0, AddRegFrm, (outs RST:$op), (ins),
+ "fcmovb\t{$op, %st(0)|ST(0), $op}">, DA;
+def CMOVBE_F : FPI<0xD0, AddRegFrm, (outs RST:$op), (ins),
+ "fcmovbe\t{$op, %st(0)|ST(0), $op}">, DA;
+def CMOVE_F : FPI<0xC8, AddRegFrm, (outs RST:$op), (ins),
+ "fcmove\t{$op, %st(0)|ST(0), $op}">, DA;
+def CMOVP_F : FPI<0xD8, AddRegFrm, (outs RST:$op), (ins),
+ "fcmovu\t {$op, %st(0)|ST(0), $op}">, DA;
+def CMOVNB_F : FPI<0xC0, AddRegFrm, (outs RST:$op), (ins),
+ "fcmovnb\t{$op, %st(0)|ST(0), $op}">, DB;
+def CMOVNBE_F: FPI<0xD0, AddRegFrm, (outs RST:$op), (ins),
+ "fcmovnbe\t{$op, %st(0)|ST(0), $op}">, DB;
+def CMOVNE_F : FPI<0xC8, AddRegFrm, (outs RST:$op), (ins),
+ "fcmovne\t{$op, %st(0)|ST(0), $op}">, DB;
+def CMOVNP_F : FPI<0xD8, AddRegFrm, (outs RST:$op), (ins),
+ "fcmovnu\t{$op, %st(0)|ST(0), $op}">, DB;
+} // Predicates = [HasCMov]
+
+// Floating point loads & stores.
+let canFoldAsLoad = 1 in {
+def LD_Fp32m : FpIf32<(outs RFP32:$dst), (ins f32mem:$src), ZeroArgFP,
+ [(set RFP32:$dst, (loadf32 addr:$src))]>;
+let isReMaterializable = 1 in
+ def LD_Fp64m : FpIf64<(outs RFP64:$dst), (ins f64mem:$src), ZeroArgFP,
+ [(set RFP64:$dst, (loadf64 addr:$src))]>;
+def LD_Fp80m : FpI_<(outs RFP80:$dst), (ins f80mem:$src), ZeroArgFP,
+ [(set RFP80:$dst, (loadf80 addr:$src))]>;
+}
+def LD_Fp32m64 : FpIf64<(outs RFP64:$dst), (ins f32mem:$src), ZeroArgFP,
+ [(set RFP64:$dst, (f64 (extloadf32 addr:$src)))]>;
+def LD_Fp64m80 : FpI_<(outs RFP80:$dst), (ins f64mem:$src), ZeroArgFP,
+ [(set RFP80:$dst, (f80 (extloadf64 addr:$src)))]>;
+def LD_Fp32m80 : FpI_<(outs RFP80:$dst), (ins f32mem:$src), ZeroArgFP,
+ [(set RFP80:$dst, (f80 (extloadf32 addr:$src)))]>;
+def ILD_Fp16m32: FpIf32<(outs RFP32:$dst), (ins i16mem:$src), ZeroArgFP,
+ [(set RFP32:$dst, (X86fild addr:$src, i16))]>;
+def ILD_Fp32m32: FpIf32<(outs RFP32:$dst), (ins i32mem:$src), ZeroArgFP,
+ [(set RFP32:$dst, (X86fild addr:$src, i32))]>;
+def ILD_Fp64m32: FpIf32<(outs RFP32:$dst), (ins i64mem:$src), ZeroArgFP,
+ [(set RFP32:$dst, (X86fild addr:$src, i64))]>;
+def ILD_Fp16m64: FpIf64<(outs RFP64:$dst), (ins i16mem:$src), ZeroArgFP,
+ [(set RFP64:$dst, (X86fild addr:$src, i16))]>;
+def ILD_Fp32m64: FpIf64<(outs RFP64:$dst), (ins i32mem:$src), ZeroArgFP,
+ [(set RFP64:$dst, (X86fild addr:$src, i32))]>;
+def ILD_Fp64m64: FpIf64<(outs RFP64:$dst), (ins i64mem:$src), ZeroArgFP,
+ [(set RFP64:$dst, (X86fild addr:$src, i64))]>;
+def ILD_Fp16m80: FpI_<(outs RFP80:$dst), (ins i16mem:$src), ZeroArgFP,
+ [(set RFP80:$dst, (X86fild addr:$src, i16))]>;
+def ILD_Fp32m80: FpI_<(outs RFP80:$dst), (ins i32mem:$src), ZeroArgFP,
+ [(set RFP80:$dst, (X86fild addr:$src, i32))]>;
+def ILD_Fp64m80: FpI_<(outs RFP80:$dst), (ins i64mem:$src), ZeroArgFP,
+ [(set RFP80:$dst, (X86fild addr:$src, i64))]>;
+
+def ST_Fp32m : FpIf32<(outs), (ins f32mem:$op, RFP32:$src), OneArgFP,
+ [(store RFP32:$src, addr:$op)]>;
+def ST_Fp64m32 : FpIf64<(outs), (ins f32mem:$op, RFP64:$src), OneArgFP,
+ [(truncstoref32 RFP64:$src, addr:$op)]>;
+def ST_Fp64m : FpIf64<(outs), (ins f64mem:$op, RFP64:$src), OneArgFP,
+ [(store RFP64:$src, addr:$op)]>;
+def ST_Fp80m32 : FpI_<(outs), (ins f32mem:$op, RFP80:$src), OneArgFP,
+ [(truncstoref32 RFP80:$src, addr:$op)]>;
+def ST_Fp80m64 : FpI_<(outs), (ins f64mem:$op, RFP80:$src), OneArgFP,
+ [(truncstoref64 RFP80:$src, addr:$op)]>;
+// FST does not support 80-bit memory target; FSTP must be used.
+
+let mayStore = 1, neverHasSideEffects = 1 in {
+def ST_FpP32m : FpIf32<(outs), (ins f32mem:$op, RFP32:$src), OneArgFP, []>;
+def ST_FpP64m32 : FpIf64<(outs), (ins f32mem:$op, RFP64:$src), OneArgFP, []>;
+def ST_FpP64m : FpIf64<(outs), (ins f64mem:$op, RFP64:$src), OneArgFP, []>;
+def ST_FpP80m32 : FpI_<(outs), (ins f32mem:$op, RFP80:$src), OneArgFP, []>;
+def ST_FpP80m64 : FpI_<(outs), (ins f64mem:$op, RFP80:$src), OneArgFP, []>;
+}
+def ST_FpP80m : FpI_<(outs), (ins f80mem:$op, RFP80:$src), OneArgFP,
+ [(store RFP80:$src, addr:$op)]>;
+let mayStore = 1, neverHasSideEffects = 1 in {
+def IST_Fp16m32 : FpIf32<(outs), (ins i16mem:$op, RFP32:$src), OneArgFP, []>;
+def IST_Fp32m32 : FpIf32<(outs), (ins i32mem:$op, RFP32:$src), OneArgFP, []>;
+def IST_Fp64m32 : FpIf32<(outs), (ins i64mem:$op, RFP32:$src), OneArgFP, []>;
+def IST_Fp16m64 : FpIf64<(outs), (ins i16mem:$op, RFP64:$src), OneArgFP, []>;
+def IST_Fp32m64 : FpIf64<(outs), (ins i32mem:$op, RFP64:$src), OneArgFP, []>;
+def IST_Fp64m64 : FpIf64<(outs), (ins i64mem:$op, RFP64:$src), OneArgFP, []>;
+def IST_Fp16m80 : FpI_<(outs), (ins i16mem:$op, RFP80:$src), OneArgFP, []>;
+def IST_Fp32m80 : FpI_<(outs), (ins i32mem:$op, RFP80:$src), OneArgFP, []>;
+def IST_Fp64m80 : FpI_<(outs), (ins i64mem:$op, RFP80:$src), OneArgFP, []>;
+}
+
+let mayLoad = 1, SchedRW = [WriteLoad] in {
+def LD_F32m : FPI<0xD9, MRM0m, (outs), (ins f32mem:$src), "fld{s}\t$src",
+ IIC_FLD>;
+def LD_F64m : FPI<0xDD, MRM0m, (outs), (ins f64mem:$src), "fld{l}\t$src",
+ IIC_FLD>;
+def LD_F80m : FPI<0xDB, MRM5m, (outs), (ins f80mem:$src), "fld{t}\t$src",
+ IIC_FLD80>;
+def ILD_F16m : FPI<0xDF, MRM0m, (outs), (ins i16mem:$src), "fild{s}\t$src",
+ IIC_FILD>;
+def ILD_F32m : FPI<0xDB, MRM0m, (outs), (ins i32mem:$src), "fild{l}\t$src",
+ IIC_FILD>;
+def ILD_F64m : FPI<0xDF, MRM5m, (outs), (ins i64mem:$src), "fild{ll}\t$src",
+ IIC_FILD>;
+}
+let mayStore = 1, SchedRW = [WriteStore] in {
+def ST_F32m : FPI<0xD9, MRM2m, (outs), (ins f32mem:$dst), "fst{s}\t$dst",
+ IIC_FST>;
+def ST_F64m : FPI<0xDD, MRM2m, (outs), (ins f64mem:$dst), "fst{l}\t$dst",
+ IIC_FST>;
+def ST_FP32m : FPI<0xD9, MRM3m, (outs), (ins f32mem:$dst), "fstp{s}\t$dst",
+ IIC_FST>;
+def ST_FP64m : FPI<0xDD, MRM3m, (outs), (ins f64mem:$dst), "fstp{l}\t$dst",
+ IIC_FST>;
+def ST_FP80m : FPI<0xDB, MRM7m, (outs), (ins f80mem:$dst), "fstp{t}\t$dst",
+ IIC_FST80>;
+def IST_F16m : FPI<0xDF, MRM2m, (outs), (ins i16mem:$dst), "fist{s}\t$dst",
+ IIC_FIST>;
+def IST_F32m : FPI<0xDB, MRM2m, (outs), (ins i32mem:$dst), "fist{l}\t$dst",
+ IIC_FIST>;
+def IST_FP16m : FPI<0xDF, MRM3m, (outs), (ins i16mem:$dst), "fistp{s}\t$dst",
+ IIC_FIST>;
+def IST_FP32m : FPI<0xDB, MRM3m, (outs), (ins i32mem:$dst), "fistp{l}\t$dst",
+ IIC_FIST>;
+def IST_FP64m : FPI<0xDF, MRM7m, (outs), (ins i64mem:$dst), "fistp{ll}\t$dst",
+ IIC_FIST>;
+}
+
+// FISTTP requires SSE3 even though it's a FPStack op.
+let Predicates = [HasSSE3] in {
+def ISTT_Fp16m32 : FpI_<(outs), (ins i16mem:$op, RFP32:$src), OneArgFP,
+ [(X86fp_to_i16mem RFP32:$src, addr:$op)]>;
+def ISTT_Fp32m32 : FpI_<(outs), (ins i32mem:$op, RFP32:$src), OneArgFP,
+ [(X86fp_to_i32mem RFP32:$src, addr:$op)]>;
+def ISTT_Fp64m32 : FpI_<(outs), (ins i64mem:$op, RFP32:$src), OneArgFP,
+ [(X86fp_to_i64mem RFP32:$src, addr:$op)]>;
+def ISTT_Fp16m64 : FpI_<(outs), (ins i16mem:$op, RFP64:$src), OneArgFP,
+ [(X86fp_to_i16mem RFP64:$src, addr:$op)]>;
+def ISTT_Fp32m64 : FpI_<(outs), (ins i32mem:$op, RFP64:$src), OneArgFP,
+ [(X86fp_to_i32mem RFP64:$src, addr:$op)]>;
+def ISTT_Fp64m64 : FpI_<(outs), (ins i64mem:$op, RFP64:$src), OneArgFP,
+ [(X86fp_to_i64mem RFP64:$src, addr:$op)]>;
+def ISTT_Fp16m80 : FpI_<(outs), (ins i16mem:$op, RFP80:$src), OneArgFP,
+ [(X86fp_to_i16mem RFP80:$src, addr:$op)]>;
+def ISTT_Fp32m80 : FpI_<(outs), (ins i32mem:$op, RFP80:$src), OneArgFP,
+ [(X86fp_to_i32mem RFP80:$src, addr:$op)]>;
+def ISTT_Fp64m80 : FpI_<(outs), (ins i64mem:$op, RFP80:$src), OneArgFP,
+ [(X86fp_to_i64mem RFP80:$src, addr:$op)]>;
+} // Predicates = [HasSSE3]
+
+let mayStore = 1, SchedRW = [WriteStore] in {
+def ISTT_FP16m : FPI<0xDF, MRM1m, (outs), (ins i16mem:$dst), "fisttp{s}\t$dst",
+ IIC_FST>;
+def ISTT_FP32m : FPI<0xDB, MRM1m, (outs), (ins i32mem:$dst), "fisttp{l}\t$dst",
+ IIC_FST>;
+def ISTT_FP64m : FPI<0xDD, MRM1m, (outs), (ins i64mem:$dst),
+ "fisttp{ll}\t$dst", IIC_FST>;
+}
+
+// FP Stack manipulation instructions.
+let SchedRW = [WriteMove] in {
+def LD_Frr : FPI<0xC0, AddRegFrm, (outs), (ins RST:$op), "fld\t$op",
+ IIC_FLD>, D9;
+def ST_Frr : FPI<0xD0, AddRegFrm, (outs), (ins RST:$op), "fst\t$op",
+ IIC_FST>, DD;
+def ST_FPrr : FPI<0xD8, AddRegFrm, (outs), (ins RST:$op), "fstp\t$op",
+ IIC_FST>, DD;
+def XCH_F : FPI<0xC8, AddRegFrm, (outs), (ins RST:$op), "fxch\t$op",
+ IIC_FXCH>, D9;
+}
+
+// Floating point constant loads.
+let isReMaterializable = 1 in {
+def LD_Fp032 : FpIf32<(outs RFP32:$dst), (ins), ZeroArgFP,
+ [(set RFP32:$dst, fpimm0)]>;
+def LD_Fp132 : FpIf32<(outs RFP32:$dst), (ins), ZeroArgFP,
+ [(set RFP32:$dst, fpimm1)]>;
+def LD_Fp064 : FpIf64<(outs RFP64:$dst), (ins), ZeroArgFP,
+ [(set RFP64:$dst, fpimm0)]>;
+def LD_Fp164 : FpIf64<(outs RFP64:$dst), (ins), ZeroArgFP,
+ [(set RFP64:$dst, fpimm1)]>;
+def LD_Fp080 : FpI_<(outs RFP80:$dst), (ins), ZeroArgFP,
+ [(set RFP80:$dst, fpimm0)]>;
+def LD_Fp180 : FpI_<(outs RFP80:$dst), (ins), ZeroArgFP,
+ [(set RFP80:$dst, fpimm1)]>;
+}
+
+let SchedRW = [WriteZero] in {
+def LD_F0 : FPI<0xEE, RawFrm, (outs), (ins), "fldz", IIC_FLDZ>, D9;
+def LD_F1 : FPI<0xE8, RawFrm, (outs), (ins), "fld1", IIC_FIST>, D9;
+}
+
+// Floating point compares.
+let SchedRW = [WriteFAdd] in {
+def UCOM_Fpr32 : FpIf32<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP,
+ [(set FPSW, (trunc (X86cmp RFP32:$lhs, RFP32:$rhs)))]>;
+def UCOM_Fpr64 : FpIf64<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP,
+ [(set FPSW, (trunc (X86cmp RFP64:$lhs, RFP64:$rhs)))]>;
+def UCOM_Fpr80 : FpI_ <(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP,
+ [(set FPSW, (trunc (X86cmp RFP80:$lhs, RFP80:$rhs)))]>;
+} // SchedRW
+} // Defs = [FPSW]
+
+let SchedRW = [WriteFAdd] in {
+// CC = ST(0) cmp ST(i)
+let Defs = [EFLAGS, FPSW] in {
+def UCOM_FpIr32: FpIf32<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP,
+ [(set EFLAGS, (X86cmp RFP32:$lhs, RFP32:$rhs))]>;
+def UCOM_FpIr64: FpIf64<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP,
+ [(set EFLAGS, (X86cmp RFP64:$lhs, RFP64:$rhs))]>;
+def UCOM_FpIr80: FpI_<(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP,
+ [(set EFLAGS, (X86cmp RFP80:$lhs, RFP80:$rhs))]>;
+}
+
+let Defs = [FPSW], Uses = [ST0] in {
+def UCOM_Fr : FPI<0xE0, AddRegFrm, // FPSW = cmp ST(0) with ST(i)
+ (outs), (ins RST:$reg),
+ "fucom\t$reg", IIC_FUCOM>, DD;
+def UCOM_FPr : FPI<0xE8, AddRegFrm, // FPSW = cmp ST(0) with ST(i), pop
+ (outs), (ins RST:$reg),
+ "fucomp\t$reg", IIC_FUCOM>, DD;
+def UCOM_FPPr : FPI<0xE9, RawFrm, // cmp ST(0) with ST(1), pop, pop
+ (outs), (ins),
+ "fucompp", IIC_FUCOM>, DA;
+}
+
+let Defs = [EFLAGS, FPSW], Uses = [ST0] in {
+def UCOM_FIr : FPI<0xE8, AddRegFrm, // CC = cmp ST(0) with ST(i)
+ (outs), (ins RST:$reg),
+ "fucomi\t$reg", IIC_FUCOMI>, DB;
+def UCOM_FIPr : FPI<0xE8, AddRegFrm, // CC = cmp ST(0) with ST(i), pop
+ (outs), (ins RST:$reg),
+ "fucompi\t$reg", IIC_FUCOMI>, DF;
+}
+
+let Defs = [EFLAGS, FPSW] in {
+def COM_FIr : FPI<0xF0, AddRegFrm, (outs), (ins RST:$reg),
+ "fcomi\t$reg", IIC_FCOMI>, DB;
+def COM_FIPr : FPI<0xF0, AddRegFrm, (outs), (ins RST:$reg),
+ "fcompi\t$reg", IIC_FCOMI>, DF;
+}
+} // SchedRW
+
+// Floating point flag ops.
+let SchedRW = [WriteALU] in {
+let Defs = [AX], Uses = [FPSW] in
+def FNSTSW16r : I<0xE0, RawFrm, // AX = fp flags
+ (outs), (ins), "fnstsw %ax",
+ [(set AX, (X86fp_stsw FPSW))], IIC_FNSTSW>, DF;
+
+def FNSTCW16m : I<0xD9, MRM7m, // [mem16] = X87 control world
+ (outs), (ins i16mem:$dst), "fnstcw\t$dst",
+ [(X86fp_cwd_get16 addr:$dst)], IIC_FNSTCW>;
+} // SchedRW
+let mayLoad = 1 in
+def FLDCW16m : I<0xD9, MRM5m, // X87 control world = [mem16]
+ (outs), (ins i16mem:$dst), "fldcw\t$dst", [], IIC_FLDCW>,
+ Sched<[WriteLoad]>;
+
+// FPU control instructions
+let SchedRW = [WriteMicrocoded] in {
+let Defs = [FPSW] in
+def FNINIT : I<0xE3, RawFrm, (outs), (ins), "fninit", [], IIC_FNINIT>, DB;
+def FFREE : FPI<0xC0, AddRegFrm, (outs), (ins RST:$reg),
+ "ffree\t$reg", IIC_FFREE>, DD;
+// Clear exceptions
+
+let Defs = [FPSW] in
+def FNCLEX : I<0xE2, RawFrm, (outs), (ins), "fnclex", [], IIC_FNCLEX>, DB;
+} // SchedRW
+
+// Operandless floating-point instructions for the disassembler.
+let SchedRW = [WriteMicrocoded] in {
+def WAIT : I<0x9B, RawFrm, (outs), (ins), "wait", [], IIC_WAIT>;
+
+def FNOP : I<0xD0, RawFrm, (outs), (ins), "fnop", [], IIC_FNOP>, D9;
+def FXAM : I<0xE5, RawFrm, (outs), (ins), "fxam", [], IIC_FXAM>, D9;
+def FLDL2T : I<0xE9, RawFrm, (outs), (ins), "fldl2t", [], IIC_FLDL>, D9;
+def FLDL2E : I<0xEA, RawFrm, (outs), (ins), "fldl2e", [], IIC_FLDL>, D9;
+def FLDPI : I<0xEB, RawFrm, (outs), (ins), "fldpi", [], IIC_FLDL>, D9;
+def FLDLG2 : I<0xEC, RawFrm, (outs), (ins), "fldlg2", [], IIC_FLDL>, D9;
+def FLDLN2 : I<0xED, RawFrm, (outs), (ins), "fldln2", [], IIC_FLDL>, D9;
+def F2XM1 : I<0xF0, RawFrm, (outs), (ins), "f2xm1", [], IIC_F2XM1>, D9;
+def FYL2X : I<0xF1, RawFrm, (outs), (ins), "fyl2x", [], IIC_FYL2X>, D9;
+def FPTAN : I<0xF2, RawFrm, (outs), (ins), "fptan", [], IIC_FPTAN>, D9;
+def FPATAN : I<0xF3, RawFrm, (outs), (ins), "fpatan", [], IIC_FPATAN>, D9;
+def FXTRACT : I<0xF4, RawFrm, (outs), (ins), "fxtract", [], IIC_FXTRACT>, D9;
+def FPREM1 : I<0xF5, RawFrm, (outs), (ins), "fprem1", [], IIC_FPREM1>, D9;
+def FDECSTP : I<0xF6, RawFrm, (outs), (ins), "fdecstp", [], IIC_FPSTP>, D9;
+def FINCSTP : I<0xF7, RawFrm, (outs), (ins), "fincstp", [], IIC_FPSTP>, D9;
+def FPREM : I<0xF8, RawFrm, (outs), (ins), "fprem", [], IIC_FPREM>, D9;
+def FYL2XP1 : I<0xF9, RawFrm, (outs), (ins), "fyl2xp1", [], IIC_FYL2XP1>, D9;
+def FSINCOS : I<0xFB, RawFrm, (outs), (ins), "fsincos", [], IIC_FSINCOS>, D9;
+def FRNDINT : I<0xFC, RawFrm, (outs), (ins), "frndint", [], IIC_FRNDINT>, D9;
+def FSCALE : I<0xFD, RawFrm, (outs), (ins), "fscale", [], IIC_FSCALE>, D9;
+def FCOMPP : I<0xD9, RawFrm, (outs), (ins), "fcompp", [], IIC_FCOMPP>, DE;
+
+def FXSAVE : I<0xAE, MRM0m, (outs opaque512mem:$dst), (ins),
+ "fxsave\t$dst", [], IIC_FXSAVE>, TB;
+def FXSAVE64 : I<0xAE, MRM0m, (outs opaque512mem:$dst), (ins),
+ "fxsaveq\t$dst", [], IIC_FXSAVE>, TB, REX_W,
+ Requires<[In64BitMode]>;
+def FXRSTOR : I<0xAE, MRM1m, (outs), (ins opaque512mem:$src),
+ "fxrstor\t$src", [], IIC_FXRSTOR>, TB;
+def FXRSTOR64 : I<0xAE, MRM1m, (outs), (ins opaque512mem:$src),
+ "fxrstorq\t$src", [], IIC_FXRSTOR>, TB, REX_W,
+ Requires<[In64BitMode]>;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//===----------------------------------------------------------------------===//
+
+// Required for RET of f32 / f64 / f80 values.
+def : Pat<(X86fld addr:$src, f32), (LD_Fp32m addr:$src)>;
+def : Pat<(X86fld addr:$src, f64), (LD_Fp64m addr:$src)>;
+def : Pat<(X86fld addr:$src, f80), (LD_Fp80m addr:$src)>;
+
+// Required for CALL which return f32 / f64 / f80 values.
+def : Pat<(X86fst RFP32:$src, addr:$op, f32), (ST_Fp32m addr:$op, RFP32:$src)>;
+def : Pat<(X86fst RFP64:$src, addr:$op, f32), (ST_Fp64m32 addr:$op,
+ RFP64:$src)>;
+def : Pat<(X86fst RFP64:$src, addr:$op, f64), (ST_Fp64m addr:$op, RFP64:$src)>;
+def : Pat<(X86fst RFP80:$src, addr:$op, f32), (ST_Fp80m32 addr:$op,
+ RFP80:$src)>;
+def : Pat<(X86fst RFP80:$src, addr:$op, f64), (ST_Fp80m64 addr:$op,
+ RFP80:$src)>;
+def : Pat<(X86fst RFP80:$src, addr:$op, f80), (ST_FpP80m addr:$op,
+ RFP80:$src)>;
+
+// Floating point constant -0.0 and -1.0
+def : Pat<(f32 fpimmneg0), (CHS_Fp32 (LD_Fp032))>, Requires<[FPStackf32]>;
+def : Pat<(f32 fpimmneg1), (CHS_Fp32 (LD_Fp132))>, Requires<[FPStackf32]>;
+def : Pat<(f64 fpimmneg0), (CHS_Fp64 (LD_Fp064))>, Requires<[FPStackf64]>;
+def : Pat<(f64 fpimmneg1), (CHS_Fp64 (LD_Fp164))>, Requires<[FPStackf64]>;
+def : Pat<(f80 fpimmneg0), (CHS_Fp80 (LD_Fp080))>;
+def : Pat<(f80 fpimmneg1), (CHS_Fp80 (LD_Fp180))>;
+
+// Used to conv. i64 to f64 since there isn't a SSE version.
+def : Pat<(X86fildflag addr:$src, i64), (ILD_Fp64m64 addr:$src)>;
+
+// FP extensions map onto simple pseudo-value conversions if they are to/from
+// the FP stack.
+def : Pat<(f64 (fextend RFP32:$src)), (COPY_TO_REGCLASS RFP32:$src, RFP64)>,
+ Requires<[FPStackf32]>;
+def : Pat<(f80 (fextend RFP32:$src)), (COPY_TO_REGCLASS RFP32:$src, RFP80)>,
+ Requires<[FPStackf32]>;
+def : Pat<(f80 (fextend RFP64:$src)), (COPY_TO_REGCLASS RFP64:$src, RFP80)>,
+ Requires<[FPStackf64]>;
+
+// FP truncations map onto simple pseudo-value conversions if they are to/from
+// the FP stack. We have validated that only value-preserving truncations make
+// it through isel.
+def : Pat<(f32 (fround RFP64:$src)), (COPY_TO_REGCLASS RFP64:$src, RFP32)>,
+ Requires<[FPStackf32]>;
+def : Pat<(f32 (fround RFP80:$src)), (COPY_TO_REGCLASS RFP80:$src, RFP32)>,
+ Requires<[FPStackf32]>;
+def : Pat<(f64 (fround RFP80:$src)), (COPY_TO_REGCLASS RFP80:$src, RFP64)>,
+ Requires<[FPStackf64]>;
diff --git a/contrib/llvm/lib/Target/X86/X86InstrFormats.td b/contrib/llvm/lib/Target/X86/X86InstrFormats.td
new file mode 100644
index 0000000..a71e024
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrFormats.td
@@ -0,0 +1,660 @@
+//===-- X86InstrFormats.td - X86 Instruction Formats -------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// X86 Instruction Format Definitions.
+//
+
+// Format specifies the encoding used by the instruction. This is part of the
+// ad-hoc solution used to emit machine instruction encodings by our machine
+// code emitter.
+class Format<bits<6> val> {
+ bits<6> Value = val;
+}
+
+def Pseudo : Format<0>; def RawFrm : Format<1>;
+def AddRegFrm : Format<2>; def MRMDestReg : Format<3>;
+def MRMDestMem : Format<4>; def MRMSrcReg : Format<5>;
+def MRMSrcMem : Format<6>;
+def MRM0r : Format<16>; def MRM1r : Format<17>; def MRM2r : Format<18>;
+def MRM3r : Format<19>; def MRM4r : Format<20>; def MRM5r : Format<21>;
+def MRM6r : Format<22>; def MRM7r : Format<23>;
+def MRM0m : Format<24>; def MRM1m : Format<25>; def MRM2m : Format<26>;
+def MRM3m : Format<27>; def MRM4m : Format<28>; def MRM5m : Format<29>;
+def MRM6m : Format<30>; def MRM7m : Format<31>;
+def MRMInitReg : Format<32>;
+def MRM_C1 : Format<33>;
+def MRM_C2 : Format<34>;
+def MRM_C3 : Format<35>;
+def MRM_C4 : Format<36>;
+def MRM_C8 : Format<37>;
+def MRM_C9 : Format<38>;
+def MRM_CA : Format<39>;
+def MRM_CB : Format<40>;
+def MRM_E8 : Format<41>;
+def MRM_F0 : Format<42>;
+def RawFrmImm8 : Format<43>;
+def RawFrmImm16 : Format<44>;
+def MRM_F8 : Format<45>;
+def MRM_F9 : Format<46>;
+def MRM_D0 : Format<47>;
+def MRM_D1 : Format<48>;
+def MRM_D4 : Format<49>;
+def MRM_D5 : Format<50>;
+def MRM_D6 : Format<51>;
+def MRM_D8 : Format<52>;
+def MRM_D9 : Format<53>;
+def MRM_DA : Format<54>;
+def MRM_DB : Format<55>;
+def MRM_DC : Format<56>;
+def MRM_DD : Format<57>;
+def MRM_DE : Format<58>;
+def MRM_DF : Format<59>;
+
+// ImmType - This specifies the immediate type used by an instruction. This is
+// part of the ad-hoc solution used to emit machine instruction encodings by our
+// machine code emitter.
+class ImmType<bits<3> val> {
+ bits<3> Value = val;
+}
+def NoImm : ImmType<0>;
+def Imm8 : ImmType<1>;
+def Imm8PCRel : ImmType<2>;
+def Imm16 : ImmType<3>;
+def Imm16PCRel : ImmType<4>;
+def Imm32 : ImmType<5>;
+def Imm32PCRel : ImmType<6>;
+def Imm64 : ImmType<7>;
+
+// FPFormat - This specifies what form this FP instruction has. This is used by
+// the Floating-Point stackifier pass.
+class FPFormat<bits<3> val> {
+ bits<3> Value = val;
+}
+def NotFP : FPFormat<0>;
+def ZeroArgFP : FPFormat<1>;
+def OneArgFP : FPFormat<2>;
+def OneArgFPRW : FPFormat<3>;
+def TwoArgFP : FPFormat<4>;
+def CompareFP : FPFormat<5>;
+def CondMovFP : FPFormat<6>;
+def SpecialFP : FPFormat<7>;
+
+// Class specifying the SSE execution domain, used by the SSEDomainFix pass.
+// Keep in sync with tables in X86InstrInfo.cpp.
+class Domain<bits<2> val> {
+ bits<2> Value = val;
+}
+def GenericDomain : Domain<0>;
+def SSEPackedSingle : Domain<1>;
+def SSEPackedDouble : Domain<2>;
+def SSEPackedInt : Domain<3>;
+
+// Prefix byte classes which are used to indicate to the ad-hoc machine code
+// emitter that various prefix bytes are required.
+class OpSize { bit hasOpSizePrefix = 1; }
+class AdSize { bit hasAdSizePrefix = 1; }
+class REX_W { bit hasREX_WPrefix = 1; }
+class LOCK { bit hasLockPrefix = 1; }
+class SegFS { bits<2> SegOvrBits = 1; }
+class SegGS { bits<2> SegOvrBits = 2; }
+class TB { bits<5> Prefix = 1; }
+class REP { bits<5> Prefix = 2; }
+class D8 { bits<5> Prefix = 3; }
+class D9 { bits<5> Prefix = 4; }
+class DA { bits<5> Prefix = 5; }
+class DB { bits<5> Prefix = 6; }
+class DC { bits<5> Prefix = 7; }
+class DD { bits<5> Prefix = 8; }
+class DE { bits<5> Prefix = 9; }
+class DF { bits<5> Prefix = 10; }
+class XD { bits<5> Prefix = 11; }
+class XS { bits<5> Prefix = 12; }
+class T8 { bits<5> Prefix = 13; }
+class TA { bits<5> Prefix = 14; }
+class A6 { bits<5> Prefix = 15; }
+class A7 { bits<5> Prefix = 16; }
+class T8XD { bits<5> Prefix = 17; }
+class T8XS { bits<5> Prefix = 18; }
+class TAXD { bits<5> Prefix = 19; }
+class XOP8 { bits<5> Prefix = 20; }
+class XOP9 { bits<5> Prefix = 21; }
+class VEX { bit hasVEXPrefix = 1; }
+class VEX_W { bit hasVEX_WPrefix = 1; }
+class VEX_4V : VEX { bit hasVEX_4VPrefix = 1; }
+class VEX_4VOp3 : VEX { bit hasVEX_4VOp3Prefix = 1; }
+class VEX_I8IMM { bit hasVEX_i8ImmReg = 1; }
+class VEX_L { bit hasVEX_L = 1; }
+class VEX_LIG { bit ignoresVEX_L = 1; }
+class Has3DNow0F0FOpcode { bit has3DNow0F0FOpcode = 1; }
+class MemOp4 { bit hasMemOp4Prefix = 1; }
+class XOP { bit hasXOP_Prefix = 1; }
+class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
+ string AsmStr,
+ InstrItinClass itin,
+ Domain d = GenericDomain>
+ : Instruction {
+ let Namespace = "X86";
+
+ bits<8> Opcode = opcod;
+ Format Form = f;
+ bits<6> FormBits = Form.Value;
+ ImmType ImmT = i;
+
+ dag OutOperandList = outs;
+ dag InOperandList = ins;
+ string AsmString = AsmStr;
+
+ // If this is a pseudo instruction, mark it isCodeGenOnly.
+ let isCodeGenOnly = !eq(!cast<string>(f), "Pseudo");
+
+ let Itinerary = itin;
+
+ //
+ // Attributes specific to X86 instructions...
+ //
+ bit hasOpSizePrefix = 0; // Does this inst have a 0x66 prefix?
+ bit hasAdSizePrefix = 0; // Does this inst have a 0x67 prefix?
+
+ bits<5> Prefix = 0; // Which prefix byte does this inst have?
+ bit hasREX_WPrefix = 0; // Does this inst require the REX.W prefix?
+ FPFormat FPForm = NotFP; // What flavor of FP instruction is this?
+ bit hasLockPrefix = 0; // Does this inst have a 0xF0 prefix?
+ bits<2> SegOvrBits = 0; // Segment override prefix.
+ Domain ExeDomain = d;
+ bit hasVEXPrefix = 0; // Does this inst require a VEX prefix?
+ bit hasVEX_WPrefix = 0; // Does this inst set the VEX_W field?
+ bit hasVEX_4VPrefix = 0; // Does this inst require the VEX.VVVV field?
+ bit hasVEX_4VOp3Prefix = 0; // Does this inst require the VEX.VVVV field to
+ // encode the third operand?
+ bit hasVEX_i8ImmReg = 0; // Does this inst require the last source register
+ // to be encoded in a immediate field?
+ bit hasVEX_L = 0; // Does this inst use large (256-bit) registers?
+ bit ignoresVEX_L = 0; // Does this instruction ignore the L-bit
+ bit has3DNow0F0FOpcode =0;// Wacky 3dNow! encoding?
+ bit hasMemOp4Prefix = 0; // Same bit as VEX_W, but used for swapping operands
+ bit hasXOP_Prefix = 0; // Does this inst require an XOP prefix?
+
+ // TSFlags layout should be kept in sync with X86InstrInfo.h.
+ let TSFlags{5-0} = FormBits;
+ let TSFlags{6} = hasOpSizePrefix;
+ let TSFlags{7} = hasAdSizePrefix;
+ let TSFlags{12-8} = Prefix;
+ let TSFlags{13} = hasREX_WPrefix;
+ let TSFlags{16-14} = ImmT.Value;
+ let TSFlags{19-17} = FPForm.Value;
+ let TSFlags{20} = hasLockPrefix;
+ let TSFlags{22-21} = SegOvrBits;
+ let TSFlags{24-23} = ExeDomain.Value;
+ let TSFlags{32-25} = Opcode;
+ let TSFlags{33} = hasVEXPrefix;
+ let TSFlags{34} = hasVEX_WPrefix;
+ let TSFlags{35} = hasVEX_4VPrefix;
+ let TSFlags{36} = hasVEX_4VOp3Prefix;
+ let TSFlags{37} = hasVEX_i8ImmReg;
+ let TSFlags{38} = hasVEX_L;
+ let TSFlags{39} = ignoresVEX_L;
+ let TSFlags{40} = has3DNow0F0FOpcode;
+ let TSFlags{41} = hasMemOp4Prefix;
+ let TSFlags{42} = hasXOP_Prefix;
+}
+
+class PseudoI<dag oops, dag iops, list<dag> pattern>
+ : X86Inst<0, Pseudo, NoImm, oops, iops, "", NoItinerary> {
+ let Pattern = pattern;
+}
+
+class I<bits<8> o, Format f, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary,
+ Domain d = GenericDomain>
+ : X86Inst<o, f, NoImm, outs, ins, asm, itin, d> {
+ let Pattern = pattern;
+ let CodeSize = 3;
+}
+class Ii8 <bits<8> o, Format f, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary,
+ Domain d = GenericDomain>
+ : X86Inst<o, f, Imm8, outs, ins, asm, itin, d> {
+ let Pattern = pattern;
+ let CodeSize = 3;
+}
+class Ii8PCRel<bits<8> o, Format f, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : X86Inst<o, f, Imm8PCRel, outs, ins, asm, itin> {
+ let Pattern = pattern;
+ let CodeSize = 3;
+}
+class Ii16<bits<8> o, Format f, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : X86Inst<o, f, Imm16, outs, ins, asm, itin> {
+ let Pattern = pattern;
+ let CodeSize = 3;
+}
+class Ii32<bits<8> o, Format f, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : X86Inst<o, f, Imm32, outs, ins, asm, itin> {
+ let Pattern = pattern;
+ let CodeSize = 3;
+}
+
+class Ii16PCRel<bits<8> o, Format f, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : X86Inst<o, f, Imm16PCRel, outs, ins, asm, itin> {
+ let Pattern = pattern;
+ let CodeSize = 3;
+}
+
+class Ii32PCRel<bits<8> o, Format f, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : X86Inst<o, f, Imm32PCRel, outs, ins, asm, itin> {
+ let Pattern = pattern;
+ let CodeSize = 3;
+}
+
+// FPStack Instruction Templates:
+// FPI - Floating Point Instruction template.
+class FPI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, [], itin> {}
+
+// FpI_ - Floating Point Pseudo Instruction template. Not Predicated.
+class FpI_<dag outs, dag ins, FPFormat fp, list<dag> pattern,
+ InstrItinClass itin = NoItinerary>
+ : X86Inst<0, Pseudo, NoImm, outs, ins, "", itin> {
+ let FPForm = fp;
+ let Pattern = pattern;
+}
+
+// Templates for instructions that use a 16- or 32-bit segmented address as
+// their only operand: lcall (FAR CALL) and ljmp (FAR JMP)
+//
+// Iseg16 - 16-bit segment selector, 16-bit offset
+// Iseg32 - 16-bit segment selector, 32-bit offset
+
+class Iseg16 <bits<8> o, Format f, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : X86Inst<o, f, Imm16, outs, ins, asm, itin> {
+ let Pattern = pattern;
+ let CodeSize = 3;
+}
+
+class Iseg32 <bits<8> o, Format f, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : X86Inst<o, f, Imm32, outs, ins, asm, itin> {
+ let Pattern = pattern;
+ let CodeSize = 3;
+}
+
+def __xs : XS;
+
+// SI - SSE 1 & 2 scalar instructions
+class SI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin> {
+ let Predicates = !if(hasVEXPrefix /* VEX */, [HasAVX],
+ !if(!eq(Prefix, __xs.Prefix), [UseSSE1], [UseSSE2]));
+
+ // AVX instructions have a 'v' prefix in the mnemonic
+ let AsmString = !if(hasVEXPrefix, !strconcat("v", asm), asm);
+}
+
+// SIi8 - SSE 1 & 2 scalar instructions
+class SIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern, itin> {
+ let Predicates = !if(hasVEXPrefix /* VEX */, [HasAVX],
+ !if(!eq(Prefix, __xs.Prefix), [UseSSE1], [UseSSE2]));
+
+ // AVX instructions have a 'v' prefix in the mnemonic
+ let AsmString = !if(hasVEXPrefix, !strconcat("v", asm), asm);
+}
+
+// PI - SSE 1 & 2 packed instructions
+class PI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern,
+ InstrItinClass itin, Domain d>
+ : I<o, F, outs, ins, asm, pattern, itin, d> {
+ let Predicates = !if(hasVEXPrefix /* VEX */, [HasAVX],
+ !if(hasOpSizePrefix /* OpSize */, [UseSSE2], [UseSSE1]));
+
+ // AVX instructions have a 'v' prefix in the mnemonic
+ let AsmString = !if(hasVEXPrefix, !strconcat("v", asm), asm);
+}
+
+// MMXPI - SSE 1 & 2 packed instructions with MMX operands
+class MMXPI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern,
+ InstrItinClass itin, Domain d>
+ : I<o, F, outs, ins, asm, pattern, itin, d> {
+ let Predicates = !if(hasOpSizePrefix /* OpSize */, [HasSSE2], [HasSSE1]);
+}
+
+// PIi8 - SSE 1 & 2 packed instructions with immediate
+class PIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin, Domain d>
+ : Ii8<o, F, outs, ins, asm, pattern, itin, d> {
+ let Predicates = !if(hasVEX_4VPrefix /* VEX */, [HasAVX],
+ !if(hasOpSizePrefix /* OpSize */, [UseSSE2], [UseSSE1]));
+
+ // AVX instructions have a 'v' prefix in the mnemonic
+ let AsmString = !if(hasVEX_4VPrefix, !strconcat("v", asm), asm);
+}
+
+// SSE1 Instruction Templates:
+//
+// SSI - SSE1 instructions with XS prefix.
+// PSI - SSE1 instructions with TB prefix.
+// PSIi8 - SSE1 instructions with ImmT == Imm8 and TB prefix.
+// VSSI - SSE1 instructions with XS prefix in AVX form.
+// VPSI - SSE1 instructions with TB prefix in AVX form.
+
+class SSI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[UseSSE1]>;
+class SSIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[UseSSE1]>;
+class PSI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, TB,
+ Requires<[UseSSE1]>;
+class PSIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, TB,
+ Requires<[UseSSE1]>;
+class VSSI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, XS,
+ Requires<[HasAVX]>;
+class VPSI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin, SSEPackedSingle>, TB,
+ Requires<[HasAVX]>;
+
+// SSE2 Instruction Templates:
+//
+// SDI - SSE2 instructions with XD prefix.
+// SDIi8 - SSE2 instructions with ImmT == Imm8 and XD prefix.
+// S2SI - SSE2 instructions with XS prefix.
+// SSDIi8 - SSE2 instructions with ImmT == Imm8 and XS prefix.
+// PDI - SSE2 instructions with TB and OpSize prefixes.
+// PDIi8 - SSE2 instructions with ImmT == Imm8 and TB and OpSize prefixes.
+// VSDI - SSE2 instructions with XD prefix in AVX form.
+// VPDI - SSE2 instructions with TB and OpSize prefixes in AVX form.
+// MMXSDIi8 - SSE2 instructions with ImmT == Imm8 and XD prefix as well as
+// MMX operands.
+// MMXSSDIi8 - SSE2 instructions with ImmT == Imm8 and XS prefix as well as
+// MMX operands.
+
+class SDI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[UseSSE2]>;
+class SDIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[UseSSE2]>;
+class S2SI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[UseSSE2]>;
+class S2SIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[UseSSE2]>;
+class PDI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, TB, OpSize,
+ Requires<[UseSSE2]>;
+class PDIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, TB, OpSize,
+ Requires<[UseSSE2]>;
+class VSDI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, XD,
+ Requires<[HasAVX]>;
+class VS2SI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, XS,
+ Requires<[HasAVX]>;
+class VPDI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin, SSEPackedDouble>, TB,
+ OpSize, Requires<[HasAVX]>;
+class MMXSDIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[HasSSE2]>;
+class MMXS2SIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[HasSSE2]>;
+
+// SSE3 Instruction Templates:
+//
+// S3I - SSE3 instructions with TB and OpSize prefixes.
+// S3SI - SSE3 instructions with XS prefix.
+// S3DI - SSE3 instructions with XD prefix.
+
+class S3SI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, XS,
+ Requires<[UseSSE3]>;
+class S3DI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, XD,
+ Requires<[UseSSE3]>;
+class S3I<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, TB, OpSize,
+ Requires<[UseSSE3]>;
+
+
+// SSSE3 Instruction Templates:
+//
+// SS38I - SSSE3 instructions with T8 prefix.
+// SS3AI - SSSE3 instructions with TA prefix.
+// MMXSS38I - SSSE3 instructions with T8 prefix and MMX operands.
+// MMXSS3AI - SSSE3 instructions with TA prefix and MMX operands.
+//
+// Note: SSSE3 instructions have 64-bit and 128-bit versions. The 64-bit version
+// uses the MMX registers. The 64-bit versions are grouped with the MMX
+// classes. They need to be enabled even if AVX is enabled.
+
+class SS38I<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8,
+ Requires<[UseSSSE3]>;
+class SS3AI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA,
+ Requires<[UseSSSE3]>;
+class MMXSS38I<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8,
+ Requires<[HasSSSE3]>;
+class MMXSS3AI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA,
+ Requires<[HasSSSE3]>;
+
+// SSE4.1 Instruction Templates:
+//
+// SS48I - SSE 4.1 instructions with T8 prefix.
+// SS41AIi8 - SSE 4.1 instructions with TA prefix and ImmT == Imm8.
+//
+class SS48I<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8,
+ Requires<[UseSSE41]>;
+class SS4AIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA,
+ Requires<[UseSSE41]>;
+
+// SSE4.2 Instruction Templates:
+//
+// SS428I - SSE 4.2 instructions with T8 prefix.
+class SS428I<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8,
+ Requires<[UseSSE42]>;
+
+// SS42FI - SSE 4.2 instructions with T8XD prefix.
+// NOTE: 'HasSSE42' is used as SS42FI is only used for CRC32 insns.
+class SS42FI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin>, T8XD, Requires<[HasSSE42]>;
+
+// SS42AI = SSE 4.2 instructions with TA prefix
+class SS42AI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA,
+ Requires<[UseSSE42]>;
+
+// AVX Instruction Templates:
+// Instructions introduced in AVX (no SSE equivalent forms)
+//
+// AVX8I - AVX instructions with T8 and OpSize prefix.
+// AVXAIi8 - AVX instructions with TA, OpSize prefix and ImmT = Imm8.
+class AVX8I<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8, OpSize,
+ Requires<[HasAVX]>;
+class AVXAIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA, OpSize,
+ Requires<[HasAVX]>;
+
+// AVX2 Instruction Templates:
+// Instructions introduced in AVX2 (no SSE equivalent forms)
+//
+// AVX28I - AVX2 instructions with T8 and OpSize prefix.
+// AVX2AIi8 - AVX2 instructions with TA, OpSize prefix and ImmT = Imm8.
+class AVX28I<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8, OpSize,
+ Requires<[HasAVX2]>;
+class AVX2AIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA, OpSize,
+ Requires<[HasAVX2]>;
+
+// AES Instruction Templates:
+//
+// AES8I
+// These use the same encoding as the SSE4.2 T8 and TA encodings.
+class AES8I<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag>pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8,
+ Requires<[HasAES]>;
+
+class AESAI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA,
+ Requires<[HasAES]>;
+
+// PCLMUL Instruction Templates
+class PCLMULIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag>pattern, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA,
+ OpSize, Requires<[HasPCLMUL]>;
+
+class AVXPCLMULIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag>pattern, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA,
+ OpSize, VEX_4V, Requires<[HasAVX, HasPCLMUL]>;
+
+// FMA3 Instruction Templates
+class FMA3<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag>pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin>, T8,
+ OpSize, VEX_4V, FMASC, Requires<[HasFMA]>;
+
+// FMA4 Instruction Templates
+class FMA4<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag>pattern, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern, itin>, TA,
+ OpSize, VEX_4V, VEX_I8IMM, FMASC, Requires<[HasFMA4]>;
+
+// XOP 2, 3 and 4 Operand Instruction Template
+class IXOP<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>,
+ XOP, XOP9, Requires<[HasXOP]>;
+
+// XOP 2, 3 and 4 Operand Instruction Templates with imm byte
+class IXOPi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>,
+ XOP, XOP8, Requires<[HasXOP]>;
+
+// XOP 5 operand instruction (VEX encoding!)
+class IXOP5<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag>pattern, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA,
+ OpSize, VEX_4V, VEX_I8IMM, Requires<[HasXOP]>;
+
+// X86-64 Instruction templates...
+//
+
+class RI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin>, REX_W;
+class RIi8 <bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern, itin>, REX_W;
+class RIi32 <bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : Ii32<o, F, outs, ins, asm, pattern, itin>, REX_W;
+
+class RIi64<bits<8> o, Format f, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : X86Inst<o, f, Imm64, outs, ins, asm, itin>, REX_W {
+ let Pattern = pattern;
+ let CodeSize = 3;
+}
+
+class RSSI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : SSI<o, F, outs, ins, asm, pattern, itin>, REX_W;
+class RSDI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : SDI<o, F, outs, ins, asm, pattern, itin>, REX_W;
+class RPDI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : PDI<o, F, outs, ins, asm, pattern, itin>, REX_W;
+class VRPDI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : VPDI<o, F, outs, ins, asm, pattern, itin>, VEX_W;
+
+// MMX Instruction templates
+//
+
+// MMXI - MMX instructions with TB prefix.
+// MMXI64 - MMX instructions with TB prefix valid only in 64 bit mode.
+// MMX2I - MMX / SSE2 instructions with TB and OpSize prefixes.
+// MMXIi8 - MMX instructions with ImmT == Imm8 and TB prefix.
+// MMXIi8 - MMX instructions with ImmT == Imm8 and TB prefix.
+// MMXID - MMX instructions with XD prefix.
+// MMXIS - MMX instructions with XS prefix.
+class MMXI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin>, TB, Requires<[HasMMX]>;
+class MMXI64<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin>, TB, Requires<[HasMMX,In64BitMode]>;
+class MMXRI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin>, TB, REX_W, Requires<[HasMMX]>;
+class MMX2I<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin>, TB, OpSize, Requires<[HasMMX]>;
+class MMXIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern, itin>, TB, Requires<[HasMMX]>;
+class MMXID<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[HasMMX]>;
+class MMXIS<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[HasMMX]>;
diff --git a/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
new file mode 100644
index 0000000..2a72fb6
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -0,0 +1,432 @@
+//===-- X86InstrFragmentsSIMD.td - x86 SIMD ISA ------------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides pattern fragments useful for SIMD instructions.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MMX Pattern Fragments
+//===----------------------------------------------------------------------===//
+
+def load_mmx : PatFrag<(ops node:$ptr), (x86mmx (load node:$ptr))>;
+def bc_mmx : PatFrag<(ops node:$in), (x86mmx (bitconvert node:$in))>;
+
+//===----------------------------------------------------------------------===//
+// SSE specific DAG Nodes.
+//===----------------------------------------------------------------------===//
+
+def SDTX86FPShiftOp : SDTypeProfile<1, 2, [ SDTCisSameAs<0, 1>,
+ SDTCisFP<0>, SDTCisInt<2> ]>;
+def SDTX86VFCMP : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<1, 2>,
+ SDTCisFP<1>, SDTCisVT<3, i8>]>;
+
+def X86umin : SDNode<"X86ISD::UMIN", SDTIntBinOp>;
+def X86umax : SDNode<"X86ISD::UMAX", SDTIntBinOp>;
+def X86smin : SDNode<"X86ISD::SMIN", SDTIntBinOp>;
+def X86smax : SDNode<"X86ISD::SMAX", SDTIntBinOp>;
+
+def X86fmin : SDNode<"X86ISD::FMIN", SDTFPBinOp>;
+def X86fmax : SDNode<"X86ISD::FMAX", SDTFPBinOp>;
+
+// Commutative and Associative FMIN and FMAX.
+def X86fminc : SDNode<"X86ISD::FMINC", SDTFPBinOp,
+ [SDNPCommutative, SDNPAssociative]>;
+def X86fmaxc : SDNode<"X86ISD::FMAXC", SDTFPBinOp,
+ [SDNPCommutative, SDNPAssociative]>;
+
+def X86fand : SDNode<"X86ISD::FAND", SDTFPBinOp,
+ [SDNPCommutative, SDNPAssociative]>;
+def X86for : SDNode<"X86ISD::FOR", SDTFPBinOp,
+ [SDNPCommutative, SDNPAssociative]>;
+def X86fxor : SDNode<"X86ISD::FXOR", SDTFPBinOp,
+ [SDNPCommutative, SDNPAssociative]>;
+def X86frsqrt : SDNode<"X86ISD::FRSQRT", SDTFPUnaryOp>;
+def X86frcp : SDNode<"X86ISD::FRCP", SDTFPUnaryOp>;
+def X86fsrl : SDNode<"X86ISD::FSRL", SDTX86FPShiftOp>;
+def X86fgetsign: SDNode<"X86ISD::FGETSIGNx86",SDTFPToIntOp>;
+def X86fhadd : SDNode<"X86ISD::FHADD", SDTFPBinOp>;
+def X86fhsub : SDNode<"X86ISD::FHSUB", SDTFPBinOp>;
+def X86hadd : SDNode<"X86ISD::HADD", SDTIntBinOp>;
+def X86hsub : SDNode<"X86ISD::HSUB", SDTIntBinOp>;
+def X86comi : SDNode<"X86ISD::COMI", SDTX86CmpTest>;
+def X86ucomi : SDNode<"X86ISD::UCOMI", SDTX86CmpTest>;
+def X86cmpss : SDNode<"X86ISD::FSETCCss", SDTX86Cmpss>;
+def X86cmpsd : SDNode<"X86ISD::FSETCCsd", SDTX86Cmpsd>;
+def X86pshufb : SDNode<"X86ISD::PSHUFB",
+ SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>]>>;
+def X86andnp : SDNode<"X86ISD::ANDNP",
+ SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>]>>;
+def X86psign : SDNode<"X86ISD::PSIGN",
+ SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>]>>;
+def X86pextrb : SDNode<"X86ISD::PEXTRB",
+ SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<2>]>>;
+def X86pextrw : SDNode<"X86ISD::PEXTRW",
+ SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<2>]>>;
+def X86pinsrb : SDNode<"X86ISD::PINSRB",
+ SDTypeProfile<1, 3, [SDTCisVT<0, v16i8>, SDTCisSameAs<0,1>,
+ SDTCisVT<2, i32>, SDTCisPtrTy<3>]>>;
+def X86pinsrw : SDNode<"X86ISD::PINSRW",
+ SDTypeProfile<1, 3, [SDTCisVT<0, v8i16>, SDTCisSameAs<0,1>,
+ SDTCisVT<2, i32>, SDTCisPtrTy<3>]>>;
+def X86insrtps : SDNode<"X86ISD::INSERTPS",
+ SDTypeProfile<1, 3, [SDTCisVT<0, v4f32>, SDTCisSameAs<0,1>,
+ SDTCisVT<2, v4f32>, SDTCisPtrTy<3>]>>;
+def X86vzmovl : SDNode<"X86ISD::VZEXT_MOVL",
+ SDTypeProfile<1, 1, [SDTCisSameAs<0,1>]>>;
+
+def X86vzmovly : SDNode<"X86ISD::VZEXT_MOVL",
+ SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCisOpSmallerThanOp<1, 0> ]>>;
+
+def X86vsmovl : SDNode<"X86ISD::VSEXT_MOVL",
+ SDTypeProfile<1, 1,
+ [SDTCisVec<0>, SDTCisInt<1>, SDTCisInt<0>]>>;
+
+def X86vzload : SDNode<"X86ISD::VZEXT_LOAD", SDTLoad,
+ [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+
+def X86vzext : SDNode<"X86ISD::VZEXT",
+ SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCisInt<0>, SDTCisInt<1>]>>;
+
+def X86vsext : SDNode<"X86ISD::VSEXT",
+ SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCisInt<0>, SDTCisInt<1>]>>;
+
+def X86vfpext : SDNode<"X86ISD::VFPEXT",
+ SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCisFP<0>, SDTCisFP<1>]>>;
+def X86vfpround: SDNode<"X86ISD::VFPROUND",
+ SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCisFP<0>, SDTCisFP<1>]>>;
+
+def X86vshldq : SDNode<"X86ISD::VSHLDQ", SDTIntShiftOp>;
+def X86vshrdq : SDNode<"X86ISD::VSRLDQ", SDTIntShiftOp>;
+def X86cmpp : SDNode<"X86ISD::CMPP", SDTX86VFCMP>;
+def X86pcmpeq : SDNode<"X86ISD::PCMPEQ", SDTIntBinOp, [SDNPCommutative]>;
+def X86pcmpgt : SDNode<"X86ISD::PCMPGT", SDTIntBinOp>;
+
+def X86vshl : SDNode<"X86ISD::VSHL",
+ SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisVec<2>]>>;
+def X86vsrl : SDNode<"X86ISD::VSRL",
+ SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisVec<2>]>>;
+def X86vsra : SDNode<"X86ISD::VSRA",
+ SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisVec<2>]>>;
+
+def X86vshli : SDNode<"X86ISD::VSHLI", SDTIntShiftOp>;
+def X86vsrli : SDNode<"X86ISD::VSRLI", SDTIntShiftOp>;
+def X86vsrai : SDNode<"X86ISD::VSRAI", SDTIntShiftOp>;
+
+def SDTX86CmpPTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>,
+ SDTCisVec<1>,
+ SDTCisSameAs<2, 1>]>;
+def X86subus : SDNode<"X86ISD::SUBUS", SDTIntBinOp>;
+def X86ptest : SDNode<"X86ISD::PTEST", SDTX86CmpPTest>;
+def X86testp : SDNode<"X86ISD::TESTP", SDTX86CmpPTest>;
+
+def X86pmuludq : SDNode<"X86ISD::PMULUDQ",
+ SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCisSameAs<1,2>]>>;
+
+// Specific shuffle nodes - At some point ISD::VECTOR_SHUFFLE will always get
+// translated into one of the target nodes below during lowering.
+// Note: this is a work in progress...
+def SDTShuff1Op : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>;
+def SDTShuff2Op : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>]>;
+
+def SDTShuff2OpI : SDTypeProfile<1, 2, [SDTCisVec<0>,
+ SDTCisSameAs<0,1>, SDTCisInt<2>]>;
+def SDTShuff3OpI : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>, SDTCisInt<3>]>;
+
+def SDTVBroadcast : SDTypeProfile<1, 1, [SDTCisVec<0>]>;
+def SDTBlend : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<1,2>, SDTCisVT<3, i32>]>;
+
+def SDTFma : SDTypeProfile<1, 3, [SDTCisSameAs<0,1>,
+ SDTCisSameAs<1,2>, SDTCisSameAs<1,3>]>;
+
+def X86PAlignr : SDNode<"X86ISD::PALIGNR", SDTShuff3OpI>;
+
+def X86PShufd : SDNode<"X86ISD::PSHUFD", SDTShuff2OpI>;
+def X86PShufhw : SDNode<"X86ISD::PSHUFHW", SDTShuff2OpI>;
+def X86PShuflw : SDNode<"X86ISD::PSHUFLW", SDTShuff2OpI>;
+
+def X86Shufp : SDNode<"X86ISD::SHUFP", SDTShuff3OpI>;
+
+def X86Movddup : SDNode<"X86ISD::MOVDDUP", SDTShuff1Op>;
+def X86Movshdup : SDNode<"X86ISD::MOVSHDUP", SDTShuff1Op>;
+def X86Movsldup : SDNode<"X86ISD::MOVSLDUP", SDTShuff1Op>;
+
+def X86Movsd : SDNode<"X86ISD::MOVSD", SDTShuff2Op>;
+def X86Movss : SDNode<"X86ISD::MOVSS", SDTShuff2Op>;
+
+def X86Movlhps : SDNode<"X86ISD::MOVLHPS", SDTShuff2Op>;
+def X86Movlhpd : SDNode<"X86ISD::MOVLHPD", SDTShuff2Op>;
+def X86Movhlps : SDNode<"X86ISD::MOVHLPS", SDTShuff2Op>;
+
+def X86Movlps : SDNode<"X86ISD::MOVLPS", SDTShuff2Op>;
+def X86Movlpd : SDNode<"X86ISD::MOVLPD", SDTShuff2Op>;
+
+def X86Unpckl : SDNode<"X86ISD::UNPCKL", SDTShuff2Op>;
+def X86Unpckh : SDNode<"X86ISD::UNPCKH", SDTShuff2Op>;
+
+def X86VPermilp : SDNode<"X86ISD::VPERMILP", SDTShuff2OpI>;
+def X86VPermv : SDNode<"X86ISD::VPERMV", SDTShuff2Op>;
+def X86VPermi : SDNode<"X86ISD::VPERMI", SDTShuff2OpI>;
+
+def X86VPerm2x128 : SDNode<"X86ISD::VPERM2X128", SDTShuff3OpI>;
+
+def X86VBroadcast : SDNode<"X86ISD::VBROADCAST", SDTVBroadcast>;
+
+def X86Blendi : SDNode<"X86ISD::BLENDI", SDTBlend>;
+def X86Fmadd : SDNode<"X86ISD::FMADD", SDTFma>;
+def X86Fnmadd : SDNode<"X86ISD::FNMADD", SDTFma>;
+def X86Fmsub : SDNode<"X86ISD::FMSUB", SDTFma>;
+def X86Fnmsub : SDNode<"X86ISD::FNMSUB", SDTFma>;
+def X86Fmaddsub : SDNode<"X86ISD::FMADDSUB", SDTFma>;
+def X86Fmsubadd : SDNode<"X86ISD::FMSUBADD", SDTFma>;
+
+def SDT_PCMPISTRI : SDTypeProfile<2, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
+ SDTCisVT<2, v16i8>, SDTCisVT<3, v16i8>,
+ SDTCisVT<4, i8>]>;
+def SDT_PCMPESTRI : SDTypeProfile<2, 5, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
+ SDTCisVT<2, v16i8>, SDTCisVT<3, i32>,
+ SDTCisVT<4, v16i8>, SDTCisVT<5, i32>,
+ SDTCisVT<6, i8>]>;
+
+def X86pcmpistri : SDNode<"X86ISD::PCMPISTRI", SDT_PCMPISTRI>;
+def X86pcmpestri : SDNode<"X86ISD::PCMPESTRI", SDT_PCMPESTRI>;
+
+//===----------------------------------------------------------------------===//
+// SSE Complex Patterns
+//===----------------------------------------------------------------------===//
+
+// These are 'extloads' from a scalar to the low element of a vector, zeroing
+// the top elements. These are used for the SSE 'ss' and 'sd' instruction
+// forms.
+def sse_load_f32 : ComplexPattern<v4f32, 5, "SelectScalarSSELoad", [],
+ [SDNPHasChain, SDNPMayLoad, SDNPMemOperand,
+ SDNPWantRoot]>;
+def sse_load_f64 : ComplexPattern<v2f64, 5, "SelectScalarSSELoad", [],
+ [SDNPHasChain, SDNPMayLoad, SDNPMemOperand,
+ SDNPWantRoot]>;
+
+def ssmem : Operand<v4f32> {
+ let PrintMethod = "printf32mem";
+ let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc_nosp, i32imm, i8imm);
+ let ParserMatchClass = X86MemAsmOperand;
+ let OperandType = "OPERAND_MEMORY";
+}
+def sdmem : Operand<v2f64> {
+ let PrintMethod = "printf64mem";
+ let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc_nosp, i32imm, i8imm);
+ let ParserMatchClass = X86MemAsmOperand;
+ let OperandType = "OPERAND_MEMORY";
+}
+
+//===----------------------------------------------------------------------===//
+// SSE pattern fragments
+//===----------------------------------------------------------------------===//
+
+// 128-bit load pattern fragments
+// NOTE: all 128-bit integer vector loads are promoted to v2i64
+def loadv4f32 : PatFrag<(ops node:$ptr), (v4f32 (load node:$ptr))>;
+def loadv2f64 : PatFrag<(ops node:$ptr), (v2f64 (load node:$ptr))>;
+def loadv2i64 : PatFrag<(ops node:$ptr), (v2i64 (load node:$ptr))>;
+
+// 256-bit load pattern fragments
+// NOTE: all 256-bit integer vector loads are promoted to v4i64
+def loadv8f32 : PatFrag<(ops node:$ptr), (v8f32 (load node:$ptr))>;
+def loadv4f64 : PatFrag<(ops node:$ptr), (v4f64 (load node:$ptr))>;
+def loadv4i64 : PatFrag<(ops node:$ptr), (v4i64 (load node:$ptr))>;
+
+// 128-/256-bit extload pattern fragments
+def extloadv2f32 : PatFrag<(ops node:$ptr), (v2f64 (extloadvf32 node:$ptr))>;
+def extloadv4f32 : PatFrag<(ops node:$ptr), (v4f64 (extloadvf32 node:$ptr))>;
+
+// Like 'store', but always requires 128-bit vector alignment.
+def alignedstore : PatFrag<(ops node:$val, node:$ptr),
+ (store node:$val, node:$ptr), [{
+ return cast<StoreSDNode>(N)->getAlignment() >= 16;
+}]>;
+
+// Like 'store', but always requires 256-bit vector alignment.
+def alignedstore256 : PatFrag<(ops node:$val, node:$ptr),
+ (store node:$val, node:$ptr), [{
+ return cast<StoreSDNode>(N)->getAlignment() >= 32;
+}]>;
+
+// Like 'load', but always requires 128-bit vector alignment.
+def alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ return cast<LoadSDNode>(N)->getAlignment() >= 16;
+}]>;
+
+// Like 'X86vzload', but always requires 128-bit vector alignment.
+def alignedX86vzload : PatFrag<(ops node:$ptr), (X86vzload node:$ptr), [{
+ return cast<MemSDNode>(N)->getAlignment() >= 16;
+}]>;
+
+// Like 'load', but always requires 256-bit vector alignment.
+def alignedload256 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ return cast<LoadSDNode>(N)->getAlignment() >= 32;
+}]>;
+
+def alignedloadfsf32 : PatFrag<(ops node:$ptr),
+ (f32 (alignedload node:$ptr))>;
+def alignedloadfsf64 : PatFrag<(ops node:$ptr),
+ (f64 (alignedload node:$ptr))>;
+
+// 128-bit aligned load pattern fragments
+// NOTE: all 128-bit integer vector loads are promoted to v2i64
+def alignedloadv4f32 : PatFrag<(ops node:$ptr),
+ (v4f32 (alignedload node:$ptr))>;
+def alignedloadv2f64 : PatFrag<(ops node:$ptr),
+ (v2f64 (alignedload node:$ptr))>;
+def alignedloadv2i64 : PatFrag<(ops node:$ptr),
+ (v2i64 (alignedload node:$ptr))>;
+
+// 256-bit aligned load pattern fragments
+// NOTE: all 256-bit integer vector loads are promoted to v4i64
+def alignedloadv8f32 : PatFrag<(ops node:$ptr),
+ (v8f32 (alignedload256 node:$ptr))>;
+def alignedloadv4f64 : PatFrag<(ops node:$ptr),
+ (v4f64 (alignedload256 node:$ptr))>;
+def alignedloadv4i64 : PatFrag<(ops node:$ptr),
+ (v4i64 (alignedload256 node:$ptr))>;
+
+// Like 'load', but uses special alignment checks suitable for use in
+// memory operands in most SSE instructions, which are required to
+// be naturally aligned on some targets but not on others. If the subtarget
+// allows unaligned accesses, match any load, though this may require
+// setting a feature bit in the processor (on startup, for example).
+// Opteron 10h and later implement such a feature.
+def memop : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ return Subtarget->hasVectorUAMem()
+ || cast<LoadSDNode>(N)->getAlignment() >= 16;
+}]>;
+
+def memopfsf32 : PatFrag<(ops node:$ptr), (f32 (memop node:$ptr))>;
+def memopfsf64 : PatFrag<(ops node:$ptr), (f64 (memop node:$ptr))>;
+
+// 128-bit memop pattern fragments
+// NOTE: all 128-bit integer vector loads are promoted to v2i64
+def memopv4f32 : PatFrag<(ops node:$ptr), (v4f32 (memop node:$ptr))>;
+def memopv2f64 : PatFrag<(ops node:$ptr), (v2f64 (memop node:$ptr))>;
+def memopv2i64 : PatFrag<(ops node:$ptr), (v2i64 (memop node:$ptr))>;
+
+// 256-bit memop pattern fragments
+// NOTE: all 256-bit integer vector loads are promoted to v4i64
+def memopv8f32 : PatFrag<(ops node:$ptr), (v8f32 (memop node:$ptr))>;
+def memopv4f64 : PatFrag<(ops node:$ptr), (v4f64 (memop node:$ptr))>;
+def memopv4i64 : PatFrag<(ops node:$ptr), (v4i64 (memop node:$ptr))>;
+
+// SSSE3 uses MMX registers for some instructions. They aren't aligned on a
+// 16-byte boundary.
+// FIXME: 8 byte alignment for mmx reads is not required
+def memop64 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ return cast<LoadSDNode>(N)->getAlignment() >= 8;
+}]>;
+
+def memopmmx : PatFrag<(ops node:$ptr), (x86mmx (memop64 node:$ptr))>;
+
+// MOVNT Support
+// Like 'store', but requires the non-temporal bit to be set
+def nontemporalstore : PatFrag<(ops node:$val, node:$ptr),
+ (st node:$val, node:$ptr), [{
+ if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N))
+ return ST->isNonTemporal();
+ return false;
+}]>;
+
+def alignednontemporalstore : PatFrag<(ops node:$val, node:$ptr),
+ (st node:$val, node:$ptr), [{
+ if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N))
+ return ST->isNonTemporal() && !ST->isTruncatingStore() &&
+ ST->getAddressingMode() == ISD::UNINDEXED &&
+ ST->getAlignment() >= 16;
+ return false;
+}]>;
+
+def unalignednontemporalstore : PatFrag<(ops node:$val, node:$ptr),
+ (st node:$val, node:$ptr), [{
+ if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N))
+ return ST->isNonTemporal() &&
+ ST->getAlignment() < 16;
+ return false;
+}]>;
+
+// 128-bit bitconvert pattern fragments
+def bc_v4f32 : PatFrag<(ops node:$in), (v4f32 (bitconvert node:$in))>;
+def bc_v2f64 : PatFrag<(ops node:$in), (v2f64 (bitconvert node:$in))>;
+def bc_v16i8 : PatFrag<(ops node:$in), (v16i8 (bitconvert node:$in))>;
+def bc_v8i16 : PatFrag<(ops node:$in), (v8i16 (bitconvert node:$in))>;
+def bc_v4i32 : PatFrag<(ops node:$in), (v4i32 (bitconvert node:$in))>;
+def bc_v2i64 : PatFrag<(ops node:$in), (v2i64 (bitconvert node:$in))>;
+
+// 256-bit bitconvert pattern fragments
+def bc_v32i8 : PatFrag<(ops node:$in), (v32i8 (bitconvert node:$in))>;
+def bc_v16i16 : PatFrag<(ops node:$in), (v16i16 (bitconvert node:$in))>;
+def bc_v8i32 : PatFrag<(ops node:$in), (v8i32 (bitconvert node:$in))>;
+def bc_v4i64 : PatFrag<(ops node:$in), (v4i64 (bitconvert node:$in))>;
+
+def vzmovl_v2i64 : PatFrag<(ops node:$src),
+ (bitconvert (v2i64 (X86vzmovl
+ (v2i64 (scalar_to_vector (loadi64 node:$src))))))>;
+def vzmovl_v4i32 : PatFrag<(ops node:$src),
+ (bitconvert (v4i32 (X86vzmovl
+ (v4i32 (scalar_to_vector (loadi32 node:$src))))))>;
+
+def vzload_v2i64 : PatFrag<(ops node:$src),
+ (bitconvert (v2i64 (X86vzload node:$src)))>;
+
+
+def fp32imm0 : PatLeaf<(f32 fpimm), [{
+ return N->isExactlyValue(+0.0);
+}]>;
+
+// BYTE_imm - Transform bit immediates into byte immediates.
+def BYTE_imm : SDNodeXForm<imm, [{
+ // Transformation function: imm >> 3
+ return getI32Imm(N->getZExtValue() >> 3);
+}]>;
+
+// EXTRACT_get_vextractf128_imm xform function: convert extract_subvector index
+// to VEXTRACTF128 imm.
+def EXTRACT_get_vextractf128_imm : SDNodeXForm<extract_subvector, [{
+ return getI8Imm(X86::getExtractVEXTRACTF128Immediate(N));
+}]>;
+
+// INSERT_get_vinsertf128_imm xform function: convert insert_subvector index to
+// VINSERTF128 imm.
+def INSERT_get_vinsertf128_imm : SDNodeXForm<insert_subvector, [{
+ return getI8Imm(X86::getInsertVINSERTF128Immediate(N));
+}]>;
+
+def vextractf128_extract : PatFrag<(ops node:$bigvec, node:$index),
+ (extract_subvector node:$bigvec,
+ node:$index), [{
+ return X86::isVEXTRACTF128Index(N);
+}], EXTRACT_get_vextractf128_imm>;
+
+def vinsertf128_insert : PatFrag<(ops node:$bigvec, node:$smallvec,
+ node:$index),
+ (insert_subvector node:$bigvec, node:$smallvec,
+ node:$index), [{
+ return X86::isVINSERTF128Index(N);
+}], INSERT_get_vinsertf128_imm>;
+
diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp b/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp
new file mode 100644
index 0000000..7c0423f
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -0,0 +1,4896 @@
+//===-- X86InstrInfo.cpp - X86 Instruction Information --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86InstrInfo.h"
+#include "X86.h"
+#include "X86InstrBuilder.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetOptions.h"
+#include <limits>
+
+#define GET_INSTRINFO_CTOR
+#include "X86GenInstrInfo.inc"
+
+using namespace llvm;
+
+static cl::opt<bool>
+NoFusing("disable-spill-fusing",
+ cl::desc("Disable fusing of spill code into instructions"));
+static cl::opt<bool>
+PrintFailedFusing("print-failed-fuse-candidates",
+ cl::desc("Print instructions that the allocator wants to"
+ " fuse, but the X86 backend currently can't"),
+ cl::Hidden);
+static cl::opt<bool>
+ReMatPICStubLoad("remat-pic-stub-load",
+ cl::desc("Re-materialize load from stub in PIC mode"),
+ cl::init(false), cl::Hidden);
+
+enum {
+ // Select which memory operand is being unfolded.
+ // (stored in bits 0 - 3)
+ TB_INDEX_0 = 0,
+ TB_INDEX_1 = 1,
+ TB_INDEX_2 = 2,
+ TB_INDEX_3 = 3,
+ TB_INDEX_MASK = 0xf,
+
+ // Do not insert the reverse map (MemOp -> RegOp) into the table.
+ // This may be needed because there is a many -> one mapping.
+ TB_NO_REVERSE = 1 << 4,
+
+ // Do not insert the forward map (RegOp -> MemOp) into the table.
+ // This is needed for Native Client, which prohibits branch
+ // instructions from using a memory operand.
+ TB_NO_FORWARD = 1 << 5,
+
+ TB_FOLDED_LOAD = 1 << 6,
+ TB_FOLDED_STORE = 1 << 7,
+
+ // Minimum alignment required for load/store.
+ // Used for RegOp->MemOp conversion.
+ // (stored in bits 8 - 15)
+ TB_ALIGN_SHIFT = 8,
+ TB_ALIGN_NONE = 0 << TB_ALIGN_SHIFT,
+ TB_ALIGN_16 = 16 << TB_ALIGN_SHIFT,
+ TB_ALIGN_32 = 32 << TB_ALIGN_SHIFT,
+ TB_ALIGN_MASK = 0xff << TB_ALIGN_SHIFT
+};
+
+struct X86OpTblEntry {
+ uint16_t RegOp;
+ uint16_t MemOp;
+ uint16_t Flags;
+};
+
+X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
+ : X86GenInstrInfo((tm.getSubtarget<X86Subtarget>().is64Bit()
+ ? X86::ADJCALLSTACKDOWN64
+ : X86::ADJCALLSTACKDOWN32),
+ (tm.getSubtarget<X86Subtarget>().is64Bit()
+ ? X86::ADJCALLSTACKUP64
+ : X86::ADJCALLSTACKUP32)),
+ TM(tm), RI(tm, *this) {
+
+ static const X86OpTblEntry OpTbl2Addr[] = {
+ { X86::ADC32ri, X86::ADC32mi, 0 },
+ { X86::ADC32ri8, X86::ADC32mi8, 0 },
+ { X86::ADC32rr, X86::ADC32mr, 0 },
+ { X86::ADC64ri32, X86::ADC64mi32, 0 },
+ { X86::ADC64ri8, X86::ADC64mi8, 0 },
+ { X86::ADC64rr, X86::ADC64mr, 0 },
+ { X86::ADD16ri, X86::ADD16mi, 0 },
+ { X86::ADD16ri8, X86::ADD16mi8, 0 },
+ { X86::ADD16ri_DB, X86::ADD16mi, TB_NO_REVERSE },
+ { X86::ADD16ri8_DB, X86::ADD16mi8, TB_NO_REVERSE },
+ { X86::ADD16rr, X86::ADD16mr, 0 },
+ { X86::ADD16rr_DB, X86::ADD16mr, TB_NO_REVERSE },
+ { X86::ADD32ri, X86::ADD32mi, 0 },
+ { X86::ADD32ri8, X86::ADD32mi8, 0 },
+ { X86::ADD32ri_DB, X86::ADD32mi, TB_NO_REVERSE },
+ { X86::ADD32ri8_DB, X86::ADD32mi8, TB_NO_REVERSE },
+ { X86::ADD32rr, X86::ADD32mr, 0 },
+ { X86::ADD32rr_DB, X86::ADD32mr, TB_NO_REVERSE },
+ { X86::ADD64ri32, X86::ADD64mi32, 0 },
+ { X86::ADD64ri8, X86::ADD64mi8, 0 },
+ { X86::ADD64ri32_DB,X86::ADD64mi32, TB_NO_REVERSE },
+ { X86::ADD64ri8_DB, X86::ADD64mi8, TB_NO_REVERSE },
+ { X86::ADD64rr, X86::ADD64mr, 0 },
+ { X86::ADD64rr_DB, X86::ADD64mr, TB_NO_REVERSE },
+ { X86::ADD8ri, X86::ADD8mi, 0 },
+ { X86::ADD8rr, X86::ADD8mr, 0 },
+ { X86::AND16ri, X86::AND16mi, 0 },
+ { X86::AND16ri8, X86::AND16mi8, 0 },
+ { X86::AND16rr, X86::AND16mr, 0 },
+ { X86::AND32ri, X86::AND32mi, 0 },
+ { X86::AND32ri8, X86::AND32mi8, 0 },
+ { X86::AND32rr, X86::AND32mr, 0 },
+ { X86::AND64ri32, X86::AND64mi32, 0 },
+ { X86::AND64ri8, X86::AND64mi8, 0 },
+ { X86::AND64rr, X86::AND64mr, 0 },
+ { X86::AND8ri, X86::AND8mi, 0 },
+ { X86::AND8rr, X86::AND8mr, 0 },
+ { X86::DEC16r, X86::DEC16m, 0 },
+ { X86::DEC32r, X86::DEC32m, 0 },
+ { X86::DEC64_16r, X86::DEC64_16m, 0 },
+ { X86::DEC64_32r, X86::DEC64_32m, 0 },
+ { X86::DEC64r, X86::DEC64m, 0 },
+ { X86::DEC8r, X86::DEC8m, 0 },
+ { X86::INC16r, X86::INC16m, 0 },
+ { X86::INC32r, X86::INC32m, 0 },
+ { X86::INC64_16r, X86::INC64_16m, 0 },
+ { X86::INC64_32r, X86::INC64_32m, 0 },
+ { X86::INC64r, X86::INC64m, 0 },
+ { X86::INC8r, X86::INC8m, 0 },
+ { X86::NEG16r, X86::NEG16m, 0 },
+ { X86::NEG32r, X86::NEG32m, 0 },
+ { X86::NEG64r, X86::NEG64m, 0 },
+ { X86::NEG8r, X86::NEG8m, 0 },
+ { X86::NOT16r, X86::NOT16m, 0 },
+ { X86::NOT32r, X86::NOT32m, 0 },
+ { X86::NOT64r, X86::NOT64m, 0 },
+ { X86::NOT8r, X86::NOT8m, 0 },
+ { X86::OR16ri, X86::OR16mi, 0 },
+ { X86::OR16ri8, X86::OR16mi8, 0 },
+ { X86::OR16rr, X86::OR16mr, 0 },
+ { X86::OR32ri, X86::OR32mi, 0 },
+ { X86::OR32ri8, X86::OR32mi8, 0 },
+ { X86::OR32rr, X86::OR32mr, 0 },
+ { X86::OR64ri32, X86::OR64mi32, 0 },
+ { X86::OR64ri8, X86::OR64mi8, 0 },
+ { X86::OR64rr, X86::OR64mr, 0 },
+ { X86::OR8ri, X86::OR8mi, 0 },
+ { X86::OR8rr, X86::OR8mr, 0 },
+ { X86::ROL16r1, X86::ROL16m1, 0 },
+ { X86::ROL16rCL, X86::ROL16mCL, 0 },
+ { X86::ROL16ri, X86::ROL16mi, 0 },
+ { X86::ROL32r1, X86::ROL32m1, 0 },
+ { X86::ROL32rCL, X86::ROL32mCL, 0 },
+ { X86::ROL32ri, X86::ROL32mi, 0 },
+ { X86::ROL64r1, X86::ROL64m1, 0 },
+ { X86::ROL64rCL, X86::ROL64mCL, 0 },
+ { X86::ROL64ri, X86::ROL64mi, 0 },
+ { X86::ROL8r1, X86::ROL8m1, 0 },
+ { X86::ROL8rCL, X86::ROL8mCL, 0 },
+ { X86::ROL8ri, X86::ROL8mi, 0 },
+ { X86::ROR16r1, X86::ROR16m1, 0 },
+ { X86::ROR16rCL, X86::ROR16mCL, 0 },
+ { X86::ROR16ri, X86::ROR16mi, 0 },
+ { X86::ROR32r1, X86::ROR32m1, 0 },
+ { X86::ROR32rCL, X86::ROR32mCL, 0 },
+ { X86::ROR32ri, X86::ROR32mi, 0 },
+ { X86::ROR64r1, X86::ROR64m1, 0 },
+ { X86::ROR64rCL, X86::ROR64mCL, 0 },
+ { X86::ROR64ri, X86::ROR64mi, 0 },
+ { X86::ROR8r1, X86::ROR8m1, 0 },
+ { X86::ROR8rCL, X86::ROR8mCL, 0 },
+ { X86::ROR8ri, X86::ROR8mi, 0 },
+ { X86::SAR16r1, X86::SAR16m1, 0 },
+ { X86::SAR16rCL, X86::SAR16mCL, 0 },
+ { X86::SAR16ri, X86::SAR16mi, 0 },
+ { X86::SAR32r1, X86::SAR32m1, 0 },
+ { X86::SAR32rCL, X86::SAR32mCL, 0 },
+ { X86::SAR32ri, X86::SAR32mi, 0 },
+ { X86::SAR64r1, X86::SAR64m1, 0 },
+ { X86::SAR64rCL, X86::SAR64mCL, 0 },
+ { X86::SAR64ri, X86::SAR64mi, 0 },
+ { X86::SAR8r1, X86::SAR8m1, 0 },
+ { X86::SAR8rCL, X86::SAR8mCL, 0 },
+ { X86::SAR8ri, X86::SAR8mi, 0 },
+ { X86::SBB32ri, X86::SBB32mi, 0 },
+ { X86::SBB32ri8, X86::SBB32mi8, 0 },
+ { X86::SBB32rr, X86::SBB32mr, 0 },
+ { X86::SBB64ri32, X86::SBB64mi32, 0 },
+ { X86::SBB64ri8, X86::SBB64mi8, 0 },
+ { X86::SBB64rr, X86::SBB64mr, 0 },
+ { X86::SHL16rCL, X86::SHL16mCL, 0 },
+ { X86::SHL16ri, X86::SHL16mi, 0 },
+ { X86::SHL32rCL, X86::SHL32mCL, 0 },
+ { X86::SHL32ri, X86::SHL32mi, 0 },
+ { X86::SHL64rCL, X86::SHL64mCL, 0 },
+ { X86::SHL64ri, X86::SHL64mi, 0 },
+ { X86::SHL8rCL, X86::SHL8mCL, 0 },
+ { X86::SHL8ri, X86::SHL8mi, 0 },
+ { X86::SHLD16rrCL, X86::SHLD16mrCL, 0 },
+ { X86::SHLD16rri8, X86::SHLD16mri8, 0 },
+ { X86::SHLD32rrCL, X86::SHLD32mrCL, 0 },
+ { X86::SHLD32rri8, X86::SHLD32mri8, 0 },
+ { X86::SHLD64rrCL, X86::SHLD64mrCL, 0 },
+ { X86::SHLD64rri8, X86::SHLD64mri8, 0 },
+ { X86::SHR16r1, X86::SHR16m1, 0 },
+ { X86::SHR16rCL, X86::SHR16mCL, 0 },
+ { X86::SHR16ri, X86::SHR16mi, 0 },
+ { X86::SHR32r1, X86::SHR32m1, 0 },
+ { X86::SHR32rCL, X86::SHR32mCL, 0 },
+ { X86::SHR32ri, X86::SHR32mi, 0 },
+ { X86::SHR64r1, X86::SHR64m1, 0 },
+ { X86::SHR64rCL, X86::SHR64mCL, 0 },
+ { X86::SHR64ri, X86::SHR64mi, 0 },
+ { X86::SHR8r1, X86::SHR8m1, 0 },
+ { X86::SHR8rCL, X86::SHR8mCL, 0 },
+ { X86::SHR8ri, X86::SHR8mi, 0 },
+ { X86::SHRD16rrCL, X86::SHRD16mrCL, 0 },
+ { X86::SHRD16rri8, X86::SHRD16mri8, 0 },
+ { X86::SHRD32rrCL, X86::SHRD32mrCL, 0 },
+ { X86::SHRD32rri8, X86::SHRD32mri8, 0 },
+ { X86::SHRD64rrCL, X86::SHRD64mrCL, 0 },
+ { X86::SHRD64rri8, X86::SHRD64mri8, 0 },
+ { X86::SUB16ri, X86::SUB16mi, 0 },
+ { X86::SUB16ri8, X86::SUB16mi8, 0 },
+ { X86::SUB16rr, X86::SUB16mr, 0 },
+ { X86::SUB32ri, X86::SUB32mi, 0 },
+ { X86::SUB32ri8, X86::SUB32mi8, 0 },
+ { X86::SUB32rr, X86::SUB32mr, 0 },
+ { X86::SUB64ri32, X86::SUB64mi32, 0 },
+ { X86::SUB64ri8, X86::SUB64mi8, 0 },
+ { X86::SUB64rr, X86::SUB64mr, 0 },
+ { X86::SUB8ri, X86::SUB8mi, 0 },
+ { X86::SUB8rr, X86::SUB8mr, 0 },
+ { X86::XOR16ri, X86::XOR16mi, 0 },
+ { X86::XOR16ri8, X86::XOR16mi8, 0 },
+ { X86::XOR16rr, X86::XOR16mr, 0 },
+ { X86::XOR32ri, X86::XOR32mi, 0 },
+ { X86::XOR32ri8, X86::XOR32mi8, 0 },
+ { X86::XOR32rr, X86::XOR32mr, 0 },
+ { X86::XOR64ri32, X86::XOR64mi32, 0 },
+ { X86::XOR64ri8, X86::XOR64mi8, 0 },
+ { X86::XOR64rr, X86::XOR64mr, 0 },
+ { X86::XOR8ri, X86::XOR8mi, 0 },
+ { X86::XOR8rr, X86::XOR8mr, 0 }
+ };
+
+ for (unsigned i = 0, e = array_lengthof(OpTbl2Addr); i != e; ++i) {
+ unsigned RegOp = OpTbl2Addr[i].RegOp;
+ unsigned MemOp = OpTbl2Addr[i].MemOp;
+ unsigned Flags = OpTbl2Addr[i].Flags;
+ AddTableEntry(RegOp2MemOpTable2Addr, MemOp2RegOpTable,
+ RegOp, MemOp,
+ // Index 0, folded load and store, no alignment requirement.
+ Flags | TB_INDEX_0 | TB_FOLDED_LOAD | TB_FOLDED_STORE);
+ }
+
+ static const X86OpTblEntry OpTbl0[] = {
+ { X86::BT16ri8, X86::BT16mi8, TB_FOLDED_LOAD },
+ { X86::BT32ri8, X86::BT32mi8, TB_FOLDED_LOAD },
+ { X86::BT64ri8, X86::BT64mi8, TB_FOLDED_LOAD },
+ { X86::CALL32r, X86::CALL32m, TB_FOLDED_LOAD },
+ { X86::CALL64r, X86::CALL64m, TB_FOLDED_LOAD },
+ { X86::CMP16ri, X86::CMP16mi, TB_FOLDED_LOAD },
+ { X86::CMP16ri8, X86::CMP16mi8, TB_FOLDED_LOAD },
+ { X86::CMP16rr, X86::CMP16mr, TB_FOLDED_LOAD },
+ { X86::CMP32ri, X86::CMP32mi, TB_FOLDED_LOAD },
+ { X86::CMP32ri8, X86::CMP32mi8, TB_FOLDED_LOAD },
+ { X86::CMP32rr, X86::CMP32mr, TB_FOLDED_LOAD },
+ { X86::CMP64ri32, X86::CMP64mi32, TB_FOLDED_LOAD },
+ { X86::CMP64ri8, X86::CMP64mi8, TB_FOLDED_LOAD },
+ { X86::CMP64rr, X86::CMP64mr, TB_FOLDED_LOAD },
+ { X86::CMP8ri, X86::CMP8mi, TB_FOLDED_LOAD },
+ { X86::CMP8rr, X86::CMP8mr, TB_FOLDED_LOAD },
+ { X86::DIV16r, X86::DIV16m, TB_FOLDED_LOAD },
+ { X86::DIV32r, X86::DIV32m, TB_FOLDED_LOAD },
+ { X86::DIV64r, X86::DIV64m, TB_FOLDED_LOAD },
+ { X86::DIV8r, X86::DIV8m, TB_FOLDED_LOAD },
+ { X86::EXTRACTPSrr, X86::EXTRACTPSmr, TB_FOLDED_STORE },
+ { X86::FsMOVAPDrr, X86::MOVSDmr, TB_FOLDED_STORE | TB_NO_REVERSE },
+ { X86::FsMOVAPSrr, X86::MOVSSmr, TB_FOLDED_STORE | TB_NO_REVERSE },
+ { X86::IDIV16r, X86::IDIV16m, TB_FOLDED_LOAD },
+ { X86::IDIV32r, X86::IDIV32m, TB_FOLDED_LOAD },
+ { X86::IDIV64r, X86::IDIV64m, TB_FOLDED_LOAD },
+ { X86::IDIV8r, X86::IDIV8m, TB_FOLDED_LOAD },
+ { X86::IMUL16r, X86::IMUL16m, TB_FOLDED_LOAD },
+ { X86::IMUL32r, X86::IMUL32m, TB_FOLDED_LOAD },
+ { X86::IMUL64r, X86::IMUL64m, TB_FOLDED_LOAD },
+ { X86::IMUL8r, X86::IMUL8m, TB_FOLDED_LOAD },
+ { X86::JMP32r, X86::JMP32m, TB_FOLDED_LOAD },
+ { X86::JMP64r, X86::JMP64m, TB_FOLDED_LOAD },
+ { X86::MOV16ri, X86::MOV16mi, TB_FOLDED_STORE },
+ { X86::MOV16rr, X86::MOV16mr, TB_FOLDED_STORE },
+ { X86::MOV32ri, X86::MOV32mi, TB_FOLDED_STORE },
+ { X86::MOV32rr, X86::MOV32mr, TB_FOLDED_STORE },
+ { X86::MOV64ri32, X86::MOV64mi32, TB_FOLDED_STORE },
+ { X86::MOV64rr, X86::MOV64mr, TB_FOLDED_STORE },
+ { X86::MOV8ri, X86::MOV8mi, TB_FOLDED_STORE },
+ { X86::MOV8rr, X86::MOV8mr, TB_FOLDED_STORE },
+ { X86::MOV8rr_NOREX, X86::MOV8mr_NOREX, TB_FOLDED_STORE },
+ { X86::MOVAPDrr, X86::MOVAPDmr, TB_FOLDED_STORE | TB_ALIGN_16 },
+ { X86::MOVAPSrr, X86::MOVAPSmr, TB_FOLDED_STORE | TB_ALIGN_16 },
+ { X86::MOVDQArr, X86::MOVDQAmr, TB_FOLDED_STORE | TB_ALIGN_16 },
+ { X86::MOVPDI2DIrr, X86::MOVPDI2DImr, TB_FOLDED_STORE },
+ { X86::MOVPQIto64rr,X86::MOVPQI2QImr, TB_FOLDED_STORE },
+ { X86::MOVSDto64rr, X86::MOVSDto64mr, TB_FOLDED_STORE },
+ { X86::MOVSS2DIrr, X86::MOVSS2DImr, TB_FOLDED_STORE },
+ { X86::MOVUPDrr, X86::MOVUPDmr, TB_FOLDED_STORE },
+ { X86::MOVUPSrr, X86::MOVUPSmr, TB_FOLDED_STORE },
+ { X86::MUL16r, X86::MUL16m, TB_FOLDED_LOAD },
+ { X86::MUL32r, X86::MUL32m, TB_FOLDED_LOAD },
+ { X86::MUL64r, X86::MUL64m, TB_FOLDED_LOAD },
+ { X86::MUL8r, X86::MUL8m, TB_FOLDED_LOAD },
+ { X86::SETAEr, X86::SETAEm, TB_FOLDED_STORE },
+ { X86::SETAr, X86::SETAm, TB_FOLDED_STORE },
+ { X86::SETBEr, X86::SETBEm, TB_FOLDED_STORE },
+ { X86::SETBr, X86::SETBm, TB_FOLDED_STORE },
+ { X86::SETEr, X86::SETEm, TB_FOLDED_STORE },
+ { X86::SETGEr, X86::SETGEm, TB_FOLDED_STORE },
+ { X86::SETGr, X86::SETGm, TB_FOLDED_STORE },
+ { X86::SETLEr, X86::SETLEm, TB_FOLDED_STORE },
+ { X86::SETLr, X86::SETLm, TB_FOLDED_STORE },
+ { X86::SETNEr, X86::SETNEm, TB_FOLDED_STORE },
+ { X86::SETNOr, X86::SETNOm, TB_FOLDED_STORE },
+ { X86::SETNPr, X86::SETNPm, TB_FOLDED_STORE },
+ { X86::SETNSr, X86::SETNSm, TB_FOLDED_STORE },
+ { X86::SETOr, X86::SETOm, TB_FOLDED_STORE },
+ { X86::SETPr, X86::SETPm, TB_FOLDED_STORE },
+ { X86::SETSr, X86::SETSm, TB_FOLDED_STORE },
+ { X86::TAILJMPr, X86::TAILJMPm, TB_FOLDED_LOAD },
+ { X86::TAILJMPr64, X86::TAILJMPm64, TB_FOLDED_LOAD },
+ { X86::TEST16ri, X86::TEST16mi, TB_FOLDED_LOAD },
+ { X86::TEST32ri, X86::TEST32mi, TB_FOLDED_LOAD },
+ { X86::TEST64ri32, X86::TEST64mi32, TB_FOLDED_LOAD },
+ { X86::TEST8ri, X86::TEST8mi, TB_FOLDED_LOAD },
+ // AVX 128-bit versions of foldable instructions
+ { X86::VEXTRACTPSrr,X86::VEXTRACTPSmr, TB_FOLDED_STORE },
+ { X86::FsVMOVAPDrr, X86::VMOVSDmr, TB_FOLDED_STORE | TB_NO_REVERSE },
+ { X86::FsVMOVAPSrr, X86::VMOVSSmr, TB_FOLDED_STORE | TB_NO_REVERSE },
+ { X86::VEXTRACTF128rr, X86::VEXTRACTF128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
+ { X86::VMOVAPDrr, X86::VMOVAPDmr, TB_FOLDED_STORE | TB_ALIGN_16 },
+ { X86::VMOVAPSrr, X86::VMOVAPSmr, TB_FOLDED_STORE | TB_ALIGN_16 },
+ { X86::VMOVDQArr, X86::VMOVDQAmr, TB_FOLDED_STORE | TB_ALIGN_16 },
+ { X86::VMOVPDI2DIrr,X86::VMOVPDI2DImr, TB_FOLDED_STORE },
+ { X86::VMOVPQIto64rr, X86::VMOVPQI2QImr,TB_FOLDED_STORE },
+ { X86::VMOVSDto64rr,X86::VMOVSDto64mr, TB_FOLDED_STORE },
+ { X86::VMOVSS2DIrr, X86::VMOVSS2DImr, TB_FOLDED_STORE },
+ { X86::VMOVUPDrr, X86::VMOVUPDmr, TB_FOLDED_STORE },
+ { X86::VMOVUPSrr, X86::VMOVUPSmr, TB_FOLDED_STORE },
+ // AVX 256-bit foldable instructions
+ { X86::VEXTRACTI128rr, X86::VEXTRACTI128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
+ { X86::VMOVAPDYrr, X86::VMOVAPDYmr, TB_FOLDED_STORE | TB_ALIGN_32 },
+ { X86::VMOVAPSYrr, X86::VMOVAPSYmr, TB_FOLDED_STORE | TB_ALIGN_32 },
+ { X86::VMOVDQAYrr, X86::VMOVDQAYmr, TB_FOLDED_STORE | TB_ALIGN_32 },
+ { X86::VMOVUPDYrr, X86::VMOVUPDYmr, TB_FOLDED_STORE },
+ { X86::VMOVUPSYrr, X86::VMOVUPSYmr, TB_FOLDED_STORE }
+ };
+
+ for (unsigned i = 0, e = array_lengthof(OpTbl0); i != e; ++i) {
+ unsigned RegOp = OpTbl0[i].RegOp;
+ unsigned MemOp = OpTbl0[i].MemOp;
+ unsigned Flags = OpTbl0[i].Flags;
+ AddTableEntry(RegOp2MemOpTable0, MemOp2RegOpTable,
+ RegOp, MemOp, TB_INDEX_0 | Flags);
+ }
+
+ static const X86OpTblEntry OpTbl1[] = {
+ { X86::CMP16rr, X86::CMP16rm, 0 },
+ { X86::CMP32rr, X86::CMP32rm, 0 },
+ { X86::CMP64rr, X86::CMP64rm, 0 },
+ { X86::CMP8rr, X86::CMP8rm, 0 },
+ { X86::CVTSD2SSrr, X86::CVTSD2SSrm, 0 },
+ { X86::CVTSI2SD64rr, X86::CVTSI2SD64rm, 0 },
+ { X86::CVTSI2SDrr, X86::CVTSI2SDrm, 0 },
+ { X86::CVTSI2SS64rr, X86::CVTSI2SS64rm, 0 },
+ { X86::CVTSI2SSrr, X86::CVTSI2SSrm, 0 },
+ { X86::CVTSS2SDrr, X86::CVTSS2SDrm, 0 },
+ { X86::CVTTSD2SI64rr, X86::CVTTSD2SI64rm, 0 },
+ { X86::CVTTSD2SIrr, X86::CVTTSD2SIrm, 0 },
+ { X86::CVTTSS2SI64rr, X86::CVTTSS2SI64rm, 0 },
+ { X86::CVTTSS2SIrr, X86::CVTTSS2SIrm, 0 },
+ { X86::FsMOVAPDrr, X86::MOVSDrm, TB_NO_REVERSE },
+ { X86::FsMOVAPSrr, X86::MOVSSrm, TB_NO_REVERSE },
+ { X86::IMUL16rri, X86::IMUL16rmi, 0 },
+ { X86::IMUL16rri8, X86::IMUL16rmi8, 0 },
+ { X86::IMUL32rri, X86::IMUL32rmi, 0 },
+ { X86::IMUL32rri8, X86::IMUL32rmi8, 0 },
+ { X86::IMUL64rri32, X86::IMUL64rmi32, 0 },
+ { X86::IMUL64rri8, X86::IMUL64rmi8, 0 },
+ { X86::Int_COMISDrr, X86::Int_COMISDrm, 0 },
+ { X86::Int_COMISSrr, X86::Int_COMISSrm, 0 },
+ { X86::CVTSD2SI64rr, X86::CVTSD2SI64rm, 0 },
+ { X86::CVTSD2SIrr, X86::CVTSD2SIrm, 0 },
+ { X86::CVTSS2SI64rr, X86::CVTSS2SI64rm, 0 },
+ { X86::CVTSS2SIrr, X86::CVTSS2SIrm, 0 },
+ { X86::CVTTPD2DQrr, X86::CVTTPD2DQrm, TB_ALIGN_16 },
+ { X86::CVTTPS2DQrr, X86::CVTTPS2DQrm, TB_ALIGN_16 },
+ { X86::Int_CVTTSD2SI64rr,X86::Int_CVTTSD2SI64rm, 0 },
+ { X86::Int_CVTTSD2SIrr, X86::Int_CVTTSD2SIrm, 0 },
+ { X86::Int_CVTTSS2SI64rr,X86::Int_CVTTSS2SI64rm, 0 },
+ { X86::Int_CVTTSS2SIrr, X86::Int_CVTTSS2SIrm, 0 },
+ { X86::Int_UCOMISDrr, X86::Int_UCOMISDrm, 0 },
+ { X86::Int_UCOMISSrr, X86::Int_UCOMISSrm, 0 },
+ { X86::MOV16rr, X86::MOV16rm, 0 },
+ { X86::MOV32rr, X86::MOV32rm, 0 },
+ { X86::MOV64rr, X86::MOV64rm, 0 },
+ { X86::MOV64toPQIrr, X86::MOVQI2PQIrm, 0 },
+ { X86::MOV64toSDrr, X86::MOV64toSDrm, 0 },
+ { X86::MOV8rr, X86::MOV8rm, 0 },
+ { X86::MOVAPDrr, X86::MOVAPDrm, TB_ALIGN_16 },
+ { X86::MOVAPSrr, X86::MOVAPSrm, TB_ALIGN_16 },
+ { X86::MOVDDUPrr, X86::MOVDDUPrm, 0 },
+ { X86::MOVDI2PDIrr, X86::MOVDI2PDIrm, 0 },
+ { X86::MOVDI2SSrr, X86::MOVDI2SSrm, 0 },
+ { X86::MOVDQArr, X86::MOVDQArm, TB_ALIGN_16 },
+ { X86::MOVSHDUPrr, X86::MOVSHDUPrm, TB_ALIGN_16 },
+ { X86::MOVSLDUPrr, X86::MOVSLDUPrm, TB_ALIGN_16 },
+ { X86::MOVSX16rr8, X86::MOVSX16rm8, 0 },
+ { X86::MOVSX32rr16, X86::MOVSX32rm16, 0 },
+ { X86::MOVSX32rr8, X86::MOVSX32rm8, 0 },
+ { X86::MOVSX64rr16, X86::MOVSX64rm16, 0 },
+ { X86::MOVSX64rr32, X86::MOVSX64rm32, 0 },
+ { X86::MOVSX64rr8, X86::MOVSX64rm8, 0 },
+ { X86::MOVUPDrr, X86::MOVUPDrm, TB_ALIGN_16 },
+ { X86::MOVUPSrr, X86::MOVUPSrm, 0 },
+ { X86::MOVZDI2PDIrr, X86::MOVZDI2PDIrm, 0 },
+ { X86::MOVZQI2PQIrr, X86::MOVZQI2PQIrm, 0 },
+ { X86::MOVZPQILo2PQIrr, X86::MOVZPQILo2PQIrm, TB_ALIGN_16 },
+ { X86::MOVZX16rr8, X86::MOVZX16rm8, 0 },
+ { X86::MOVZX32rr16, X86::MOVZX32rm16, 0 },
+ { X86::MOVZX32_NOREXrr8, X86::MOVZX32_NOREXrm8, 0 },
+ { X86::MOVZX32rr8, X86::MOVZX32rm8, 0 },
+ { X86::MOVZX64rr16, X86::MOVZX64rm16, 0 },
+ { X86::MOVZX64rr32, X86::MOVZX64rm32, 0 },
+ { X86::MOVZX64rr8, X86::MOVZX64rm8, 0 },
+ { X86::PABSBrr128, X86::PABSBrm128, TB_ALIGN_16 },
+ { X86::PABSDrr128, X86::PABSDrm128, TB_ALIGN_16 },
+ { X86::PABSWrr128, X86::PABSWrm128, TB_ALIGN_16 },
+ { X86::PSHUFDri, X86::PSHUFDmi, TB_ALIGN_16 },
+ { X86::PSHUFHWri, X86::PSHUFHWmi, TB_ALIGN_16 },
+ { X86::PSHUFLWri, X86::PSHUFLWmi, TB_ALIGN_16 },
+ { X86::RCPPSr, X86::RCPPSm, TB_ALIGN_16 },
+ { X86::RCPPSr_Int, X86::RCPPSm_Int, TB_ALIGN_16 },
+ { X86::RSQRTPSr, X86::RSQRTPSm, TB_ALIGN_16 },
+ { X86::RSQRTPSr_Int, X86::RSQRTPSm_Int, TB_ALIGN_16 },
+ { X86::RSQRTSSr, X86::RSQRTSSm, 0 },
+ { X86::RSQRTSSr_Int, X86::RSQRTSSm_Int, 0 },
+ { X86::SQRTPDr, X86::SQRTPDm, TB_ALIGN_16 },
+ { X86::SQRTPSr, X86::SQRTPSm, TB_ALIGN_16 },
+ { X86::SQRTSDr, X86::SQRTSDm, 0 },
+ { X86::SQRTSDr_Int, X86::SQRTSDm_Int, 0 },
+ { X86::SQRTSSr, X86::SQRTSSm, 0 },
+ { X86::SQRTSSr_Int, X86::SQRTSSm_Int, 0 },
+ { X86::TEST16rr, X86::TEST16rm, 0 },
+ { X86::TEST32rr, X86::TEST32rm, 0 },
+ { X86::TEST64rr, X86::TEST64rm, 0 },
+ { X86::TEST8rr, X86::TEST8rm, 0 },
+ // FIXME: TEST*rr EAX,EAX ---> CMP [mem], 0
+ { X86::UCOMISDrr, X86::UCOMISDrm, 0 },
+ { X86::UCOMISSrr, X86::UCOMISSrm, 0 },
+ // AVX 128-bit versions of foldable instructions
+ { X86::Int_VCOMISDrr, X86::Int_VCOMISDrm, 0 },
+ { X86::Int_VCOMISSrr, X86::Int_VCOMISSrm, 0 },
+ { X86::Int_VUCOMISDrr, X86::Int_VUCOMISDrm, 0 },
+ { X86::Int_VUCOMISSrr, X86::Int_VUCOMISSrm, 0 },
+ { X86::VCVTTSD2SI64rr, X86::VCVTTSD2SI64rm, 0 },
+ { X86::Int_VCVTTSD2SI64rr,X86::Int_VCVTTSD2SI64rm,0 },
+ { X86::VCVTTSD2SIrr, X86::VCVTTSD2SIrm, 0 },
+ { X86::Int_VCVTTSD2SIrr,X86::Int_VCVTTSD2SIrm, 0 },
+ { X86::VCVTTSS2SI64rr, X86::VCVTTSS2SI64rm, 0 },
+ { X86::Int_VCVTTSS2SI64rr,X86::Int_VCVTTSS2SI64rm,0 },
+ { X86::VCVTTSS2SIrr, X86::VCVTTSS2SIrm, 0 },
+ { X86::Int_VCVTTSS2SIrr,X86::Int_VCVTTSS2SIrm, 0 },
+ { X86::VCVTSD2SI64rr, X86::VCVTSD2SI64rm, 0 },
+ { X86::VCVTSD2SIrr, X86::VCVTSD2SIrm, 0 },
+ { X86::VCVTSS2SI64rr, X86::VCVTSS2SI64rm, 0 },
+ { X86::VCVTSS2SIrr, X86::VCVTSS2SIrm, 0 },
+ { X86::FsVMOVAPDrr, X86::VMOVSDrm, TB_NO_REVERSE },
+ { X86::FsVMOVAPSrr, X86::VMOVSSrm, TB_NO_REVERSE },
+ { X86::VMOV64toPQIrr, X86::VMOVQI2PQIrm, 0 },
+ { X86::VMOV64toSDrr, X86::VMOV64toSDrm, 0 },
+ { X86::VMOVAPDrr, X86::VMOVAPDrm, TB_ALIGN_16 },
+ { X86::VMOVAPSrr, X86::VMOVAPSrm, TB_ALIGN_16 },
+ { X86::VMOVDDUPrr, X86::VMOVDDUPrm, 0 },
+ { X86::VMOVDI2PDIrr, X86::VMOVDI2PDIrm, 0 },
+ { X86::VMOVDI2SSrr, X86::VMOVDI2SSrm, 0 },
+ { X86::VMOVDQArr, X86::VMOVDQArm, TB_ALIGN_16 },
+ { X86::VMOVSLDUPrr, X86::VMOVSLDUPrm, TB_ALIGN_16 },
+ { X86::VMOVSHDUPrr, X86::VMOVSHDUPrm, TB_ALIGN_16 },
+ { X86::VMOVUPDrr, X86::VMOVUPDrm, 0 },
+ { X86::VMOVUPSrr, X86::VMOVUPSrm, 0 },
+ { X86::VMOVZDI2PDIrr, X86::VMOVZDI2PDIrm, 0 },
+ { X86::VMOVZQI2PQIrr, X86::VMOVZQI2PQIrm, 0 },
+ { X86::VMOVZPQILo2PQIrr,X86::VMOVZPQILo2PQIrm, TB_ALIGN_16 },
+ { X86::VPABSBrr128, X86::VPABSBrm128, 0 },
+ { X86::VPABSDrr128, X86::VPABSDrm128, 0 },
+ { X86::VPABSWrr128, X86::VPABSWrm128, 0 },
+ { X86::VPERMILPDri, X86::VPERMILPDmi, 0 },
+ { X86::VPERMILPSri, X86::VPERMILPSmi, 0 },
+ { X86::VPSHUFDri, X86::VPSHUFDmi, 0 },
+ { X86::VPSHUFHWri, X86::VPSHUFHWmi, 0 },
+ { X86::VPSHUFLWri, X86::VPSHUFLWmi, 0 },
+ { X86::VRCPPSr, X86::VRCPPSm, 0 },
+ { X86::VRCPPSr_Int, X86::VRCPPSm_Int, 0 },
+ { X86::VRSQRTPSr, X86::VRSQRTPSm, 0 },
+ { X86::VRSQRTPSr_Int, X86::VRSQRTPSm_Int, 0 },
+ { X86::VSQRTPDr, X86::VSQRTPDm, 0 },
+ { X86::VSQRTPSr, X86::VSQRTPSm, 0 },
+ { X86::VUCOMISDrr, X86::VUCOMISDrm, 0 },
+ { X86::VUCOMISSrr, X86::VUCOMISSrm, 0 },
+ { X86::VBROADCASTSSrr, X86::VBROADCASTSSrm, TB_NO_REVERSE },
+
+ // AVX 256-bit foldable instructions
+ { X86::VMOVAPDYrr, X86::VMOVAPDYrm, TB_ALIGN_32 },
+ { X86::VMOVAPSYrr, X86::VMOVAPSYrm, TB_ALIGN_32 },
+ { X86::VMOVDQAYrr, X86::VMOVDQAYrm, TB_ALIGN_32 },
+ { X86::VMOVUPDYrr, X86::VMOVUPDYrm, 0 },
+ { X86::VMOVUPSYrr, X86::VMOVUPSYrm, 0 },
+ { X86::VPERMILPDYri, X86::VPERMILPDYmi, 0 },
+ { X86::VPERMILPSYri, X86::VPERMILPSYmi, 0 },
+
+ // AVX2 foldable instructions
+ { X86::VPABSBrr256, X86::VPABSBrm256, 0 },
+ { X86::VPABSDrr256, X86::VPABSDrm256, 0 },
+ { X86::VPABSWrr256, X86::VPABSWrm256, 0 },
+ { X86::VPSHUFDYri, X86::VPSHUFDYmi, 0 },
+ { X86::VPSHUFHWYri, X86::VPSHUFHWYmi, 0 },
+ { X86::VPSHUFLWYri, X86::VPSHUFLWYmi, 0 },
+ { X86::VRCPPSYr, X86::VRCPPSYm, 0 },
+ { X86::VRCPPSYr_Int, X86::VRCPPSYm_Int, 0 },
+ { X86::VRSQRTPSYr, X86::VRSQRTPSYm, 0 },
+ { X86::VSQRTPDYr, X86::VSQRTPDYm, 0 },
+ { X86::VSQRTPSYr, X86::VSQRTPSYm, 0 },
+ { X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrm, TB_NO_REVERSE },
+ { X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrm, TB_NO_REVERSE },
+
+ // BMI/BMI2/LZCNT/POPCNT foldable instructions
+ { X86::BEXTR32rr, X86::BEXTR32rm, 0 },
+ { X86::BEXTR64rr, X86::BEXTR64rm, 0 },
+ { X86::BLSI32rr, X86::BLSI32rm, 0 },
+ { X86::BLSI64rr, X86::BLSI64rm, 0 },
+ { X86::BLSMSK32rr, X86::BLSMSK32rm, 0 },
+ { X86::BLSMSK64rr, X86::BLSMSK64rm, 0 },
+ { X86::BLSR32rr, X86::BLSR32rm, 0 },
+ { X86::BLSR64rr, X86::BLSR64rm, 0 },
+ { X86::BZHI32rr, X86::BZHI32rm, 0 },
+ { X86::BZHI64rr, X86::BZHI64rm, 0 },
+ { X86::LZCNT16rr, X86::LZCNT16rm, 0 },
+ { X86::LZCNT32rr, X86::LZCNT32rm, 0 },
+ { X86::LZCNT64rr, X86::LZCNT64rm, 0 },
+ { X86::POPCNT16rr, X86::POPCNT16rm, 0 },
+ { X86::POPCNT32rr, X86::POPCNT32rm, 0 },
+ { X86::POPCNT64rr, X86::POPCNT64rm, 0 },
+ { X86::RORX32ri, X86::RORX32mi, 0 },
+ { X86::RORX64ri, X86::RORX64mi, 0 },
+ { X86::SARX32rr, X86::SARX32rm, 0 },
+ { X86::SARX64rr, X86::SARX64rm, 0 },
+ { X86::SHRX32rr, X86::SHRX32rm, 0 },
+ { X86::SHRX64rr, X86::SHRX64rm, 0 },
+ { X86::SHLX32rr, X86::SHLX32rm, 0 },
+ { X86::SHLX64rr, X86::SHLX64rm, 0 },
+ { X86::TZCNT16rr, X86::TZCNT16rm, 0 },
+ { X86::TZCNT32rr, X86::TZCNT32rm, 0 },
+ { X86::TZCNT64rr, X86::TZCNT64rm, 0 },
+ };
+
+ for (unsigned i = 0, e = array_lengthof(OpTbl1); i != e; ++i) {
+ unsigned RegOp = OpTbl1[i].RegOp;
+ unsigned MemOp = OpTbl1[i].MemOp;
+ unsigned Flags = OpTbl1[i].Flags;
+ AddTableEntry(RegOp2MemOpTable1, MemOp2RegOpTable,
+ RegOp, MemOp,
+ // Index 1, folded load
+ Flags | TB_INDEX_1 | TB_FOLDED_LOAD);
+ }
+
+ static const X86OpTblEntry OpTbl2[] = {
+ { X86::ADC32rr, X86::ADC32rm, 0 },
+ { X86::ADC64rr, X86::ADC64rm, 0 },
+ { X86::ADD16rr, X86::ADD16rm, 0 },
+ { X86::ADD16rr_DB, X86::ADD16rm, TB_NO_REVERSE },
+ { X86::ADD32rr, X86::ADD32rm, 0 },
+ { X86::ADD32rr_DB, X86::ADD32rm, TB_NO_REVERSE },
+ { X86::ADD64rr, X86::ADD64rm, 0 },
+ { X86::ADD64rr_DB, X86::ADD64rm, TB_NO_REVERSE },
+ { X86::ADD8rr, X86::ADD8rm, 0 },
+ { X86::ADDPDrr, X86::ADDPDrm, TB_ALIGN_16 },
+ { X86::ADDPSrr, X86::ADDPSrm, TB_ALIGN_16 },
+ { X86::ADDSDrr, X86::ADDSDrm, 0 },
+ { X86::ADDSSrr, X86::ADDSSrm, 0 },
+ { X86::ADDSUBPDrr, X86::ADDSUBPDrm, TB_ALIGN_16 },
+ { X86::ADDSUBPSrr, X86::ADDSUBPSrm, TB_ALIGN_16 },
+ { X86::AND16rr, X86::AND16rm, 0 },
+ { X86::AND32rr, X86::AND32rm, 0 },
+ { X86::AND64rr, X86::AND64rm, 0 },
+ { X86::AND8rr, X86::AND8rm, 0 },
+ { X86::ANDNPDrr, X86::ANDNPDrm, TB_ALIGN_16 },
+ { X86::ANDNPSrr, X86::ANDNPSrm, TB_ALIGN_16 },
+ { X86::ANDPDrr, X86::ANDPDrm, TB_ALIGN_16 },
+ { X86::ANDPSrr, X86::ANDPSrm, TB_ALIGN_16 },
+ { X86::BLENDPDrri, X86::BLENDPDrmi, TB_ALIGN_16 },
+ { X86::BLENDPSrri, X86::BLENDPSrmi, TB_ALIGN_16 },
+ { X86::BLENDVPDrr0, X86::BLENDVPDrm0, TB_ALIGN_16 },
+ { X86::BLENDVPSrr0, X86::BLENDVPSrm0, TB_ALIGN_16 },
+ { X86::CMOVA16rr, X86::CMOVA16rm, 0 },
+ { X86::CMOVA32rr, X86::CMOVA32rm, 0 },
+ { X86::CMOVA64rr, X86::CMOVA64rm, 0 },
+ { X86::CMOVAE16rr, X86::CMOVAE16rm, 0 },
+ { X86::CMOVAE32rr, X86::CMOVAE32rm, 0 },
+ { X86::CMOVAE64rr, X86::CMOVAE64rm, 0 },
+ { X86::CMOVB16rr, X86::CMOVB16rm, 0 },
+ { X86::CMOVB32rr, X86::CMOVB32rm, 0 },
+ { X86::CMOVB64rr, X86::CMOVB64rm, 0 },
+ { X86::CMOVBE16rr, X86::CMOVBE16rm, 0 },
+ { X86::CMOVBE32rr, X86::CMOVBE32rm, 0 },
+ { X86::CMOVBE64rr, X86::CMOVBE64rm, 0 },
+ { X86::CMOVE16rr, X86::CMOVE16rm, 0 },
+ { X86::CMOVE32rr, X86::CMOVE32rm, 0 },
+ { X86::CMOVE64rr, X86::CMOVE64rm, 0 },
+ { X86::CMOVG16rr, X86::CMOVG16rm, 0 },
+ { X86::CMOVG32rr, X86::CMOVG32rm, 0 },
+ { X86::CMOVG64rr, X86::CMOVG64rm, 0 },
+ { X86::CMOVGE16rr, X86::CMOVGE16rm, 0 },
+ { X86::CMOVGE32rr, X86::CMOVGE32rm, 0 },
+ { X86::CMOVGE64rr, X86::CMOVGE64rm, 0 },
+ { X86::CMOVL16rr, X86::CMOVL16rm, 0 },
+ { X86::CMOVL32rr, X86::CMOVL32rm, 0 },
+ { X86::CMOVL64rr, X86::CMOVL64rm, 0 },
+ { X86::CMOVLE16rr, X86::CMOVLE16rm, 0 },
+ { X86::CMOVLE32rr, X86::CMOVLE32rm, 0 },
+ { X86::CMOVLE64rr, X86::CMOVLE64rm, 0 },
+ { X86::CMOVNE16rr, X86::CMOVNE16rm, 0 },
+ { X86::CMOVNE32rr, X86::CMOVNE32rm, 0 },
+ { X86::CMOVNE64rr, X86::CMOVNE64rm, 0 },
+ { X86::CMOVNO16rr, X86::CMOVNO16rm, 0 },
+ { X86::CMOVNO32rr, X86::CMOVNO32rm, 0 },
+ { X86::CMOVNO64rr, X86::CMOVNO64rm, 0 },
+ { X86::CMOVNP16rr, X86::CMOVNP16rm, 0 },
+ { X86::CMOVNP32rr, X86::CMOVNP32rm, 0 },
+ { X86::CMOVNP64rr, X86::CMOVNP64rm, 0 },
+ { X86::CMOVNS16rr, X86::CMOVNS16rm, 0 },
+ { X86::CMOVNS32rr, X86::CMOVNS32rm, 0 },
+ { X86::CMOVNS64rr, X86::CMOVNS64rm, 0 },
+ { X86::CMOVO16rr, X86::CMOVO16rm, 0 },
+ { X86::CMOVO32rr, X86::CMOVO32rm, 0 },
+ { X86::CMOVO64rr, X86::CMOVO64rm, 0 },
+ { X86::CMOVP16rr, X86::CMOVP16rm, 0 },
+ { X86::CMOVP32rr, X86::CMOVP32rm, 0 },
+ { X86::CMOVP64rr, X86::CMOVP64rm, 0 },
+ { X86::CMOVS16rr, X86::CMOVS16rm, 0 },
+ { X86::CMOVS32rr, X86::CMOVS32rm, 0 },
+ { X86::CMOVS64rr, X86::CMOVS64rm, 0 },
+ { X86::CMPPDrri, X86::CMPPDrmi, TB_ALIGN_16 },
+ { X86::CMPPSrri, X86::CMPPSrmi, TB_ALIGN_16 },
+ { X86::CMPSDrr, X86::CMPSDrm, 0 },
+ { X86::CMPSSrr, X86::CMPSSrm, 0 },
+ { X86::DIVPDrr, X86::DIVPDrm, TB_ALIGN_16 },
+ { X86::DIVPSrr, X86::DIVPSrm, TB_ALIGN_16 },
+ { X86::DIVSDrr, X86::DIVSDrm, 0 },
+ { X86::DIVSSrr, X86::DIVSSrm, 0 },
+ { X86::FsANDNPDrr, X86::FsANDNPDrm, TB_ALIGN_16 },
+ { X86::FsANDNPSrr, X86::FsANDNPSrm, TB_ALIGN_16 },
+ { X86::FsANDPDrr, X86::FsANDPDrm, TB_ALIGN_16 },
+ { X86::FsANDPSrr, X86::FsANDPSrm, TB_ALIGN_16 },
+ { X86::FsORPDrr, X86::FsORPDrm, TB_ALIGN_16 },
+ { X86::FsORPSrr, X86::FsORPSrm, TB_ALIGN_16 },
+ { X86::FsXORPDrr, X86::FsXORPDrm, TB_ALIGN_16 },
+ { X86::FsXORPSrr, X86::FsXORPSrm, TB_ALIGN_16 },
+ { X86::HADDPDrr, X86::HADDPDrm, TB_ALIGN_16 },
+ { X86::HADDPSrr, X86::HADDPSrm, TB_ALIGN_16 },
+ { X86::HSUBPDrr, X86::HSUBPDrm, TB_ALIGN_16 },
+ { X86::HSUBPSrr, X86::HSUBPSrm, TB_ALIGN_16 },
+ { X86::IMUL16rr, X86::IMUL16rm, 0 },
+ { X86::IMUL32rr, X86::IMUL32rm, 0 },
+ { X86::IMUL64rr, X86::IMUL64rm, 0 },
+ { X86::Int_CMPSDrr, X86::Int_CMPSDrm, 0 },
+ { X86::Int_CMPSSrr, X86::Int_CMPSSrm, 0 },
+ { X86::Int_CVTSD2SSrr, X86::Int_CVTSD2SSrm, 0 },
+ { X86::Int_CVTSI2SD64rr,X86::Int_CVTSI2SD64rm, 0 },
+ { X86::Int_CVTSI2SDrr, X86::Int_CVTSI2SDrm, 0 },
+ { X86::Int_CVTSI2SS64rr,X86::Int_CVTSI2SS64rm, 0 },
+ { X86::Int_CVTSI2SSrr, X86::Int_CVTSI2SSrm, 0 },
+ { X86::Int_CVTSS2SDrr, X86::Int_CVTSS2SDrm, 0 },
+ { X86::MAXPDrr, X86::MAXPDrm, TB_ALIGN_16 },
+ { X86::MAXPSrr, X86::MAXPSrm, TB_ALIGN_16 },
+ { X86::MAXSDrr, X86::MAXSDrm, 0 },
+ { X86::MAXSSrr, X86::MAXSSrm, 0 },
+ { X86::MINPDrr, X86::MINPDrm, TB_ALIGN_16 },
+ { X86::MINPSrr, X86::MINPSrm, TB_ALIGN_16 },
+ { X86::MINSDrr, X86::MINSDrm, 0 },
+ { X86::MINSSrr, X86::MINSSrm, 0 },
+ { X86::MPSADBWrri, X86::MPSADBWrmi, TB_ALIGN_16 },
+ { X86::MULPDrr, X86::MULPDrm, TB_ALIGN_16 },
+ { X86::MULPSrr, X86::MULPSrm, TB_ALIGN_16 },
+ { X86::MULSDrr, X86::MULSDrm, 0 },
+ { X86::MULSSrr, X86::MULSSrm, 0 },
+ { X86::OR16rr, X86::OR16rm, 0 },
+ { X86::OR32rr, X86::OR32rm, 0 },
+ { X86::OR64rr, X86::OR64rm, 0 },
+ { X86::OR8rr, X86::OR8rm, 0 },
+ { X86::ORPDrr, X86::ORPDrm, TB_ALIGN_16 },
+ { X86::ORPSrr, X86::ORPSrm, TB_ALIGN_16 },
+ { X86::PACKSSDWrr, X86::PACKSSDWrm, TB_ALIGN_16 },
+ { X86::PACKSSWBrr, X86::PACKSSWBrm, TB_ALIGN_16 },
+ { X86::PACKUSDWrr, X86::PACKUSDWrm, TB_ALIGN_16 },
+ { X86::PACKUSWBrr, X86::PACKUSWBrm, TB_ALIGN_16 },
+ { X86::PADDBrr, X86::PADDBrm, TB_ALIGN_16 },
+ { X86::PADDDrr, X86::PADDDrm, TB_ALIGN_16 },
+ { X86::PADDQrr, X86::PADDQrm, TB_ALIGN_16 },
+ { X86::PADDSBrr, X86::PADDSBrm, TB_ALIGN_16 },
+ { X86::PADDSWrr, X86::PADDSWrm, TB_ALIGN_16 },
+ { X86::PADDUSBrr, X86::PADDUSBrm, TB_ALIGN_16 },
+ { X86::PADDUSWrr, X86::PADDUSWrm, TB_ALIGN_16 },
+ { X86::PADDWrr, X86::PADDWrm, TB_ALIGN_16 },
+ { X86::PALIGNR128rr, X86::PALIGNR128rm, TB_ALIGN_16 },
+ { X86::PANDNrr, X86::PANDNrm, TB_ALIGN_16 },
+ { X86::PANDrr, X86::PANDrm, TB_ALIGN_16 },
+ { X86::PAVGBrr, X86::PAVGBrm, TB_ALIGN_16 },
+ { X86::PAVGWrr, X86::PAVGWrm, TB_ALIGN_16 },
+ { X86::PBLENDWrri, X86::PBLENDWrmi, TB_ALIGN_16 },
+ { X86::PCMPEQBrr, X86::PCMPEQBrm, TB_ALIGN_16 },
+ { X86::PCMPEQDrr, X86::PCMPEQDrm, TB_ALIGN_16 },
+ { X86::PCMPEQQrr, X86::PCMPEQQrm, TB_ALIGN_16 },
+ { X86::PCMPEQWrr, X86::PCMPEQWrm, TB_ALIGN_16 },
+ { X86::PCMPGTBrr, X86::PCMPGTBrm, TB_ALIGN_16 },
+ { X86::PCMPGTDrr, X86::PCMPGTDrm, TB_ALIGN_16 },
+ { X86::PCMPGTQrr, X86::PCMPGTQrm, TB_ALIGN_16 },
+ { X86::PCMPGTWrr, X86::PCMPGTWrm, TB_ALIGN_16 },
+ { X86::PHADDDrr, X86::PHADDDrm, TB_ALIGN_16 },
+ { X86::PHADDWrr, X86::PHADDWrm, TB_ALIGN_16 },
+ { X86::PHADDSWrr128, X86::PHADDSWrm128, TB_ALIGN_16 },
+ { X86::PHSUBDrr, X86::PHSUBDrm, TB_ALIGN_16 },
+ { X86::PHSUBSWrr128, X86::PHSUBSWrm128, TB_ALIGN_16 },
+ { X86::PHSUBWrr, X86::PHSUBWrm, TB_ALIGN_16 },
+ { X86::PINSRWrri, X86::PINSRWrmi, TB_ALIGN_16 },
+ { X86::PMADDUBSWrr128, X86::PMADDUBSWrm128, TB_ALIGN_16 },
+ { X86::PMADDWDrr, X86::PMADDWDrm, TB_ALIGN_16 },
+ { X86::PMAXSWrr, X86::PMAXSWrm, TB_ALIGN_16 },
+ { X86::PMAXUBrr, X86::PMAXUBrm, TB_ALIGN_16 },
+ { X86::PMINSWrr, X86::PMINSWrm, TB_ALIGN_16 },
+ { X86::PMINUBrr, X86::PMINUBrm, TB_ALIGN_16 },
+ { X86::PMINSBrr, X86::PMINSBrm, TB_ALIGN_16 },
+ { X86::PMINSDrr, X86::PMINSDrm, TB_ALIGN_16 },
+ { X86::PMINUDrr, X86::PMINUDrm, TB_ALIGN_16 },
+ { X86::PMINUWrr, X86::PMINUWrm, TB_ALIGN_16 },
+ { X86::PMAXSBrr, X86::PMAXSBrm, TB_ALIGN_16 },
+ { X86::PMAXSDrr, X86::PMAXSDrm, TB_ALIGN_16 },
+ { X86::PMAXUDrr, X86::PMAXUDrm, TB_ALIGN_16 },
+ { X86::PMAXUWrr, X86::PMAXUWrm, TB_ALIGN_16 },
+ { X86::PMULDQrr, X86::PMULDQrm, TB_ALIGN_16 },
+ { X86::PMULHRSWrr128, X86::PMULHRSWrm128, TB_ALIGN_16 },
+ { X86::PMULHUWrr, X86::PMULHUWrm, TB_ALIGN_16 },
+ { X86::PMULHWrr, X86::PMULHWrm, TB_ALIGN_16 },
+ { X86::PMULLDrr, X86::PMULLDrm, TB_ALIGN_16 },
+ { X86::PMULLWrr, X86::PMULLWrm, TB_ALIGN_16 },
+ { X86::PMULUDQrr, X86::PMULUDQrm, TB_ALIGN_16 },
+ { X86::PORrr, X86::PORrm, TB_ALIGN_16 },
+ { X86::PSADBWrr, X86::PSADBWrm, TB_ALIGN_16 },
+ { X86::PSHUFBrr, X86::PSHUFBrm, TB_ALIGN_16 },
+ { X86::PSIGNBrr, X86::PSIGNBrm, TB_ALIGN_16 },
+ { X86::PSIGNWrr, X86::PSIGNWrm, TB_ALIGN_16 },
+ { X86::PSIGNDrr, X86::PSIGNDrm, TB_ALIGN_16 },
+ { X86::PSLLDrr, X86::PSLLDrm, TB_ALIGN_16 },
+ { X86::PSLLQrr, X86::PSLLQrm, TB_ALIGN_16 },
+ { X86::PSLLWrr, X86::PSLLWrm, TB_ALIGN_16 },
+ { X86::PSRADrr, X86::PSRADrm, TB_ALIGN_16 },
+ { X86::PSRAWrr, X86::PSRAWrm, TB_ALIGN_16 },
+ { X86::PSRLDrr, X86::PSRLDrm, TB_ALIGN_16 },
+ { X86::PSRLQrr, X86::PSRLQrm, TB_ALIGN_16 },
+ { X86::PSRLWrr, X86::PSRLWrm, TB_ALIGN_16 },
+ { X86::PSUBBrr, X86::PSUBBrm, TB_ALIGN_16 },
+ { X86::PSUBDrr, X86::PSUBDrm, TB_ALIGN_16 },
+ { X86::PSUBSBrr, X86::PSUBSBrm, TB_ALIGN_16 },
+ { X86::PSUBSWrr, X86::PSUBSWrm, TB_ALIGN_16 },
+ { X86::PSUBWrr, X86::PSUBWrm, TB_ALIGN_16 },
+ { X86::PUNPCKHBWrr, X86::PUNPCKHBWrm, TB_ALIGN_16 },
+ { X86::PUNPCKHDQrr, X86::PUNPCKHDQrm, TB_ALIGN_16 },
+ { X86::PUNPCKHQDQrr, X86::PUNPCKHQDQrm, TB_ALIGN_16 },
+ { X86::PUNPCKHWDrr, X86::PUNPCKHWDrm, TB_ALIGN_16 },
+ { X86::PUNPCKLBWrr, X86::PUNPCKLBWrm, TB_ALIGN_16 },
+ { X86::PUNPCKLDQrr, X86::PUNPCKLDQrm, TB_ALIGN_16 },
+ { X86::PUNPCKLQDQrr, X86::PUNPCKLQDQrm, TB_ALIGN_16 },
+ { X86::PUNPCKLWDrr, X86::PUNPCKLWDrm, TB_ALIGN_16 },
+ { X86::PXORrr, X86::PXORrm, TB_ALIGN_16 },
+ { X86::SBB32rr, X86::SBB32rm, 0 },
+ { X86::SBB64rr, X86::SBB64rm, 0 },
+ { X86::SHUFPDrri, X86::SHUFPDrmi, TB_ALIGN_16 },
+ { X86::SHUFPSrri, X86::SHUFPSrmi, TB_ALIGN_16 },
+ { X86::SUB16rr, X86::SUB16rm, 0 },
+ { X86::SUB32rr, X86::SUB32rm, 0 },
+ { X86::SUB64rr, X86::SUB64rm, 0 },
+ { X86::SUB8rr, X86::SUB8rm, 0 },
+ { X86::SUBPDrr, X86::SUBPDrm, TB_ALIGN_16 },
+ { X86::SUBPSrr, X86::SUBPSrm, TB_ALIGN_16 },
+ { X86::SUBSDrr, X86::SUBSDrm, 0 },
+ { X86::SUBSSrr, X86::SUBSSrm, 0 },
+ // FIXME: TEST*rr -> swapped operand of TEST*mr.
+ { X86::UNPCKHPDrr, X86::UNPCKHPDrm, TB_ALIGN_16 },
+ { X86::UNPCKHPSrr, X86::UNPCKHPSrm, TB_ALIGN_16 },
+ { X86::UNPCKLPDrr, X86::UNPCKLPDrm, TB_ALIGN_16 },
+ { X86::UNPCKLPSrr, X86::UNPCKLPSrm, TB_ALIGN_16 },
+ { X86::XOR16rr, X86::XOR16rm, 0 },
+ { X86::XOR32rr, X86::XOR32rm, 0 },
+ { X86::XOR64rr, X86::XOR64rm, 0 },
+ { X86::XOR8rr, X86::XOR8rm, 0 },
+ { X86::XORPDrr, X86::XORPDrm, TB_ALIGN_16 },
+ { X86::XORPSrr, X86::XORPSrm, TB_ALIGN_16 },
+ // AVX 128-bit versions of foldable instructions
+ { X86::VCVTSD2SSrr, X86::VCVTSD2SSrm, 0 },
+ { X86::Int_VCVTSD2SSrr, X86::Int_VCVTSD2SSrm, 0 },
+ { X86::VCVTSI2SD64rr, X86::VCVTSI2SD64rm, 0 },
+ { X86::Int_VCVTSI2SD64rr, X86::Int_VCVTSI2SD64rm, 0 },
+ { X86::VCVTSI2SDrr, X86::VCVTSI2SDrm, 0 },
+ { X86::Int_VCVTSI2SDrr, X86::Int_VCVTSI2SDrm, 0 },
+ { X86::VCVTSI2SS64rr, X86::VCVTSI2SS64rm, 0 },
+ { X86::Int_VCVTSI2SS64rr, X86::Int_VCVTSI2SS64rm, 0 },
+ { X86::VCVTSI2SSrr, X86::VCVTSI2SSrm, 0 },
+ { X86::Int_VCVTSI2SSrr, X86::Int_VCVTSI2SSrm, 0 },
+ { X86::VCVTSS2SDrr, X86::VCVTSS2SDrm, 0 },
+ { X86::Int_VCVTSS2SDrr, X86::Int_VCVTSS2SDrm, 0 },
+ { X86::VCVTTPD2DQrr, X86::VCVTTPD2DQXrm, 0 },
+ { X86::VCVTTPS2DQrr, X86::VCVTTPS2DQrm, 0 },
+ { X86::VRSQRTSSr, X86::VRSQRTSSm, 0 },
+ { X86::VSQRTSDr, X86::VSQRTSDm, 0 },
+ { X86::VSQRTSSr, X86::VSQRTSSm, 0 },
+ { X86::VADDPDrr, X86::VADDPDrm, 0 },
+ { X86::VADDPSrr, X86::VADDPSrm, 0 },
+ { X86::VADDSDrr, X86::VADDSDrm, 0 },
+ { X86::VADDSSrr, X86::VADDSSrm, 0 },
+ { X86::VADDSUBPDrr, X86::VADDSUBPDrm, 0 },
+ { X86::VADDSUBPSrr, X86::VADDSUBPSrm, 0 },
+ { X86::VANDNPDrr, X86::VANDNPDrm, 0 },
+ { X86::VANDNPSrr, X86::VANDNPSrm, 0 },
+ { X86::VANDPDrr, X86::VANDPDrm, 0 },
+ { X86::VANDPSrr, X86::VANDPSrm, 0 },
+ { X86::VBLENDPDrri, X86::VBLENDPDrmi, 0 },
+ { X86::VBLENDPSrri, X86::VBLENDPSrmi, 0 },
+ { X86::VBLENDVPDrr, X86::VBLENDVPDrm, 0 },
+ { X86::VBLENDVPSrr, X86::VBLENDVPSrm, 0 },
+ { X86::VCMPPDrri, X86::VCMPPDrmi, 0 },
+ { X86::VCMPPSrri, X86::VCMPPSrmi, 0 },
+ { X86::VCMPSDrr, X86::VCMPSDrm, 0 },
+ { X86::VCMPSSrr, X86::VCMPSSrm, 0 },
+ { X86::VDIVPDrr, X86::VDIVPDrm, 0 },
+ { X86::VDIVPSrr, X86::VDIVPSrm, 0 },
+ { X86::VDIVSDrr, X86::VDIVSDrm, 0 },
+ { X86::VDIVSSrr, X86::VDIVSSrm, 0 },
+ { X86::VFsANDNPDrr, X86::VFsANDNPDrm, TB_ALIGN_16 },
+ { X86::VFsANDNPSrr, X86::VFsANDNPSrm, TB_ALIGN_16 },
+ { X86::VFsANDPDrr, X86::VFsANDPDrm, TB_ALIGN_16 },
+ { X86::VFsANDPSrr, X86::VFsANDPSrm, TB_ALIGN_16 },
+ { X86::VFsORPDrr, X86::VFsORPDrm, TB_ALIGN_16 },
+ { X86::VFsORPSrr, X86::VFsORPSrm, TB_ALIGN_16 },
+ { X86::VFsXORPDrr, X86::VFsXORPDrm, TB_ALIGN_16 },
+ { X86::VFsXORPSrr, X86::VFsXORPSrm, TB_ALIGN_16 },
+ { X86::VHADDPDrr, X86::VHADDPDrm, 0 },
+ { X86::VHADDPSrr, X86::VHADDPSrm, 0 },
+ { X86::VHSUBPDrr, X86::VHSUBPDrm, 0 },
+ { X86::VHSUBPSrr, X86::VHSUBPSrm, 0 },
+ { X86::Int_VCMPSDrr, X86::Int_VCMPSDrm, 0 },
+ { X86::Int_VCMPSSrr, X86::Int_VCMPSSrm, 0 },
+ { X86::VMAXPDrr, X86::VMAXPDrm, 0 },
+ { X86::VMAXPSrr, X86::VMAXPSrm, 0 },
+ { X86::VMAXSDrr, X86::VMAXSDrm, 0 },
+ { X86::VMAXSSrr, X86::VMAXSSrm, 0 },
+ { X86::VMINPDrr, X86::VMINPDrm, 0 },
+ { X86::VMINPSrr, X86::VMINPSrm, 0 },
+ { X86::VMINSDrr, X86::VMINSDrm, 0 },
+ { X86::VMINSSrr, X86::VMINSSrm, 0 },
+ { X86::VMPSADBWrri, X86::VMPSADBWrmi, 0 },
+ { X86::VMULPDrr, X86::VMULPDrm, 0 },
+ { X86::VMULPSrr, X86::VMULPSrm, 0 },
+ { X86::VMULSDrr, X86::VMULSDrm, 0 },
+ { X86::VMULSSrr, X86::VMULSSrm, 0 },
+ { X86::VORPDrr, X86::VORPDrm, 0 },
+ { X86::VORPSrr, X86::VORPSrm, 0 },
+ { X86::VPACKSSDWrr, X86::VPACKSSDWrm, 0 },
+ { X86::VPACKSSWBrr, X86::VPACKSSWBrm, 0 },
+ { X86::VPACKUSDWrr, X86::VPACKUSDWrm, 0 },
+ { X86::VPACKUSWBrr, X86::VPACKUSWBrm, 0 },
+ { X86::VPADDBrr, X86::VPADDBrm, 0 },
+ { X86::VPADDDrr, X86::VPADDDrm, 0 },
+ { X86::VPADDQrr, X86::VPADDQrm, 0 },
+ { X86::VPADDSBrr, X86::VPADDSBrm, 0 },
+ { X86::VPADDSWrr, X86::VPADDSWrm, 0 },
+ { X86::VPADDUSBrr, X86::VPADDUSBrm, 0 },
+ { X86::VPADDUSWrr, X86::VPADDUSWrm, 0 },
+ { X86::VPADDWrr, X86::VPADDWrm, 0 },
+ { X86::VPALIGNR128rr, X86::VPALIGNR128rm, 0 },
+ { X86::VPANDNrr, X86::VPANDNrm, 0 },
+ { X86::VPANDrr, X86::VPANDrm, 0 },
+ { X86::VPAVGBrr, X86::VPAVGBrm, 0 },
+ { X86::VPAVGWrr, X86::VPAVGWrm, 0 },
+ { X86::VPBLENDWrri, X86::VPBLENDWrmi, 0 },
+ { X86::VPCMPEQBrr, X86::VPCMPEQBrm, 0 },
+ { X86::VPCMPEQDrr, X86::VPCMPEQDrm, 0 },
+ { X86::VPCMPEQQrr, X86::VPCMPEQQrm, 0 },
+ { X86::VPCMPEQWrr, X86::VPCMPEQWrm, 0 },
+ { X86::VPCMPGTBrr, X86::VPCMPGTBrm, 0 },
+ { X86::VPCMPGTDrr, X86::VPCMPGTDrm, 0 },
+ { X86::VPCMPGTQrr, X86::VPCMPGTQrm, 0 },
+ { X86::VPCMPGTWrr, X86::VPCMPGTWrm, 0 },
+ { X86::VPHADDDrr, X86::VPHADDDrm, 0 },
+ { X86::VPHADDSWrr128, X86::VPHADDSWrm128, 0 },
+ { X86::VPHADDWrr, X86::VPHADDWrm, 0 },
+ { X86::VPHSUBDrr, X86::VPHSUBDrm, 0 },
+ { X86::VPHSUBSWrr128, X86::VPHSUBSWrm128, 0 },
+ { X86::VPHSUBWrr, X86::VPHSUBWrm, 0 },
+ { X86::VPERMILPDrr, X86::VPERMILPDrm, 0 },
+ { X86::VPERMILPSrr, X86::VPERMILPSrm, 0 },
+ { X86::VPINSRWrri, X86::VPINSRWrmi, 0 },
+ { X86::VPMADDUBSWrr128, X86::VPMADDUBSWrm128, 0 },
+ { X86::VPMADDWDrr, X86::VPMADDWDrm, 0 },
+ { X86::VPMAXSWrr, X86::VPMAXSWrm, 0 },
+ { X86::VPMAXUBrr, X86::VPMAXUBrm, 0 },
+ { X86::VPMINSWrr, X86::VPMINSWrm, 0 },
+ { X86::VPMINUBrr, X86::VPMINUBrm, 0 },
+ { X86::VPMINSBrr, X86::VPMINSBrm, 0 },
+ { X86::VPMINSDrr, X86::VPMINSDrm, 0 },
+ { X86::VPMINUDrr, X86::VPMINUDrm, 0 },
+ { X86::VPMINUWrr, X86::VPMINUWrm, 0 },
+ { X86::VPMAXSBrr, X86::VPMAXSBrm, 0 },
+ { X86::VPMAXSDrr, X86::VPMAXSDrm, 0 },
+ { X86::VPMAXUDrr, X86::VPMAXUDrm, 0 },
+ { X86::VPMAXUWrr, X86::VPMAXUWrm, 0 },
+ { X86::VPMULDQrr, X86::VPMULDQrm, 0 },
+ { X86::VPMULHRSWrr128, X86::VPMULHRSWrm128, 0 },
+ { X86::VPMULHUWrr, X86::VPMULHUWrm, 0 },
+ { X86::VPMULHWrr, X86::VPMULHWrm, 0 },
+ { X86::VPMULLDrr, X86::VPMULLDrm, 0 },
+ { X86::VPMULLWrr, X86::VPMULLWrm, 0 },
+ { X86::VPMULUDQrr, X86::VPMULUDQrm, 0 },
+ { X86::VPORrr, X86::VPORrm, 0 },
+ { X86::VPSADBWrr, X86::VPSADBWrm, 0 },
+ { X86::VPSHUFBrr, X86::VPSHUFBrm, 0 },
+ { X86::VPSIGNBrr, X86::VPSIGNBrm, 0 },
+ { X86::VPSIGNWrr, X86::VPSIGNWrm, 0 },
+ { X86::VPSIGNDrr, X86::VPSIGNDrm, 0 },
+ { X86::VPSLLDrr, X86::VPSLLDrm, 0 },
+ { X86::VPSLLQrr, X86::VPSLLQrm, 0 },
+ { X86::VPSLLWrr, X86::VPSLLWrm, 0 },
+ { X86::VPSRADrr, X86::VPSRADrm, 0 },
+ { X86::VPSRAWrr, X86::VPSRAWrm, 0 },
+ { X86::VPSRLDrr, X86::VPSRLDrm, 0 },
+ { X86::VPSRLQrr, X86::VPSRLQrm, 0 },
+ { X86::VPSRLWrr, X86::VPSRLWrm, 0 },
+ { X86::VPSUBBrr, X86::VPSUBBrm, 0 },
+ { X86::VPSUBDrr, X86::VPSUBDrm, 0 },
+ { X86::VPSUBSBrr, X86::VPSUBSBrm, 0 },
+ { X86::VPSUBSWrr, X86::VPSUBSWrm, 0 },
+ { X86::VPSUBWrr, X86::VPSUBWrm, 0 },
+ { X86::VPUNPCKHBWrr, X86::VPUNPCKHBWrm, 0 },
+ { X86::VPUNPCKHDQrr, X86::VPUNPCKHDQrm, 0 },
+ { X86::VPUNPCKHQDQrr, X86::VPUNPCKHQDQrm, 0 },
+ { X86::VPUNPCKHWDrr, X86::VPUNPCKHWDrm, 0 },
+ { X86::VPUNPCKLBWrr, X86::VPUNPCKLBWrm, 0 },
+ { X86::VPUNPCKLDQrr, X86::VPUNPCKLDQrm, 0 },
+ { X86::VPUNPCKLQDQrr, X86::VPUNPCKLQDQrm, 0 },
+ { X86::VPUNPCKLWDrr, X86::VPUNPCKLWDrm, 0 },
+ { X86::VPXORrr, X86::VPXORrm, 0 },
+ { X86::VSHUFPDrri, X86::VSHUFPDrmi, 0 },
+ { X86::VSHUFPSrri, X86::VSHUFPSrmi, 0 },
+ { X86::VSUBPDrr, X86::VSUBPDrm, 0 },
+ { X86::VSUBPSrr, X86::VSUBPSrm, 0 },
+ { X86::VSUBSDrr, X86::VSUBSDrm, 0 },
+ { X86::VSUBSSrr, X86::VSUBSSrm, 0 },
+ { X86::VUNPCKHPDrr, X86::VUNPCKHPDrm, 0 },
+ { X86::VUNPCKHPSrr, X86::VUNPCKHPSrm, 0 },
+ { X86::VUNPCKLPDrr, X86::VUNPCKLPDrm, 0 },
+ { X86::VUNPCKLPSrr, X86::VUNPCKLPSrm, 0 },
+ { X86::VXORPDrr, X86::VXORPDrm, 0 },
+ { X86::VXORPSrr, X86::VXORPSrm, 0 },
+ // AVX 256-bit foldable instructions
+ { X86::VADDPDYrr, X86::VADDPDYrm, 0 },
+ { X86::VADDPSYrr, X86::VADDPSYrm, 0 },
+ { X86::VADDSUBPDYrr, X86::VADDSUBPDYrm, 0 },
+ { X86::VADDSUBPSYrr, X86::VADDSUBPSYrm, 0 },
+ { X86::VANDNPDYrr, X86::VANDNPDYrm, 0 },
+ { X86::VANDNPSYrr, X86::VANDNPSYrm, 0 },
+ { X86::VANDPDYrr, X86::VANDPDYrm, 0 },
+ { X86::VANDPSYrr, X86::VANDPSYrm, 0 },
+ { X86::VBLENDPDYrri, X86::VBLENDPDYrmi, 0 },
+ { X86::VBLENDPSYrri, X86::VBLENDPSYrmi, 0 },
+ { X86::VBLENDVPDYrr, X86::VBLENDVPDYrm, 0 },
+ { X86::VBLENDVPSYrr, X86::VBLENDVPSYrm, 0 },
+ { X86::VCMPPDYrri, X86::VCMPPDYrmi, 0 },
+ { X86::VCMPPSYrri, X86::VCMPPSYrmi, 0 },
+ { X86::VDIVPDYrr, X86::VDIVPDYrm, 0 },
+ { X86::VDIVPSYrr, X86::VDIVPSYrm, 0 },
+ { X86::VHADDPDYrr, X86::VHADDPDYrm, 0 },
+ { X86::VHADDPSYrr, X86::VHADDPSYrm, 0 },
+ { X86::VHSUBPDYrr, X86::VHSUBPDYrm, 0 },
+ { X86::VHSUBPSYrr, X86::VHSUBPSYrm, 0 },
+ { X86::VINSERTF128rr, X86::VINSERTF128rm, 0 },
+ { X86::VMAXPDYrr, X86::VMAXPDYrm, 0 },
+ { X86::VMAXPSYrr, X86::VMAXPSYrm, 0 },
+ { X86::VMINPDYrr, X86::VMINPDYrm, 0 },
+ { X86::VMINPSYrr, X86::VMINPSYrm, 0 },
+ { X86::VMULPDYrr, X86::VMULPDYrm, 0 },
+ { X86::VMULPSYrr, X86::VMULPSYrm, 0 },
+ { X86::VORPDYrr, X86::VORPDYrm, 0 },
+ { X86::VORPSYrr, X86::VORPSYrm, 0 },
+ { X86::VPERM2F128rr, X86::VPERM2F128rm, 0 },
+ { X86::VPERMILPDYrr, X86::VPERMILPDYrm, 0 },
+ { X86::VPERMILPSYrr, X86::VPERMILPSYrm, 0 },
+ { X86::VSHUFPDYrri, X86::VSHUFPDYrmi, 0 },
+ { X86::VSHUFPSYrri, X86::VSHUFPSYrmi, 0 },
+ { X86::VSUBPDYrr, X86::VSUBPDYrm, 0 },
+ { X86::VSUBPSYrr, X86::VSUBPSYrm, 0 },
+ { X86::VUNPCKHPDYrr, X86::VUNPCKHPDYrm, 0 },
+ { X86::VUNPCKHPSYrr, X86::VUNPCKHPSYrm, 0 },
+ { X86::VUNPCKLPDYrr, X86::VUNPCKLPDYrm, 0 },
+ { X86::VUNPCKLPSYrr, X86::VUNPCKLPSYrm, 0 },
+ { X86::VXORPDYrr, X86::VXORPDYrm, 0 },
+ { X86::VXORPSYrr, X86::VXORPSYrm, 0 },
+ // AVX2 foldable instructions
+ { X86::VINSERTI128rr, X86::VINSERTI128rm, 0 },
+ { X86::VPACKSSDWYrr, X86::VPACKSSDWYrm, 0 },
+ { X86::VPACKSSWBYrr, X86::VPACKSSWBYrm, 0 },
+ { X86::VPACKUSDWYrr, X86::VPACKUSDWYrm, 0 },
+ { X86::VPACKUSWBYrr, X86::VPACKUSWBYrm, 0 },
+ { X86::VPADDBYrr, X86::VPADDBYrm, 0 },
+ { X86::VPADDDYrr, X86::VPADDDYrm, 0 },
+ { X86::VPADDQYrr, X86::VPADDQYrm, 0 },
+ { X86::VPADDSBYrr, X86::VPADDSBYrm, 0 },
+ { X86::VPADDSWYrr, X86::VPADDSWYrm, 0 },
+ { X86::VPADDUSBYrr, X86::VPADDUSBYrm, 0 },
+ { X86::VPADDUSWYrr, X86::VPADDUSWYrm, 0 },
+ { X86::VPADDWYrr, X86::VPADDWYrm, 0 },
+ { X86::VPALIGNR256rr, X86::VPALIGNR256rm, 0 },
+ { X86::VPANDNYrr, X86::VPANDNYrm, 0 },
+ { X86::VPANDYrr, X86::VPANDYrm, 0 },
+ { X86::VPAVGBYrr, X86::VPAVGBYrm, 0 },
+ { X86::VPAVGWYrr, X86::VPAVGWYrm, 0 },
+ { X86::VPBLENDDrri, X86::VPBLENDDrmi, 0 },
+ { X86::VPBLENDDYrri, X86::VPBLENDDYrmi, 0 },
+ { X86::VPBLENDWYrri, X86::VPBLENDWYrmi, 0 },
+ { X86::VPCMPEQBYrr, X86::VPCMPEQBYrm, 0 },
+ { X86::VPCMPEQDYrr, X86::VPCMPEQDYrm, 0 },
+ { X86::VPCMPEQQYrr, X86::VPCMPEQQYrm, 0 },
+ { X86::VPCMPEQWYrr, X86::VPCMPEQWYrm, 0 },
+ { X86::VPCMPGTBYrr, X86::VPCMPGTBYrm, 0 },
+ { X86::VPCMPGTDYrr, X86::VPCMPGTDYrm, 0 },
+ { X86::VPCMPGTQYrr, X86::VPCMPGTQYrm, 0 },
+ { X86::VPCMPGTWYrr, X86::VPCMPGTWYrm, 0 },
+ { X86::VPERM2I128rr, X86::VPERM2I128rm, 0 },
+ { X86::VPERMDYrr, X86::VPERMDYrm, 0 },
+ { X86::VPERMPDYri, X86::VPERMPDYmi, 0 },
+ { X86::VPERMPSYrr, X86::VPERMPSYrm, 0 },
+ { X86::VPERMQYri, X86::VPERMQYmi, 0 },
+ { X86::VPHADDDYrr, X86::VPHADDDYrm, 0 },
+ { X86::VPHADDSWrr256, X86::VPHADDSWrm256, 0 },
+ { X86::VPHADDWYrr, X86::VPHADDWYrm, 0 },
+ { X86::VPHSUBDYrr, X86::VPHSUBDYrm, 0 },
+ { X86::VPHSUBSWrr256, X86::VPHSUBSWrm256, 0 },
+ { X86::VPHSUBWYrr, X86::VPHSUBWYrm, 0 },
+ { X86::VPMADDUBSWrr256, X86::VPMADDUBSWrm256, 0 },
+ { X86::VPMADDWDYrr, X86::VPMADDWDYrm, 0 },
+ { X86::VPMAXSWYrr, X86::VPMAXSWYrm, 0 },
+ { X86::VPMAXUBYrr, X86::VPMAXUBYrm, 0 },
+ { X86::VPMINSWYrr, X86::VPMINSWYrm, 0 },
+ { X86::VPMINUBYrr, X86::VPMINUBYrm, 0 },
+ { X86::VPMINSBYrr, X86::VPMINSBYrm, 0 },
+ { X86::VPMINSDYrr, X86::VPMINSDYrm, 0 },
+ { X86::VPMINUDYrr, X86::VPMINUDYrm, 0 },
+ { X86::VPMINUWYrr, X86::VPMINUWYrm, 0 },
+ { X86::VPMAXSBYrr, X86::VPMAXSBYrm, 0 },
+ { X86::VPMAXSDYrr, X86::VPMAXSDYrm, 0 },
+ { X86::VPMAXUDYrr, X86::VPMAXUDYrm, 0 },
+ { X86::VPMAXUWYrr, X86::VPMAXUWYrm, 0 },
+ { X86::VMPSADBWYrri, X86::VMPSADBWYrmi, 0 },
+ { X86::VPMULDQYrr, X86::VPMULDQYrm, 0 },
+ { X86::VPMULHRSWrr256, X86::VPMULHRSWrm256, 0 },
+ { X86::VPMULHUWYrr, X86::VPMULHUWYrm, 0 },
+ { X86::VPMULHWYrr, X86::VPMULHWYrm, 0 },
+ { X86::VPMULLDYrr, X86::VPMULLDYrm, 0 },
+ { X86::VPMULLWYrr, X86::VPMULLWYrm, 0 },
+ { X86::VPMULUDQYrr, X86::VPMULUDQYrm, 0 },
+ { X86::VPORYrr, X86::VPORYrm, 0 },
+ { X86::VPSADBWYrr, X86::VPSADBWYrm, 0 },
+ { X86::VPSHUFBYrr, X86::VPSHUFBYrm, 0 },
+ { X86::VPSIGNBYrr, X86::VPSIGNBYrm, 0 },
+ { X86::VPSIGNWYrr, X86::VPSIGNWYrm, 0 },
+ { X86::VPSIGNDYrr, X86::VPSIGNDYrm, 0 },
+ { X86::VPSLLDYrr, X86::VPSLLDYrm, 0 },
+ { X86::VPSLLQYrr, X86::VPSLLQYrm, 0 },
+ { X86::VPSLLWYrr, X86::VPSLLWYrm, 0 },
+ { X86::VPSLLVDrr, X86::VPSLLVDrm, 0 },
+ { X86::VPSLLVDYrr, X86::VPSLLVDYrm, 0 },
+ { X86::VPSLLVQrr, X86::VPSLLVQrm, 0 },
+ { X86::VPSLLVQYrr, X86::VPSLLVQYrm, 0 },
+ { X86::VPSRADYrr, X86::VPSRADYrm, 0 },
+ { X86::VPSRAWYrr, X86::VPSRAWYrm, 0 },
+ { X86::VPSRAVDrr, X86::VPSRAVDrm, 0 },
+ { X86::VPSRAVDYrr, X86::VPSRAVDYrm, 0 },
+ { X86::VPSRLDYrr, X86::VPSRLDYrm, 0 },
+ { X86::VPSRLQYrr, X86::VPSRLQYrm, 0 },
+ { X86::VPSRLWYrr, X86::VPSRLWYrm, 0 },
+ { X86::VPSRLVDrr, X86::VPSRLVDrm, 0 },
+ { X86::VPSRLVDYrr, X86::VPSRLVDYrm, 0 },
+ { X86::VPSRLVQrr, X86::VPSRLVQrm, 0 },
+ { X86::VPSRLVQYrr, X86::VPSRLVQYrm, 0 },
+ { X86::VPSUBBYrr, X86::VPSUBBYrm, 0 },
+ { X86::VPSUBDYrr, X86::VPSUBDYrm, 0 },
+ { X86::VPSUBSBYrr, X86::VPSUBSBYrm, 0 },
+ { X86::VPSUBSWYrr, X86::VPSUBSWYrm, 0 },
+ { X86::VPSUBWYrr, X86::VPSUBWYrm, 0 },
+ { X86::VPUNPCKHBWYrr, X86::VPUNPCKHBWYrm, 0 },
+ { X86::VPUNPCKHDQYrr, X86::VPUNPCKHDQYrm, 0 },
+ { X86::VPUNPCKHQDQYrr, X86::VPUNPCKHQDQYrm, 0 },
+ { X86::VPUNPCKHWDYrr, X86::VPUNPCKHWDYrm, 0 },
+ { X86::VPUNPCKLBWYrr, X86::VPUNPCKLBWYrm, 0 },
+ { X86::VPUNPCKLDQYrr, X86::VPUNPCKLDQYrm, 0 },
+ { X86::VPUNPCKLQDQYrr, X86::VPUNPCKLQDQYrm, 0 },
+ { X86::VPUNPCKLWDYrr, X86::VPUNPCKLWDYrm, 0 },
+ { X86::VPXORYrr, X86::VPXORYrm, 0 },
+ // FIXME: add AVX 256-bit foldable instructions
+
+ // FMA4 foldable patterns
+ { X86::VFMADDSS4rr, X86::VFMADDSS4mr, 0 },
+ { X86::VFMADDSD4rr, X86::VFMADDSD4mr, 0 },
+ { X86::VFMADDPS4rr, X86::VFMADDPS4mr, TB_ALIGN_16 },
+ { X86::VFMADDPD4rr, X86::VFMADDPD4mr, TB_ALIGN_16 },
+ { X86::VFMADDPS4rrY, X86::VFMADDPS4mrY, TB_ALIGN_32 },
+ { X86::VFMADDPD4rrY, X86::VFMADDPD4mrY, TB_ALIGN_32 },
+ { X86::VFNMADDSS4rr, X86::VFNMADDSS4mr, 0 },
+ { X86::VFNMADDSD4rr, X86::VFNMADDSD4mr, 0 },
+ { X86::VFNMADDPS4rr, X86::VFNMADDPS4mr, TB_ALIGN_16 },
+ { X86::VFNMADDPD4rr, X86::VFNMADDPD4mr, TB_ALIGN_16 },
+ { X86::VFNMADDPS4rrY, X86::VFNMADDPS4mrY, TB_ALIGN_32 },
+ { X86::VFNMADDPD4rrY, X86::VFNMADDPD4mrY, TB_ALIGN_32 },
+ { X86::VFMSUBSS4rr, X86::VFMSUBSS4mr, 0 },
+ { X86::VFMSUBSD4rr, X86::VFMSUBSD4mr, 0 },
+ { X86::VFMSUBPS4rr, X86::VFMSUBPS4mr, TB_ALIGN_16 },
+ { X86::VFMSUBPD4rr, X86::VFMSUBPD4mr, TB_ALIGN_16 },
+ { X86::VFMSUBPS4rrY, X86::VFMSUBPS4mrY, TB_ALIGN_32 },
+ { X86::VFMSUBPD4rrY, X86::VFMSUBPD4mrY, TB_ALIGN_32 },
+ { X86::VFNMSUBSS4rr, X86::VFNMSUBSS4mr, 0 },
+ { X86::VFNMSUBSD4rr, X86::VFNMSUBSD4mr, 0 },
+ { X86::VFNMSUBPS4rr, X86::VFNMSUBPS4mr, TB_ALIGN_16 },
+ { X86::VFNMSUBPD4rr, X86::VFNMSUBPD4mr, TB_ALIGN_16 },
+ { X86::VFNMSUBPS4rrY, X86::VFNMSUBPS4mrY, TB_ALIGN_32 },
+ { X86::VFNMSUBPD4rrY, X86::VFNMSUBPD4mrY, TB_ALIGN_32 },
+ { X86::VFMADDSUBPS4rr, X86::VFMADDSUBPS4mr, TB_ALIGN_16 },
+ { X86::VFMADDSUBPD4rr, X86::VFMADDSUBPD4mr, TB_ALIGN_16 },
+ { X86::VFMADDSUBPS4rrY, X86::VFMADDSUBPS4mrY, TB_ALIGN_32 },
+ { X86::VFMADDSUBPD4rrY, X86::VFMADDSUBPD4mrY, TB_ALIGN_32 },
+ { X86::VFMSUBADDPS4rr, X86::VFMSUBADDPS4mr, TB_ALIGN_16 },
+ { X86::VFMSUBADDPD4rr, X86::VFMSUBADDPD4mr, TB_ALIGN_16 },
+ { X86::VFMSUBADDPS4rrY, X86::VFMSUBADDPS4mrY, TB_ALIGN_32 },
+ { X86::VFMSUBADDPD4rrY, X86::VFMSUBADDPD4mrY, TB_ALIGN_32 },
+
+ // BMI/BMI2 foldable instructions
+ { X86::ANDN32rr, X86::ANDN32rm, 0 },
+ { X86::ANDN64rr, X86::ANDN64rm, 0 },
+ { X86::MULX32rr, X86::MULX32rm, 0 },
+ { X86::MULX64rr, X86::MULX64rm, 0 },
+ { X86::PDEP32rr, X86::PDEP32rm, 0 },
+ { X86::PDEP64rr, X86::PDEP64rm, 0 },
+ { X86::PEXT32rr, X86::PEXT32rm, 0 },
+ { X86::PEXT64rr, X86::PEXT64rm, 0 },
+ };
+
+ for (unsigned i = 0, e = array_lengthof(OpTbl2); i != e; ++i) {
+ unsigned RegOp = OpTbl2[i].RegOp;
+ unsigned MemOp = OpTbl2[i].MemOp;
+ unsigned Flags = OpTbl2[i].Flags;
+ AddTableEntry(RegOp2MemOpTable2, MemOp2RegOpTable,
+ RegOp, MemOp,
+ // Index 2, folded load
+ Flags | TB_INDEX_2 | TB_FOLDED_LOAD);
+ }
+
+ static const X86OpTblEntry OpTbl3[] = {
+ // FMA foldable instructions
+ { X86::VFMADDSSr231r, X86::VFMADDSSr231m, 0 },
+ { X86::VFMADDSDr231r, X86::VFMADDSDr231m, 0 },
+ { X86::VFMADDSSr132r, X86::VFMADDSSr132m, 0 },
+ { X86::VFMADDSDr132r, X86::VFMADDSDr132m, 0 },
+ { X86::VFMADDSSr213r, X86::VFMADDSSr213m, 0 },
+ { X86::VFMADDSDr213r, X86::VFMADDSDr213m, 0 },
+ { X86::VFMADDSSr213r_Int, X86::VFMADDSSr213m_Int, 0 },
+ { X86::VFMADDSDr213r_Int, X86::VFMADDSDr213m_Int, 0 },
+
+ { X86::VFMADDPSr231r, X86::VFMADDPSr231m, TB_ALIGN_16 },
+ { X86::VFMADDPDr231r, X86::VFMADDPDr231m, TB_ALIGN_16 },
+ { X86::VFMADDPSr132r, X86::VFMADDPSr132m, TB_ALIGN_16 },
+ { X86::VFMADDPDr132r, X86::VFMADDPDr132m, TB_ALIGN_16 },
+ { X86::VFMADDPSr213r, X86::VFMADDPSr213m, TB_ALIGN_16 },
+ { X86::VFMADDPDr213r, X86::VFMADDPDr213m, TB_ALIGN_16 },
+ { X86::VFMADDPSr231rY, X86::VFMADDPSr231mY, TB_ALIGN_32 },
+ { X86::VFMADDPDr231rY, X86::VFMADDPDr231mY, TB_ALIGN_32 },
+ { X86::VFMADDPSr132rY, X86::VFMADDPSr132mY, TB_ALIGN_32 },
+ { X86::VFMADDPDr132rY, X86::VFMADDPDr132mY, TB_ALIGN_32 },
+ { X86::VFMADDPSr213rY, X86::VFMADDPSr213mY, TB_ALIGN_32 },
+ { X86::VFMADDPDr213rY, X86::VFMADDPDr213mY, TB_ALIGN_32 },
+
+ { X86::VFNMADDSSr231r, X86::VFNMADDSSr231m, 0 },
+ { X86::VFNMADDSDr231r, X86::VFNMADDSDr231m, 0 },
+ { X86::VFNMADDSSr132r, X86::VFNMADDSSr132m, 0 },
+ { X86::VFNMADDSDr132r, X86::VFNMADDSDr132m, 0 },
+ { X86::VFNMADDSSr213r, X86::VFNMADDSSr213m, 0 },
+ { X86::VFNMADDSDr213r, X86::VFNMADDSDr213m, 0 },
+ { X86::VFNMADDSSr213r_Int, X86::VFNMADDSSr213m_Int, 0 },
+ { X86::VFNMADDSDr213r_Int, X86::VFNMADDSDr213m_Int, 0 },
+
+ { X86::VFNMADDPSr231r, X86::VFNMADDPSr231m, TB_ALIGN_16 },
+ { X86::VFNMADDPDr231r, X86::VFNMADDPDr231m, TB_ALIGN_16 },
+ { X86::VFNMADDPSr132r, X86::VFNMADDPSr132m, TB_ALIGN_16 },
+ { X86::VFNMADDPDr132r, X86::VFNMADDPDr132m, TB_ALIGN_16 },
+ { X86::VFNMADDPSr213r, X86::VFNMADDPSr213m, TB_ALIGN_16 },
+ { X86::VFNMADDPDr213r, X86::VFNMADDPDr213m, TB_ALIGN_16 },
+ { X86::VFNMADDPSr231rY, X86::VFNMADDPSr231mY, TB_ALIGN_32 },
+ { X86::VFNMADDPDr231rY, X86::VFNMADDPDr231mY, TB_ALIGN_32 },
+ { X86::VFNMADDPSr132rY, X86::VFNMADDPSr132mY, TB_ALIGN_32 },
+ { X86::VFNMADDPDr132rY, X86::VFNMADDPDr132mY, TB_ALIGN_32 },
+ { X86::VFNMADDPSr213rY, X86::VFNMADDPSr213mY, TB_ALIGN_32 },
+ { X86::VFNMADDPDr213rY, X86::VFNMADDPDr213mY, TB_ALIGN_32 },
+
+ { X86::VFMSUBSSr231r, X86::VFMSUBSSr231m, 0 },
+ { X86::VFMSUBSDr231r, X86::VFMSUBSDr231m, 0 },
+ { X86::VFMSUBSSr132r, X86::VFMSUBSSr132m, 0 },
+ { X86::VFMSUBSDr132r, X86::VFMSUBSDr132m, 0 },
+ { X86::VFMSUBSSr213r, X86::VFMSUBSSr213m, 0 },
+ { X86::VFMSUBSDr213r, X86::VFMSUBSDr213m, 0 },
+ { X86::VFMSUBSSr213r_Int, X86::VFMSUBSSr213m_Int, 0 },
+ { X86::VFMSUBSDr213r_Int, X86::VFMSUBSDr213m_Int, 0 },
+
+ { X86::VFMSUBPSr231r, X86::VFMSUBPSr231m, TB_ALIGN_16 },
+ { X86::VFMSUBPDr231r, X86::VFMSUBPDr231m, TB_ALIGN_16 },
+ { X86::VFMSUBPSr132r, X86::VFMSUBPSr132m, TB_ALIGN_16 },
+ { X86::VFMSUBPDr132r, X86::VFMSUBPDr132m, TB_ALIGN_16 },
+ { X86::VFMSUBPSr213r, X86::VFMSUBPSr213m, TB_ALIGN_16 },
+ { X86::VFMSUBPDr213r, X86::VFMSUBPDr213m, TB_ALIGN_16 },
+ { X86::VFMSUBPSr231rY, X86::VFMSUBPSr231mY, TB_ALIGN_32 },
+ { X86::VFMSUBPDr231rY, X86::VFMSUBPDr231mY, TB_ALIGN_32 },
+ { X86::VFMSUBPSr132rY, X86::VFMSUBPSr132mY, TB_ALIGN_32 },
+ { X86::VFMSUBPDr132rY, X86::VFMSUBPDr132mY, TB_ALIGN_32 },
+ { X86::VFMSUBPSr213rY, X86::VFMSUBPSr213mY, TB_ALIGN_32 },
+ { X86::VFMSUBPDr213rY, X86::VFMSUBPDr213mY, TB_ALIGN_32 },
+
+ { X86::VFNMSUBSSr231r, X86::VFNMSUBSSr231m, 0 },
+ { X86::VFNMSUBSDr231r, X86::VFNMSUBSDr231m, 0 },
+ { X86::VFNMSUBSSr132r, X86::VFNMSUBSSr132m, 0 },
+ { X86::VFNMSUBSDr132r, X86::VFNMSUBSDr132m, 0 },
+ { X86::VFNMSUBSSr213r, X86::VFNMSUBSSr213m, 0 },
+ { X86::VFNMSUBSDr213r, X86::VFNMSUBSDr213m, 0 },
+ { X86::VFNMSUBSSr213r_Int, X86::VFNMSUBSSr213m_Int, 0 },
+ { X86::VFNMSUBSDr213r_Int, X86::VFNMSUBSDr213m_Int, 0 },
+
+ { X86::VFNMSUBPSr231r, X86::VFNMSUBPSr231m, TB_ALIGN_16 },
+ { X86::VFNMSUBPDr231r, X86::VFNMSUBPDr231m, TB_ALIGN_16 },
+ { X86::VFNMSUBPSr132r, X86::VFNMSUBPSr132m, TB_ALIGN_16 },
+ { X86::VFNMSUBPDr132r, X86::VFNMSUBPDr132m, TB_ALIGN_16 },
+ { X86::VFNMSUBPSr213r, X86::VFNMSUBPSr213m, TB_ALIGN_16 },
+ { X86::VFNMSUBPDr213r, X86::VFNMSUBPDr213m, TB_ALIGN_16 },
+ { X86::VFNMSUBPSr231rY, X86::VFNMSUBPSr231mY, TB_ALIGN_32 },
+ { X86::VFNMSUBPDr231rY, X86::VFNMSUBPDr231mY, TB_ALIGN_32 },
+ { X86::VFNMSUBPSr132rY, X86::VFNMSUBPSr132mY, TB_ALIGN_32 },
+ { X86::VFNMSUBPDr132rY, X86::VFNMSUBPDr132mY, TB_ALIGN_32 },
+ { X86::VFNMSUBPSr213rY, X86::VFNMSUBPSr213mY, TB_ALIGN_32 },
+ { X86::VFNMSUBPDr213rY, X86::VFNMSUBPDr213mY, TB_ALIGN_32 },
+
+ { X86::VFMADDSUBPSr231r, X86::VFMADDSUBPSr231m, TB_ALIGN_16 },
+ { X86::VFMADDSUBPDr231r, X86::VFMADDSUBPDr231m, TB_ALIGN_16 },
+ { X86::VFMADDSUBPSr132r, X86::VFMADDSUBPSr132m, TB_ALIGN_16 },
+ { X86::VFMADDSUBPDr132r, X86::VFMADDSUBPDr132m, TB_ALIGN_16 },
+ { X86::VFMADDSUBPSr213r, X86::VFMADDSUBPSr213m, TB_ALIGN_16 },
+ { X86::VFMADDSUBPDr213r, X86::VFMADDSUBPDr213m, TB_ALIGN_16 },
+ { X86::VFMADDSUBPSr231rY, X86::VFMADDSUBPSr231mY, TB_ALIGN_32 },
+ { X86::VFMADDSUBPDr231rY, X86::VFMADDSUBPDr231mY, TB_ALIGN_32 },
+ { X86::VFMADDSUBPSr132rY, X86::VFMADDSUBPSr132mY, TB_ALIGN_32 },
+ { X86::VFMADDSUBPDr132rY, X86::VFMADDSUBPDr132mY, TB_ALIGN_32 },
+ { X86::VFMADDSUBPSr213rY, X86::VFMADDSUBPSr213mY, TB_ALIGN_32 },
+ { X86::VFMADDSUBPDr213rY, X86::VFMADDSUBPDr213mY, TB_ALIGN_32 },
+
+ { X86::VFMSUBADDPSr231r, X86::VFMSUBADDPSr231m, TB_ALIGN_16 },
+ { X86::VFMSUBADDPDr231r, X86::VFMSUBADDPDr231m, TB_ALIGN_16 },
+ { X86::VFMSUBADDPSr132r, X86::VFMSUBADDPSr132m, TB_ALIGN_16 },
+ { X86::VFMSUBADDPDr132r, X86::VFMSUBADDPDr132m, TB_ALIGN_16 },
+ { X86::VFMSUBADDPSr213r, X86::VFMSUBADDPSr213m, TB_ALIGN_16 },
+ { X86::VFMSUBADDPDr213r, X86::VFMSUBADDPDr213m, TB_ALIGN_16 },
+ { X86::VFMSUBADDPSr231rY, X86::VFMSUBADDPSr231mY, TB_ALIGN_32 },
+ { X86::VFMSUBADDPDr231rY, X86::VFMSUBADDPDr231mY, TB_ALIGN_32 },
+ { X86::VFMSUBADDPSr132rY, X86::VFMSUBADDPSr132mY, TB_ALIGN_32 },
+ { X86::VFMSUBADDPDr132rY, X86::VFMSUBADDPDr132mY, TB_ALIGN_32 },
+ { X86::VFMSUBADDPSr213rY, X86::VFMSUBADDPSr213mY, TB_ALIGN_32 },
+ { X86::VFMSUBADDPDr213rY, X86::VFMSUBADDPDr213mY, TB_ALIGN_32 },
+
+ // FMA4 foldable patterns
+ { X86::VFMADDSS4rr, X86::VFMADDSS4rm, 0 },
+ { X86::VFMADDSD4rr, X86::VFMADDSD4rm, 0 },
+ { X86::VFMADDPS4rr, X86::VFMADDPS4rm, TB_ALIGN_16 },
+ { X86::VFMADDPD4rr, X86::VFMADDPD4rm, TB_ALIGN_16 },
+ { X86::VFMADDPS4rrY, X86::VFMADDPS4rmY, TB_ALIGN_32 },
+ { X86::VFMADDPD4rrY, X86::VFMADDPD4rmY, TB_ALIGN_32 },
+ { X86::VFNMADDSS4rr, X86::VFNMADDSS4rm, 0 },
+ { X86::VFNMADDSD4rr, X86::VFNMADDSD4rm, 0 },
+ { X86::VFNMADDPS4rr, X86::VFNMADDPS4rm, TB_ALIGN_16 },
+ { X86::VFNMADDPD4rr, X86::VFNMADDPD4rm, TB_ALIGN_16 },
+ { X86::VFNMADDPS4rrY, X86::VFNMADDPS4rmY, TB_ALIGN_32 },
+ { X86::VFNMADDPD4rrY, X86::VFNMADDPD4rmY, TB_ALIGN_32 },
+ { X86::VFMSUBSS4rr, X86::VFMSUBSS4rm, 0 },
+ { X86::VFMSUBSD4rr, X86::VFMSUBSD4rm, 0 },
+ { X86::VFMSUBPS4rr, X86::VFMSUBPS4rm, TB_ALIGN_16 },
+ { X86::VFMSUBPD4rr, X86::VFMSUBPD4rm, TB_ALIGN_16 },
+ { X86::VFMSUBPS4rrY, X86::VFMSUBPS4rmY, TB_ALIGN_32 },
+ { X86::VFMSUBPD4rrY, X86::VFMSUBPD4rmY, TB_ALIGN_32 },
+ { X86::VFNMSUBSS4rr, X86::VFNMSUBSS4rm, 0 },
+ { X86::VFNMSUBSD4rr, X86::VFNMSUBSD4rm, 0 },
+ { X86::VFNMSUBPS4rr, X86::VFNMSUBPS4rm, TB_ALIGN_16 },
+ { X86::VFNMSUBPD4rr, X86::VFNMSUBPD4rm, TB_ALIGN_16 },
+ { X86::VFNMSUBPS4rrY, X86::VFNMSUBPS4rmY, TB_ALIGN_32 },
+ { X86::VFNMSUBPD4rrY, X86::VFNMSUBPD4rmY, TB_ALIGN_32 },
+ { X86::VFMADDSUBPS4rr, X86::VFMADDSUBPS4rm, TB_ALIGN_16 },
+ { X86::VFMADDSUBPD4rr, X86::VFMADDSUBPD4rm, TB_ALIGN_16 },
+ { X86::VFMADDSUBPS4rrY, X86::VFMADDSUBPS4rmY, TB_ALIGN_32 },
+ { X86::VFMADDSUBPD4rrY, X86::VFMADDSUBPD4rmY, TB_ALIGN_32 },
+ { X86::VFMSUBADDPS4rr, X86::VFMSUBADDPS4rm, TB_ALIGN_16 },
+ { X86::VFMSUBADDPD4rr, X86::VFMSUBADDPD4rm, TB_ALIGN_16 },
+ { X86::VFMSUBADDPS4rrY, X86::VFMSUBADDPS4rmY, TB_ALIGN_32 },
+ { X86::VFMSUBADDPD4rrY, X86::VFMSUBADDPD4rmY, TB_ALIGN_32 },
+ };
+
+ for (unsigned i = 0, e = array_lengthof(OpTbl3); i != e; ++i) {
+ unsigned RegOp = OpTbl3[i].RegOp;
+ unsigned MemOp = OpTbl3[i].MemOp;
+ unsigned Flags = OpTbl3[i].Flags;
+ AddTableEntry(RegOp2MemOpTable3, MemOp2RegOpTable,
+ RegOp, MemOp,
+ // Index 3, folded load
+ Flags | TB_INDEX_3 | TB_FOLDED_LOAD);
+ }
+
+}
+
+void
+X86InstrInfo::AddTableEntry(RegOp2MemOpTableType &R2MTable,
+ MemOp2RegOpTableType &M2RTable,
+ unsigned RegOp, unsigned MemOp, unsigned Flags) {
+ if ((Flags & TB_NO_FORWARD) == 0) {
+ assert(!R2MTable.count(RegOp) && "Duplicate entry!");
+ R2MTable[RegOp] = std::make_pair(MemOp, Flags);
+ }
+ if ((Flags & TB_NO_REVERSE) == 0) {
+ assert(!M2RTable.count(MemOp) &&
+ "Duplicated entries in unfolding maps?");
+ M2RTable[MemOp] = std::make_pair(RegOp, Flags);
+ }
+}
+
+bool
+X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
+ unsigned &SrcReg, unsigned &DstReg,
+ unsigned &SubIdx) const {
+ switch (MI.getOpcode()) {
+ default: break;
+ case X86::MOVSX16rr8:
+ case X86::MOVZX16rr8:
+ case X86::MOVSX32rr8:
+ case X86::MOVZX32rr8:
+ case X86::MOVSX64rr8:
+ case X86::MOVZX64rr8:
+ if (!TM.getSubtarget<X86Subtarget>().is64Bit())
+ // It's not always legal to reference the low 8-bit of the larger
+ // register in 32-bit mode.
+ return false;
+ case X86::MOVSX32rr16:
+ case X86::MOVZX32rr16:
+ case X86::MOVSX64rr16:
+ case X86::MOVZX64rr16:
+ case X86::MOVSX64rr32:
+ case X86::MOVZX64rr32: {
+ if (MI.getOperand(0).getSubReg() || MI.getOperand(1).getSubReg())
+ // Be conservative.
+ return false;
+ SrcReg = MI.getOperand(1).getReg();
+ DstReg = MI.getOperand(0).getReg();
+ switch (MI.getOpcode()) {
+ default: llvm_unreachable("Unreachable!");
+ case X86::MOVSX16rr8:
+ case X86::MOVZX16rr8:
+ case X86::MOVSX32rr8:
+ case X86::MOVZX32rr8:
+ case X86::MOVSX64rr8:
+ case X86::MOVZX64rr8:
+ SubIdx = X86::sub_8bit;
+ break;
+ case X86::MOVSX32rr16:
+ case X86::MOVZX32rr16:
+ case X86::MOVSX64rr16:
+ case X86::MOVZX64rr16:
+ SubIdx = X86::sub_16bit;
+ break;
+ case X86::MOVSX64rr32:
+ case X86::MOVZX64rr32:
+ SubIdx = X86::sub_32bit;
+ break;
+ }
+ return true;
+ }
+ }
+ return false;
+}
+
+/// isFrameOperand - Return true and the FrameIndex if the specified
+/// operand and follow operands form a reference to the stack frame.
+bool X86InstrInfo::isFrameOperand(const MachineInstr *MI, unsigned int Op,
+ int &FrameIndex) const {
+ if (MI->getOperand(Op).isFI() && MI->getOperand(Op+1).isImm() &&
+ MI->getOperand(Op+2).isReg() && MI->getOperand(Op+3).isImm() &&
+ MI->getOperand(Op+1).getImm() == 1 &&
+ MI->getOperand(Op+2).getReg() == 0 &&
+ MI->getOperand(Op+3).getImm() == 0) {
+ FrameIndex = MI->getOperand(Op).getIndex();
+ return true;
+ }
+ return false;
+}
+
+static bool isFrameLoadOpcode(int Opcode) {
+ switch (Opcode) {
+ default:
+ return false;
+ case X86::MOV8rm:
+ case X86::MOV16rm:
+ case X86::MOV32rm:
+ case X86::MOV64rm:
+ case X86::LD_Fp64m:
+ case X86::MOVSSrm:
+ case X86::MOVSDrm:
+ case X86::MOVAPSrm:
+ case X86::MOVAPDrm:
+ case X86::MOVDQArm:
+ case X86::VMOVSSrm:
+ case X86::VMOVSDrm:
+ case X86::VMOVAPSrm:
+ case X86::VMOVAPDrm:
+ case X86::VMOVDQArm:
+ case X86::VMOVAPSYrm:
+ case X86::VMOVAPDYrm:
+ case X86::VMOVDQAYrm:
+ case X86::MMX_MOVD64rm:
+ case X86::MMX_MOVQ64rm:
+ return true;
+ }
+}
+
+static bool isFrameStoreOpcode(int Opcode) {
+ switch (Opcode) {
+ default: break;
+ case X86::MOV8mr:
+ case X86::MOV16mr:
+ case X86::MOV32mr:
+ case X86::MOV64mr:
+ case X86::ST_FpP64m:
+ case X86::MOVSSmr:
+ case X86::MOVSDmr:
+ case X86::MOVAPSmr:
+ case X86::MOVAPDmr:
+ case X86::MOVDQAmr:
+ case X86::VMOVSSmr:
+ case X86::VMOVSDmr:
+ case X86::VMOVAPSmr:
+ case X86::VMOVAPDmr:
+ case X86::VMOVDQAmr:
+ case X86::VMOVAPSYmr:
+ case X86::VMOVAPDYmr:
+ case X86::VMOVDQAYmr:
+ case X86::MMX_MOVD64mr:
+ case X86::MMX_MOVQ64mr:
+ case X86::MMX_MOVNTQmr:
+ return true;
+ }
+ return false;
+}
+
+unsigned X86InstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
+ int &FrameIndex) const {
+ if (isFrameLoadOpcode(MI->getOpcode()))
+ if (MI->getOperand(0).getSubReg() == 0 && isFrameOperand(MI, 1, FrameIndex))
+ return MI->getOperand(0).getReg();
+ return 0;
+}
+
+unsigned X86InstrInfo::isLoadFromStackSlotPostFE(const MachineInstr *MI,
+ int &FrameIndex) const {
+ if (isFrameLoadOpcode(MI->getOpcode())) {
+ unsigned Reg;
+ if ((Reg = isLoadFromStackSlot(MI, FrameIndex)))
+ return Reg;
+ // Check for post-frame index elimination operations
+ const MachineMemOperand *Dummy;
+ return hasLoadFromStackSlot(MI, Dummy, FrameIndex);
+ }
+ return 0;
+}
+
+unsigned X86InstrInfo::isStoreToStackSlot(const MachineInstr *MI,
+ int &FrameIndex) const {
+ if (isFrameStoreOpcode(MI->getOpcode()))
+ if (MI->getOperand(X86::AddrNumOperands).getSubReg() == 0 &&
+ isFrameOperand(MI, 0, FrameIndex))
+ return MI->getOperand(X86::AddrNumOperands).getReg();
+ return 0;
+}
+
+unsigned X86InstrInfo::isStoreToStackSlotPostFE(const MachineInstr *MI,
+ int &FrameIndex) const {
+ if (isFrameStoreOpcode(MI->getOpcode())) {
+ unsigned Reg;
+ if ((Reg = isStoreToStackSlot(MI, FrameIndex)))
+ return Reg;
+ // Check for post-frame index elimination operations
+ const MachineMemOperand *Dummy;
+ return hasStoreToStackSlot(MI, Dummy, FrameIndex);
+ }
+ return 0;
+}
+
+/// regIsPICBase - Return true if register is PIC base (i.e.g defined by
+/// X86::MOVPC32r.
+static bool regIsPICBase(unsigned BaseReg, const MachineRegisterInfo &MRI) {
+ // Don't waste compile time scanning use-def chains of physregs.
+ if (!TargetRegisterInfo::isVirtualRegister(BaseReg))
+ return false;
+ bool isPICBase = false;
+ for (MachineRegisterInfo::def_iterator I = MRI.def_begin(BaseReg),
+ E = MRI.def_end(); I != E; ++I) {
+ MachineInstr *DefMI = I.getOperand().getParent();
+ if (DefMI->getOpcode() != X86::MOVPC32r)
+ return false;
+ assert(!isPICBase && "More than one PIC base?");
+ isPICBase = true;
+ }
+ return isPICBase;
+}
+
+bool
+X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI,
+ AliasAnalysis *AA) const {
+ switch (MI->getOpcode()) {
+ default: break;
+ case X86::MOV8rm:
+ case X86::MOV16rm:
+ case X86::MOV32rm:
+ case X86::MOV64rm:
+ case X86::LD_Fp64m:
+ case X86::MOVSSrm:
+ case X86::MOVSDrm:
+ case X86::MOVAPSrm:
+ case X86::MOVUPSrm:
+ case X86::MOVAPDrm:
+ case X86::MOVDQArm:
+ case X86::MOVDQUrm:
+ case X86::VMOVSSrm:
+ case X86::VMOVSDrm:
+ case X86::VMOVAPSrm:
+ case X86::VMOVUPSrm:
+ case X86::VMOVAPDrm:
+ case X86::VMOVDQArm:
+ case X86::VMOVDQUrm:
+ case X86::VMOVAPSYrm:
+ case X86::VMOVUPSYrm:
+ case X86::VMOVAPDYrm:
+ case X86::VMOVDQAYrm:
+ case X86::VMOVDQUYrm:
+ case X86::MMX_MOVD64rm:
+ case X86::MMX_MOVQ64rm:
+ case X86::FsVMOVAPSrm:
+ case X86::FsVMOVAPDrm:
+ case X86::FsMOVAPSrm:
+ case X86::FsMOVAPDrm: {
+ // Loads from constant pools are trivially rematerializable.
+ if (MI->getOperand(1).isReg() &&
+ MI->getOperand(2).isImm() &&
+ MI->getOperand(3).isReg() && MI->getOperand(3).getReg() == 0 &&
+ MI->isInvariantLoad(AA)) {
+ unsigned BaseReg = MI->getOperand(1).getReg();
+ if (BaseReg == 0 || BaseReg == X86::RIP)
+ return true;
+ // Allow re-materialization of PIC load.
+ if (!ReMatPICStubLoad && MI->getOperand(4).isGlobal())
+ return false;
+ const MachineFunction &MF = *MI->getParent()->getParent();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ return regIsPICBase(BaseReg, MRI);
+ }
+ return false;
+ }
+
+ case X86::LEA32r:
+ case X86::LEA64r: {
+ if (MI->getOperand(2).isImm() &&
+ MI->getOperand(3).isReg() && MI->getOperand(3).getReg() == 0 &&
+ !MI->getOperand(4).isReg()) {
+ // lea fi#, lea GV, etc. are all rematerializable.
+ if (!MI->getOperand(1).isReg())
+ return true;
+ unsigned BaseReg = MI->getOperand(1).getReg();
+ if (BaseReg == 0)
+ return true;
+ // Allow re-materialization of lea PICBase + x.
+ const MachineFunction &MF = *MI->getParent()->getParent();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ return regIsPICBase(BaseReg, MRI);
+ }
+ return false;
+ }
+ }
+
+ // All other instructions marked M_REMATERIALIZABLE are always trivially
+ // rematerializable.
+ return true;
+}
+
+/// isSafeToClobberEFLAGS - Return true if it's safe insert an instruction that
+/// would clobber the EFLAGS condition register. Note the result may be
+/// conservative. If it cannot definitely determine the safety after visiting
+/// a few instructions in each direction it assumes it's not safe.
+static bool isSafeToClobberEFLAGS(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) {
+ MachineBasicBlock::iterator E = MBB.end();
+
+ // For compile time consideration, if we are not able to determine the
+ // safety after visiting 4 instructions in each direction, we will assume
+ // it's not safe.
+ MachineBasicBlock::iterator Iter = I;
+ for (unsigned i = 0; Iter != E && i < 4; ++i) {
+ bool SeenDef = false;
+ for (unsigned j = 0, e = Iter->getNumOperands(); j != e; ++j) {
+ MachineOperand &MO = Iter->getOperand(j);
+ if (MO.isRegMask() && MO.clobbersPhysReg(X86::EFLAGS))
+ SeenDef = true;
+ if (!MO.isReg())
+ continue;
+ if (MO.getReg() == X86::EFLAGS) {
+ if (MO.isUse())
+ return false;
+ SeenDef = true;
+ }
+ }
+
+ if (SeenDef)
+ // This instruction defines EFLAGS, no need to look any further.
+ return true;
+ ++Iter;
+ // Skip over DBG_VALUE.
+ while (Iter != E && Iter->isDebugValue())
+ ++Iter;
+ }
+
+ // It is safe to clobber EFLAGS at the end of a block of no successor has it
+ // live in.
+ if (Iter == E) {
+ for (MachineBasicBlock::succ_iterator SI = MBB.succ_begin(),
+ SE = MBB.succ_end(); SI != SE; ++SI)
+ if ((*SI)->isLiveIn(X86::EFLAGS))
+ return false;
+ return true;
+ }
+
+ MachineBasicBlock::iterator B = MBB.begin();
+ Iter = I;
+ for (unsigned i = 0; i < 4; ++i) {
+ // If we make it to the beginning of the block, it's safe to clobber
+ // EFLAGS iff EFLAGS is not live-in.
+ if (Iter == B)
+ return !MBB.isLiveIn(X86::EFLAGS);
+
+ --Iter;
+ // Skip over DBG_VALUE.
+ while (Iter != B && Iter->isDebugValue())
+ --Iter;
+
+ bool SawKill = false;
+ for (unsigned j = 0, e = Iter->getNumOperands(); j != e; ++j) {
+ MachineOperand &MO = Iter->getOperand(j);
+ // A register mask may clobber EFLAGS, but we should still look for a
+ // live EFLAGS def.
+ if (MO.isRegMask() && MO.clobbersPhysReg(X86::EFLAGS))
+ SawKill = true;
+ if (MO.isReg() && MO.getReg() == X86::EFLAGS) {
+ if (MO.isDef()) return MO.isDead();
+ if (MO.isKill()) SawKill = true;
+ }
+ }
+
+ if (SawKill)
+ // This instruction kills EFLAGS and doesn't redefine it, so
+ // there's no need to look further.
+ return true;
+ }
+
+ // Conservative answer.
+ return false;
+}
+
+void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ unsigned DestReg, unsigned SubIdx,
+ const MachineInstr *Orig,
+ const TargetRegisterInfo &TRI) const {
+ DebugLoc DL = Orig->getDebugLoc();
+
+ // MOV32r0 etc. are implemented with xor which clobbers condition code.
+ // Re-materialize them as movri instructions to avoid side effects.
+ bool Clone = true;
+ unsigned Opc = Orig->getOpcode();
+ switch (Opc) {
+ default: break;
+ case X86::MOV8r0:
+ case X86::MOV16r0:
+ case X86::MOV32r0:
+ case X86::MOV64r0: {
+ if (!isSafeToClobberEFLAGS(MBB, I)) {
+ switch (Opc) {
+ default: llvm_unreachable("Unreachable!");
+ case X86::MOV8r0: Opc = X86::MOV8ri; break;
+ case X86::MOV16r0: Opc = X86::MOV16ri; break;
+ case X86::MOV32r0: Opc = X86::MOV32ri; break;
+ case X86::MOV64r0: Opc = X86::MOV64ri64i32; break;
+ }
+ Clone = false;
+ }
+ break;
+ }
+ }
+
+ if (Clone) {
+ MachineInstr *MI = MBB.getParent()->CloneMachineInstr(Orig);
+ MBB.insert(I, MI);
+ } else {
+ BuildMI(MBB, I, DL, get(Opc)).addOperand(Orig->getOperand(0)).addImm(0);
+ }
+
+ MachineInstr *NewMI = prior(I);
+ NewMI->substituteRegister(Orig->getOperand(0).getReg(), DestReg, SubIdx, TRI);
+}
+
+/// hasLiveCondCodeDef - True if MI has a condition code def, e.g. EFLAGS, that
+/// is not marked dead.
+static bool hasLiveCondCodeDef(MachineInstr *MI) {
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI->getOperand(i);
+ if (MO.isReg() && MO.isDef() &&
+ MO.getReg() == X86::EFLAGS && !MO.isDead()) {
+ return true;
+ }
+ }
+ return false;
+}
+
+/// convertToThreeAddressWithLEA - Helper for convertToThreeAddress when
+/// 16-bit LEA is disabled, use 32-bit LEA to form 3-address code by promoting
+/// to a 32-bit superregister and then truncating back down to a 16-bit
+/// subregister.
+MachineInstr *
+X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc,
+ MachineFunction::iterator &MFI,
+ MachineBasicBlock::iterator &MBBI,
+ LiveVariables *LV) const {
+ MachineInstr *MI = MBBI;
+ unsigned Dest = MI->getOperand(0).getReg();
+ unsigned Src = MI->getOperand(1).getReg();
+ bool isDead = MI->getOperand(0).isDead();
+ bool isKill = MI->getOperand(1).isKill();
+
+ unsigned Opc = TM.getSubtarget<X86Subtarget>().is64Bit()
+ ? X86::LEA64_32r : X86::LEA32r;
+ MachineRegisterInfo &RegInfo = MFI->getParent()->getRegInfo();
+ unsigned leaInReg = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
+ unsigned leaOutReg = RegInfo.createVirtualRegister(&X86::GR32RegClass);
+
+ // Build and insert into an implicit UNDEF value. This is OK because
+ // well be shifting and then extracting the lower 16-bits.
+ // This has the potential to cause partial register stall. e.g.
+ // movw (%rbp,%rcx,2), %dx
+ // leal -65(%rdx), %esi
+ // But testing has shown this *does* help performance in 64-bit mode (at
+ // least on modern x86 machines).
+ BuildMI(*MFI, MBBI, MI->getDebugLoc(), get(X86::IMPLICIT_DEF), leaInReg);
+ MachineInstr *InsMI =
+ BuildMI(*MFI, MBBI, MI->getDebugLoc(), get(TargetOpcode::COPY))
+ .addReg(leaInReg, RegState::Define, X86::sub_16bit)
+ .addReg(Src, getKillRegState(isKill));
+
+ MachineInstrBuilder MIB = BuildMI(*MFI, MBBI, MI->getDebugLoc(),
+ get(Opc), leaOutReg);
+ switch (MIOpc) {
+ default: llvm_unreachable("Unreachable!");
+ case X86::SHL16ri: {
+ unsigned ShAmt = MI->getOperand(2).getImm();
+ MIB.addReg(0).addImm(1 << ShAmt)
+ .addReg(leaInReg, RegState::Kill).addImm(0).addReg(0);
+ break;
+ }
+ case X86::INC16r:
+ case X86::INC64_16r:
+ addRegOffset(MIB, leaInReg, true, 1);
+ break;
+ case X86::DEC16r:
+ case X86::DEC64_16r:
+ addRegOffset(MIB, leaInReg, true, -1);
+ break;
+ case X86::ADD16ri:
+ case X86::ADD16ri8:
+ case X86::ADD16ri_DB:
+ case X86::ADD16ri8_DB:
+ addRegOffset(MIB, leaInReg, true, MI->getOperand(2).getImm());
+ break;
+ case X86::ADD16rr:
+ case X86::ADD16rr_DB: {
+ unsigned Src2 = MI->getOperand(2).getReg();
+ bool isKill2 = MI->getOperand(2).isKill();
+ unsigned leaInReg2 = 0;
+ MachineInstr *InsMI2 = 0;
+ if (Src == Src2) {
+ // ADD16rr %reg1028<kill>, %reg1028
+ // just a single insert_subreg.
+ addRegReg(MIB, leaInReg, true, leaInReg, false);
+ } else {
+ leaInReg2 = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
+ // Build and insert into an implicit UNDEF value. This is OK because
+ // well be shifting and then extracting the lower 16-bits.
+ BuildMI(*MFI, &*MIB, MI->getDebugLoc(), get(X86::IMPLICIT_DEF),leaInReg2);
+ InsMI2 =
+ BuildMI(*MFI, &*MIB, MI->getDebugLoc(), get(TargetOpcode::COPY))
+ .addReg(leaInReg2, RegState::Define, X86::sub_16bit)
+ .addReg(Src2, getKillRegState(isKill2));
+ addRegReg(MIB, leaInReg, true, leaInReg2, true);
+ }
+ if (LV && isKill2 && InsMI2)
+ LV->replaceKillInstruction(Src2, MI, InsMI2);
+ break;
+ }
+ }
+
+ MachineInstr *NewMI = MIB;
+ MachineInstr *ExtMI =
+ BuildMI(*MFI, MBBI, MI->getDebugLoc(), get(TargetOpcode::COPY))
+ .addReg(Dest, RegState::Define | getDeadRegState(isDead))
+ .addReg(leaOutReg, RegState::Kill, X86::sub_16bit);
+
+ if (LV) {
+ // Update live variables
+ LV->getVarInfo(leaInReg).Kills.push_back(NewMI);
+ LV->getVarInfo(leaOutReg).Kills.push_back(ExtMI);
+ if (isKill)
+ LV->replaceKillInstruction(Src, MI, InsMI);
+ if (isDead)
+ LV->replaceKillInstruction(Dest, MI, ExtMI);
+ }
+
+ return ExtMI;
+}
+
+/// convertToThreeAddress - This method must be implemented by targets that
+/// set the M_CONVERTIBLE_TO_3_ADDR flag. When this flag is set, the target
+/// may be able to convert a two-address instruction into a true
+/// three-address instruction on demand. This allows the X86 target (for
+/// example) to convert ADD and SHL instructions into LEA instructions if they
+/// would require register copies due to two-addressness.
+///
+/// This method returns a null pointer if the transformation cannot be
+/// performed, otherwise it returns the new instruction.
+///
+MachineInstr *
+X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
+ MachineBasicBlock::iterator &MBBI,
+ LiveVariables *LV) const {
+ MachineInstr *MI = MBBI;
+ MachineFunction &MF = *MI->getParent()->getParent();
+ // All instructions input are two-addr instructions. Get the known operands.
+ const MachineOperand &Dest = MI->getOperand(0);
+ const MachineOperand &Src = MI->getOperand(1);
+
+ MachineInstr *NewMI = NULL;
+ // FIXME: 16-bit LEA's are really slow on Athlons, but not bad on P4's. When
+ // we have better subtarget support, enable the 16-bit LEA generation here.
+ // 16-bit LEA is also slow on Core2.
+ bool DisableLEA16 = true;
+ bool is64Bit = TM.getSubtarget<X86Subtarget>().is64Bit();
+
+ unsigned MIOpc = MI->getOpcode();
+ switch (MIOpc) {
+ case X86::SHUFPSrri: {
+ assert(MI->getNumOperands() == 4 && "Unknown shufps instruction!");
+ if (!TM.getSubtarget<X86Subtarget>().hasSSE2()) return 0;
+
+ unsigned B = MI->getOperand(1).getReg();
+ unsigned C = MI->getOperand(2).getReg();
+ if (B != C) return 0;
+ unsigned M = MI->getOperand(3).getImm();
+ NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::PSHUFDri))
+ .addOperand(Dest).addOperand(Src).addImm(M);
+ break;
+ }
+ case X86::SHUFPDrri: {
+ assert(MI->getNumOperands() == 4 && "Unknown shufpd instruction!");
+ if (!TM.getSubtarget<X86Subtarget>().hasSSE2()) return 0;
+
+ unsigned B = MI->getOperand(1).getReg();
+ unsigned C = MI->getOperand(2).getReg();
+ if (B != C) return 0;
+ unsigned M = MI->getOperand(3).getImm();
+
+ // Convert to PSHUFD mask.
+ M = ((M & 1) << 1) | ((M & 1) << 3) | ((M & 2) << 4) | ((M & 2) << 6)| 0x44;
+
+ NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::PSHUFDri))
+ .addOperand(Dest).addOperand(Src).addImm(M);
+ break;
+ }
+ case X86::SHL64ri: {
+ assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!");
+ // NOTE: LEA doesn't produce flags like shift does, but LLVM never uses
+ // the flags produced by a shift yet, so this is safe.
+ unsigned ShAmt = MI->getOperand(2).getImm();
+ if (ShAmt == 0 || ShAmt >= 4) return 0;
+
+ // LEA can't handle RSP.
+ if (TargetRegisterInfo::isVirtualRegister(Src.getReg()) &&
+ !MF.getRegInfo().constrainRegClass(Src.getReg(),
+ &X86::GR64_NOSPRegClass))
+ return 0;
+
+ NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::LEA64r))
+ .addOperand(Dest)
+ .addReg(0).addImm(1 << ShAmt).addOperand(Src).addImm(0).addReg(0);
+ break;
+ }
+ case X86::SHL32ri: {
+ assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!");
+ // NOTE: LEA doesn't produce flags like shift does, but LLVM never uses
+ // the flags produced by a shift yet, so this is safe.
+ unsigned ShAmt = MI->getOperand(2).getImm();
+ if (ShAmt == 0 || ShAmt >= 4) return 0;
+
+ // LEA can't handle ESP.
+ if (TargetRegisterInfo::isVirtualRegister(Src.getReg()) &&
+ !MF.getRegInfo().constrainRegClass(Src.getReg(),
+ &X86::GR32_NOSPRegClass))
+ return 0;
+
+ unsigned Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r;
+ NewMI = BuildMI(MF, MI->getDebugLoc(), get(Opc))
+ .addOperand(Dest)
+ .addReg(0).addImm(1 << ShAmt).addOperand(Src).addImm(0).addReg(0);
+ break;
+ }
+ case X86::SHL16ri: {
+ assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!");
+ // NOTE: LEA doesn't produce flags like shift does, but LLVM never uses
+ // the flags produced by a shift yet, so this is safe.
+ unsigned ShAmt = MI->getOperand(2).getImm();
+ if (ShAmt == 0 || ShAmt >= 4) return 0;
+
+ if (DisableLEA16)
+ return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0;
+ NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
+ .addOperand(Dest)
+ .addReg(0).addImm(1 << ShAmt).addOperand(Src).addImm(0).addReg(0);
+ break;
+ }
+ default: {
+ // The following opcodes also sets the condition code register(s). Only
+ // convert them to equivalent lea if the condition code register def's
+ // are dead!
+ if (hasLiveCondCodeDef(MI))
+ return 0;
+
+ switch (MIOpc) {
+ default: return 0;
+ case X86::INC64r:
+ case X86::INC32r:
+ case X86::INC64_32r: {
+ assert(MI->getNumOperands() >= 2 && "Unknown inc instruction!");
+ unsigned Opc = MIOpc == X86::INC64r ? X86::LEA64r
+ : (is64Bit ? X86::LEA64_32r : X86::LEA32r);
+ const TargetRegisterClass *RC = MIOpc == X86::INC64r ?
+ (const TargetRegisterClass*)&X86::GR64_NOSPRegClass :
+ (const TargetRegisterClass*)&X86::GR32_NOSPRegClass;
+
+ // LEA can't handle RSP.
+ if (TargetRegisterInfo::isVirtualRegister(Src.getReg()) &&
+ !MF.getRegInfo().constrainRegClass(Src.getReg(), RC))
+ return 0;
+
+ NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc))
+ .addOperand(Dest).addOperand(Src), 1);
+ break;
+ }
+ case X86::INC16r:
+ case X86::INC64_16r:
+ if (DisableLEA16)
+ return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0;
+ assert(MI->getNumOperands() >= 2 && "Unknown inc instruction!");
+ NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
+ .addOperand(Dest).addOperand(Src), 1);
+ break;
+ case X86::DEC64r:
+ case X86::DEC32r:
+ case X86::DEC64_32r: {
+ assert(MI->getNumOperands() >= 2 && "Unknown dec instruction!");
+ unsigned Opc = MIOpc == X86::DEC64r ? X86::LEA64r
+ : (is64Bit ? X86::LEA64_32r : X86::LEA32r);
+ const TargetRegisterClass *RC = MIOpc == X86::DEC64r ?
+ (const TargetRegisterClass*)&X86::GR64_NOSPRegClass :
+ (const TargetRegisterClass*)&X86::GR32_NOSPRegClass;
+ // LEA can't handle RSP.
+ if (TargetRegisterInfo::isVirtualRegister(Src.getReg()) &&
+ !MF.getRegInfo().constrainRegClass(Src.getReg(), RC))
+ return 0;
+
+ NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc))
+ .addOperand(Dest).addOperand(Src), -1);
+ break;
+ }
+ case X86::DEC16r:
+ case X86::DEC64_16r:
+ if (DisableLEA16)
+ return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0;
+ assert(MI->getNumOperands() >= 2 && "Unknown dec instruction!");
+ NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
+ .addOperand(Dest).addOperand(Src), -1);
+ break;
+ case X86::ADD64rr:
+ case X86::ADD64rr_DB:
+ case X86::ADD32rr:
+ case X86::ADD32rr_DB: {
+ assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
+ unsigned Opc;
+ const TargetRegisterClass *RC;
+ if (MIOpc == X86::ADD64rr || MIOpc == X86::ADD64rr_DB) {
+ Opc = X86::LEA64r;
+ RC = &X86::GR64_NOSPRegClass;
+ } else {
+ Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r;
+ RC = &X86::GR32_NOSPRegClass;
+ }
+
+
+ unsigned Src2 = MI->getOperand(2).getReg();
+ bool isKill2 = MI->getOperand(2).isKill();
+
+ // LEA can't handle RSP.
+ if (TargetRegisterInfo::isVirtualRegister(Src2) &&
+ !MF.getRegInfo().constrainRegClass(Src2, RC))
+ return 0;
+
+ NewMI = addRegReg(BuildMI(MF, MI->getDebugLoc(), get(Opc))
+ .addOperand(Dest),
+ Src.getReg(), Src.isKill(), Src2, isKill2);
+
+ // Preserve undefness of the operands.
+ bool isUndef = MI->getOperand(1).isUndef();
+ bool isUndef2 = MI->getOperand(2).isUndef();
+ NewMI->getOperand(1).setIsUndef(isUndef);
+ NewMI->getOperand(3).setIsUndef(isUndef2);
+
+ if (LV && isKill2)
+ LV->replaceKillInstruction(Src2, MI, NewMI);
+ break;
+ }
+ case X86::ADD16rr:
+ case X86::ADD16rr_DB: {
+ if (DisableLEA16)
+ return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0;
+ assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
+ unsigned Src2 = MI->getOperand(2).getReg();
+ bool isKill2 = MI->getOperand(2).isKill();
+ NewMI = addRegReg(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
+ .addOperand(Dest),
+ Src.getReg(), Src.isKill(), Src2, isKill2);
+
+ // Preserve undefness of the operands.
+ bool isUndef = MI->getOperand(1).isUndef();
+ bool isUndef2 = MI->getOperand(2).isUndef();
+ NewMI->getOperand(1).setIsUndef(isUndef);
+ NewMI->getOperand(3).setIsUndef(isUndef2);
+
+ if (LV && isKill2)
+ LV->replaceKillInstruction(Src2, MI, NewMI);
+ break;
+ }
+ case X86::ADD64ri32:
+ case X86::ADD64ri8:
+ case X86::ADD64ri32_DB:
+ case X86::ADD64ri8_DB:
+ assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
+ NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA64r))
+ .addOperand(Dest).addOperand(Src),
+ MI->getOperand(2).getImm());
+ break;
+ case X86::ADD32ri:
+ case X86::ADD32ri8:
+ case X86::ADD32ri_DB:
+ case X86::ADD32ri8_DB: {
+ assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
+ unsigned Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r;
+ NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc))
+ .addOperand(Dest).addOperand(Src),
+ MI->getOperand(2).getImm());
+ break;
+ }
+ case X86::ADD16ri:
+ case X86::ADD16ri8:
+ case X86::ADD16ri_DB:
+ case X86::ADD16ri8_DB:
+ if (DisableLEA16)
+ return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0;
+ assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
+ NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
+ .addOperand(Dest).addOperand(Src),
+ MI->getOperand(2).getImm());
+ break;
+ }
+ }
+ }
+
+ if (!NewMI) return 0;
+
+ if (LV) { // Update live variables
+ if (Src.isKill())
+ LV->replaceKillInstruction(Src.getReg(), MI, NewMI);
+ if (Dest.isDead())
+ LV->replaceKillInstruction(Dest.getReg(), MI, NewMI);
+ }
+
+ MFI->insert(MBBI, NewMI); // Insert the new inst
+ return NewMI;
+}
+
+/// commuteInstruction - We have a few instructions that must be hacked on to
+/// commute them.
+///
+MachineInstr *
+X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const {
+ switch (MI->getOpcode()) {
+ case X86::SHRD16rri8: // A = SHRD16rri8 B, C, I -> A = SHLD16rri8 C, B, (16-I)
+ case X86::SHLD16rri8: // A = SHLD16rri8 B, C, I -> A = SHRD16rri8 C, B, (16-I)
+ case X86::SHRD32rri8: // A = SHRD32rri8 B, C, I -> A = SHLD32rri8 C, B, (32-I)
+ case X86::SHLD32rri8: // A = SHLD32rri8 B, C, I -> A = SHRD32rri8 C, B, (32-I)
+ case X86::SHRD64rri8: // A = SHRD64rri8 B, C, I -> A = SHLD64rri8 C, B, (64-I)
+ case X86::SHLD64rri8:{// A = SHLD64rri8 B, C, I -> A = SHRD64rri8 C, B, (64-I)
+ unsigned Opc;
+ unsigned Size;
+ switch (MI->getOpcode()) {
+ default: llvm_unreachable("Unreachable!");
+ case X86::SHRD16rri8: Size = 16; Opc = X86::SHLD16rri8; break;
+ case X86::SHLD16rri8: Size = 16; Opc = X86::SHRD16rri8; break;
+ case X86::SHRD32rri8: Size = 32; Opc = X86::SHLD32rri8; break;
+ case X86::SHLD32rri8: Size = 32; Opc = X86::SHRD32rri8; break;
+ case X86::SHRD64rri8: Size = 64; Opc = X86::SHLD64rri8; break;
+ case X86::SHLD64rri8: Size = 64; Opc = X86::SHRD64rri8; break;
+ }
+ unsigned Amt = MI->getOperand(3).getImm();
+ if (NewMI) {
+ MachineFunction &MF = *MI->getParent()->getParent();
+ MI = MF.CloneMachineInstr(MI);
+ NewMI = false;
+ }
+ MI->setDesc(get(Opc));
+ MI->getOperand(3).setImm(Size-Amt);
+ return TargetInstrInfo::commuteInstruction(MI, NewMI);
+ }
+ case X86::CMOVB16rr: case X86::CMOVB32rr: case X86::CMOVB64rr:
+ case X86::CMOVAE16rr: case X86::CMOVAE32rr: case X86::CMOVAE64rr:
+ case X86::CMOVE16rr: case X86::CMOVE32rr: case X86::CMOVE64rr:
+ case X86::CMOVNE16rr: case X86::CMOVNE32rr: case X86::CMOVNE64rr:
+ case X86::CMOVBE16rr: case X86::CMOVBE32rr: case X86::CMOVBE64rr:
+ case X86::CMOVA16rr: case X86::CMOVA32rr: case X86::CMOVA64rr:
+ case X86::CMOVL16rr: case X86::CMOVL32rr: case X86::CMOVL64rr:
+ case X86::CMOVGE16rr: case X86::CMOVGE32rr: case X86::CMOVGE64rr:
+ case X86::CMOVLE16rr: case X86::CMOVLE32rr: case X86::CMOVLE64rr:
+ case X86::CMOVG16rr: case X86::CMOVG32rr: case X86::CMOVG64rr:
+ case X86::CMOVS16rr: case X86::CMOVS32rr: case X86::CMOVS64rr:
+ case X86::CMOVNS16rr: case X86::CMOVNS32rr: case X86::CMOVNS64rr:
+ case X86::CMOVP16rr: case X86::CMOVP32rr: case X86::CMOVP64rr:
+ case X86::CMOVNP16rr: case X86::CMOVNP32rr: case X86::CMOVNP64rr:
+ case X86::CMOVO16rr: case X86::CMOVO32rr: case X86::CMOVO64rr:
+ case X86::CMOVNO16rr: case X86::CMOVNO32rr: case X86::CMOVNO64rr: {
+ unsigned Opc;
+ switch (MI->getOpcode()) {
+ default: llvm_unreachable("Unreachable!");
+ case X86::CMOVB16rr: Opc = X86::CMOVAE16rr; break;
+ case X86::CMOVB32rr: Opc = X86::CMOVAE32rr; break;
+ case X86::CMOVB64rr: Opc = X86::CMOVAE64rr; break;
+ case X86::CMOVAE16rr: Opc = X86::CMOVB16rr; break;
+ case X86::CMOVAE32rr: Opc = X86::CMOVB32rr; break;
+ case X86::CMOVAE64rr: Opc = X86::CMOVB64rr; break;
+ case X86::CMOVE16rr: Opc = X86::CMOVNE16rr; break;
+ case X86::CMOVE32rr: Opc = X86::CMOVNE32rr; break;
+ case X86::CMOVE64rr: Opc = X86::CMOVNE64rr; break;
+ case X86::CMOVNE16rr: Opc = X86::CMOVE16rr; break;
+ case X86::CMOVNE32rr: Opc = X86::CMOVE32rr; break;
+ case X86::CMOVNE64rr: Opc = X86::CMOVE64rr; break;
+ case X86::CMOVBE16rr: Opc = X86::CMOVA16rr; break;
+ case X86::CMOVBE32rr: Opc = X86::CMOVA32rr; break;
+ case X86::CMOVBE64rr: Opc = X86::CMOVA64rr; break;
+ case X86::CMOVA16rr: Opc = X86::CMOVBE16rr; break;
+ case X86::CMOVA32rr: Opc = X86::CMOVBE32rr; break;
+ case X86::CMOVA64rr: Opc = X86::CMOVBE64rr; break;
+ case X86::CMOVL16rr: Opc = X86::CMOVGE16rr; break;
+ case X86::CMOVL32rr: Opc = X86::CMOVGE32rr; break;
+ case X86::CMOVL64rr: Opc = X86::CMOVGE64rr; break;
+ case X86::CMOVGE16rr: Opc = X86::CMOVL16rr; break;
+ case X86::CMOVGE32rr: Opc = X86::CMOVL32rr; break;
+ case X86::CMOVGE64rr: Opc = X86::CMOVL64rr; break;
+ case X86::CMOVLE16rr: Opc = X86::CMOVG16rr; break;
+ case X86::CMOVLE32rr: Opc = X86::CMOVG32rr; break;
+ case X86::CMOVLE64rr: Opc = X86::CMOVG64rr; break;
+ case X86::CMOVG16rr: Opc = X86::CMOVLE16rr; break;
+ case X86::CMOVG32rr: Opc = X86::CMOVLE32rr; break;
+ case X86::CMOVG64rr: Opc = X86::CMOVLE64rr; break;
+ case X86::CMOVS16rr: Opc = X86::CMOVNS16rr; break;
+ case X86::CMOVS32rr: Opc = X86::CMOVNS32rr; break;
+ case X86::CMOVS64rr: Opc = X86::CMOVNS64rr; break;
+ case X86::CMOVNS16rr: Opc = X86::CMOVS16rr; break;
+ case X86::CMOVNS32rr: Opc = X86::CMOVS32rr; break;
+ case X86::CMOVNS64rr: Opc = X86::CMOVS64rr; break;
+ case X86::CMOVP16rr: Opc = X86::CMOVNP16rr; break;
+ case X86::CMOVP32rr: Opc = X86::CMOVNP32rr; break;
+ case X86::CMOVP64rr: Opc = X86::CMOVNP64rr; break;
+ case X86::CMOVNP16rr: Opc = X86::CMOVP16rr; break;
+ case X86::CMOVNP32rr: Opc = X86::CMOVP32rr; break;
+ case X86::CMOVNP64rr: Opc = X86::CMOVP64rr; break;
+ case X86::CMOVO16rr: Opc = X86::CMOVNO16rr; break;
+ case X86::CMOVO32rr: Opc = X86::CMOVNO32rr; break;
+ case X86::CMOVO64rr: Opc = X86::CMOVNO64rr; break;
+ case X86::CMOVNO16rr: Opc = X86::CMOVO16rr; break;
+ case X86::CMOVNO32rr: Opc = X86::CMOVO32rr; break;
+ case X86::CMOVNO64rr: Opc = X86::CMOVO64rr; break;
+ }
+ if (NewMI) {
+ MachineFunction &MF = *MI->getParent()->getParent();
+ MI = MF.CloneMachineInstr(MI);
+ NewMI = false;
+ }
+ MI->setDesc(get(Opc));
+ // Fallthrough intended.
+ }
+ default:
+ return TargetInstrInfo::commuteInstruction(MI, NewMI);
+ }
+}
+
+static X86::CondCode getCondFromBranchOpc(unsigned BrOpc) {
+ switch (BrOpc) {
+ default: return X86::COND_INVALID;
+ case X86::JE_4: return X86::COND_E;
+ case X86::JNE_4: return X86::COND_NE;
+ case X86::JL_4: return X86::COND_L;
+ case X86::JLE_4: return X86::COND_LE;
+ case X86::JG_4: return X86::COND_G;
+ case X86::JGE_4: return X86::COND_GE;
+ case X86::JB_4: return X86::COND_B;
+ case X86::JBE_4: return X86::COND_BE;
+ case X86::JA_4: return X86::COND_A;
+ case X86::JAE_4: return X86::COND_AE;
+ case X86::JS_4: return X86::COND_S;
+ case X86::JNS_4: return X86::COND_NS;
+ case X86::JP_4: return X86::COND_P;
+ case X86::JNP_4: return X86::COND_NP;
+ case X86::JO_4: return X86::COND_O;
+ case X86::JNO_4: return X86::COND_NO;
+ }
+}
+
+/// getCondFromSETOpc - return condition code of a SET opcode.
+static X86::CondCode getCondFromSETOpc(unsigned Opc) {
+ switch (Opc) {
+ default: return X86::COND_INVALID;
+ case X86::SETAr: case X86::SETAm: return X86::COND_A;
+ case X86::SETAEr: case X86::SETAEm: return X86::COND_AE;
+ case X86::SETBr: case X86::SETBm: return X86::COND_B;
+ case X86::SETBEr: case X86::SETBEm: return X86::COND_BE;
+ case X86::SETEr: case X86::SETEm: return X86::COND_E;
+ case X86::SETGr: case X86::SETGm: return X86::COND_G;
+ case X86::SETGEr: case X86::SETGEm: return X86::COND_GE;
+ case X86::SETLr: case X86::SETLm: return X86::COND_L;
+ case X86::SETLEr: case X86::SETLEm: return X86::COND_LE;
+ case X86::SETNEr: case X86::SETNEm: return X86::COND_NE;
+ case X86::SETNOr: case X86::SETNOm: return X86::COND_NO;
+ case X86::SETNPr: case X86::SETNPm: return X86::COND_NP;
+ case X86::SETNSr: case X86::SETNSm: return X86::COND_NS;
+ case X86::SETOr: case X86::SETOm: return X86::COND_O;
+ case X86::SETPr: case X86::SETPm: return X86::COND_P;
+ case X86::SETSr: case X86::SETSm: return X86::COND_S;
+ }
+}
+
+/// getCondFromCmovOpc - return condition code of a CMov opcode.
+X86::CondCode X86::getCondFromCMovOpc(unsigned Opc) {
+ switch (Opc) {
+ default: return X86::COND_INVALID;
+ case X86::CMOVA16rm: case X86::CMOVA16rr: case X86::CMOVA32rm:
+ case X86::CMOVA32rr: case X86::CMOVA64rm: case X86::CMOVA64rr:
+ return X86::COND_A;
+ case X86::CMOVAE16rm: case X86::CMOVAE16rr: case X86::CMOVAE32rm:
+ case X86::CMOVAE32rr: case X86::CMOVAE64rm: case X86::CMOVAE64rr:
+ return X86::COND_AE;
+ case X86::CMOVB16rm: case X86::CMOVB16rr: case X86::CMOVB32rm:
+ case X86::CMOVB32rr: case X86::CMOVB64rm: case X86::CMOVB64rr:
+ return X86::COND_B;
+ case X86::CMOVBE16rm: case X86::CMOVBE16rr: case X86::CMOVBE32rm:
+ case X86::CMOVBE32rr: case X86::CMOVBE64rm: case X86::CMOVBE64rr:
+ return X86::COND_BE;
+ case X86::CMOVE16rm: case X86::CMOVE16rr: case X86::CMOVE32rm:
+ case X86::CMOVE32rr: case X86::CMOVE64rm: case X86::CMOVE64rr:
+ return X86::COND_E;
+ case X86::CMOVG16rm: case X86::CMOVG16rr: case X86::CMOVG32rm:
+ case X86::CMOVG32rr: case X86::CMOVG64rm: case X86::CMOVG64rr:
+ return X86::COND_G;
+ case X86::CMOVGE16rm: case X86::CMOVGE16rr: case X86::CMOVGE32rm:
+ case X86::CMOVGE32rr: case X86::CMOVGE64rm: case X86::CMOVGE64rr:
+ return X86::COND_GE;
+ case X86::CMOVL16rm: case X86::CMOVL16rr: case X86::CMOVL32rm:
+ case X86::CMOVL32rr: case X86::CMOVL64rm: case X86::CMOVL64rr:
+ return X86::COND_L;
+ case X86::CMOVLE16rm: case X86::CMOVLE16rr: case X86::CMOVLE32rm:
+ case X86::CMOVLE32rr: case X86::CMOVLE64rm: case X86::CMOVLE64rr:
+ return X86::COND_LE;
+ case X86::CMOVNE16rm: case X86::CMOVNE16rr: case X86::CMOVNE32rm:
+ case X86::CMOVNE32rr: case X86::CMOVNE64rm: case X86::CMOVNE64rr:
+ return X86::COND_NE;
+ case X86::CMOVNO16rm: case X86::CMOVNO16rr: case X86::CMOVNO32rm:
+ case X86::CMOVNO32rr: case X86::CMOVNO64rm: case X86::CMOVNO64rr:
+ return X86::COND_NO;
+ case X86::CMOVNP16rm: case X86::CMOVNP16rr: case X86::CMOVNP32rm:
+ case X86::CMOVNP32rr: case X86::CMOVNP64rm: case X86::CMOVNP64rr:
+ return X86::COND_NP;
+ case X86::CMOVNS16rm: case X86::CMOVNS16rr: case X86::CMOVNS32rm:
+ case X86::CMOVNS32rr: case X86::CMOVNS64rm: case X86::CMOVNS64rr:
+ return X86::COND_NS;
+ case X86::CMOVO16rm: case X86::CMOVO16rr: case X86::CMOVO32rm:
+ case X86::CMOVO32rr: case X86::CMOVO64rm: case X86::CMOVO64rr:
+ return X86::COND_O;
+ case X86::CMOVP16rm: case X86::CMOVP16rr: case X86::CMOVP32rm:
+ case X86::CMOVP32rr: case X86::CMOVP64rm: case X86::CMOVP64rr:
+ return X86::COND_P;
+ case X86::CMOVS16rm: case X86::CMOVS16rr: case X86::CMOVS32rm:
+ case X86::CMOVS32rr: case X86::CMOVS64rm: case X86::CMOVS64rr:
+ return X86::COND_S;
+ }
+}
+
+unsigned X86::GetCondBranchFromCond(X86::CondCode CC) {
+ switch (CC) {
+ default: llvm_unreachable("Illegal condition code!");
+ case X86::COND_E: return X86::JE_4;
+ case X86::COND_NE: return X86::JNE_4;
+ case X86::COND_L: return X86::JL_4;
+ case X86::COND_LE: return X86::JLE_4;
+ case X86::COND_G: return X86::JG_4;
+ case X86::COND_GE: return X86::JGE_4;
+ case X86::COND_B: return X86::JB_4;
+ case X86::COND_BE: return X86::JBE_4;
+ case X86::COND_A: return X86::JA_4;
+ case X86::COND_AE: return X86::JAE_4;
+ case X86::COND_S: return X86::JS_4;
+ case X86::COND_NS: return X86::JNS_4;
+ case X86::COND_P: return X86::JP_4;
+ case X86::COND_NP: return X86::JNP_4;
+ case X86::COND_O: return X86::JO_4;
+ case X86::COND_NO: return X86::JNO_4;
+ }
+}
+
+/// GetOppositeBranchCondition - Return the inverse of the specified condition,
+/// e.g. turning COND_E to COND_NE.
+X86::CondCode X86::GetOppositeBranchCondition(X86::CondCode CC) {
+ switch (CC) {
+ default: llvm_unreachable("Illegal condition code!");
+ case X86::COND_E: return X86::COND_NE;
+ case X86::COND_NE: return X86::COND_E;
+ case X86::COND_L: return X86::COND_GE;
+ case X86::COND_LE: return X86::COND_G;
+ case X86::COND_G: return X86::COND_LE;
+ case X86::COND_GE: return X86::COND_L;
+ case X86::COND_B: return X86::COND_AE;
+ case X86::COND_BE: return X86::COND_A;
+ case X86::COND_A: return X86::COND_BE;
+ case X86::COND_AE: return X86::COND_B;
+ case X86::COND_S: return X86::COND_NS;
+ case X86::COND_NS: return X86::COND_S;
+ case X86::COND_P: return X86::COND_NP;
+ case X86::COND_NP: return X86::COND_P;
+ case X86::COND_O: return X86::COND_NO;
+ case X86::COND_NO: return X86::COND_O;
+ }
+}
+
+/// getSwappedCondition - assume the flags are set by MI(a,b), return
+/// the condition code if we modify the instructions such that flags are
+/// set by MI(b,a).
+static X86::CondCode getSwappedCondition(X86::CondCode CC) {
+ switch (CC) {
+ default: return X86::COND_INVALID;
+ case X86::COND_E: return X86::COND_E;
+ case X86::COND_NE: return X86::COND_NE;
+ case X86::COND_L: return X86::COND_G;
+ case X86::COND_LE: return X86::COND_GE;
+ case X86::COND_G: return X86::COND_L;
+ case X86::COND_GE: return X86::COND_LE;
+ case X86::COND_B: return X86::COND_A;
+ case X86::COND_BE: return X86::COND_AE;
+ case X86::COND_A: return X86::COND_B;
+ case X86::COND_AE: return X86::COND_BE;
+ }
+}
+
+/// getSETFromCond - Return a set opcode for the given condition and
+/// whether it has memory operand.
+static unsigned getSETFromCond(X86::CondCode CC,
+ bool HasMemoryOperand) {
+ static const uint16_t Opc[16][2] = {
+ { X86::SETAr, X86::SETAm },
+ { X86::SETAEr, X86::SETAEm },
+ { X86::SETBr, X86::SETBm },
+ { X86::SETBEr, X86::SETBEm },
+ { X86::SETEr, X86::SETEm },
+ { X86::SETGr, X86::SETGm },
+ { X86::SETGEr, X86::SETGEm },
+ { X86::SETLr, X86::SETLm },
+ { X86::SETLEr, X86::SETLEm },
+ { X86::SETNEr, X86::SETNEm },
+ { X86::SETNOr, X86::SETNOm },
+ { X86::SETNPr, X86::SETNPm },
+ { X86::SETNSr, X86::SETNSm },
+ { X86::SETOr, X86::SETOm },
+ { X86::SETPr, X86::SETPm },
+ { X86::SETSr, X86::SETSm }
+ };
+
+ assert(CC < 16 && "Can only handle standard cond codes");
+ return Opc[CC][HasMemoryOperand ? 1 : 0];
+}
+
+/// getCMovFromCond - Return a cmov opcode for the given condition,
+/// register size in bytes, and operand type.
+static unsigned getCMovFromCond(X86::CondCode CC, unsigned RegBytes,
+ bool HasMemoryOperand) {
+ static const uint16_t Opc[32][3] = {
+ { X86::CMOVA16rr, X86::CMOVA32rr, X86::CMOVA64rr },
+ { X86::CMOVAE16rr, X86::CMOVAE32rr, X86::CMOVAE64rr },
+ { X86::CMOVB16rr, X86::CMOVB32rr, X86::CMOVB64rr },
+ { X86::CMOVBE16rr, X86::CMOVBE32rr, X86::CMOVBE64rr },
+ { X86::CMOVE16rr, X86::CMOVE32rr, X86::CMOVE64rr },
+ { X86::CMOVG16rr, X86::CMOVG32rr, X86::CMOVG64rr },
+ { X86::CMOVGE16rr, X86::CMOVGE32rr, X86::CMOVGE64rr },
+ { X86::CMOVL16rr, X86::CMOVL32rr, X86::CMOVL64rr },
+ { X86::CMOVLE16rr, X86::CMOVLE32rr, X86::CMOVLE64rr },
+ { X86::CMOVNE16rr, X86::CMOVNE32rr, X86::CMOVNE64rr },
+ { X86::CMOVNO16rr, X86::CMOVNO32rr, X86::CMOVNO64rr },
+ { X86::CMOVNP16rr, X86::CMOVNP32rr, X86::CMOVNP64rr },
+ { X86::CMOVNS16rr, X86::CMOVNS32rr, X86::CMOVNS64rr },
+ { X86::CMOVO16rr, X86::CMOVO32rr, X86::CMOVO64rr },
+ { X86::CMOVP16rr, X86::CMOVP32rr, X86::CMOVP64rr },
+ { X86::CMOVS16rr, X86::CMOVS32rr, X86::CMOVS64rr },
+ { X86::CMOVA16rm, X86::CMOVA32rm, X86::CMOVA64rm },
+ { X86::CMOVAE16rm, X86::CMOVAE32rm, X86::CMOVAE64rm },
+ { X86::CMOVB16rm, X86::CMOVB32rm, X86::CMOVB64rm },
+ { X86::CMOVBE16rm, X86::CMOVBE32rm, X86::CMOVBE64rm },
+ { X86::CMOVE16rm, X86::CMOVE32rm, X86::CMOVE64rm },
+ { X86::CMOVG16rm, X86::CMOVG32rm, X86::CMOVG64rm },
+ { X86::CMOVGE16rm, X86::CMOVGE32rm, X86::CMOVGE64rm },
+ { X86::CMOVL16rm, X86::CMOVL32rm, X86::CMOVL64rm },
+ { X86::CMOVLE16rm, X86::CMOVLE32rm, X86::CMOVLE64rm },
+ { X86::CMOVNE16rm, X86::CMOVNE32rm, X86::CMOVNE64rm },
+ { X86::CMOVNO16rm, X86::CMOVNO32rm, X86::CMOVNO64rm },
+ { X86::CMOVNP16rm, X86::CMOVNP32rm, X86::CMOVNP64rm },
+ { X86::CMOVNS16rm, X86::CMOVNS32rm, X86::CMOVNS64rm },
+ { X86::CMOVO16rm, X86::CMOVO32rm, X86::CMOVO64rm },
+ { X86::CMOVP16rm, X86::CMOVP32rm, X86::CMOVP64rm },
+ { X86::CMOVS16rm, X86::CMOVS32rm, X86::CMOVS64rm }
+ };
+
+ assert(CC < 16 && "Can only handle standard cond codes");
+ unsigned Idx = HasMemoryOperand ? 16+CC : CC;
+ switch(RegBytes) {
+ default: llvm_unreachable("Illegal register size!");
+ case 2: return Opc[Idx][0];
+ case 4: return Opc[Idx][1];
+ case 8: return Opc[Idx][2];
+ }
+}
+
+bool X86InstrInfo::isUnpredicatedTerminator(const MachineInstr *MI) const {
+ if (!MI->isTerminator()) return false;
+
+ // Conditional branch is a special case.
+ if (MI->isBranch() && !MI->isBarrier())
+ return true;
+ if (!MI->isPredicable())
+ return true;
+ return !isPredicated(MI);
+}
+
+bool X86InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const {
+ // Start from the bottom of the block and work up, examining the
+ // terminator instructions.
+ MachineBasicBlock::iterator I = MBB.end();
+ MachineBasicBlock::iterator UnCondBrIter = MBB.end();
+ while (I != MBB.begin()) {
+ --I;
+ if (I->isDebugValue())
+ continue;
+
+ // Working from the bottom, when we see a non-terminator instruction, we're
+ // done.
+ if (!isUnpredicatedTerminator(I))
+ break;
+
+ // A terminator that isn't a branch can't easily be handled by this
+ // analysis.
+ if (!I->isBranch())
+ return true;
+
+ // Handle unconditional branches.
+ if (I->getOpcode() == X86::JMP_4) {
+ UnCondBrIter = I;
+
+ if (!AllowModify) {
+ TBB = I->getOperand(0).getMBB();
+ continue;
+ }
+
+ // If the block has any instructions after a JMP, delete them.
+ while (llvm::next(I) != MBB.end())
+ llvm::next(I)->eraseFromParent();
+
+ Cond.clear();
+ FBB = 0;
+
+ // Delete the JMP if it's equivalent to a fall-through.
+ if (MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) {
+ TBB = 0;
+ I->eraseFromParent();
+ I = MBB.end();
+ UnCondBrIter = MBB.end();
+ continue;
+ }
+
+ // TBB is used to indicate the unconditional destination.
+ TBB = I->getOperand(0).getMBB();
+ continue;
+ }
+
+ // Handle conditional branches.
+ X86::CondCode BranchCode = getCondFromBranchOpc(I->getOpcode());
+ if (BranchCode == X86::COND_INVALID)
+ return true; // Can't handle indirect branch.
+
+ // Working from the bottom, handle the first conditional branch.
+ if (Cond.empty()) {
+ MachineBasicBlock *TargetBB = I->getOperand(0).getMBB();
+ if (AllowModify && UnCondBrIter != MBB.end() &&
+ MBB.isLayoutSuccessor(TargetBB)) {
+ // If we can modify the code and it ends in something like:
+ //
+ // jCC L1
+ // jmp L2
+ // L1:
+ // ...
+ // L2:
+ //
+ // Then we can change this to:
+ //
+ // jnCC L2
+ // L1:
+ // ...
+ // L2:
+ //
+ // Which is a bit more efficient.
+ // We conditionally jump to the fall-through block.
+ BranchCode = GetOppositeBranchCondition(BranchCode);
+ unsigned JNCC = GetCondBranchFromCond(BranchCode);
+ MachineBasicBlock::iterator OldInst = I;
+
+ BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(JNCC))
+ .addMBB(UnCondBrIter->getOperand(0).getMBB());
+ BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(X86::JMP_4))
+ .addMBB(TargetBB);
+
+ OldInst->eraseFromParent();
+ UnCondBrIter->eraseFromParent();
+
+ // Restart the analysis.
+ UnCondBrIter = MBB.end();
+ I = MBB.end();
+ continue;
+ }
+
+ FBB = TBB;
+ TBB = I->getOperand(0).getMBB();
+ Cond.push_back(MachineOperand::CreateImm(BranchCode));
+ continue;
+ }
+
+ // Handle subsequent conditional branches. Only handle the case where all
+ // conditional branches branch to the same destination and their condition
+ // opcodes fit one of the special multi-branch idioms.
+ assert(Cond.size() == 1);
+ assert(TBB);
+
+ // Only handle the case where all conditional branches branch to the same
+ // destination.
+ if (TBB != I->getOperand(0).getMBB())
+ return true;
+
+ // If the conditions are the same, we can leave them alone.
+ X86::CondCode OldBranchCode = (X86::CondCode)Cond[0].getImm();
+ if (OldBranchCode == BranchCode)
+ continue;
+
+ // If they differ, see if they fit one of the known patterns. Theoretically,
+ // we could handle more patterns here, but we shouldn't expect to see them
+ // if instruction selection has done a reasonable job.
+ if ((OldBranchCode == X86::COND_NP &&
+ BranchCode == X86::COND_E) ||
+ (OldBranchCode == X86::COND_E &&
+ BranchCode == X86::COND_NP))
+ BranchCode = X86::COND_NP_OR_E;
+ else if ((OldBranchCode == X86::COND_P &&
+ BranchCode == X86::COND_NE) ||
+ (OldBranchCode == X86::COND_NE &&
+ BranchCode == X86::COND_P))
+ BranchCode = X86::COND_NE_OR_P;
+ else
+ return true;
+
+ // Update the MachineOperand.
+ Cond[0].setImm(BranchCode);
+ }
+
+ return false;
+}
+
+unsigned X86InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
+ MachineBasicBlock::iterator I = MBB.end();
+ unsigned Count = 0;
+
+ while (I != MBB.begin()) {
+ --I;
+ if (I->isDebugValue())
+ continue;
+ if (I->getOpcode() != X86::JMP_4 &&
+ getCondFromBranchOpc(I->getOpcode()) == X86::COND_INVALID)
+ break;
+ // Remove the branch.
+ I->eraseFromParent();
+ I = MBB.end();
+ ++Count;
+ }
+
+ return Count;
+}
+
+unsigned
+X86InstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB,
+ const SmallVectorImpl<MachineOperand> &Cond,
+ DebugLoc DL) const {
+ // Shouldn't be a fall through.
+ assert(TBB && "InsertBranch must not be told to insert a fallthrough");
+ assert((Cond.size() == 1 || Cond.size() == 0) &&
+ "X86 branch conditions have one component!");
+
+ if (Cond.empty()) {
+ // Unconditional branch?
+ assert(!FBB && "Unconditional branch with multiple successors!");
+ BuildMI(&MBB, DL, get(X86::JMP_4)).addMBB(TBB);
+ return 1;
+ }
+
+ // Conditional branch.
+ unsigned Count = 0;
+ X86::CondCode CC = (X86::CondCode)Cond[0].getImm();
+ switch (CC) {
+ case X86::COND_NP_OR_E:
+ // Synthesize NP_OR_E with two branches.
+ BuildMI(&MBB, DL, get(X86::JNP_4)).addMBB(TBB);
+ ++Count;
+ BuildMI(&MBB, DL, get(X86::JE_4)).addMBB(TBB);
+ ++Count;
+ break;
+ case X86::COND_NE_OR_P:
+ // Synthesize NE_OR_P with two branches.
+ BuildMI(&MBB, DL, get(X86::JNE_4)).addMBB(TBB);
+ ++Count;
+ BuildMI(&MBB, DL, get(X86::JP_4)).addMBB(TBB);
+ ++Count;
+ break;
+ default: {
+ unsigned Opc = GetCondBranchFromCond(CC);
+ BuildMI(&MBB, DL, get(Opc)).addMBB(TBB);
+ ++Count;
+ }
+ }
+ if (FBB) {
+ // Two-way Conditional branch. Insert the second branch.
+ BuildMI(&MBB, DL, get(X86::JMP_4)).addMBB(FBB);
+ ++Count;
+ }
+ return Count;
+}
+
+bool X86InstrInfo::
+canInsertSelect(const MachineBasicBlock &MBB,
+ const SmallVectorImpl<MachineOperand> &Cond,
+ unsigned TrueReg, unsigned FalseReg,
+ int &CondCycles, int &TrueCycles, int &FalseCycles) const {
+ // Not all subtargets have cmov instructions.
+ if (!TM.getSubtarget<X86Subtarget>().hasCMov())
+ return false;
+ if (Cond.size() != 1)
+ return false;
+ // We cannot do the composite conditions, at least not in SSA form.
+ if ((X86::CondCode)Cond[0].getImm() > X86::COND_S)
+ return false;
+
+ // Check register classes.
+ const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ const TargetRegisterClass *RC =
+ RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
+ if (!RC)
+ return false;
+
+ // We have cmov instructions for 16, 32, and 64 bit general purpose registers.
+ if (X86::GR16RegClass.hasSubClassEq(RC) ||
+ X86::GR32RegClass.hasSubClassEq(RC) ||
+ X86::GR64RegClass.hasSubClassEq(RC)) {
+ // This latency applies to Pentium M, Merom, Wolfdale, Nehalem, and Sandy
+ // Bridge. Probably Ivy Bridge as well.
+ CondCycles = 2;
+ TrueCycles = 2;
+ FalseCycles = 2;
+ return true;
+ }
+
+ // Can't do vectors.
+ return false;
+}
+
+void X86InstrInfo::insertSelect(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I, DebugLoc DL,
+ unsigned DstReg,
+ const SmallVectorImpl<MachineOperand> &Cond,
+ unsigned TrueReg, unsigned FalseReg) const {
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ assert(Cond.size() == 1 && "Invalid Cond array");
+ unsigned Opc = getCMovFromCond((X86::CondCode)Cond[0].getImm(),
+ MRI.getRegClass(DstReg)->getSize(),
+ false/*HasMemoryOperand*/);
+ BuildMI(MBB, I, DL, get(Opc), DstReg).addReg(FalseReg).addReg(TrueReg);
+}
+
+/// isHReg - Test if the given register is a physical h register.
+static bool isHReg(unsigned Reg) {
+ return X86::GR8_ABCD_HRegClass.contains(Reg);
+}
+
+// Try and copy between VR128/VR64 and GR64 registers.
+static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg,
+ bool HasAVX) {
+ // SrcReg(VR128) -> DestReg(GR64)
+ // SrcReg(VR64) -> DestReg(GR64)
+ // SrcReg(GR64) -> DestReg(VR128)
+ // SrcReg(GR64) -> DestReg(VR64)
+
+ if (X86::GR64RegClass.contains(DestReg)) {
+ if (X86::VR128RegClass.contains(SrcReg))
+ // Copy from a VR128 register to a GR64 register.
+ return HasAVX ? X86::VMOVPQIto64rr : X86::MOVPQIto64rr;
+ if (X86::VR64RegClass.contains(SrcReg))
+ // Copy from a VR64 register to a GR64 register.
+ return X86::MOVSDto64rr;
+ } else if (X86::GR64RegClass.contains(SrcReg)) {
+ // Copy from a GR64 register to a VR128 register.
+ if (X86::VR128RegClass.contains(DestReg))
+ return HasAVX ? X86::VMOV64toPQIrr : X86::MOV64toPQIrr;
+ // Copy from a GR64 register to a VR64 register.
+ if (X86::VR64RegClass.contains(DestReg))
+ return X86::MOV64toSDrr;
+ }
+
+ // SrcReg(FR32) -> DestReg(GR32)
+ // SrcReg(GR32) -> DestReg(FR32)
+
+ if (X86::GR32RegClass.contains(DestReg) && X86::FR32RegClass.contains(SrcReg))
+ // Copy from a FR32 register to a GR32 register.
+ return HasAVX ? X86::VMOVSS2DIrr : X86::MOVSS2DIrr;
+
+ if (X86::FR32RegClass.contains(DestReg) && X86::GR32RegClass.contains(SrcReg))
+ // Copy from a GR32 register to a FR32 register.
+ return HasAVX ? X86::VMOVDI2SSrr : X86::MOVDI2SSrr;
+
+ return 0;
+}
+
+void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI, DebugLoc DL,
+ unsigned DestReg, unsigned SrcReg,
+ bool KillSrc) const {
+ // First deal with the normal symmetric copies.
+ bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX();
+ unsigned Opc;
+ if (X86::GR64RegClass.contains(DestReg, SrcReg))
+ Opc = X86::MOV64rr;
+ else if (X86::GR32RegClass.contains(DestReg, SrcReg))
+ Opc = X86::MOV32rr;
+ else if (X86::GR16RegClass.contains(DestReg, SrcReg))
+ Opc = X86::MOV16rr;
+ else if (X86::GR8RegClass.contains(DestReg, SrcReg)) {
+ // Copying to or from a physical H register on x86-64 requires a NOREX
+ // move. Otherwise use a normal move.
+ if ((isHReg(DestReg) || isHReg(SrcReg)) &&
+ TM.getSubtarget<X86Subtarget>().is64Bit()) {
+ Opc = X86::MOV8rr_NOREX;
+ // Both operands must be encodable without an REX prefix.
+ assert(X86::GR8_NOREXRegClass.contains(SrcReg, DestReg) &&
+ "8-bit H register can not be copied outside GR8_NOREX");
+ } else
+ Opc = X86::MOV8rr;
+ } else if (X86::VR128RegClass.contains(DestReg, SrcReg))
+ Opc = HasAVX ? X86::VMOVAPSrr : X86::MOVAPSrr;
+ else if (X86::VR256RegClass.contains(DestReg, SrcReg))
+ Opc = X86::VMOVAPSYrr;
+ else if (X86::VR64RegClass.contains(DestReg, SrcReg))
+ Opc = X86::MMX_MOVQ64rr;
+ else
+ Opc = CopyToFromAsymmetricReg(DestReg, SrcReg, HasAVX);
+
+ if (Opc) {
+ BuildMI(MBB, MI, DL, get(Opc), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ return;
+ }
+
+ // Moving EFLAGS to / from another register requires a push and a pop.
+ // Notice that we have to adjust the stack if we don't want to clobber the
+ // first frame index. See X86FrameLowering.cpp - colobbersTheStack.
+ if (SrcReg == X86::EFLAGS) {
+ if (X86::GR64RegClass.contains(DestReg)) {
+ BuildMI(MBB, MI, DL, get(X86::PUSHF64));
+ BuildMI(MBB, MI, DL, get(X86::POP64r), DestReg);
+ return;
+ }
+ if (X86::GR32RegClass.contains(DestReg)) {
+ BuildMI(MBB, MI, DL, get(X86::PUSHF32));
+ BuildMI(MBB, MI, DL, get(X86::POP32r), DestReg);
+ return;
+ }
+ }
+ if (DestReg == X86::EFLAGS) {
+ if (X86::GR64RegClass.contains(SrcReg)) {
+ BuildMI(MBB, MI, DL, get(X86::PUSH64r))
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ BuildMI(MBB, MI, DL, get(X86::POPF64));
+ return;
+ }
+ if (X86::GR32RegClass.contains(SrcReg)) {
+ BuildMI(MBB, MI, DL, get(X86::PUSH32r))
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ BuildMI(MBB, MI, DL, get(X86::POPF32));
+ return;
+ }
+ }
+
+ DEBUG(dbgs() << "Cannot copy " << RI.getName(SrcReg)
+ << " to " << RI.getName(DestReg) << '\n');
+ llvm_unreachable("Cannot emit physreg copy instruction");
+}
+
+static unsigned getLoadStoreRegOpcode(unsigned Reg,
+ const TargetRegisterClass *RC,
+ bool isStackAligned,
+ const TargetMachine &TM,
+ bool load) {
+ bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX();
+ switch (RC->getSize()) {
+ default:
+ llvm_unreachable("Unknown spill size");
+ case 1:
+ assert(X86::GR8RegClass.hasSubClassEq(RC) && "Unknown 1-byte regclass");
+ if (TM.getSubtarget<X86Subtarget>().is64Bit())
+ // Copying to or from a physical H register on x86-64 requires a NOREX
+ // move. Otherwise use a normal move.
+ if (isHReg(Reg) || X86::GR8_ABCD_HRegClass.hasSubClassEq(RC))
+ return load ? X86::MOV8rm_NOREX : X86::MOV8mr_NOREX;
+ return load ? X86::MOV8rm : X86::MOV8mr;
+ case 2:
+ assert(X86::GR16RegClass.hasSubClassEq(RC) && "Unknown 2-byte regclass");
+ return load ? X86::MOV16rm : X86::MOV16mr;
+ case 4:
+ if (X86::GR32RegClass.hasSubClassEq(RC))
+ return load ? X86::MOV32rm : X86::MOV32mr;
+ if (X86::FR32RegClass.hasSubClassEq(RC))
+ return load ?
+ (HasAVX ? X86::VMOVSSrm : X86::MOVSSrm) :
+ (HasAVX ? X86::VMOVSSmr : X86::MOVSSmr);
+ if (X86::RFP32RegClass.hasSubClassEq(RC))
+ return load ? X86::LD_Fp32m : X86::ST_Fp32m;
+ llvm_unreachable("Unknown 4-byte regclass");
+ case 8:
+ if (X86::GR64RegClass.hasSubClassEq(RC))
+ return load ? X86::MOV64rm : X86::MOV64mr;
+ if (X86::FR64RegClass.hasSubClassEq(RC))
+ return load ?
+ (HasAVX ? X86::VMOVSDrm : X86::MOVSDrm) :
+ (HasAVX ? X86::VMOVSDmr : X86::MOVSDmr);
+ if (X86::VR64RegClass.hasSubClassEq(RC))
+ return load ? X86::MMX_MOVQ64rm : X86::MMX_MOVQ64mr;
+ if (X86::RFP64RegClass.hasSubClassEq(RC))
+ return load ? X86::LD_Fp64m : X86::ST_Fp64m;
+ llvm_unreachable("Unknown 8-byte regclass");
+ case 10:
+ assert(X86::RFP80RegClass.hasSubClassEq(RC) && "Unknown 10-byte regclass");
+ return load ? X86::LD_Fp80m : X86::ST_FpP80m;
+ case 16: {
+ assert(X86::VR128RegClass.hasSubClassEq(RC) && "Unknown 16-byte regclass");
+ // If stack is realigned we can use aligned stores.
+ if (isStackAligned)
+ return load ?
+ (HasAVX ? X86::VMOVAPSrm : X86::MOVAPSrm) :
+ (HasAVX ? X86::VMOVAPSmr : X86::MOVAPSmr);
+ else
+ return load ?
+ (HasAVX ? X86::VMOVUPSrm : X86::MOVUPSrm) :
+ (HasAVX ? X86::VMOVUPSmr : X86::MOVUPSmr);
+ }
+ case 32:
+ assert(X86::VR256RegClass.hasSubClassEq(RC) && "Unknown 32-byte regclass");
+ // If stack is realigned we can use aligned stores.
+ if (isStackAligned)
+ return load ? X86::VMOVAPSYrm : X86::VMOVAPSYmr;
+ else
+ return load ? X86::VMOVUPSYrm : X86::VMOVUPSYmr;
+ }
+}
+
+static unsigned getStoreRegOpcode(unsigned SrcReg,
+ const TargetRegisterClass *RC,
+ bool isStackAligned,
+ TargetMachine &TM) {
+ return getLoadStoreRegOpcode(SrcReg, RC, isStackAligned, TM, false);
+}
+
+
+static unsigned getLoadRegOpcode(unsigned DestReg,
+ const TargetRegisterClass *RC,
+ bool isStackAligned,
+ const TargetMachine &TM) {
+ return getLoadStoreRegOpcode(DestReg, RC, isStackAligned, TM, true);
+}
+
+void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned SrcReg, bool isKill, int FrameIdx,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const {
+ const MachineFunction &MF = *MBB.getParent();
+ assert(MF.getFrameInfo()->getObjectSize(FrameIdx) >= RC->getSize() &&
+ "Stack slot too small for store");
+ unsigned Alignment = RC->getSize() == 32 ? 32 : 16;
+ bool isAligned = (TM.getFrameLowering()->getStackAlignment() >= Alignment) ||
+ RI.canRealignStack(MF);
+ unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, TM);
+ DebugLoc DL = MBB.findDebugLoc(MI);
+ addFrameReference(BuildMI(MBB, MI, DL, get(Opc)), FrameIdx)
+ .addReg(SrcReg, getKillRegState(isKill));
+}
+
+void X86InstrInfo::storeRegToAddr(MachineFunction &MF, unsigned SrcReg,
+ bool isKill,
+ SmallVectorImpl<MachineOperand> &Addr,
+ const TargetRegisterClass *RC,
+ MachineInstr::mmo_iterator MMOBegin,
+ MachineInstr::mmo_iterator MMOEnd,
+ SmallVectorImpl<MachineInstr*> &NewMIs) const {
+ unsigned Alignment = RC->getSize() == 32 ? 32 : 16;
+ bool isAligned = MMOBegin != MMOEnd &&
+ (*MMOBegin)->getAlignment() >= Alignment;
+ unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, TM);
+ DebugLoc DL;
+ MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc));
+ for (unsigned i = 0, e = Addr.size(); i != e; ++i)
+ MIB.addOperand(Addr[i]);
+ MIB.addReg(SrcReg, getKillRegState(isKill));
+ (*MIB).setMemRefs(MMOBegin, MMOEnd);
+ NewMIs.push_back(MIB);
+}
+
+
+void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned DestReg, int FrameIdx,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const {
+ const MachineFunction &MF = *MBB.getParent();
+ unsigned Alignment = RC->getSize() == 32 ? 32 : 16;
+ bool isAligned = (TM.getFrameLowering()->getStackAlignment() >= Alignment) ||
+ RI.canRealignStack(MF);
+ unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, TM);
+ DebugLoc DL = MBB.findDebugLoc(MI);
+ addFrameReference(BuildMI(MBB, MI, DL, get(Opc), DestReg), FrameIdx);
+}
+
+void X86InstrInfo::loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
+ SmallVectorImpl<MachineOperand> &Addr,
+ const TargetRegisterClass *RC,
+ MachineInstr::mmo_iterator MMOBegin,
+ MachineInstr::mmo_iterator MMOEnd,
+ SmallVectorImpl<MachineInstr*> &NewMIs) const {
+ unsigned Alignment = RC->getSize() == 32 ? 32 : 16;
+ bool isAligned = MMOBegin != MMOEnd &&
+ (*MMOBegin)->getAlignment() >= Alignment;
+ unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, TM);
+ DebugLoc DL;
+ MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc), DestReg);
+ for (unsigned i = 0, e = Addr.size(); i != e; ++i)
+ MIB.addOperand(Addr[i]);
+ (*MIB).setMemRefs(MMOBegin, MMOEnd);
+ NewMIs.push_back(MIB);
+}
+
+bool X86InstrInfo::
+analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, unsigned &SrcReg2,
+ int &CmpMask, int &CmpValue) const {
+ switch (MI->getOpcode()) {
+ default: break;
+ case X86::CMP64ri32:
+ case X86::CMP64ri8:
+ case X86::CMP32ri:
+ case X86::CMP32ri8:
+ case X86::CMP16ri:
+ case X86::CMP16ri8:
+ case X86::CMP8ri:
+ SrcReg = MI->getOperand(0).getReg();
+ SrcReg2 = 0;
+ CmpMask = ~0;
+ CmpValue = MI->getOperand(1).getImm();
+ return true;
+ // A SUB can be used to perform comparison.
+ case X86::SUB64rm:
+ case X86::SUB32rm:
+ case X86::SUB16rm:
+ case X86::SUB8rm:
+ SrcReg = MI->getOperand(1).getReg();
+ SrcReg2 = 0;
+ CmpMask = ~0;
+ CmpValue = 0;
+ return true;
+ case X86::SUB64rr:
+ case X86::SUB32rr:
+ case X86::SUB16rr:
+ case X86::SUB8rr:
+ SrcReg = MI->getOperand(1).getReg();
+ SrcReg2 = MI->getOperand(2).getReg();
+ CmpMask = ~0;
+ CmpValue = 0;
+ return true;
+ case X86::SUB64ri32:
+ case X86::SUB64ri8:
+ case X86::SUB32ri:
+ case X86::SUB32ri8:
+ case X86::SUB16ri:
+ case X86::SUB16ri8:
+ case X86::SUB8ri:
+ SrcReg = MI->getOperand(1).getReg();
+ SrcReg2 = 0;
+ CmpMask = ~0;
+ CmpValue = MI->getOperand(2).getImm();
+ return true;
+ case X86::CMP64rr:
+ case X86::CMP32rr:
+ case X86::CMP16rr:
+ case X86::CMP8rr:
+ SrcReg = MI->getOperand(0).getReg();
+ SrcReg2 = MI->getOperand(1).getReg();
+ CmpMask = ~0;
+ CmpValue = 0;
+ return true;
+ case X86::TEST8rr:
+ case X86::TEST16rr:
+ case X86::TEST32rr:
+ case X86::TEST64rr:
+ SrcReg = MI->getOperand(0).getReg();
+ if (MI->getOperand(1).getReg() != SrcReg) return false;
+ // Compare against zero.
+ SrcReg2 = 0;
+ CmpMask = ~0;
+ CmpValue = 0;
+ return true;
+ }
+ return false;
+}
+
+/// isRedundantFlagInstr - check whether the first instruction, whose only
+/// purpose is to update flags, can be made redundant.
+/// CMPrr can be made redundant by SUBrr if the operands are the same.
+/// This function can be extended later on.
+/// SrcReg, SrcRegs: register operands for FlagI.
+/// ImmValue: immediate for FlagI if it takes an immediate.
+inline static bool isRedundantFlagInstr(MachineInstr *FlagI, unsigned SrcReg,
+ unsigned SrcReg2, int ImmValue,
+ MachineInstr *OI) {
+ if (((FlagI->getOpcode() == X86::CMP64rr &&
+ OI->getOpcode() == X86::SUB64rr) ||
+ (FlagI->getOpcode() == X86::CMP32rr &&
+ OI->getOpcode() == X86::SUB32rr)||
+ (FlagI->getOpcode() == X86::CMP16rr &&
+ OI->getOpcode() == X86::SUB16rr)||
+ (FlagI->getOpcode() == X86::CMP8rr &&
+ OI->getOpcode() == X86::SUB8rr)) &&
+ ((OI->getOperand(1).getReg() == SrcReg &&
+ OI->getOperand(2).getReg() == SrcReg2) ||
+ (OI->getOperand(1).getReg() == SrcReg2 &&
+ OI->getOperand(2).getReg() == SrcReg)))
+ return true;
+
+ if (((FlagI->getOpcode() == X86::CMP64ri32 &&
+ OI->getOpcode() == X86::SUB64ri32) ||
+ (FlagI->getOpcode() == X86::CMP64ri8 &&
+ OI->getOpcode() == X86::SUB64ri8) ||
+ (FlagI->getOpcode() == X86::CMP32ri &&
+ OI->getOpcode() == X86::SUB32ri) ||
+ (FlagI->getOpcode() == X86::CMP32ri8 &&
+ OI->getOpcode() == X86::SUB32ri8) ||
+ (FlagI->getOpcode() == X86::CMP16ri &&
+ OI->getOpcode() == X86::SUB16ri) ||
+ (FlagI->getOpcode() == X86::CMP16ri8 &&
+ OI->getOpcode() == X86::SUB16ri8) ||
+ (FlagI->getOpcode() == X86::CMP8ri &&
+ OI->getOpcode() == X86::SUB8ri)) &&
+ OI->getOperand(1).getReg() == SrcReg &&
+ OI->getOperand(2).getImm() == ImmValue)
+ return true;
+ return false;
+}
+
+/// isDefConvertible - check whether the definition can be converted
+/// to remove a comparison against zero.
+inline static bool isDefConvertible(MachineInstr *MI) {
+ switch (MI->getOpcode()) {
+ default: return false;
+ case X86::SUB64ri32: case X86::SUB64ri8: case X86::SUB32ri:
+ case X86::SUB32ri8: case X86::SUB16ri: case X86::SUB16ri8:
+ case X86::SUB8ri: case X86::SUB64rr: case X86::SUB32rr:
+ case X86::SUB16rr: case X86::SUB8rr: case X86::SUB64rm:
+ case X86::SUB32rm: case X86::SUB16rm: case X86::SUB8rm:
+ case X86::DEC64r: case X86::DEC32r: case X86::DEC16r: case X86::DEC8r:
+ case X86::DEC64_32r: case X86::DEC64_16r:
+ case X86::ADD64ri32: case X86::ADD64ri8: case X86::ADD32ri:
+ case X86::ADD32ri8: case X86::ADD16ri: case X86::ADD16ri8:
+ case X86::ADD8ri: case X86::ADD64rr: case X86::ADD32rr:
+ case X86::ADD16rr: case X86::ADD8rr: case X86::ADD64rm:
+ case X86::ADD32rm: case X86::ADD16rm: case X86::ADD8rm:
+ case X86::INC64r: case X86::INC32r: case X86::INC16r: case X86::INC8r:
+ case X86::INC64_32r: case X86::INC64_16r:
+ case X86::AND64ri32: case X86::AND64ri8: case X86::AND32ri:
+ case X86::AND32ri8: case X86::AND16ri: case X86::AND16ri8:
+ case X86::AND8ri: case X86::AND64rr: case X86::AND32rr:
+ case X86::AND16rr: case X86::AND8rr: case X86::AND64rm:
+ case X86::AND32rm: case X86::AND16rm: case X86::AND8rm:
+ case X86::XOR64ri32: case X86::XOR64ri8: case X86::XOR32ri:
+ case X86::XOR32ri8: case X86::XOR16ri: case X86::XOR16ri8:
+ case X86::XOR8ri: case X86::XOR64rr: case X86::XOR32rr:
+ case X86::XOR16rr: case X86::XOR8rr: case X86::XOR64rm:
+ case X86::XOR32rm: case X86::XOR16rm: case X86::XOR8rm:
+ case X86::OR64ri32: case X86::OR64ri8: case X86::OR32ri:
+ case X86::OR32ri8: case X86::OR16ri: case X86::OR16ri8:
+ case X86::OR8ri: case X86::OR64rr: case X86::OR32rr:
+ case X86::OR16rr: case X86::OR8rr: case X86::OR64rm:
+ case X86::OR32rm: case X86::OR16rm: case X86::OR8rm:
+ case X86::ANDN32rr: case X86::ANDN32rm:
+ case X86::ANDN64rr: case X86::ANDN64rm:
+ return true;
+ }
+}
+
+/// optimizeCompareInstr - Check if there exists an earlier instruction that
+/// operates on the same source operands and sets flags in the same way as
+/// Compare; remove Compare if possible.
+bool X86InstrInfo::
+optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
+ int CmpMask, int CmpValue,
+ const MachineRegisterInfo *MRI) const {
+ // Check whether we can replace SUB with CMP.
+ unsigned NewOpcode = 0;
+ switch (CmpInstr->getOpcode()) {
+ default: break;
+ case X86::SUB64ri32:
+ case X86::SUB64ri8:
+ case X86::SUB32ri:
+ case X86::SUB32ri8:
+ case X86::SUB16ri:
+ case X86::SUB16ri8:
+ case X86::SUB8ri:
+ case X86::SUB64rm:
+ case X86::SUB32rm:
+ case X86::SUB16rm:
+ case X86::SUB8rm:
+ case X86::SUB64rr:
+ case X86::SUB32rr:
+ case X86::SUB16rr:
+ case X86::SUB8rr: {
+ if (!MRI->use_nodbg_empty(CmpInstr->getOperand(0).getReg()))
+ return false;
+ // There is no use of the destination register, we can replace SUB with CMP.
+ switch (CmpInstr->getOpcode()) {
+ default: llvm_unreachable("Unreachable!");
+ case X86::SUB64rm: NewOpcode = X86::CMP64rm; break;
+ case X86::SUB32rm: NewOpcode = X86::CMP32rm; break;
+ case X86::SUB16rm: NewOpcode = X86::CMP16rm; break;
+ case X86::SUB8rm: NewOpcode = X86::CMP8rm; break;
+ case X86::SUB64rr: NewOpcode = X86::CMP64rr; break;
+ case X86::SUB32rr: NewOpcode = X86::CMP32rr; break;
+ case X86::SUB16rr: NewOpcode = X86::CMP16rr; break;
+ case X86::SUB8rr: NewOpcode = X86::CMP8rr; break;
+ case X86::SUB64ri32: NewOpcode = X86::CMP64ri32; break;
+ case X86::SUB64ri8: NewOpcode = X86::CMP64ri8; break;
+ case X86::SUB32ri: NewOpcode = X86::CMP32ri; break;
+ case X86::SUB32ri8: NewOpcode = X86::CMP32ri8; break;
+ case X86::SUB16ri: NewOpcode = X86::CMP16ri; break;
+ case X86::SUB16ri8: NewOpcode = X86::CMP16ri8; break;
+ case X86::SUB8ri: NewOpcode = X86::CMP8ri; break;
+ }
+ CmpInstr->setDesc(get(NewOpcode));
+ CmpInstr->RemoveOperand(0);
+ // Fall through to optimize Cmp if Cmp is CMPrr or CMPri.
+ if (NewOpcode == X86::CMP64rm || NewOpcode == X86::CMP32rm ||
+ NewOpcode == X86::CMP16rm || NewOpcode == X86::CMP8rm)
+ return false;
+ }
+ }
+
+ // Get the unique definition of SrcReg.
+ MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg);
+ if (!MI) return false;
+
+ // CmpInstr is the first instruction of the BB.
+ MachineBasicBlock::iterator I = CmpInstr, Def = MI;
+
+ // If we are comparing against zero, check whether we can use MI to update
+ // EFLAGS. If MI is not in the same BB as CmpInstr, do not optimize.
+ bool IsCmpZero = (SrcReg2 == 0 && CmpValue == 0);
+ if (IsCmpZero && (MI->getParent() != CmpInstr->getParent() ||
+ !isDefConvertible(MI)))
+ return false;
+
+ // We are searching for an earlier instruction that can make CmpInstr
+ // redundant and that instruction will be saved in Sub.
+ MachineInstr *Sub = NULL;
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+
+ // We iterate backward, starting from the instruction before CmpInstr and
+ // stop when reaching the definition of a source register or done with the BB.
+ // RI points to the instruction before CmpInstr.
+ // If the definition is in this basic block, RE points to the definition;
+ // otherwise, RE is the rend of the basic block.
+ MachineBasicBlock::reverse_iterator
+ RI = MachineBasicBlock::reverse_iterator(I),
+ RE = CmpInstr->getParent() == MI->getParent() ?
+ MachineBasicBlock::reverse_iterator(++Def) /* points to MI */ :
+ CmpInstr->getParent()->rend();
+ MachineInstr *Movr0Inst = 0;
+ for (; RI != RE; ++RI) {
+ MachineInstr *Instr = &*RI;
+ // Check whether CmpInstr can be made redundant by the current instruction.
+ if (!IsCmpZero &&
+ isRedundantFlagInstr(CmpInstr, SrcReg, SrcReg2, CmpValue, Instr)) {
+ Sub = Instr;
+ break;
+ }
+
+ if (Instr->modifiesRegister(X86::EFLAGS, TRI) ||
+ Instr->readsRegister(X86::EFLAGS, TRI)) {
+ // This instruction modifies or uses EFLAGS.
+
+ // MOV32r0 etc. are implemented with xor which clobbers condition code.
+ // They are safe to move up, if the definition to EFLAGS is dead and
+ // earlier instructions do not read or write EFLAGS.
+ if (!Movr0Inst && (Instr->getOpcode() == X86::MOV8r0 ||
+ Instr->getOpcode() == X86::MOV16r0 ||
+ Instr->getOpcode() == X86::MOV32r0 ||
+ Instr->getOpcode() == X86::MOV64r0) &&
+ Instr->registerDefIsDead(X86::EFLAGS, TRI)) {
+ Movr0Inst = Instr;
+ continue;
+ }
+
+ // We can't remove CmpInstr.
+ return false;
+ }
+ }
+
+ // Return false if no candidates exist.
+ if (!IsCmpZero && !Sub)
+ return false;
+
+ bool IsSwapped = (SrcReg2 != 0 && Sub->getOperand(1).getReg() == SrcReg2 &&
+ Sub->getOperand(2).getReg() == SrcReg);
+
+ // Scan forward from the instruction after CmpInstr for uses of EFLAGS.
+ // It is safe to remove CmpInstr if EFLAGS is redefined or killed.
+ // If we are done with the basic block, we need to check whether EFLAGS is
+ // live-out.
+ bool IsSafe = false;
+ SmallVector<std::pair<MachineInstr*, unsigned /*NewOpc*/>, 4> OpsToUpdate;
+ MachineBasicBlock::iterator E = CmpInstr->getParent()->end();
+ for (++I; I != E; ++I) {
+ const MachineInstr &Instr = *I;
+ bool ModifyEFLAGS = Instr.modifiesRegister(X86::EFLAGS, TRI);
+ bool UseEFLAGS = Instr.readsRegister(X86::EFLAGS, TRI);
+ // We should check the usage if this instruction uses and updates EFLAGS.
+ if (!UseEFLAGS && ModifyEFLAGS) {
+ // It is safe to remove CmpInstr if EFLAGS is updated again.
+ IsSafe = true;
+ break;
+ }
+ if (!UseEFLAGS && !ModifyEFLAGS)
+ continue;
+
+ // EFLAGS is used by this instruction.
+ X86::CondCode OldCC;
+ bool OpcIsSET = false;
+ if (IsCmpZero || IsSwapped) {
+ // We decode the condition code from opcode.
+ if (Instr.isBranch())
+ OldCC = getCondFromBranchOpc(Instr.getOpcode());
+ else {
+ OldCC = getCondFromSETOpc(Instr.getOpcode());
+ if (OldCC != X86::COND_INVALID)
+ OpcIsSET = true;
+ else
+ OldCC = X86::getCondFromCMovOpc(Instr.getOpcode());
+ }
+ if (OldCC == X86::COND_INVALID) return false;
+ }
+ if (IsCmpZero) {
+ switch (OldCC) {
+ default: break;
+ case X86::COND_A: case X86::COND_AE:
+ case X86::COND_B: case X86::COND_BE:
+ case X86::COND_G: case X86::COND_GE:
+ case X86::COND_L: case X86::COND_LE:
+ case X86::COND_O: case X86::COND_NO:
+ // CF and OF are used, we can't perform this optimization.
+ return false;
+ }
+ } else if (IsSwapped) {
+ // If we have SUB(r1, r2) and CMP(r2, r1), the condition code needs
+ // to be changed from r2 > r1 to r1 < r2, from r2 < r1 to r1 > r2, etc.
+ // We swap the condition code and synthesize the new opcode.
+ X86::CondCode NewCC = getSwappedCondition(OldCC);
+ if (NewCC == X86::COND_INVALID) return false;
+
+ // Synthesize the new opcode.
+ bool HasMemoryOperand = Instr.hasOneMemOperand();
+ unsigned NewOpc;
+ if (Instr.isBranch())
+ NewOpc = GetCondBranchFromCond(NewCC);
+ else if(OpcIsSET)
+ NewOpc = getSETFromCond(NewCC, HasMemoryOperand);
+ else {
+ unsigned DstReg = Instr.getOperand(0).getReg();
+ NewOpc = getCMovFromCond(NewCC, MRI->getRegClass(DstReg)->getSize(),
+ HasMemoryOperand);
+ }
+
+ // Push the MachineInstr to OpsToUpdate.
+ // If it is safe to remove CmpInstr, the condition code of these
+ // instructions will be modified.
+ OpsToUpdate.push_back(std::make_pair(&*I, NewOpc));
+ }
+ if (ModifyEFLAGS || Instr.killsRegister(X86::EFLAGS, TRI)) {
+ // It is safe to remove CmpInstr if EFLAGS is updated again or killed.
+ IsSafe = true;
+ break;
+ }
+ }
+
+ // If EFLAGS is not killed nor re-defined, we should check whether it is
+ // live-out. If it is live-out, do not optimize.
+ if ((IsCmpZero || IsSwapped) && !IsSafe) {
+ MachineBasicBlock *MBB = CmpInstr->getParent();
+ for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
+ SE = MBB->succ_end(); SI != SE; ++SI)
+ if ((*SI)->isLiveIn(X86::EFLAGS))
+ return false;
+ }
+
+ // The instruction to be updated is either Sub or MI.
+ Sub = IsCmpZero ? MI : Sub;
+ // Move Movr0Inst to the place right before Sub.
+ if (Movr0Inst) {
+ Sub->getParent()->remove(Movr0Inst);
+ Sub->getParent()->insert(MachineBasicBlock::iterator(Sub), Movr0Inst);
+ }
+
+ // Make sure Sub instruction defines EFLAGS and mark the def live.
+ unsigned LastOperand = Sub->getNumOperands() - 1;
+ assert(Sub->getNumOperands() >= 2 &&
+ Sub->getOperand(LastOperand).isReg() &&
+ Sub->getOperand(LastOperand).getReg() == X86::EFLAGS &&
+ "EFLAGS should be the last operand of SUB, ADD, OR, XOR, AND");
+ Sub->getOperand(LastOperand).setIsDef(true);
+ Sub->getOperand(LastOperand).setIsDead(false);
+ CmpInstr->eraseFromParent();
+
+ // Modify the condition code of instructions in OpsToUpdate.
+ for (unsigned i = 0, e = OpsToUpdate.size(); i < e; i++)
+ OpsToUpdate[i].first->setDesc(get(OpsToUpdate[i].second));
+ return true;
+}
+
+/// optimizeLoadInstr - Try to remove the load by folding it to a register
+/// operand at the use. We fold the load instructions if load defines a virtual
+/// register, the virtual register is used once in the same BB, and the
+/// instructions in-between do not load or store, and have no side effects.
+MachineInstr* X86InstrInfo::
+optimizeLoadInstr(MachineInstr *MI, const MachineRegisterInfo *MRI,
+ unsigned &FoldAsLoadDefReg,
+ MachineInstr *&DefMI) const {
+ if (FoldAsLoadDefReg == 0)
+ return 0;
+ // To be conservative, if there exists another load, clear the load candidate.
+ if (MI->mayLoad()) {
+ FoldAsLoadDefReg = 0;
+ return 0;
+ }
+
+ // Check whether we can move DefMI here.
+ DefMI = MRI->getVRegDef(FoldAsLoadDefReg);
+ assert(DefMI);
+ bool SawStore = false;
+ if (!DefMI->isSafeToMove(this, 0, SawStore))
+ return 0;
+
+ // We try to commute MI if possible.
+ unsigned IdxEnd = (MI->isCommutable()) ? 2 : 1;
+ for (unsigned Idx = 0; Idx < IdxEnd; Idx++) {
+ // Collect information about virtual register operands of MI.
+ unsigned SrcOperandId = 0;
+ bool FoundSrcOperand = false;
+ for (unsigned i = 0, e = MI->getDesc().getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI->getOperand(i);
+ if (!MO.isReg())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (Reg != FoldAsLoadDefReg)
+ continue;
+ // Do not fold if we have a subreg use or a def or multiple uses.
+ if (MO.getSubReg() || MO.isDef() || FoundSrcOperand)
+ return 0;
+
+ SrcOperandId = i;
+ FoundSrcOperand = true;
+ }
+ if (!FoundSrcOperand) return 0;
+
+ // Check whether we can fold the def into SrcOperandId.
+ SmallVector<unsigned, 8> Ops;
+ Ops.push_back(SrcOperandId);
+ MachineInstr *FoldMI = foldMemoryOperand(MI, Ops, DefMI);
+ if (FoldMI) {
+ FoldAsLoadDefReg = 0;
+ return FoldMI;
+ }
+
+ if (Idx == 1) {
+ // MI was changed but it didn't help, commute it back!
+ commuteInstruction(MI, false);
+ return 0;
+ }
+
+ // Check whether we can commute MI and enable folding.
+ if (MI->isCommutable()) {
+ MachineInstr *NewMI = commuteInstruction(MI, false);
+ // Unable to commute.
+ if (!NewMI) return 0;
+ if (NewMI != MI) {
+ // New instruction. It doesn't need to be kept.
+ NewMI->eraseFromParent();
+ return 0;
+ }
+ }
+ }
+ return 0;
+}
+
+/// Expand2AddrUndef - Expand a single-def pseudo instruction to a two-addr
+/// instruction with two undef reads of the register being defined. This is
+/// used for mapping:
+/// %xmm4 = V_SET0
+/// to:
+/// %xmm4 = PXORrr %xmm4<undef>, %xmm4<undef>
+///
+static bool Expand2AddrUndef(MachineInstrBuilder &MIB,
+ const MCInstrDesc &Desc) {
+ assert(Desc.getNumOperands() == 3 && "Expected two-addr instruction.");
+ unsigned Reg = MIB->getOperand(0).getReg();
+ MIB->setDesc(Desc);
+
+ // MachineInstr::addOperand() will insert explicit operands before any
+ // implicit operands.
+ MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef);
+ // But we don't trust that.
+ assert(MIB->getOperand(1).getReg() == Reg &&
+ MIB->getOperand(2).getReg() == Reg && "Misplaced operand");
+ return true;
+}
+
+bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
+ bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX();
+ MachineInstrBuilder MIB(*MI->getParent()->getParent(), MI);
+ switch (MI->getOpcode()) {
+ case X86::SETB_C8r:
+ return Expand2AddrUndef(MIB, get(X86::SBB8rr));
+ case X86::SETB_C16r:
+ return Expand2AddrUndef(MIB, get(X86::SBB16rr));
+ case X86::SETB_C32r:
+ return Expand2AddrUndef(MIB, get(X86::SBB32rr));
+ case X86::SETB_C64r:
+ return Expand2AddrUndef(MIB, get(X86::SBB64rr));
+ case X86::V_SET0:
+ case X86::FsFLD0SS:
+ case X86::FsFLD0SD:
+ return Expand2AddrUndef(MIB, get(HasAVX ? X86::VXORPSrr : X86::XORPSrr));
+ case X86::AVX_SET0:
+ assert(HasAVX && "AVX not supported");
+ return Expand2AddrUndef(MIB, get(X86::VXORPSYrr));
+ case X86::V_SETALLONES:
+ return Expand2AddrUndef(MIB, get(HasAVX ? X86::VPCMPEQDrr : X86::PCMPEQDrr));
+ case X86::AVX2_SETALLONES:
+ return Expand2AddrUndef(MIB, get(X86::VPCMPEQDYrr));
+ case X86::TEST8ri_NOREX:
+ MI->setDesc(get(X86::TEST8ri));
+ return true;
+ }
+ return false;
+}
+
+MachineInstr*
+X86InstrInfo::emitFrameIndexDebugValue(MachineFunction &MF,
+ int FrameIx, uint64_t Offset,
+ const MDNode *MDPtr,
+ DebugLoc DL) const {
+ X86AddressMode AM;
+ AM.BaseType = X86AddressMode::FrameIndexBase;
+ AM.Base.FrameIndex = FrameIx;
+ MachineInstrBuilder MIB = BuildMI(MF, DL, get(X86::DBG_VALUE));
+ addFullAddress(MIB, AM).addImm(Offset).addMetadata(MDPtr);
+ return &*MIB;
+}
+
+static MachineInstr *FuseTwoAddrInst(MachineFunction &MF, unsigned Opcode,
+ const SmallVectorImpl<MachineOperand> &MOs,
+ MachineInstr *MI,
+ const TargetInstrInfo &TII) {
+ // Create the base instruction with the memory operand as the first part.
+ // Omit the implicit operands, something BuildMI can't do.
+ MachineInstr *NewMI = MF.CreateMachineInstr(TII.get(Opcode),
+ MI->getDebugLoc(), true);
+ MachineInstrBuilder MIB(MF, NewMI);
+ unsigned NumAddrOps = MOs.size();
+ for (unsigned i = 0; i != NumAddrOps; ++i)
+ MIB.addOperand(MOs[i]);
+ if (NumAddrOps < 4) // FrameIndex only
+ addOffset(MIB, 0);
+
+ // Loop over the rest of the ri operands, converting them over.
+ unsigned NumOps = MI->getDesc().getNumOperands()-2;
+ for (unsigned i = 0; i != NumOps; ++i) {
+ MachineOperand &MO = MI->getOperand(i+2);
+ MIB.addOperand(MO);
+ }
+ for (unsigned i = NumOps+2, e = MI->getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI->getOperand(i);
+ MIB.addOperand(MO);
+ }
+ return MIB;
+}
+
+static MachineInstr *FuseInst(MachineFunction &MF,
+ unsigned Opcode, unsigned OpNo,
+ const SmallVectorImpl<MachineOperand> &MOs,
+ MachineInstr *MI, const TargetInstrInfo &TII) {
+ // Omit the implicit operands, something BuildMI can't do.
+ MachineInstr *NewMI = MF.CreateMachineInstr(TII.get(Opcode),
+ MI->getDebugLoc(), true);
+ MachineInstrBuilder MIB(MF, NewMI);
+
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI->getOperand(i);
+ if (i == OpNo) {
+ assert(MO.isReg() && "Expected to fold into reg operand!");
+ unsigned NumAddrOps = MOs.size();
+ for (unsigned i = 0; i != NumAddrOps; ++i)
+ MIB.addOperand(MOs[i]);
+ if (NumAddrOps < 4) // FrameIndex only
+ addOffset(MIB, 0);
+ } else {
+ MIB.addOperand(MO);
+ }
+ }
+ return MIB;
+}
+
+static MachineInstr *MakeM0Inst(const TargetInstrInfo &TII, unsigned Opcode,
+ const SmallVectorImpl<MachineOperand> &MOs,
+ MachineInstr *MI) {
+ MachineFunction &MF = *MI->getParent()->getParent();
+ MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), TII.get(Opcode));
+
+ unsigned NumAddrOps = MOs.size();
+ for (unsigned i = 0; i != NumAddrOps; ++i)
+ MIB.addOperand(MOs[i]);
+ if (NumAddrOps < 4) // FrameIndex only
+ addOffset(MIB, 0);
+ return MIB.addImm(0);
+}
+
+MachineInstr*
+X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
+ MachineInstr *MI, unsigned i,
+ const SmallVectorImpl<MachineOperand> &MOs,
+ unsigned Size, unsigned Align) const {
+ const DenseMap<unsigned, std::pair<unsigned,unsigned> > *OpcodeTablePtr = 0;
+ bool isCallRegIndirect = TM.getSubtarget<X86Subtarget>().callRegIndirect();
+ bool isTwoAddrFold = false;
+
+ // Atom favors register form of call. So, we do not fold loads into calls
+ // when X86Subtarget is Atom.
+ if (isCallRegIndirect &&
+ (MI->getOpcode() == X86::CALL32r || MI->getOpcode() == X86::CALL64r)) {
+ return NULL;
+ }
+
+ unsigned NumOps = MI->getDesc().getNumOperands();
+ bool isTwoAddr = NumOps > 1 &&
+ MI->getDesc().getOperandConstraint(1, MCOI::TIED_TO) != -1;
+
+ // FIXME: AsmPrinter doesn't know how to handle
+ // X86II::MO_GOT_ABSOLUTE_ADDRESS after folding.
+ if (MI->getOpcode() == X86::ADD32ri &&
+ MI->getOperand(2).getTargetFlags() == X86II::MO_GOT_ABSOLUTE_ADDRESS)
+ return NULL;
+
+ MachineInstr *NewMI = NULL;
+ // Folding a memory location into the two-address part of a two-address
+ // instruction is different than folding it other places. It requires
+ // replacing the *two* registers with the memory location.
+ if (isTwoAddr && NumOps >= 2 && i < 2 &&
+ MI->getOperand(0).isReg() &&
+ MI->getOperand(1).isReg() &&
+ MI->getOperand(0).getReg() == MI->getOperand(1).getReg()) {
+ OpcodeTablePtr = &RegOp2MemOpTable2Addr;
+ isTwoAddrFold = true;
+ } else if (i == 0) { // If operand 0
+ unsigned Opc = 0;
+ switch (MI->getOpcode()) {
+ default: break;
+ case X86::MOV64r0: Opc = X86::MOV64mi32; break;
+ case X86::MOV32r0: Opc = X86::MOV32mi; break;
+ case X86::MOV16r0: Opc = X86::MOV16mi; break;
+ case X86::MOV8r0: Opc = X86::MOV8mi; break;
+ }
+ if (Opc)
+ NewMI = MakeM0Inst(*this, Opc, MOs, MI);
+ if (NewMI)
+ return NewMI;
+
+ OpcodeTablePtr = &RegOp2MemOpTable0;
+ } else if (i == 1) {
+ OpcodeTablePtr = &RegOp2MemOpTable1;
+ } else if (i == 2) {
+ OpcodeTablePtr = &RegOp2MemOpTable2;
+ } else if (i == 3) {
+ OpcodeTablePtr = &RegOp2MemOpTable3;
+ }
+
+ // If table selected...
+ if (OpcodeTablePtr) {
+ // Find the Opcode to fuse
+ DenseMap<unsigned, std::pair<unsigned,unsigned> >::const_iterator I =
+ OpcodeTablePtr->find(MI->getOpcode());
+ if (I != OpcodeTablePtr->end()) {
+ unsigned Opcode = I->second.first;
+ unsigned MinAlign = (I->second.second & TB_ALIGN_MASK) >> TB_ALIGN_SHIFT;
+ if (Align < MinAlign)
+ return NULL;
+ bool NarrowToMOV32rm = false;
+ if (Size) {
+ unsigned RCSize = getRegClass(MI->getDesc(), i, &RI, MF)->getSize();
+ if (Size < RCSize) {
+ // Check if it's safe to fold the load. If the size of the object is
+ // narrower than the load width, then it's not.
+ if (Opcode != X86::MOV64rm || RCSize != 8 || Size != 4)
+ return NULL;
+ // If this is a 64-bit load, but the spill slot is 32, then we can do
+ // a 32-bit load which is implicitly zero-extended. This likely is due
+ // to liveintervalanalysis remat'ing a load from stack slot.
+ if (MI->getOperand(0).getSubReg() || MI->getOperand(1).getSubReg())
+ return NULL;
+ Opcode = X86::MOV32rm;
+ NarrowToMOV32rm = true;
+ }
+ }
+
+ if (isTwoAddrFold)
+ NewMI = FuseTwoAddrInst(MF, Opcode, MOs, MI, *this);
+ else
+ NewMI = FuseInst(MF, Opcode, i, MOs, MI, *this);
+
+ if (NarrowToMOV32rm) {
+ // If this is the special case where we use a MOV32rm to load a 32-bit
+ // value and zero-extend the top bits. Change the destination register
+ // to a 32-bit one.
+ unsigned DstReg = NewMI->getOperand(0).getReg();
+ if (TargetRegisterInfo::isPhysicalRegister(DstReg))
+ NewMI->getOperand(0).setReg(RI.getSubReg(DstReg,
+ X86::sub_32bit));
+ else
+ NewMI->getOperand(0).setSubReg(X86::sub_32bit);
+ }
+ return NewMI;
+ }
+ }
+
+ // No fusion
+ if (PrintFailedFusing && !MI->isCopy())
+ dbgs() << "We failed to fuse operand " << i << " in " << *MI;
+ return NULL;
+}
+
+/// hasPartialRegUpdate - Return true for all instructions that only update
+/// the first 32 or 64-bits of the destination register and leave the rest
+/// unmodified. This can be used to avoid folding loads if the instructions
+/// only update part of the destination register, and the non-updated part is
+/// not needed. e.g. cvtss2sd, sqrtss. Unfolding the load from these
+/// instructions breaks the partial register dependency and it can improve
+/// performance. e.g.:
+///
+/// movss (%rdi), %xmm0
+/// cvtss2sd %xmm0, %xmm0
+///
+/// Instead of
+/// cvtss2sd (%rdi), %xmm0
+///
+/// FIXME: This should be turned into a TSFlags.
+///
+static bool hasPartialRegUpdate(unsigned Opcode) {
+ switch (Opcode) {
+ case X86::CVTSI2SSrr:
+ case X86::CVTSI2SS64rr:
+ case X86::CVTSI2SDrr:
+ case X86::CVTSI2SD64rr:
+ case X86::CVTSD2SSrr:
+ case X86::Int_CVTSD2SSrr:
+ case X86::CVTSS2SDrr:
+ case X86::Int_CVTSS2SDrr:
+ case X86::RCPSSr:
+ case X86::RCPSSr_Int:
+ case X86::ROUNDSDr:
+ case X86::ROUNDSDr_Int:
+ case X86::ROUNDSSr:
+ case X86::ROUNDSSr_Int:
+ case X86::RSQRTSSr:
+ case X86::RSQRTSSr_Int:
+ case X86::SQRTSSr:
+ case X86::SQRTSSr_Int:
+ // AVX encoded versions
+ case X86::VCVTSD2SSrr:
+ case X86::Int_VCVTSD2SSrr:
+ case X86::VCVTSS2SDrr:
+ case X86::Int_VCVTSS2SDrr:
+ case X86::VRCPSSr:
+ case X86::VROUNDSDr:
+ case X86::VROUNDSDr_Int:
+ case X86::VROUNDSSr:
+ case X86::VROUNDSSr_Int:
+ case X86::VRSQRTSSr:
+ case X86::VSQRTSSr:
+ return true;
+ }
+
+ return false;
+}
+
+/// getPartialRegUpdateClearance - Inform the ExeDepsFix pass how many idle
+/// instructions we would like before a partial register update.
+unsigned X86InstrInfo::
+getPartialRegUpdateClearance(const MachineInstr *MI, unsigned OpNum,
+ const TargetRegisterInfo *TRI) const {
+ if (OpNum != 0 || !hasPartialRegUpdate(MI->getOpcode()))
+ return 0;
+
+ // If MI is marked as reading Reg, the partial register update is wanted.
+ const MachineOperand &MO = MI->getOperand(0);
+ unsigned Reg = MO.getReg();
+ if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ if (MO.readsReg() || MI->readsVirtualRegister(Reg))
+ return 0;
+ } else {
+ if (MI->readsRegister(Reg, TRI))
+ return 0;
+ }
+
+ // If any of the preceding 16 instructions are reading Reg, insert a
+ // dependency breaking instruction. The magic number is based on a few
+ // Nehalem experiments.
+ return 16;
+}
+
+void X86InstrInfo::
+breakPartialRegDependency(MachineBasicBlock::iterator MI, unsigned OpNum,
+ const TargetRegisterInfo *TRI) const {
+ unsigned Reg = MI->getOperand(OpNum).getReg();
+ if (X86::VR128RegClass.contains(Reg)) {
+ // These instructions are all floating point domain, so xorps is the best
+ // choice.
+ bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX();
+ unsigned Opc = HasAVX ? X86::VXORPSrr : X86::XORPSrr;
+ BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(Opc), Reg)
+ .addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef);
+ } else if (X86::VR256RegClass.contains(Reg)) {
+ // Use vxorps to clear the full ymm register.
+ // It wants to read and write the xmm sub-register.
+ unsigned XReg = TRI->getSubReg(Reg, X86::sub_xmm);
+ BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(X86::VXORPSrr), XReg)
+ .addReg(XReg, RegState::Undef).addReg(XReg, RegState::Undef)
+ .addReg(Reg, RegState::ImplicitDefine);
+ } else
+ return;
+ MI->addRegisterKilled(Reg, TRI, true);
+}
+
+MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
+ MachineInstr *MI,
+ const SmallVectorImpl<unsigned> &Ops,
+ int FrameIndex) const {
+ // Check switch flag
+ if (NoFusing) return NULL;
+
+ // Unless optimizing for size, don't fold to avoid partial
+ // register update stalls
+ if (!MF.getFunction()->getAttributes().
+ hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize) &&
+ hasPartialRegUpdate(MI->getOpcode()))
+ return 0;
+
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ unsigned Size = MFI->getObjectSize(FrameIndex);
+ unsigned Alignment = MFI->getObjectAlignment(FrameIndex);
+ if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
+ unsigned NewOpc = 0;
+ unsigned RCSize = 0;
+ switch (MI->getOpcode()) {
+ default: return NULL;
+ case X86::TEST8rr: NewOpc = X86::CMP8ri; RCSize = 1; break;
+ case X86::TEST16rr: NewOpc = X86::CMP16ri8; RCSize = 2; break;
+ case X86::TEST32rr: NewOpc = X86::CMP32ri8; RCSize = 4; break;
+ case X86::TEST64rr: NewOpc = X86::CMP64ri8; RCSize = 8; break;
+ }
+ // Check if it's safe to fold the load. If the size of the object is
+ // narrower than the load width, then it's not.
+ if (Size < RCSize)
+ return NULL;
+ // Change to CMPXXri r, 0 first.
+ MI->setDesc(get(NewOpc));
+ MI->getOperand(1).ChangeToImmediate(0);
+ } else if (Ops.size() != 1)
+ return NULL;
+
+ SmallVector<MachineOperand,4> MOs;
+ MOs.push_back(MachineOperand::CreateFI(FrameIndex));
+ return foldMemoryOperandImpl(MF, MI, Ops[0], MOs, Size, Alignment);
+}
+
+MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
+ MachineInstr *MI,
+ const SmallVectorImpl<unsigned> &Ops,
+ MachineInstr *LoadMI) const {
+ // Check switch flag
+ if (NoFusing) return NULL;
+
+ // Unless optimizing for size, don't fold to avoid partial
+ // register update stalls
+ if (!MF.getFunction()->getAttributes().
+ hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize) &&
+ hasPartialRegUpdate(MI->getOpcode()))
+ return 0;
+
+ // Determine the alignment of the load.
+ unsigned Alignment = 0;
+ if (LoadMI->hasOneMemOperand())
+ Alignment = (*LoadMI->memoperands_begin())->getAlignment();
+ else
+ switch (LoadMI->getOpcode()) {
+ case X86::AVX2_SETALLONES:
+ case X86::AVX_SET0:
+ Alignment = 32;
+ break;
+ case X86::V_SET0:
+ case X86::V_SETALLONES:
+ Alignment = 16;
+ break;
+ case X86::FsFLD0SD:
+ Alignment = 8;
+ break;
+ case X86::FsFLD0SS:
+ Alignment = 4;
+ break;
+ default:
+ return 0;
+ }
+ if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
+ unsigned NewOpc = 0;
+ switch (MI->getOpcode()) {
+ default: return NULL;
+ case X86::TEST8rr: NewOpc = X86::CMP8ri; break;
+ case X86::TEST16rr: NewOpc = X86::CMP16ri8; break;
+ case X86::TEST32rr: NewOpc = X86::CMP32ri8; break;
+ case X86::TEST64rr: NewOpc = X86::CMP64ri8; break;
+ }
+ // Change to CMPXXri r, 0 first.
+ MI->setDesc(get(NewOpc));
+ MI->getOperand(1).ChangeToImmediate(0);
+ } else if (Ops.size() != 1)
+ return NULL;
+
+ // Make sure the subregisters match.
+ // Otherwise we risk changing the size of the load.
+ if (LoadMI->getOperand(0).getSubReg() != MI->getOperand(Ops[0]).getSubReg())
+ return NULL;
+
+ SmallVector<MachineOperand,X86::AddrNumOperands> MOs;
+ switch (LoadMI->getOpcode()) {
+ case X86::V_SET0:
+ case X86::V_SETALLONES:
+ case X86::AVX2_SETALLONES:
+ case X86::AVX_SET0:
+ case X86::FsFLD0SD:
+ case X86::FsFLD0SS: {
+ // Folding a V_SET0 or V_SETALLONES as a load, to ease register pressure.
+ // Create a constant-pool entry and operands to load from it.
+
+ // Medium and large mode can't fold loads this way.
+ if (TM.getCodeModel() != CodeModel::Small &&
+ TM.getCodeModel() != CodeModel::Kernel)
+ return NULL;
+
+ // x86-32 PIC requires a PIC base register for constant pools.
+ unsigned PICBase = 0;
+ if (TM.getRelocationModel() == Reloc::PIC_) {
+ if (TM.getSubtarget<X86Subtarget>().is64Bit())
+ PICBase = X86::RIP;
+ else
+ // FIXME: PICBase = getGlobalBaseReg(&MF);
+ // This doesn't work for several reasons.
+ // 1. GlobalBaseReg may have been spilled.
+ // 2. It may not be live at MI.
+ return NULL;
+ }
+
+ // Create a constant-pool entry.
+ MachineConstantPool &MCP = *MF.getConstantPool();
+ Type *Ty;
+ unsigned Opc = LoadMI->getOpcode();
+ if (Opc == X86::FsFLD0SS)
+ Ty = Type::getFloatTy(MF.getFunction()->getContext());
+ else if (Opc == X86::FsFLD0SD)
+ Ty = Type::getDoubleTy(MF.getFunction()->getContext());
+ else if (Opc == X86::AVX2_SETALLONES || Opc == X86::AVX_SET0)
+ Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 8);
+ else
+ Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 4);
+
+ bool IsAllOnes = (Opc == X86::V_SETALLONES || Opc == X86::AVX2_SETALLONES);
+ const Constant *C = IsAllOnes ? Constant::getAllOnesValue(Ty) :
+ Constant::getNullValue(Ty);
+ unsigned CPI = MCP.getConstantPoolIndex(C, Alignment);
+
+ // Create operands to load from the constant pool entry.
+ MOs.push_back(MachineOperand::CreateReg(PICBase, false));
+ MOs.push_back(MachineOperand::CreateImm(1));
+ MOs.push_back(MachineOperand::CreateReg(0, false));
+ MOs.push_back(MachineOperand::CreateCPI(CPI, 0));
+ MOs.push_back(MachineOperand::CreateReg(0, false));
+ break;
+ }
+ default: {
+ if ((LoadMI->getOpcode() == X86::MOVSSrm ||
+ LoadMI->getOpcode() == X86::VMOVSSrm) &&
+ MF.getRegInfo().getRegClass(LoadMI->getOperand(0).getReg())->getSize()
+ > 4)
+ // These instructions only load 32 bits, we can't fold them if the
+ // destination register is wider than 32 bits (4 bytes).
+ return NULL;
+ if ((LoadMI->getOpcode() == X86::MOVSDrm ||
+ LoadMI->getOpcode() == X86::VMOVSDrm) &&
+ MF.getRegInfo().getRegClass(LoadMI->getOperand(0).getReg())->getSize()
+ > 8)
+ // These instructions only load 64 bits, we can't fold them if the
+ // destination register is wider than 64 bits (8 bytes).
+ return NULL;
+
+ // Folding a normal load. Just copy the load's address operands.
+ unsigned NumOps = LoadMI->getDesc().getNumOperands();
+ for (unsigned i = NumOps - X86::AddrNumOperands; i != NumOps; ++i)
+ MOs.push_back(LoadMI->getOperand(i));
+ break;
+ }
+ }
+ return foldMemoryOperandImpl(MF, MI, Ops[0], MOs, 0, Alignment);
+}
+
+
+bool X86InstrInfo::canFoldMemoryOperand(const MachineInstr *MI,
+ const SmallVectorImpl<unsigned> &Ops) const {
+ // Check switch flag
+ if (NoFusing) return 0;
+
+ if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
+ switch (MI->getOpcode()) {
+ default: return false;
+ case X86::TEST8rr:
+ case X86::TEST16rr:
+ case X86::TEST32rr:
+ case X86::TEST64rr:
+ return true;
+ case X86::ADD32ri:
+ // FIXME: AsmPrinter doesn't know how to handle
+ // X86II::MO_GOT_ABSOLUTE_ADDRESS after folding.
+ if (MI->getOperand(2).getTargetFlags() == X86II::MO_GOT_ABSOLUTE_ADDRESS)
+ return false;
+ break;
+ }
+ }
+
+ if (Ops.size() != 1)
+ return false;
+
+ unsigned OpNum = Ops[0];
+ unsigned Opc = MI->getOpcode();
+ unsigned NumOps = MI->getDesc().getNumOperands();
+ bool isTwoAddr = NumOps > 1 &&
+ MI->getDesc().getOperandConstraint(1, MCOI::TIED_TO) != -1;
+
+ // Folding a memory location into the two-address part of a two-address
+ // instruction is different than folding it other places. It requires
+ // replacing the *two* registers with the memory location.
+ const DenseMap<unsigned, std::pair<unsigned,unsigned> > *OpcodeTablePtr = 0;
+ if (isTwoAddr && NumOps >= 2 && OpNum < 2) {
+ OpcodeTablePtr = &RegOp2MemOpTable2Addr;
+ } else if (OpNum == 0) { // If operand 0
+ switch (Opc) {
+ case X86::MOV8r0:
+ case X86::MOV16r0:
+ case X86::MOV32r0:
+ case X86::MOV64r0: return true;
+ default: break;
+ }
+ OpcodeTablePtr = &RegOp2MemOpTable0;
+ } else if (OpNum == 1) {
+ OpcodeTablePtr = &RegOp2MemOpTable1;
+ } else if (OpNum == 2) {
+ OpcodeTablePtr = &RegOp2MemOpTable2;
+ } else if (OpNum == 3) {
+ OpcodeTablePtr = &RegOp2MemOpTable3;
+ }
+
+ if (OpcodeTablePtr && OpcodeTablePtr->count(Opc))
+ return true;
+ return TargetInstrInfo::canFoldMemoryOperand(MI, Ops);
+}
+
+bool X86InstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
+ unsigned Reg, bool UnfoldLoad, bool UnfoldStore,
+ SmallVectorImpl<MachineInstr*> &NewMIs) const {
+ DenseMap<unsigned, std::pair<unsigned,unsigned> >::const_iterator I =
+ MemOp2RegOpTable.find(MI->getOpcode());
+ if (I == MemOp2RegOpTable.end())
+ return false;
+ unsigned Opc = I->second.first;
+ unsigned Index = I->second.second & TB_INDEX_MASK;
+ bool FoldedLoad = I->second.second & TB_FOLDED_LOAD;
+ bool FoldedStore = I->second.second & TB_FOLDED_STORE;
+ if (UnfoldLoad && !FoldedLoad)
+ return false;
+ UnfoldLoad &= FoldedLoad;
+ if (UnfoldStore && !FoldedStore)
+ return false;
+ UnfoldStore &= FoldedStore;
+
+ const MCInstrDesc &MCID = get(Opc);
+ const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI, MF);
+ if (!MI->hasOneMemOperand() &&
+ RC == &X86::VR128RegClass &&
+ !TM.getSubtarget<X86Subtarget>().isUnalignedMemAccessFast())
+ // Without memoperands, loadRegFromAddr and storeRegToStackSlot will
+ // conservatively assume the address is unaligned. That's bad for
+ // performance.
+ return false;
+ SmallVector<MachineOperand, X86::AddrNumOperands> AddrOps;
+ SmallVector<MachineOperand,2> BeforeOps;
+ SmallVector<MachineOperand,2> AfterOps;
+ SmallVector<MachineOperand,4> ImpOps;
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ MachineOperand &Op = MI->getOperand(i);
+ if (i >= Index && i < Index + X86::AddrNumOperands)
+ AddrOps.push_back(Op);
+ else if (Op.isReg() && Op.isImplicit())
+ ImpOps.push_back(Op);
+ else if (i < Index)
+ BeforeOps.push_back(Op);
+ else if (i > Index)
+ AfterOps.push_back(Op);
+ }
+
+ // Emit the load instruction.
+ if (UnfoldLoad) {
+ std::pair<MachineInstr::mmo_iterator,
+ MachineInstr::mmo_iterator> MMOs =
+ MF.extractLoadMemRefs(MI->memoperands_begin(),
+ MI->memoperands_end());
+ loadRegFromAddr(MF, Reg, AddrOps, RC, MMOs.first, MMOs.second, NewMIs);
+ if (UnfoldStore) {
+ // Address operands cannot be marked isKill.
+ for (unsigned i = 1; i != 1 + X86::AddrNumOperands; ++i) {
+ MachineOperand &MO = NewMIs[0]->getOperand(i);
+ if (MO.isReg())
+ MO.setIsKill(false);
+ }
+ }
+ }
+
+ // Emit the data processing instruction.
+ MachineInstr *DataMI = MF.CreateMachineInstr(MCID, MI->getDebugLoc(), true);
+ MachineInstrBuilder MIB(MF, DataMI);
+
+ if (FoldedStore)
+ MIB.addReg(Reg, RegState::Define);
+ for (unsigned i = 0, e = BeforeOps.size(); i != e; ++i)
+ MIB.addOperand(BeforeOps[i]);
+ if (FoldedLoad)
+ MIB.addReg(Reg);
+ for (unsigned i = 0, e = AfterOps.size(); i != e; ++i)
+ MIB.addOperand(AfterOps[i]);
+ for (unsigned i = 0, e = ImpOps.size(); i != e; ++i) {
+ MachineOperand &MO = ImpOps[i];
+ MIB.addReg(MO.getReg(),
+ getDefRegState(MO.isDef()) |
+ RegState::Implicit |
+ getKillRegState(MO.isKill()) |
+ getDeadRegState(MO.isDead()) |
+ getUndefRegState(MO.isUndef()));
+ }
+ // Change CMP32ri r, 0 back to TEST32rr r, r, etc.
+ switch (DataMI->getOpcode()) {
+ default: break;
+ case X86::CMP64ri32:
+ case X86::CMP64ri8:
+ case X86::CMP32ri:
+ case X86::CMP32ri8:
+ case X86::CMP16ri:
+ case X86::CMP16ri8:
+ case X86::CMP8ri: {
+ MachineOperand &MO0 = DataMI->getOperand(0);
+ MachineOperand &MO1 = DataMI->getOperand(1);
+ if (MO1.getImm() == 0) {
+ unsigned NewOpc;
+ switch (DataMI->getOpcode()) {
+ default: llvm_unreachable("Unreachable!");
+ case X86::CMP64ri8:
+ case X86::CMP64ri32: NewOpc = X86::TEST64rr; break;
+ case X86::CMP32ri8:
+ case X86::CMP32ri: NewOpc = X86::TEST32rr; break;
+ case X86::CMP16ri8:
+ case X86::CMP16ri: NewOpc = X86::TEST16rr; break;
+ case X86::CMP8ri: NewOpc = X86::TEST8rr; break;
+ }
+ DataMI->setDesc(get(NewOpc));
+ MO1.ChangeToRegister(MO0.getReg(), false);
+ }
+ }
+ }
+ NewMIs.push_back(DataMI);
+
+ // Emit the store instruction.
+ if (UnfoldStore) {
+ const TargetRegisterClass *DstRC = getRegClass(MCID, 0, &RI, MF);
+ std::pair<MachineInstr::mmo_iterator,
+ MachineInstr::mmo_iterator> MMOs =
+ MF.extractStoreMemRefs(MI->memoperands_begin(),
+ MI->memoperands_end());
+ storeRegToAddr(MF, Reg, true, AddrOps, DstRC, MMOs.first, MMOs.second, NewMIs);
+ }
+
+ return true;
+}
+
+bool
+X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
+ SmallVectorImpl<SDNode*> &NewNodes) const {
+ if (!N->isMachineOpcode())
+ return false;
+
+ DenseMap<unsigned, std::pair<unsigned,unsigned> >::const_iterator I =
+ MemOp2RegOpTable.find(N->getMachineOpcode());
+ if (I == MemOp2RegOpTable.end())
+ return false;
+ unsigned Opc = I->second.first;
+ unsigned Index = I->second.second & TB_INDEX_MASK;
+ bool FoldedLoad = I->second.second & TB_FOLDED_LOAD;
+ bool FoldedStore = I->second.second & TB_FOLDED_STORE;
+ const MCInstrDesc &MCID = get(Opc);
+ MachineFunction &MF = DAG.getMachineFunction();
+ const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI, MF);
+ unsigned NumDefs = MCID.NumDefs;
+ std::vector<SDValue> AddrOps;
+ std::vector<SDValue> BeforeOps;
+ std::vector<SDValue> AfterOps;
+ DebugLoc dl = N->getDebugLoc();
+ unsigned NumOps = N->getNumOperands();
+ for (unsigned i = 0; i != NumOps-1; ++i) {
+ SDValue Op = N->getOperand(i);
+ if (i >= Index-NumDefs && i < Index-NumDefs + X86::AddrNumOperands)
+ AddrOps.push_back(Op);
+ else if (i < Index-NumDefs)
+ BeforeOps.push_back(Op);
+ else if (i > Index-NumDefs)
+ AfterOps.push_back(Op);
+ }
+ SDValue Chain = N->getOperand(NumOps-1);
+ AddrOps.push_back(Chain);
+
+ // Emit the load instruction.
+ SDNode *Load = 0;
+ if (FoldedLoad) {
+ EVT VT = *RC->vt_begin();
+ std::pair<MachineInstr::mmo_iterator,
+ MachineInstr::mmo_iterator> MMOs =
+ MF.extractLoadMemRefs(cast<MachineSDNode>(N)->memoperands_begin(),
+ cast<MachineSDNode>(N)->memoperands_end());
+ if (!(*MMOs.first) &&
+ RC == &X86::VR128RegClass &&
+ !TM.getSubtarget<X86Subtarget>().isUnalignedMemAccessFast())
+ // Do not introduce a slow unaligned load.
+ return false;
+ unsigned Alignment = RC->getSize() == 32 ? 32 : 16;
+ bool isAligned = (*MMOs.first) &&
+ (*MMOs.first)->getAlignment() >= Alignment;
+ Load = DAG.getMachineNode(getLoadRegOpcode(0, RC, isAligned, TM), dl,
+ VT, MVT::Other, AddrOps);
+ NewNodes.push_back(Load);
+
+ // Preserve memory reference information.
+ cast<MachineSDNode>(Load)->setMemRefs(MMOs.first, MMOs.second);
+ }
+
+ // Emit the data processing instruction.
+ std::vector<EVT> VTs;
+ const TargetRegisterClass *DstRC = 0;
+ if (MCID.getNumDefs() > 0) {
+ DstRC = getRegClass(MCID, 0, &RI, MF);
+ VTs.push_back(*DstRC->vt_begin());
+ }
+ for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) {
+ EVT VT = N->getValueType(i);
+ if (VT != MVT::Other && i >= (unsigned)MCID.getNumDefs())
+ VTs.push_back(VT);
+ }
+ if (Load)
+ BeforeOps.push_back(SDValue(Load, 0));
+ std::copy(AfterOps.begin(), AfterOps.end(), std::back_inserter(BeforeOps));
+ SDNode *NewNode= DAG.getMachineNode(Opc, dl, VTs, BeforeOps);
+ NewNodes.push_back(NewNode);
+
+ // Emit the store instruction.
+ if (FoldedStore) {
+ AddrOps.pop_back();
+ AddrOps.push_back(SDValue(NewNode, 0));
+ AddrOps.push_back(Chain);
+ std::pair<MachineInstr::mmo_iterator,
+ MachineInstr::mmo_iterator> MMOs =
+ MF.extractStoreMemRefs(cast<MachineSDNode>(N)->memoperands_begin(),
+ cast<MachineSDNode>(N)->memoperands_end());
+ if (!(*MMOs.first) &&
+ RC == &X86::VR128RegClass &&
+ !TM.getSubtarget<X86Subtarget>().isUnalignedMemAccessFast())
+ // Do not introduce a slow unaligned store.
+ return false;
+ unsigned Alignment = RC->getSize() == 32 ? 32 : 16;
+ bool isAligned = (*MMOs.first) &&
+ (*MMOs.first)->getAlignment() >= Alignment;
+ SDNode *Store = DAG.getMachineNode(getStoreRegOpcode(0, DstRC,
+ isAligned, TM),
+ dl, MVT::Other, AddrOps);
+ NewNodes.push_back(Store);
+
+ // Preserve memory reference information.
+ cast<MachineSDNode>(Load)->setMemRefs(MMOs.first, MMOs.second);
+ }
+
+ return true;
+}
+
+unsigned X86InstrInfo::getOpcodeAfterMemoryUnfold(unsigned Opc,
+ bool UnfoldLoad, bool UnfoldStore,
+ unsigned *LoadRegIndex) const {
+ DenseMap<unsigned, std::pair<unsigned,unsigned> >::const_iterator I =
+ MemOp2RegOpTable.find(Opc);
+ if (I == MemOp2RegOpTable.end())
+ return 0;
+ bool FoldedLoad = I->second.second & TB_FOLDED_LOAD;
+ bool FoldedStore = I->second.second & TB_FOLDED_STORE;
+ if (UnfoldLoad && !FoldedLoad)
+ return 0;
+ if (UnfoldStore && !FoldedStore)
+ return 0;
+ if (LoadRegIndex)
+ *LoadRegIndex = I->second.second & TB_INDEX_MASK;
+ return I->second.first;
+}
+
+bool
+X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
+ int64_t &Offset1, int64_t &Offset2) const {
+ if (!Load1->isMachineOpcode() || !Load2->isMachineOpcode())
+ return false;
+ unsigned Opc1 = Load1->getMachineOpcode();
+ unsigned Opc2 = Load2->getMachineOpcode();
+ switch (Opc1) {
+ default: return false;
+ case X86::MOV8rm:
+ case X86::MOV16rm:
+ case X86::MOV32rm:
+ case X86::MOV64rm:
+ case X86::LD_Fp32m:
+ case X86::LD_Fp64m:
+ case X86::LD_Fp80m:
+ case X86::MOVSSrm:
+ case X86::MOVSDrm:
+ case X86::MMX_MOVD64rm:
+ case X86::MMX_MOVQ64rm:
+ case X86::FsMOVAPSrm:
+ case X86::FsMOVAPDrm:
+ case X86::MOVAPSrm:
+ case X86::MOVUPSrm:
+ case X86::MOVAPDrm:
+ case X86::MOVDQArm:
+ case X86::MOVDQUrm:
+ // AVX load instructions
+ case X86::VMOVSSrm:
+ case X86::VMOVSDrm:
+ case X86::FsVMOVAPSrm:
+ case X86::FsVMOVAPDrm:
+ case X86::VMOVAPSrm:
+ case X86::VMOVUPSrm:
+ case X86::VMOVAPDrm:
+ case X86::VMOVDQArm:
+ case X86::VMOVDQUrm:
+ case X86::VMOVAPSYrm:
+ case X86::VMOVUPSYrm:
+ case X86::VMOVAPDYrm:
+ case X86::VMOVDQAYrm:
+ case X86::VMOVDQUYrm:
+ break;
+ }
+ switch (Opc2) {
+ default: return false;
+ case X86::MOV8rm:
+ case X86::MOV16rm:
+ case X86::MOV32rm:
+ case X86::MOV64rm:
+ case X86::LD_Fp32m:
+ case X86::LD_Fp64m:
+ case X86::LD_Fp80m:
+ case X86::MOVSSrm:
+ case X86::MOVSDrm:
+ case X86::MMX_MOVD64rm:
+ case X86::MMX_MOVQ64rm:
+ case X86::FsMOVAPSrm:
+ case X86::FsMOVAPDrm:
+ case X86::MOVAPSrm:
+ case X86::MOVUPSrm:
+ case X86::MOVAPDrm:
+ case X86::MOVDQArm:
+ case X86::MOVDQUrm:
+ // AVX load instructions
+ case X86::VMOVSSrm:
+ case X86::VMOVSDrm:
+ case X86::FsVMOVAPSrm:
+ case X86::FsVMOVAPDrm:
+ case X86::VMOVAPSrm:
+ case X86::VMOVUPSrm:
+ case X86::VMOVAPDrm:
+ case X86::VMOVDQArm:
+ case X86::VMOVDQUrm:
+ case X86::VMOVAPSYrm:
+ case X86::VMOVUPSYrm:
+ case X86::VMOVAPDYrm:
+ case X86::VMOVDQAYrm:
+ case X86::VMOVDQUYrm:
+ break;
+ }
+
+ // Check if chain operands and base addresses match.
+ if (Load1->getOperand(0) != Load2->getOperand(0) ||
+ Load1->getOperand(5) != Load2->getOperand(5))
+ return false;
+ // Segment operands should match as well.
+ if (Load1->getOperand(4) != Load2->getOperand(4))
+ return false;
+ // Scale should be 1, Index should be Reg0.
+ if (Load1->getOperand(1) == Load2->getOperand(1) &&
+ Load1->getOperand(2) == Load2->getOperand(2)) {
+ if (cast<ConstantSDNode>(Load1->getOperand(1))->getZExtValue() != 1)
+ return false;
+
+ // Now let's examine the displacements.
+ if (isa<ConstantSDNode>(Load1->getOperand(3)) &&
+ isa<ConstantSDNode>(Load2->getOperand(3))) {
+ Offset1 = cast<ConstantSDNode>(Load1->getOperand(3))->getSExtValue();
+ Offset2 = cast<ConstantSDNode>(Load2->getOperand(3))->getSExtValue();
+ return true;
+ }
+ }
+ return false;
+}
+
+bool X86InstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
+ int64_t Offset1, int64_t Offset2,
+ unsigned NumLoads) const {
+ assert(Offset2 > Offset1);
+ if ((Offset2 - Offset1) / 8 > 64)
+ return false;
+
+ unsigned Opc1 = Load1->getMachineOpcode();
+ unsigned Opc2 = Load2->getMachineOpcode();
+ if (Opc1 != Opc2)
+ return false; // FIXME: overly conservative?
+
+ switch (Opc1) {
+ default: break;
+ case X86::LD_Fp32m:
+ case X86::LD_Fp64m:
+ case X86::LD_Fp80m:
+ case X86::MMX_MOVD64rm:
+ case X86::MMX_MOVQ64rm:
+ return false;
+ }
+
+ EVT VT = Load1->getValueType(0);
+ switch (VT.getSimpleVT().SimpleTy) {
+ default:
+ // XMM registers. In 64-bit mode we can be a bit more aggressive since we
+ // have 16 of them to play with.
+ if (TM.getSubtargetImpl()->is64Bit()) {
+ if (NumLoads >= 3)
+ return false;
+ } else if (NumLoads) {
+ return false;
+ }
+ break;
+ case MVT::i8:
+ case MVT::i16:
+ case MVT::i32:
+ case MVT::i64:
+ case MVT::f32:
+ case MVT::f64:
+ if (NumLoads)
+ return false;
+ break;
+ }
+
+ return true;
+}
+
+
+bool X86InstrInfo::
+ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
+ assert(Cond.size() == 1 && "Invalid X86 branch condition!");
+ X86::CondCode CC = static_cast<X86::CondCode>(Cond[0].getImm());
+ if (CC == X86::COND_NE_OR_P || CC == X86::COND_NP_OR_E)
+ return true;
+ Cond[0].setImm(GetOppositeBranchCondition(CC));
+ return false;
+}
+
+bool X86InstrInfo::
+isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const {
+ // FIXME: Return false for x87 stack register classes for now. We can't
+ // allow any loads of these registers before FpGet_ST0_80.
+ return !(RC == &X86::CCRRegClass || RC == &X86::RFP32RegClass ||
+ RC == &X86::RFP64RegClass || RC == &X86::RFP80RegClass);
+}
+
+/// getGlobalBaseReg - Return a virtual register initialized with the
+/// the global base register value. Output instructions required to
+/// initialize the register in the function entry block, if necessary.
+///
+/// TODO: Eliminate this and move the code to X86MachineFunctionInfo.
+///
+unsigned X86InstrInfo::getGlobalBaseReg(MachineFunction *MF) const {
+ assert(!TM.getSubtarget<X86Subtarget>().is64Bit() &&
+ "X86-64 PIC uses RIP relative addressing");
+
+ X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
+ unsigned GlobalBaseReg = X86FI->getGlobalBaseReg();
+ if (GlobalBaseReg != 0)
+ return GlobalBaseReg;
+
+ // Create the register. The code to initialize it is inserted
+ // later, by the CGBR pass (below).
+ MachineRegisterInfo &RegInfo = MF->getRegInfo();
+ GlobalBaseReg = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
+ X86FI->setGlobalBaseReg(GlobalBaseReg);
+ return GlobalBaseReg;
+}
+
+// These are the replaceable SSE instructions. Some of these have Int variants
+// that we don't include here. We don't want to replace instructions selected
+// by intrinsics.
+static const uint16_t ReplaceableInstrs[][3] = {
+ //PackedSingle PackedDouble PackedInt
+ { X86::MOVAPSmr, X86::MOVAPDmr, X86::MOVDQAmr },
+ { X86::MOVAPSrm, X86::MOVAPDrm, X86::MOVDQArm },
+ { X86::MOVAPSrr, X86::MOVAPDrr, X86::MOVDQArr },
+ { X86::MOVUPSmr, X86::MOVUPDmr, X86::MOVDQUmr },
+ { X86::MOVUPSrm, X86::MOVUPDrm, X86::MOVDQUrm },
+ { X86::MOVNTPSmr, X86::MOVNTPDmr, X86::MOVNTDQmr },
+ { X86::ANDNPSrm, X86::ANDNPDrm, X86::PANDNrm },
+ { X86::ANDNPSrr, X86::ANDNPDrr, X86::PANDNrr },
+ { X86::ANDPSrm, X86::ANDPDrm, X86::PANDrm },
+ { X86::ANDPSrr, X86::ANDPDrr, X86::PANDrr },
+ { X86::ORPSrm, X86::ORPDrm, X86::PORrm },
+ { X86::ORPSrr, X86::ORPDrr, X86::PORrr },
+ { X86::XORPSrm, X86::XORPDrm, X86::PXORrm },
+ { X86::XORPSrr, X86::XORPDrr, X86::PXORrr },
+ // AVX 128-bit support
+ { X86::VMOVAPSmr, X86::VMOVAPDmr, X86::VMOVDQAmr },
+ { X86::VMOVAPSrm, X86::VMOVAPDrm, X86::VMOVDQArm },
+ { X86::VMOVAPSrr, X86::VMOVAPDrr, X86::VMOVDQArr },
+ { X86::VMOVUPSmr, X86::VMOVUPDmr, X86::VMOVDQUmr },
+ { X86::VMOVUPSrm, X86::VMOVUPDrm, X86::VMOVDQUrm },
+ { X86::VMOVNTPSmr, X86::VMOVNTPDmr, X86::VMOVNTDQmr },
+ { X86::VANDNPSrm, X86::VANDNPDrm, X86::VPANDNrm },
+ { X86::VANDNPSrr, X86::VANDNPDrr, X86::VPANDNrr },
+ { X86::VANDPSrm, X86::VANDPDrm, X86::VPANDrm },
+ { X86::VANDPSrr, X86::VANDPDrr, X86::VPANDrr },
+ { X86::VORPSrm, X86::VORPDrm, X86::VPORrm },
+ { X86::VORPSrr, X86::VORPDrr, X86::VPORrr },
+ { X86::VXORPSrm, X86::VXORPDrm, X86::VPXORrm },
+ { X86::VXORPSrr, X86::VXORPDrr, X86::VPXORrr },
+ // AVX 256-bit support
+ { X86::VMOVAPSYmr, X86::VMOVAPDYmr, X86::VMOVDQAYmr },
+ { X86::VMOVAPSYrm, X86::VMOVAPDYrm, X86::VMOVDQAYrm },
+ { X86::VMOVAPSYrr, X86::VMOVAPDYrr, X86::VMOVDQAYrr },
+ { X86::VMOVUPSYmr, X86::VMOVUPDYmr, X86::VMOVDQUYmr },
+ { X86::VMOVUPSYrm, X86::VMOVUPDYrm, X86::VMOVDQUYrm },
+ { X86::VMOVNTPSYmr, X86::VMOVNTPDYmr, X86::VMOVNTDQYmr }
+};
+
+static const uint16_t ReplaceableInstrsAVX2[][3] = {
+ //PackedSingle PackedDouble PackedInt
+ { X86::VANDNPSYrm, X86::VANDNPDYrm, X86::VPANDNYrm },
+ { X86::VANDNPSYrr, X86::VANDNPDYrr, X86::VPANDNYrr },
+ { X86::VANDPSYrm, X86::VANDPDYrm, X86::VPANDYrm },
+ { X86::VANDPSYrr, X86::VANDPDYrr, X86::VPANDYrr },
+ { X86::VORPSYrm, X86::VORPDYrm, X86::VPORYrm },
+ { X86::VORPSYrr, X86::VORPDYrr, X86::VPORYrr },
+ { X86::VXORPSYrm, X86::VXORPDYrm, X86::VPXORYrm },
+ { X86::VXORPSYrr, X86::VXORPDYrr, X86::VPXORYrr },
+ { X86::VEXTRACTF128mr, X86::VEXTRACTF128mr, X86::VEXTRACTI128mr },
+ { X86::VEXTRACTF128rr, X86::VEXTRACTF128rr, X86::VEXTRACTI128rr },
+ { X86::VINSERTF128rm, X86::VINSERTF128rm, X86::VINSERTI128rm },
+ { X86::VINSERTF128rr, X86::VINSERTF128rr, X86::VINSERTI128rr },
+ { X86::VPERM2F128rm, X86::VPERM2F128rm, X86::VPERM2I128rm },
+ { X86::VPERM2F128rr, X86::VPERM2F128rr, X86::VPERM2I128rr }
+};
+
+// FIXME: Some shuffle and unpack instructions have equivalents in different
+// domains, but they require a bit more work than just switching opcodes.
+
+static const uint16_t *lookup(unsigned opcode, unsigned domain) {
+ for (unsigned i = 0, e = array_lengthof(ReplaceableInstrs); i != e; ++i)
+ if (ReplaceableInstrs[i][domain-1] == opcode)
+ return ReplaceableInstrs[i];
+ return 0;
+}
+
+static const uint16_t *lookupAVX2(unsigned opcode, unsigned domain) {
+ for (unsigned i = 0, e = array_lengthof(ReplaceableInstrsAVX2); i != e; ++i)
+ if (ReplaceableInstrsAVX2[i][domain-1] == opcode)
+ return ReplaceableInstrsAVX2[i];
+ return 0;
+}
+
+std::pair<uint16_t, uint16_t>
+X86InstrInfo::getExecutionDomain(const MachineInstr *MI) const {
+ uint16_t domain = (MI->getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
+ bool hasAVX2 = TM.getSubtarget<X86Subtarget>().hasAVX2();
+ uint16_t validDomains = 0;
+ if (domain && lookup(MI->getOpcode(), domain))
+ validDomains = 0xe;
+ else if (domain && lookupAVX2(MI->getOpcode(), domain))
+ validDomains = hasAVX2 ? 0xe : 0x6;
+ return std::make_pair(domain, validDomains);
+}
+
+void X86InstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const {
+ assert(Domain>0 && Domain<4 && "Invalid execution domain");
+ uint16_t dom = (MI->getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
+ assert(dom && "Not an SSE instruction");
+ const uint16_t *table = lookup(MI->getOpcode(), dom);
+ if (!table) { // try the other table
+ assert((TM.getSubtarget<X86Subtarget>().hasAVX2() || Domain < 3) &&
+ "256-bit vector operations only available in AVX2");
+ table = lookupAVX2(MI->getOpcode(), dom);
+ }
+ assert(table && "Cannot change domain");
+ MI->setDesc(get(table[Domain-1]));
+}
+
+/// getNoopForMachoTarget - Return the noop instruction to use for a noop.
+void X86InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const {
+ NopInst.setOpcode(X86::NOOP);
+}
+
+bool X86InstrInfo::isHighLatencyDef(int opc) const {
+ switch (opc) {
+ default: return false;
+ case X86::DIVSDrm:
+ case X86::DIVSDrm_Int:
+ case X86::DIVSDrr:
+ case X86::DIVSDrr_Int:
+ case X86::DIVSSrm:
+ case X86::DIVSSrm_Int:
+ case X86::DIVSSrr:
+ case X86::DIVSSrr_Int:
+ case X86::SQRTPDm:
+ case X86::SQRTPDr:
+ case X86::SQRTPSm:
+ case X86::SQRTPSr:
+ case X86::SQRTSDm:
+ case X86::SQRTSDm_Int:
+ case X86::SQRTSDr:
+ case X86::SQRTSDr_Int:
+ case X86::SQRTSSm:
+ case X86::SQRTSSm_Int:
+ case X86::SQRTSSr:
+ case X86::SQRTSSr_Int:
+ // AVX instructions with high latency
+ case X86::VDIVSDrm:
+ case X86::VDIVSDrm_Int:
+ case X86::VDIVSDrr:
+ case X86::VDIVSDrr_Int:
+ case X86::VDIVSSrm:
+ case X86::VDIVSSrm_Int:
+ case X86::VDIVSSrr:
+ case X86::VDIVSSrr_Int:
+ case X86::VSQRTPDm:
+ case X86::VSQRTPDr:
+ case X86::VSQRTPSm:
+ case X86::VSQRTPSr:
+ case X86::VSQRTSDm:
+ case X86::VSQRTSDm_Int:
+ case X86::VSQRTSDr:
+ case X86::VSQRTSSm:
+ case X86::VSQRTSSm_Int:
+ case X86::VSQRTSSr:
+ return true;
+ }
+}
+
+bool X86InstrInfo::
+hasHighOperandLatency(const InstrItineraryData *ItinData,
+ const MachineRegisterInfo *MRI,
+ const MachineInstr *DefMI, unsigned DefIdx,
+ const MachineInstr *UseMI, unsigned UseIdx) const {
+ return isHighLatencyDef(DefMI->getOpcode());
+}
+
+namespace {
+ /// CGBR - Create Global Base Reg pass. This initializes the PIC
+ /// global base register for x86-32.
+ struct CGBR : public MachineFunctionPass {
+ static char ID;
+ CGBR() : MachineFunctionPass(ID) {}
+
+ virtual bool runOnMachineFunction(MachineFunction &MF) {
+ const X86TargetMachine *TM =
+ static_cast<const X86TargetMachine *>(&MF.getTarget());
+
+ assert(!TM->getSubtarget<X86Subtarget>().is64Bit() &&
+ "X86-64 PIC uses RIP relative addressing");
+
+ // Only emit a global base reg in PIC mode.
+ if (TM->getRelocationModel() != Reloc::PIC_)
+ return false;
+
+ X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+ unsigned GlobalBaseReg = X86FI->getGlobalBaseReg();
+
+ // If we didn't need a GlobalBaseReg, don't insert code.
+ if (GlobalBaseReg == 0)
+ return false;
+
+ // Insert the set of GlobalBaseReg into the first MBB of the function
+ MachineBasicBlock &FirstMBB = MF.front();
+ MachineBasicBlock::iterator MBBI = FirstMBB.begin();
+ DebugLoc DL = FirstMBB.findDebugLoc(MBBI);
+ MachineRegisterInfo &RegInfo = MF.getRegInfo();
+ const X86InstrInfo *TII = TM->getInstrInfo();
+
+ unsigned PC;
+ if (TM->getSubtarget<X86Subtarget>().isPICStyleGOT())
+ PC = RegInfo.createVirtualRegister(&X86::GR32RegClass);
+ else
+ PC = GlobalBaseReg;
+
+ // Operand of MovePCtoStack is completely ignored by asm printer. It's
+ // only used in JIT code emission as displacement to pc.
+ BuildMI(FirstMBB, MBBI, DL, TII->get(X86::MOVPC32r), PC).addImm(0);
+
+ // If we're using vanilla 'GOT' PIC style, we should use relative addressing
+ // not to pc, but to _GLOBAL_OFFSET_TABLE_ external.
+ if (TM->getSubtarget<X86Subtarget>().isPICStyleGOT()) {
+ // Generate addl $__GLOBAL_OFFSET_TABLE_ + [.-piclabel], %some_register
+ BuildMI(FirstMBB, MBBI, DL, TII->get(X86::ADD32ri), GlobalBaseReg)
+ .addReg(PC).addExternalSymbol("_GLOBAL_OFFSET_TABLE_",
+ X86II::MO_GOT_ABSOLUTE_ADDRESS);
+ }
+
+ return true;
+ }
+
+ virtual const char *getPassName() const {
+ return "X86 PIC Global Base Reg Initialization";
+ }
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+ };
+}
+
+char CGBR::ID = 0;
+FunctionPass*
+llvm::createGlobalBaseRegPass() { return new CGBR(); }
+
+namespace {
+ struct LDTLSCleanup : public MachineFunctionPass {
+ static char ID;
+ LDTLSCleanup() : MachineFunctionPass(ID) {}
+
+ virtual bool runOnMachineFunction(MachineFunction &MF) {
+ X86MachineFunctionInfo* MFI = MF.getInfo<X86MachineFunctionInfo>();
+ if (MFI->getNumLocalDynamicTLSAccesses() < 2) {
+ // No point folding accesses if there isn't at least two.
+ return false;
+ }
+
+ MachineDominatorTree *DT = &getAnalysis<MachineDominatorTree>();
+ return VisitNode(DT->getRootNode(), 0);
+ }
+
+ // Visit the dominator subtree rooted at Node in pre-order.
+ // If TLSBaseAddrReg is non-null, then use that to replace any
+ // TLS_base_addr instructions. Otherwise, create the register
+ // when the first such instruction is seen, and then use it
+ // as we encounter more instructions.
+ bool VisitNode(MachineDomTreeNode *Node, unsigned TLSBaseAddrReg) {
+ MachineBasicBlock *BB = Node->getBlock();
+ bool Changed = false;
+
+ // Traverse the current block.
+ for (MachineBasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;
+ ++I) {
+ switch (I->getOpcode()) {
+ case X86::TLS_base_addr32:
+ case X86::TLS_base_addr64:
+ if (TLSBaseAddrReg)
+ I = ReplaceTLSBaseAddrCall(I, TLSBaseAddrReg);
+ else
+ I = SetRegister(I, &TLSBaseAddrReg);
+ Changed = true;
+ break;
+ default:
+ break;
+ }
+ }
+
+ // Visit the children of this block in the dominator tree.
+ for (MachineDomTreeNode::iterator I = Node->begin(), E = Node->end();
+ I != E; ++I) {
+ Changed |= VisitNode(*I, TLSBaseAddrReg);
+ }
+
+ return Changed;
+ }
+
+ // Replace the TLS_base_addr instruction I with a copy from
+ // TLSBaseAddrReg, returning the new instruction.
+ MachineInstr *ReplaceTLSBaseAddrCall(MachineInstr *I,
+ unsigned TLSBaseAddrReg) {
+ MachineFunction *MF = I->getParent()->getParent();
+ const X86TargetMachine *TM =
+ static_cast<const X86TargetMachine *>(&MF->getTarget());
+ const bool is64Bit = TM->getSubtarget<X86Subtarget>().is64Bit();
+ const X86InstrInfo *TII = TM->getInstrInfo();
+
+ // Insert a Copy from TLSBaseAddrReg to RAX/EAX.
+ MachineInstr *Copy = BuildMI(*I->getParent(), I, I->getDebugLoc(),
+ TII->get(TargetOpcode::COPY),
+ is64Bit ? X86::RAX : X86::EAX)
+ .addReg(TLSBaseAddrReg);
+
+ // Erase the TLS_base_addr instruction.
+ I->eraseFromParent();
+
+ return Copy;
+ }
+
+ // Create a virtal register in *TLSBaseAddrReg, and populate it by
+ // inserting a copy instruction after I. Returns the new instruction.
+ MachineInstr *SetRegister(MachineInstr *I, unsigned *TLSBaseAddrReg) {
+ MachineFunction *MF = I->getParent()->getParent();
+ const X86TargetMachine *TM =
+ static_cast<const X86TargetMachine *>(&MF->getTarget());
+ const bool is64Bit = TM->getSubtarget<X86Subtarget>().is64Bit();
+ const X86InstrInfo *TII = TM->getInstrInfo();
+
+ // Create a virtual register for the TLS base address.
+ MachineRegisterInfo &RegInfo = MF->getRegInfo();
+ *TLSBaseAddrReg = RegInfo.createVirtualRegister(is64Bit
+ ? &X86::GR64RegClass
+ : &X86::GR32RegClass);
+
+ // Insert a copy from RAX/EAX to TLSBaseAddrReg.
+ MachineInstr *Next = I->getNextNode();
+ MachineInstr *Copy = BuildMI(*I->getParent(), Next, I->getDebugLoc(),
+ TII->get(TargetOpcode::COPY),
+ *TLSBaseAddrReg)
+ .addReg(is64Bit ? X86::RAX : X86::EAX);
+
+ return Copy;
+ }
+
+ virtual const char *getPassName() const {
+ return "Local Dynamic TLS Access Clean-up";
+ }
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+ AU.addRequired<MachineDominatorTree>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+ };
+}
+
+char LDTLSCleanup::ID = 0;
+FunctionPass*
+llvm::createCleanupLocalDynamicTLSPass() { return new LDTLSCleanup(); }
diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.h b/contrib/llvm/lib/Target/X86/X86InstrInfo.h
new file mode 100644
index 0000000..260f054
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.h
@@ -0,0 +1,419 @@
+//===-- X86InstrInfo.h - X86 Instruction Information ------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86INSTRUCTIONINFO_H
+#define X86INSTRUCTIONINFO_H
+
+#include "X86.h"
+#include "X86RegisterInfo.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+#define GET_INSTRINFO_HEADER
+#include "X86GenInstrInfo.inc"
+
+namespace llvm {
+ class X86RegisterInfo;
+ class X86TargetMachine;
+
+namespace X86 {
+ // X86 specific condition code. These correspond to X86_*_COND in
+ // X86InstrInfo.td. They must be kept in synch.
+ enum CondCode {
+ COND_A = 0,
+ COND_AE = 1,
+ COND_B = 2,
+ COND_BE = 3,
+ COND_E = 4,
+ COND_G = 5,
+ COND_GE = 6,
+ COND_L = 7,
+ COND_LE = 8,
+ COND_NE = 9,
+ COND_NO = 10,
+ COND_NP = 11,
+ COND_NS = 12,
+ COND_O = 13,
+ COND_P = 14,
+ COND_S = 15,
+
+ // Artificial condition codes. These are used by AnalyzeBranch
+ // to indicate a block terminated with two conditional branches to
+ // the same location. This occurs in code using FCMP_OEQ or FCMP_UNE,
+ // which can't be represented on x86 with a single condition. These
+ // are never used in MachineInstrs.
+ COND_NE_OR_P,
+ COND_NP_OR_E,
+
+ COND_INVALID
+ };
+
+ // Turn condition code into conditional branch opcode.
+ unsigned GetCondBranchFromCond(CondCode CC);
+
+ // Turn CMov opcode into condition code.
+ CondCode getCondFromCMovOpc(unsigned Opc);
+
+ /// GetOppositeBranchCondition - Return the inverse of the specified cond,
+ /// e.g. turning COND_E to COND_NE.
+ CondCode GetOppositeBranchCondition(X86::CondCode CC);
+} // end namespace X86;
+
+
+/// isGlobalStubReference - Return true if the specified TargetFlag operand is
+/// a reference to a stub for a global, not the global itself.
+inline static bool isGlobalStubReference(unsigned char TargetFlag) {
+ switch (TargetFlag) {
+ case X86II::MO_DLLIMPORT: // dllimport stub.
+ case X86II::MO_GOTPCREL: // rip-relative GOT reference.
+ case X86II::MO_GOT: // normal GOT reference.
+ case X86II::MO_DARWIN_NONLAZY_PIC_BASE: // Normal $non_lazy_ptr ref.
+ case X86II::MO_DARWIN_NONLAZY: // Normal $non_lazy_ptr ref.
+ case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE: // Hidden $non_lazy_ptr ref.
+ return true;
+ default:
+ return false;
+ }
+}
+
+/// isGlobalRelativeToPICBase - Return true if the specified global value
+/// reference is relative to a 32-bit PIC base (X86ISD::GlobalBaseReg). If this
+/// is true, the addressing mode has the PIC base register added in (e.g. EBX).
+inline static bool isGlobalRelativeToPICBase(unsigned char TargetFlag) {
+ switch (TargetFlag) {
+ case X86II::MO_GOTOFF: // isPICStyleGOT: local global.
+ case X86II::MO_GOT: // isPICStyleGOT: other global.
+ case X86II::MO_PIC_BASE_OFFSET: // Darwin local global.
+ case X86II::MO_DARWIN_NONLAZY_PIC_BASE: // Darwin/32 external global.
+ case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE: // Darwin/32 hidden global.
+ case X86II::MO_TLVP: // ??? Pretty sure..
+ return true;
+ default:
+ return false;
+ }
+}
+
+inline static bool isScale(const MachineOperand &MO) {
+ return MO.isImm() &&
+ (MO.getImm() == 1 || MO.getImm() == 2 ||
+ MO.getImm() == 4 || MO.getImm() == 8);
+}
+
+inline static bool isLeaMem(const MachineInstr *MI, unsigned Op) {
+ if (MI->getOperand(Op).isFI()) return true;
+ return Op+4 <= MI->getNumOperands() &&
+ MI->getOperand(Op ).isReg() && isScale(MI->getOperand(Op+1)) &&
+ MI->getOperand(Op+2).isReg() &&
+ (MI->getOperand(Op+3).isImm() ||
+ MI->getOperand(Op+3).isGlobal() ||
+ MI->getOperand(Op+3).isCPI() ||
+ MI->getOperand(Op+3).isJTI());
+}
+
+inline static bool isMem(const MachineInstr *MI, unsigned Op) {
+ if (MI->getOperand(Op).isFI()) return true;
+ return Op+5 <= MI->getNumOperands() &&
+ MI->getOperand(Op+4).isReg() &&
+ isLeaMem(MI, Op);
+}
+
+class X86InstrInfo : public X86GenInstrInfo {
+ X86TargetMachine &TM;
+ const X86RegisterInfo RI;
+
+ /// RegOp2MemOpTable3Addr, RegOp2MemOpTable0, RegOp2MemOpTable1,
+ /// RegOp2MemOpTable2, RegOp2MemOpTable3 - Load / store folding opcode maps.
+ ///
+ typedef DenseMap<unsigned,
+ std::pair<unsigned, unsigned> > RegOp2MemOpTableType;
+ RegOp2MemOpTableType RegOp2MemOpTable2Addr;
+ RegOp2MemOpTableType RegOp2MemOpTable0;
+ RegOp2MemOpTableType RegOp2MemOpTable1;
+ RegOp2MemOpTableType RegOp2MemOpTable2;
+ RegOp2MemOpTableType RegOp2MemOpTable3;
+
+ /// MemOp2RegOpTable - Load / store unfolding opcode map.
+ ///
+ typedef DenseMap<unsigned,
+ std::pair<unsigned, unsigned> > MemOp2RegOpTableType;
+ MemOp2RegOpTableType MemOp2RegOpTable;
+
+ static void AddTableEntry(RegOp2MemOpTableType &R2MTable,
+ MemOp2RegOpTableType &M2RTable,
+ unsigned RegOp, unsigned MemOp, unsigned Flags);
+
+public:
+ explicit X86InstrInfo(X86TargetMachine &tm);
+
+ /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As
+ /// such, whenever a client has an instance of instruction info, it should
+ /// always be able to get register info as well (through this method).
+ ///
+ virtual const X86RegisterInfo &getRegisterInfo() const { return RI; }
+
+ /// isCoalescableExtInstr - Return true if the instruction is a "coalescable"
+ /// extension instruction. That is, it's like a copy where it's legal for the
+ /// source to overlap the destination. e.g. X86::MOVSX64rr32. If this returns
+ /// true, then it's expected the pre-extension value is available as a subreg
+ /// of the result register. This also returns the sub-register index in
+ /// SubIdx.
+ virtual bool isCoalescableExtInstr(const MachineInstr &MI,
+ unsigned &SrcReg, unsigned &DstReg,
+ unsigned &SubIdx) const;
+
+ unsigned isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const;
+ /// isLoadFromStackSlotPostFE - Check for post-frame ptr elimination
+ /// stack locations as well. This uses a heuristic so it isn't
+ /// reliable for correctness.
+ unsigned isLoadFromStackSlotPostFE(const MachineInstr *MI,
+ int &FrameIndex) const;
+
+ unsigned isStoreToStackSlot(const MachineInstr *MI, int &FrameIndex) const;
+ /// isStoreToStackSlotPostFE - Check for post-frame ptr elimination
+ /// stack locations as well. This uses a heuristic so it isn't
+ /// reliable for correctness.
+ unsigned isStoreToStackSlotPostFE(const MachineInstr *MI,
+ int &FrameIndex) const;
+
+ bool isReallyTriviallyReMaterializable(const MachineInstr *MI,
+ AliasAnalysis *AA) const;
+ void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ unsigned DestReg, unsigned SubIdx,
+ const MachineInstr *Orig,
+ const TargetRegisterInfo &TRI) const;
+
+ /// convertToThreeAddress - This method must be implemented by targets that
+ /// set the M_CONVERTIBLE_TO_3_ADDR flag. When this flag is set, the target
+ /// may be able to convert a two-address instruction into a true
+ /// three-address instruction on demand. This allows the X86 target (for
+ /// example) to convert ADD and SHL instructions into LEA instructions if they
+ /// would require register copies due to two-addressness.
+ ///
+ /// This method returns a null pointer if the transformation cannot be
+ /// performed, otherwise it returns the new instruction.
+ ///
+ virtual MachineInstr *convertToThreeAddress(MachineFunction::iterator &MFI,
+ MachineBasicBlock::iterator &MBBI,
+ LiveVariables *LV) const;
+
+ /// commuteInstruction - We have a few instructions that must be hacked on to
+ /// commute them.
+ ///
+ virtual MachineInstr *commuteInstruction(MachineInstr *MI, bool NewMI) const;
+
+ // Branch analysis.
+ virtual bool isUnpredicatedTerminator(const MachineInstr* MI) const;
+ virtual bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const;
+ virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const;
+ virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB,
+ const SmallVectorImpl<MachineOperand> &Cond,
+ DebugLoc DL) const;
+ virtual bool canInsertSelect(const MachineBasicBlock&,
+ const SmallVectorImpl<MachineOperand> &Cond,
+ unsigned, unsigned, int&, int&, int&) const;
+ virtual void insertSelect(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI, DebugLoc DL,
+ unsigned DstReg,
+ const SmallVectorImpl<MachineOperand> &Cond,
+ unsigned TrueReg, unsigned FalseReg) const;
+ virtual void copyPhysReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI, DebugLoc DL,
+ unsigned DestReg, unsigned SrcReg,
+ bool KillSrc) const;
+ virtual void storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned SrcReg, bool isKill, int FrameIndex,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const;
+
+ virtual void storeRegToAddr(MachineFunction &MF, unsigned SrcReg, bool isKill,
+ SmallVectorImpl<MachineOperand> &Addr,
+ const TargetRegisterClass *RC,
+ MachineInstr::mmo_iterator MMOBegin,
+ MachineInstr::mmo_iterator MMOEnd,
+ SmallVectorImpl<MachineInstr*> &NewMIs) const;
+
+ virtual void loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned DestReg, int FrameIndex,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const;
+
+ virtual void loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
+ SmallVectorImpl<MachineOperand> &Addr,
+ const TargetRegisterClass *RC,
+ MachineInstr::mmo_iterator MMOBegin,
+ MachineInstr::mmo_iterator MMOEnd,
+ SmallVectorImpl<MachineInstr*> &NewMIs) const;
+
+ virtual bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const;
+
+ virtual
+ MachineInstr *emitFrameIndexDebugValue(MachineFunction &MF,
+ int FrameIx, uint64_t Offset,
+ const MDNode *MDPtr,
+ DebugLoc DL) const;
+
+ /// foldMemoryOperand - If this target supports it, fold a load or store of
+ /// the specified stack slot into the specified machine instruction for the
+ /// specified operand(s). If this is possible, the target should perform the
+ /// folding and return true, otherwise it should return false. If it folds
+ /// the instruction, it is likely that the MachineInstruction the iterator
+ /// references has been changed.
+ virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
+ MachineInstr* MI,
+ const SmallVectorImpl<unsigned> &Ops,
+ int FrameIndex) const;
+
+ /// foldMemoryOperand - Same as the previous version except it allows folding
+ /// of any load and store from / to any address, not just from a specific
+ /// stack slot.
+ virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
+ MachineInstr* MI,
+ const SmallVectorImpl<unsigned> &Ops,
+ MachineInstr* LoadMI) const;
+
+ /// canFoldMemoryOperand - Returns true if the specified load / store is
+ /// folding is possible.
+ virtual bool canFoldMemoryOperand(const MachineInstr*,
+ const SmallVectorImpl<unsigned> &) const;
+
+ /// unfoldMemoryOperand - Separate a single instruction which folded a load or
+ /// a store or a load and a store into two or more instruction. If this is
+ /// possible, returns true as well as the new instructions by reference.
+ virtual bool unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
+ unsigned Reg, bool UnfoldLoad, bool UnfoldStore,
+ SmallVectorImpl<MachineInstr*> &NewMIs) const;
+
+ virtual bool unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
+ SmallVectorImpl<SDNode*> &NewNodes) const;
+
+ /// getOpcodeAfterMemoryUnfold - Returns the opcode of the would be new
+ /// instruction after load / store are unfolded from an instruction of the
+ /// specified opcode. It returns zero if the specified unfolding is not
+ /// possible. If LoadRegIndex is non-null, it is filled in with the operand
+ /// index of the operand which will hold the register holding the loaded
+ /// value.
+ virtual unsigned getOpcodeAfterMemoryUnfold(unsigned Opc,
+ bool UnfoldLoad, bool UnfoldStore,
+ unsigned *LoadRegIndex = 0) const;
+
+ /// areLoadsFromSameBasePtr - This is used by the pre-regalloc scheduler
+ /// to determine if two loads are loading from the same base address. It
+ /// should only return true if the base pointers are the same and the
+ /// only differences between the two addresses are the offset. It also returns
+ /// the offsets by reference.
+ virtual bool areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
+ int64_t &Offset1, int64_t &Offset2) const;
+
+ /// shouldScheduleLoadsNear - This is a used by the pre-regalloc scheduler to
+ /// determine (in conjunction with areLoadsFromSameBasePtr) if two loads should
+ /// be scheduled togther. On some targets if two loads are loading from
+ /// addresses in the same cache line, it's better if they are scheduled
+ /// together. This function takes two integers that represent the load offsets
+ /// from the common base address. It returns true if it decides it's desirable
+ /// to schedule the two loads together. "NumLoads" is the number of loads that
+ /// have already been scheduled after Load1.
+ virtual bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
+ int64_t Offset1, int64_t Offset2,
+ unsigned NumLoads) const;
+
+ virtual void getNoopForMachoTarget(MCInst &NopInst) const;
+
+ virtual
+ bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const;
+
+ /// isSafeToMoveRegClassDefs - Return true if it's safe to move a machine
+ /// instruction that defines the specified register class.
+ bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const;
+
+ static bool isX86_64ExtendedReg(const MachineOperand &MO) {
+ if (!MO.isReg()) return false;
+ return X86II::isX86_64ExtendedReg(MO.getReg());
+ }
+
+ /// getGlobalBaseReg - Return a virtual register initialized with the
+ /// the global base register value. Output instructions required to
+ /// initialize the register in the function entry block, if necessary.
+ ///
+ unsigned getGlobalBaseReg(MachineFunction *MF) const;
+
+ std::pair<uint16_t, uint16_t>
+ getExecutionDomain(const MachineInstr *MI) const;
+
+ void setExecutionDomain(MachineInstr *MI, unsigned Domain) const;
+
+ unsigned getPartialRegUpdateClearance(const MachineInstr *MI, unsigned OpNum,
+ const TargetRegisterInfo *TRI) const;
+ void breakPartialRegDependency(MachineBasicBlock::iterator MI, unsigned OpNum,
+ const TargetRegisterInfo *TRI) const;
+
+ MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
+ MachineInstr* MI,
+ unsigned OpNum,
+ const SmallVectorImpl<MachineOperand> &MOs,
+ unsigned Size, unsigned Alignment) const;
+
+ bool isHighLatencyDef(int opc) const;
+
+ bool hasHighOperandLatency(const InstrItineraryData *ItinData,
+ const MachineRegisterInfo *MRI,
+ const MachineInstr *DefMI, unsigned DefIdx,
+ const MachineInstr *UseMI, unsigned UseIdx) const;
+
+ /// analyzeCompare - For a comparison instruction, return the source registers
+ /// in SrcReg and SrcReg2 if having two register operands, and the value it
+ /// compares against in CmpValue. Return true if the comparison instruction
+ /// can be analyzed.
+ virtual bool analyzeCompare(const MachineInstr *MI, unsigned &SrcReg,
+ unsigned &SrcReg2,
+ int &CmpMask, int &CmpValue) const;
+
+ /// optimizeCompareInstr - Check if there exists an earlier instruction that
+ /// operates on the same source operands and sets flags in the same way as
+ /// Compare; remove Compare if possible.
+ virtual bool optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg,
+ unsigned SrcReg2, int CmpMask, int CmpValue,
+ const MachineRegisterInfo *MRI) const;
+
+ /// optimizeLoadInstr - Try to remove the load by folding it to a register
+ /// operand at the use. We fold the load instructions if and only if the
+ /// def and use are in the same BB. We only look at one load and see
+ /// whether it can be folded into MI. FoldAsLoadDefReg is the virtual register
+ /// defined by the load we are trying to fold. DefMI returns the machine
+ /// instruction that defines FoldAsLoadDefReg, and the function returns
+ /// the machine instruction generated due to folding.
+ virtual MachineInstr* optimizeLoadInstr(MachineInstr *MI,
+ const MachineRegisterInfo *MRI,
+ unsigned &FoldAsLoadDefReg,
+ MachineInstr *&DefMI) const;
+
+private:
+ MachineInstr * convertToThreeAddressWithLEA(unsigned MIOpc,
+ MachineFunction::iterator &MFI,
+ MachineBasicBlock::iterator &MBBI,
+ LiveVariables *LV) const;
+
+ /// isFrameOperand - Return true and the FrameIndex if the specified
+ /// operand and follow operands form a reference to the stack frame.
+ bool isFrameOperand(const MachineInstr *MI, unsigned int Op,
+ int &FrameIndex) const;
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.td b/contrib/llvm/lib/Target/X86/X86InstrInfo.td
new file mode 100644
index 0000000..3380d8c
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.td
@@ -0,0 +1,2197 @@
+//===-- X86InstrInfo.td - Main X86 Instruction Definition --*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 instruction set, defining the instructions, and
+// properties of the instructions which are needed for code generation, machine
+// code emission, and analysis.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// X86 specific DAG Nodes.
+//
+
+def SDTIntShiftDOp: SDTypeProfile<1, 3,
+ [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
+ SDTCisInt<0>, SDTCisInt<3>]>;
+
+def SDTX86CmpTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisSameAs<1, 2>]>;
+
+def SDTX86Cmpsd : SDTypeProfile<1, 3, [SDTCisVT<0, f64>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>;
+def SDTX86Cmpss : SDTypeProfile<1, 3, [SDTCisVT<0, f32>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>;
+
+def SDTX86Cmov : SDTypeProfile<1, 4,
+ [SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>,
+ SDTCisVT<3, i8>, SDTCisVT<4, i32>]>;
+
+// Unary and binary operator instructions that set EFLAGS as a side-effect.
+def SDTUnaryArithWithFlags : SDTypeProfile<2, 1,
+ [SDTCisInt<0>, SDTCisVT<1, i32>]>;
+
+def SDTBinaryArithWithFlags : SDTypeProfile<2, 2,
+ [SDTCisSameAs<0, 2>,
+ SDTCisSameAs<0, 3>,
+ SDTCisInt<0>, SDTCisVT<1, i32>]>;
+
+// SDTBinaryArithWithFlagsInOut - RES1, EFLAGS = op LHS, RHS, EFLAGS
+def SDTBinaryArithWithFlagsInOut : SDTypeProfile<2, 3,
+ [SDTCisSameAs<0, 2>,
+ SDTCisSameAs<0, 3>,
+ SDTCisInt<0>,
+ SDTCisVT<1, i32>,
+ SDTCisVT<4, i32>]>;
+// RES1, RES2, FLAGS = op LHS, RHS
+def SDT2ResultBinaryArithWithFlags : SDTypeProfile<3, 2,
+ [SDTCisSameAs<0, 1>,
+ SDTCisSameAs<0, 2>,
+ SDTCisSameAs<0, 3>,
+ SDTCisInt<0>, SDTCisVT<1, i32>]>;
+def SDTX86BrCond : SDTypeProfile<0, 3,
+ [SDTCisVT<0, OtherVT>,
+ SDTCisVT<1, i8>, SDTCisVT<2, i32>]>;
+
+def SDTX86SetCC : SDTypeProfile<1, 2,
+ [SDTCisVT<0, i8>,
+ SDTCisVT<1, i8>, SDTCisVT<2, i32>]>;
+def SDTX86SetCC_C : SDTypeProfile<1, 2,
+ [SDTCisInt<0>,
+ SDTCisVT<1, i8>, SDTCisVT<2, i32>]>;
+
+def SDTX86sahf : SDTypeProfile<1, 1, [SDTCisVT<0, i32>, SDTCisVT<1, i8>]>;
+
+def SDTX86rdrand : SDTypeProfile<2, 0, [SDTCisInt<0>, SDTCisVT<1, i32>]>;
+
+def SDTX86cas : SDTypeProfile<0, 3, [SDTCisPtrTy<0>, SDTCisInt<1>,
+ SDTCisVT<2, i8>]>;
+def SDTX86caspair : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
+
+def SDTX86atomicBinary : SDTypeProfile<2, 3, [SDTCisInt<0>, SDTCisInt<1>,
+ SDTCisPtrTy<2>, SDTCisInt<3>,SDTCisInt<4>]>;
+def SDTX86Ret : SDTypeProfile<0, -1, [SDTCisVT<0, i16>]>;
+
+def SDT_X86CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>]>;
+def SDT_X86CallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>,
+ SDTCisVT<1, i32>]>;
+
+def SDT_X86Call : SDTypeProfile<0, -1, [SDTCisVT<0, iPTR>]>;
+
+def SDT_X86VASTART_SAVE_XMM_REGS : SDTypeProfile<0, -1, [SDTCisVT<0, i8>,
+ SDTCisVT<1, iPTR>,
+ SDTCisVT<2, iPTR>]>;
+
+def SDT_X86VAARG_64 : SDTypeProfile<1, -1, [SDTCisPtrTy<0>,
+ SDTCisPtrTy<1>,
+ SDTCisVT<2, i32>,
+ SDTCisVT<3, i8>,
+ SDTCisVT<4, i32>]>;
+
+def SDTX86RepStr : SDTypeProfile<0, 1, [SDTCisVT<0, OtherVT>]>;
+
+def SDTX86Void : SDTypeProfile<0, 0, []>;
+
+def SDTX86Wrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>;
+
+def SDT_X86TLSADDR : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+
+def SDT_X86TLSBASEADDR : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+
+def SDT_X86TLSCALL : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+
+def SDT_X86SEG_ALLOCA : SDTypeProfile<1, 1, [SDTCisVT<0, iPTR>, SDTCisVT<1, iPTR>]>;
+
+def SDT_X86WIN_FTOL : SDTypeProfile<0, 1, [SDTCisFP<0>]>;
+
+def SDT_X86EHRET : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+
+def SDT_X86TCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisVT<1, i32>]>;
+
+def SDT_X86MEMBARRIER : SDTypeProfile<0, 0, []>;
+
+def X86MemBarrier : SDNode<"X86ISD::MEMBARRIER", SDT_X86MEMBARRIER,
+ [SDNPHasChain,SDNPSideEffect]>;
+def X86MFence : SDNode<"X86ISD::MFENCE", SDT_X86MEMBARRIER,
+ [SDNPHasChain]>;
+def X86SFence : SDNode<"X86ISD::SFENCE", SDT_X86MEMBARRIER,
+ [SDNPHasChain]>;
+def X86LFence : SDNode<"X86ISD::LFENCE", SDT_X86MEMBARRIER,
+ [SDNPHasChain]>;
+
+
+def X86bsf : SDNode<"X86ISD::BSF", SDTUnaryArithWithFlags>;
+def X86bsr : SDNode<"X86ISD::BSR", SDTUnaryArithWithFlags>;
+def X86shld : SDNode<"X86ISD::SHLD", SDTIntShiftDOp>;
+def X86shrd : SDNode<"X86ISD::SHRD", SDTIntShiftDOp>;
+
+def X86cmp : SDNode<"X86ISD::CMP" , SDTX86CmpTest>;
+def X86bt : SDNode<"X86ISD::BT", SDTX86CmpTest>;
+
+def X86cmov : SDNode<"X86ISD::CMOV", SDTX86Cmov>;
+def X86brcond : SDNode<"X86ISD::BRCOND", SDTX86BrCond,
+ [SDNPHasChain]>;
+def X86setcc : SDNode<"X86ISD::SETCC", SDTX86SetCC>;
+def X86setcc_c : SDNode<"X86ISD::SETCC_CARRY", SDTX86SetCC_C>;
+
+def X86sahf : SDNode<"X86ISD::SAHF", SDTX86sahf>;
+
+def X86rdrand : SDNode<"X86ISD::RDRAND", SDTX86rdrand,
+ [SDNPHasChain, SDNPSideEffect]>;
+
+def X86rdseed : SDNode<"X86ISD::RDSEED", SDTX86rdrand,
+ [SDNPHasChain, SDNPSideEffect]>;
+
+def X86cas : SDNode<"X86ISD::LCMPXCHG_DAG", SDTX86cas,
+ [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore,
+ SDNPMayLoad, SDNPMemOperand]>;
+def X86cas8 : SDNode<"X86ISD::LCMPXCHG8_DAG", SDTX86caspair,
+ [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore,
+ SDNPMayLoad, SDNPMemOperand]>;
+def X86cas16 : SDNode<"X86ISD::LCMPXCHG16_DAG", SDTX86caspair,
+ [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore,
+ SDNPMayLoad, SDNPMemOperand]>;
+
+def X86AtomAdd64 : SDNode<"X86ISD::ATOMADD64_DAG", SDTX86atomicBinary,
+ [SDNPHasChain, SDNPMayStore,
+ SDNPMayLoad, SDNPMemOperand]>;
+def X86AtomSub64 : SDNode<"X86ISD::ATOMSUB64_DAG", SDTX86atomicBinary,
+ [SDNPHasChain, SDNPMayStore,
+ SDNPMayLoad, SDNPMemOperand]>;
+def X86AtomOr64 : SDNode<"X86ISD::ATOMOR64_DAG", SDTX86atomicBinary,
+ [SDNPHasChain, SDNPMayStore,
+ SDNPMayLoad, SDNPMemOperand]>;
+def X86AtomXor64 : SDNode<"X86ISD::ATOMXOR64_DAG", SDTX86atomicBinary,
+ [SDNPHasChain, SDNPMayStore,
+ SDNPMayLoad, SDNPMemOperand]>;
+def X86AtomAnd64 : SDNode<"X86ISD::ATOMAND64_DAG", SDTX86atomicBinary,
+ [SDNPHasChain, SDNPMayStore,
+ SDNPMayLoad, SDNPMemOperand]>;
+def X86AtomNand64 : SDNode<"X86ISD::ATOMNAND64_DAG", SDTX86atomicBinary,
+ [SDNPHasChain, SDNPMayStore,
+ SDNPMayLoad, SDNPMemOperand]>;
+def X86AtomSwap64 : SDNode<"X86ISD::ATOMSWAP64_DAG", SDTX86atomicBinary,
+ [SDNPHasChain, SDNPMayStore,
+ SDNPMayLoad, SDNPMemOperand]>;
+def X86retflag : SDNode<"X86ISD::RET_FLAG", SDTX86Ret,
+ [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+
+def X86vastart_save_xmm_regs :
+ SDNode<"X86ISD::VASTART_SAVE_XMM_REGS",
+ SDT_X86VASTART_SAVE_XMM_REGS,
+ [SDNPHasChain, SDNPVariadic]>;
+def X86vaarg64 :
+ SDNode<"X86ISD::VAARG_64", SDT_X86VAARG_64,
+ [SDNPHasChain, SDNPMayLoad, SDNPMayStore,
+ SDNPMemOperand]>;
+def X86callseq_start :
+ SDNode<"ISD::CALLSEQ_START", SDT_X86CallSeqStart,
+ [SDNPHasChain, SDNPOutGlue]>;
+def X86callseq_end :
+ SDNode<"ISD::CALLSEQ_END", SDT_X86CallSeqEnd,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+def X86call : SDNode<"X86ISD::CALL", SDT_X86Call,
+ [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue,
+ SDNPVariadic]>;
+
+def X86rep_stos: SDNode<"X86ISD::REP_STOS", SDTX86RepStr,
+ [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore]>;
+def X86rep_movs: SDNode<"X86ISD::REP_MOVS", SDTX86RepStr,
+ [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore,
+ SDNPMayLoad]>;
+
+def X86rdtsc : SDNode<"X86ISD::RDTSC_DAG", SDTX86Void,
+ [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>;
+
+def X86Wrapper : SDNode<"X86ISD::Wrapper", SDTX86Wrapper>;
+def X86WrapperRIP : SDNode<"X86ISD::WrapperRIP", SDTX86Wrapper>;
+
+def X86tlsaddr : SDNode<"X86ISD::TLSADDR", SDT_X86TLSADDR,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+def X86tlsbaseaddr : SDNode<"X86ISD::TLSBASEADDR", SDT_X86TLSBASEADDR,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+def X86ehret : SDNode<"X86ISD::EH_RETURN", SDT_X86EHRET,
+ [SDNPHasChain]>;
+
+def X86eh_sjlj_setjmp : SDNode<"X86ISD::EH_SJLJ_SETJMP",
+ SDTypeProfile<1, 1, [SDTCisInt<0>,
+ SDTCisPtrTy<1>]>,
+ [SDNPHasChain, SDNPSideEffect]>;
+def X86eh_sjlj_longjmp : SDNode<"X86ISD::EH_SJLJ_LONGJMP",
+ SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>,
+ [SDNPHasChain, SDNPSideEffect]>;
+
+def X86tcret : SDNode<"X86ISD::TC_RETURN", SDT_X86TCRET,
+ [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+
+def X86add_flag : SDNode<"X86ISD::ADD", SDTBinaryArithWithFlags,
+ [SDNPCommutative]>;
+def X86sub_flag : SDNode<"X86ISD::SUB", SDTBinaryArithWithFlags>;
+def X86smul_flag : SDNode<"X86ISD::SMUL", SDTBinaryArithWithFlags,
+ [SDNPCommutative]>;
+def X86umul_flag : SDNode<"X86ISD::UMUL", SDT2ResultBinaryArithWithFlags,
+ [SDNPCommutative]>;
+def X86adc_flag : SDNode<"X86ISD::ADC", SDTBinaryArithWithFlagsInOut>;
+def X86sbb_flag : SDNode<"X86ISD::SBB", SDTBinaryArithWithFlagsInOut>;
+
+def X86inc_flag : SDNode<"X86ISD::INC", SDTUnaryArithWithFlags>;
+def X86dec_flag : SDNode<"X86ISD::DEC", SDTUnaryArithWithFlags>;
+def X86or_flag : SDNode<"X86ISD::OR", SDTBinaryArithWithFlags,
+ [SDNPCommutative]>;
+def X86xor_flag : SDNode<"X86ISD::XOR", SDTBinaryArithWithFlags,
+ [SDNPCommutative]>;
+def X86and_flag : SDNode<"X86ISD::AND", SDTBinaryArithWithFlags,
+ [SDNPCommutative]>;
+def X86andn_flag : SDNode<"X86ISD::ANDN", SDTBinaryArithWithFlags>;
+
+def X86blsi : SDNode<"X86ISD::BLSI", SDTIntUnaryOp>;
+def X86blsmsk : SDNode<"X86ISD::BLSMSK", SDTIntUnaryOp>;
+def X86blsr : SDNode<"X86ISD::BLSR", SDTIntUnaryOp>;
+
+def X86mul_imm : SDNode<"X86ISD::MUL_IMM", SDTIntBinOp>;
+
+def X86WinAlloca : SDNode<"X86ISD::WIN_ALLOCA", SDTX86Void,
+ [SDNPHasChain, SDNPInGlue, SDNPOutGlue]>;
+
+def X86SegAlloca : SDNode<"X86ISD::SEG_ALLOCA", SDT_X86SEG_ALLOCA,
+ [SDNPHasChain]>;
+
+def X86TLSCall : SDNode<"X86ISD::TLSCALL", SDT_X86TLSCALL,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+def X86WinFTOL : SDNode<"X86ISD::WIN_FTOL", SDT_X86WIN_FTOL,
+ [SDNPHasChain, SDNPOutGlue]>;
+
+//===----------------------------------------------------------------------===//
+// X86 Operand Definitions.
+//
+
+// A version of ptr_rc which excludes SP, ESP, and RSP. This is used for
+// the index operand of an address, to conform to x86 encoding restrictions.
+def ptr_rc_nosp : PointerLikeRegClass<1>;
+
+// *mem - Operand definitions for the funky X86 addressing mode operands.
+//
+def X86MemAsmOperand : AsmOperandClass {
+ let Name = "Mem"; let PredicateMethod = "isMem";
+}
+def X86Mem8AsmOperand : AsmOperandClass {
+ let Name = "Mem8"; let PredicateMethod = "isMem8";
+}
+def X86Mem16AsmOperand : AsmOperandClass {
+ let Name = "Mem16"; let PredicateMethod = "isMem16";
+}
+def X86Mem32AsmOperand : AsmOperandClass {
+ let Name = "Mem32"; let PredicateMethod = "isMem32";
+}
+def X86Mem64AsmOperand : AsmOperandClass {
+ let Name = "Mem64"; let PredicateMethod = "isMem64";
+}
+def X86Mem80AsmOperand : AsmOperandClass {
+ let Name = "Mem80"; let PredicateMethod = "isMem80";
+}
+def X86Mem128AsmOperand : AsmOperandClass {
+ let Name = "Mem128"; let PredicateMethod = "isMem128";
+}
+def X86Mem256AsmOperand : AsmOperandClass {
+ let Name = "Mem256"; let PredicateMethod = "isMem256";
+}
+
+// Gather mem operands
+def X86MemVX32Operand : AsmOperandClass {
+ let Name = "MemVX32"; let PredicateMethod = "isMemVX32";
+}
+def X86MemVY32Operand : AsmOperandClass {
+ let Name = "MemVY32"; let PredicateMethod = "isMemVY32";
+}
+def X86MemVX64Operand : AsmOperandClass {
+ let Name = "MemVX64"; let PredicateMethod = "isMemVX64";
+}
+def X86MemVY64Operand : AsmOperandClass {
+ let Name = "MemVY64"; let PredicateMethod = "isMemVY64";
+}
+
+def X86AbsMemAsmOperand : AsmOperandClass {
+ let Name = "AbsMem";
+ let SuperClasses = [X86MemAsmOperand];
+}
+class X86MemOperand<string printMethod> : Operand<iPTR> {
+ let PrintMethod = printMethod;
+ let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc_nosp, i32imm, i8imm);
+ let ParserMatchClass = X86MemAsmOperand;
+}
+
+let OperandType = "OPERAND_MEMORY" in {
+def opaque32mem : X86MemOperand<"printopaquemem">;
+def opaque48mem : X86MemOperand<"printopaquemem">;
+def opaque80mem : X86MemOperand<"printopaquemem">;
+def opaque512mem : X86MemOperand<"printopaquemem">;
+
+def i8mem : X86MemOperand<"printi8mem"> {
+ let ParserMatchClass = X86Mem8AsmOperand; }
+def i16mem : X86MemOperand<"printi16mem"> {
+ let ParserMatchClass = X86Mem16AsmOperand; }
+def i32mem : X86MemOperand<"printi32mem"> {
+ let ParserMatchClass = X86Mem32AsmOperand; }
+def i64mem : X86MemOperand<"printi64mem"> {
+ let ParserMatchClass = X86Mem64AsmOperand; }
+def i128mem : X86MemOperand<"printi128mem"> {
+ let ParserMatchClass = X86Mem128AsmOperand; }
+def i256mem : X86MemOperand<"printi256mem"> {
+ let ParserMatchClass = X86Mem256AsmOperand; }
+def f32mem : X86MemOperand<"printf32mem"> {
+ let ParserMatchClass = X86Mem32AsmOperand; }
+def f64mem : X86MemOperand<"printf64mem"> {
+ let ParserMatchClass = X86Mem64AsmOperand; }
+def f80mem : X86MemOperand<"printf80mem"> {
+ let ParserMatchClass = X86Mem80AsmOperand; }
+def f128mem : X86MemOperand<"printf128mem"> {
+ let ParserMatchClass = X86Mem128AsmOperand; }
+def f256mem : X86MemOperand<"printf256mem">{
+ let ParserMatchClass = X86Mem256AsmOperand; }
+
+// Gather mem operands
+def vx32mem : X86MemOperand<"printi32mem">{
+ let MIOperandInfo = (ops ptr_rc, i8imm, VR128, i32imm, i8imm);
+ let ParserMatchClass = X86MemVX32Operand; }
+def vy32mem : X86MemOperand<"printi32mem">{
+ let MIOperandInfo = (ops ptr_rc, i8imm, VR256, i32imm, i8imm);
+ let ParserMatchClass = X86MemVY32Operand; }
+def vx64mem : X86MemOperand<"printi64mem">{
+ let MIOperandInfo = (ops ptr_rc, i8imm, VR128, i32imm, i8imm);
+ let ParserMatchClass = X86MemVX64Operand; }
+def vy64mem : X86MemOperand<"printi64mem">{
+ let MIOperandInfo = (ops ptr_rc, i8imm, VR256, i32imm, i8imm);
+ let ParserMatchClass = X86MemVY64Operand; }
+}
+
+// A version of i8mem for use on x86-64 that uses GR64_NOREX instead of
+// plain GR64, so that it doesn't potentially require a REX prefix.
+def i8mem_NOREX : Operand<i64> {
+ let PrintMethod = "printi8mem";
+ let MIOperandInfo = (ops GR64_NOREX, i8imm, GR64_NOREX_NOSP, i32imm, i8imm);
+ let ParserMatchClass = X86Mem8AsmOperand;
+ let OperandType = "OPERAND_MEMORY";
+}
+
+// GPRs available for tailcall.
+// It represents GR32_TC, GR64_TC or GR64_TCW64.
+def ptr_rc_tailcall : PointerLikeRegClass<2>;
+
+// Special i32mem for addresses of load folding tail calls. These are not
+// allowed to use callee-saved registers since they must be scheduled
+// after callee-saved register are popped.
+def i32mem_TC : Operand<i32> {
+ let PrintMethod = "printi32mem";
+ let MIOperandInfo = (ops ptr_rc_tailcall, i8imm, ptr_rc_tailcall,
+ i32imm, i8imm);
+ let ParserMatchClass = X86Mem32AsmOperand;
+ let OperandType = "OPERAND_MEMORY";
+}
+
+// Special i64mem for addresses of load folding tail calls. These are not
+// allowed to use callee-saved registers since they must be scheduled
+// after callee-saved register are popped.
+def i64mem_TC : Operand<i64> {
+ let PrintMethod = "printi64mem";
+ let MIOperandInfo = (ops ptr_rc_tailcall, i8imm,
+ ptr_rc_tailcall, i32imm, i8imm);
+ let ParserMatchClass = X86Mem64AsmOperand;
+ let OperandType = "OPERAND_MEMORY";
+}
+
+let OperandType = "OPERAND_PCREL",
+ ParserMatchClass = X86AbsMemAsmOperand,
+ PrintMethod = "printPCRelImm" in {
+def i32imm_pcrel : Operand<i32>;
+def i16imm_pcrel : Operand<i16>;
+
+def offset8 : Operand<i64>;
+def offset16 : Operand<i64>;
+def offset32 : Operand<i64>;
+def offset64 : Operand<i64>;
+
+// Branch targets have OtherVT type and print as pc-relative values.
+def brtarget : Operand<OtherVT>;
+def brtarget8 : Operand<OtherVT>;
+
+}
+
+def SSECC : Operand<i8> {
+ let PrintMethod = "printSSECC";
+ let OperandType = "OPERAND_IMMEDIATE";
+}
+
+def AVXCC : Operand<i8> {
+ let PrintMethod = "printAVXCC";
+ let OperandType = "OPERAND_IMMEDIATE";
+}
+
+class ImmSExtAsmOperandClass : AsmOperandClass {
+ let SuperClasses = [ImmAsmOperand];
+ let RenderMethod = "addImmOperands";
+}
+
+class ImmZExtAsmOperandClass : AsmOperandClass {
+ let SuperClasses = [ImmAsmOperand];
+ let RenderMethod = "addImmOperands";
+}
+
+// Sign-extended immediate classes. We don't need to define the full lattice
+// here because there is no instruction with an ambiguity between ImmSExti64i32
+// and ImmSExti32i8.
+//
+// The strange ranges come from the fact that the assembler always works with
+// 64-bit immediates, but for a 16-bit target value we want to accept both "-1"
+// (which will be a -1ULL), and "0xFF" (-1 in 16-bits).
+
+// [0, 0x7FFFFFFF] |
+// [0xFFFFFFFF80000000, 0xFFFFFFFFFFFFFFFF]
+def ImmSExti64i32AsmOperand : ImmSExtAsmOperandClass {
+ let Name = "ImmSExti64i32";
+}
+
+// [0, 0x0000007F] | [0x000000000000FF80, 0x000000000000FFFF] |
+// [0xFFFFFFFFFFFFFF80, 0xFFFFFFFFFFFFFFFF]
+def ImmSExti16i8AsmOperand : ImmSExtAsmOperandClass {
+ let Name = "ImmSExti16i8";
+ let SuperClasses = [ImmSExti64i32AsmOperand];
+}
+
+// [0, 0x0000007F] | [0x00000000FFFFFF80, 0x00000000FFFFFFFF] |
+// [0xFFFFFFFFFFFFFF80, 0xFFFFFFFFFFFFFFFF]
+def ImmSExti32i8AsmOperand : ImmSExtAsmOperandClass {
+ let Name = "ImmSExti32i8";
+}
+
+// [0, 0x000000FF]
+def ImmZExtu32u8AsmOperand : ImmZExtAsmOperandClass {
+ let Name = "ImmZExtu32u8";
+}
+
+
+// [0, 0x0000007F] |
+// [0xFFFFFFFFFFFFFF80, 0xFFFFFFFFFFFFFFFF]
+def ImmSExti64i8AsmOperand : ImmSExtAsmOperandClass {
+ let Name = "ImmSExti64i8";
+ let SuperClasses = [ImmSExti16i8AsmOperand, ImmSExti32i8AsmOperand,
+ ImmSExti64i32AsmOperand];
+}
+
+// A couple of more descriptive operand definitions.
+// 16-bits but only 8 bits are significant.
+def i16i8imm : Operand<i16> {
+ let ParserMatchClass = ImmSExti16i8AsmOperand;
+ let OperandType = "OPERAND_IMMEDIATE";
+}
+// 32-bits but only 8 bits are significant.
+def i32i8imm : Operand<i32> {
+ let ParserMatchClass = ImmSExti32i8AsmOperand;
+ let OperandType = "OPERAND_IMMEDIATE";
+}
+// 32-bits but only 8 bits are significant, and those 8 bits are unsigned.
+def u32u8imm : Operand<i32> {
+ let ParserMatchClass = ImmZExtu32u8AsmOperand;
+ let OperandType = "OPERAND_IMMEDIATE";
+}
+
+// 64-bits but only 32 bits are significant.
+def i64i32imm : Operand<i64> {
+ let ParserMatchClass = ImmSExti64i32AsmOperand;
+ let OperandType = "OPERAND_IMMEDIATE";
+}
+
+// 64-bits but only 32 bits are significant, and those bits are treated as being
+// pc relative.
+def i64i32imm_pcrel : Operand<i64> {
+ let PrintMethod = "printPCRelImm";
+ let ParserMatchClass = X86AbsMemAsmOperand;
+ let OperandType = "OPERAND_PCREL";
+}
+
+// 64-bits but only 8 bits are significant.
+def i64i8imm : Operand<i64> {
+ let ParserMatchClass = ImmSExti64i8AsmOperand;
+ let OperandType = "OPERAND_IMMEDIATE";
+}
+
+def lea64_32mem : Operand<i32> {
+ let PrintMethod = "printi32mem";
+ let AsmOperandLowerMethod = "lower_lea64_32mem";
+ let MIOperandInfo = (ops GR32, i8imm, GR32_NOSP, i32imm, i8imm);
+ let ParserMatchClass = X86MemAsmOperand;
+}
+
+// Memory operands that use 64-bit pointers in both ILP32 and LP64.
+def lea64mem : Operand<i64> {
+ let PrintMethod = "printi64mem";
+ let MIOperandInfo = (ops GR64, i8imm, GR64_NOSP, i32imm, i8imm);
+ let ParserMatchClass = X86MemAsmOperand;
+}
+
+
+//===----------------------------------------------------------------------===//
+// X86 Complex Pattern Definitions.
+//
+
+// Define X86 specific addressing mode.
+def addr : ComplexPattern<iPTR, 5, "SelectAddr", [], [SDNPWantParent]>;
+def lea32addr : ComplexPattern<i32, 5, "SelectLEAAddr",
+ [add, sub, mul, X86mul_imm, shl, or, frameindex],
+ []>;
+// In 64-bit mode 32-bit LEAs can use RIP-relative addressing.
+def lea64_32addr : ComplexPattern<i32, 5, "SelectLEAAddr",
+ [add, sub, mul, X86mul_imm, shl, or,
+ frameindex, X86WrapperRIP],
+ []>;
+
+def tls32addr : ComplexPattern<i32, 5, "SelectTLSADDRAddr",
+ [tglobaltlsaddr], []>;
+
+def tls32baseaddr : ComplexPattern<i32, 5, "SelectTLSADDRAddr",
+ [tglobaltlsaddr], []>;
+
+def lea64addr : ComplexPattern<i64, 5, "SelectLEAAddr",
+ [add, sub, mul, X86mul_imm, shl, or, frameindex,
+ X86WrapperRIP], []>;
+
+def tls64addr : ComplexPattern<i64, 5, "SelectTLSADDRAddr",
+ [tglobaltlsaddr], []>;
+
+def tls64baseaddr : ComplexPattern<i64, 5, "SelectTLSADDRAddr",
+ [tglobaltlsaddr], []>;
+
+//===----------------------------------------------------------------------===//
+// X86 Instruction Predicate Definitions.
+def HasCMov : Predicate<"Subtarget->hasCMov()">;
+def NoCMov : Predicate<"!Subtarget->hasCMov()">;
+
+def HasMMX : Predicate<"Subtarget->hasMMX()">;
+def Has3DNow : Predicate<"Subtarget->has3DNow()">;
+def Has3DNowA : Predicate<"Subtarget->has3DNowA()">;
+def HasSSE1 : Predicate<"Subtarget->hasSSE1()">;
+def UseSSE1 : Predicate<"Subtarget->hasSSE1() && !Subtarget->hasAVX()">;
+def HasSSE2 : Predicate<"Subtarget->hasSSE2()">;
+def UseSSE2 : Predicate<"Subtarget->hasSSE2() && !Subtarget->hasAVX()">;
+def HasSSE3 : Predicate<"Subtarget->hasSSE3()">;
+def UseSSE3 : Predicate<"Subtarget->hasSSE3() && !Subtarget->hasAVX()">;
+def HasSSSE3 : Predicate<"Subtarget->hasSSSE3()">;
+def UseSSSE3 : Predicate<"Subtarget->hasSSSE3() && !Subtarget->hasAVX()">;
+def HasSSE41 : Predicate<"Subtarget->hasSSE41()">;
+def UseSSE41 : Predicate<"Subtarget->hasSSE41() && !Subtarget->hasAVX()">;
+def HasSSE42 : Predicate<"Subtarget->hasSSE42()">;
+def UseSSE42 : Predicate<"Subtarget->hasSSE42() && !Subtarget->hasAVX()">;
+def HasSSE4A : Predicate<"Subtarget->hasSSE4A()">;
+def HasAVX : Predicate<"Subtarget->hasAVX()">;
+def HasAVX2 : Predicate<"Subtarget->hasAVX2()">;
+def HasAVX1Only : Predicate<"Subtarget->hasAVX() && !Subtarget->hasAVX2()">;
+
+def HasPOPCNT : Predicate<"Subtarget->hasPOPCNT()">;
+def HasAES : Predicate<"Subtarget->hasAES()">;
+def HasPCLMUL : Predicate<"Subtarget->hasPCLMUL()">;
+def HasFMA : Predicate<"Subtarget->hasFMA()">;
+def HasFMA4 : Predicate<"Subtarget->hasFMA4()">;
+def HasXOP : Predicate<"Subtarget->hasXOP()">;
+def HasMOVBE : Predicate<"Subtarget->hasMOVBE()">;
+def HasRDRAND : Predicate<"Subtarget->hasRDRAND()">;
+def HasF16C : Predicate<"Subtarget->hasF16C()">;
+def HasFSGSBase : Predicate<"Subtarget->hasFSGSBase()">;
+def HasLZCNT : Predicate<"Subtarget->hasLZCNT()">;
+def HasBMI : Predicate<"Subtarget->hasBMI()">;
+def HasBMI2 : Predicate<"Subtarget->hasBMI2()">;
+def HasRTM : Predicate<"Subtarget->hasRTM()">;
+def HasHLE : Predicate<"Subtarget->hasHLE()">;
+def HasTSX : Predicate<"Subtarget->hasRTM() || Subtarget->hasHLE()">;
+def HasADX : Predicate<"Subtarget->hasADX()">;
+def HasPRFCHW : Predicate<"Subtarget->hasPRFCHW()">;
+def HasRDSEED : Predicate<"Subtarget->hasRDSEED()">;
+def HasPrefetchW : Predicate<"Subtarget->has3DNow() || Subtarget->hasPRFCHW()">;
+def FPStackf32 : Predicate<"!Subtarget->hasSSE1()">;
+def FPStackf64 : Predicate<"!Subtarget->hasSSE2()">;
+def HasCmpxchg16b: Predicate<"Subtarget->hasCmpxchg16b()">;
+def In32BitMode : Predicate<"!Subtarget->is64Bit()">,
+ AssemblerPredicate<"!Mode64Bit", "32-bit mode">;
+def In64BitMode : Predicate<"Subtarget->is64Bit()">,
+ AssemblerPredicate<"Mode64Bit", "64-bit mode">;
+def IsWin64 : Predicate<"Subtarget->isTargetWin64()">;
+def IsNaCl : Predicate<"Subtarget->isTargetNaCl()">;
+def NotNaCl : Predicate<"!Subtarget->isTargetNaCl()">;
+def SmallCode : Predicate<"TM.getCodeModel() == CodeModel::Small">;
+def KernelCode : Predicate<"TM.getCodeModel() == CodeModel::Kernel">;
+def FarData : Predicate<"TM.getCodeModel() != CodeModel::Small &&"
+ "TM.getCodeModel() != CodeModel::Kernel">;
+def NearData : Predicate<"TM.getCodeModel() == CodeModel::Small ||"
+ "TM.getCodeModel() == CodeModel::Kernel">;
+def IsStatic : Predicate<"TM.getRelocationModel() == Reloc::Static">;
+def IsNotPIC : Predicate<"TM.getRelocationModel() != Reloc::PIC_">;
+def OptForSize : Predicate<"OptForSize">;
+def OptForSpeed : Predicate<"!OptForSize">;
+def FastBTMem : Predicate<"!Subtarget->isBTMemSlow()">;
+def CallImmAddr : Predicate<"Subtarget->IsLegalToCallImmediateAddr(TM)">;
+def FavorMemIndirectCall : Predicate<"!Subtarget->callRegIndirect()">;
+
+//===----------------------------------------------------------------------===//
+// X86 Instruction Format Definitions.
+//
+
+include "X86InstrFormats.td"
+
+//===----------------------------------------------------------------------===//
+// Pattern fragments.
+//
+
+// X86 specific condition code. These correspond to CondCode in
+// X86InstrInfo.h. They must be kept in synch.
+def X86_COND_A : PatLeaf<(i8 0)>; // alt. COND_NBE
+def X86_COND_AE : PatLeaf<(i8 1)>; // alt. COND_NC
+def X86_COND_B : PatLeaf<(i8 2)>; // alt. COND_C
+def X86_COND_BE : PatLeaf<(i8 3)>; // alt. COND_NA
+def X86_COND_E : PatLeaf<(i8 4)>; // alt. COND_Z
+def X86_COND_G : PatLeaf<(i8 5)>; // alt. COND_NLE
+def X86_COND_GE : PatLeaf<(i8 6)>; // alt. COND_NL
+def X86_COND_L : PatLeaf<(i8 7)>; // alt. COND_NGE
+def X86_COND_LE : PatLeaf<(i8 8)>; // alt. COND_NG
+def X86_COND_NE : PatLeaf<(i8 9)>; // alt. COND_NZ
+def X86_COND_NO : PatLeaf<(i8 10)>;
+def X86_COND_NP : PatLeaf<(i8 11)>; // alt. COND_PO
+def X86_COND_NS : PatLeaf<(i8 12)>;
+def X86_COND_O : PatLeaf<(i8 13)>;
+def X86_COND_P : PatLeaf<(i8 14)>; // alt. COND_PE
+def X86_COND_S : PatLeaf<(i8 15)>;
+
+let FastIselShouldIgnore = 1 in { // FastIsel should ignore all simm8 instrs.
+ def i16immSExt8 : ImmLeaf<i16, [{ return Imm == (int8_t)Imm; }]>;
+ def i32immSExt8 : ImmLeaf<i32, [{ return Imm == (int8_t)Imm; }]>;
+ def i64immSExt8 : ImmLeaf<i64, [{ return Imm == (int8_t)Imm; }]>;
+}
+
+def i64immSExt32 : ImmLeaf<i64, [{ return Imm == (int32_t)Imm; }]>;
+
+
+// i64immZExt32 predicate - True if the 64-bit immediate fits in a 32-bit
+// unsigned field.
+def i64immZExt32 : ImmLeaf<i64, [{ return (uint64_t)Imm == (uint32_t)Imm; }]>;
+
+def i64immZExt32SExt8 : ImmLeaf<i64, [{
+ return (uint64_t)Imm == (uint32_t)Imm && (int32_t)Imm == (int8_t)Imm;
+}]>;
+
+// Helper fragments for loads.
+// It's always safe to treat a anyext i16 load as a i32 load if the i16 is
+// known to be 32-bit aligned or better. Ditto for i8 to i16.
+def loadi16 : PatFrag<(ops node:$ptr), (i16 (unindexedload node:$ptr)), [{
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+ ISD::LoadExtType ExtType = LD->getExtensionType();
+ if (ExtType == ISD::NON_EXTLOAD)
+ return true;
+ if (ExtType == ISD::EXTLOAD)
+ return LD->getAlignment() >= 2 && !LD->isVolatile();
+ return false;
+}]>;
+
+def loadi16_anyext : PatFrag<(ops node:$ptr), (i32 (unindexedload node:$ptr)),[{
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+ ISD::LoadExtType ExtType = LD->getExtensionType();
+ if (ExtType == ISD::EXTLOAD)
+ return LD->getAlignment() >= 2 && !LD->isVolatile();
+ return false;
+}]>;
+
+def loadi32 : PatFrag<(ops node:$ptr), (i32 (unindexedload node:$ptr)), [{
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+ ISD::LoadExtType ExtType = LD->getExtensionType();
+ if (ExtType == ISD::NON_EXTLOAD)
+ return true;
+ if (ExtType == ISD::EXTLOAD)
+ return LD->getAlignment() >= 4 && !LD->isVolatile();
+ return false;
+}]>;
+
+def loadi8 : PatFrag<(ops node:$ptr), (i8 (load node:$ptr))>;
+def loadi64 : PatFrag<(ops node:$ptr), (i64 (load node:$ptr))>;
+def loadf32 : PatFrag<(ops node:$ptr), (f32 (load node:$ptr))>;
+def loadf64 : PatFrag<(ops node:$ptr), (f64 (load node:$ptr))>;
+def loadf80 : PatFrag<(ops node:$ptr), (f80 (load node:$ptr))>;
+
+def sextloadi16i8 : PatFrag<(ops node:$ptr), (i16 (sextloadi8 node:$ptr))>;
+def sextloadi32i8 : PatFrag<(ops node:$ptr), (i32 (sextloadi8 node:$ptr))>;
+def sextloadi32i16 : PatFrag<(ops node:$ptr), (i32 (sextloadi16 node:$ptr))>;
+def sextloadi64i8 : PatFrag<(ops node:$ptr), (i64 (sextloadi8 node:$ptr))>;
+def sextloadi64i16 : PatFrag<(ops node:$ptr), (i64 (sextloadi16 node:$ptr))>;
+def sextloadi64i32 : PatFrag<(ops node:$ptr), (i64 (sextloadi32 node:$ptr))>;
+
+def zextloadi8i1 : PatFrag<(ops node:$ptr), (i8 (zextloadi1 node:$ptr))>;
+def zextloadi16i1 : PatFrag<(ops node:$ptr), (i16 (zextloadi1 node:$ptr))>;
+def zextloadi32i1 : PatFrag<(ops node:$ptr), (i32 (zextloadi1 node:$ptr))>;
+def zextloadi16i8 : PatFrag<(ops node:$ptr), (i16 (zextloadi8 node:$ptr))>;
+def zextloadi32i8 : PatFrag<(ops node:$ptr), (i32 (zextloadi8 node:$ptr))>;
+def zextloadi32i16 : PatFrag<(ops node:$ptr), (i32 (zextloadi16 node:$ptr))>;
+def zextloadi64i1 : PatFrag<(ops node:$ptr), (i64 (zextloadi1 node:$ptr))>;
+def zextloadi64i8 : PatFrag<(ops node:$ptr), (i64 (zextloadi8 node:$ptr))>;
+def zextloadi64i16 : PatFrag<(ops node:$ptr), (i64 (zextloadi16 node:$ptr))>;
+def zextloadi64i32 : PatFrag<(ops node:$ptr), (i64 (zextloadi32 node:$ptr))>;
+
+def extloadi8i1 : PatFrag<(ops node:$ptr), (i8 (extloadi1 node:$ptr))>;
+def extloadi16i1 : PatFrag<(ops node:$ptr), (i16 (extloadi1 node:$ptr))>;
+def extloadi32i1 : PatFrag<(ops node:$ptr), (i32 (extloadi1 node:$ptr))>;
+def extloadi16i8 : PatFrag<(ops node:$ptr), (i16 (extloadi8 node:$ptr))>;
+def extloadi32i8 : PatFrag<(ops node:$ptr), (i32 (extloadi8 node:$ptr))>;
+def extloadi32i16 : PatFrag<(ops node:$ptr), (i32 (extloadi16 node:$ptr))>;
+def extloadi64i1 : PatFrag<(ops node:$ptr), (i64 (extloadi1 node:$ptr))>;
+def extloadi64i8 : PatFrag<(ops node:$ptr), (i64 (extloadi8 node:$ptr))>;
+def extloadi64i16 : PatFrag<(ops node:$ptr), (i64 (extloadi16 node:$ptr))>;
+def extloadi64i32 : PatFrag<(ops node:$ptr), (i64 (extloadi32 node:$ptr))>;
+
+
+// An 'and' node with a single use.
+def and_su : PatFrag<(ops node:$lhs, node:$rhs), (and node:$lhs, node:$rhs), [{
+ return N->hasOneUse();
+}]>;
+// An 'srl' node with a single use.
+def srl_su : PatFrag<(ops node:$lhs, node:$rhs), (srl node:$lhs, node:$rhs), [{
+ return N->hasOneUse();
+}]>;
+// An 'trunc' node with a single use.
+def trunc_su : PatFrag<(ops node:$src), (trunc node:$src), [{
+ return N->hasOneUse();
+}]>;
+
+//===----------------------------------------------------------------------===//
+// Instruction list.
+//
+
+// Nop
+let neverHasSideEffects = 1, SchedRW = [WriteZero] in {
+ def NOOP : I<0x90, RawFrm, (outs), (ins), "nop", [], IIC_NOP>;
+ def NOOPW : I<0x1f, MRM0m, (outs), (ins i16mem:$zero),
+ "nop{w}\t$zero", [], IIC_NOP>, TB, OpSize;
+ def NOOPL : I<0x1f, MRM0m, (outs), (ins i32mem:$zero),
+ "nop{l}\t$zero", [], IIC_NOP>, TB;
+}
+
+
+// Constructing a stack frame.
+def ENTER : Ii16<0xC8, RawFrmImm8, (outs), (ins i16imm:$len, i8imm:$lvl),
+ "enter\t$len, $lvl", [], IIC_ENTER>, Sched<[WriteMicrocoded]>;
+
+let SchedRW = [WriteALU] in {
+let Defs = [EBP, ESP], Uses = [EBP, ESP], mayLoad = 1, neverHasSideEffects=1 in
+def LEAVE : I<0xC9, RawFrm,
+ (outs), (ins), "leave", [], IIC_LEAVE>,
+ Requires<[In32BitMode]>;
+
+let Defs = [RBP,RSP], Uses = [RBP,RSP], mayLoad = 1, neverHasSideEffects = 1 in
+def LEAVE64 : I<0xC9, RawFrm,
+ (outs), (ins), "leave", [], IIC_LEAVE>,
+ Requires<[In64BitMode]>;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Miscellaneous Instructions.
+//
+
+let Defs = [ESP], Uses = [ESP], neverHasSideEffects=1 in {
+let mayLoad = 1, SchedRW = [WriteLoad] in {
+def POP16r : I<0x58, AddRegFrm, (outs GR16:$reg), (ins), "pop{w}\t$reg", [],
+ IIC_POP_REG16>, OpSize;
+def POP32r : I<0x58, AddRegFrm, (outs GR32:$reg), (ins), "pop{l}\t$reg", [],
+ IIC_POP_REG>;
+def POP16rmr: I<0x8F, MRM0r, (outs GR16:$reg), (ins), "pop{w}\t$reg", [],
+ IIC_POP_REG>, OpSize;
+def POP16rmm: I<0x8F, MRM0m, (outs i16mem:$dst), (ins), "pop{w}\t$dst", [],
+ IIC_POP_MEM>, OpSize;
+def POP32rmr: I<0x8F, MRM0r, (outs GR32:$reg), (ins), "pop{l}\t$reg", [],
+ IIC_POP_REG>;
+def POP32rmm: I<0x8F, MRM0m, (outs i32mem:$dst), (ins), "pop{l}\t$dst", [],
+ IIC_POP_MEM>;
+
+def POPF16 : I<0x9D, RawFrm, (outs), (ins), "popf{w}", [], IIC_POP_F>, OpSize;
+def POPF32 : I<0x9D, RawFrm, (outs), (ins), "popf{l|d}", [], IIC_POP_FD>,
+ Requires<[In32BitMode]>;
+} // mayLoad, SchedRW
+
+let mayStore = 1, SchedRW = [WriteStore] in {
+def PUSH16r : I<0x50, AddRegFrm, (outs), (ins GR16:$reg), "push{w}\t$reg",[],
+ IIC_PUSH_REG>, OpSize;
+def PUSH32r : I<0x50, AddRegFrm, (outs), (ins GR32:$reg), "push{l}\t$reg",[],
+ IIC_PUSH_REG>;
+def PUSH16rmr: I<0xFF, MRM6r, (outs), (ins GR16:$reg), "push{w}\t$reg",[],
+ IIC_PUSH_REG>, OpSize;
+def PUSH16rmm: I<0xFF, MRM6m, (outs), (ins i16mem:$src), "push{w}\t$src",[],
+ IIC_PUSH_MEM>,
+ OpSize;
+def PUSH32rmr: I<0xFF, MRM6r, (outs), (ins GR32:$reg), "push{l}\t$reg",[],
+ IIC_PUSH_REG>;
+def PUSH32rmm: I<0xFF, MRM6m, (outs), (ins i32mem:$src), "push{l}\t$src",[],
+ IIC_PUSH_MEM>;
+
+def PUSHi8 : Ii8<0x6a, RawFrm, (outs), (ins i32i8imm:$imm),
+ "push{l}\t$imm", [], IIC_PUSH_IMM>;
+def PUSHi16 : Ii16<0x68, RawFrm, (outs), (ins i16imm:$imm),
+ "push{w}\t$imm", [], IIC_PUSH_IMM>, OpSize;
+def PUSHi32 : Ii32<0x68, RawFrm, (outs), (ins i32imm:$imm),
+ "push{l}\t$imm", [], IIC_PUSH_IMM>;
+
+def PUSHF16 : I<0x9C, RawFrm, (outs), (ins), "pushf{w}", [], IIC_PUSH_F>,
+ OpSize;
+def PUSHF32 : I<0x9C, RawFrm, (outs), (ins), "pushf{l|d}", [], IIC_PUSH_F>,
+ Requires<[In32BitMode]>;
+
+} // mayStore, SchedRW
+}
+
+let Defs = [RSP], Uses = [RSP], neverHasSideEffects=1 in {
+let mayLoad = 1, SchedRW = [WriteLoad] in {
+def POP64r : I<0x58, AddRegFrm,
+ (outs GR64:$reg), (ins), "pop{q}\t$reg", [], IIC_POP_REG>;
+def POP64rmr: I<0x8F, MRM0r, (outs GR64:$reg), (ins), "pop{q}\t$reg", [],
+ IIC_POP_REG>;
+def POP64rmm: I<0x8F, MRM0m, (outs i64mem:$dst), (ins), "pop{q}\t$dst", [],
+ IIC_POP_MEM>;
+} // mayLoad, SchedRW
+let mayStore = 1, SchedRW = [WriteStore] in {
+def PUSH64r : I<0x50, AddRegFrm,
+ (outs), (ins GR64:$reg), "push{q}\t$reg", [], IIC_PUSH_REG>;
+def PUSH64rmr: I<0xFF, MRM6r, (outs), (ins GR64:$reg), "push{q}\t$reg", [],
+ IIC_PUSH_REG>;
+def PUSH64rmm: I<0xFF, MRM6m, (outs), (ins i64mem:$src), "push{q}\t$src", [],
+ IIC_PUSH_MEM>;
+} // mayStore, SchedRW
+}
+
+let Defs = [RSP], Uses = [RSP], neverHasSideEffects = 1, mayStore = 1,
+ SchedRW = [WriteStore] in {
+def PUSH64i8 : Ii8<0x6a, RawFrm, (outs), (ins i64i8imm:$imm),
+ "push{q}\t$imm", [], IIC_PUSH_IMM>;
+def PUSH64i16 : Ii16<0x68, RawFrm, (outs), (ins i16imm:$imm),
+ "push{q}\t$imm", [], IIC_PUSH_IMM>;
+def PUSH64i32 : Ii32<0x68, RawFrm, (outs), (ins i64i32imm:$imm),
+ "push{q}\t$imm", [], IIC_PUSH_IMM>;
+}
+
+let Defs = [RSP, EFLAGS], Uses = [RSP], mayLoad = 1, neverHasSideEffects=1 in
+def POPF64 : I<0x9D, RawFrm, (outs), (ins), "popfq", [], IIC_POP_FD>,
+ Requires<[In64BitMode]>, Sched<[WriteLoad]>;
+let Defs = [RSP], Uses = [RSP, EFLAGS], mayStore = 1, neverHasSideEffects=1 in
+def PUSHF64 : I<0x9C, RawFrm, (outs), (ins), "pushfq", [], IIC_PUSH_F>,
+ Requires<[In64BitMode]>, Sched<[WriteStore]>;
+
+let Defs = [EDI, ESI, EBP, EBX, EDX, ECX, EAX, ESP], Uses = [ESP],
+ mayLoad = 1, neverHasSideEffects = 1, SchedRW = [WriteLoad] in {
+def POPA32 : I<0x61, RawFrm, (outs), (ins), "popa{l|d}", [], IIC_POP_A>,
+ Requires<[In32BitMode]>;
+}
+let Defs = [ESP], Uses = [EDI, ESI, EBP, EBX, EDX, ECX, EAX, ESP],
+ mayStore = 1, neverHasSideEffects = 1, SchedRW = [WriteStore] in {
+def PUSHA32 : I<0x60, RawFrm, (outs), (ins), "pusha{l|d}", [], IIC_PUSH_A>,
+ Requires<[In32BitMode]>;
+}
+
+let Constraints = "$src = $dst", SchedRW = [WriteALU] in {
+// GR32 = bswap GR32
+def BSWAP32r : I<0xC8, AddRegFrm,
+ (outs GR32:$dst), (ins GR32:$src),
+ "bswap{l}\t$dst",
+ [(set GR32:$dst, (bswap GR32:$src))], IIC_BSWAP>, TB;
+
+def BSWAP64r : RI<0xC8, AddRegFrm, (outs GR64:$dst), (ins GR64:$src),
+ "bswap{q}\t$dst",
+ [(set GR64:$dst, (bswap GR64:$src))], IIC_BSWAP>, TB;
+} // Constraints = "$src = $dst", SchedRW
+
+// Bit scan instructions.
+let Defs = [EFLAGS] in {
+def BSF16rr : I<0xBC, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+ "bsf{w}\t{$src, $dst|$dst, $src}",
+ [(set GR16:$dst, EFLAGS, (X86bsf GR16:$src))],
+ IIC_BSF>, TB, OpSize, Sched<[WriteShift]>;
+def BSF16rm : I<0xBC, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+ "bsf{w}\t{$src, $dst|$dst, $src}",
+ [(set GR16:$dst, EFLAGS, (X86bsf (loadi16 addr:$src)))],
+ IIC_BSF>, TB, OpSize, Sched<[WriteShiftLd]>;
+def BSF32rr : I<0xBC, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+ "bsf{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, EFLAGS, (X86bsf GR32:$src))], IIC_BSF>, TB,
+ Sched<[WriteShift]>;
+def BSF32rm : I<0xBC, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+ "bsf{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, EFLAGS, (X86bsf (loadi32 addr:$src)))],
+ IIC_BSF>, TB, Sched<[WriteShiftLd]>;
+def BSF64rr : RI<0xBC, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+ "bsf{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, EFLAGS, (X86bsf GR64:$src))],
+ IIC_BSF>, TB, Sched<[WriteShift]>;
+def BSF64rm : RI<0xBC, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+ "bsf{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, EFLAGS, (X86bsf (loadi64 addr:$src)))],
+ IIC_BSF>, TB, Sched<[WriteShiftLd]>;
+
+def BSR16rr : I<0xBD, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+ "bsr{w}\t{$src, $dst|$dst, $src}",
+ [(set GR16:$dst, EFLAGS, (X86bsr GR16:$src))], IIC_BSR>,
+ TB, OpSize, Sched<[WriteShift]>;
+def BSR16rm : I<0xBD, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+ "bsr{w}\t{$src, $dst|$dst, $src}",
+ [(set GR16:$dst, EFLAGS, (X86bsr (loadi16 addr:$src)))],
+ IIC_BSR>, TB,
+ OpSize, Sched<[WriteShiftLd]>;
+def BSR32rr : I<0xBD, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+ "bsr{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, EFLAGS, (X86bsr GR32:$src))], IIC_BSR>, TB,
+ Sched<[WriteShift]>;
+def BSR32rm : I<0xBD, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+ "bsr{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, EFLAGS, (X86bsr (loadi32 addr:$src)))],
+ IIC_BSR>, TB, Sched<[WriteShiftLd]>;
+def BSR64rr : RI<0xBD, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+ "bsr{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, EFLAGS, (X86bsr GR64:$src))], IIC_BSR>, TB,
+ Sched<[WriteShift]>;
+def BSR64rm : RI<0xBD, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+ "bsr{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, EFLAGS, (X86bsr (loadi64 addr:$src)))],
+ IIC_BSR>, TB, Sched<[WriteShiftLd]>;
+} // Defs = [EFLAGS]
+
+let SchedRW = [WriteMicrocoded] in {
+// These uses the DF flag in the EFLAGS register to inc or dec EDI and ESI
+let Defs = [EDI,ESI], Uses = [EDI,ESI,EFLAGS] in {
+def MOVSB : I<0xA4, RawFrm, (outs), (ins), "movsb", [], IIC_MOVS>;
+def MOVSW : I<0xA5, RawFrm, (outs), (ins), "movsw", [], IIC_MOVS>, OpSize;
+def MOVSD : I<0xA5, RawFrm, (outs), (ins), "movs{l|d}", [], IIC_MOVS>;
+def MOVSQ : RI<0xA5, RawFrm, (outs), (ins), "movsq", [], IIC_MOVS>;
+}
+
+// These uses the DF flag in the EFLAGS register to inc or dec EDI and ESI
+let Defs = [EDI], Uses = [AL,EDI,EFLAGS] in
+def STOSB : I<0xAA, RawFrm, (outs), (ins), "stosb", [], IIC_STOS>;
+let Defs = [EDI], Uses = [AX,EDI,EFLAGS] in
+def STOSW : I<0xAB, RawFrm, (outs), (ins), "stosw", [], IIC_STOS>, OpSize;
+let Defs = [EDI], Uses = [EAX,EDI,EFLAGS] in
+def STOSD : I<0xAB, RawFrm, (outs), (ins), "stos{l|d}", [], IIC_STOS>;
+let Defs = [RCX,RDI], Uses = [RAX,RCX,RDI,EFLAGS] in
+def STOSQ : RI<0xAB, RawFrm, (outs), (ins), "stosq", [], IIC_STOS>;
+
+def SCAS8 : I<0xAE, RawFrm, (outs), (ins), "scasb", [], IIC_SCAS>;
+def SCAS16 : I<0xAF, RawFrm, (outs), (ins), "scasw", [], IIC_SCAS>, OpSize;
+def SCAS32 : I<0xAF, RawFrm, (outs), (ins), "scas{l|d}", [], IIC_SCAS>;
+def SCAS64 : RI<0xAF, RawFrm, (outs), (ins), "scasq", [], IIC_SCAS>;
+
+def CMPS8 : I<0xA6, RawFrm, (outs), (ins), "cmpsb", [], IIC_CMPS>;
+def CMPS16 : I<0xA7, RawFrm, (outs), (ins), "cmpsw", [], IIC_CMPS>, OpSize;
+def CMPS32 : I<0xA7, RawFrm, (outs), (ins), "cmps{l|d}", [], IIC_CMPS>;
+def CMPS64 : RI<0xA7, RawFrm, (outs), (ins), "cmpsq", [], IIC_CMPS>;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Move Instructions.
+//
+let SchedRW = [WriteMove] in {
+let neverHasSideEffects = 1 in {
+def MOV8rr : I<0x88, MRMDestReg, (outs GR8 :$dst), (ins GR8 :$src),
+ "mov{b}\t{$src, $dst|$dst, $src}", [], IIC_MOV>;
+def MOV16rr : I<0x89, MRMDestReg, (outs GR16:$dst), (ins GR16:$src),
+ "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize;
+def MOV32rr : I<0x89, MRMDestReg, (outs GR32:$dst), (ins GR32:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV>;
+def MOV64rr : RI<0x89, MRMDestReg, (outs GR64:$dst), (ins GR64:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV>;
+}
+
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
+def MOV8ri : Ii8 <0xB0, AddRegFrm, (outs GR8 :$dst), (ins i8imm :$src),
+ "mov{b}\t{$src, $dst|$dst, $src}",
+ [(set GR8:$dst, imm:$src)], IIC_MOV>;
+def MOV16ri : Ii16<0xB8, AddRegFrm, (outs GR16:$dst), (ins i16imm:$src),
+ "mov{w}\t{$src, $dst|$dst, $src}",
+ [(set GR16:$dst, imm:$src)], IIC_MOV>, OpSize;
+def MOV32ri : Ii32<0xB8, AddRegFrm, (outs GR32:$dst), (ins i32imm:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, imm:$src)], IIC_MOV>;
+def MOV64ri : RIi64<0xB8, AddRegFrm, (outs GR64:$dst), (ins i64imm:$src),
+ "movabs{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, imm:$src)], IIC_MOV>;
+def MOV64ri32 : RIi32<0xC7, MRM0r, (outs GR64:$dst), (ins i64i32imm:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, i64immSExt32:$src)], IIC_MOV>;
+}
+} // SchedRW
+
+let SchedRW = [WriteStore] in {
+def MOV8mi : Ii8 <0xC6, MRM0m, (outs), (ins i8mem :$dst, i8imm :$src),
+ "mov{b}\t{$src, $dst|$dst, $src}",
+ [(store (i8 imm:$src), addr:$dst)], IIC_MOV_MEM>;
+def MOV16mi : Ii16<0xC7, MRM0m, (outs), (ins i16mem:$dst, i16imm:$src),
+ "mov{w}\t{$src, $dst|$dst, $src}",
+ [(store (i16 imm:$src), addr:$dst)], IIC_MOV_MEM>, OpSize;
+def MOV32mi : Ii32<0xC7, MRM0m, (outs), (ins i32mem:$dst, i32imm:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}",
+ [(store (i32 imm:$src), addr:$dst)], IIC_MOV_MEM>;
+def MOV64mi32 : RIi32<0xC7, MRM0m, (outs), (ins i64mem:$dst, i64i32imm:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}",
+ [(store i64immSExt32:$src, addr:$dst)], IIC_MOV_MEM>;
+} // SchedRW
+
+/// moffs8, moffs16 and moffs32 versions of moves. The immediate is a
+/// 32-bit offset from the PC. These are only valid in x86-32 mode.
+let SchedRW = [WriteALU] in {
+def MOV8o8a : Ii32 <0xA0, RawFrm, (outs), (ins offset8:$src),
+ "mov{b}\t{$src, %al|AL, $src}", [], IIC_MOV_MEM>,
+ Requires<[In32BitMode]>;
+def MOV16o16a : Ii32 <0xA1, RawFrm, (outs), (ins offset16:$src),
+ "mov{w}\t{$src, %ax|AL, $src}", [], IIC_MOV_MEM>, OpSize,
+ Requires<[In32BitMode]>;
+def MOV32o32a : Ii32 <0xA1, RawFrm, (outs), (ins offset32:$src),
+ "mov{l}\t{$src, %eax|EAX, $src}", [], IIC_MOV_MEM>,
+ Requires<[In32BitMode]>;
+def MOV8ao8 : Ii32 <0xA2, RawFrm, (outs offset8:$dst), (ins),
+ "mov{b}\t{%al, $dst|$dst, AL}", [], IIC_MOV_MEM>,
+ Requires<[In32BitMode]>;
+def MOV16ao16 : Ii32 <0xA3, RawFrm, (outs offset16:$dst), (ins),
+ "mov{w}\t{%ax, $dst|$dst, AL}", [], IIC_MOV_MEM>, OpSize,
+ Requires<[In32BitMode]>;
+def MOV32ao32 : Ii32 <0xA3, RawFrm, (outs offset32:$dst), (ins),
+ "mov{l}\t{%eax, $dst|$dst, EAX}", [], IIC_MOV_MEM>,
+ Requires<[In32BitMode]>;
+}
+
+// FIXME: These definitions are utterly broken
+// Just leave them commented out for now because they're useless outside
+// of the large code model, and most compilers won't generate the instructions
+// in question.
+/*
+def MOV64o8a : RIi8<0xA0, RawFrm, (outs), (ins offset8:$src),
+ "mov{q}\t{$src, %rax|RAX, $src}", []>;
+def MOV64o64a : RIi32<0xA1, RawFrm, (outs), (ins offset64:$src),
+ "mov{q}\t{$src, %rax|RAX, $src}", []>;
+def MOV64ao8 : RIi8<0xA2, RawFrm, (outs offset8:$dst), (ins),
+ "mov{q}\t{%rax, $dst|$dst, RAX}", []>;
+def MOV64ao64 : RIi32<0xA3, RawFrm, (outs offset64:$dst), (ins),
+ "mov{q}\t{%rax, $dst|$dst, RAX}", []>;
+*/
+
+
+let isCodeGenOnly = 1, hasSideEffects = 0, SchedRW = [WriteMove] in {
+def MOV8rr_REV : I<0x8A, MRMSrcReg, (outs GR8:$dst), (ins GR8:$src),
+ "mov{b}\t{$src, $dst|$dst, $src}", [], IIC_MOV>;
+def MOV16rr_REV : I<0x8B, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+ "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize;
+def MOV32rr_REV : I<0x8B, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV>;
+def MOV64rr_REV : RI<0x8B, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV>;
+}
+
+let canFoldAsLoad = 1, isReMaterializable = 1, SchedRW = [WriteLoad] in {
+def MOV8rm : I<0x8A, MRMSrcMem, (outs GR8 :$dst), (ins i8mem :$src),
+ "mov{b}\t{$src, $dst|$dst, $src}",
+ [(set GR8:$dst, (loadi8 addr:$src))], IIC_MOV_MEM>;
+def MOV16rm : I<0x8B, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+ "mov{w}\t{$src, $dst|$dst, $src}",
+ [(set GR16:$dst, (loadi16 addr:$src))], IIC_MOV_MEM>, OpSize;
+def MOV32rm : I<0x8B, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (loadi32 addr:$src))], IIC_MOV_MEM>;
+def MOV64rm : RI<0x8B, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (load addr:$src))], IIC_MOV_MEM>;
+}
+
+let SchedRW = [WriteStore] in {
+def MOV8mr : I<0x88, MRMDestMem, (outs), (ins i8mem :$dst, GR8 :$src),
+ "mov{b}\t{$src, $dst|$dst, $src}",
+ [(store GR8:$src, addr:$dst)], IIC_MOV_MEM>;
+def MOV16mr : I<0x89, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
+ "mov{w}\t{$src, $dst|$dst, $src}",
+ [(store GR16:$src, addr:$dst)], IIC_MOV_MEM>, OpSize;
+def MOV32mr : I<0x89, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}",
+ [(store GR32:$src, addr:$dst)], IIC_MOV_MEM>;
+def MOV64mr : RI<0x89, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}",
+ [(store GR64:$src, addr:$dst)], IIC_MOV_MEM>;
+} // SchedRW
+
+// Versions of MOV8rr, MOV8mr, and MOV8rm that use i8mem_NOREX and GR8_NOREX so
+// that they can be used for copying and storing h registers, which can't be
+// encoded when a REX prefix is present.
+let isCodeGenOnly = 1 in {
+let neverHasSideEffects = 1 in
+def MOV8rr_NOREX : I<0x88, MRMDestReg,
+ (outs GR8_NOREX:$dst), (ins GR8_NOREX:$src),
+ "mov{b}\t{$src, $dst|$dst, $src} # NOREX", [], IIC_MOV>,
+ Sched<[WriteMove]>;
+let mayStore = 1 in
+def MOV8mr_NOREX : I<0x88, MRMDestMem,
+ (outs), (ins i8mem_NOREX:$dst, GR8_NOREX:$src),
+ "mov{b}\t{$src, $dst|$dst, $src} # NOREX", [],
+ IIC_MOV_MEM>, Sched<[WriteStore]>;
+let mayLoad = 1, neverHasSideEffects = 1,
+ canFoldAsLoad = 1, isReMaterializable = 1 in
+def MOV8rm_NOREX : I<0x8A, MRMSrcMem,
+ (outs GR8_NOREX:$dst), (ins i8mem_NOREX:$src),
+ "mov{b}\t{$src, $dst|$dst, $src} # NOREX", [],
+ IIC_MOV_MEM>, Sched<[WriteLoad]>;
+}
+
+
+// Condition code ops, incl. set if equal/not equal/...
+let SchedRW = [WriteALU] in {
+let Defs = [EFLAGS], Uses = [AH] in
+def SAHF : I<0x9E, RawFrm, (outs), (ins), "sahf",
+ [(set EFLAGS, (X86sahf AH))], IIC_AHF>;
+let Defs = [AH], Uses = [EFLAGS], neverHasSideEffects = 1 in
+def LAHF : I<0x9F, RawFrm, (outs), (ins), "lahf", [],
+ IIC_AHF>; // AH = flags
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Bit tests instructions: BT, BTS, BTR, BTC.
+
+let Defs = [EFLAGS] in {
+let SchedRW = [WriteALU] in {
+def BT16rr : I<0xA3, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2),
+ "bt{w}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86bt GR16:$src1, GR16:$src2))], IIC_BT_RR>,
+ OpSize, TB;
+def BT32rr : I<0xA3, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2),
+ "bt{l}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86bt GR32:$src1, GR32:$src2))], IIC_BT_RR>, TB;
+def BT64rr : RI<0xA3, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2),
+ "bt{q}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86bt GR64:$src1, GR64:$src2))], IIC_BT_RR>, TB;
+} // SchedRW
+
+// Unlike with the register+register form, the memory+register form of the
+// bt instruction does not ignore the high bits of the index. From ISel's
+// perspective, this is pretty bizarre. Make these instructions disassembly
+// only for now.
+
+let mayLoad = 1, hasSideEffects = 0, SchedRW = [WriteALULd] in {
+ def BT16mr : I<0xA3, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2),
+ "bt{w}\t{$src2, $src1|$src1, $src2}",
+ // [(X86bt (loadi16 addr:$src1), GR16:$src2),
+ // (implicit EFLAGS)]
+ [], IIC_BT_MR
+ >, OpSize, TB, Requires<[FastBTMem]>;
+ def BT32mr : I<0xA3, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2),
+ "bt{l}\t{$src2, $src1|$src1, $src2}",
+ // [(X86bt (loadi32 addr:$src1), GR32:$src2),
+ // (implicit EFLAGS)]
+ [], IIC_BT_MR
+ >, TB, Requires<[FastBTMem]>;
+ def BT64mr : RI<0xA3, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
+ "bt{q}\t{$src2, $src1|$src1, $src2}",
+ // [(X86bt (loadi64 addr:$src1), GR64:$src2),
+ // (implicit EFLAGS)]
+ [], IIC_BT_MR
+ >, TB;
+}
+
+let SchedRW = [WriteALU] in {
+def BT16ri8 : Ii8<0xBA, MRM4r, (outs), (ins GR16:$src1, i16i8imm:$src2),
+ "bt{w}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86bt GR16:$src1, i16immSExt8:$src2))],
+ IIC_BT_RI>, OpSize, TB;
+def BT32ri8 : Ii8<0xBA, MRM4r, (outs), (ins GR32:$src1, i32i8imm:$src2),
+ "bt{l}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86bt GR32:$src1, i32immSExt8:$src2))],
+ IIC_BT_RI>, TB;
+def BT64ri8 : RIi8<0xBA, MRM4r, (outs), (ins GR64:$src1, i64i8imm:$src2),
+ "bt{q}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86bt GR64:$src1, i64immSExt8:$src2))],
+ IIC_BT_RI>, TB;
+} // SchedRW
+
+// Note that these instructions don't need FastBTMem because that
+// only applies when the other operand is in a register. When it's
+// an immediate, bt is still fast.
+let SchedRW = [WriteALU] in {
+def BT16mi8 : Ii8<0xBA, MRM4m, (outs), (ins i16mem:$src1, i16i8imm:$src2),
+ "bt{w}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86bt (loadi16 addr:$src1), i16immSExt8:$src2))
+ ], IIC_BT_MI>, OpSize, TB;
+def BT32mi8 : Ii8<0xBA, MRM4m, (outs), (ins i32mem:$src1, i32i8imm:$src2),
+ "bt{l}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86bt (loadi32 addr:$src1), i32immSExt8:$src2))
+ ], IIC_BT_MI>, TB;
+def BT64mi8 : RIi8<0xBA, MRM4m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
+ "bt{q}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86bt (loadi64 addr:$src1),
+ i64immSExt8:$src2))], IIC_BT_MI>, TB;
+} // SchedRW
+
+let hasSideEffects = 0 in {
+let SchedRW = [WriteALU] in {
+def BTC16rr : I<0xBB, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2),
+ "btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>,
+ OpSize, TB;
+def BTC32rr : I<0xBB, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2),
+ "btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB;
+def BTC64rr : RI<0xBB, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2),
+ "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB;
+} // SchedRW
+
+let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in {
+def BTC16mr : I<0xBB, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2),
+ "btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>,
+ OpSize, TB;
+def BTC32mr : I<0xBB, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2),
+ "btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB;
+def BTC64mr : RI<0xBB, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
+ "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB;
+}
+
+let SchedRW = [WriteALU] in {
+def BTC16ri8 : Ii8<0xBA, MRM7r, (outs), (ins GR16:$src1, i16i8imm:$src2),
+ "btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>,
+ OpSize, TB;
+def BTC32ri8 : Ii8<0xBA, MRM7r, (outs), (ins GR32:$src1, i32i8imm:$src2),
+ "btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB;
+def BTC64ri8 : RIi8<0xBA, MRM7r, (outs), (ins GR64:$src1, i64i8imm:$src2),
+ "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB;
+} // SchedRW
+
+let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in {
+def BTC16mi8 : Ii8<0xBA, MRM7m, (outs), (ins i16mem:$src1, i16i8imm:$src2),
+ "btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>,
+ OpSize, TB;
+def BTC32mi8 : Ii8<0xBA, MRM7m, (outs), (ins i32mem:$src1, i32i8imm:$src2),
+ "btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB;
+def BTC64mi8 : RIi8<0xBA, MRM7m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
+ "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB;
+}
+
+let SchedRW = [WriteALU] in {
+def BTR16rr : I<0xB3, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2),
+ "btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>,
+ OpSize, TB;
+def BTR32rr : I<0xB3, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2),
+ "btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB;
+def BTR64rr : RI<0xB3, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2),
+ "btr{q}\t{$src2, $src1|$src1, $src2}", []>, TB;
+} // SchedRW
+
+let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in {
+def BTR16mr : I<0xB3, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2),
+ "btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>,
+ OpSize, TB;
+def BTR32mr : I<0xB3, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2),
+ "btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB;
+def BTR64mr : RI<0xB3, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
+ "btr{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB;
+}
+
+let SchedRW = [WriteALU] in {
+def BTR16ri8 : Ii8<0xBA, MRM6r, (outs), (ins GR16:$src1, i16i8imm:$src2),
+ "btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>,
+ OpSize, TB;
+def BTR32ri8 : Ii8<0xBA, MRM6r, (outs), (ins GR32:$src1, i32i8imm:$src2),
+ "btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB;
+def BTR64ri8 : RIi8<0xBA, MRM6r, (outs), (ins GR64:$src1, i64i8imm:$src2),
+ "btr{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB;
+} // SchedRW
+
+let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in {
+def BTR16mi8 : Ii8<0xBA, MRM6m, (outs), (ins i16mem:$src1, i16i8imm:$src2),
+ "btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>,
+ OpSize, TB;
+def BTR32mi8 : Ii8<0xBA, MRM6m, (outs), (ins i32mem:$src1, i32i8imm:$src2),
+ "btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB;
+def BTR64mi8 : RIi8<0xBA, MRM6m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
+ "btr{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB;
+}
+
+let SchedRW = [WriteALU] in {
+def BTS16rr : I<0xAB, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2),
+ "bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>,
+ OpSize, TB;
+def BTS32rr : I<0xAB, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2),
+ "bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB;
+def BTS64rr : RI<0xAB, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2),
+ "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB;
+} // SchedRW
+
+let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in {
+def BTS16mr : I<0xAB, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2),
+ "bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>,
+ OpSize, TB;
+def BTS32mr : I<0xAB, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2),
+ "bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB;
+def BTS64mr : RI<0xAB, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
+ "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB;
+}
+
+let SchedRW = [WriteALU] in {
+def BTS16ri8 : Ii8<0xBA, MRM5r, (outs), (ins GR16:$src1, i16i8imm:$src2),
+ "bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>,
+ OpSize, TB;
+def BTS32ri8 : Ii8<0xBA, MRM5r, (outs), (ins GR32:$src1, i32i8imm:$src2),
+ "bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB;
+def BTS64ri8 : RIi8<0xBA, MRM5r, (outs), (ins GR64:$src1, i64i8imm:$src2),
+ "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB;
+} // SchedRW
+
+let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in {
+def BTS16mi8 : Ii8<0xBA, MRM5m, (outs), (ins i16mem:$src1, i16i8imm:$src2),
+ "bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>,
+ OpSize, TB;
+def BTS32mi8 : Ii8<0xBA, MRM5m, (outs), (ins i32mem:$src1, i32i8imm:$src2),
+ "bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB;
+def BTS64mi8 : RIi8<0xBA, MRM5m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
+ "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB;
+}
+} // hasSideEffects = 0
+} // Defs = [EFLAGS]
+
+
+//===----------------------------------------------------------------------===//
+// Atomic support
+//
+
+// Atomic swap. These are just normal xchg instructions. But since a memory
+// operand is referenced, the atomicity is ensured.
+multiclass ATOMIC_SWAP<bits<8> opc8, bits<8> opc, string mnemonic, string frag,
+ InstrItinClass itin> {
+ let Constraints = "$val = $dst", SchedRW = [WriteALULd, WriteRMW] in {
+ def NAME#8rm : I<opc8, MRMSrcMem, (outs GR8:$dst),
+ (ins GR8:$val, i8mem:$ptr),
+ !strconcat(mnemonic, "{b}\t{$val, $ptr|$ptr, $val}"),
+ [(set
+ GR8:$dst,
+ (!cast<PatFrag>(frag # "_8") addr:$ptr, GR8:$val))],
+ itin>;
+ def NAME#16rm : I<opc, MRMSrcMem, (outs GR16:$dst),
+ (ins GR16:$val, i16mem:$ptr),
+ !strconcat(mnemonic, "{w}\t{$val, $ptr|$ptr, $val}"),
+ [(set
+ GR16:$dst,
+ (!cast<PatFrag>(frag # "_16") addr:$ptr, GR16:$val))],
+ itin>, OpSize;
+ def NAME#32rm : I<opc, MRMSrcMem, (outs GR32:$dst),
+ (ins GR32:$val, i32mem:$ptr),
+ !strconcat(mnemonic, "{l}\t{$val, $ptr|$ptr, $val}"),
+ [(set
+ GR32:$dst,
+ (!cast<PatFrag>(frag # "_32") addr:$ptr, GR32:$val))],
+ itin>;
+ def NAME#64rm : RI<opc, MRMSrcMem, (outs GR64:$dst),
+ (ins GR64:$val, i64mem:$ptr),
+ !strconcat(mnemonic, "{q}\t{$val, $ptr|$ptr, $val}"),
+ [(set
+ GR64:$dst,
+ (!cast<PatFrag>(frag # "_64") addr:$ptr, GR64:$val))],
+ itin>;
+ }
+}
+
+defm XCHG : ATOMIC_SWAP<0x86, 0x87, "xchg", "atomic_swap", IIC_XCHG_MEM>;
+
+// Swap between registers.
+let SchedRW = [WriteALU] in {
+let Constraints = "$val = $dst" in {
+def XCHG8rr : I<0x86, MRMSrcReg, (outs GR8:$dst), (ins GR8:$val, GR8:$src),
+ "xchg{b}\t{$val, $src|$src, $val}", [], IIC_XCHG_REG>;
+def XCHG16rr : I<0x87, MRMSrcReg, (outs GR16:$dst), (ins GR16:$val, GR16:$src),
+ "xchg{w}\t{$val, $src|$src, $val}", [], IIC_XCHG_REG>, OpSize;
+def XCHG32rr : I<0x87, MRMSrcReg, (outs GR32:$dst), (ins GR32:$val, GR32:$src),
+ "xchg{l}\t{$val, $src|$src, $val}", [], IIC_XCHG_REG>;
+def XCHG64rr : RI<0x87, MRMSrcReg, (outs GR64:$dst), (ins GR64:$val,GR64:$src),
+ "xchg{q}\t{$val, $src|$src, $val}", [], IIC_XCHG_REG>;
+}
+
+// Swap between EAX and other registers.
+def XCHG16ar : I<0x90, AddRegFrm, (outs), (ins GR16:$src),
+ "xchg{w}\t{$src, %ax|AX, $src}", [], IIC_XCHG_REG>, OpSize;
+def XCHG32ar : I<0x90, AddRegFrm, (outs), (ins GR32:$src),
+ "xchg{l}\t{$src, %eax|EAX, $src}", [], IIC_XCHG_REG>,
+ Requires<[In32BitMode]>;
+// Uses GR32_NOAX in 64-bit mode to prevent encoding using the 0x90 NOP encoding.
+// xchg %eax, %eax needs to clear upper 32-bits of RAX so is not a NOP.
+def XCHG32ar64 : I<0x90, AddRegFrm, (outs), (ins GR32_NOAX:$src),
+ "xchg{l}\t{$src, %eax|EAX, $src}", [], IIC_XCHG_REG>,
+ Requires<[In64BitMode]>;
+def XCHG64ar : RI<0x90, AddRegFrm, (outs), (ins GR64:$src),
+ "xchg{q}\t{$src, %rax|RAX, $src}", [], IIC_XCHG_REG>;
+} // SchedRW
+
+let SchedRW = [WriteALU] in {
+def XADD8rr : I<0xC0, MRMDestReg, (outs GR8:$dst), (ins GR8:$src),
+ "xadd{b}\t{$src, $dst|$dst, $src}", [], IIC_XADD_REG>, TB;
+def XADD16rr : I<0xC1, MRMDestReg, (outs GR16:$dst), (ins GR16:$src),
+ "xadd{w}\t{$src, $dst|$dst, $src}", [], IIC_XADD_REG>, TB,
+ OpSize;
+def XADD32rr : I<0xC1, MRMDestReg, (outs GR32:$dst), (ins GR32:$src),
+ "xadd{l}\t{$src, $dst|$dst, $src}", [], IIC_XADD_REG>, TB;
+def XADD64rr : RI<0xC1, MRMDestReg, (outs GR64:$dst), (ins GR64:$src),
+ "xadd{q}\t{$src, $dst|$dst, $src}", [], IIC_XADD_REG>, TB;
+} // SchedRW
+
+let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in {
+def XADD8rm : I<0xC0, MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src),
+ "xadd{b}\t{$src, $dst|$dst, $src}", [], IIC_XADD_MEM>, TB;
+def XADD16rm : I<0xC1, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
+ "xadd{w}\t{$src, $dst|$dst, $src}", [], IIC_XADD_MEM>, TB,
+ OpSize;
+def XADD32rm : I<0xC1, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
+ "xadd{l}\t{$src, $dst|$dst, $src}", [], IIC_XADD_MEM>, TB;
+def XADD64rm : RI<0xC1, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
+ "xadd{q}\t{$src, $dst|$dst, $src}", [], IIC_XADD_MEM>, TB;
+
+}
+
+let SchedRW = [WriteALU] in {
+def CMPXCHG8rr : I<0xB0, MRMDestReg, (outs GR8:$dst), (ins GR8:$src),
+ "cmpxchg{b}\t{$src, $dst|$dst, $src}", [],
+ IIC_CMPXCHG_REG8>, TB;
+def CMPXCHG16rr : I<0xB1, MRMDestReg, (outs GR16:$dst), (ins GR16:$src),
+ "cmpxchg{w}\t{$src, $dst|$dst, $src}", [],
+ IIC_CMPXCHG_REG>, TB, OpSize;
+def CMPXCHG32rr : I<0xB1, MRMDestReg, (outs GR32:$dst), (ins GR32:$src),
+ "cmpxchg{l}\t{$src, $dst|$dst, $src}", [],
+ IIC_CMPXCHG_REG>, TB;
+def CMPXCHG64rr : RI<0xB1, MRMDestReg, (outs GR64:$dst), (ins GR64:$src),
+ "cmpxchg{q}\t{$src, $dst|$dst, $src}", [],
+ IIC_CMPXCHG_REG>, TB;
+} // SchedRW
+
+let SchedRW = [WriteALULd, WriteRMW] in {
+let mayLoad = 1, mayStore = 1 in {
+def CMPXCHG8rm : I<0xB0, MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src),
+ "cmpxchg{b}\t{$src, $dst|$dst, $src}", [],
+ IIC_CMPXCHG_MEM8>, TB;
+def CMPXCHG16rm : I<0xB1, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
+ "cmpxchg{w}\t{$src, $dst|$dst, $src}", [],
+ IIC_CMPXCHG_MEM>, TB, OpSize;
+def CMPXCHG32rm : I<0xB1, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
+ "cmpxchg{l}\t{$src, $dst|$dst, $src}", [],
+ IIC_CMPXCHG_MEM>, TB;
+def CMPXCHG64rm : RI<0xB1, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
+ "cmpxchg{q}\t{$src, $dst|$dst, $src}", [],
+ IIC_CMPXCHG_MEM>, TB;
+}
+
+let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX] in
+def CMPXCHG8B : I<0xC7, MRM1m, (outs), (ins i64mem:$dst),
+ "cmpxchg8b\t$dst", [], IIC_CMPXCHG_8B>, TB;
+
+let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX] in
+def CMPXCHG16B : RI<0xC7, MRM1m, (outs), (ins i128mem:$dst),
+ "cmpxchg16b\t$dst", [], IIC_CMPXCHG_16B>,
+ TB, Requires<[HasCmpxchg16b]>;
+} // SchedRW
+
+
+// Lock instruction prefix
+def LOCK_PREFIX : I<0xF0, RawFrm, (outs), (ins), "lock", []>;
+
+// Rex64 instruction prefix
+def REX64_PREFIX : I<0x48, RawFrm, (outs), (ins), "rex64", []>;
+
+// Data16 instruction prefix
+def DATA16_PREFIX : I<0x66, RawFrm, (outs), (ins), "data16", []>;
+
+// Repeat string operation instruction prefixes
+// These uses the DF flag in the EFLAGS register to inc or dec ECX
+let Defs = [ECX], Uses = [ECX,EFLAGS] in {
+// Repeat (used with INS, OUTS, MOVS, LODS and STOS)
+def REP_PREFIX : I<0xF3, RawFrm, (outs), (ins), "rep", []>;
+// Repeat while not equal (used with CMPS and SCAS)
+def REPNE_PREFIX : I<0xF2, RawFrm, (outs), (ins), "repne", []>;
+}
+
+
+// String manipulation instructions
+let SchedRW = [WriteMicrocoded] in {
+def LODSB : I<0xAC, RawFrm, (outs), (ins), "lodsb", [], IIC_LODS>;
+def LODSW : I<0xAD, RawFrm, (outs), (ins), "lodsw", [], IIC_LODS>, OpSize;
+def LODSD : I<0xAD, RawFrm, (outs), (ins), "lods{l|d}", [], IIC_LODS>;
+def LODSQ : RI<0xAD, RawFrm, (outs), (ins), "lodsq", [], IIC_LODS>;
+}
+
+let SchedRW = [WriteSystem] in {
+def OUTSB : I<0x6E, RawFrm, (outs), (ins), "outsb", [], IIC_OUTS>;
+def OUTSW : I<0x6F, RawFrm, (outs), (ins), "outsw", [], IIC_OUTS>, OpSize;
+def OUTSD : I<0x6F, RawFrm, (outs), (ins), "outs{l|d}", [], IIC_OUTS>;
+}
+
+// Flag instructions
+let SchedRW = [WriteALU] in {
+def CLC : I<0xF8, RawFrm, (outs), (ins), "clc", [], IIC_CLC>;
+def STC : I<0xF9, RawFrm, (outs), (ins), "stc", [], IIC_STC>;
+def CLI : I<0xFA, RawFrm, (outs), (ins), "cli", [], IIC_CLI>;
+def STI : I<0xFB, RawFrm, (outs), (ins), "sti", [], IIC_STI>;
+def CLD : I<0xFC, RawFrm, (outs), (ins), "cld", [], IIC_CLD>;
+def STD : I<0xFD, RawFrm, (outs), (ins), "std", [], IIC_STD>;
+def CMC : I<0xF5, RawFrm, (outs), (ins), "cmc", [], IIC_CMC>;
+
+def CLTS : I<0x06, RawFrm, (outs), (ins), "clts", [], IIC_CLTS>, TB;
+}
+
+// Table lookup instructions
+def XLAT : I<0xD7, RawFrm, (outs), (ins), "xlatb", [], IIC_XLAT>,
+ Sched<[WriteLoad]>;
+
+let SchedRW = [WriteMicrocoded] in {
+// ASCII Adjust After Addition
+// sets AL, AH and CF and AF of EFLAGS and uses AL and AF of EFLAGS
+def AAA : I<0x37, RawFrm, (outs), (ins), "aaa", [], IIC_AAA>,
+ Requires<[In32BitMode]>;
+
+// ASCII Adjust AX Before Division
+// sets AL, AH and EFLAGS and uses AL and AH
+def AAD8i8 : Ii8<0xD5, RawFrm, (outs), (ins i8imm:$src),
+ "aad\t$src", [], IIC_AAD>, Requires<[In32BitMode]>;
+
+// ASCII Adjust AX After Multiply
+// sets AL, AH and EFLAGS and uses AL
+def AAM8i8 : Ii8<0xD4, RawFrm, (outs), (ins i8imm:$src),
+ "aam\t$src", [], IIC_AAM>, Requires<[In32BitMode]>;
+
+// ASCII Adjust AL After Subtraction - sets
+// sets AL, AH and CF and AF of EFLAGS and uses AL and AF of EFLAGS
+def AAS : I<0x3F, RawFrm, (outs), (ins), "aas", [], IIC_AAS>,
+ Requires<[In32BitMode]>;
+
+// Decimal Adjust AL after Addition
+// sets AL, CF and AF of EFLAGS and uses AL, CF and AF of EFLAGS
+def DAA : I<0x27, RawFrm, (outs), (ins), "daa", [], IIC_DAA>,
+ Requires<[In32BitMode]>;
+
+// Decimal Adjust AL after Subtraction
+// sets AL, CF and AF of EFLAGS and uses AL, CF and AF of EFLAGS
+def DAS : I<0x2F, RawFrm, (outs), (ins), "das", [], IIC_DAS>,
+ Requires<[In32BitMode]>;
+} // SchedRW
+
+let SchedRW = [WriteSystem] in {
+// Check Array Index Against Bounds
+def BOUNDS16rm : I<0x62, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+ "bound\t{$src, $dst|$dst, $src}", [], IIC_BOUND>, OpSize,
+ Requires<[In32BitMode]>;
+def BOUNDS32rm : I<0x62, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+ "bound\t{$src, $dst|$dst, $src}", [], IIC_BOUND>,
+ Requires<[In32BitMode]>;
+
+// Adjust RPL Field of Segment Selector
+def ARPL16rr : I<0x63, MRMDestReg, (outs GR16:$dst), (ins GR16:$src),
+ "arpl\t{$src, $dst|$dst, $src}", [], IIC_ARPL_REG>,
+ Requires<[In32BitMode]>;
+def ARPL16mr : I<0x63, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
+ "arpl\t{$src, $dst|$dst, $src}", [], IIC_ARPL_MEM>,
+ Requires<[In32BitMode]>;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// MOVBE Instructions
+//
+let Predicates = [HasMOVBE] in {
+ let SchedRW = [WriteALULd] in {
+ def MOVBE16rm : I<0xF0, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+ "movbe{w}\t{$src, $dst|$dst, $src}",
+ [(set GR16:$dst, (bswap (loadi16 addr:$src)))], IIC_MOVBE>,
+ OpSize, T8;
+ def MOVBE32rm : I<0xF0, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+ "movbe{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (bswap (loadi32 addr:$src)))], IIC_MOVBE>,
+ T8;
+ def MOVBE64rm : RI<0xF0, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+ "movbe{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (bswap (loadi64 addr:$src)))], IIC_MOVBE>,
+ T8;
+ }
+ let SchedRW = [WriteStore] in {
+ def MOVBE16mr : I<0xF1, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
+ "movbe{w}\t{$src, $dst|$dst, $src}",
+ [(store (bswap GR16:$src), addr:$dst)], IIC_MOVBE>,
+ OpSize, T8;
+ def MOVBE32mr : I<0xF1, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
+ "movbe{l}\t{$src, $dst|$dst, $src}",
+ [(store (bswap GR32:$src), addr:$dst)], IIC_MOVBE>,
+ T8;
+ def MOVBE64mr : RI<0xF1, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
+ "movbe{q}\t{$src, $dst|$dst, $src}",
+ [(store (bswap GR64:$src), addr:$dst)], IIC_MOVBE>,
+ T8;
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// RDRAND Instruction
+//
+let Predicates = [HasRDRAND], Defs = [EFLAGS] in {
+ def RDRAND16r : I<0xC7, MRM6r, (outs GR16:$dst), (ins),
+ "rdrand{w}\t$dst",
+ [(set GR16:$dst, EFLAGS, (X86rdrand))]>, OpSize, TB;
+ def RDRAND32r : I<0xC7, MRM6r, (outs GR32:$dst), (ins),
+ "rdrand{l}\t$dst",
+ [(set GR32:$dst, EFLAGS, (X86rdrand))]>, TB;
+ def RDRAND64r : RI<0xC7, MRM6r, (outs GR64:$dst), (ins),
+ "rdrand{q}\t$dst",
+ [(set GR64:$dst, EFLAGS, (X86rdrand))]>, TB;
+}
+
+//===----------------------------------------------------------------------===//
+// RDSEED Instruction
+//
+let Predicates = [HasRDSEED], Defs = [EFLAGS] in {
+ def RDSEED16r : I<0xC7, MRM7r, (outs GR16:$dst), (ins),
+ "rdseed{w}\t$dst",
+ [(set GR16:$dst, EFLAGS, (X86rdseed))]>, OpSize, TB;
+ def RDSEED32r : I<0xC7, MRM7r, (outs GR32:$dst), (ins),
+ "rdseed{l}\t$dst",
+ [(set GR32:$dst, EFLAGS, (X86rdseed))]>, TB;
+ def RDSEED64r : RI<0xC7, MRM7r, (outs GR64:$dst), (ins),
+ "rdseed{q}\t$dst",
+ [(set GR64:$dst, EFLAGS, (X86rdseed))]>, TB;
+}
+
+//===----------------------------------------------------------------------===//
+// LZCNT Instruction
+//
+let Predicates = [HasLZCNT], Defs = [EFLAGS] in {
+ def LZCNT16rr : I<0xBD, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+ "lzcnt{w}\t{$src, $dst|$dst, $src}",
+ [(set GR16:$dst, (ctlz GR16:$src)), (implicit EFLAGS)]>, XS,
+ OpSize;
+ def LZCNT16rm : I<0xBD, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+ "lzcnt{w}\t{$src, $dst|$dst, $src}",
+ [(set GR16:$dst, (ctlz (loadi16 addr:$src))),
+ (implicit EFLAGS)]>, XS, OpSize;
+
+ def LZCNT32rr : I<0xBD, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+ "lzcnt{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (ctlz GR32:$src)), (implicit EFLAGS)]>, XS;
+ def LZCNT32rm : I<0xBD, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+ "lzcnt{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (ctlz (loadi32 addr:$src))),
+ (implicit EFLAGS)]>, XS;
+
+ def LZCNT64rr : RI<0xBD, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+ "lzcnt{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (ctlz GR64:$src)), (implicit EFLAGS)]>,
+ XS;
+ def LZCNT64rm : RI<0xBD, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+ "lzcnt{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (ctlz (loadi64 addr:$src))),
+ (implicit EFLAGS)]>, XS;
+}
+
+//===----------------------------------------------------------------------===//
+// BMI Instructions
+//
+let Predicates = [HasBMI], Defs = [EFLAGS] in {
+ def TZCNT16rr : I<0xBC, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+ "tzcnt{w}\t{$src, $dst|$dst, $src}",
+ [(set GR16:$dst, (cttz GR16:$src)), (implicit EFLAGS)]>, XS,
+ OpSize;
+ def TZCNT16rm : I<0xBC, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+ "tzcnt{w}\t{$src, $dst|$dst, $src}",
+ [(set GR16:$dst, (cttz (loadi16 addr:$src))),
+ (implicit EFLAGS)]>, XS, OpSize;
+
+ def TZCNT32rr : I<0xBC, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+ "tzcnt{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (cttz GR32:$src)), (implicit EFLAGS)]>, XS;
+ def TZCNT32rm : I<0xBC, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+ "tzcnt{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (cttz (loadi32 addr:$src))),
+ (implicit EFLAGS)]>, XS;
+
+ def TZCNT64rr : RI<0xBC, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+ "tzcnt{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (cttz GR64:$src)), (implicit EFLAGS)]>,
+ XS;
+ def TZCNT64rm : RI<0xBC, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+ "tzcnt{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (cttz (loadi64 addr:$src))),
+ (implicit EFLAGS)]>, XS;
+}
+
+multiclass bmi_bls<string mnemonic, Format RegMRM, Format MemMRM,
+ RegisterClass RC, X86MemOperand x86memop, SDNode OpNode,
+ PatFrag ld_frag> {
+ def rr : I<0xF3, RegMRM, (outs RC:$dst), (ins RC:$src),
+ !strconcat(mnemonic, "\t{$src, $dst|$dst, $src}"),
+ [(set RC:$dst, (OpNode RC:$src)), (implicit EFLAGS)]>, T8, VEX_4V;
+ def rm : I<0xF3, MemMRM, (outs RC:$dst), (ins x86memop:$src),
+ !strconcat(mnemonic, "\t{$src, $dst|$dst, $src}"),
+ [(set RC:$dst, (OpNode (ld_frag addr:$src))), (implicit EFLAGS)]>,
+ T8, VEX_4V;
+}
+
+let Predicates = [HasBMI], Defs = [EFLAGS] in {
+ defm BLSR32 : bmi_bls<"blsr{l}", MRM1r, MRM1m, GR32, i32mem,
+ X86blsr, loadi32>;
+ defm BLSR64 : bmi_bls<"blsr{q}", MRM1r, MRM1m, GR64, i64mem,
+ X86blsr, loadi64>, VEX_W;
+ defm BLSMSK32 : bmi_bls<"blsmsk{l}", MRM2r, MRM2m, GR32, i32mem,
+ X86blsmsk, loadi32>;
+ defm BLSMSK64 : bmi_bls<"blsmsk{q}", MRM2r, MRM2m, GR64, i64mem,
+ X86blsmsk, loadi64>, VEX_W;
+ defm BLSI32 : bmi_bls<"blsi{l}", MRM3r, MRM3m, GR32, i32mem,
+ X86blsi, loadi32>;
+ defm BLSI64 : bmi_bls<"blsi{q}", MRM3r, MRM3m, GR64, i64mem,
+ X86blsi, loadi64>, VEX_W;
+}
+
+multiclass bmi_bextr_bzhi<bits<8> opc, string mnemonic, RegisterClass RC,
+ X86MemOperand x86memop, Intrinsic Int,
+ PatFrag ld_frag> {
+ def rr : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+ !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst, (Int RC:$src1, RC:$src2)), (implicit EFLAGS)]>,
+ T8, VEX_4VOp3;
+ def rm : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src1, RC:$src2),
+ !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst, (Int (ld_frag addr:$src1), RC:$src2)),
+ (implicit EFLAGS)]>, T8, VEX_4VOp3;
+}
+
+let Predicates = [HasBMI], Defs = [EFLAGS] in {
+ defm BEXTR32 : bmi_bextr_bzhi<0xF7, "bextr{l}", GR32, i32mem,
+ int_x86_bmi_bextr_32, loadi32>;
+ defm BEXTR64 : bmi_bextr_bzhi<0xF7, "bextr{q}", GR64, i64mem,
+ int_x86_bmi_bextr_64, loadi64>, VEX_W;
+}
+
+let Predicates = [HasBMI2], Defs = [EFLAGS] in {
+ defm BZHI32 : bmi_bextr_bzhi<0xF5, "bzhi{l}", GR32, i32mem,
+ int_x86_bmi_bzhi_32, loadi32>;
+ defm BZHI64 : bmi_bextr_bzhi<0xF5, "bzhi{q}", GR64, i64mem,
+ int_x86_bmi_bzhi_64, loadi64>, VEX_W;
+}
+
+multiclass bmi_pdep_pext<string mnemonic, RegisterClass RC,
+ X86MemOperand x86memop, Intrinsic Int,
+ PatFrag ld_frag> {
+ def rr : I<0xF5, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+ !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst, (Int RC:$src1, RC:$src2))]>,
+ VEX_4V;
+ def rm : I<0xF5, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+ !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst, (Int RC:$src1, (ld_frag addr:$src2)))]>, VEX_4V;
+}
+
+let Predicates = [HasBMI2] in {
+ defm PDEP32 : bmi_pdep_pext<"pdep{l}", GR32, i32mem,
+ int_x86_bmi_pdep_32, loadi32>, T8XD;
+ defm PDEP64 : bmi_pdep_pext<"pdep{q}", GR64, i64mem,
+ int_x86_bmi_pdep_64, loadi64>, T8XD, VEX_W;
+ defm PEXT32 : bmi_pdep_pext<"pext{l}", GR32, i32mem,
+ int_x86_bmi_pext_32, loadi32>, T8XS;
+ defm PEXT64 : bmi_pdep_pext<"pext{q}", GR64, i64mem,
+ int_x86_bmi_pext_64, loadi64>, T8XS, VEX_W;
+}
+
+//===----------------------------------------------------------------------===//
+// Subsystems.
+//===----------------------------------------------------------------------===//
+
+include "X86InstrArithmetic.td"
+include "X86InstrCMovSetCC.td"
+include "X86InstrExtension.td"
+include "X86InstrControl.td"
+include "X86InstrShiftRotate.td"
+
+// X87 Floating Point Stack.
+include "X86InstrFPStack.td"
+
+// SIMD support (SSE, MMX and AVX)
+include "X86InstrFragmentsSIMD.td"
+
+// FMA - Fused Multiply-Add support (requires FMA)
+include "X86InstrFMA.td"
+
+// XOP
+include "X86InstrXOP.td"
+
+// SSE, MMX and 3DNow! vector support.
+include "X86InstrSSE.td"
+include "X86InstrMMX.td"
+include "X86Instr3DNow.td"
+
+include "X86InstrVMX.td"
+include "X86InstrSVM.td"
+
+include "X86InstrTSX.td"
+
+// System instructions.
+include "X86InstrSystem.td"
+
+// Compiler Pseudo Instructions and Pat Patterns
+include "X86InstrCompiler.td"
+
+//===----------------------------------------------------------------------===//
+// Assembler Mnemonic Aliases
+//===----------------------------------------------------------------------===//
+
+def : MnemonicAlias<"call", "calll", "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"call", "callq", "att">, Requires<[In64BitMode]>;
+
+def : MnemonicAlias<"cbw", "cbtw", "att">;
+def : MnemonicAlias<"cwde", "cwtl", "att">;
+def : MnemonicAlias<"cwd", "cwtd", "att">;
+def : MnemonicAlias<"cdq", "cltd", "att">;
+def : MnemonicAlias<"cdqe", "cltq", "att">;
+def : MnemonicAlias<"cqo", "cqto", "att">;
+
+// lret maps to lretl, it is not ambiguous with lretq.
+def : MnemonicAlias<"lret", "lretl", "att">;
+
+def : MnemonicAlias<"leavel", "leave", "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"leaveq", "leave", "att">, Requires<[In64BitMode]>;
+
+def : MnemonicAlias<"loopz", "loope", "att">;
+def : MnemonicAlias<"loopnz", "loopne", "att">;
+
+def : MnemonicAlias<"pop", "popl", "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"pop", "popq", "att">, Requires<[In64BitMode]>;
+def : MnemonicAlias<"popf", "popfl", "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"popf", "popfq", "att">, Requires<[In64BitMode]>;
+def : MnemonicAlias<"popfd", "popfl", "att">;
+
+// FIXME: This is wrong for "push reg". "push %bx" should turn into pushw in
+// all modes. However: "push (addr)" and "push $42" should default to
+// pushl/pushq depending on the current mode. Similar for "pop %bx"
+def : MnemonicAlias<"push", "pushl", "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"push", "pushq", "att">, Requires<[In64BitMode]>;
+def : MnemonicAlias<"pushf", "pushfl", "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"pushf", "pushfq", "att">, Requires<[In64BitMode]>;
+def : MnemonicAlias<"pushfd", "pushfl", "att">;
+
+def : MnemonicAlias<"repe", "rep", "att">;
+def : MnemonicAlias<"repz", "rep", "att">;
+def : MnemonicAlias<"repnz", "repne", "att">;
+
+def : MnemonicAlias<"retl", "ret", "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"retq", "ret", "att">, Requires<[In64BitMode]>;
+
+def : MnemonicAlias<"salb", "shlb", "att">;
+def : MnemonicAlias<"salw", "shlw", "att">;
+def : MnemonicAlias<"sall", "shll", "att">;
+def : MnemonicAlias<"salq", "shlq", "att">;
+
+def : MnemonicAlias<"smovb", "movsb", "att">;
+def : MnemonicAlias<"smovw", "movsw", "att">;
+def : MnemonicAlias<"smovl", "movsl", "att">;
+def : MnemonicAlias<"smovq", "movsq", "att">;
+
+def : MnemonicAlias<"ud2a", "ud2", "att">;
+def : MnemonicAlias<"verrw", "verr", "att">;
+
+// System instruction aliases.
+def : MnemonicAlias<"iret", "iretl", "att">;
+def : MnemonicAlias<"sysret", "sysretl", "att">;
+def : MnemonicAlias<"sysexit", "sysexitl", "att">;
+
+def : MnemonicAlias<"lgdtl", "lgdt", "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"lgdtq", "lgdt", "att">, Requires<[In64BitMode]>;
+def : MnemonicAlias<"lidtl", "lidt", "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"lidtq", "lidt", "att">, Requires<[In64BitMode]>;
+def : MnemonicAlias<"sgdtl", "sgdt", "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"sgdtq", "sgdt", "att">, Requires<[In64BitMode]>;
+def : MnemonicAlias<"sidtl", "sidt", "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"sidtq", "sidt", "att">, Requires<[In64BitMode]>;
+
+
+// Floating point stack aliases.
+def : MnemonicAlias<"fcmovz", "fcmove", "att">;
+def : MnemonicAlias<"fcmova", "fcmovnbe", "att">;
+def : MnemonicAlias<"fcmovnae", "fcmovb", "att">;
+def : MnemonicAlias<"fcmovna", "fcmovbe", "att">;
+def : MnemonicAlias<"fcmovae", "fcmovnb", "att">;
+def : MnemonicAlias<"fcomip", "fcompi", "att">;
+def : MnemonicAlias<"fildq", "fildll", "att">;
+def : MnemonicAlias<"fistpq", "fistpll", "att">;
+def : MnemonicAlias<"fisttpq", "fisttpll", "att">;
+def : MnemonicAlias<"fldcww", "fldcw", "att">;
+def : MnemonicAlias<"fnstcww", "fnstcw", "att">;
+def : MnemonicAlias<"fnstsww", "fnstsw", "att">;
+def : MnemonicAlias<"fucomip", "fucompi", "att">;
+def : MnemonicAlias<"fwait", "wait", "att">;
+
+
+class CondCodeAlias<string Prefix,string Suffix, string OldCond, string NewCond>
+ : MnemonicAlias<!strconcat(Prefix, OldCond, Suffix),
+ !strconcat(Prefix, NewCond, Suffix)>;
+
+/// IntegerCondCodeMnemonicAlias - This multiclass defines a bunch of
+/// MnemonicAlias's that canonicalize the condition code in a mnemonic, for
+/// example "setz" -> "sete".
+multiclass IntegerCondCodeMnemonicAlias<string Prefix, string Suffix> {
+ def C : CondCodeAlias<Prefix, Suffix, "c", "b">; // setc -> setb
+ def Z : CondCodeAlias<Prefix, Suffix, "z" , "e">; // setz -> sete
+ def NA : CondCodeAlias<Prefix, Suffix, "na", "be">; // setna -> setbe
+ def NB : CondCodeAlias<Prefix, Suffix, "nb", "ae">; // setnb -> setae
+ def NC : CondCodeAlias<Prefix, Suffix, "nc", "ae">; // setnc -> setae
+ def NG : CondCodeAlias<Prefix, Suffix, "ng", "le">; // setng -> setle
+ def NL : CondCodeAlias<Prefix, Suffix, "nl", "ge">; // setnl -> setge
+ def NZ : CondCodeAlias<Prefix, Suffix, "nz", "ne">; // setnz -> setne
+ def PE : CondCodeAlias<Prefix, Suffix, "pe", "p">; // setpe -> setp
+ def PO : CondCodeAlias<Prefix, Suffix, "po", "np">; // setpo -> setnp
+
+ def NAE : CondCodeAlias<Prefix, Suffix, "nae", "b">; // setnae -> setb
+ def NBE : CondCodeAlias<Prefix, Suffix, "nbe", "a">; // setnbe -> seta
+ def NGE : CondCodeAlias<Prefix, Suffix, "nge", "l">; // setnge -> setl
+ def NLE : CondCodeAlias<Prefix, Suffix, "nle", "g">; // setnle -> setg
+}
+
+// Aliases for set<CC>
+defm : IntegerCondCodeMnemonicAlias<"set", "">;
+// Aliases for j<CC>
+defm : IntegerCondCodeMnemonicAlias<"j", "">;
+// Aliases for cmov<CC>{w,l,q}
+defm : IntegerCondCodeMnemonicAlias<"cmov", "w">;
+defm : IntegerCondCodeMnemonicAlias<"cmov", "l">;
+defm : IntegerCondCodeMnemonicAlias<"cmov", "q">;
+
+
+//===----------------------------------------------------------------------===//
+// Assembler Instruction Aliases
+//===----------------------------------------------------------------------===//
+
+// aad/aam default to base 10 if no operand is specified.
+def : InstAlias<"aad", (AAD8i8 10)>;
+def : InstAlias<"aam", (AAM8i8 10)>;
+
+// Disambiguate the mem/imm form of bt-without-a-suffix as btl.
+def : InstAlias<"bt $imm, $mem", (BT32mi8 i32mem:$mem, i32i8imm:$imm)>;
+
+// clr aliases.
+def : InstAlias<"clrb $reg", (XOR8rr GR8 :$reg, GR8 :$reg)>;
+def : InstAlias<"clrw $reg", (XOR16rr GR16:$reg, GR16:$reg)>;
+def : InstAlias<"clrl $reg", (XOR32rr GR32:$reg, GR32:$reg)>;
+def : InstAlias<"clrq $reg", (XOR64rr GR64:$reg, GR64:$reg)>;
+
+// div and idiv aliases for explicit A register.
+def : InstAlias<"divb $src, %al", (DIV8r GR8 :$src)>;
+def : InstAlias<"divw $src, %ax", (DIV16r GR16:$src)>;
+def : InstAlias<"divl $src, %eax", (DIV32r GR32:$src)>;
+def : InstAlias<"divq $src, %rax", (DIV64r GR64:$src)>;
+def : InstAlias<"divb $src, %al", (DIV8m i8mem :$src)>;
+def : InstAlias<"divw $src, %ax", (DIV16m i16mem:$src)>;
+def : InstAlias<"divl $src, %eax", (DIV32m i32mem:$src)>;
+def : InstAlias<"divq $src, %rax", (DIV64m i64mem:$src)>;
+def : InstAlias<"idivb $src, %al", (IDIV8r GR8 :$src)>;
+def : InstAlias<"idivw $src, %ax", (IDIV16r GR16:$src)>;
+def : InstAlias<"idivl $src, %eax", (IDIV32r GR32:$src)>;
+def : InstAlias<"idivq $src, %rax", (IDIV64r GR64:$src)>;
+def : InstAlias<"idivb $src, %al", (IDIV8m i8mem :$src)>;
+def : InstAlias<"idivw $src, %ax", (IDIV16m i16mem:$src)>;
+def : InstAlias<"idivl $src, %eax", (IDIV32m i32mem:$src)>;
+def : InstAlias<"idivq $src, %rax", (IDIV64m i64mem:$src)>;
+
+
+
+// Various unary fpstack operations default to operating on on ST1.
+// For example, "fxch" -> "fxch %st(1)"
+def : InstAlias<"faddp", (ADD_FPrST0 ST1), 0>;
+def : InstAlias<"fsubp", (SUBR_FPrST0 ST1)>;
+def : InstAlias<"fsubrp", (SUB_FPrST0 ST1)>;
+def : InstAlias<"fmulp", (MUL_FPrST0 ST1)>;
+def : InstAlias<"fdivp", (DIVR_FPrST0 ST1)>;
+def : InstAlias<"fdivrp", (DIV_FPrST0 ST1)>;
+def : InstAlias<"fxch", (XCH_F ST1)>;
+def : InstAlias<"fcom", (COM_FST0r ST1)>;
+def : InstAlias<"fcomp", (COMP_FST0r ST1)>;
+def : InstAlias<"fcomi", (COM_FIr ST1)>;
+def : InstAlias<"fcompi", (COM_FIPr ST1)>;
+def : InstAlias<"fucom", (UCOM_Fr ST1)>;
+def : InstAlias<"fucomp", (UCOM_FPr ST1)>;
+def : InstAlias<"fucomi", (UCOM_FIr ST1)>;
+def : InstAlias<"fucompi", (UCOM_FIPr ST1)>;
+
+// Handle fmul/fadd/fsub/fdiv instructions with explicitly written st(0) op.
+// For example, "fadd %st(4), %st(0)" -> "fadd %st(4)". We also disambiguate
+// instructions like "fadd %st(0), %st(0)" as "fadd %st(0)" for consistency with
+// gas.
+multiclass FpUnaryAlias<string Mnemonic, Instruction Inst, bit EmitAlias = 1> {
+ def : InstAlias<!strconcat(Mnemonic, " $op, %st(0)"),
+ (Inst RST:$op), EmitAlias>;
+ def : InstAlias<!strconcat(Mnemonic, " %st(0), %st(0)"),
+ (Inst ST0), EmitAlias>;
+}
+
+defm : FpUnaryAlias<"fadd", ADD_FST0r>;
+defm : FpUnaryAlias<"faddp", ADD_FPrST0, 0>;
+defm : FpUnaryAlias<"fsub", SUB_FST0r>;
+defm : FpUnaryAlias<"fsubp", SUBR_FPrST0>;
+defm : FpUnaryAlias<"fsubr", SUBR_FST0r>;
+defm : FpUnaryAlias<"fsubrp", SUB_FPrST0>;
+defm : FpUnaryAlias<"fmul", MUL_FST0r>;
+defm : FpUnaryAlias<"fmulp", MUL_FPrST0>;
+defm : FpUnaryAlias<"fdiv", DIV_FST0r>;
+defm : FpUnaryAlias<"fdivp", DIVR_FPrST0>;
+defm : FpUnaryAlias<"fdivr", DIVR_FST0r>;
+defm : FpUnaryAlias<"fdivrp", DIV_FPrST0>;
+defm : FpUnaryAlias<"fcomi", COM_FIr, 0>;
+defm : FpUnaryAlias<"fucomi", UCOM_FIr, 0>;
+defm : FpUnaryAlias<"fcompi", COM_FIPr>;
+defm : FpUnaryAlias<"fucompi", UCOM_FIPr>;
+
+
+// Handle "f{mulp,addp} st(0), $op" the same as "f{mulp,addp} $op", since they
+// commute. We also allow fdiv[r]p/fsubrp even though they don't commute,
+// solely because gas supports it.
+def : InstAlias<"faddp %st(0), $op", (ADD_FPrST0 RST:$op), 0>;
+def : InstAlias<"fmulp %st(0), $op", (MUL_FPrST0 RST:$op)>;
+def : InstAlias<"fsubp %st(0), $op", (SUBR_FPrST0 RST:$op)>;
+def : InstAlias<"fsubrp %st(0), $op", (SUB_FPrST0 RST:$op)>;
+def : InstAlias<"fdivp %st(0), $op", (DIVR_FPrST0 RST:$op)>;
+def : InstAlias<"fdivrp %st(0), $op", (DIV_FPrST0 RST:$op)>;
+
+// We accept "fnstsw %eax" even though it only writes %ax.
+def : InstAlias<"fnstsw %eax", (FNSTSW16r)>;
+def : InstAlias<"fnstsw %al" , (FNSTSW16r)>;
+def : InstAlias<"fnstsw" , (FNSTSW16r)>;
+
+// lcall and ljmp aliases. This seems to be an odd mapping in 64-bit mode, but
+// this is compatible with what GAS does.
+def : InstAlias<"lcall $seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg)>;
+def : InstAlias<"ljmp $seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg)>;
+def : InstAlias<"lcall *$dst", (FARCALL32m opaque48mem:$dst)>;
+def : InstAlias<"ljmp *$dst", (FARJMP32m opaque48mem:$dst)>;
+
+// "imul <imm>, B" is an alias for "imul <imm>, B, B".
+def : InstAlias<"imulw $imm, $r", (IMUL16rri GR16:$r, GR16:$r, i16imm:$imm)>;
+def : InstAlias<"imulw $imm, $r", (IMUL16rri8 GR16:$r, GR16:$r, i16i8imm:$imm)>;
+def : InstAlias<"imull $imm, $r", (IMUL32rri GR32:$r, GR32:$r, i32imm:$imm)>;
+def : InstAlias<"imull $imm, $r", (IMUL32rri8 GR32:$r, GR32:$r, i32i8imm:$imm)>;
+def : InstAlias<"imulq $imm, $r",(IMUL64rri32 GR64:$r, GR64:$r,i64i32imm:$imm)>;
+def : InstAlias<"imulq $imm, $r", (IMUL64rri8 GR64:$r, GR64:$r, i64i8imm:$imm)>;
+
+// inb %dx -> inb %al, %dx
+def : InstAlias<"inb %dx", (IN8rr)>;
+def : InstAlias<"inw %dx", (IN16rr)>;
+def : InstAlias<"inl %dx", (IN32rr)>;
+def : InstAlias<"inb $port", (IN8ri i8imm:$port)>;
+def : InstAlias<"inw $port", (IN16ri i8imm:$port)>;
+def : InstAlias<"inl $port", (IN32ri i8imm:$port)>;
+
+
+// jmp and call aliases for lcall and ljmp. jmp $42,$5 -> ljmp
+def : InstAlias<"call $seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg)>;
+def : InstAlias<"jmp $seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg)>;
+def : InstAlias<"callw $seg, $off", (FARCALL16i i16imm:$off, i16imm:$seg)>;
+def : InstAlias<"jmpw $seg, $off", (FARJMP16i i16imm:$off, i16imm:$seg)>;
+def : InstAlias<"calll $seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg)>;
+def : InstAlias<"jmpl $seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg)>;
+
+// Force mov without a suffix with a segment and mem to prefer the 'l' form of
+// the move. All segment/mem forms are equivalent, this has the shortest
+// encoding.
+def : InstAlias<"mov $mem, $seg", (MOV32sm SEGMENT_REG:$seg, i32mem:$mem)>;
+def : InstAlias<"mov $seg, $mem", (MOV32ms i32mem:$mem, SEGMENT_REG:$seg)>;
+
+// Match 'movq <largeimm>, <reg>' as an alias for movabsq.
+def : InstAlias<"movq $imm, $reg", (MOV64ri GR64:$reg, i64imm:$imm)>;
+
+// Match 'movq GR64, MMX' as an alias for movd.
+def : InstAlias<"movq $src, $dst",
+ (MMX_MOVD64to64rr VR64:$dst, GR64:$src), 0>;
+def : InstAlias<"movq $src, $dst",
+ (MMX_MOVD64from64rr GR64:$dst, VR64:$src), 0>;
+
+// movsd with no operands (as opposed to the SSE scalar move of a double) is an
+// alias for movsl. (as in rep; movsd)
+def : InstAlias<"movsd", (MOVSD)>;
+
+// movsx aliases
+def : InstAlias<"movsx $src, $dst", (MOVSX16rr8 GR16:$dst, GR8:$src), 0>;
+def : InstAlias<"movsx $src, $dst", (MOVSX16rm8 GR16:$dst, i8mem:$src), 0>;
+def : InstAlias<"movsx $src, $dst", (MOVSX32rr8 GR32:$dst, GR8:$src), 0>;
+def : InstAlias<"movsx $src, $dst", (MOVSX32rr16 GR32:$dst, GR16:$src), 0>;
+def : InstAlias<"movsx $src, $dst", (MOVSX64rr8 GR64:$dst, GR8:$src), 0>;
+def : InstAlias<"movsx $src, $dst", (MOVSX64rr16 GR64:$dst, GR16:$src), 0>;
+def : InstAlias<"movsx $src, $dst", (MOVSX64rr32 GR64:$dst, GR32:$src), 0>;
+
+// movzx aliases
+def : InstAlias<"movzx $src, $dst", (MOVZX16rr8 GR16:$dst, GR8:$src), 0>;
+def : InstAlias<"movzx $src, $dst", (MOVZX16rm8 GR16:$dst, i8mem:$src), 0>;
+def : InstAlias<"movzx $src, $dst", (MOVZX32rr8 GR32:$dst, GR8:$src), 0>;
+def : InstAlias<"movzx $src, $dst", (MOVZX32rr16 GR32:$dst, GR16:$src), 0>;
+def : InstAlias<"movzx $src, $dst", (MOVZX64rr8_Q GR64:$dst, GR8:$src), 0>;
+def : InstAlias<"movzx $src, $dst", (MOVZX64rr16_Q GR64:$dst, GR16:$src), 0>;
+// Note: No GR32->GR64 movzx form.
+
+// outb %dx -> outb %al, %dx
+def : InstAlias<"outb %dx", (OUT8rr)>;
+def : InstAlias<"outw %dx", (OUT16rr)>;
+def : InstAlias<"outl %dx", (OUT32rr)>;
+def : InstAlias<"outb $port", (OUT8ir i8imm:$port)>;
+def : InstAlias<"outw $port", (OUT16ir i8imm:$port)>;
+def : InstAlias<"outl $port", (OUT32ir i8imm:$port)>;
+
+// 'sldt <mem>' can be encoded with either sldtw or sldtq with the same
+// effect (both store to a 16-bit mem). Force to sldtw to avoid ambiguity
+// errors, since its encoding is the most compact.
+def : InstAlias<"sldt $mem", (SLDT16m i16mem:$mem)>;
+
+// shld/shrd op,op -> shld op, op, CL
+def : InstAlias<"shldw $r2, $r1", (SHLD16rrCL GR16:$r1, GR16:$r2)>;
+def : InstAlias<"shldl $r2, $r1", (SHLD32rrCL GR32:$r1, GR32:$r2)>;
+def : InstAlias<"shldq $r2, $r1", (SHLD64rrCL GR64:$r1, GR64:$r2)>;
+def : InstAlias<"shrdw $r2, $r1", (SHRD16rrCL GR16:$r1, GR16:$r2)>;
+def : InstAlias<"shrdl $r2, $r1", (SHRD32rrCL GR32:$r1, GR32:$r2)>;
+def : InstAlias<"shrdq $r2, $r1", (SHRD64rrCL GR64:$r1, GR64:$r2)>;
+
+def : InstAlias<"shldw $reg, $mem", (SHLD16mrCL i16mem:$mem, GR16:$reg)>;
+def : InstAlias<"shldl $reg, $mem", (SHLD32mrCL i32mem:$mem, GR32:$reg)>;
+def : InstAlias<"shldq $reg, $mem", (SHLD64mrCL i64mem:$mem, GR64:$reg)>;
+def : InstAlias<"shrdw $reg, $mem", (SHRD16mrCL i16mem:$mem, GR16:$reg)>;
+def : InstAlias<"shrdl $reg, $mem", (SHRD32mrCL i32mem:$mem, GR32:$reg)>;
+def : InstAlias<"shrdq $reg, $mem", (SHRD64mrCL i64mem:$mem, GR64:$reg)>;
+
+/* FIXME: This is disabled because the asm matcher is currently incapable of
+ * matching a fixed immediate like $1.
+// "shl X, $1" is an alias for "shl X".
+multiclass ShiftRotateByOneAlias<string Mnemonic, string Opc> {
+ def : InstAlias<!strconcat(Mnemonic, "b $op, $$1"),
+ (!cast<Instruction>(!strconcat(Opc, "8r1")) GR8:$op)>;
+ def : InstAlias<!strconcat(Mnemonic, "w $op, $$1"),
+ (!cast<Instruction>(!strconcat(Opc, "16r1")) GR16:$op)>;
+ def : InstAlias<!strconcat(Mnemonic, "l $op, $$1"),
+ (!cast<Instruction>(!strconcat(Opc, "32r1")) GR32:$op)>;
+ def : InstAlias<!strconcat(Mnemonic, "q $op, $$1"),
+ (!cast<Instruction>(!strconcat(Opc, "64r1")) GR64:$op)>;
+ def : InstAlias<!strconcat(Mnemonic, "b $op, $$1"),
+ (!cast<Instruction>(!strconcat(Opc, "8m1")) i8mem:$op)>;
+ def : InstAlias<!strconcat(Mnemonic, "w $op, $$1"),
+ (!cast<Instruction>(!strconcat(Opc, "16m1")) i16mem:$op)>;
+ def : InstAlias<!strconcat(Mnemonic, "l $op, $$1"),
+ (!cast<Instruction>(!strconcat(Opc, "32m1")) i32mem:$op)>;
+ def : InstAlias<!strconcat(Mnemonic, "q $op, $$1"),
+ (!cast<Instruction>(!strconcat(Opc, "64m1")) i64mem:$op)>;
+}
+
+defm : ShiftRotateByOneAlias<"rcl", "RCL">;
+defm : ShiftRotateByOneAlias<"rcr", "RCR">;
+defm : ShiftRotateByOneAlias<"rol", "ROL">;
+defm : ShiftRotateByOneAlias<"ror", "ROR">;
+FIXME */
+
+// test: We accept "testX <reg>, <mem>" and "testX <mem>, <reg>" as synonyms.
+def : InstAlias<"testb $val, $mem", (TEST8rm GR8 :$val, i8mem :$mem)>;
+def : InstAlias<"testw $val, $mem", (TEST16rm GR16:$val, i16mem:$mem)>;
+def : InstAlias<"testl $val, $mem", (TEST32rm GR32:$val, i32mem:$mem)>;
+def : InstAlias<"testq $val, $mem", (TEST64rm GR64:$val, i64mem:$mem)>;
+
+// xchg: We accept "xchgX <reg>, <mem>" and "xchgX <mem>, <reg>" as synonyms.
+def : InstAlias<"xchgb $mem, $val", (XCHG8rm GR8 :$val, i8mem :$mem)>;
+def : InstAlias<"xchgw $mem, $val", (XCHG16rm GR16:$val, i16mem:$mem)>;
+def : InstAlias<"xchgl $mem, $val", (XCHG32rm GR32:$val, i32mem:$mem)>;
+def : InstAlias<"xchgq $mem, $val", (XCHG64rm GR64:$val, i64mem:$mem)>;
+
+// xchg: We accept "xchgX <reg>, %eax" and "xchgX %eax, <reg>" as synonyms.
+def : InstAlias<"xchgw %ax, $src", (XCHG16ar GR16:$src)>;
+def : InstAlias<"xchgl %eax, $src", (XCHG32ar GR32:$src)>, Requires<[In32BitMode]>;
+def : InstAlias<"xchgl %eax, $src", (XCHG32ar64 GR32_NOAX:$src)>, Requires<[In64BitMode]>;
+def : InstAlias<"xchgq %rax, $src", (XCHG64ar GR64:$src)>;
diff --git a/contrib/llvm/lib/Target/X86/X86InstrMMX.td b/contrib/llvm/lib/Target/X86/X86InstrMMX.td
new file mode 100644
index 0000000..49721df
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrMMX.td
@@ -0,0 +1,624 @@
+//===-- X86InstrMMX.td - Describe the MMX Instruction Set --*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 MMX instruction set, defining the instructions,
+// and properties of the instructions which are needed for code generation,
+// machine code emission, and analysis.
+//
+// All instructions that use MMX should be in this file, even if they also use
+// SSE.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MMX Multiclasses
+//===----------------------------------------------------------------------===//
+
+let Sched = WriteVecALU in {
+def MMX_INTALU_ITINS : OpndItins<
+ IIC_MMX_ALU_RR, IIC_MMX_ALU_RM
+>;
+
+def MMX_INTALUQ_ITINS : OpndItins<
+ IIC_MMX_ALUQ_RR, IIC_MMX_ALUQ_RM
+>;
+
+def MMX_PHADDSUBW : OpndItins<
+ IIC_MMX_PHADDSUBW_RR, IIC_MMX_PHADDSUBW_RM
+>;
+
+def MMX_PHADDSUBD : OpndItins<
+ IIC_MMX_PHADDSUBD_RR, IIC_MMX_PHADDSUBD_RM
+>;
+}
+
+let Sched = WriteVecIMul in
+def MMX_PMUL_ITINS : OpndItins<
+ IIC_MMX_PMUL, IIC_MMX_PMUL
+>;
+
+let Sched = WriteVecALU in {
+def MMX_PSADBW_ITINS : OpndItins<
+ IIC_MMX_PSADBW, IIC_MMX_PSADBW
+>;
+
+def MMX_MISC_FUNC_ITINS : OpndItins<
+ IIC_MMX_MISC_FUNC_MEM, IIC_MMX_MISC_FUNC_REG
+>;
+}
+
+def MMX_SHIFT_ITINS : ShiftOpndItins<
+ IIC_MMX_SHIFT_RR, IIC_MMX_SHIFT_RM, IIC_MMX_SHIFT_RI
+>;
+
+let Sched = WriteShuffle in {
+def MMX_UNPCK_H_ITINS : OpndItins<
+ IIC_MMX_UNPCK_H_RR, IIC_MMX_UNPCK_H_RM
+>;
+
+def MMX_UNPCK_L_ITINS : OpndItins<
+ IIC_MMX_UNPCK_L, IIC_MMX_UNPCK_L
+>;
+
+def MMX_PCK_ITINS : OpndItins<
+ IIC_MMX_PCK_RR, IIC_MMX_PCK_RM
+>;
+
+def MMX_PSHUF_ITINS : OpndItins<
+ IIC_MMX_PSHUF, IIC_MMX_PSHUF
+>;
+} // Sched
+
+let Sched = WriteCvtF2I in {
+def MMX_CVT_PD_ITINS : OpndItins<
+ IIC_MMX_CVT_PD_RR, IIC_MMX_CVT_PD_RM
+>;
+
+def MMX_CVT_PS_ITINS : OpndItins<
+ IIC_MMX_CVT_PS_RR, IIC_MMX_CVT_PS_RM
+>;
+}
+
+let Constraints = "$src1 = $dst" in {
+ // MMXI_binop_rm_int - Simple MMX binary operator based on intrinsic.
+ // When this is cleaned up, remove the FIXME from X86RecognizableInstr.cpp.
+ multiclass MMXI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId,
+ OpndItins itins, bit Commutable = 0> {
+ def irr : MMXI<opc, MRMSrcReg, (outs VR64:$dst),
+ (ins VR64:$src1, VR64:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2))], itins.rr>,
+ Sched<[itins.Sched]> {
+ let isCommutable = Commutable;
+ }
+ def irm : MMXI<opc, MRMSrcMem, (outs VR64:$dst),
+ (ins VR64:$src1, i64mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR64:$dst, (IntId VR64:$src1,
+ (bitconvert (load_mmx addr:$src2))))],
+ itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ }
+
+ multiclass MMXI_binop_rmi_int<bits<8> opc, bits<8> opc2, Format ImmForm,
+ string OpcodeStr, Intrinsic IntId,
+ Intrinsic IntId2, ShiftOpndItins itins> {
+ def rr : MMXI<opc, MRMSrcReg, (outs VR64:$dst),
+ (ins VR64:$src1, VR64:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2))], itins.rr>,
+ Sched<[WriteVecShift]>;
+ def rm : MMXI<opc, MRMSrcMem, (outs VR64:$dst),
+ (ins VR64:$src1, i64mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR64:$dst, (IntId VR64:$src1,
+ (bitconvert (load_mmx addr:$src2))))],
+ itins.rm>, Sched<[WriteVecShiftLd, ReadAfterLd]>;
+ def ri : MMXIi8<opc2, ImmForm, (outs VR64:$dst),
+ (ins VR64:$src1, i32i8imm:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR64:$dst, (IntId2 VR64:$src1, (i32 imm:$src2)))], itins.ri>,
+ Sched<[WriteVecShift]>;
+ }
+}
+
+/// Unary MMX instructions requiring SSSE3.
+multiclass SS3I_unop_rm_int_mm<bits<8> opc, string OpcodeStr,
+ Intrinsic IntId64, OpndItins itins> {
+ def rr64 : MMXSS38I<opc, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR64:$dst, (IntId64 VR64:$src))], itins.rr>,
+ Sched<[itins.Sched]>;
+
+ def rm64 : MMXSS38I<opc, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR64:$dst,
+ (IntId64 (bitconvert (memopmmx addr:$src))))],
+ itins.rm>, Sched<[itins.Sched.Folded]>;
+}
+
+/// Binary MMX instructions requiring SSSE3.
+let ImmT = NoImm, Constraints = "$src1 = $dst" in {
+multiclass SS3I_binop_rm_int_mm<bits<8> opc, string OpcodeStr,
+ Intrinsic IntId64, OpndItins itins> {
+ let isCommutable = 0 in
+ def rr64 : MMXSS38I<opc, MRMSrcReg, (outs VR64:$dst),
+ (ins VR64:$src1, VR64:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR64:$dst, (IntId64 VR64:$src1, VR64:$src2))], itins.rr>,
+ Sched<[itins.Sched]>;
+ def rm64 : MMXSS38I<opc, MRMSrcMem, (outs VR64:$dst),
+ (ins VR64:$src1, i64mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR64:$dst,
+ (IntId64 VR64:$src1,
+ (bitconvert (memopmmx addr:$src2))))], itins.rm>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+}
+
+/// PALIGN MMX instructions (require SSSE3).
+multiclass ssse3_palign_mm<string asm, Intrinsic IntId> {
+ def R64irr : MMXSS3AI<0x0F, MRMSrcReg, (outs VR64:$dst),
+ (ins VR64:$src1, VR64:$src2, i8imm:$src3),
+ !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2, (i8 imm:$src3)))]>;
+ def R64irm : MMXSS3AI<0x0F, MRMSrcMem, (outs VR64:$dst),
+ (ins VR64:$src1, i64mem:$src2, i8imm:$src3),
+ !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR64:$dst, (IntId VR64:$src1,
+ (bitconvert (load_mmx addr:$src2)), (i8 imm:$src3)))]>;
+}
+
+multiclass sse12_cvt_pint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
+ Intrinsic Int, X86MemOperand x86memop, PatFrag ld_frag,
+ string asm, OpndItins itins, Domain d> {
+ def irr : MMXPI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm,
+ [(set DstRC:$dst, (Int SrcRC:$src))], itins.rr, d>,
+ Sched<[itins.Sched]>;
+ def irm : MMXPI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm,
+ [(set DstRC:$dst, (Int (ld_frag addr:$src)))], itins.rm, d>,
+ Sched<[itins.Sched.Folded]>;
+}
+
+multiclass sse12_cvt_pint_3addr<bits<8> opc, RegisterClass SrcRC,
+ RegisterClass DstRC, Intrinsic Int, X86MemOperand x86memop,
+ PatFrag ld_frag, string asm, Domain d> {
+ def irr : PI<opc, MRMSrcReg, (outs DstRC:$dst),(ins DstRC:$src1, SrcRC:$src2),
+ asm, [(set DstRC:$dst, (Int DstRC:$src1, SrcRC:$src2))],
+ NoItinerary, d>;
+ def irm : PI<opc, MRMSrcMem, (outs DstRC:$dst),
+ (ins DstRC:$src1, x86memop:$src2), asm,
+ [(set DstRC:$dst, (Int DstRC:$src1, (ld_frag addr:$src2)))],
+ NoItinerary, d>;
+}
+
+//===----------------------------------------------------------------------===//
+// MMX EMMS Instruction
+//===----------------------------------------------------------------------===//
+
+def MMX_EMMS : MMXI<0x77, RawFrm, (outs), (ins), "emms",
+ [(int_x86_mmx_emms)]>;
+
+//===----------------------------------------------------------------------===//
+// MMX Scalar Instructions
+//===----------------------------------------------------------------------===//
+
+// Data Transfer Instructions
+def MMX_MOVD64rr : MMXI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR32:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set VR64:$dst,
+ (x86mmx (scalar_to_vector GR32:$src)))],
+ IIC_MMX_MOV_MM_RM>, Sched<[WriteMove]>;
+let canFoldAsLoad = 1 in
+def MMX_MOVD64rm : MMXI<0x6E, MRMSrcMem, (outs VR64:$dst), (ins i32mem:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set VR64:$dst,
+ (x86mmx (scalar_to_vector (loadi32 addr:$src))))],
+ IIC_MMX_MOV_MM_RM>, Sched<[WriteLoad]>;
+let mayStore = 1 in
+def MMX_MOVD64mr : MMXI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR64:$src),
+ "movd\t{$src, $dst|$dst, $src}", [], IIC_MMX_MOV_MM_RM>,
+ Sched<[WriteStore]>;
+
+// Low word of MMX to GPR.
+def MMX_X86movd2w : SDNode<"X86ISD::MMX_MOVD2W", SDTypeProfile<1, 1,
+ [SDTCisVT<0, i32>, SDTCisVT<1, x86mmx>]>>;
+def MMX_MOVD64grr : MMXI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR64:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst,
+ (MMX_X86movd2w (x86mmx VR64:$src)))],
+ IIC_MMX_MOV_REG_MM>, Sched<[WriteMove]>;
+
+let neverHasSideEffects = 1 in
+def MMX_MOVD64to64rr : MMXRI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR64:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [], IIC_MMX_MOV_MM_RM>, Sched<[WriteMove]>;
+
+// These are 64 bit moves, but since the OS X assembler doesn't
+// recognize a register-register movq, we write them as
+// movd.
+let SchedRW = [WriteMove] in {
+def MMX_MOVD64from64rr : MMXRI<0x7E, MRMDestReg,
+ (outs GR64:$dst), (ins VR64:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst,
+ (bitconvert VR64:$src))], IIC_MMX_MOV_REG_MM>;
+def MMX_MOVD64rrv164 : MMXRI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR64:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set VR64:$dst,
+ (bitconvert GR64:$src))], IIC_MMX_MOV_MM_RM>;
+let neverHasSideEffects = 1 in
+def MMX_MOVQ64rr : MMXI<0x6F, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src),
+ "movq\t{$src, $dst|$dst, $src}", [],
+ IIC_MMX_MOVQ_RR>;
+} // SchedRW
+
+let SchedRW = [WriteLoad] in {
+let canFoldAsLoad = 1 in
+def MMX_MOVQ64rm : MMXI<0x6F, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(set VR64:$dst, (load_mmx addr:$src))],
+ IIC_MMX_MOVQ_RM>;
+def MMX_MOVQ64mr : MMXI<0x7F, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(store (x86mmx VR64:$src), addr:$dst)],
+ IIC_MMX_MOVQ_RM>;
+} // SchedRW
+
+let SchedRW = [WriteMove] in {
+def MMX_MOVDQ2Qrr : MMXSDIi8<0xD6, MRMSrcReg, (outs VR64:$dst),
+ (ins VR128:$src), "movdq2q\t{$src, $dst|$dst, $src}",
+ [(set VR64:$dst,
+ (x86mmx (bitconvert
+ (i64 (vector_extract (v2i64 VR128:$src),
+ (iPTR 0))))))],
+ IIC_MMX_MOVQ_RR>;
+
+def MMX_MOVQ2DQrr : MMXS2SIi8<0xD6, MRMSrcReg, (outs VR128:$dst),
+ (ins VR64:$src), "movq2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v2i64
+ (scalar_to_vector
+ (i64 (bitconvert (x86mmx VR64:$src))))))],
+ IIC_MMX_MOVQ_RR>;
+
+let neverHasSideEffects = 1 in
+def MMX_MOVQ2FR64rr: MMXS2SIi8<0xD6, MRMSrcReg, (outs FR64:$dst),
+ (ins VR64:$src), "movq2dq\t{$src, $dst|$dst, $src}",
+ [], IIC_MMX_MOVQ_RR>;
+
+def MMX_MOVFR642Qrr: MMXSDIi8<0xD6, MRMSrcReg, (outs VR64:$dst),
+ (ins FR64:$src), "movdq2q\t{$src, $dst|$dst, $src}",
+ [], IIC_MMX_MOVQ_RR>;
+} // SchedRW
+
+def MMX_MOVNTQmr : MMXI<0xE7, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src),
+ "movntq\t{$src, $dst|$dst, $src}",
+ [(int_x86_mmx_movnt_dq addr:$dst, VR64:$src)],
+ IIC_MMX_MOVQ_RM>, Sched<[WriteStore]>;
+
+let AddedComplexity = 15 in
+// movd to MMX register zero-extends
+def MMX_MOVZDI2PDIrr : MMXI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR32:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set VR64:$dst,
+ (x86mmx (X86vzmovl (x86mmx (scalar_to_vector GR32:$src)))))],
+ IIC_MMX_MOV_MM_RM>, Sched<[WriteMove]>;
+let AddedComplexity = 20 in
+def MMX_MOVZDI2PDIrm : MMXI<0x6E, MRMSrcMem, (outs VR64:$dst),
+ (ins i32mem:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set VR64:$dst,
+ (x86mmx (X86vzmovl (x86mmx
+ (scalar_to_vector (loadi32 addr:$src))))))],
+ IIC_MMX_MOV_MM_RM>, Sched<[WriteLoad]>;
+
+// Arithmetic Instructions
+defm MMX_PABSB : SS3I_unop_rm_int_mm<0x1C, "pabsb", int_x86_ssse3_pabs_b,
+ MMX_INTALU_ITINS>;
+defm MMX_PABSW : SS3I_unop_rm_int_mm<0x1D, "pabsw", int_x86_ssse3_pabs_w,
+ MMX_INTALU_ITINS>;
+defm MMX_PABSD : SS3I_unop_rm_int_mm<0x1E, "pabsd", int_x86_ssse3_pabs_d,
+ MMX_INTALU_ITINS>;
+// -- Addition
+defm MMX_PADDB : MMXI_binop_rm_int<0xFC, "paddb", int_x86_mmx_padd_b,
+ MMX_INTALU_ITINS, 1>;
+defm MMX_PADDW : MMXI_binop_rm_int<0xFD, "paddw", int_x86_mmx_padd_w,
+ MMX_INTALU_ITINS, 1>;
+defm MMX_PADDD : MMXI_binop_rm_int<0xFE, "paddd", int_x86_mmx_padd_d,
+ MMX_INTALU_ITINS, 1>;
+defm MMX_PADDQ : MMXI_binop_rm_int<0xD4, "paddq", int_x86_mmx_padd_q,
+ MMX_INTALUQ_ITINS, 1>;
+defm MMX_PADDSB : MMXI_binop_rm_int<0xEC, "paddsb" , int_x86_mmx_padds_b,
+ MMX_INTALU_ITINS, 1>;
+defm MMX_PADDSW : MMXI_binop_rm_int<0xED, "paddsw" , int_x86_mmx_padds_w,
+ MMX_INTALU_ITINS, 1>;
+
+defm MMX_PADDUSB : MMXI_binop_rm_int<0xDC, "paddusb", int_x86_mmx_paddus_b,
+ MMX_INTALU_ITINS, 1>;
+defm MMX_PADDUSW : MMXI_binop_rm_int<0xDD, "paddusw", int_x86_mmx_paddus_w,
+ MMX_INTALU_ITINS, 1>;
+
+defm MMX_PHADDW : SS3I_binop_rm_int_mm<0x01, "phaddw", int_x86_ssse3_phadd_w,
+ MMX_PHADDSUBW>;
+defm MMX_PHADD : SS3I_binop_rm_int_mm<0x02, "phaddd", int_x86_ssse3_phadd_d,
+ MMX_PHADDSUBD>;
+defm MMX_PHADDSW : SS3I_binop_rm_int_mm<0x03, "phaddsw",int_x86_ssse3_phadd_sw,
+ MMX_PHADDSUBW>;
+
+
+// -- Subtraction
+defm MMX_PSUBB : MMXI_binop_rm_int<0xF8, "psubb", int_x86_mmx_psub_b,
+ MMX_INTALU_ITINS>;
+defm MMX_PSUBW : MMXI_binop_rm_int<0xF9, "psubw", int_x86_mmx_psub_w,
+ MMX_INTALU_ITINS, 1>;
+defm MMX_PSUBD : MMXI_binop_rm_int<0xFA, "psubd", int_x86_mmx_psub_d,
+ MMX_INTALU_ITINS, 1>;
+defm MMX_PSUBQ : MMXI_binop_rm_int<0xFB, "psubq", int_x86_mmx_psub_q,
+ MMX_INTALUQ_ITINS, 1>;
+
+defm MMX_PSUBSB : MMXI_binop_rm_int<0xE8, "psubsb" , int_x86_mmx_psubs_b,
+ MMX_INTALU_ITINS, 1>;
+defm MMX_PSUBSW : MMXI_binop_rm_int<0xE9, "psubsw" , int_x86_mmx_psubs_w,
+ MMX_INTALU_ITINS, 1>;
+
+defm MMX_PSUBUSB : MMXI_binop_rm_int<0xD8, "psubusb", int_x86_mmx_psubus_b,
+ MMX_INTALU_ITINS, 1>;
+defm MMX_PSUBUSW : MMXI_binop_rm_int<0xD9, "psubusw", int_x86_mmx_psubus_w,
+ MMX_INTALU_ITINS, 1>;
+
+defm MMX_PHSUBW : SS3I_binop_rm_int_mm<0x05, "phsubw", int_x86_ssse3_phsub_w,
+ MMX_PHADDSUBW>;
+defm MMX_PHSUBD : SS3I_binop_rm_int_mm<0x06, "phsubd", int_x86_ssse3_phsub_d,
+ MMX_PHADDSUBD>;
+defm MMX_PHSUBSW : SS3I_binop_rm_int_mm<0x07, "phsubsw",int_x86_ssse3_phsub_sw,
+ MMX_PHADDSUBW>;
+
+// -- Multiplication
+defm MMX_PMULLW : MMXI_binop_rm_int<0xD5, "pmullw", int_x86_mmx_pmull_w,
+ MMX_PMUL_ITINS, 1>;
+
+defm MMX_PMULHW : MMXI_binop_rm_int<0xE5, "pmulhw", int_x86_mmx_pmulh_w,
+ MMX_PMUL_ITINS, 1>;
+defm MMX_PMULHUW : MMXI_binop_rm_int<0xE4, "pmulhuw", int_x86_mmx_pmulhu_w,
+ MMX_PMUL_ITINS, 1>;
+defm MMX_PMULUDQ : MMXI_binop_rm_int<0xF4, "pmuludq", int_x86_mmx_pmulu_dq,
+ MMX_PMUL_ITINS, 1>;
+let isCommutable = 1 in
+defm MMX_PMULHRSW : SS3I_binop_rm_int_mm<0x0B, "pmulhrsw",
+ int_x86_ssse3_pmul_hr_sw, MMX_PMUL_ITINS>;
+
+// -- Miscellanea
+defm MMX_PMADDWD : MMXI_binop_rm_int<0xF5, "pmaddwd", int_x86_mmx_pmadd_wd,
+ MMX_PMUL_ITINS, 1>;
+
+defm MMX_PMADDUBSW : SS3I_binop_rm_int_mm<0x04, "pmaddubsw",
+ int_x86_ssse3_pmadd_ub_sw, MMX_PMUL_ITINS>;
+defm MMX_PAVGB : MMXI_binop_rm_int<0xE0, "pavgb", int_x86_mmx_pavg_b,
+ MMX_MISC_FUNC_ITINS, 1>;
+defm MMX_PAVGW : MMXI_binop_rm_int<0xE3, "pavgw", int_x86_mmx_pavg_w,
+ MMX_MISC_FUNC_ITINS, 1>;
+
+defm MMX_PMINUB : MMXI_binop_rm_int<0xDA, "pminub", int_x86_mmx_pminu_b,
+ MMX_MISC_FUNC_ITINS, 1>;
+defm MMX_PMINSW : MMXI_binop_rm_int<0xEA, "pminsw", int_x86_mmx_pmins_w,
+ MMX_MISC_FUNC_ITINS, 1>;
+
+defm MMX_PMAXUB : MMXI_binop_rm_int<0xDE, "pmaxub", int_x86_mmx_pmaxu_b,
+ MMX_MISC_FUNC_ITINS, 1>;
+defm MMX_PMAXSW : MMXI_binop_rm_int<0xEE, "pmaxsw", int_x86_mmx_pmaxs_w,
+ MMX_MISC_FUNC_ITINS, 1>;
+
+defm MMX_PSADBW : MMXI_binop_rm_int<0xF6, "psadbw", int_x86_mmx_psad_bw,
+ MMX_PSADBW_ITINS, 1>;
+
+defm MMX_PSIGNB : SS3I_binop_rm_int_mm<0x08, "psignb", int_x86_ssse3_psign_b,
+ MMX_MISC_FUNC_ITINS>;
+defm MMX_PSIGNW : SS3I_binop_rm_int_mm<0x09, "psignw", int_x86_ssse3_psign_w,
+ MMX_MISC_FUNC_ITINS>;
+defm MMX_PSIGND : SS3I_binop_rm_int_mm<0x0A, "psignd", int_x86_ssse3_psign_d,
+ MMX_MISC_FUNC_ITINS>;
+let Constraints = "$src1 = $dst" in
+ defm MMX_PALIGN : ssse3_palign_mm<"palignr", int_x86_mmx_palignr_b>;
+
+// Logical Instructions
+defm MMX_PAND : MMXI_binop_rm_int<0xDB, "pand", int_x86_mmx_pand,
+ MMX_INTALU_ITINS, 1>;
+defm MMX_POR : MMXI_binop_rm_int<0xEB, "por" , int_x86_mmx_por,
+ MMX_INTALU_ITINS, 1>;
+defm MMX_PXOR : MMXI_binop_rm_int<0xEF, "pxor", int_x86_mmx_pxor,
+ MMX_INTALU_ITINS, 1>;
+defm MMX_PANDN : MMXI_binop_rm_int<0xDF, "pandn", int_x86_mmx_pandn,
+ MMX_INTALU_ITINS>;
+
+// Shift Instructions
+defm MMX_PSRLW : MMXI_binop_rmi_int<0xD1, 0x71, MRM2r, "psrlw",
+ int_x86_mmx_psrl_w, int_x86_mmx_psrli_w,
+ MMX_SHIFT_ITINS>;
+defm MMX_PSRLD : MMXI_binop_rmi_int<0xD2, 0x72, MRM2r, "psrld",
+ int_x86_mmx_psrl_d, int_x86_mmx_psrli_d,
+ MMX_SHIFT_ITINS>;
+defm MMX_PSRLQ : MMXI_binop_rmi_int<0xD3, 0x73, MRM2r, "psrlq",
+ int_x86_mmx_psrl_q, int_x86_mmx_psrli_q,
+ MMX_SHIFT_ITINS>;
+
+defm MMX_PSLLW : MMXI_binop_rmi_int<0xF1, 0x71, MRM6r, "psllw",
+ int_x86_mmx_psll_w, int_x86_mmx_pslli_w,
+ MMX_SHIFT_ITINS>;
+defm MMX_PSLLD : MMXI_binop_rmi_int<0xF2, 0x72, MRM6r, "pslld",
+ int_x86_mmx_psll_d, int_x86_mmx_pslli_d,
+ MMX_SHIFT_ITINS>;
+defm MMX_PSLLQ : MMXI_binop_rmi_int<0xF3, 0x73, MRM6r, "psllq",
+ int_x86_mmx_psll_q, int_x86_mmx_pslli_q,
+ MMX_SHIFT_ITINS>;
+
+defm MMX_PSRAW : MMXI_binop_rmi_int<0xE1, 0x71, MRM4r, "psraw",
+ int_x86_mmx_psra_w, int_x86_mmx_psrai_w,
+ MMX_SHIFT_ITINS>;
+defm MMX_PSRAD : MMXI_binop_rmi_int<0xE2, 0x72, MRM4r, "psrad",
+ int_x86_mmx_psra_d, int_x86_mmx_psrai_d,
+ MMX_SHIFT_ITINS>;
+
+// Comparison Instructions
+defm MMX_PCMPEQB : MMXI_binop_rm_int<0x74, "pcmpeqb", int_x86_mmx_pcmpeq_b,
+ MMX_INTALU_ITINS>;
+defm MMX_PCMPEQW : MMXI_binop_rm_int<0x75, "pcmpeqw", int_x86_mmx_pcmpeq_w,
+ MMX_INTALU_ITINS>;
+defm MMX_PCMPEQD : MMXI_binop_rm_int<0x76, "pcmpeqd", int_x86_mmx_pcmpeq_d,
+ MMX_INTALU_ITINS>;
+
+defm MMX_PCMPGTB : MMXI_binop_rm_int<0x64, "pcmpgtb", int_x86_mmx_pcmpgt_b,
+ MMX_INTALU_ITINS>;
+defm MMX_PCMPGTW : MMXI_binop_rm_int<0x65, "pcmpgtw", int_x86_mmx_pcmpgt_w,
+ MMX_INTALU_ITINS>;
+defm MMX_PCMPGTD : MMXI_binop_rm_int<0x66, "pcmpgtd", int_x86_mmx_pcmpgt_d,
+ MMX_INTALU_ITINS>;
+
+// -- Unpack Instructions
+defm MMX_PUNPCKHBW : MMXI_binop_rm_int<0x68, "punpckhbw",
+ int_x86_mmx_punpckhbw,
+ MMX_UNPCK_H_ITINS>;
+defm MMX_PUNPCKHWD : MMXI_binop_rm_int<0x69, "punpckhwd",
+ int_x86_mmx_punpckhwd,
+ MMX_UNPCK_H_ITINS>;
+defm MMX_PUNPCKHDQ : MMXI_binop_rm_int<0x6A, "punpckhdq",
+ int_x86_mmx_punpckhdq,
+ MMX_UNPCK_H_ITINS>;
+defm MMX_PUNPCKLBW : MMXI_binop_rm_int<0x60, "punpcklbw",
+ int_x86_mmx_punpcklbw,
+ MMX_UNPCK_L_ITINS>;
+defm MMX_PUNPCKLWD : MMXI_binop_rm_int<0x61, "punpcklwd",
+ int_x86_mmx_punpcklwd,
+ MMX_UNPCK_L_ITINS>;
+defm MMX_PUNPCKLDQ : MMXI_binop_rm_int<0x62, "punpckldq",
+ int_x86_mmx_punpckldq,
+ MMX_UNPCK_L_ITINS>;
+
+// -- Pack Instructions
+defm MMX_PACKSSWB : MMXI_binop_rm_int<0x63, "packsswb", int_x86_mmx_packsswb,
+ MMX_PCK_ITINS>;
+defm MMX_PACKSSDW : MMXI_binop_rm_int<0x6B, "packssdw", int_x86_mmx_packssdw,
+ MMX_PCK_ITINS>;
+defm MMX_PACKUSWB : MMXI_binop_rm_int<0x67, "packuswb", int_x86_mmx_packuswb,
+ MMX_PCK_ITINS>;
+
+// -- Shuffle Instructions
+defm MMX_PSHUFB : SS3I_binop_rm_int_mm<0x00, "pshufb", int_x86_ssse3_pshuf_b,
+ MMX_PSHUF_ITINS>;
+
+def MMX_PSHUFWri : MMXIi8<0x70, MRMSrcReg,
+ (outs VR64:$dst), (ins VR64:$src1, i8imm:$src2),
+ "pshufw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR64:$dst,
+ (int_x86_sse_pshuf_w VR64:$src1, imm:$src2))],
+ IIC_MMX_PSHUF>, Sched<[WriteShuffle]>;
+def MMX_PSHUFWmi : MMXIi8<0x70, MRMSrcMem,
+ (outs VR64:$dst), (ins i64mem:$src1, i8imm:$src2),
+ "pshufw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR64:$dst,
+ (int_x86_sse_pshuf_w (load_mmx addr:$src1),
+ imm:$src2))],
+ IIC_MMX_PSHUF>, Sched<[WriteShuffleLd]>;
+
+
+
+
+// -- Conversion Instructions
+defm MMX_CVTPS2PI : sse12_cvt_pint<0x2D, VR128, VR64, int_x86_sse_cvtps2pi,
+ f64mem, load, "cvtps2pi\t{$src, $dst|$dst, $src}",
+ MMX_CVT_PS_ITINS, SSEPackedSingle>, TB;
+defm MMX_CVTPD2PI : sse12_cvt_pint<0x2D, VR128, VR64, int_x86_sse_cvtpd2pi,
+ f128mem, memop, "cvtpd2pi\t{$src, $dst|$dst, $src}",
+ MMX_CVT_PD_ITINS, SSEPackedDouble>, TB, OpSize;
+defm MMX_CVTTPS2PI : sse12_cvt_pint<0x2C, VR128, VR64, int_x86_sse_cvttps2pi,
+ f64mem, load, "cvttps2pi\t{$src, $dst|$dst, $src}",
+ MMX_CVT_PS_ITINS, SSEPackedSingle>, TB;
+defm MMX_CVTTPD2PI : sse12_cvt_pint<0x2C, VR128, VR64, int_x86_sse_cvttpd2pi,
+ f128mem, memop, "cvttpd2pi\t{$src, $dst|$dst, $src}",
+ MMX_CVT_PD_ITINS, SSEPackedDouble>, TB, OpSize;
+defm MMX_CVTPI2PD : sse12_cvt_pint<0x2A, VR64, VR128, int_x86_sse_cvtpi2pd,
+ i64mem, load, "cvtpi2pd\t{$src, $dst|$dst, $src}",
+ MMX_CVT_PD_ITINS, SSEPackedDouble>, TB, OpSize;
+let Constraints = "$src1 = $dst" in {
+ defm MMX_CVTPI2PS : sse12_cvt_pint_3addr<0x2A, VR64, VR128,
+ int_x86_sse_cvtpi2ps,
+ i64mem, load, "cvtpi2ps\t{$src2, $dst|$dst, $src2}",
+ SSEPackedSingle>, TB;
+}
+
+// Extract / Insert
+def MMX_PEXTRWirri: MMXIi8<0xC5, MRMSrcReg,
+ (outs GR32:$dst), (ins VR64:$src1, i32i8imm:$src2),
+ "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR32:$dst, (int_x86_mmx_pextr_w VR64:$src1,
+ (iPTR imm:$src2)))],
+ IIC_MMX_PEXTR>, Sched<[WriteShuffle]>;
+let Constraints = "$src1 = $dst" in {
+ def MMX_PINSRWirri : MMXIi8<0xC4, MRMSrcReg,
+ (outs VR64:$dst),
+ (ins VR64:$src1, GR32:$src2, i32i8imm:$src3),
+ "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set VR64:$dst, (int_x86_mmx_pinsr_w VR64:$src1,
+ GR32:$src2, (iPTR imm:$src3)))],
+ IIC_MMX_PINSRW>, Sched<[WriteShuffle]>;
+
+ def MMX_PINSRWirmi : MMXIi8<0xC4, MRMSrcMem,
+ (outs VR64:$dst),
+ (ins VR64:$src1, i16mem:$src2, i32i8imm:$src3),
+ "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set VR64:$dst, (int_x86_mmx_pinsr_w VR64:$src1,
+ (i32 (anyext (loadi16 addr:$src2))),
+ (iPTR imm:$src3)))],
+ IIC_MMX_PINSRW>, Sched<[WriteShuffleLd, ReadAfterLd]>;
+}
+
+// Mask creation
+def MMX_PMOVMSKBrr : MMXI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR64:$src),
+ "pmovmskb\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst,
+ (int_x86_mmx_pmovmskb VR64:$src))]>;
+
+
+// Low word of XMM to MMX.
+def MMX_X86movdq2q : SDNode<"X86ISD::MOVDQ2Q", SDTypeProfile<1, 1,
+ [SDTCisVT<0, x86mmx>, SDTCisVT<1, v2i64>]>>;
+
+def : Pat<(x86mmx (MMX_X86movdq2q VR128:$src)),
+ (x86mmx (MMX_MOVDQ2Qrr VR128:$src))>;
+
+def : Pat<(x86mmx (MMX_X86movdq2q (loadv2i64 addr:$src))),
+ (x86mmx (MMX_MOVQ64rm addr:$src))>;
+
+// Misc.
+let SchedRW = [WriteShuffle] in {
+let Uses = [EDI] in
+def MMX_MASKMOVQ : MMXI<0xF7, MRMSrcReg, (outs), (ins VR64:$src, VR64:$mask),
+ "maskmovq\t{$mask, $src|$src, $mask}",
+ [(int_x86_mmx_maskmovq VR64:$src, VR64:$mask, EDI)],
+ IIC_MMX_MASKMOV>;
+let Uses = [RDI] in
+def MMX_MASKMOVQ64: MMXI64<0xF7, MRMSrcReg, (outs), (ins VR64:$src, VR64:$mask),
+ "maskmovq\t{$mask, $src|$src, $mask}",
+ [(int_x86_mmx_maskmovq VR64:$src, VR64:$mask, RDI)],
+ IIC_MMX_MASKMOV>;
+}
+
+// 64-bit bit convert.
+let Predicates = [HasSSE2] in {
+def : Pat<(x86mmx (bitconvert (i64 GR64:$src))),
+ (MMX_MOVD64to64rr GR64:$src)>;
+def : Pat<(i64 (bitconvert (x86mmx VR64:$src))),
+ (MMX_MOVD64from64rr VR64:$src)>;
+def : Pat<(f64 (bitconvert (x86mmx VR64:$src))),
+ (MMX_MOVQ2FR64rr VR64:$src)>;
+def : Pat<(x86mmx (bitconvert (f64 FR64:$src))),
+ (MMX_MOVFR642Qrr FR64:$src)>;
+}
+
+
diff --git a/contrib/llvm/lib/Target/X86/X86InstrSSE.td b/contrib/llvm/lib/Target/X86/X86InstrSSE.td
new file mode 100644
index 0000000..cce938b
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrSSE.td
@@ -0,0 +1,8379 @@
+//===-- X86InstrSSE.td - SSE Instruction Set ---------------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 SSE instruction set, defining the instructions,
+// and properties of the instructions which are needed for code generation,
+// machine code emission, and analysis.
+//
+//===----------------------------------------------------------------------===//
+
+class OpndItins<InstrItinClass arg_rr, InstrItinClass arg_rm> {
+ InstrItinClass rr = arg_rr;
+ InstrItinClass rm = arg_rm;
+ // InstrSchedModel info.
+ X86FoldableSchedWrite Sched = WriteFAdd;
+}
+
+class SizeItins<OpndItins arg_s, OpndItins arg_d> {
+ OpndItins s = arg_s;
+ OpndItins d = arg_d;
+}
+
+
+class ShiftOpndItins<InstrItinClass arg_rr, InstrItinClass arg_rm,
+ InstrItinClass arg_ri> {
+ InstrItinClass rr = arg_rr;
+ InstrItinClass rm = arg_rm;
+ InstrItinClass ri = arg_ri;
+}
+
+
+// scalar
+let Sched = WriteFAdd in {
+def SSE_ALU_F32S : OpndItins<
+ IIC_SSE_ALU_F32S_RR, IIC_SSE_ALU_F32S_RM
+>;
+
+def SSE_ALU_F64S : OpndItins<
+ IIC_SSE_ALU_F64S_RR, IIC_SSE_ALU_F64S_RM
+>;
+}
+
+def SSE_ALU_ITINS_S : SizeItins<
+ SSE_ALU_F32S, SSE_ALU_F64S
+>;
+
+let Sched = WriteFMul in {
+def SSE_MUL_F32S : OpndItins<
+ IIC_SSE_MUL_F32S_RR, IIC_SSE_MUL_F64S_RM
+>;
+
+def SSE_MUL_F64S : OpndItins<
+ IIC_SSE_MUL_F64S_RR, IIC_SSE_MUL_F64S_RM
+>;
+}
+
+def SSE_MUL_ITINS_S : SizeItins<
+ SSE_MUL_F32S, SSE_MUL_F64S
+>;
+
+let Sched = WriteFDiv in {
+def SSE_DIV_F32S : OpndItins<
+ IIC_SSE_DIV_F32S_RR, IIC_SSE_DIV_F64S_RM
+>;
+
+def SSE_DIV_F64S : OpndItins<
+ IIC_SSE_DIV_F64S_RR, IIC_SSE_DIV_F64S_RM
+>;
+}
+
+def SSE_DIV_ITINS_S : SizeItins<
+ SSE_DIV_F32S, SSE_DIV_F64S
+>;
+
+// parallel
+let Sched = WriteFAdd in {
+def SSE_ALU_F32P : OpndItins<
+ IIC_SSE_ALU_F32P_RR, IIC_SSE_ALU_F32P_RM
+>;
+
+def SSE_ALU_F64P : OpndItins<
+ IIC_SSE_ALU_F64P_RR, IIC_SSE_ALU_F64P_RM
+>;
+}
+
+def SSE_ALU_ITINS_P : SizeItins<
+ SSE_ALU_F32P, SSE_ALU_F64P
+>;
+
+let Sched = WriteFMul in {
+def SSE_MUL_F32P : OpndItins<
+ IIC_SSE_MUL_F32P_RR, IIC_SSE_MUL_F64P_RM
+>;
+
+def SSE_MUL_F64P : OpndItins<
+ IIC_SSE_MUL_F64P_RR, IIC_SSE_MUL_F64P_RM
+>;
+}
+
+def SSE_MUL_ITINS_P : SizeItins<
+ SSE_MUL_F32P, SSE_MUL_F64P
+>;
+
+let Sched = WriteFDiv in {
+def SSE_DIV_F32P : OpndItins<
+ IIC_SSE_DIV_F32P_RR, IIC_SSE_DIV_F64P_RM
+>;
+
+def SSE_DIV_F64P : OpndItins<
+ IIC_SSE_DIV_F64P_RR, IIC_SSE_DIV_F64P_RM
+>;
+}
+
+def SSE_DIV_ITINS_P : SizeItins<
+ SSE_DIV_F32P, SSE_DIV_F64P
+>;
+
+def SSE_BIT_ITINS_P : OpndItins<
+ IIC_SSE_BIT_P_RR, IIC_SSE_BIT_P_RM
+>;
+
+let Sched = WriteVecALU in {
+def SSE_INTALU_ITINS_P : OpndItins<
+ IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM
+>;
+
+def SSE_INTALUQ_ITINS_P : OpndItins<
+ IIC_SSE_INTALUQ_P_RR, IIC_SSE_INTALUQ_P_RM
+>;
+}
+
+let Sched = WriteVecIMul in
+def SSE_INTMUL_ITINS_P : OpndItins<
+ IIC_SSE_INTMUL_P_RR, IIC_SSE_INTMUL_P_RM
+>;
+
+def SSE_INTSHIFT_ITINS_P : ShiftOpndItins<
+ IIC_SSE_INTSH_P_RR, IIC_SSE_INTSH_P_RM, IIC_SSE_INTSH_P_RI
+>;
+
+def SSE_MOVA_ITINS : OpndItins<
+ IIC_SSE_MOVA_P_RR, IIC_SSE_MOVA_P_RM
+>;
+
+def SSE_MOVU_ITINS : OpndItins<
+ IIC_SSE_MOVU_P_RR, IIC_SSE_MOVU_P_RM
+>;
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 Instructions Classes
+//===----------------------------------------------------------------------===//
+
+/// sse12_fp_scalar - SSE 1 & 2 scalar instructions class
+multiclass sse12_fp_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ RegisterClass RC, X86MemOperand x86memop,
+ OpndItins itins,
+ bit Is2Addr = 1> {
+ let isCommutable = 1 in {
+ def rr : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (OpNode RC:$src1, RC:$src2))], itins.rr>,
+ Sched<[itins.Sched]>;
+ }
+ def rm : SI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (OpNode RC:$src1, (load addr:$src2)))], itins.rm>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+
+/// sse12_fp_scalar_int - SSE 1 & 2 scalar instructions intrinsics class
+multiclass sse12_fp_scalar_int<bits<8> opc, string OpcodeStr, RegisterClass RC,
+ string asm, string SSEVer, string FPSizeStr,
+ Operand memopr, ComplexPattern mem_cpat,
+ OpndItins itins,
+ bit Is2Addr = 1> {
+ def rr_Int : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+ !if(Is2Addr,
+ !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (!cast<Intrinsic>(
+ !strconcat("int_x86_sse", SSEVer, "_", OpcodeStr, FPSizeStr))
+ RC:$src1, RC:$src2))], itins.rr>,
+ Sched<[itins.Sched]>;
+ def rm_Int : SI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, memopr:$src2),
+ !if(Is2Addr,
+ !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (!cast<Intrinsic>(!strconcat("int_x86_sse",
+ SSEVer, "_", OpcodeStr, FPSizeStr))
+ RC:$src1, mem_cpat:$src2))], itins.rm>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+
+/// sse12_fp_packed - SSE 1 & 2 packed instructions class
+multiclass sse12_fp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ RegisterClass RC, ValueType vt,
+ X86MemOperand x86memop, PatFrag mem_frag,
+ Domain d, OpndItins itins, bit Is2Addr = 1> {
+ let isCommutable = 1 in
+ def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], itins.rr, d>,
+ Sched<[itins.Sched]>;
+ let mayLoad = 1 in
+ def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (OpNode RC:$src1, (mem_frag addr:$src2)))],
+ itins.rm, d>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+
+/// sse12_fp_packed_logical_rm - SSE 1 & 2 packed instructions class
+multiclass sse12_fp_packed_logical_rm<bits<8> opc, RegisterClass RC, Domain d,
+ string OpcodeStr, X86MemOperand x86memop,
+ list<dag> pat_rr, list<dag> pat_rm,
+ bit Is2Addr = 1> {
+ let isCommutable = 1, hasSideEffects = 0 in
+ def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ pat_rr, NoItinerary, d>,
+ Sched<[WriteVecLogic]>;
+ def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ pat_rm, NoItinerary, d>,
+ Sched<[WriteVecLogicLd, ReadAfterLd]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Non-instruction patterns
+//===----------------------------------------------------------------------===//
+
+// A vector extract of the first f32/f64 position is a subregister copy
+def : Pat<(f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))),
+ (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32)>;
+def : Pat<(f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))),
+ (COPY_TO_REGCLASS (v2f64 VR128:$src), FR64)>;
+
+// A 128-bit subvector extract from the first 256-bit vector position
+// is a subregister copy that needs no instruction.
+def : Pat<(v4i32 (extract_subvector (v8i32 VR256:$src), (iPTR 0))),
+ (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm))>;
+def : Pat<(v4f32 (extract_subvector (v8f32 VR256:$src), (iPTR 0))),
+ (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm))>;
+
+def : Pat<(v2i64 (extract_subvector (v4i64 VR256:$src), (iPTR 0))),
+ (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm))>;
+def : Pat<(v2f64 (extract_subvector (v4f64 VR256:$src), (iPTR 0))),
+ (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm))>;
+
+def : Pat<(v8i16 (extract_subvector (v16i16 VR256:$src), (iPTR 0))),
+ (v8i16 (EXTRACT_SUBREG (v16i16 VR256:$src), sub_xmm))>;
+def : Pat<(v16i8 (extract_subvector (v32i8 VR256:$src), (iPTR 0))),
+ (v16i8 (EXTRACT_SUBREG (v32i8 VR256:$src), sub_xmm))>;
+
+// A 128-bit subvector insert to the first 256-bit vector position
+// is a subregister copy that needs no instruction.
+let AddedComplexity = 25 in { // to give priority over vinsertf128rm
+def : Pat<(insert_subvector undef, (v2i64 VR128:$src), (iPTR 0)),
+ (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
+def : Pat<(insert_subvector undef, (v2f64 VR128:$src), (iPTR 0)),
+ (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
+def : Pat<(insert_subvector undef, (v4i32 VR128:$src), (iPTR 0)),
+ (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
+def : Pat<(insert_subvector undef, (v4f32 VR128:$src), (iPTR 0)),
+ (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
+def : Pat<(insert_subvector undef, (v8i16 VR128:$src), (iPTR 0)),
+ (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
+def : Pat<(insert_subvector undef, (v16i8 VR128:$src), (iPTR 0)),
+ (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
+}
+
+// Implicitly promote a 32-bit scalar to a vector.
+def : Pat<(v4f32 (scalar_to_vector FR32:$src)),
+ (COPY_TO_REGCLASS FR32:$src, VR128)>;
+def : Pat<(v8f32 (scalar_to_vector FR32:$src)),
+ (COPY_TO_REGCLASS FR32:$src, VR128)>;
+// Implicitly promote a 64-bit scalar to a vector.
+def : Pat<(v2f64 (scalar_to_vector FR64:$src)),
+ (COPY_TO_REGCLASS FR64:$src, VR128)>;
+def : Pat<(v4f64 (scalar_to_vector FR64:$src)),
+ (COPY_TO_REGCLASS FR64:$src, VR128)>;
+
+// Bitcasts between 128-bit vector types. Return the original type since
+// no instruction is needed for the conversion
+let Predicates = [HasSSE2] in {
+ def : Pat<(v2i64 (bitconvert (v4i32 VR128:$src))), (v2i64 VR128:$src)>;
+ def : Pat<(v2i64 (bitconvert (v8i16 VR128:$src))), (v2i64 VR128:$src)>;
+ def : Pat<(v2i64 (bitconvert (v16i8 VR128:$src))), (v2i64 VR128:$src)>;
+ def : Pat<(v2i64 (bitconvert (v2f64 VR128:$src))), (v2i64 VR128:$src)>;
+ def : Pat<(v2i64 (bitconvert (v4f32 VR128:$src))), (v2i64 VR128:$src)>;
+ def : Pat<(v4i32 (bitconvert (v2i64 VR128:$src))), (v4i32 VR128:$src)>;
+ def : Pat<(v4i32 (bitconvert (v8i16 VR128:$src))), (v4i32 VR128:$src)>;
+ def : Pat<(v4i32 (bitconvert (v16i8 VR128:$src))), (v4i32 VR128:$src)>;
+ def : Pat<(v4i32 (bitconvert (v2f64 VR128:$src))), (v4i32 VR128:$src)>;
+ def : Pat<(v4i32 (bitconvert (v4f32 VR128:$src))), (v4i32 VR128:$src)>;
+ def : Pat<(v8i16 (bitconvert (v2i64 VR128:$src))), (v8i16 VR128:$src)>;
+ def : Pat<(v8i16 (bitconvert (v4i32 VR128:$src))), (v8i16 VR128:$src)>;
+ def : Pat<(v8i16 (bitconvert (v16i8 VR128:$src))), (v8i16 VR128:$src)>;
+ def : Pat<(v8i16 (bitconvert (v2f64 VR128:$src))), (v8i16 VR128:$src)>;
+ def : Pat<(v8i16 (bitconvert (v4f32 VR128:$src))), (v8i16 VR128:$src)>;
+ def : Pat<(v16i8 (bitconvert (v2i64 VR128:$src))), (v16i8 VR128:$src)>;
+ def : Pat<(v16i8 (bitconvert (v4i32 VR128:$src))), (v16i8 VR128:$src)>;
+ def : Pat<(v16i8 (bitconvert (v8i16 VR128:$src))), (v16i8 VR128:$src)>;
+ def : Pat<(v16i8 (bitconvert (v2f64 VR128:$src))), (v16i8 VR128:$src)>;
+ def : Pat<(v16i8 (bitconvert (v4f32 VR128:$src))), (v16i8 VR128:$src)>;
+ def : Pat<(v4f32 (bitconvert (v2i64 VR128:$src))), (v4f32 VR128:$src)>;
+ def : Pat<(v4f32 (bitconvert (v4i32 VR128:$src))), (v4f32 VR128:$src)>;
+ def : Pat<(v4f32 (bitconvert (v8i16 VR128:$src))), (v4f32 VR128:$src)>;
+ def : Pat<(v4f32 (bitconvert (v16i8 VR128:$src))), (v4f32 VR128:$src)>;
+ def : Pat<(v4f32 (bitconvert (v2f64 VR128:$src))), (v4f32 VR128:$src)>;
+ def : Pat<(v2f64 (bitconvert (v2i64 VR128:$src))), (v2f64 VR128:$src)>;
+ def : Pat<(v2f64 (bitconvert (v4i32 VR128:$src))), (v2f64 VR128:$src)>;
+ def : Pat<(v2f64 (bitconvert (v8i16 VR128:$src))), (v2f64 VR128:$src)>;
+ def : Pat<(v2f64 (bitconvert (v16i8 VR128:$src))), (v2f64 VR128:$src)>;
+ def : Pat<(v2f64 (bitconvert (v4f32 VR128:$src))), (v2f64 VR128:$src)>;
+}
+
+// Bitcasts between 256-bit vector types. Return the original type since
+// no instruction is needed for the conversion
+let Predicates = [HasAVX] in {
+ def : Pat<(v4f64 (bitconvert (v8f32 VR256:$src))), (v4f64 VR256:$src)>;
+ def : Pat<(v4f64 (bitconvert (v8i32 VR256:$src))), (v4f64 VR256:$src)>;
+ def : Pat<(v4f64 (bitconvert (v4i64 VR256:$src))), (v4f64 VR256:$src)>;
+ def : Pat<(v4f64 (bitconvert (v16i16 VR256:$src))), (v4f64 VR256:$src)>;
+ def : Pat<(v4f64 (bitconvert (v32i8 VR256:$src))), (v4f64 VR256:$src)>;
+ def : Pat<(v8f32 (bitconvert (v8i32 VR256:$src))), (v8f32 VR256:$src)>;
+ def : Pat<(v8f32 (bitconvert (v4i64 VR256:$src))), (v8f32 VR256:$src)>;
+ def : Pat<(v8f32 (bitconvert (v4f64 VR256:$src))), (v8f32 VR256:$src)>;
+ def : Pat<(v8f32 (bitconvert (v32i8 VR256:$src))), (v8f32 VR256:$src)>;
+ def : Pat<(v8f32 (bitconvert (v16i16 VR256:$src))), (v8f32 VR256:$src)>;
+ def : Pat<(v4i64 (bitconvert (v8f32 VR256:$src))), (v4i64 VR256:$src)>;
+ def : Pat<(v4i64 (bitconvert (v8i32 VR256:$src))), (v4i64 VR256:$src)>;
+ def : Pat<(v4i64 (bitconvert (v4f64 VR256:$src))), (v4i64 VR256:$src)>;
+ def : Pat<(v4i64 (bitconvert (v32i8 VR256:$src))), (v4i64 VR256:$src)>;
+ def : Pat<(v4i64 (bitconvert (v16i16 VR256:$src))), (v4i64 VR256:$src)>;
+ def : Pat<(v32i8 (bitconvert (v4f64 VR256:$src))), (v32i8 VR256:$src)>;
+ def : Pat<(v32i8 (bitconvert (v4i64 VR256:$src))), (v32i8 VR256:$src)>;
+ def : Pat<(v32i8 (bitconvert (v8f32 VR256:$src))), (v32i8 VR256:$src)>;
+ def : Pat<(v32i8 (bitconvert (v8i32 VR256:$src))), (v32i8 VR256:$src)>;
+ def : Pat<(v32i8 (bitconvert (v16i16 VR256:$src))), (v32i8 VR256:$src)>;
+ def : Pat<(v8i32 (bitconvert (v32i8 VR256:$src))), (v8i32 VR256:$src)>;
+ def : Pat<(v8i32 (bitconvert (v16i16 VR256:$src))), (v8i32 VR256:$src)>;
+ def : Pat<(v8i32 (bitconvert (v8f32 VR256:$src))), (v8i32 VR256:$src)>;
+ def : Pat<(v8i32 (bitconvert (v4i64 VR256:$src))), (v8i32 VR256:$src)>;
+ def : Pat<(v8i32 (bitconvert (v4f64 VR256:$src))), (v8i32 VR256:$src)>;
+ def : Pat<(v16i16 (bitconvert (v8f32 VR256:$src))), (v16i16 VR256:$src)>;
+ def : Pat<(v16i16 (bitconvert (v8i32 VR256:$src))), (v16i16 VR256:$src)>;
+ def : Pat<(v16i16 (bitconvert (v4i64 VR256:$src))), (v16i16 VR256:$src)>;
+ def : Pat<(v16i16 (bitconvert (v4f64 VR256:$src))), (v16i16 VR256:$src)>;
+ def : Pat<(v16i16 (bitconvert (v32i8 VR256:$src))), (v16i16 VR256:$src)>;
+}
+
+// Alias instructions that map fld0 to xorps for sse or vxorps for avx.
+// This is expanded by ExpandPostRAPseudos.
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
+ isPseudo = 1, SchedRW = [WriteZero] in {
+ def FsFLD0SS : I<0, Pseudo, (outs FR32:$dst), (ins), "",
+ [(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1]>;
+ def FsFLD0SD : I<0, Pseudo, (outs FR64:$dst), (ins), "",
+ [(set FR64:$dst, fpimm0)]>, Requires<[HasSSE2]>;
+}
+
+//===----------------------------------------------------------------------===//
+// AVX & SSE - Zero/One Vectors
+//===----------------------------------------------------------------------===//
+
+// Alias instruction that maps zero vector to pxor / xorp* for sse.
+// This is expanded by ExpandPostRAPseudos to an xorps / vxorps, and then
+// swizzled by ExecutionDepsFix to pxor.
+// We set canFoldAsLoad because this can be converted to a constant-pool
+// load of an all-zeros value if folding it would be beneficial.
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
+ isPseudo = 1, SchedRW = [WriteZero] in {
+def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
+ [(set VR128:$dst, (v4f32 immAllZerosV))]>;
+}
+
+def : Pat<(v2f64 immAllZerosV), (V_SET0)>;
+def : Pat<(v4i32 immAllZerosV), (V_SET0)>;
+def : Pat<(v2i64 immAllZerosV), (V_SET0)>;
+def : Pat<(v8i16 immAllZerosV), (V_SET0)>;
+def : Pat<(v16i8 immAllZerosV), (V_SET0)>;
+
+
+// The same as done above but for AVX. The 256-bit AVX1 ISA doesn't support PI,
+// and doesn't need it because on sandy bridge the register is set to zero
+// at the rename stage without using any execution unit, so SET0PSY
+// and SET0PDY can be used for vector int instructions without penalty
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
+ isPseudo = 1, Predicates = [HasAVX], SchedRW = [WriteZero] in {
+def AVX_SET0 : I<0, Pseudo, (outs VR256:$dst), (ins), "",
+ [(set VR256:$dst, (v8f32 immAllZerosV))]>;
+}
+
+let Predicates = [HasAVX] in
+ def : Pat<(v4f64 immAllZerosV), (AVX_SET0)>;
+
+let Predicates = [HasAVX2] in {
+ def : Pat<(v4i64 immAllZerosV), (AVX_SET0)>;
+ def : Pat<(v8i32 immAllZerosV), (AVX_SET0)>;
+ def : Pat<(v16i16 immAllZerosV), (AVX_SET0)>;
+ def : Pat<(v32i8 immAllZerosV), (AVX_SET0)>;
+}
+
+// AVX1 has no support for 256-bit integer instructions, but since the 128-bit
+// VPXOR instruction writes zero to its upper part, it's safe build zeros.
+let Predicates = [HasAVX1Only] in {
+def : Pat<(v32i8 immAllZerosV), (SUBREG_TO_REG (i8 0), (V_SET0), sub_xmm)>;
+def : Pat<(bc_v32i8 (v8f32 immAllZerosV)),
+ (SUBREG_TO_REG (i8 0), (V_SET0), sub_xmm)>;
+
+def : Pat<(v16i16 immAllZerosV), (SUBREG_TO_REG (i16 0), (V_SET0), sub_xmm)>;
+def : Pat<(bc_v16i16 (v8f32 immAllZerosV)),
+ (SUBREG_TO_REG (i16 0), (V_SET0), sub_xmm)>;
+
+def : Pat<(v8i32 immAllZerosV), (SUBREG_TO_REG (i32 0), (V_SET0), sub_xmm)>;
+def : Pat<(bc_v8i32 (v8f32 immAllZerosV)),
+ (SUBREG_TO_REG (i32 0), (V_SET0), sub_xmm)>;
+
+def : Pat<(v4i64 immAllZerosV), (SUBREG_TO_REG (i64 0), (V_SET0), sub_xmm)>;
+def : Pat<(bc_v4i64 (v8f32 immAllZerosV)),
+ (SUBREG_TO_REG (i64 0), (V_SET0), sub_xmm)>;
+}
+
+// We set canFoldAsLoad because this can be converted to a constant-pool
+// load of an all-ones value if folding it would be beneficial.
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
+ isPseudo = 1, SchedRW = [WriteZero] in {
+ def V_SETALLONES : I<0, Pseudo, (outs VR128:$dst), (ins), "",
+ [(set VR128:$dst, (v4i32 immAllOnesV))]>;
+ let Predicates = [HasAVX2] in
+ def AVX2_SETALLONES : I<0, Pseudo, (outs VR256:$dst), (ins), "",
+ [(set VR256:$dst, (v8i32 immAllOnesV))]>;
+}
+
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Move FP Scalar Instructions
+//
+// Move Instructions. Register-to-register movss/movsd is not used for FR32/64
+// register copies because it's a partial register update; FsMOVAPSrr/FsMOVAPDrr
+// is used instead. Register-to-register movss/movsd is not modeled as an
+// INSERT_SUBREG because INSERT_SUBREG requires that the insert be implementable
+// in terms of a copy, and just mentioned, we don't use movss/movsd for copies.
+//===----------------------------------------------------------------------===//
+
+multiclass sse12_move_rr<RegisterClass RC, SDNode OpNode, ValueType vt,
+ X86MemOperand x86memop, string base_opc,
+ string asm_opr> {
+ def rr : SI<0x10, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, RC:$src2),
+ !strconcat(base_opc, asm_opr),
+ [(set VR128:$dst, (vt (OpNode VR128:$src1,
+ (scalar_to_vector RC:$src2))))],
+ IIC_SSE_MOV_S_RR>, Sched<[WriteMove]>;
+
+ // For the disassembler
+ let isCodeGenOnly = 1, hasSideEffects = 0 in
+ def rr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst),
+ (ins VR128:$src1, RC:$src2),
+ !strconcat(base_opc, asm_opr),
+ [], IIC_SSE_MOV_S_RR>, Sched<[WriteMove]>;
+}
+
+multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt,
+ X86MemOperand x86memop, string OpcodeStr> {
+ // AVX
+ defm V#NAME : sse12_move_rr<RC, OpNode, vt, x86memop, OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}">,
+ VEX_4V, VEX_LIG;
+
+ def V#NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR>,
+ VEX, VEX_LIG, Sched<[WriteStore]>;
+ // SSE1 & 2
+ let Constraints = "$src1 = $dst" in {
+ defm NAME : sse12_move_rr<RC, OpNode, vt, x86memop, OpcodeStr,
+ "\t{$src2, $dst|$dst, $src2}">;
+ }
+
+ def NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR>,
+ Sched<[WriteStore]>;
+}
+
+// Loading from memory automatically zeroing upper bits.
+multiclass sse12_move_rm<RegisterClass RC, X86MemOperand x86memop,
+ PatFrag mem_pat, string OpcodeStr> {
+ def V#NAME#rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set RC:$dst, (mem_pat addr:$src))],
+ IIC_SSE_MOV_S_RM>, VEX, VEX_LIG, Sched<[WriteLoad]>;
+ def NAME#rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set RC:$dst, (mem_pat addr:$src))],
+ IIC_SSE_MOV_S_RM>, Sched<[WriteLoad]>;
+}
+
+defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss">, XS;
+defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd">, XD;
+
+let canFoldAsLoad = 1, isReMaterializable = 1 in {
+ defm MOVSS : sse12_move_rm<FR32, f32mem, loadf32, "movss">, XS;
+
+ let AddedComplexity = 20 in
+ defm MOVSD : sse12_move_rm<FR64, f64mem, loadf64, "movsd">, XD;
+}
+
+// Patterns
+let Predicates = [HasAVX] in {
+ let AddedComplexity = 15 in {
+ // Move scalar to XMM zero-extended, zeroing a VR128 then do a
+ // MOVS{S,D} to the lower bits.
+ def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))),
+ (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)>;
+ def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
+ (VMOVSSrr (v4f32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>;
+ def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
+ (VMOVSSrr (v4i32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>;
+ def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))),
+ (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)>;
+
+ // Move low f32 and clear high bits.
+ def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (VMOVSSrr (v4f32 (V_SET0)),
+ (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)), sub_xmm)>;
+ def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (VMOVSSrr (v4i32 (V_SET0)),
+ (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)), sub_xmm)>;
+ }
+
+ let AddedComplexity = 20 in {
+ // MOVSSrm zeros the high parts of the register; represent this
+ // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
+ def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
+ (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
+ def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
+ (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
+ def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
+ (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
+
+ // MOVSDrm zeros the high parts of the register; represent this
+ // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
+ def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
+ (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
+ def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
+ (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
+ def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
+ (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
+ def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
+ (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
+ def : Pat<(v2f64 (X86vzload addr:$src)),
+ (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
+
+ // Represent the same patterns above but in the form they appear for
+ // 256-bit types
+ def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
+ (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))),
+ (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
+ def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
+ (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))),
+ (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
+ def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
+ (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
+ (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
+ }
+ def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
+ (v4f32 (scalar_to_vector FR32:$src)), (iPTR 0)))),
+ (SUBREG_TO_REG (i32 0),
+ (v4f32 (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)),
+ sub_xmm)>;
+ def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
+ (v2f64 (scalar_to_vector FR64:$src)), (iPTR 0)))),
+ (SUBREG_TO_REG (i64 0),
+ (v2f64 (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)),
+ sub_xmm)>;
+ def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
+ (v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))),
+ (SUBREG_TO_REG (i64 0), (VMOVSDrm addr:$src), sub_xmm)>;
+
+ // Move low f64 and clear high bits.
+ def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (VMOVSDrr (v2f64 (V_SET0)),
+ (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)), sub_xmm)>;
+
+ def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (VMOVSDrr (v2i64 (V_SET0)),
+ (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)), sub_xmm)>;
+
+ // Extract and store.
+ def : Pat<(store (f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))),
+ addr:$dst),
+ (VMOVSSmr addr:$dst, (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32))>;
+ def : Pat<(store (f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))),
+ addr:$dst),
+ (VMOVSDmr addr:$dst, (COPY_TO_REGCLASS (v2f64 VR128:$src), FR64))>;
+
+ // Shuffle with VMOVSS
+ def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)),
+ (VMOVSSrr (v4i32 VR128:$src1),
+ (COPY_TO_REGCLASS (v4i32 VR128:$src2), FR32))>;
+ def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
+ (VMOVSSrr (v4f32 VR128:$src1),
+ (COPY_TO_REGCLASS (v4f32 VR128:$src2), FR32))>;
+
+ // 256-bit variants
+ def : Pat<(v8i32 (X86Movss VR256:$src1, VR256:$src2)),
+ (SUBREG_TO_REG (i32 0),
+ (VMOVSSrr (EXTRACT_SUBREG (v8i32 VR256:$src1), sub_xmm),
+ (EXTRACT_SUBREG (v8i32 VR256:$src2), sub_xmm)),
+ sub_xmm)>;
+ def : Pat<(v8f32 (X86Movss VR256:$src1, VR256:$src2)),
+ (SUBREG_TO_REG (i32 0),
+ (VMOVSSrr (EXTRACT_SUBREG (v8f32 VR256:$src1), sub_xmm),
+ (EXTRACT_SUBREG (v8f32 VR256:$src2), sub_xmm)),
+ sub_xmm)>;
+
+ // Shuffle with VMOVSD
+ def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
+ (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+ def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
+ (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+ def : Pat<(v4f32 (X86Movsd VR128:$src1, VR128:$src2)),
+ (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+ def : Pat<(v4i32 (X86Movsd VR128:$src1, VR128:$src2)),
+ (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+
+ // 256-bit variants
+ def : Pat<(v4i64 (X86Movsd VR256:$src1, VR256:$src2)),
+ (SUBREG_TO_REG (i32 0),
+ (VMOVSDrr (EXTRACT_SUBREG (v4i64 VR256:$src1), sub_xmm),
+ (EXTRACT_SUBREG (v4i64 VR256:$src2), sub_xmm)),
+ sub_xmm)>;
+ def : Pat<(v4f64 (X86Movsd VR256:$src1, VR256:$src2)),
+ (SUBREG_TO_REG (i32 0),
+ (VMOVSDrr (EXTRACT_SUBREG (v4f64 VR256:$src1), sub_xmm),
+ (EXTRACT_SUBREG (v4f64 VR256:$src2), sub_xmm)),
+ sub_xmm)>;
+
+
+ // FIXME: Instead of a X86Movlps there should be a X86Movsd here, the problem
+ // is during lowering, where it's not possible to recognize the fold cause
+ // it has two uses through a bitcast. One use disappears at isel time and the
+ // fold opportunity reappears.
+ def : Pat<(v2f64 (X86Movlpd VR128:$src1, VR128:$src2)),
+ (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+ def : Pat<(v2i64 (X86Movlpd VR128:$src1, VR128:$src2)),
+ (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+ def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)),
+ (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+ def : Pat<(v4i32 (X86Movlps VR128:$src1, VR128:$src2)),
+ (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+}
+
+let Predicates = [UseSSE1] in {
+ let AddedComplexity = 15 in {
+ // Move scalar to XMM zero-extended, zeroing a VR128 then do a
+ // MOVSS to the lower bits.
+ def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))),
+ (MOVSSrr (v4f32 (V_SET0)), FR32:$src)>;
+ def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
+ (MOVSSrr (v4f32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>;
+ def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
+ (MOVSSrr (v4i32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>;
+ }
+
+ let AddedComplexity = 20 in {
+ // MOVSSrm already zeros the high parts of the register.
+ def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
+ (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
+ def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
+ (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
+ def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
+ (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
+ }
+
+ // Extract and store.
+ def : Pat<(store (f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))),
+ addr:$dst),
+ (MOVSSmr addr:$dst, (COPY_TO_REGCLASS VR128:$src, FR32))>;
+
+ // Shuffle with MOVSS
+ def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)),
+ (MOVSSrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR32))>;
+ def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
+ (MOVSSrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR32))>;
+}
+
+let Predicates = [UseSSE2] in {
+ let AddedComplexity = 15 in {
+ // Move scalar to XMM zero-extended, zeroing a VR128 then do a
+ // MOVSD to the lower bits.
+ def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))),
+ (MOVSDrr (v2f64 (V_SET0)), FR64:$src)>;
+ }
+
+ let AddedComplexity = 20 in {
+ // MOVSDrm already zeros the high parts of the register.
+ def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
+ (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
+ def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
+ (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
+ def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
+ (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
+ def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
+ (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
+ def : Pat<(v2f64 (X86vzload addr:$src)),
+ (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
+ }
+
+ // Extract and store.
+ def : Pat<(store (f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))),
+ addr:$dst),
+ (MOVSDmr addr:$dst, (COPY_TO_REGCLASS VR128:$src, FR64))>;
+
+ // Shuffle with MOVSD
+ def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
+ (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+ def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
+ (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+ def : Pat<(v4f32 (X86Movsd VR128:$src1, VR128:$src2)),
+ (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+ def : Pat<(v4i32 (X86Movsd VR128:$src1, VR128:$src2)),
+ (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+
+ // FIXME: Instead of a X86Movlps there should be a X86Movsd here, the problem
+ // is during lowering, where it's not possible to recognize the fold cause
+ // it has two uses through a bitcast. One use disappears at isel time and the
+ // fold opportunity reappears.
+ def : Pat<(v2f64 (X86Movlpd VR128:$src1, VR128:$src2)),
+ (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+ def : Pat<(v2i64 (X86Movlpd VR128:$src1, VR128:$src2)),
+ (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+ def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)),
+ (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+ def : Pat<(v4i32 (X86Movlps VR128:$src1, VR128:$src2)),
+ (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Move Aligned/Unaligned FP Instructions
+//===----------------------------------------------------------------------===//
+
+multiclass sse12_mov_packed<bits<8> opc, RegisterClass RC,
+ X86MemOperand x86memop, PatFrag ld_frag,
+ string asm, Domain d,
+ OpndItins itins,
+ bit IsReMaterializable = 1> {
+let neverHasSideEffects = 1 in
+ def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
+ !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], itins.rr, d>,
+ Sched<[WriteMove]>;
+let canFoldAsLoad = 1, isReMaterializable = IsReMaterializable in
+ def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+ !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
+ [(set RC:$dst, (ld_frag addr:$src))], itins.rm, d>,
+ Sched<[WriteLoad]>;
+}
+
+defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32,
+ "movaps", SSEPackedSingle, SSE_MOVA_ITINS>,
+ TB, VEX;
+defm VMOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64,
+ "movapd", SSEPackedDouble, SSE_MOVA_ITINS>,
+ TB, OpSize, VEX;
+defm VMOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32,
+ "movups", SSEPackedSingle, SSE_MOVU_ITINS>,
+ TB, VEX;
+defm VMOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64,
+ "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>,
+ TB, OpSize, VEX;
+
+defm VMOVAPSY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv8f32,
+ "movaps", SSEPackedSingle, SSE_MOVA_ITINS>,
+ TB, VEX, VEX_L;
+defm VMOVAPDY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv4f64,
+ "movapd", SSEPackedDouble, SSE_MOVA_ITINS>,
+ TB, OpSize, VEX, VEX_L;
+defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32,
+ "movups", SSEPackedSingle, SSE_MOVU_ITINS>,
+ TB, VEX, VEX_L;
+defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64,
+ "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>,
+ TB, OpSize, VEX, VEX_L;
+defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32,
+ "movaps", SSEPackedSingle, SSE_MOVA_ITINS>,
+ TB;
+defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64,
+ "movapd", SSEPackedDouble, SSE_MOVA_ITINS>,
+ TB, OpSize;
+defm MOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32,
+ "movups", SSEPackedSingle, SSE_MOVU_ITINS>,
+ TB;
+defm MOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64,
+ "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>,
+ TB, OpSize;
+
+let SchedRW = [WriteStore] in {
+def VMOVAPSmr : VPSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+ "movaps\t{$src, $dst|$dst, $src}",
+ [(alignedstore (v4f32 VR128:$src), addr:$dst)],
+ IIC_SSE_MOVA_P_MR>, VEX;
+def VMOVAPDmr : VPDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+ "movapd\t{$src, $dst|$dst, $src}",
+ [(alignedstore (v2f64 VR128:$src), addr:$dst)],
+ IIC_SSE_MOVA_P_MR>, VEX;
+def VMOVUPSmr : VPSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+ "movups\t{$src, $dst|$dst, $src}",
+ [(store (v4f32 VR128:$src), addr:$dst)],
+ IIC_SSE_MOVU_P_MR>, VEX;
+def VMOVUPDmr : VPDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+ "movupd\t{$src, $dst|$dst, $src}",
+ [(store (v2f64 VR128:$src), addr:$dst)],
+ IIC_SSE_MOVU_P_MR>, VEX;
+def VMOVAPSYmr : VPSI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
+ "movaps\t{$src, $dst|$dst, $src}",
+ [(alignedstore256 (v8f32 VR256:$src), addr:$dst)],
+ IIC_SSE_MOVA_P_MR>, VEX, VEX_L;
+def VMOVAPDYmr : VPDI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
+ "movapd\t{$src, $dst|$dst, $src}",
+ [(alignedstore256 (v4f64 VR256:$src), addr:$dst)],
+ IIC_SSE_MOVA_P_MR>, VEX, VEX_L;
+def VMOVUPSYmr : VPSI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
+ "movups\t{$src, $dst|$dst, $src}",
+ [(store (v8f32 VR256:$src), addr:$dst)],
+ IIC_SSE_MOVU_P_MR>, VEX, VEX_L;
+def VMOVUPDYmr : VPDI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
+ "movupd\t{$src, $dst|$dst, $src}",
+ [(store (v4f64 VR256:$src), addr:$dst)],
+ IIC_SSE_MOVU_P_MR>, VEX, VEX_L;
+} // SchedRW
+
+// For disassembler
+let isCodeGenOnly = 1, hasSideEffects = 0, SchedRW = [WriteMove] in {
+ def VMOVAPSrr_REV : VPSI<0x29, MRMDestReg, (outs VR128:$dst),
+ (ins VR128:$src),
+ "movaps\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_MOVA_P_RR>, VEX;
+ def VMOVAPDrr_REV : VPDI<0x29, MRMDestReg, (outs VR128:$dst),
+ (ins VR128:$src),
+ "movapd\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_MOVA_P_RR>, VEX;
+ def VMOVUPSrr_REV : VPSI<0x11, MRMDestReg, (outs VR128:$dst),
+ (ins VR128:$src),
+ "movups\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_MOVU_P_RR>, VEX;
+ def VMOVUPDrr_REV : VPDI<0x11, MRMDestReg, (outs VR128:$dst),
+ (ins VR128:$src),
+ "movupd\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_MOVU_P_RR>, VEX;
+ def VMOVAPSYrr_REV : VPSI<0x29, MRMDestReg, (outs VR256:$dst),
+ (ins VR256:$src),
+ "movaps\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_MOVA_P_RR>, VEX, VEX_L;
+ def VMOVAPDYrr_REV : VPDI<0x29, MRMDestReg, (outs VR256:$dst),
+ (ins VR256:$src),
+ "movapd\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_MOVA_P_RR>, VEX, VEX_L;
+ def VMOVUPSYrr_REV : VPSI<0x11, MRMDestReg, (outs VR256:$dst),
+ (ins VR256:$src),
+ "movups\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_MOVU_P_RR>, VEX, VEX_L;
+ def VMOVUPDYrr_REV : VPDI<0x11, MRMDestReg, (outs VR256:$dst),
+ (ins VR256:$src),
+ "movupd\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_MOVU_P_RR>, VEX, VEX_L;
+}
+
+let Predicates = [HasAVX] in {
+def : Pat<(v8i32 (X86vzmovl
+ (insert_subvector undef, (v4i32 VR128:$src), (iPTR 0)))),
+ (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>;
+def : Pat<(v4i64 (X86vzmovl
+ (insert_subvector undef, (v2i64 VR128:$src), (iPTR 0)))),
+ (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>;
+def : Pat<(v8f32 (X86vzmovl
+ (insert_subvector undef, (v4f32 VR128:$src), (iPTR 0)))),
+ (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>;
+def : Pat<(v4f64 (X86vzmovl
+ (insert_subvector undef, (v2f64 VR128:$src), (iPTR 0)))),
+ (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>;
+}
+
+
+def : Pat<(int_x86_avx_storeu_ps_256 addr:$dst, VR256:$src),
+ (VMOVUPSYmr addr:$dst, VR256:$src)>;
+def : Pat<(int_x86_avx_storeu_pd_256 addr:$dst, VR256:$src),
+ (VMOVUPDYmr addr:$dst, VR256:$src)>;
+
+let SchedRW = [WriteStore] in {
+def MOVAPSmr : PSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+ "movaps\t{$src, $dst|$dst, $src}",
+ [(alignedstore (v4f32 VR128:$src), addr:$dst)],
+ IIC_SSE_MOVA_P_MR>;
+def MOVAPDmr : PDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+ "movapd\t{$src, $dst|$dst, $src}",
+ [(alignedstore (v2f64 VR128:$src), addr:$dst)],
+ IIC_SSE_MOVA_P_MR>;
+def MOVUPSmr : PSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+ "movups\t{$src, $dst|$dst, $src}",
+ [(store (v4f32 VR128:$src), addr:$dst)],
+ IIC_SSE_MOVU_P_MR>;
+def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+ "movupd\t{$src, $dst|$dst, $src}",
+ [(store (v2f64 VR128:$src), addr:$dst)],
+ IIC_SSE_MOVU_P_MR>;
+} // SchedRW
+
+// For disassembler
+let isCodeGenOnly = 1, hasSideEffects = 0, SchedRW = [WriteMove] in {
+ def MOVAPSrr_REV : PSI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+ "movaps\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_MOVA_P_RR>;
+ def MOVAPDrr_REV : PDI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+ "movapd\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_MOVA_P_RR>;
+ def MOVUPSrr_REV : PSI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+ "movups\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_MOVU_P_RR>;
+ def MOVUPDrr_REV : PDI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+ "movupd\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_MOVU_P_RR>;
+}
+
+let Predicates = [HasAVX] in {
+ def : Pat<(int_x86_sse_storeu_ps addr:$dst, VR128:$src),
+ (VMOVUPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(int_x86_sse2_storeu_pd addr:$dst, VR128:$src),
+ (VMOVUPDmr addr:$dst, VR128:$src)>;
+}
+
+let Predicates = [UseSSE1] in
+ def : Pat<(int_x86_sse_storeu_ps addr:$dst, VR128:$src),
+ (MOVUPSmr addr:$dst, VR128:$src)>;
+let Predicates = [UseSSE2] in
+ def : Pat<(int_x86_sse2_storeu_pd addr:$dst, VR128:$src),
+ (MOVUPDmr addr:$dst, VR128:$src)>;
+
+// Use vmovaps/vmovups for AVX integer load/store.
+let Predicates = [HasAVX] in {
+ // 128-bit load/store
+ def : Pat<(alignedloadv2i64 addr:$src),
+ (VMOVAPSrm addr:$src)>;
+ def : Pat<(loadv2i64 addr:$src),
+ (VMOVUPSrm addr:$src)>;
+
+ def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
+ (VMOVAPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
+ (VMOVAPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
+ (VMOVAPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
+ (VMOVAPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(store (v2i64 VR128:$src), addr:$dst),
+ (VMOVUPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(store (v4i32 VR128:$src), addr:$dst),
+ (VMOVUPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(store (v8i16 VR128:$src), addr:$dst),
+ (VMOVUPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(store (v16i8 VR128:$src), addr:$dst),
+ (VMOVUPSmr addr:$dst, VR128:$src)>;
+
+ // 256-bit load/store
+ def : Pat<(alignedloadv4i64 addr:$src),
+ (VMOVAPSYrm addr:$src)>;
+ def : Pat<(loadv4i64 addr:$src),
+ (VMOVUPSYrm addr:$src)>;
+ def : Pat<(alignedstore256 (v4i64 VR256:$src), addr:$dst),
+ (VMOVAPSYmr addr:$dst, VR256:$src)>;
+ def : Pat<(alignedstore256 (v8i32 VR256:$src), addr:$dst),
+ (VMOVAPSYmr addr:$dst, VR256:$src)>;
+ def : Pat<(alignedstore256 (v16i16 VR256:$src), addr:$dst),
+ (VMOVAPSYmr addr:$dst, VR256:$src)>;
+ def : Pat<(alignedstore256 (v32i8 VR256:$src), addr:$dst),
+ (VMOVAPSYmr addr:$dst, VR256:$src)>;
+ def : Pat<(store (v4i64 VR256:$src), addr:$dst),
+ (VMOVUPSYmr addr:$dst, VR256:$src)>;
+ def : Pat<(store (v8i32 VR256:$src), addr:$dst),
+ (VMOVUPSYmr addr:$dst, VR256:$src)>;
+ def : Pat<(store (v16i16 VR256:$src), addr:$dst),
+ (VMOVUPSYmr addr:$dst, VR256:$src)>;
+ def : Pat<(store (v32i8 VR256:$src), addr:$dst),
+ (VMOVUPSYmr addr:$dst, VR256:$src)>;
+
+ // Special patterns for storing subvector extracts of lower 128-bits
+ // Its cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr
+ def : Pat<(alignedstore (v2f64 (extract_subvector
+ (v4f64 VR256:$src), (iPTR 0))), addr:$dst),
+ (VMOVAPDmr addr:$dst, (v2f64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+ def : Pat<(alignedstore (v4f32 (extract_subvector
+ (v8f32 VR256:$src), (iPTR 0))), addr:$dst),
+ (VMOVAPSmr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+ def : Pat<(alignedstore (v2i64 (extract_subvector
+ (v4i64 VR256:$src), (iPTR 0))), addr:$dst),
+ (VMOVAPDmr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+ def : Pat<(alignedstore (v4i32 (extract_subvector
+ (v8i32 VR256:$src), (iPTR 0))), addr:$dst),
+ (VMOVAPSmr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+ def : Pat<(alignedstore (v8i16 (extract_subvector
+ (v16i16 VR256:$src), (iPTR 0))), addr:$dst),
+ (VMOVAPSmr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+ def : Pat<(alignedstore (v16i8 (extract_subvector
+ (v32i8 VR256:$src), (iPTR 0))), addr:$dst),
+ (VMOVAPSmr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+
+ def : Pat<(store (v2f64 (extract_subvector
+ (v4f64 VR256:$src), (iPTR 0))), addr:$dst),
+ (VMOVUPDmr addr:$dst, (v2f64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+ def : Pat<(store (v4f32 (extract_subvector
+ (v8f32 VR256:$src), (iPTR 0))), addr:$dst),
+ (VMOVUPSmr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+ def : Pat<(store (v2i64 (extract_subvector
+ (v4i64 VR256:$src), (iPTR 0))), addr:$dst),
+ (VMOVUPDmr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+ def : Pat<(store (v4i32 (extract_subvector
+ (v8i32 VR256:$src), (iPTR 0))), addr:$dst),
+ (VMOVUPSmr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+ def : Pat<(store (v8i16 (extract_subvector
+ (v16i16 VR256:$src), (iPTR 0))), addr:$dst),
+ (VMOVUPSmr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+ def : Pat<(store (v16i8 (extract_subvector
+ (v32i8 VR256:$src), (iPTR 0))), addr:$dst),
+ (VMOVUPSmr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+}
+
+// Use movaps / movups for SSE integer load / store (one byte shorter).
+// The instructions selected below are then converted to MOVDQA/MOVDQU
+// during the SSE domain pass.
+let Predicates = [UseSSE1] in {
+ def : Pat<(alignedloadv2i64 addr:$src),
+ (MOVAPSrm addr:$src)>;
+ def : Pat<(loadv2i64 addr:$src),
+ (MOVUPSrm addr:$src)>;
+
+ def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
+ (MOVAPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
+ (MOVAPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
+ (MOVAPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
+ (MOVAPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(store (v2i64 VR128:$src), addr:$dst),
+ (MOVUPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(store (v4i32 VR128:$src), addr:$dst),
+ (MOVUPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(store (v8i16 VR128:$src), addr:$dst),
+ (MOVUPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(store (v16i8 VR128:$src), addr:$dst),
+ (MOVUPSmr addr:$dst, VR128:$src)>;
+}
+
+// Alias instruction to do FR32 or FR64 reg-to-reg copy using movaps. Upper
+// bits are disregarded. FIXME: Set encoding to pseudo!
+let neverHasSideEffects = 1, SchedRW = [WriteMove] in {
+def FsVMOVAPSrr : VPSI<0x28, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src),
+ "movaps\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_MOVA_P_RR>, VEX;
+def FsVMOVAPDrr : VPDI<0x28, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src),
+ "movapd\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_MOVA_P_RR>, VEX;
+def FsMOVAPSrr : PSI<0x28, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src),
+ "movaps\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_MOVA_P_RR>;
+def FsMOVAPDrr : PDI<0x28, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src),
+ "movapd\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_MOVA_P_RR>;
+}
+
+// Alias instruction to load FR32 or FR64 from f128mem using movaps. Upper
+// bits are disregarded. FIXME: Set encoding to pseudo!
+let canFoldAsLoad = 1, isReMaterializable = 1, SchedRW = [WriteLoad] in {
+let isCodeGenOnly = 1 in {
+ def FsVMOVAPSrm : VPSI<0x28, MRMSrcMem, (outs FR32:$dst), (ins f128mem:$src),
+ "movaps\t{$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (alignedloadfsf32 addr:$src))],
+ IIC_SSE_MOVA_P_RM>, VEX;
+ def FsVMOVAPDrm : VPDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src),
+ "movapd\t{$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (alignedloadfsf64 addr:$src))],
+ IIC_SSE_MOVA_P_RM>, VEX;
+}
+def FsMOVAPSrm : PSI<0x28, MRMSrcMem, (outs FR32:$dst), (ins f128mem:$src),
+ "movaps\t{$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (alignedloadfsf32 addr:$src))],
+ IIC_SSE_MOVA_P_RM>;
+def FsMOVAPDrm : PDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src),
+ "movapd\t{$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (alignedloadfsf64 addr:$src))],
+ IIC_SSE_MOVA_P_RM>;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Move Low packed FP Instructions
+//===----------------------------------------------------------------------===//
+
+multiclass sse12_mov_hilo_packed_base<bits<8>opc, SDNode psnode, SDNode pdnode,
+ string base_opc, string asm_opr,
+ InstrItinClass itin> {
+ def PSrm : PI<opc, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
+ !strconcat(base_opc, "s", asm_opr),
+ [(set VR128:$dst,
+ (psnode VR128:$src1,
+ (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))))],
+ itin, SSEPackedSingle>, TB,
+ Sched<[WriteShuffleLd, ReadAfterLd]>;
+
+ def PDrm : PI<opc, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
+ !strconcat(base_opc, "d", asm_opr),
+ [(set VR128:$dst, (v2f64 (pdnode VR128:$src1,
+ (scalar_to_vector (loadf64 addr:$src2)))))],
+ itin, SSEPackedDouble>, TB, OpSize,
+ Sched<[WriteShuffleLd, ReadAfterLd]>;
+
+}
+
+multiclass sse12_mov_hilo_packed<bits<8>opc, SDNode psnode, SDNode pdnode,
+ string base_opc, InstrItinClass itin> {
+ defm V#NAME : sse12_mov_hilo_packed_base<opc, psnode, pdnode, base_opc,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ itin>, VEX_4V;
+
+let Constraints = "$src1 = $dst" in
+ defm NAME : sse12_mov_hilo_packed_base<opc, psnode, pdnode, base_opc,
+ "\t{$src2, $dst|$dst, $src2}",
+ itin>;
+}
+
+let AddedComplexity = 20 in {
+ defm MOVL : sse12_mov_hilo_packed<0x12, X86Movlps, X86Movlpd, "movlp",
+ IIC_SSE_MOV_LH>;
+}
+
+let SchedRW = [WriteStore] in {
+def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+ "movlps\t{$src, $dst|$dst, $src}",
+ [(store (f64 (vector_extract (bc_v2f64 (v4f32 VR128:$src)),
+ (iPTR 0))), addr:$dst)],
+ IIC_SSE_MOV_LH>, VEX;
+def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+ "movlpd\t{$src, $dst|$dst, $src}",
+ [(store (f64 (vector_extract (v2f64 VR128:$src),
+ (iPTR 0))), addr:$dst)],
+ IIC_SSE_MOV_LH>, VEX;
+def MOVLPSmr : PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+ "movlps\t{$src, $dst|$dst, $src}",
+ [(store (f64 (vector_extract (bc_v2f64 (v4f32 VR128:$src)),
+ (iPTR 0))), addr:$dst)],
+ IIC_SSE_MOV_LH>;
+def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+ "movlpd\t{$src, $dst|$dst, $src}",
+ [(store (f64 (vector_extract (v2f64 VR128:$src),
+ (iPTR 0))), addr:$dst)],
+ IIC_SSE_MOV_LH>;
+} // SchedRW
+
+let Predicates = [HasAVX] in {
+ // Shuffle with VMOVLPS
+ def : Pat<(v4f32 (X86Movlps VR128:$src1, (load addr:$src2))),
+ (VMOVLPSrm VR128:$src1, addr:$src2)>;
+ def : Pat<(v4i32 (X86Movlps VR128:$src1, (load addr:$src2))),
+ (VMOVLPSrm VR128:$src1, addr:$src2)>;
+
+ // Shuffle with VMOVLPD
+ def : Pat<(v2f64 (X86Movlpd VR128:$src1, (load addr:$src2))),
+ (VMOVLPDrm VR128:$src1, addr:$src2)>;
+ def : Pat<(v2i64 (X86Movlpd VR128:$src1, (load addr:$src2))),
+ (VMOVLPDrm VR128:$src1, addr:$src2)>;
+
+ // Store patterns
+ def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)),
+ addr:$src1),
+ (VMOVLPSmr addr:$src1, VR128:$src2)>;
+ def : Pat<(store (v4i32 (X86Movlps
+ (bc_v4i32 (loadv2i64 addr:$src1)), VR128:$src2)), addr:$src1),
+ (VMOVLPSmr addr:$src1, VR128:$src2)>;
+ def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128:$src2)),
+ addr:$src1),
+ (VMOVLPDmr addr:$src1, VR128:$src2)>;
+ def : Pat<(store (v2i64 (X86Movlpd (load addr:$src1), VR128:$src2)),
+ addr:$src1),
+ (VMOVLPDmr addr:$src1, VR128:$src2)>;
+}
+
+let Predicates = [UseSSE1] in {
+ // (store (vector_shuffle (load addr), v2, <4, 5, 2, 3>), addr) using MOVLPS
+ def : Pat<(store (i64 (vector_extract (bc_v2i64 (v4f32 VR128:$src2)),
+ (iPTR 0))), addr:$src1),
+ (MOVLPSmr addr:$src1, VR128:$src2)>;
+
+ // Shuffle with MOVLPS
+ def : Pat<(v4f32 (X86Movlps VR128:$src1, (load addr:$src2))),
+ (MOVLPSrm VR128:$src1, addr:$src2)>;
+ def : Pat<(v4i32 (X86Movlps VR128:$src1, (load addr:$src2))),
+ (MOVLPSrm VR128:$src1, addr:$src2)>;
+ def : Pat<(X86Movlps VR128:$src1,
+ (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))),
+ (MOVLPSrm VR128:$src1, addr:$src2)>;
+
+ // Store patterns
+ def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)),
+ addr:$src1),
+ (MOVLPSmr addr:$src1, VR128:$src2)>;
+ def : Pat<(store (v4i32 (X86Movlps
+ (bc_v4i32 (loadv2i64 addr:$src1)), VR128:$src2)),
+ addr:$src1),
+ (MOVLPSmr addr:$src1, VR128:$src2)>;
+}
+
+let Predicates = [UseSSE2] in {
+ // Shuffle with MOVLPD
+ def : Pat<(v2f64 (X86Movlpd VR128:$src1, (load addr:$src2))),
+ (MOVLPDrm VR128:$src1, addr:$src2)>;
+ def : Pat<(v2i64 (X86Movlpd VR128:$src1, (load addr:$src2))),
+ (MOVLPDrm VR128:$src1, addr:$src2)>;
+
+ // Store patterns
+ def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128:$src2)),
+ addr:$src1),
+ (MOVLPDmr addr:$src1, VR128:$src2)>;
+ def : Pat<(store (v2i64 (X86Movlpd (load addr:$src1), VR128:$src2)),
+ addr:$src1),
+ (MOVLPDmr addr:$src1, VR128:$src2)>;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Move Hi packed FP Instructions
+//===----------------------------------------------------------------------===//
+
+let AddedComplexity = 20 in {
+ defm MOVH : sse12_mov_hilo_packed<0x16, X86Movlhps, X86Movlhpd, "movhp",
+ IIC_SSE_MOV_LH>;
+}
+
+let SchedRW = [WriteStore] in {
+// v2f64 extract element 1 is always custom lowered to unpack high to low
+// and extract element 0 so the non-store version isn't too horrible.
+def VMOVHPSmr : VPSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+ "movhps\t{$src, $dst|$dst, $src}",
+ [(store (f64 (vector_extract
+ (X86Unpckh (bc_v2f64 (v4f32 VR128:$src)),
+ (bc_v2f64 (v4f32 VR128:$src))),
+ (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX;
+def VMOVHPDmr : VPDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+ "movhpd\t{$src, $dst|$dst, $src}",
+ [(store (f64 (vector_extract
+ (v2f64 (X86Unpckh VR128:$src, VR128:$src)),
+ (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX;
+def MOVHPSmr : PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+ "movhps\t{$src, $dst|$dst, $src}",
+ [(store (f64 (vector_extract
+ (X86Unpckh (bc_v2f64 (v4f32 VR128:$src)),
+ (bc_v2f64 (v4f32 VR128:$src))),
+ (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>;
+def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+ "movhpd\t{$src, $dst|$dst, $src}",
+ [(store (f64 (vector_extract
+ (v2f64 (X86Unpckh VR128:$src, VR128:$src)),
+ (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>;
+} // SchedRW
+
+let Predicates = [HasAVX] in {
+ // VMOVHPS patterns
+ def : Pat<(X86Movlhps VR128:$src1,
+ (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))),
+ (VMOVHPSrm VR128:$src1, addr:$src2)>;
+ def : Pat<(X86Movlhps VR128:$src1,
+ (bc_v4i32 (v2i64 (X86vzload addr:$src2)))),
+ (VMOVHPSrm VR128:$src1, addr:$src2)>;
+
+ // FIXME: Instead of X86Unpckl, there should be a X86Movlhpd here, the problem
+ // is during lowering, where it's not possible to recognize the load fold
+ // cause it has two uses through a bitcast. One use disappears at isel time
+ // and the fold opportunity reappears.
+ def : Pat<(v2f64 (X86Unpckl VR128:$src1,
+ (scalar_to_vector (loadf64 addr:$src2)))),
+ (VMOVHPDrm VR128:$src1, addr:$src2)>;
+}
+
+let Predicates = [UseSSE1] in {
+ // MOVHPS patterns
+ def : Pat<(X86Movlhps VR128:$src1,
+ (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))),
+ (MOVHPSrm VR128:$src1, addr:$src2)>;
+ def : Pat<(X86Movlhps VR128:$src1,
+ (bc_v4f32 (v2i64 (X86vzload addr:$src2)))),
+ (MOVHPSrm VR128:$src1, addr:$src2)>;
+}
+
+let Predicates = [UseSSE2] in {
+ // FIXME: Instead of X86Unpckl, there should be a X86Movlhpd here, the problem
+ // is during lowering, where it's not possible to recognize the load fold
+ // cause it has two uses through a bitcast. One use disappears at isel time
+ // and the fold opportunity reappears.
+ def : Pat<(v2f64 (X86Unpckl VR128:$src1,
+ (scalar_to_vector (loadf64 addr:$src2)))),
+ (MOVHPDrm VR128:$src1, addr:$src2)>;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Move Low to High and High to Low packed FP Instructions
+//===----------------------------------------------------------------------===//
+
+let AddedComplexity = 20 in {
+ def VMOVLHPSrr : VPSI<0x16, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ "movlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst,
+ (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))],
+ IIC_SSE_MOV_LH>,
+ VEX_4V, Sched<[WriteShuffle]>;
+ def VMOVHLPSrr : VPSI<0x12, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ "movhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst,
+ (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))],
+ IIC_SSE_MOV_LH>,
+ VEX_4V, Sched<[WriteShuffle]>;
+}
+let Constraints = "$src1 = $dst", AddedComplexity = 20 in {
+ def MOVLHPSrr : PSI<0x16, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ "movlhps\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))],
+ IIC_SSE_MOV_LH>, Sched<[WriteShuffle]>;
+ def MOVHLPSrr : PSI<0x12, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ "movhlps\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))],
+ IIC_SSE_MOV_LH>, Sched<[WriteShuffle]>;
+}
+
+let Predicates = [HasAVX] in {
+ // MOVLHPS patterns
+ def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)),
+ (VMOVLHPSrr VR128:$src1, VR128:$src2)>;
+ def : Pat<(v2i64 (X86Movlhps VR128:$src1, VR128:$src2)),
+ (VMOVLHPSrr (v2i64 VR128:$src1), VR128:$src2)>;
+
+ // MOVHLPS patterns
+ def : Pat<(v4i32 (X86Movhlps VR128:$src1, VR128:$src2)),
+ (VMOVHLPSrr VR128:$src1, VR128:$src2)>;
+}
+
+let Predicates = [UseSSE1] in {
+ // MOVLHPS patterns
+ def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)),
+ (MOVLHPSrr VR128:$src1, VR128:$src2)>;
+ def : Pat<(v2i64 (X86Movlhps VR128:$src1, VR128:$src2)),
+ (MOVLHPSrr (v2i64 VR128:$src1), VR128:$src2)>;
+
+ // MOVHLPS patterns
+ def : Pat<(v4i32 (X86Movhlps VR128:$src1, VR128:$src2)),
+ (MOVHLPSrr VR128:$src1, VR128:$src2)>;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Conversion Instructions
+//===----------------------------------------------------------------------===//
+
+def SSE_CVT_PD : OpndItins<
+ IIC_SSE_CVT_PD_RR, IIC_SSE_CVT_PD_RM
+>;
+
+let Sched = WriteCvtI2F in
+def SSE_CVT_PS : OpndItins<
+ IIC_SSE_CVT_PS_RR, IIC_SSE_CVT_PS_RM
+>;
+
+let Sched = WriteCvtI2F in
+def SSE_CVT_Scalar : OpndItins<
+ IIC_SSE_CVT_Scalar_RR, IIC_SSE_CVT_Scalar_RM
+>;
+
+let Sched = WriteCvtF2I in
+def SSE_CVT_SS2SI_32 : OpndItins<
+ IIC_SSE_CVT_SS2SI32_RR, IIC_SSE_CVT_SS2SI32_RM
+>;
+
+let Sched = WriteCvtF2I in
+def SSE_CVT_SS2SI_64 : OpndItins<
+ IIC_SSE_CVT_SS2SI64_RR, IIC_SSE_CVT_SS2SI64_RM
+>;
+
+let Sched = WriteCvtF2I in
+def SSE_CVT_SD2SI : OpndItins<
+ IIC_SSE_CVT_SD2SI_RR, IIC_SSE_CVT_SD2SI_RM
+>;
+
+multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
+ SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag,
+ string asm, OpndItins itins> {
+ def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm,
+ [(set DstRC:$dst, (OpNode SrcRC:$src))],
+ itins.rr>, Sched<[itins.Sched]>;
+ def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm,
+ [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))],
+ itins.rm>, Sched<[itins.Sched.Folded]>;
+}
+
+multiclass sse12_cvt_p<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
+ X86MemOperand x86memop, string asm, Domain d,
+ OpndItins itins> {
+let neverHasSideEffects = 1 in {
+ def rr : I<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm,
+ [], itins.rr, d>, Sched<[itins.Sched]>;
+ let mayLoad = 1 in
+ def rm : I<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm,
+ [], itins.rm, d>, Sched<[itins.Sched.Folded]>;
+}
+}
+
+multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
+ X86MemOperand x86memop, string asm> {
+let neverHasSideEffects = 1 in {
+ def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src),
+ !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
+ Sched<[WriteCvtI2F]>;
+ let mayLoad = 1 in
+ def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst),
+ (ins DstRC:$src1, x86memop:$src),
+ !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
+ Sched<[WriteCvtI2FLd, ReadAfterLd]>;
+} // neverHasSideEffects = 1
+}
+
+defm VCVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32,
+ "cvttss2si\t{$src, $dst|$dst, $src}",
+ SSE_CVT_SS2SI_32>,
+ XS, VEX, VEX_LIG;
+defm VCVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32,
+ "cvttss2si\t{$src, $dst|$dst, $src}",
+ SSE_CVT_SS2SI_64>,
+ XS, VEX, VEX_W, VEX_LIG;
+defm VCVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64,
+ "cvttsd2si\t{$src, $dst|$dst, $src}",
+ SSE_CVT_SD2SI>,
+ XD, VEX, VEX_LIG;
+defm VCVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64,
+ "cvttsd2si\t{$src, $dst|$dst, $src}",
+ SSE_CVT_SD2SI>,
+ XD, VEX, VEX_W, VEX_LIG;
+
+def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
+ (VCVTTSS2SIrr GR32:$dst, FR32:$src), 0>;
+def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
+ (VCVTTSS2SIrm GR32:$dst, f32mem:$src), 0>;
+def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
+ (VCVTTSD2SIrr GR32:$dst, FR64:$src), 0>;
+def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
+ (VCVTTSD2SIrm GR32:$dst, f64mem:$src), 0>;
+def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
+ (VCVTTSS2SI64rr GR64:$dst, FR32:$src), 0>;
+def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
+ (VCVTTSS2SI64rm GR64:$dst, f32mem:$src), 0>;
+def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
+ (VCVTTSD2SI64rr GR64:$dst, FR64:$src), 0>;
+def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
+ (VCVTTSD2SI64rm GR64:$dst, f64mem:$src), 0>;
+
+// The assembler can recognize rr 64-bit instructions by seeing a rxx
+// register, but the same isn't true when only using memory operands,
+// provide other assembly "l" and "q" forms to address this explicitly
+// where appropriate to do so.
+defm VCVTSI2SS : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss{l}">,
+ XS, VEX_4V, VEX_LIG;
+defm VCVTSI2SS64 : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ss{q}">,
+ XS, VEX_4V, VEX_W, VEX_LIG;
+defm VCVTSI2SD : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd{l}">,
+ XD, VEX_4V, VEX_LIG;
+defm VCVTSI2SD64 : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd{q}">,
+ XD, VEX_4V, VEX_W, VEX_LIG;
+
+def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
+ (VCVTSI2SSrm FR64:$dst, FR64:$src1, i32mem:$src)>;
+def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}",
+ (VCVTSI2SDrm FR64:$dst, FR64:$src1, i32mem:$src)>;
+
+let Predicates = [HasAVX] in {
+ def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))),
+ (VCVTSI2SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
+ def : Pat<(f32 (sint_to_fp (loadi64 addr:$src))),
+ (VCVTSI2SS64rm (f32 (IMPLICIT_DEF)), addr:$src)>;
+ def : Pat<(f64 (sint_to_fp (loadi32 addr:$src))),
+ (VCVTSI2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>;
+ def : Pat<(f64 (sint_to_fp (loadi64 addr:$src))),
+ (VCVTSI2SD64rm (f64 (IMPLICIT_DEF)), addr:$src)>;
+
+ def : Pat<(f32 (sint_to_fp GR32:$src)),
+ (VCVTSI2SSrr (f32 (IMPLICIT_DEF)), GR32:$src)>;
+ def : Pat<(f32 (sint_to_fp GR64:$src)),
+ (VCVTSI2SS64rr (f32 (IMPLICIT_DEF)), GR64:$src)>;
+ def : Pat<(f64 (sint_to_fp GR32:$src)),
+ (VCVTSI2SDrr (f64 (IMPLICIT_DEF)), GR32:$src)>;
+ def : Pat<(f64 (sint_to_fp GR64:$src)),
+ (VCVTSI2SD64rr (f64 (IMPLICIT_DEF)), GR64:$src)>;
+}
+
+defm CVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32,
+ "cvttss2si\t{$src, $dst|$dst, $src}",
+ SSE_CVT_SS2SI_32>, XS;
+defm CVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32,
+ "cvttss2si\t{$src, $dst|$dst, $src}",
+ SSE_CVT_SS2SI_64>, XS, REX_W;
+defm CVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64,
+ "cvttsd2si\t{$src, $dst|$dst, $src}",
+ SSE_CVT_SD2SI>, XD;
+defm CVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64,
+ "cvttsd2si\t{$src, $dst|$dst, $src}",
+ SSE_CVT_SD2SI>, XD, REX_W;
+defm CVTSI2SS : sse12_cvt_s<0x2A, GR32, FR32, sint_to_fp, i32mem, loadi32,
+ "cvtsi2ss{l}\t{$src, $dst|$dst, $src}",
+ SSE_CVT_Scalar>, XS;
+defm CVTSI2SS64 : sse12_cvt_s<0x2A, GR64, FR32, sint_to_fp, i64mem, loadi64,
+ "cvtsi2ss{q}\t{$src, $dst|$dst, $src}",
+ SSE_CVT_Scalar>, XS, REX_W;
+defm CVTSI2SD : sse12_cvt_s<0x2A, GR32, FR64, sint_to_fp, i32mem, loadi32,
+ "cvtsi2sd{l}\t{$src, $dst|$dst, $src}",
+ SSE_CVT_Scalar>, XD;
+defm CVTSI2SD64 : sse12_cvt_s<0x2A, GR64, FR64, sint_to_fp, i64mem, loadi64,
+ "cvtsi2sd{q}\t{$src, $dst|$dst, $src}",
+ SSE_CVT_Scalar>, XD, REX_W;
+
+def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
+ (CVTTSS2SIrr GR32:$dst, FR32:$src), 0>;
+def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
+ (CVTTSS2SIrm GR32:$dst, f32mem:$src), 0>;
+def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
+ (CVTTSD2SIrr GR32:$dst, FR64:$src), 0>;
+def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
+ (CVTTSD2SIrm GR32:$dst, f64mem:$src), 0>;
+def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
+ (CVTTSS2SI64rr GR64:$dst, FR32:$src), 0>;
+def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
+ (CVTTSS2SI64rm GR64:$dst, f32mem:$src), 0>;
+def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
+ (CVTTSD2SI64rr GR64:$dst, FR64:$src), 0>;
+def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
+ (CVTTSD2SI64rm GR64:$dst, f64mem:$src), 0>;
+
+def : InstAlias<"cvtsi2ss\t{$src, $dst|$dst, $src}",
+ (CVTSI2SSrm FR64:$dst, i32mem:$src)>;
+def : InstAlias<"cvtsi2sd\t{$src, $dst|$dst, $src}",
+ (CVTSI2SDrm FR64:$dst, i32mem:$src)>;
+
+// Conversion Instructions Intrinsics - Match intrinsics which expect MM
+// and/or XMM operand(s).
+
+multiclass sse12_cvt_sint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
+ Intrinsic Int, Operand memop, ComplexPattern mem_cpat,
+ string asm, OpndItins itins> {
+ def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
+ !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
+ [(set DstRC:$dst, (Int SrcRC:$src))], itins.rr>,
+ Sched<[itins.Sched]>;
+ def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src),
+ !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
+ [(set DstRC:$dst, (Int mem_cpat:$src))], itins.rm>,
+ Sched<[itins.Sched.Folded]>;
+}
+
+multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC,
+ RegisterClass DstRC, Intrinsic Int, X86MemOperand x86memop,
+ PatFrag ld_frag, string asm, OpndItins itins,
+ bit Is2Addr = 1> {
+ def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src2),
+ !if(Is2Addr,
+ !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set DstRC:$dst, (Int DstRC:$src1, SrcRC:$src2))],
+ itins.rr>, Sched<[itins.Sched]>;
+ def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst),
+ (ins DstRC:$src1, x86memop:$src2),
+ !if(Is2Addr,
+ !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set DstRC:$dst, (Int DstRC:$src1, (ld_frag addr:$src2)))],
+ itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+
+defm VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32,
+ int_x86_sse2_cvtsd2si, sdmem, sse_load_f64, "cvtsd2si",
+ SSE_CVT_SD2SI>, XD, VEX, VEX_LIG;
+defm VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64,
+ int_x86_sse2_cvtsd2si64, sdmem, sse_load_f64, "cvtsd2si",
+ SSE_CVT_SD2SI>, XD, VEX, VEX_W, VEX_LIG;
+
+defm CVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse2_cvtsd2si,
+ sdmem, sse_load_f64, "cvtsd2si", SSE_CVT_SD2SI>, XD;
+defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse2_cvtsd2si64,
+ sdmem, sse_load_f64, "cvtsd2si", SSE_CVT_SD2SI>, XD, REX_W;
+
+
+defm Int_VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
+ int_x86_sse_cvtsi2ss, i32mem, loadi32, "cvtsi2ss{l}",
+ SSE_CVT_Scalar, 0>, XS, VEX_4V;
+defm Int_VCVTSI2SS64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
+ int_x86_sse_cvtsi642ss, i64mem, loadi64, "cvtsi2ss{q}",
+ SSE_CVT_Scalar, 0>, XS, VEX_4V,
+ VEX_W;
+defm Int_VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
+ int_x86_sse2_cvtsi2sd, i32mem, loadi32, "cvtsi2sd{l}",
+ SSE_CVT_Scalar, 0>, XD, VEX_4V;
+defm Int_VCVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
+ int_x86_sse2_cvtsi642sd, i64mem, loadi64, "cvtsi2sd{q}",
+ SSE_CVT_Scalar, 0>, XD,
+ VEX_4V, VEX_W;
+
+let Constraints = "$src1 = $dst" in {
+ defm Int_CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
+ int_x86_sse_cvtsi2ss, i32mem, loadi32,
+ "cvtsi2ss{l}", SSE_CVT_Scalar>, XS;
+ defm Int_CVTSI2SS64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
+ int_x86_sse_cvtsi642ss, i64mem, loadi64,
+ "cvtsi2ss{q}", SSE_CVT_Scalar>, XS, REX_W;
+ defm Int_CVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
+ int_x86_sse2_cvtsi2sd, i32mem, loadi32,
+ "cvtsi2sd{l}", SSE_CVT_Scalar>, XD;
+ defm Int_CVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
+ int_x86_sse2_cvtsi642sd, i64mem, loadi64,
+ "cvtsi2sd{q}", SSE_CVT_Scalar>, XD, REX_W;
+}
+
+/// SSE 1 Only
+
+// Aliases for intrinsics
+defm Int_VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si,
+ ssmem, sse_load_f32, "cvttss2si",
+ SSE_CVT_SS2SI_32>, XS, VEX;
+defm Int_VCVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
+ int_x86_sse_cvttss2si64, ssmem, sse_load_f32,
+ "cvttss2si", SSE_CVT_SS2SI_64>,
+ XS, VEX, VEX_W;
+defm Int_VCVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si,
+ sdmem, sse_load_f64, "cvttsd2si",
+ SSE_CVT_SD2SI>, XD, VEX;
+defm Int_VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
+ int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64,
+ "cvttsd2si", SSE_CVT_SD2SI>,
+ XD, VEX, VEX_W;
+defm Int_CVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si,
+ ssmem, sse_load_f32, "cvttss2si",
+ SSE_CVT_SS2SI_32>, XS;
+defm Int_CVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
+ int_x86_sse_cvttss2si64, ssmem, sse_load_f32,
+ "cvttss2si", SSE_CVT_SS2SI_64>, XS, REX_W;
+defm Int_CVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si,
+ sdmem, sse_load_f64, "cvttsd2si",
+ SSE_CVT_SD2SI>, XD;
+defm Int_CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
+ int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64,
+ "cvttsd2si", SSE_CVT_SD2SI>, XD, REX_W;
+
+defm VCVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si,
+ ssmem, sse_load_f32, "cvtss2si",
+ SSE_CVT_SS2SI_32>, XS, VEX, VEX_LIG;
+defm VCVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64,
+ ssmem, sse_load_f32, "cvtss2si",
+ SSE_CVT_SS2SI_64>, XS, VEX, VEX_W, VEX_LIG;
+
+defm CVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si,
+ ssmem, sse_load_f32, "cvtss2si",
+ SSE_CVT_SS2SI_32>, XS;
+defm CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64,
+ ssmem, sse_load_f32, "cvtss2si",
+ SSE_CVT_SS2SI_64>, XS, REX_W;
+
+defm VCVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, i128mem,
+ "vcvtdq2ps\t{$src, $dst|$dst, $src}",
+ SSEPackedSingle, SSE_CVT_PS>,
+ TB, VEX, Requires<[HasAVX]>;
+defm VCVTDQ2PSY : sse12_cvt_p<0x5B, VR256, VR256, i256mem,
+ "vcvtdq2ps\t{$src, $dst|$dst, $src}",
+ SSEPackedSingle, SSE_CVT_PS>,
+ TB, VEX, VEX_L, Requires<[HasAVX]>;
+
+defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, i128mem,
+ "cvtdq2ps\t{$src, $dst|$dst, $src}",
+ SSEPackedSingle, SSE_CVT_PS>,
+ TB, Requires<[UseSSE2]>;
+
+def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
+ (VCVTSS2SIrr GR32:$dst, VR128:$src), 0>;
+def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
+ (VCVTSS2SIrm GR32:$dst, ssmem:$src), 0>;
+def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
+ (VCVTSD2SIrr GR32:$dst, VR128:$src), 0>;
+def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
+ (VCVTSD2SIrm GR32:$dst, sdmem:$src), 0>;
+def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
+ (VCVTSS2SI64rr GR64:$dst, VR128:$src), 0>;
+def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
+ (VCVTSS2SI64rm GR64:$dst, ssmem:$src), 0>;
+def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
+ (VCVTSD2SI64rr GR64:$dst, VR128:$src), 0>;
+def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
+ (VCVTSD2SI64rm GR64:$dst, sdmem:$src), 0>;
+
+def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
+ (CVTSS2SIrr GR32:$dst, VR128:$src), 0>;
+def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
+ (CVTSS2SIrm GR32:$dst, ssmem:$src), 0>;
+def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
+ (CVTSD2SIrr GR32:$dst, VR128:$src), 0>;
+def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
+ (CVTSD2SIrm GR32:$dst, sdmem:$src), 0>;
+def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
+ (CVTSS2SI64rr GR64:$dst, VR128:$src), 0>;
+def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
+ (CVTSS2SI64rm GR64:$dst, ssmem:$src), 0>;
+def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
+ (CVTSD2SI64rr GR64:$dst, VR128:$src), 0>;
+def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
+ (CVTSD2SI64rm GR64:$dst, sdmem:$src)>;
+
+/// SSE 2 Only
+
+// Convert scalar double to scalar single
+let neverHasSideEffects = 1 in {
+def VCVTSD2SSrr : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst),
+ (ins FR64:$src1, FR64:$src2),
+ "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [],
+ IIC_SSE_CVT_Scalar_RR>, VEX_4V, VEX_LIG,
+ Sched<[WriteCvtF2F]>;
+let mayLoad = 1 in
+def VCVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst),
+ (ins FR64:$src1, f64mem:$src2),
+ "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [], IIC_SSE_CVT_Scalar_RM>,
+ XD, Requires<[HasAVX, OptForSize]>, VEX_4V, VEX_LIG,
+ Sched<[WriteCvtF2FLd, ReadAfterLd]>;
+}
+
+def : Pat<(f32 (fround FR64:$src)), (VCVTSD2SSrr FR64:$src, FR64:$src)>,
+ Requires<[HasAVX]>;
+
+def CVTSD2SSrr : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
+ "cvtsd2ss\t{$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (fround FR64:$src))],
+ IIC_SSE_CVT_Scalar_RR>, Sched<[WriteCvtF2F]>;
+def CVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src),
+ "cvtsd2ss\t{$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (fround (loadf64 addr:$src)))],
+ IIC_SSE_CVT_Scalar_RM>,
+ XD,
+ Requires<[UseSSE2, OptForSize]>, Sched<[WriteCvtF2FLd]>;
+
+def Int_VCVTSD2SSrr: I<0x5A, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst,
+ (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))],
+ IIC_SSE_CVT_Scalar_RR>, XD, VEX_4V, Requires<[HasAVX]>,
+ Sched<[WriteCvtF2F]>;
+def Int_VCVTSD2SSrm: I<0x5A, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
+ "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst, (int_x86_sse2_cvtsd2ss
+ VR128:$src1, sse_load_f64:$src2))],
+ IIC_SSE_CVT_Scalar_RM>, XD, VEX_4V, Requires<[HasAVX]>,
+ Sched<[WriteCvtF2FLd, ReadAfterLd]>;
+
+let Constraints = "$src1 = $dst" in {
+def Int_CVTSD2SSrr: I<0x5A, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst,
+ (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))],
+ IIC_SSE_CVT_Scalar_RR>, XD, Requires<[UseSSE2]>,
+ Sched<[WriteCvtF2F]>;
+def Int_CVTSD2SSrm: I<0x5A, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
+ "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst, (int_x86_sse2_cvtsd2ss
+ VR128:$src1, sse_load_f64:$src2))],
+ IIC_SSE_CVT_Scalar_RM>, XD, Requires<[UseSSE2]>,
+ Sched<[WriteCvtF2FLd, ReadAfterLd]>;
+}
+
+// Convert scalar single to scalar double
+// SSE2 instructions with XS prefix
+let neverHasSideEffects = 1 in {
+def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst),
+ (ins FR32:$src1, FR32:$src2),
+ "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [], IIC_SSE_CVT_Scalar_RR>,
+ XS, Requires<[HasAVX]>, VEX_4V, VEX_LIG,
+ Sched<[WriteCvtF2F]>;
+let mayLoad = 1 in
+def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst),
+ (ins FR32:$src1, f32mem:$src2),
+ "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [], IIC_SSE_CVT_Scalar_RM>,
+ XS, VEX_4V, VEX_LIG, Requires<[HasAVX, OptForSize]>,
+ Sched<[WriteCvtF2FLd, ReadAfterLd]>;
+}
+
+def : Pat<(f64 (fextend FR32:$src)),
+ (VCVTSS2SDrr FR32:$src, FR32:$src)>, Requires<[HasAVX]>;
+def : Pat<(fextend (loadf32 addr:$src)),
+ (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>, Requires<[HasAVX]>;
+
+def : Pat<(extloadf32 addr:$src),
+ (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>,
+ Requires<[HasAVX, OptForSize]>;
+def : Pat<(extloadf32 addr:$src),
+ (VCVTSS2SDrr (f32 (IMPLICIT_DEF)), (VMOVSSrm addr:$src))>,
+ Requires<[HasAVX, OptForSpeed]>;
+
+def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
+ "cvtss2sd\t{$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (fextend FR32:$src))],
+ IIC_SSE_CVT_Scalar_RR>, XS,
+ Requires<[UseSSE2]>, Sched<[WriteCvtF2F]>;
+def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src),
+ "cvtss2sd\t{$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (extloadf32 addr:$src))],
+ IIC_SSE_CVT_Scalar_RM>, XS,
+ Requires<[UseSSE2, OptForSize]>, Sched<[WriteCvtF2FLd]>;
+
+// extload f32 -> f64. This matches load+fextend because we have a hack in
+// the isel (PreprocessForFPConvert) that can introduce loads after dag
+// combine.
+// Since these loads aren't folded into the fextend, we have to match it
+// explicitly here.
+def : Pat<(fextend (loadf32 addr:$src)),
+ (CVTSS2SDrm addr:$src)>, Requires<[UseSSE2]>;
+def : Pat<(extloadf32 addr:$src),
+ (CVTSS2SDrr (MOVSSrm addr:$src))>, Requires<[UseSSE2, OptForSpeed]>;
+
+def Int_VCVTSS2SDrr: I<0x5A, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst,
+ (int_x86_sse2_cvtss2sd VR128:$src1, VR128:$src2))],
+ IIC_SSE_CVT_Scalar_RR>, XS, VEX_4V, Requires<[HasAVX]>,
+ Sched<[WriteCvtF2F]>;
+def Int_VCVTSS2SDrm: I<0x5A, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
+ "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst,
+ (int_x86_sse2_cvtss2sd VR128:$src1, sse_load_f32:$src2))],
+ IIC_SSE_CVT_Scalar_RM>, XS, VEX_4V, Requires<[HasAVX]>,
+ Sched<[WriteCvtF2FLd, ReadAfterLd]>;
+let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix
+def Int_CVTSS2SDrr: I<0x5A, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "cvtss2sd\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (int_x86_sse2_cvtss2sd VR128:$src1, VR128:$src2))],
+ IIC_SSE_CVT_Scalar_RR>, XS, Requires<[UseSSE2]>,
+ Sched<[WriteCvtF2F]>;
+def Int_CVTSS2SDrm: I<0x5A, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
+ "cvtss2sd\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (int_x86_sse2_cvtss2sd VR128:$src1, sse_load_f32:$src2))],
+ IIC_SSE_CVT_Scalar_RM>, XS, Requires<[UseSSE2]>,
+ Sched<[WriteCvtF2FLd, ReadAfterLd]>;
+}
+
+// Convert packed single/double fp to doubleword
+def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvtps2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))],
+ IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>;
+def VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "cvtps2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (int_x86_sse2_cvtps2dq (memopv4f32 addr:$src)))],
+ IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>;
+def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
+ "cvtps2dq\t{$src, $dst|$dst, $src}",
+ [(set VR256:$dst,
+ (int_x86_avx_cvt_ps2dq_256 VR256:$src))],
+ IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
+def VCVTPS2DQYrm : VPDI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
+ "cvtps2dq\t{$src, $dst|$dst, $src}",
+ [(set VR256:$dst,
+ (int_x86_avx_cvt_ps2dq_256 (memopv8f32 addr:$src)))],
+ IIC_SSE_CVT_PS_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
+def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvtps2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))],
+ IIC_SSE_CVT_PS_RR>, Sched<[WriteCvtF2I]>;
+def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "cvtps2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (int_x86_sse2_cvtps2dq (memopv4f32 addr:$src)))],
+ IIC_SSE_CVT_PS_RM>, Sched<[WriteCvtF2ILd]>;
+
+
+// Convert Packed Double FP to Packed DW Integers
+let Predicates = [HasAVX] in {
+// The assembler can recognize rr 256-bit instructions by seeing a ymm
+// register, but the same isn't true when using memory operands instead.
+// Provide other assembly rr and rm forms to address this explicitly.
+def VCVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "vcvtpd2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))]>,
+ VEX, Sched<[WriteCvtF2I]>;
+
+// XMM only
+def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
+ (VCVTPD2DQrr VR128:$dst, VR128:$src)>;
+def VCVTPD2DQXrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "vcvtpd2dqx\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (int_x86_sse2_cvtpd2dq (memopv2f64 addr:$src)))]>, VEX,
+ Sched<[WriteCvtF2ILd]>;
+
+// YMM only
+def VCVTPD2DQYrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
+ "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (int_x86_avx_cvt_pd2dq_256 VR256:$src))]>, VEX, VEX_L,
+ Sched<[WriteCvtF2I]>;
+def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
+ "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (int_x86_avx_cvt_pd2dq_256 (memopv4f64 addr:$src)))]>,
+ VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
+def : InstAlias<"vcvtpd2dq\t{$src, $dst|$dst, $src}",
+ (VCVTPD2DQYrr VR128:$dst, VR256:$src)>;
+}
+
+def CVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "cvtpd2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (int_x86_sse2_cvtpd2dq (memopv2f64 addr:$src)))],
+ IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtF2ILd]>;
+def CVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvtpd2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))],
+ IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtF2I]>;
+
+// Convert with truncation packed single/double fp to doubleword
+// SSE2 packed instructions with XS prefix
+def VCVTTPS2DQrr : VS2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvttps2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (int_x86_sse2_cvttps2dq VR128:$src))],
+ IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>;
+def VCVTTPS2DQrm : VS2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "cvttps2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvttps2dq
+ (memopv4f32 addr:$src)))],
+ IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>;
+def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
+ "cvttps2dq\t{$src, $dst|$dst, $src}",
+ [(set VR256:$dst,
+ (int_x86_avx_cvtt_ps2dq_256 VR256:$src))],
+ IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
+def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
+ "cvttps2dq\t{$src, $dst|$dst, $src}",
+ [(set VR256:$dst, (int_x86_avx_cvtt_ps2dq_256
+ (memopv8f32 addr:$src)))],
+ IIC_SSE_CVT_PS_RM>, VEX, VEX_L,
+ Sched<[WriteCvtF2ILd]>;
+
+def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvttps2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvttps2dq VR128:$src))],
+ IIC_SSE_CVT_PS_RR>, Sched<[WriteCvtF2I]>;
+def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "cvttps2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (int_x86_sse2_cvttps2dq (memopv4f32 addr:$src)))],
+ IIC_SSE_CVT_PS_RM>, Sched<[WriteCvtF2ILd]>;
+
+let Predicates = [HasAVX] in {
+ def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))),
+ (VCVTDQ2PSrr VR128:$src)>;
+ def : Pat<(v4f32 (sint_to_fp (bc_v4i32 (memopv2i64 addr:$src)))),
+ (VCVTDQ2PSrm addr:$src)>;
+
+ def : Pat<(int_x86_sse2_cvtdq2ps VR128:$src),
+ (VCVTDQ2PSrr VR128:$src)>;
+ def : Pat<(int_x86_sse2_cvtdq2ps (bc_v4i32 (memopv2i64 addr:$src))),
+ (VCVTDQ2PSrm addr:$src)>;
+
+ def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))),
+ (VCVTTPS2DQrr VR128:$src)>;
+ def : Pat<(v4i32 (fp_to_sint (memopv4f32 addr:$src))),
+ (VCVTTPS2DQrm addr:$src)>;
+
+ def : Pat<(v8f32 (sint_to_fp (v8i32 VR256:$src))),
+ (VCVTDQ2PSYrr VR256:$src)>;
+ def : Pat<(v8f32 (sint_to_fp (bc_v8i32 (memopv4i64 addr:$src)))),
+ (VCVTDQ2PSYrm addr:$src)>;
+
+ def : Pat<(v8i32 (fp_to_sint (v8f32 VR256:$src))),
+ (VCVTTPS2DQYrr VR256:$src)>;
+ def : Pat<(v8i32 (fp_to_sint (memopv8f32 addr:$src))),
+ (VCVTTPS2DQYrm addr:$src)>;
+}
+
+let Predicates = [UseSSE2] in {
+ def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))),
+ (CVTDQ2PSrr VR128:$src)>;
+ def : Pat<(v4f32 (sint_to_fp (bc_v4i32 (memopv2i64 addr:$src)))),
+ (CVTDQ2PSrm addr:$src)>;
+
+ def : Pat<(int_x86_sse2_cvtdq2ps VR128:$src),
+ (CVTDQ2PSrr VR128:$src)>;
+ def : Pat<(int_x86_sse2_cvtdq2ps (bc_v4i32 (memopv2i64 addr:$src))),
+ (CVTDQ2PSrm addr:$src)>;
+
+ def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))),
+ (CVTTPS2DQrr VR128:$src)>;
+ def : Pat<(v4i32 (fp_to_sint (memopv4f32 addr:$src))),
+ (CVTTPS2DQrm addr:$src)>;
+}
+
+def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvttpd2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (int_x86_sse2_cvttpd2dq VR128:$src))],
+ IIC_SSE_CVT_PD_RR>, VEX, Sched<[WriteCvtF2I]>;
+
+// The assembler can recognize rr 256-bit instructions by seeing a ymm
+// register, but the same isn't true when using memory operands instead.
+// Provide other assembly rr and rm forms to address this explicitly.
+
+// XMM only
+def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}",
+ (VCVTTPD2DQrr VR128:$dst, VR128:$src)>;
+def VCVTTPD2DQXrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "cvttpd2dqx\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvttpd2dq
+ (memopv2f64 addr:$src)))],
+ IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2ILd]>;
+
+// YMM only
+def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
+ "cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (int_x86_avx_cvtt_pd2dq_256 VR256:$src))],
+ IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
+def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
+ "cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (int_x86_avx_cvtt_pd2dq_256 (memopv4f64 addr:$src)))],
+ IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
+def : InstAlias<"vcvttpd2dq\t{$src, $dst|$dst, $src}",
+ (VCVTTPD2DQYrr VR128:$dst, VR256:$src)>;
+
+let Predicates = [HasAVX] in {
+ def : Pat<(v4i32 (fp_to_sint (v4f64 VR256:$src))),
+ (VCVTTPD2DQYrr VR256:$src)>;
+ def : Pat<(v4i32 (fp_to_sint (memopv4f64 addr:$src))),
+ (VCVTTPD2DQYrm addr:$src)>;
+} // Predicates = [HasAVX]
+
+def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvttpd2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvttpd2dq VR128:$src))],
+ IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtF2I]>;
+def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src),
+ "cvttpd2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvttpd2dq
+ (memopv2f64 addr:$src)))],
+ IIC_SSE_CVT_PD_RM>,
+ Sched<[WriteCvtF2ILd]>;
+
+// Convert packed single to packed double
+let Predicates = [HasAVX] in {
+ // SSE2 instructions without OpSize prefix
+def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "vcvtps2pd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))],
+ IIC_SSE_CVT_PD_RR>, TB, VEX, Sched<[WriteCvtF2F]>;
+def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
+ "vcvtps2pd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))],
+ IIC_SSE_CVT_PD_RM>, TB, VEX, Sched<[WriteCvtF2FLd]>;
+def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
+ "vcvtps2pd\t{$src, $dst|$dst, $src}",
+ [(set VR256:$dst,
+ (int_x86_avx_cvt_ps2_pd_256 VR128:$src))],
+ IIC_SSE_CVT_PD_RR>, TB, VEX, VEX_L, Sched<[WriteCvtF2F]>;
+def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src),
+ "vcvtps2pd\t{$src, $dst|$dst, $src}",
+ [(set VR256:$dst,
+ (int_x86_avx_cvt_ps2_pd_256 (memopv4f32 addr:$src)))],
+ IIC_SSE_CVT_PD_RM>, TB, VEX, VEX_L, Sched<[WriteCvtF2FLd]>;
+}
+
+let Predicates = [UseSSE2] in {
+def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvtps2pd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))],
+ IIC_SSE_CVT_PD_RR>, TB, Sched<[WriteCvtF2F]>;
+def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
+ "cvtps2pd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))],
+ IIC_SSE_CVT_PD_RM>, TB, Sched<[WriteCvtF2FLd]>;
+}
+
+// Convert Packed DW Integers to Packed Double FP
+let Predicates = [HasAVX] in {
+let neverHasSideEffects = 1, mayLoad = 1 in
+def VCVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+ "vcvtdq2pd\t{$src, $dst|$dst, $src}",
+ []>, VEX, Sched<[WriteCvtI2FLd]>;
+def VCVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "vcvtdq2pd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (int_x86_sse2_cvtdq2pd VR128:$src))]>, VEX,
+ Sched<[WriteCvtI2F]>;
+def VCVTDQ2PDYrm : S2SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src),
+ "vcvtdq2pd\t{$src, $dst|$dst, $src}",
+ [(set VR256:$dst,
+ (int_x86_avx_cvtdq2_pd_256
+ (bitconvert (memopv2i64 addr:$src))))]>, VEX, VEX_L,
+ Sched<[WriteCvtI2FLd]>;
+def VCVTDQ2PDYrr : S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
+ "vcvtdq2pd\t{$src, $dst|$dst, $src}",
+ [(set VR256:$dst,
+ (int_x86_avx_cvtdq2_pd_256 VR128:$src))]>, VEX, VEX_L,
+ Sched<[WriteCvtI2F]>;
+}
+
+let neverHasSideEffects = 1, mayLoad = 1 in
+def CVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+ "cvtdq2pd\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtI2FLd]>;
+def CVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvtdq2pd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvtdq2pd VR128:$src))],
+ IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtI2F]>;
+
+// AVX 256-bit register conversion intrinsics
+let Predicates = [HasAVX] in {
+ def : Pat<(v4f64 (sint_to_fp (v4i32 VR128:$src))),
+ (VCVTDQ2PDYrr VR128:$src)>;
+ def : Pat<(v4f64 (sint_to_fp (bc_v4i32 (memopv2i64 addr:$src)))),
+ (VCVTDQ2PDYrm addr:$src)>;
+} // Predicates = [HasAVX]
+
+// Convert packed double to packed single
+// The assembler can recognize rr 256-bit instructions by seeing a ymm
+// register, but the same isn't true when using memory operands instead.
+// Provide other assembly rr and rm forms to address this explicitly.
+def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvtpd2ps\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))],
+ IIC_SSE_CVT_PD_RR>, VEX, Sched<[WriteCvtF2F]>;
+
+// XMM only
+def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}",
+ (VCVTPD2PSrr VR128:$dst, VR128:$src)>;
+def VCVTPD2PSXrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "cvtpd2psx\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (int_x86_sse2_cvtpd2ps (memopv2f64 addr:$src)))],
+ IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2FLd]>;
+
+// YMM only
+def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
+ "cvtpd2ps{y}\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (int_x86_avx_cvt_pd2_ps_256 VR256:$src))],
+ IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2F]>;
+def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
+ "cvtpd2ps{y}\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (int_x86_avx_cvt_pd2_ps_256 (memopv4f64 addr:$src)))],
+ IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2FLd]>;
+def : InstAlias<"vcvtpd2ps\t{$src, $dst|$dst, $src}",
+ (VCVTPD2PSYrr VR128:$dst, VR256:$src)>;
+
+def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvtpd2ps\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))],
+ IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtF2F]>;
+def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "cvtpd2ps\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (int_x86_sse2_cvtpd2ps (memopv2f64 addr:$src)))],
+ IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtF2FLd]>;
+
+
+// AVX 256-bit register conversion intrinsics
+// FIXME: Migrate SSE conversion intrinsics matching to use patterns as below
+// whenever possible to avoid declaring two versions of each one.
+let Predicates = [HasAVX] in {
+ def : Pat<(int_x86_avx_cvtdq2_ps_256 VR256:$src),
+ (VCVTDQ2PSYrr VR256:$src)>;
+ def : Pat<(int_x86_avx_cvtdq2_ps_256 (bitconvert (memopv4i64 addr:$src))),
+ (VCVTDQ2PSYrm addr:$src)>;
+
+ // Match fround and fextend for 128/256-bit conversions
+ def : Pat<(v4f32 (X86vfpround (v2f64 VR128:$src))),
+ (VCVTPD2PSrr VR128:$src)>;
+ def : Pat<(v4f32 (X86vfpround (memopv2f64 addr:$src))),
+ (VCVTPD2PSXrm addr:$src)>;
+ def : Pat<(v4f32 (fround (v4f64 VR256:$src))),
+ (VCVTPD2PSYrr VR256:$src)>;
+ def : Pat<(v4f32 (fround (loadv4f64 addr:$src))),
+ (VCVTPD2PSYrm addr:$src)>;
+
+ def : Pat<(v2f64 (X86vfpext (v4f32 VR128:$src))),
+ (VCVTPS2PDrr VR128:$src)>;
+ def : Pat<(v4f64 (fextend (v4f32 VR128:$src))),
+ (VCVTPS2PDYrr VR128:$src)>;
+ def : Pat<(v4f64 (extloadv4f32 addr:$src)),
+ (VCVTPS2PDYrm addr:$src)>;
+}
+
+let Predicates = [UseSSE2] in {
+ // Match fround and fextend for 128 conversions
+ def : Pat<(v4f32 (X86vfpround (v2f64 VR128:$src))),
+ (CVTPD2PSrr VR128:$src)>;
+ def : Pat<(v4f32 (X86vfpround (memopv2f64 addr:$src))),
+ (CVTPD2PSrm addr:$src)>;
+
+ def : Pat<(v2f64 (X86vfpext (v4f32 VR128:$src))),
+ (CVTPS2PDrr VR128:$src)>;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Compare Instructions
+//===----------------------------------------------------------------------===//
+
+// sse12_cmp_scalar - sse 1 & 2 compare scalar instructions
+multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop,
+ Operand CC, SDNode OpNode, ValueType VT,
+ PatFrag ld_frag, string asm, string asm_alt,
+ OpndItins itins> {
+ def rr : SIi8<0xC2, MRMSrcReg,
+ (outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm,
+ [(set RC:$dst, (OpNode (VT RC:$src1), RC:$src2, imm:$cc))],
+ itins.rr>, Sched<[itins.Sched]>;
+ def rm : SIi8<0xC2, MRMSrcMem,
+ (outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm,
+ [(set RC:$dst, (OpNode (VT RC:$src1),
+ (ld_frag addr:$src2), imm:$cc))],
+ itins.rm>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
+
+ // Accept explicit immediate argument form instead of comparison code.
+ let neverHasSideEffects = 1 in {
+ def rr_alt : SIi8<0xC2, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, i8imm:$cc), asm_alt, [],
+ IIC_SSE_ALU_F32S_RR>, Sched<[itins.Sched]>;
+ let mayLoad = 1 in
+ def rm_alt : SIi8<0xC2, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop:$src2, i8imm:$cc), asm_alt, [],
+ IIC_SSE_ALU_F32S_RM>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ }
+}
+
+defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, AVXCC, X86cmpss, f32, loadf32,
+ "cmp${cc}ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ "cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
+ SSE_ALU_F32S>,
+ XS, VEX_4V, VEX_LIG;
+defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, AVXCC, X86cmpsd, f64, loadf64,
+ "cmp${cc}sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ "cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
+ SSE_ALU_F32S>, // same latency as 32 bit compare
+ XD, VEX_4V, VEX_LIG;
+
+let Constraints = "$src1 = $dst" in {
+ defm CMPSS : sse12_cmp_scalar<FR32, f32mem, SSECC, X86cmpss, f32, loadf32,
+ "cmp${cc}ss\t{$src2, $dst|$dst, $src2}",
+ "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}", SSE_ALU_F32S>,
+ XS;
+ defm CMPSD : sse12_cmp_scalar<FR64, f64mem, SSECC, X86cmpsd, f64, loadf64,
+ "cmp${cc}sd\t{$src2, $dst|$dst, $src2}",
+ "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
+ SSE_ALU_F32S>, // same latency as 32 bit compare
+ XD;
+}
+
+multiclass sse12_cmp_scalar_int<X86MemOperand x86memop, Operand CC,
+ Intrinsic Int, string asm, OpndItins itins> {
+ def rr : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src, CC:$cc), asm,
+ [(set VR128:$dst, (Int VR128:$src1,
+ VR128:$src, imm:$cc))],
+ itins.rr>,
+ Sched<[itins.Sched]>;
+ def rm : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, x86memop:$src, CC:$cc), asm,
+ [(set VR128:$dst, (Int VR128:$src1,
+ (load addr:$src), imm:$cc))],
+ itins.rm>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+
+// Aliases to match intrinsics which expect XMM operand(s).
+defm Int_VCMPSS : sse12_cmp_scalar_int<f32mem, AVXCC, int_x86_sse_cmp_ss,
+ "cmp${cc}ss\t{$src, $src1, $dst|$dst, $src1, $src}",
+ SSE_ALU_F32S>,
+ XS, VEX_4V;
+defm Int_VCMPSD : sse12_cmp_scalar_int<f64mem, AVXCC, int_x86_sse2_cmp_sd,
+ "cmp${cc}sd\t{$src, $src1, $dst|$dst, $src1, $src}",
+ SSE_ALU_F32S>, // same latency as f32
+ XD, VEX_4V;
+let Constraints = "$src1 = $dst" in {
+ defm Int_CMPSS : sse12_cmp_scalar_int<f32mem, SSECC, int_x86_sse_cmp_ss,
+ "cmp${cc}ss\t{$src, $dst|$dst, $src}",
+ SSE_ALU_F32S>, XS;
+ defm Int_CMPSD : sse12_cmp_scalar_int<f64mem, SSECC, int_x86_sse2_cmp_sd,
+ "cmp${cc}sd\t{$src, $dst|$dst, $src}",
+ SSE_ALU_F32S>, // same latency as f32
+ XD;
+}
+
+
+// sse12_ord_cmp - Unordered/Ordered scalar fp compare and set EFLAGS
+multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDNode OpNode,
+ ValueType vt, X86MemOperand x86memop,
+ PatFrag ld_frag, string OpcodeStr, Domain d> {
+ def rr: PI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
+ [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))],
+ IIC_SSE_COMIS_RR, d>,
+ Sched<[WriteFAdd]>;
+ def rm: PI<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
+ [(set EFLAGS, (OpNode (vt RC:$src1),
+ (ld_frag addr:$src2)))],
+ IIC_SSE_COMIS_RM, d>,
+ Sched<[WriteFAddLd, ReadAfterLd]>;
+}
+
+let Defs = [EFLAGS] in {
+ defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
+ "ucomiss", SSEPackedSingle>, TB, VEX, VEX_LIG;
+ defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64,
+ "ucomisd", SSEPackedDouble>, TB, OpSize, VEX,
+ VEX_LIG;
+ let Pattern = []<dag> in {
+ defm VCOMISS : sse12_ord_cmp<0x2F, VR128, undef, v4f32, f128mem, load,
+ "comiss", SSEPackedSingle>, TB, VEX,
+ VEX_LIG;
+ defm VCOMISD : sse12_ord_cmp<0x2F, VR128, undef, v2f64, f128mem, load,
+ "comisd", SSEPackedDouble>, TB, OpSize, VEX,
+ VEX_LIG;
+ }
+
+ defm Int_VUCOMISS : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem,
+ load, "ucomiss", SSEPackedSingle>, TB, VEX;
+ defm Int_VUCOMISD : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, f128mem,
+ load, "ucomisd", SSEPackedDouble>, TB, OpSize, VEX;
+
+ defm Int_VCOMISS : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem,
+ load, "comiss", SSEPackedSingle>, TB, VEX;
+ defm Int_VCOMISD : sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem,
+ load, "comisd", SSEPackedDouble>, TB, OpSize, VEX;
+ defm UCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
+ "ucomiss", SSEPackedSingle>, TB;
+ defm UCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64,
+ "ucomisd", SSEPackedDouble>, TB, OpSize;
+
+ let Pattern = []<dag> in {
+ defm COMISS : sse12_ord_cmp<0x2F, VR128, undef, v4f32, f128mem, load,
+ "comiss", SSEPackedSingle>, TB;
+ defm COMISD : sse12_ord_cmp<0x2F, VR128, undef, v2f64, f128mem, load,
+ "comisd", SSEPackedDouble>, TB, OpSize;
+ }
+
+ defm Int_UCOMISS : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem,
+ load, "ucomiss", SSEPackedSingle>, TB;
+ defm Int_UCOMISD : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, f128mem,
+ load, "ucomisd", SSEPackedDouble>, TB, OpSize;
+
+ defm Int_COMISS : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem, load,
+ "comiss", SSEPackedSingle>, TB;
+ defm Int_COMISD : sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem, load,
+ "comisd", SSEPackedDouble>, TB, OpSize;
+} // Defs = [EFLAGS]
+
+// sse12_cmp_packed - sse 1 & 2 compare packed instructions
+multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop,
+ Operand CC, Intrinsic Int, string asm,
+ string asm_alt, Domain d> {
+ def rri : PIi8<0xC2, MRMSrcReg,
+ (outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm,
+ [(set RC:$dst, (Int RC:$src1, RC:$src2, imm:$cc))],
+ IIC_SSE_CMPP_RR, d>,
+ Sched<[WriteFAdd]>;
+ def rmi : PIi8<0xC2, MRMSrcMem,
+ (outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm,
+ [(set RC:$dst, (Int RC:$src1, (memop addr:$src2), imm:$cc))],
+ IIC_SSE_CMPP_RM, d>,
+ Sched<[WriteFAddLd, ReadAfterLd]>;
+
+ // Accept explicit immediate argument form instead of comparison code.
+ let neverHasSideEffects = 1 in {
+ def rri_alt : PIi8<0xC2, MRMSrcReg,
+ (outs RC:$dst), (ins RC:$src1, RC:$src2, i8imm:$cc),
+ asm_alt, [], IIC_SSE_CMPP_RR, d>, Sched<[WriteFAdd]>;
+ def rmi_alt : PIi8<0xC2, MRMSrcMem,
+ (outs RC:$dst), (ins RC:$src1, x86memop:$src2, i8imm:$cc),
+ asm_alt, [], IIC_SSE_CMPP_RM, d>,
+ Sched<[WriteFAddLd, ReadAfterLd]>;
+ }
+}
+
+defm VCMPPS : sse12_cmp_packed<VR128, f128mem, AVXCC, int_x86_sse_cmp_ps,
+ "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
+ SSEPackedSingle>, TB, VEX_4V;
+defm VCMPPD : sse12_cmp_packed<VR128, f128mem, AVXCC, int_x86_sse2_cmp_pd,
+ "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
+ SSEPackedDouble>, TB, OpSize, VEX_4V;
+defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, AVXCC, int_x86_avx_cmp_ps_256,
+ "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
+ SSEPackedSingle>, TB, VEX_4V, VEX_L;
+defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, AVXCC, int_x86_avx_cmp_pd_256,
+ "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
+ SSEPackedDouble>, TB, OpSize, VEX_4V, VEX_L;
+let Constraints = "$src1 = $dst" in {
+ defm CMPPS : sse12_cmp_packed<VR128, f128mem, SSECC, int_x86_sse_cmp_ps,
+ "cmp${cc}ps\t{$src2, $dst|$dst, $src2}",
+ "cmpps\t{$cc, $src2, $dst|$dst, $src2, $cc}",
+ SSEPackedSingle>, TB;
+ defm CMPPD : sse12_cmp_packed<VR128, f128mem, SSECC, int_x86_sse2_cmp_pd,
+ "cmp${cc}pd\t{$src2, $dst|$dst, $src2}",
+ "cmppd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
+ SSEPackedDouble>, TB, OpSize;
+}
+
+let Predicates = [HasAVX] in {
+def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), VR128:$src2, imm:$cc)),
+ (VCMPPSrri (v4f32 VR128:$src1), (v4f32 VR128:$src2), imm:$cc)>;
+def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), (memop addr:$src2), imm:$cc)),
+ (VCMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>;
+def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), VR128:$src2, imm:$cc)),
+ (VCMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>;
+def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), (memop addr:$src2), imm:$cc)),
+ (VCMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>;
+
+def : Pat<(v8i32 (X86cmpp (v8f32 VR256:$src1), VR256:$src2, imm:$cc)),
+ (VCMPPSYrri (v8f32 VR256:$src1), (v8f32 VR256:$src2), imm:$cc)>;
+def : Pat<(v8i32 (X86cmpp (v8f32 VR256:$src1), (memop addr:$src2), imm:$cc)),
+ (VCMPPSYrmi (v8f32 VR256:$src1), addr:$src2, imm:$cc)>;
+def : Pat<(v4i64 (X86cmpp (v4f64 VR256:$src1), VR256:$src2, imm:$cc)),
+ (VCMPPDYrri VR256:$src1, VR256:$src2, imm:$cc)>;
+def : Pat<(v4i64 (X86cmpp (v4f64 VR256:$src1), (memop addr:$src2), imm:$cc)),
+ (VCMPPDYrmi VR256:$src1, addr:$src2, imm:$cc)>;
+}
+
+let Predicates = [UseSSE1] in {
+def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), VR128:$src2, imm:$cc)),
+ (CMPPSrri (v4f32 VR128:$src1), (v4f32 VR128:$src2), imm:$cc)>;
+def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), (memop addr:$src2), imm:$cc)),
+ (CMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>;
+}
+
+let Predicates = [UseSSE2] in {
+def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), VR128:$src2, imm:$cc)),
+ (CMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>;
+def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), (memop addr:$src2), imm:$cc)),
+ (CMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Shuffle Instructions
+//===----------------------------------------------------------------------===//
+
+/// sse12_shuffle - sse 1 & 2 shuffle instructions
+multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop,
+ ValueType vt, string asm, PatFrag mem_frag,
+ Domain d, bit IsConvertibleToThreeAddress = 0> {
+ def rmi : PIi8<0xC6, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop:$src2, i8imm:$src3), asm,
+ [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2),
+ (i8 imm:$src3))))], IIC_SSE_SHUFP, d>,
+ Sched<[WriteShuffleLd, ReadAfterLd]>;
+ let isConvertibleToThreeAddress = IsConvertibleToThreeAddress in
+ def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, i8imm:$src3), asm,
+ [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2,
+ (i8 imm:$src3))))], IIC_SSE_SHUFP, d>,
+ Sched<[WriteShuffle]>;
+}
+
+defm VSHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
+ "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ memopv4f32, SSEPackedSingle>, TB, VEX_4V;
+defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32,
+ "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ memopv8f32, SSEPackedSingle>, TB, VEX_4V, VEX_L;
+defm VSHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
+ "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src2, $src2, $src3}",
+ memopv2f64, SSEPackedDouble>, TB, OpSize, VEX_4V;
+defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64,
+ "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src2, $src2, $src3}",
+ memopv4f64, SSEPackedDouble>, TB, OpSize, VEX_4V, VEX_L;
+
+let Constraints = "$src1 = $dst" in {
+ defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
+ "shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ memopv4f32, SSEPackedSingle, 1 /* cvt to pshufd */>,
+ TB;
+ defm SHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
+ "shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ memopv2f64, SSEPackedDouble, 1 /* cvt to pshufd */>,
+ TB, OpSize;
+}
+
+let Predicates = [HasAVX] in {
+ def : Pat<(v4i32 (X86Shufp VR128:$src1,
+ (bc_v4i32 (memopv2i64 addr:$src2)), (i8 imm:$imm))),
+ (VSHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>;
+ def : Pat<(v4i32 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+ (VSHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>;
+
+ def : Pat<(v2i64 (X86Shufp VR128:$src1,
+ (memopv2i64 addr:$src2), (i8 imm:$imm))),
+ (VSHUFPDrmi VR128:$src1, addr:$src2, imm:$imm)>;
+ def : Pat<(v2i64 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+ (VSHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>;
+
+ // 256-bit patterns
+ def : Pat<(v8i32 (X86Shufp VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+ (VSHUFPSYrri VR256:$src1, VR256:$src2, imm:$imm)>;
+ def : Pat<(v8i32 (X86Shufp VR256:$src1,
+ (bc_v8i32 (memopv4i64 addr:$src2)), (i8 imm:$imm))),
+ (VSHUFPSYrmi VR256:$src1, addr:$src2, imm:$imm)>;
+
+ def : Pat<(v4i64 (X86Shufp VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+ (VSHUFPDYrri VR256:$src1, VR256:$src2, imm:$imm)>;
+ def : Pat<(v4i64 (X86Shufp VR256:$src1,
+ (memopv4i64 addr:$src2), (i8 imm:$imm))),
+ (VSHUFPDYrmi VR256:$src1, addr:$src2, imm:$imm)>;
+}
+
+let Predicates = [UseSSE1] in {
+ def : Pat<(v4i32 (X86Shufp VR128:$src1,
+ (bc_v4i32 (memopv2i64 addr:$src2)), (i8 imm:$imm))),
+ (SHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>;
+ def : Pat<(v4i32 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+ (SHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>;
+}
+
+let Predicates = [UseSSE2] in {
+ // Generic SHUFPD patterns
+ def : Pat<(v2i64 (X86Shufp VR128:$src1,
+ (memopv2i64 addr:$src2), (i8 imm:$imm))),
+ (SHUFPDrmi VR128:$src1, addr:$src2, imm:$imm)>;
+ def : Pat<(v2i64 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+ (SHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Unpack Instructions
+//===----------------------------------------------------------------------===//
+
+/// sse12_unpack_interleave - sse 1 & 2 unpack and interleave
+multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt,
+ PatFrag mem_frag, RegisterClass RC,
+ X86MemOperand x86memop, string asm,
+ Domain d> {
+ def rr : PI<opc, MRMSrcReg,
+ (outs RC:$dst), (ins RC:$src1, RC:$src2),
+ asm, [(set RC:$dst,
+ (vt (OpNode RC:$src1, RC:$src2)))],
+ IIC_SSE_UNPCK, d>, Sched<[WriteShuffle]>;
+ def rm : PI<opc, MRMSrcMem,
+ (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+ asm, [(set RC:$dst,
+ (vt (OpNode RC:$src1,
+ (mem_frag addr:$src2))))],
+ IIC_SSE_UNPCK, d>,
+ Sched<[WriteShuffleLd, ReadAfterLd]>;
+}
+
+defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memopv4f32,
+ VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ SSEPackedSingle>, TB, VEX_4V;
+defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memopv2f64,
+ VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ SSEPackedDouble>, TB, OpSize, VEX_4V;
+defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memopv4f32,
+ VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ SSEPackedSingle>, TB, VEX_4V;
+defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memopv2f64,
+ VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ SSEPackedDouble>, TB, OpSize, VEX_4V;
+
+defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, memopv8f32,
+ VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ SSEPackedSingle>, TB, VEX_4V, VEX_L;
+defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, memopv4f64,
+ VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ SSEPackedDouble>, TB, OpSize, VEX_4V, VEX_L;
+defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, memopv8f32,
+ VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ SSEPackedSingle>, TB, VEX_4V, VEX_L;
+defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, memopv4f64,
+ VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ SSEPackedDouble>, TB, OpSize, VEX_4V, VEX_L;
+
+let Constraints = "$src1 = $dst" in {
+ defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memopv4f32,
+ VR128, f128mem, "unpckhps\t{$src2, $dst|$dst, $src2}",
+ SSEPackedSingle>, TB;
+ defm UNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memopv2f64,
+ VR128, f128mem, "unpckhpd\t{$src2, $dst|$dst, $src2}",
+ SSEPackedDouble>, TB, OpSize;
+ defm UNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memopv4f32,
+ VR128, f128mem, "unpcklps\t{$src2, $dst|$dst, $src2}",
+ SSEPackedSingle>, TB;
+ defm UNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memopv2f64,
+ VR128, f128mem, "unpcklpd\t{$src2, $dst|$dst, $src2}",
+ SSEPackedDouble>, TB, OpSize;
+} // Constraints = "$src1 = $dst"
+
+let Predicates = [HasAVX1Only] in {
+ def : Pat<(v8i32 (X86Unpckl VR256:$src1, (bc_v8i32 (memopv4i64 addr:$src2)))),
+ (VUNPCKLPSYrm VR256:$src1, addr:$src2)>;
+ def : Pat<(v8i32 (X86Unpckl VR256:$src1, VR256:$src2)),
+ (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>;
+ def : Pat<(v8i32 (X86Unpckh VR256:$src1, (bc_v8i32 (memopv4i64 addr:$src2)))),
+ (VUNPCKHPSYrm VR256:$src1, addr:$src2)>;
+ def : Pat<(v8i32 (X86Unpckh VR256:$src1, VR256:$src2)),
+ (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>;
+
+ def : Pat<(v4i64 (X86Unpckl VR256:$src1, (memopv4i64 addr:$src2))),
+ (VUNPCKLPDYrm VR256:$src1, addr:$src2)>;
+ def : Pat<(v4i64 (X86Unpckl VR256:$src1, VR256:$src2)),
+ (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>;
+ def : Pat<(v4i64 (X86Unpckh VR256:$src1, (memopv4i64 addr:$src2))),
+ (VUNPCKHPDYrm VR256:$src1, addr:$src2)>;
+ def : Pat<(v4i64 (X86Unpckh VR256:$src1, VR256:$src2)),
+ (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>;
+}
+
+let Predicates = [HasAVX] in {
+ // FIXME: Instead of X86Movddup, there should be a X86Unpckl here, the
+ // problem is during lowering, where it's not possible to recognize the load
+ // fold cause it has two uses through a bitcast. One use disappears at isel
+ // time and the fold opportunity reappears.
+ def : Pat<(v2f64 (X86Movddup VR128:$src)),
+ (VUNPCKLPDrr VR128:$src, VR128:$src)>;
+}
+
+let Predicates = [UseSSE2] in {
+ // FIXME: Instead of X86Movddup, there should be a X86Unpckl here, the
+ // problem is during lowering, where it's not possible to recognize the load
+ // fold cause it has two uses through a bitcast. One use disappears at isel
+ // time and the fold opportunity reappears.
+ def : Pat<(v2f64 (X86Movddup VR128:$src)),
+ (UNPCKLPDrr VR128:$src, VR128:$src)>;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Extract Floating-Point Sign mask
+//===----------------------------------------------------------------------===//
+
+/// sse12_extr_sign_mask - sse 1 & 2 unpack and interleave
+multiclass sse12_extr_sign_mask<RegisterClass RC, Intrinsic Int, string asm,
+ Domain d> {
+ def rr32 : PI<0x50, MRMSrcReg, (outs GR32:$dst), (ins RC:$src),
+ !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
+ [(set GR32:$dst, (Int RC:$src))], IIC_SSE_MOVMSK, d>,
+ Sched<[WriteVecLogic]>;
+ def rr64 : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins RC:$src),
+ !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [],
+ IIC_SSE_MOVMSK, d>, REX_W, Sched<[WriteVecLogic]>;
+}
+
+let Predicates = [HasAVX] in {
+ defm VMOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps,
+ "movmskps", SSEPackedSingle>, TB, VEX;
+ defm VMOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd,
+ "movmskpd", SSEPackedDouble>, TB,
+ OpSize, VEX;
+ defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, int_x86_avx_movmsk_ps_256,
+ "movmskps", SSEPackedSingle>, TB,
+ VEX, VEX_L;
+ defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, int_x86_avx_movmsk_pd_256,
+ "movmskpd", SSEPackedDouble>, TB,
+ OpSize, VEX, VEX_L;
+
+ def : Pat<(i32 (X86fgetsign FR32:$src)),
+ (VMOVMSKPSrr32 (COPY_TO_REGCLASS FR32:$src, VR128))>;
+ def : Pat<(i64 (X86fgetsign FR32:$src)),
+ (VMOVMSKPSrr64 (COPY_TO_REGCLASS FR32:$src, VR128))>;
+ def : Pat<(i32 (X86fgetsign FR64:$src)),
+ (VMOVMSKPDrr32 (COPY_TO_REGCLASS FR64:$src, VR128))>;
+ def : Pat<(i64 (X86fgetsign FR64:$src)),
+ (VMOVMSKPDrr64 (COPY_TO_REGCLASS FR64:$src, VR128))>;
+
+ // Assembler Only
+ def VMOVMSKPSr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src),
+ "movmskps\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVMSK,
+ SSEPackedSingle>, TB, VEX, Sched<[WriteVecLogic]>;
+ def VMOVMSKPDr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src),
+ "movmskpd\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVMSK,
+ SSEPackedDouble>, TB,
+ OpSize, VEX, Sched<[WriteVecLogic]>;
+ def VMOVMSKPSYr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR256:$src),
+ "movmskps\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVMSK,
+ SSEPackedSingle>, TB, VEX, VEX_L, Sched<[WriteVecLogic]>;
+ def VMOVMSKPDYr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR256:$src),
+ "movmskpd\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVMSK,
+ SSEPackedDouble>, TB,
+ OpSize, VEX, VEX_L, Sched<[WriteVecLogic]>;
+}
+
+defm MOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps, "movmskps",
+ SSEPackedSingle>, TB;
+defm MOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd, "movmskpd",
+ SSEPackedDouble>, TB, OpSize;
+
+def : Pat<(i32 (X86fgetsign FR32:$src)),
+ (MOVMSKPSrr32 (COPY_TO_REGCLASS FR32:$src, VR128))>,
+ Requires<[UseSSE1]>;
+def : Pat<(i64 (X86fgetsign FR32:$src)),
+ (MOVMSKPSrr64 (COPY_TO_REGCLASS FR32:$src, VR128))>,
+ Requires<[UseSSE1]>;
+def : Pat<(i32 (X86fgetsign FR64:$src)),
+ (MOVMSKPDrr32 (COPY_TO_REGCLASS FR64:$src, VR128))>,
+ Requires<[UseSSE2]>;
+def : Pat<(i64 (X86fgetsign FR64:$src)),
+ (MOVMSKPDrr64 (COPY_TO_REGCLASS FR64:$src, VR128))>,
+ Requires<[UseSSE2]>;
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Packed Integer Logical Instructions
+//===---------------------------------------------------------------------===//
+
+let ExeDomain = SSEPackedInt in { // SSE integer instructions
+
+/// PDI_binop_rm - Simple SSE2 binary operator.
+multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
+ X86MemOperand x86memop, OpndItins itins,
+ bit IsCommutable, bit Is2Addr> {
+ let isCommutable = IsCommutable in
+ def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))], itins.rr>,
+ Sched<[itins.Sched]>;
+ def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (OpVT (OpNode RC:$src1,
+ (bitconvert (memop_frag addr:$src2)))))],
+ itins.rm>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+} // ExeDomain = SSEPackedInt
+
+multiclass PDI_binop_all<bits<8> opc, string OpcodeStr, SDNode Opcode,
+ ValueType OpVT128, ValueType OpVT256,
+ OpndItins itins, bit IsCommutable = 0> {
+let Predicates = [HasAVX] in
+ defm V#NAME : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT128,
+ VR128, memopv2i64, i128mem, itins, IsCommutable, 0>, VEX_4V;
+
+let Constraints = "$src1 = $dst" in
+ defm NAME : PDI_binop_rm<opc, OpcodeStr, Opcode, OpVT128, VR128,
+ memopv2i64, i128mem, itins, IsCommutable, 1>;
+
+let Predicates = [HasAVX2] in
+ defm V#NAME#Y : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode,
+ OpVT256, VR256, memopv4i64, i256mem, itins,
+ IsCommutable, 0>, VEX_4V, VEX_L;
+}
+
+// These are ordered here for pattern ordering requirements with the fp versions
+
+defm PAND : PDI_binop_all<0xDB, "pand", and, v2i64, v4i64, SSE_BIT_ITINS_P, 1>;
+defm POR : PDI_binop_all<0xEB, "por", or, v2i64, v4i64, SSE_BIT_ITINS_P, 1>;
+defm PXOR : PDI_binop_all<0xEF, "pxor", xor, v2i64, v4i64, SSE_BIT_ITINS_P, 1>;
+defm PANDN : PDI_binop_all<0xDF, "pandn", X86andnp, v2i64, v4i64,
+ SSE_BIT_ITINS_P, 0>;
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Logical Instructions
+//===----------------------------------------------------------------------===//
+
+/// sse12_fp_alias_pack_logical - SSE 1 & 2 aliased packed FP logical ops
+///
+multiclass sse12_fp_alias_pack_logical<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, OpndItins itins> {
+ defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
+ FR32, f32, f128mem, memopfsf32, SSEPackedSingle, itins, 0>,
+ TB, VEX_4V;
+
+ defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
+ FR64, f64, f128mem, memopfsf64, SSEPackedDouble, itins, 0>,
+ TB, OpSize, VEX_4V;
+
+ let Constraints = "$src1 = $dst" in {
+ defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, FR32,
+ f32, f128mem, memopfsf32, SSEPackedSingle, itins>,
+ TB;
+
+ defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, FR64,
+ f64, f128mem, memopfsf64, SSEPackedDouble, itins>,
+ TB, OpSize;
+ }
+}
+
+// Alias bitwise logical operations using SSE logical ops on packed FP values.
+defm FsAND : sse12_fp_alias_pack_logical<0x54, "and", X86fand,
+ SSE_BIT_ITINS_P>;
+defm FsOR : sse12_fp_alias_pack_logical<0x56, "or", X86for,
+ SSE_BIT_ITINS_P>;
+defm FsXOR : sse12_fp_alias_pack_logical<0x57, "xor", X86fxor,
+ SSE_BIT_ITINS_P>;
+
+let neverHasSideEffects = 1, Pattern = []<dag>, isCommutable = 0 in
+ defm FsANDN : sse12_fp_alias_pack_logical<0x55, "andn", undef,
+ SSE_BIT_ITINS_P>;
+
+/// sse12_fp_packed_logical - SSE 1 & 2 packed FP logical ops
+///
+multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
+ SDNode OpNode> {
+ defm V#NAME#PSY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedSingle,
+ !strconcat(OpcodeStr, "ps"), f256mem,
+ [(set VR256:$dst, (v4i64 (OpNode VR256:$src1, VR256:$src2)))],
+ [(set VR256:$dst, (OpNode (bc_v4i64 (v8f32 VR256:$src1)),
+ (memopv4i64 addr:$src2)))], 0>, TB, VEX_4V, VEX_L;
+
+ defm V#NAME#PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble,
+ !strconcat(OpcodeStr, "pd"), f256mem,
+ [(set VR256:$dst, (OpNode (bc_v4i64 (v4f64 VR256:$src1)),
+ (bc_v4i64 (v4f64 VR256:$src2))))],
+ [(set VR256:$dst, (OpNode (bc_v4i64 (v4f64 VR256:$src1)),
+ (memopv4i64 addr:$src2)))], 0>,
+ TB, OpSize, VEX_4V, VEX_L;
+
+ // In AVX no need to add a pattern for 128-bit logical rr ps, because they
+ // are all promoted to v2i64, and the patterns are covered by the int
+ // version. This is needed in SSE only, because v2i64 isn't supported on
+ // SSE1, but only on SSE2.
+ defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
+ !strconcat(OpcodeStr, "ps"), f128mem, [],
+ [(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)),
+ (memopv2i64 addr:$src2)))], 0>, TB, VEX_4V;
+
+ defm V#NAME#PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
+ !strconcat(OpcodeStr, "pd"), f128mem,
+ [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
+ (bc_v2i64 (v2f64 VR128:$src2))))],
+ [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
+ (memopv2i64 addr:$src2)))], 0>,
+ TB, OpSize, VEX_4V;
+
+ let Constraints = "$src1 = $dst" in {
+ defm PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
+ !strconcat(OpcodeStr, "ps"), f128mem,
+ [(set VR128:$dst, (v2i64 (OpNode VR128:$src1, VR128:$src2)))],
+ [(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)),
+ (memopv2i64 addr:$src2)))]>, TB;
+
+ defm PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
+ !strconcat(OpcodeStr, "pd"), f128mem,
+ [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
+ (bc_v2i64 (v2f64 VR128:$src2))))],
+ [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
+ (memopv2i64 addr:$src2)))]>, TB, OpSize;
+ }
+}
+
+defm AND : sse12_fp_packed_logical<0x54, "and", and>;
+defm OR : sse12_fp_packed_logical<0x56, "or", or>;
+defm XOR : sse12_fp_packed_logical<0x57, "xor", xor>;
+let isCommutable = 0 in
+ defm ANDN : sse12_fp_packed_logical<0x55, "andn", X86andnp>;
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Arithmetic Instructions
+//===----------------------------------------------------------------------===//
+
+/// basic_sse12_fp_binop_xxx - SSE 1 & 2 binops come in both scalar and
+/// vector forms.
+///
+/// In addition, we also have a special variant of the scalar form here to
+/// represent the associated intrinsic operation. This form is unlike the
+/// plain scalar form, in that it takes an entire vector (instead of a scalar)
+/// and leaves the top elements unmodified (therefore these cannot be commuted).
+///
+/// These three forms can each be reg+reg or reg+mem.
+///
+
+/// FIXME: once all 256-bit intrinsics are matched, cleanup and refactor those
+/// classes below
+multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SizeItins itins,
+ bit Is2Addr = 1> {
+ defm SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
+ OpNode, FR32, f32mem,
+ itins.s, Is2Addr>, XS;
+ defm SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
+ OpNode, FR64, f64mem,
+ itins.d, Is2Addr>, XD;
+}
+
+multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, SizeItins itins> {
+let Predicates = [HasAVX] in {
+ defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
+ VR128, v4f32, f128mem, memopv4f32,
+ SSEPackedSingle, itins.s, 0>, TB, VEX_4V;
+ defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
+ VR128, v2f64, f128mem, memopv2f64,
+ SSEPackedDouble, itins.d, 0>, TB, OpSize, VEX_4V;
+
+ defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"),
+ OpNode, VR256, v8f32, f256mem, memopv8f32,
+ SSEPackedSingle, itins.s, 0>, TB, VEX_4V, VEX_L;
+ defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"),
+ OpNode, VR256, v4f64, f256mem, memopv4f64,
+ SSEPackedDouble, itins.d, 0>, TB, OpSize, VEX_4V, VEX_L;
+}
+
+let Constraints = "$src1 = $dst" in {
+ defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128,
+ v4f32, f128mem, memopv4f32, SSEPackedSingle,
+ itins.s, 1>, TB;
+ defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128,
+ v2f64, f128mem, memopv2f64, SSEPackedDouble,
+ itins.d, 1>, TB, OpSize;
+}
+}
+
+multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr,
+ SizeItins itins,
+ bit Is2Addr = 1> {
+ defm SS : sse12_fp_scalar_int<opc, OpcodeStr, VR128,
+ !strconcat(OpcodeStr, "ss"), "", "_ss", ssmem, sse_load_f32,
+ itins.s, Is2Addr>, XS;
+ defm SD : sse12_fp_scalar_int<opc, OpcodeStr, VR128,
+ !strconcat(OpcodeStr, "sd"), "2", "_sd", sdmem, sse_load_f64,
+ itins.d, Is2Addr>, XD;
+}
+
+// Binary Arithmetic instructions
+defm ADD : basic_sse12_fp_binop_p<0x58, "add", fadd, SSE_ALU_ITINS_P>;
+defm MUL : basic_sse12_fp_binop_p<0x59, "mul", fmul, SSE_MUL_ITINS_P>;
+let isCommutable = 0 in {
+ defm SUB : basic_sse12_fp_binop_p<0x5C, "sub", fsub, SSE_ALU_ITINS_P>;
+ defm DIV : basic_sse12_fp_binop_p<0x5E, "div", fdiv, SSE_DIV_ITINS_P>;
+ defm MAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SSE_ALU_ITINS_P>;
+ defm MIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SSE_ALU_ITINS_P>;
+}
+
+let isCodeGenOnly = 1 in {
+ defm MAXC: basic_sse12_fp_binop_p<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_P>;
+ defm MINC: basic_sse12_fp_binop_p<0x5D, "min", X86fminc, SSE_ALU_ITINS_P>;
+}
+
+defm VADD : basic_sse12_fp_binop_s<0x58, "add", fadd, SSE_ALU_ITINS_S, 0>,
+ basic_sse12_fp_binop_s_int<0x58, "add", SSE_ALU_ITINS_S, 0>,
+ VEX_4V, VEX_LIG;
+defm VMUL : basic_sse12_fp_binop_s<0x59, "mul", fmul, SSE_MUL_ITINS_S, 0>,
+ basic_sse12_fp_binop_s_int<0x59, "mul", SSE_MUL_ITINS_S, 0>,
+ VEX_4V, VEX_LIG;
+
+let isCommutable = 0 in {
+ defm VSUB : basic_sse12_fp_binop_s<0x5C, "sub", fsub, SSE_ALU_ITINS_S, 0>,
+ basic_sse12_fp_binop_s_int<0x5C, "sub", SSE_ALU_ITINS_S, 0>,
+ VEX_4V, VEX_LIG;
+ defm VDIV : basic_sse12_fp_binop_s<0x5E, "div", fdiv, SSE_DIV_ITINS_S, 0>,
+ basic_sse12_fp_binop_s_int<0x5E, "div", SSE_DIV_ITINS_S, 0>,
+ VEX_4V, VEX_LIG;
+ defm VMAX : basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SSE_ALU_ITINS_S, 0>,
+ basic_sse12_fp_binop_s_int<0x5F, "max", SSE_ALU_ITINS_S, 0>,
+ VEX_4V, VEX_LIG;
+ defm VMIN : basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SSE_ALU_ITINS_S, 0>,
+ basic_sse12_fp_binop_s_int<0x5D, "min", SSE_ALU_ITINS_S, 0>,
+ VEX_4V, VEX_LIG;
+}
+
+let Constraints = "$src1 = $dst" in {
+ defm ADD : basic_sse12_fp_binop_s<0x58, "add", fadd, SSE_ALU_ITINS_S>,
+ basic_sse12_fp_binop_s_int<0x58, "add", SSE_ALU_ITINS_S>;
+ defm MUL : basic_sse12_fp_binop_s<0x59, "mul", fmul, SSE_MUL_ITINS_S>,
+ basic_sse12_fp_binop_s_int<0x59, "mul", SSE_MUL_ITINS_S>;
+
+ let isCommutable = 0 in {
+ defm SUB : basic_sse12_fp_binop_s<0x5C, "sub", fsub, SSE_ALU_ITINS_S>,
+ basic_sse12_fp_binop_s_int<0x5C, "sub", SSE_ALU_ITINS_S>;
+ defm DIV : basic_sse12_fp_binop_s<0x5E, "div", fdiv, SSE_DIV_ITINS_S>,
+ basic_sse12_fp_binop_s_int<0x5E, "div", SSE_DIV_ITINS_S>;
+ defm MAX : basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SSE_ALU_ITINS_S>,
+ basic_sse12_fp_binop_s_int<0x5F, "max", SSE_ALU_ITINS_S>;
+ defm MIN : basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SSE_ALU_ITINS_S>,
+ basic_sse12_fp_binop_s_int<0x5D, "min", SSE_ALU_ITINS_S>;
+ }
+}
+
+let isCodeGenOnly = 1 in {
+ defm VMAXC: basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_S, 0>,
+ VEX_4V, VEX_LIG;
+ defm VMINC: basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SSE_ALU_ITINS_S, 0>,
+ VEX_4V, VEX_LIG;
+ let Constraints = "$src1 = $dst" in {
+ defm MAXC: basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_S>;
+ defm MINC: basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SSE_ALU_ITINS_S>;
+ }
+}
+
+/// Unop Arithmetic
+/// In addition, we also have a special variant of the scalar form here to
+/// represent the associated intrinsic operation. This form is unlike the
+/// plain scalar form, in that it takes an entire vector (instead of a
+/// scalar) and leaves the top elements undefined.
+///
+/// And, we have a special variant form for a full-vector intrinsic form.
+
+let Sched = WriteFSqrt in {
+def SSE_SQRTP : OpndItins<
+ IIC_SSE_SQRTP_RR, IIC_SSE_SQRTP_RM
+>;
+
+def SSE_SQRTS : OpndItins<
+ IIC_SSE_SQRTS_RR, IIC_SSE_SQRTS_RM
+>;
+}
+
+let Sched = WriteFRcp in {
+def SSE_RCPP : OpndItins<
+ IIC_SSE_RCPP_RR, IIC_SSE_RCPP_RM
+>;
+
+def SSE_RCPS : OpndItins<
+ IIC_SSE_RCPS_RR, IIC_SSE_RCPS_RM
+>;
+}
+
+/// sse1_fp_unop_s - SSE1 unops in scalar form.
+multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, Intrinsic F32Int, OpndItins itins> {
+let Predicates = [HasAVX], hasSideEffects = 0 in {
+ def V#NAME#SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst),
+ (ins FR32:$src1, FR32:$src2),
+ !strconcat("v", OpcodeStr,
+ "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, VEX_4V, VEX_LIG, Sched<[itins.Sched]>;
+ let mayLoad = 1 in {
+ def V#NAME#SSm : SSI<opc, MRMSrcMem, (outs FR32:$dst),
+ (ins FR32:$src1,f32mem:$src2),
+ !strconcat("v", OpcodeStr,
+ "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, VEX_4V, VEX_LIG,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ def V#NAME#SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, ssmem:$src2),
+ !strconcat("v", OpcodeStr,
+ "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, VEX_4V, VEX_LIG,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ }
+}
+
+ def SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src),
+ !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
+ [(set FR32:$dst, (OpNode FR32:$src))]>, Sched<[itins.Sched]>;
+ // For scalar unary operations, fold a load into the operation
+ // only in OptForSize mode. It eliminates an instruction, but it also
+ // eliminates a whole-register clobber (the load), so it introduces a
+ // partial register update condition.
+ def SSm : I<opc, MRMSrcMem, (outs FR32:$dst), (ins f32mem:$src),
+ !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
+ [(set FR32:$dst, (OpNode (load addr:$src)))], itins.rm>, XS,
+ Requires<[UseSSE1, OptForSize]>, Sched<[itins.Sched.Folded]>;
+ def SSr_Int : SSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (F32Int VR128:$src))], itins.rr>,
+ Sched<[itins.Sched]>;
+ def SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst), (ins ssmem:$src),
+ !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (F32Int sse_load_f32:$src))], itins.rm>,
+ Sched<[itins.Sched.Folded]>;
+}
+
+/// sse1_fp_unop_s_rw - SSE1 unops where vector form has a read-write operand.
+multiclass sse1_fp_unop_rw<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ OpndItins itins> {
+let Predicates = [HasAVX], hasSideEffects = 0 in {
+ def V#NAME#SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst),
+ (ins FR32:$src1, FR32:$src2),
+ !strconcat("v", OpcodeStr,
+ "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, VEX_4V, VEX_LIG, Sched<[itins.Sched]>;
+ let mayLoad = 1 in {
+ def V#NAME#SSm : SSI<opc, MRMSrcMem, (outs FR32:$dst),
+ (ins FR32:$src1,f32mem:$src2),
+ !strconcat("v", OpcodeStr,
+ "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, VEX_4V, VEX_LIG,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ def V#NAME#SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, ssmem:$src2),
+ !strconcat("v", OpcodeStr,
+ "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, VEX_4V, VEX_LIG,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ }
+}
+
+ def SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src),
+ !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
+ [(set FR32:$dst, (OpNode FR32:$src))]>, Sched<[itins.Sched]>;
+ // For scalar unary operations, fold a load into the operation
+ // only in OptForSize mode. It eliminates an instruction, but it also
+ // eliminates a whole-register clobber (the load), so it introduces a
+ // partial register update condition.
+ def SSm : I<opc, MRMSrcMem, (outs FR32:$dst), (ins f32mem:$src),
+ !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
+ [(set FR32:$dst, (OpNode (load addr:$src)))], itins.rm>, XS,
+ Requires<[UseSSE1, OptForSize]>, Sched<[itins.Sched.Folded]>;
+ let Constraints = "$src1 = $dst" in {
+ def SSr_Int : SSI<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
+ [], itins.rr>, Sched<[itins.Sched]>;
+ let mayLoad = 1, hasSideEffects = 0 in
+ def SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, ssmem:$src2),
+ !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
+ [], itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ }
+}
+
+/// sse1_fp_unop_p - SSE1 unops in packed form.
+multiclass sse1_fp_unop_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ OpndItins itins> {
+let Predicates = [HasAVX] in {
+ def V#NAME#PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ !strconcat("v", OpcodeStr,
+ "ps\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))],
+ itins.rr>, VEX, Sched<[itins.Sched]>;
+ def V#NAME#PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ !strconcat("v", OpcodeStr,
+ "ps\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))],
+ itins.rm>, VEX, Sched<[itins.Sched.Folded]>;
+ def V#NAME#PSYr : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
+ !strconcat("v", OpcodeStr,
+ "ps\t{$src, $dst|$dst, $src}"),
+ [(set VR256:$dst, (v8f32 (OpNode VR256:$src)))],
+ itins.rr>, VEX, VEX_L, Sched<[itins.Sched]>;
+ def V#NAME#PSYm : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
+ !strconcat("v", OpcodeStr,
+ "ps\t{$src, $dst|$dst, $src}"),
+ [(set VR256:$dst, (OpNode (memopv8f32 addr:$src)))],
+ itins.rm>, VEX, VEX_L, Sched<[itins.Sched.Folded]>;
+}
+
+ def PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))], itins.rr>,
+ Sched<[itins.Sched]>;
+ def PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))], itins.rm>,
+ Sched<[itins.Sched.Folded]>;
+}
+
+/// sse1_fp_unop_p_int - SSE1 intrinsics unops in packed forms.
+multiclass sse1_fp_unop_p_int<bits<8> opc, string OpcodeStr,
+ Intrinsic V4F32Int, Intrinsic V8F32Int,
+ OpndItins itins> {
+let Predicates = [HasAVX] in {
+ def V#NAME#PSr_Int : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ !strconcat("v", OpcodeStr,
+ "ps\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (V4F32Int VR128:$src))],
+ itins.rr>, VEX, Sched<[itins.Sched]>;
+ def V#NAME#PSm_Int : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ !strconcat("v", OpcodeStr,
+ "ps\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (V4F32Int (memopv4f32 addr:$src)))],
+ itins.rm>, VEX, Sched<[itins.Sched.Folded]>;
+ def V#NAME#PSYr_Int : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
+ !strconcat("v", OpcodeStr,
+ "ps\t{$src, $dst|$dst, $src}"),
+ [(set VR256:$dst, (V8F32Int VR256:$src))],
+ itins.rr>, VEX, VEX_L, Sched<[itins.Sched]>;
+ def V#NAME#PSYm_Int : PSI<opc, MRMSrcMem, (outs VR256:$dst),
+ (ins f256mem:$src),
+ !strconcat("v", OpcodeStr,
+ "ps\t{$src, $dst|$dst, $src}"),
+ [(set VR256:$dst, (V8F32Int (memopv8f32 addr:$src)))],
+ itins.rm>, VEX, VEX_L, Sched<[itins.Sched.Folded]>;
+}
+
+ def PSr_Int : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (V4F32Int VR128:$src))],
+ itins.rr>, Sched<[itins.Sched]>;
+ def PSm_Int : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (V4F32Int (memopv4f32 addr:$src)))],
+ itins.rm>, Sched<[itins.Sched.Folded]>;
+}
+
+/// sse2_fp_unop_s - SSE2 unops in scalar form.
+multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, Intrinsic F64Int, OpndItins itins> {
+let Predicates = [HasAVX], hasSideEffects = 0 in {
+ def V#NAME#SDr : SDI<opc, MRMSrcReg, (outs FR64:$dst),
+ (ins FR64:$src1, FR64:$src2),
+ !strconcat("v", OpcodeStr,
+ "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, VEX_4V, VEX_LIG, Sched<[itins.Sched]>;
+ let mayLoad = 1 in {
+ def V#NAME#SDm : SDI<opc, MRMSrcMem, (outs FR64:$dst),
+ (ins FR64:$src1,f64mem:$src2),
+ !strconcat("v", OpcodeStr,
+ "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, VEX_4V, VEX_LIG,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ def V#NAME#SDm_Int : SDI<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, sdmem:$src2),
+ !strconcat("v", OpcodeStr,
+ "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, VEX_4V, VEX_LIG,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ }
+}
+
+ def SDr : SDI<opc, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src),
+ !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
+ [(set FR64:$dst, (OpNode FR64:$src))], itins.rr>,
+ Sched<[itins.Sched]>;
+ // See the comments in sse1_fp_unop_s for why this is OptForSize.
+ def SDm : I<opc, MRMSrcMem, (outs FR64:$dst), (ins f64mem:$src),
+ !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
+ [(set FR64:$dst, (OpNode (load addr:$src)))], itins.rm>, XD,
+ Requires<[UseSSE2, OptForSize]>, Sched<[itins.Sched.Folded]>;
+ def SDr_Int : SDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (F64Int VR128:$src))], itins.rr>,
+ Sched<[itins.Sched]>;
+ def SDm_Int : SDI<opc, MRMSrcMem, (outs VR128:$dst), (ins sdmem:$src),
+ !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (F64Int sse_load_f64:$src))], itins.rm>,
+ Sched<[itins.Sched.Folded]>;
+}
+
+/// sse2_fp_unop_p - SSE2 unops in vector forms.
+multiclass sse2_fp_unop_p<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, OpndItins itins> {
+let Predicates = [HasAVX] in {
+ def V#NAME#PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ !strconcat("v", OpcodeStr,
+ "pd\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))],
+ itins.rr>, VEX, Sched<[itins.Sched]>;
+ def V#NAME#PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ !strconcat("v", OpcodeStr,
+ "pd\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))],
+ itins.rm>, VEX, Sched<[itins.Sched.Folded]>;
+ def V#NAME#PDYr : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
+ !strconcat("v", OpcodeStr,
+ "pd\t{$src, $dst|$dst, $src}"),
+ [(set VR256:$dst, (v4f64 (OpNode VR256:$src)))],
+ itins.rr>, VEX, VEX_L, Sched<[itins.Sched]>;
+ def V#NAME#PDYm : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
+ !strconcat("v", OpcodeStr,
+ "pd\t{$src, $dst|$dst, $src}"),
+ [(set VR256:$dst, (OpNode (memopv4f64 addr:$src)))],
+ itins.rm>, VEX, VEX_L, Sched<[itins.Sched.Folded]>;
+}
+
+ def PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))], itins.rr>,
+ Sched<[itins.Sched]>;
+ def PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))], itins.rm>,
+ Sched<[itins.Sched.Folded]>;
+}
+
+// Square root.
+defm SQRT : sse1_fp_unop_s<0x51, "sqrt", fsqrt, int_x86_sse_sqrt_ss,
+ SSE_SQRTS>,
+ sse1_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTP>,
+ sse2_fp_unop_s<0x51, "sqrt", fsqrt, int_x86_sse2_sqrt_sd,
+ SSE_SQRTS>,
+ sse2_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTP>;
+
+// Reciprocal approximations. Note that these typically require refinement
+// in order to obtain suitable precision.
+defm RSQRT : sse1_fp_unop_rw<0x52, "rsqrt", X86frsqrt, SSE_SQRTS>,
+ sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_SQRTP>,
+ sse1_fp_unop_p_int<0x52, "rsqrt", int_x86_sse_rsqrt_ps,
+ int_x86_avx_rsqrt_ps_256, SSE_SQRTP>;
+defm RCP : sse1_fp_unop_rw<0x53, "rcp", X86frcp, SSE_RCPS>,
+ sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPP>,
+ sse1_fp_unop_p_int<0x53, "rcp", int_x86_sse_rcp_ps,
+ int_x86_avx_rcp_ps_256, SSE_RCPP>;
+
+def : Pat<(f32 (fsqrt FR32:$src)),
+ (VSQRTSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>;
+def : Pat<(f32 (fsqrt (load addr:$src))),
+ (VSQRTSSm (f32 (IMPLICIT_DEF)), addr:$src)>,
+ Requires<[HasAVX, OptForSize]>;
+def : Pat<(f64 (fsqrt FR64:$src)),
+ (VSQRTSDr (f64 (IMPLICIT_DEF)), FR64:$src)>, Requires<[HasAVX]>;
+def : Pat<(f64 (fsqrt (load addr:$src))),
+ (VSQRTSDm (f64 (IMPLICIT_DEF)), addr:$src)>,
+ Requires<[HasAVX, OptForSize]>;
+
+def : Pat<(f32 (X86frsqrt FR32:$src)),
+ (VRSQRTSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>;
+def : Pat<(f32 (X86frsqrt (load addr:$src))),
+ (VRSQRTSSm (f32 (IMPLICIT_DEF)), addr:$src)>,
+ Requires<[HasAVX, OptForSize]>;
+
+def : Pat<(f32 (X86frcp FR32:$src)),
+ (VRCPSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>;
+def : Pat<(f32 (X86frcp (load addr:$src))),
+ (VRCPSSm (f32 (IMPLICIT_DEF)), addr:$src)>,
+ Requires<[HasAVX, OptForSize]>;
+
+let Predicates = [HasAVX] in {
+ def : Pat<(int_x86_sse_sqrt_ss VR128:$src),
+ (COPY_TO_REGCLASS (VSQRTSSr (f32 (IMPLICIT_DEF)),
+ (COPY_TO_REGCLASS VR128:$src, FR32)),
+ VR128)>;
+ def : Pat<(int_x86_sse_sqrt_ss sse_load_f32:$src),
+ (VSQRTSSm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>;
+
+ def : Pat<(int_x86_sse2_sqrt_sd VR128:$src),
+ (COPY_TO_REGCLASS (VSQRTSDr (f64 (IMPLICIT_DEF)),
+ (COPY_TO_REGCLASS VR128:$src, FR64)),
+ VR128)>;
+ def : Pat<(int_x86_sse2_sqrt_sd sse_load_f64:$src),
+ (VSQRTSDm_Int (v2f64 (IMPLICIT_DEF)), sse_load_f64:$src)>;
+
+ def : Pat<(int_x86_sse_rsqrt_ss VR128:$src),
+ (COPY_TO_REGCLASS (VRSQRTSSr (f32 (IMPLICIT_DEF)),
+ (COPY_TO_REGCLASS VR128:$src, FR32)),
+ VR128)>;
+ def : Pat<(int_x86_sse_rsqrt_ss sse_load_f32:$src),
+ (VRSQRTSSm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>;
+
+ def : Pat<(int_x86_sse_rcp_ss VR128:$src),
+ (COPY_TO_REGCLASS (VRCPSSr (f32 (IMPLICIT_DEF)),
+ (COPY_TO_REGCLASS VR128:$src, FR32)),
+ VR128)>;
+ def : Pat<(int_x86_sse_rcp_ss sse_load_f32:$src),
+ (VRCPSSm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>;
+}
+
+// Reciprocal approximations. Note that these typically require refinement
+// in order to obtain suitable precision.
+let Predicates = [UseSSE1] in {
+ def : Pat<(int_x86_sse_rsqrt_ss VR128:$src),
+ (RSQRTSSr_Int VR128:$src, VR128:$src)>;
+ def : Pat<(int_x86_sse_rcp_ss VR128:$src),
+ (RCPSSr_Int VR128:$src, VR128:$src)>;
+}
+
+// There is no f64 version of the reciprocal approximation instructions.
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Non-temporal stores
+//===----------------------------------------------------------------------===//
+
+let AddedComplexity = 400 in { // Prefer non-temporal versions
+let SchedRW = [WriteStore] in {
+def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs),
+ (ins f128mem:$dst, VR128:$src),
+ "movntps\t{$src, $dst|$dst, $src}",
+ [(alignednontemporalstore (v4f32 VR128:$src),
+ addr:$dst)],
+ IIC_SSE_MOVNT>, VEX;
+def VMOVNTPDmr : VPDI<0x2B, MRMDestMem, (outs),
+ (ins f128mem:$dst, VR128:$src),
+ "movntpd\t{$src, $dst|$dst, $src}",
+ [(alignednontemporalstore (v2f64 VR128:$src),
+ addr:$dst)],
+ IIC_SSE_MOVNT>, VEX;
+
+let ExeDomain = SSEPackedInt in
+def VMOVNTDQmr : VPDI<0xE7, MRMDestMem, (outs),
+ (ins f128mem:$dst, VR128:$src),
+ "movntdq\t{$src, $dst|$dst, $src}",
+ [(alignednontemporalstore (v2i64 VR128:$src),
+ addr:$dst)],
+ IIC_SSE_MOVNT>, VEX;
+
+def VMOVNTPSYmr : VPSI<0x2B, MRMDestMem, (outs),
+ (ins f256mem:$dst, VR256:$src),
+ "movntps\t{$src, $dst|$dst, $src}",
+ [(alignednontemporalstore (v8f32 VR256:$src),
+ addr:$dst)],
+ IIC_SSE_MOVNT>, VEX, VEX_L;
+def VMOVNTPDYmr : VPDI<0x2B, MRMDestMem, (outs),
+ (ins f256mem:$dst, VR256:$src),
+ "movntpd\t{$src, $dst|$dst, $src}",
+ [(alignednontemporalstore (v4f64 VR256:$src),
+ addr:$dst)],
+ IIC_SSE_MOVNT>, VEX, VEX_L;
+let ExeDomain = SSEPackedInt in
+def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs),
+ (ins f256mem:$dst, VR256:$src),
+ "movntdq\t{$src, $dst|$dst, $src}",
+ [(alignednontemporalstore (v4i64 VR256:$src),
+ addr:$dst)],
+ IIC_SSE_MOVNT>, VEX, VEX_L;
+
+def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+ "movntps\t{$src, $dst|$dst, $src}",
+ [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)],
+ IIC_SSE_MOVNT>;
+def MOVNTPDmr : PDI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+ "movntpd\t{$src, $dst|$dst, $src}",
+ [(alignednontemporalstore(v2f64 VR128:$src), addr:$dst)],
+ IIC_SSE_MOVNT>;
+
+let ExeDomain = SSEPackedInt in
+def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+ "movntdq\t{$src, $dst|$dst, $src}",
+ [(alignednontemporalstore (v2i64 VR128:$src), addr:$dst)],
+ IIC_SSE_MOVNT>;
+
+// There is no AVX form for instructions below this point
+def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
+ "movnti{l}\t{$src, $dst|$dst, $src}",
+ [(nontemporalstore (i32 GR32:$src), addr:$dst)],
+ IIC_SSE_MOVNT>,
+ TB, Requires<[HasSSE2]>;
+def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
+ "movnti{q}\t{$src, $dst|$dst, $src}",
+ [(nontemporalstore (i64 GR64:$src), addr:$dst)],
+ IIC_SSE_MOVNT>,
+ TB, Requires<[HasSSE2]>;
+} // SchedRW = [WriteStore]
+
+def : Pat<(alignednontemporalstore (v2i64 VR128:$src), addr:$dst),
+ (VMOVNTDQmr addr:$dst, VR128:$src)>, Requires<[HasAVX]>;
+
+def : Pat<(alignednontemporalstore (v2i64 VR128:$src), addr:$dst),
+ (MOVNTDQmr addr:$dst, VR128:$src)>, Requires<[UseSSE2]>;
+} // AddedComplexity
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Prefetch and memory fence
+//===----------------------------------------------------------------------===//
+
+// Prefetch intrinsic.
+let Predicates = [HasSSE1], SchedRW = [WriteLoad] in {
+def PREFETCHT0 : I<0x18, MRM1m, (outs), (ins i8mem:$src),
+ "prefetcht0\t$src", [(prefetch addr:$src, imm, (i32 3), (i32 1))],
+ IIC_SSE_PREFETCH>, TB;
+def PREFETCHT1 : I<0x18, MRM2m, (outs), (ins i8mem:$src),
+ "prefetcht1\t$src", [(prefetch addr:$src, imm, (i32 2), (i32 1))],
+ IIC_SSE_PREFETCH>, TB;
+def PREFETCHT2 : I<0x18, MRM3m, (outs), (ins i8mem:$src),
+ "prefetcht2\t$src", [(prefetch addr:$src, imm, (i32 1), (i32 1))],
+ IIC_SSE_PREFETCH>, TB;
+def PREFETCHNTA : I<0x18, MRM0m, (outs), (ins i8mem:$src),
+ "prefetchnta\t$src", [(prefetch addr:$src, imm, (i32 0), (i32 1))],
+ IIC_SSE_PREFETCH>, TB;
+}
+
+// FIXME: How should these memory instructions be modeled?
+let SchedRW = [WriteLoad] in {
+// Flush cache
+def CLFLUSH : I<0xAE, MRM7m, (outs), (ins i8mem:$src),
+ "clflush\t$src", [(int_x86_sse2_clflush addr:$src)],
+ IIC_SSE_PREFETCH>, TB, Requires<[HasSSE2]>;
+
+// Pause. This "instruction" is encoded as "rep; nop", so even though it
+// was introduced with SSE2, it's backward compatible.
+def PAUSE : I<0x90, RawFrm, (outs), (ins), "pause", [], IIC_SSE_PAUSE>, REP;
+
+// Load, store, and memory fence
+def SFENCE : I<0xAE, MRM_F8, (outs), (ins),
+ "sfence", [(int_x86_sse_sfence)], IIC_SSE_SFENCE>,
+ TB, Requires<[HasSSE1]>;
+def LFENCE : I<0xAE, MRM_E8, (outs), (ins),
+ "lfence", [(int_x86_sse2_lfence)], IIC_SSE_LFENCE>,
+ TB, Requires<[HasSSE2]>;
+def MFENCE : I<0xAE, MRM_F0, (outs), (ins),
+ "mfence", [(int_x86_sse2_mfence)], IIC_SSE_MFENCE>,
+ TB, Requires<[HasSSE2]>;
+} // SchedRW
+
+def : Pat<(X86SFence), (SFENCE)>;
+def : Pat<(X86LFence), (LFENCE)>;
+def : Pat<(X86MFence), (MFENCE)>;
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Load/Store XCSR register
+//===----------------------------------------------------------------------===//
+
+def VLDMXCSR : VPSI<0xAE, MRM2m, (outs), (ins i32mem:$src),
+ "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)],
+ IIC_SSE_LDMXCSR>, VEX, Sched<[WriteLoad]>;
+def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst),
+ "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)],
+ IIC_SSE_STMXCSR>, VEX, Sched<[WriteStore]>;
+
+def LDMXCSR : PSI<0xAE, MRM2m, (outs), (ins i32mem:$src),
+ "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)],
+ IIC_SSE_LDMXCSR>, Sched<[WriteLoad]>;
+def STMXCSR : PSI<0xAE, MRM3m, (outs), (ins i32mem:$dst),
+ "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)],
+ IIC_SSE_STMXCSR>, Sched<[WriteStore]>;
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Move Aligned/Unaligned Packed Integer Instructions
+//===---------------------------------------------------------------------===//
+
+let ExeDomain = SSEPackedInt in { // SSE integer instructions
+
+let neverHasSideEffects = 1, SchedRW = [WriteMove] in {
+def VMOVDQArr : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>,
+ VEX;
+def VMOVDQAYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
+ "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>,
+ VEX, VEX_L;
+def VMOVDQUrr : VSSI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "movdqu\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVU_P_RR>,
+ VEX;
+def VMOVDQUYrr : VSSI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
+ "movdqu\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVU_P_RR>,
+ VEX, VEX_L;
+}
+
+// For Disassembler
+let isCodeGenOnly = 1, hasSideEffects = 0, SchedRW = [WriteMove] in {
+def VMOVDQArr_REV : VPDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+ "movdqa\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_MOVA_P_RR>,
+ VEX;
+def VMOVDQAYrr_REV : VPDI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
+ "movdqa\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_MOVA_P_RR>, VEX, VEX_L;
+def VMOVDQUrr_REV : VSSI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+ "movdqu\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_MOVU_P_RR>,
+ VEX;
+def VMOVDQUYrr_REV : VSSI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
+ "movdqu\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_MOVU_P_RR>, VEX, VEX_L;
+}
+
+let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
+ neverHasSideEffects = 1, SchedRW = [WriteLoad] in {
+def VMOVDQArm : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+ "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RM>,
+ VEX;
+def VMOVDQAYrm : VPDI<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
+ "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RM>,
+ VEX, VEX_L;
+let Predicates = [HasAVX] in {
+ def VMOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+ "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_RM>,
+ XS, VEX;
+ def VMOVDQUYrm : I<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
+ "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_RM>,
+ XS, VEX, VEX_L;
+}
+}
+
+let mayStore = 1, neverHasSideEffects = 1, SchedRW = [WriteStore] in {
+def VMOVDQAmr : VPDI<0x7F, MRMDestMem, (outs),
+ (ins i128mem:$dst, VR128:$src),
+ "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_MR>,
+ VEX;
+def VMOVDQAYmr : VPDI<0x7F, MRMDestMem, (outs),
+ (ins i256mem:$dst, VR256:$src),
+ "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_MR>,
+ VEX, VEX_L;
+let Predicates = [HasAVX] in {
+def VMOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
+ "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_MR>,
+ XS, VEX;
+def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src),
+ "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_MR>,
+ XS, VEX, VEX_L;
+}
+}
+
+let SchedRW = [WriteMove] in {
+let neverHasSideEffects = 1 in
+def MOVDQArr : PDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>;
+
+def MOVDQUrr : I<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "movdqu\t{$src, $dst|$dst, $src}",
+ [], IIC_SSE_MOVU_P_RR>, XS, Requires<[UseSSE2]>;
+
+// For Disassembler
+let isCodeGenOnly = 1, hasSideEffects = 0 in {
+def MOVDQArr_REV : PDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+ "movdqa\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_MOVA_P_RR>;
+
+def MOVDQUrr_REV : I<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+ "movdqu\t{$src, $dst|$dst, $src}",
+ [], IIC_SSE_MOVU_P_RR>, XS, Requires<[UseSSE2]>;
+}
+} // SchedRW
+
+let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
+ neverHasSideEffects = 1, SchedRW = [WriteLoad] in {
+def MOVDQArm : PDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+ "movdqa\t{$src, $dst|$dst, $src}",
+ [/*(set VR128:$dst, (alignedloadv2i64 addr:$src))*/],
+ IIC_SSE_MOVA_P_RM>;
+def MOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+ "movdqu\t{$src, $dst|$dst, $src}",
+ [/*(set VR128:$dst, (loadv2i64 addr:$src))*/],
+ IIC_SSE_MOVU_P_RM>,
+ XS, Requires<[UseSSE2]>;
+}
+
+let mayStore = 1, SchedRW = [WriteStore] in {
+def MOVDQAmr : PDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
+ "movdqa\t{$src, $dst|$dst, $src}",
+ [/*(alignedstore (v2i64 VR128:$src), addr:$dst)*/],
+ IIC_SSE_MOVA_P_MR>;
+def MOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
+ "movdqu\t{$src, $dst|$dst, $src}",
+ [/*(store (v2i64 VR128:$src), addr:$dst)*/],
+ IIC_SSE_MOVU_P_MR>,
+ XS, Requires<[UseSSE2]>;
+}
+
+} // ExeDomain = SSEPackedInt
+
+let Predicates = [HasAVX] in {
+ def : Pat<(int_x86_sse2_storeu_dq addr:$dst, VR128:$src),
+ (VMOVDQUmr addr:$dst, VR128:$src)>;
+ def : Pat<(int_x86_avx_storeu_dq_256 addr:$dst, VR256:$src),
+ (VMOVDQUYmr addr:$dst, VR256:$src)>;
+}
+let Predicates = [UseSSE2] in
+def : Pat<(int_x86_sse2_storeu_dq addr:$dst, VR128:$src),
+ (MOVDQUmr addr:$dst, VR128:$src)>;
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Packed Integer Arithmetic Instructions
+//===---------------------------------------------------------------------===//
+
+let Sched = WriteVecIMul in
+def SSE_PMADD : OpndItins<
+ IIC_SSE_PMADD, IIC_SSE_PMADD
+>;
+
+let ExeDomain = SSEPackedInt in { // SSE integer instructions
+
+multiclass PDI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId,
+ RegisterClass RC, PatFrag memop_frag,
+ X86MemOperand x86memop,
+ OpndItins itins,
+ bit IsCommutable = 0,
+ bit Is2Addr = 1> {
+ let isCommutable = IsCommutable in
+ def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (IntId RC:$src1, RC:$src2))], itins.rr>,
+ Sched<[itins.Sched]>;
+ def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (IntId RC:$src1, (bitconvert (memop_frag addr:$src2))))],
+ itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+
+multiclass PDI_binop_all_int<bits<8> opc, string OpcodeStr, Intrinsic IntId128,
+ Intrinsic IntId256, OpndItins itins,
+ bit IsCommutable = 0> {
+let Predicates = [HasAVX] in
+ defm V#NAME : PDI_binop_rm_int<opc, !strconcat("v", OpcodeStr), IntId128,
+ VR128, memopv2i64, i128mem, itins,
+ IsCommutable, 0>, VEX_4V;
+
+let Constraints = "$src1 = $dst" in
+ defm NAME : PDI_binop_rm_int<opc, OpcodeStr, IntId128, VR128, memopv2i64,
+ i128mem, itins, IsCommutable, 1>;
+
+let Predicates = [HasAVX2] in
+ defm V#NAME#Y : PDI_binop_rm_int<opc, !strconcat("v", OpcodeStr), IntId256,
+ VR256, memopv4i64, i256mem, itins,
+ IsCommutable, 0>, VEX_4V, VEX_L;
+}
+
+multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm,
+ string OpcodeStr, SDNode OpNode,
+ SDNode OpNode2, RegisterClass RC,
+ ValueType DstVT, ValueType SrcVT, PatFrag bc_frag,
+ ShiftOpndItins itins,
+ bit Is2Addr = 1> {
+ // src2 is always 128-bit
+ def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, VR128:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (DstVT (OpNode RC:$src1, (SrcVT VR128:$src2))))],
+ itins.rr>, Sched<[WriteVecShift]>;
+ def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, i128mem:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (DstVT (OpNode RC:$src1,
+ (bc_frag (memopv2i64 addr:$src2)))))], itins.rm>,
+ Sched<[WriteVecShiftLd, ReadAfterLd]>;
+ def ri : PDIi8<opc2, ImmForm, (outs RC:$dst),
+ (ins RC:$src1, i32i8imm:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i32 imm:$src2))))], itins.ri>,
+ Sched<[WriteVecShift]>;
+}
+
+/// PDI_binop_rm2 - Simple SSE2 binary operator with different src and dst types
+multiclass PDI_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ ValueType DstVT, ValueType SrcVT, RegisterClass RC,
+ PatFrag memop_frag, X86MemOperand x86memop,
+ OpndItins itins,
+ bit IsCommutable = 0, bit Is2Addr = 1> {
+ let isCommutable = IsCommutable in
+ def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))]>,
+ Sched<[itins.Sched]>;
+ def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1),
+ (bitconvert (memop_frag addr:$src2)))))]>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+} // ExeDomain = SSEPackedInt
+
+defm PADDB : PDI_binop_all<0xFC, "paddb", add, v16i8, v32i8,
+ SSE_INTALU_ITINS_P, 1>;
+defm PADDW : PDI_binop_all<0xFD, "paddw", add, v8i16, v16i16,
+ SSE_INTALU_ITINS_P, 1>;
+defm PADDD : PDI_binop_all<0xFE, "paddd", add, v4i32, v8i32,
+ SSE_INTALU_ITINS_P, 1>;
+defm PADDQ : PDI_binop_all<0xD4, "paddq", add, v2i64, v4i64,
+ SSE_INTALUQ_ITINS_P, 1>;
+defm PMULLW : PDI_binop_all<0xD5, "pmullw", mul, v8i16, v16i16,
+ SSE_INTMUL_ITINS_P, 1>;
+defm PSUBB : PDI_binop_all<0xF8, "psubb", sub, v16i8, v32i8,
+ SSE_INTALU_ITINS_P, 0>;
+defm PSUBW : PDI_binop_all<0xF9, "psubw", sub, v8i16, v16i16,
+ SSE_INTALU_ITINS_P, 0>;
+defm PSUBD : PDI_binop_all<0xFA, "psubd", sub, v4i32, v8i32,
+ SSE_INTALU_ITINS_P, 0>;
+defm PSUBQ : PDI_binop_all<0xFB, "psubq", sub, v2i64, v4i64,
+ SSE_INTALUQ_ITINS_P, 0>;
+defm PSUBUSB : PDI_binop_all<0xD8, "psubusb", X86subus, v16i8, v32i8,
+ SSE_INTALU_ITINS_P, 0>;
+defm PSUBUSW : PDI_binop_all<0xD9, "psubusw", X86subus, v8i16, v16i16,
+ SSE_INTALU_ITINS_P, 0>;
+defm PMINUB : PDI_binop_all<0xDA, "pminub", X86umin, v16i8, v32i8,
+ SSE_INTALU_ITINS_P, 1>;
+defm PMINSW : PDI_binop_all<0xEA, "pminsw", X86smin, v8i16, v16i16,
+ SSE_INTALU_ITINS_P, 1>;
+defm PMAXUB : PDI_binop_all<0xDE, "pmaxub", X86umax, v16i8, v32i8,
+ SSE_INTALU_ITINS_P, 1>;
+defm PMAXSW : PDI_binop_all<0xEE, "pmaxsw", X86smax, v8i16, v16i16,
+ SSE_INTALU_ITINS_P, 1>;
+
+// Intrinsic forms
+defm PSUBSB : PDI_binop_all_int<0xE8, "psubsb", int_x86_sse2_psubs_b,
+ int_x86_avx2_psubs_b, SSE_INTALU_ITINS_P, 0>;
+defm PSUBSW : PDI_binop_all_int<0xE9, "psubsw" , int_x86_sse2_psubs_w,
+ int_x86_avx2_psubs_w, SSE_INTALU_ITINS_P, 0>;
+defm PADDSB : PDI_binop_all_int<0xEC, "paddsb" , int_x86_sse2_padds_b,
+ int_x86_avx2_padds_b, SSE_INTALU_ITINS_P, 1>;
+defm PADDSW : PDI_binop_all_int<0xED, "paddsw" , int_x86_sse2_padds_w,
+ int_x86_avx2_padds_w, SSE_INTALU_ITINS_P, 1>;
+defm PADDUSB : PDI_binop_all_int<0xDC, "paddusb", int_x86_sse2_paddus_b,
+ int_x86_avx2_paddus_b, SSE_INTALU_ITINS_P, 1>;
+defm PADDUSW : PDI_binop_all_int<0xDD, "paddusw", int_x86_sse2_paddus_w,
+ int_x86_avx2_paddus_w, SSE_INTALU_ITINS_P, 1>;
+defm PMULHUW : PDI_binop_all_int<0xE4, "pmulhuw", int_x86_sse2_pmulhu_w,
+ int_x86_avx2_pmulhu_w, SSE_INTMUL_ITINS_P, 1>;
+defm PMULHW : PDI_binop_all_int<0xE5, "pmulhw" , int_x86_sse2_pmulh_w,
+ int_x86_avx2_pmulh_w, SSE_INTMUL_ITINS_P, 1>;
+defm PMADDWD : PDI_binop_all_int<0xF5, "pmaddwd", int_x86_sse2_pmadd_wd,
+ int_x86_avx2_pmadd_wd, SSE_PMADD, 1>;
+defm PAVGB : PDI_binop_all_int<0xE0, "pavgb", int_x86_sse2_pavg_b,
+ int_x86_avx2_pavg_b, SSE_INTALU_ITINS_P, 1>;
+defm PAVGW : PDI_binop_all_int<0xE3, "pavgw", int_x86_sse2_pavg_w,
+ int_x86_avx2_pavg_w, SSE_INTALU_ITINS_P, 1>;
+defm PSADBW : PDI_binop_all_int<0xF6, "psadbw", int_x86_sse2_psad_bw,
+ int_x86_avx2_psad_bw, SSE_INTALU_ITINS_P, 1>;
+
+let Predicates = [HasAVX] in
+defm VPMULUDQ : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v2i64, v4i32, VR128,
+ memopv2i64, i128mem, SSE_INTMUL_ITINS_P, 1, 0>,
+ VEX_4V;
+let Predicates = [HasAVX2] in
+defm VPMULUDQY : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v4i64, v8i32,
+ VR256, memopv4i64, i256mem,
+ SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_L;
+let Constraints = "$src1 = $dst" in
+defm PMULUDQ : PDI_binop_rm2<0xF4, "pmuludq", X86pmuludq, v2i64, v4i32, VR128,
+ memopv2i64, i128mem, SSE_INTMUL_ITINS_P, 1>;
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Packed Integer Logical Instructions
+//===---------------------------------------------------------------------===//
+
+let Predicates = [HasAVX] in {
+defm VPSLLW : PDI_binop_rmi<0xF1, 0x71, MRM6r, "vpsllw", X86vshl, X86vshli,
+ VR128, v8i16, v8i16, bc_v8i16,
+ SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
+defm VPSLLD : PDI_binop_rmi<0xF2, 0x72, MRM6r, "vpslld", X86vshl, X86vshli,
+ VR128, v4i32, v4i32, bc_v4i32,
+ SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
+defm VPSLLQ : PDI_binop_rmi<0xF3, 0x73, MRM6r, "vpsllq", X86vshl, X86vshli,
+ VR128, v2i64, v2i64, bc_v2i64,
+ SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
+
+defm VPSRLW : PDI_binop_rmi<0xD1, 0x71, MRM2r, "vpsrlw", X86vsrl, X86vsrli,
+ VR128, v8i16, v8i16, bc_v8i16,
+ SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
+defm VPSRLD : PDI_binop_rmi<0xD2, 0x72, MRM2r, "vpsrld", X86vsrl, X86vsrli,
+ VR128, v4i32, v4i32, bc_v4i32,
+ SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
+defm VPSRLQ : PDI_binop_rmi<0xD3, 0x73, MRM2r, "vpsrlq", X86vsrl, X86vsrli,
+ VR128, v2i64, v2i64, bc_v2i64,
+ SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
+
+defm VPSRAW : PDI_binop_rmi<0xE1, 0x71, MRM4r, "vpsraw", X86vsra, X86vsrai,
+ VR128, v8i16, v8i16, bc_v8i16,
+ SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
+defm VPSRAD : PDI_binop_rmi<0xE2, 0x72, MRM4r, "vpsrad", X86vsra, X86vsrai,
+ VR128, v4i32, v4i32, bc_v4i32,
+ SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
+
+let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift] in {
+ // 128-bit logical shifts.
+ def VPSLLDQri : PDIi8<0x73, MRM7r,
+ (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
+ "vpslldq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst,
+ (int_x86_sse2_psll_dq_bs VR128:$src1, imm:$src2))]>,
+ VEX_4V;
+ def VPSRLDQri : PDIi8<0x73, MRM3r,
+ (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
+ "vpsrldq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst,
+ (int_x86_sse2_psrl_dq_bs VR128:$src1, imm:$src2))]>,
+ VEX_4V;
+ // PSRADQri doesn't exist in SSE[1-3].
+}
+} // Predicates = [HasAVX]
+
+let Predicates = [HasAVX2] in {
+defm VPSLLWY : PDI_binop_rmi<0xF1, 0x71, MRM6r, "vpsllw", X86vshl, X86vshli,
+ VR256, v16i16, v8i16, bc_v8i16,
+ SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
+defm VPSLLDY : PDI_binop_rmi<0xF2, 0x72, MRM6r, "vpslld", X86vshl, X86vshli,
+ VR256, v8i32, v4i32, bc_v4i32,
+ SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
+defm VPSLLQY : PDI_binop_rmi<0xF3, 0x73, MRM6r, "vpsllq", X86vshl, X86vshli,
+ VR256, v4i64, v2i64, bc_v2i64,
+ SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
+
+defm VPSRLWY : PDI_binop_rmi<0xD1, 0x71, MRM2r, "vpsrlw", X86vsrl, X86vsrli,
+ VR256, v16i16, v8i16, bc_v8i16,
+ SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
+defm VPSRLDY : PDI_binop_rmi<0xD2, 0x72, MRM2r, "vpsrld", X86vsrl, X86vsrli,
+ VR256, v8i32, v4i32, bc_v4i32,
+ SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
+defm VPSRLQY : PDI_binop_rmi<0xD3, 0x73, MRM2r, "vpsrlq", X86vsrl, X86vsrli,
+ VR256, v4i64, v2i64, bc_v2i64,
+ SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
+
+defm VPSRAWY : PDI_binop_rmi<0xE1, 0x71, MRM4r, "vpsraw", X86vsra, X86vsrai,
+ VR256, v16i16, v8i16, bc_v8i16,
+ SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
+defm VPSRADY : PDI_binop_rmi<0xE2, 0x72, MRM4r, "vpsrad", X86vsra, X86vsrai,
+ VR256, v8i32, v4i32, bc_v4i32,
+ SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
+
+let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift] in {
+ // 256-bit logical shifts.
+ def VPSLLDQYri : PDIi8<0x73, MRM7r,
+ (outs VR256:$dst), (ins VR256:$src1, i32i8imm:$src2),
+ "vpslldq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR256:$dst,
+ (int_x86_avx2_psll_dq_bs VR256:$src1, imm:$src2))]>,
+ VEX_4V, VEX_L;
+ def VPSRLDQYri : PDIi8<0x73, MRM3r,
+ (outs VR256:$dst), (ins VR256:$src1, i32i8imm:$src2),
+ "vpsrldq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR256:$dst,
+ (int_x86_avx2_psrl_dq_bs VR256:$src1, imm:$src2))]>,
+ VEX_4V, VEX_L;
+ // PSRADQYri doesn't exist in SSE[1-3].
+}
+} // Predicates = [HasAVX2]
+
+let Constraints = "$src1 = $dst" in {
+defm PSLLW : PDI_binop_rmi<0xF1, 0x71, MRM6r, "psllw", X86vshl, X86vshli,
+ VR128, v8i16, v8i16, bc_v8i16,
+ SSE_INTSHIFT_ITINS_P>;
+defm PSLLD : PDI_binop_rmi<0xF2, 0x72, MRM6r, "pslld", X86vshl, X86vshli,
+ VR128, v4i32, v4i32, bc_v4i32,
+ SSE_INTSHIFT_ITINS_P>;
+defm PSLLQ : PDI_binop_rmi<0xF3, 0x73, MRM6r, "psllq", X86vshl, X86vshli,
+ VR128, v2i64, v2i64, bc_v2i64,
+ SSE_INTSHIFT_ITINS_P>;
+
+defm PSRLW : PDI_binop_rmi<0xD1, 0x71, MRM2r, "psrlw", X86vsrl, X86vsrli,
+ VR128, v8i16, v8i16, bc_v8i16,
+ SSE_INTSHIFT_ITINS_P>;
+defm PSRLD : PDI_binop_rmi<0xD2, 0x72, MRM2r, "psrld", X86vsrl, X86vsrli,
+ VR128, v4i32, v4i32, bc_v4i32,
+ SSE_INTSHIFT_ITINS_P>;
+defm PSRLQ : PDI_binop_rmi<0xD3, 0x73, MRM2r, "psrlq", X86vsrl, X86vsrli,
+ VR128, v2i64, v2i64, bc_v2i64,
+ SSE_INTSHIFT_ITINS_P>;
+
+defm PSRAW : PDI_binop_rmi<0xE1, 0x71, MRM4r, "psraw", X86vsra, X86vsrai,
+ VR128, v8i16, v8i16, bc_v8i16,
+ SSE_INTSHIFT_ITINS_P>;
+defm PSRAD : PDI_binop_rmi<0xE2, 0x72, MRM4r, "psrad", X86vsra, X86vsrai,
+ VR128, v4i32, v4i32, bc_v4i32,
+ SSE_INTSHIFT_ITINS_P>;
+
+let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift] in {
+ // 128-bit logical shifts.
+ def PSLLDQri : PDIi8<0x73, MRM7r,
+ (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
+ "pslldq\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (int_x86_sse2_psll_dq_bs VR128:$src1, imm:$src2))]>;
+ def PSRLDQri : PDIi8<0x73, MRM3r,
+ (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
+ "psrldq\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (int_x86_sse2_psrl_dq_bs VR128:$src1, imm:$src2))]>;
+ // PSRADQri doesn't exist in SSE[1-3].
+}
+} // Constraints = "$src1 = $dst"
+
+let Predicates = [HasAVX] in {
+ def : Pat<(int_x86_sse2_psll_dq VR128:$src1, imm:$src2),
+ (VPSLLDQri VR128:$src1, (BYTE_imm imm:$src2))>;
+ def : Pat<(int_x86_sse2_psrl_dq VR128:$src1, imm:$src2),
+ (VPSRLDQri VR128:$src1, (BYTE_imm imm:$src2))>;
+ def : Pat<(v2f64 (X86fsrl VR128:$src1, i32immSExt8:$src2)),
+ (VPSRLDQri VR128:$src1, (BYTE_imm imm:$src2))>;
+
+ // Shift up / down and insert zero's.
+ def : Pat<(v2i64 (X86vshldq VR128:$src, (i8 imm:$amt))),
+ (VPSLLDQri VR128:$src, (BYTE_imm imm:$amt))>;
+ def : Pat<(v2i64 (X86vshrdq VR128:$src, (i8 imm:$amt))),
+ (VPSRLDQri VR128:$src, (BYTE_imm imm:$amt))>;
+}
+
+let Predicates = [HasAVX2] in {
+ def : Pat<(int_x86_avx2_psll_dq VR256:$src1, imm:$src2),
+ (VPSLLDQYri VR256:$src1, (BYTE_imm imm:$src2))>;
+ def : Pat<(int_x86_avx2_psrl_dq VR256:$src1, imm:$src2),
+ (VPSRLDQYri VR256:$src1, (BYTE_imm imm:$src2))>;
+}
+
+let Predicates = [UseSSE2] in {
+ def : Pat<(int_x86_sse2_psll_dq VR128:$src1, imm:$src2),
+ (PSLLDQri VR128:$src1, (BYTE_imm imm:$src2))>;
+ def : Pat<(int_x86_sse2_psrl_dq VR128:$src1, imm:$src2),
+ (PSRLDQri VR128:$src1, (BYTE_imm imm:$src2))>;
+ def : Pat<(v2f64 (X86fsrl VR128:$src1, i32immSExt8:$src2)),
+ (PSRLDQri VR128:$src1, (BYTE_imm imm:$src2))>;
+
+ // Shift up / down and insert zero's.
+ def : Pat<(v2i64 (X86vshldq VR128:$src, (i8 imm:$amt))),
+ (PSLLDQri VR128:$src, (BYTE_imm imm:$amt))>;
+ def : Pat<(v2i64 (X86vshrdq VR128:$src, (i8 imm:$amt))),
+ (PSRLDQri VR128:$src, (BYTE_imm imm:$amt))>;
+}
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Packed Integer Comparison Instructions
+//===---------------------------------------------------------------------===//
+
+defm PCMPEQB : PDI_binop_all<0x74, "pcmpeqb", X86pcmpeq, v16i8, v32i8,
+ SSE_INTALU_ITINS_P, 1>;
+defm PCMPEQW : PDI_binop_all<0x75, "pcmpeqw", X86pcmpeq, v8i16, v16i16,
+ SSE_INTALU_ITINS_P, 1>;
+defm PCMPEQD : PDI_binop_all<0x76, "pcmpeqd", X86pcmpeq, v4i32, v8i32,
+ SSE_INTALU_ITINS_P, 1>;
+defm PCMPGTB : PDI_binop_all<0x64, "pcmpgtb", X86pcmpgt, v16i8, v32i8,
+ SSE_INTALU_ITINS_P, 0>;
+defm PCMPGTW : PDI_binop_all<0x65, "pcmpgtw", X86pcmpgt, v8i16, v16i16,
+ SSE_INTALU_ITINS_P, 0>;
+defm PCMPGTD : PDI_binop_all<0x66, "pcmpgtd", X86pcmpgt, v4i32, v8i32,
+ SSE_INTALU_ITINS_P, 0>;
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Packed Integer Pack Instructions
+//===---------------------------------------------------------------------===//
+
+defm PACKSSWB : PDI_binop_all_int<0x63, "packsswb", int_x86_sse2_packsswb_128,
+ int_x86_avx2_packsswb, SSE_INTALU_ITINS_P, 0>;
+defm PACKSSDW : PDI_binop_all_int<0x6B, "packssdw", int_x86_sse2_packssdw_128,
+ int_x86_avx2_packssdw, SSE_INTALU_ITINS_P, 0>;
+defm PACKUSWB : PDI_binop_all_int<0x67, "packuswb", int_x86_sse2_packuswb_128,
+ int_x86_avx2_packuswb, SSE_INTALU_ITINS_P, 0>;
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Packed Integer Shuffle Instructions
+//===---------------------------------------------------------------------===//
+
+let ExeDomain = SSEPackedInt in {
+multiclass sse2_pshuffle<string OpcodeStr, ValueType vt128, ValueType vt256,
+ SDNode OpNode> {
+let Predicates = [HasAVX] in {
+ def V#NAME#ri : Ii8<0x70, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, i8imm:$src2),
+ !strconcat("v", OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst,
+ (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))],
+ IIC_SSE_PSHUF>, VEX, Sched<[WriteShuffle]>;
+ def V#NAME#mi : Ii8<0x70, MRMSrcMem, (outs VR128:$dst),
+ (ins i128mem:$src1, i8imm:$src2),
+ !strconcat("v", OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst,
+ (vt128 (OpNode (bitconvert (memopv2i64 addr:$src1)),
+ (i8 imm:$src2))))], IIC_SSE_PSHUF>, VEX,
+ Sched<[WriteShuffleLd]>;
+}
+
+let Predicates = [HasAVX2] in {
+ def V#NAME#Yri : Ii8<0x70, MRMSrcReg, (outs VR256:$dst),
+ (ins VR256:$src1, i8imm:$src2),
+ !strconcat("v", OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst,
+ (vt256 (OpNode VR256:$src1, (i8 imm:$src2))))],
+ IIC_SSE_PSHUF>, VEX, VEX_L, Sched<[WriteShuffle]>;
+ def V#NAME#Ymi : Ii8<0x70, MRMSrcMem, (outs VR256:$dst),
+ (ins i256mem:$src1, i8imm:$src2),
+ !strconcat("v", OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst,
+ (vt256 (OpNode (bitconvert (memopv4i64 addr:$src1)),
+ (i8 imm:$src2))))], IIC_SSE_PSHUF>, VEX, VEX_L,
+ Sched<[WriteShuffleLd]>;
+}
+
+let Predicates = [UseSSE2] in {
+ def ri : Ii8<0x70, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, i8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst,
+ (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))],
+ IIC_SSE_PSHUF>, Sched<[WriteShuffle]>;
+ def mi : Ii8<0x70, MRMSrcMem,
+ (outs VR128:$dst), (ins i128mem:$src1, i8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst,
+ (vt128 (OpNode (bitconvert (memopv2i64 addr:$src1)),
+ (i8 imm:$src2))))], IIC_SSE_PSHUF>,
+ Sched<[WriteShuffleLd]>;
+}
+}
+} // ExeDomain = SSEPackedInt
+
+defm PSHUFD : sse2_pshuffle<"pshufd", v4i32, v8i32, X86PShufd>, TB, OpSize;
+defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, v16i16, X86PShufhw>, XS;
+defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, v16i16, X86PShuflw>, XD;
+
+let Predicates = [HasAVX] in {
+ def : Pat<(v4f32 (X86PShufd (memopv4f32 addr:$src1), (i8 imm:$imm))),
+ (VPSHUFDmi addr:$src1, imm:$imm)>;
+ def : Pat<(v4f32 (X86PShufd VR128:$src1, (i8 imm:$imm))),
+ (VPSHUFDri VR128:$src1, imm:$imm)>;
+}
+
+let Predicates = [UseSSE2] in {
+ def : Pat<(v4f32 (X86PShufd (memopv4f32 addr:$src1), (i8 imm:$imm))),
+ (PSHUFDmi addr:$src1, imm:$imm)>;
+ def : Pat<(v4f32 (X86PShufd VR128:$src1, (i8 imm:$imm))),
+ (PSHUFDri VR128:$src1, imm:$imm)>;
+}
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Packed Integer Unpack Instructions
+//===---------------------------------------------------------------------===//
+
+let ExeDomain = SSEPackedInt in {
+multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt,
+ SDNode OpNode, PatFrag bc_frag, bit Is2Addr = 1> {
+ def rr : PDI<opc, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set VR128:$dst, (vt (OpNode VR128:$src1, VR128:$src2)))],
+ IIC_SSE_UNPCK>, Sched<[WriteShuffle]>;
+ def rm : PDI<opc, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set VR128:$dst, (OpNode VR128:$src1,
+ (bc_frag (memopv2i64
+ addr:$src2))))],
+ IIC_SSE_UNPCK>,
+ Sched<[WriteShuffleLd, ReadAfterLd]>;
+}
+
+multiclass sse2_unpack_y<bits<8> opc, string OpcodeStr, ValueType vt,
+ SDNode OpNode, PatFrag bc_frag> {
+ def Yrr : PDI<opc, MRMSrcReg,
+ (outs VR256:$dst), (ins VR256:$src1, VR256:$src2),
+ !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst, (vt (OpNode VR256:$src1, VR256:$src2)))]>,
+ Sched<[WriteShuffle]>;
+ def Yrm : PDI<opc, MRMSrcMem,
+ (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2),
+ !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst, (OpNode VR256:$src1,
+ (bc_frag (memopv4i64 addr:$src2))))]>,
+ Sched<[WriteShuffleLd, ReadAfterLd]>;
+}
+
+let Predicates = [HasAVX] in {
+ defm VPUNPCKLBW : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Unpckl,
+ bc_v16i8, 0>, VEX_4V;
+ defm VPUNPCKLWD : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Unpckl,
+ bc_v8i16, 0>, VEX_4V;
+ defm VPUNPCKLDQ : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl,
+ bc_v4i32, 0>, VEX_4V;
+ defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl,
+ bc_v2i64, 0>, VEX_4V;
+
+ defm VPUNPCKHBW : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Unpckh,
+ bc_v16i8, 0>, VEX_4V;
+ defm VPUNPCKHWD : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Unpckh,
+ bc_v8i16, 0>, VEX_4V;
+ defm VPUNPCKHDQ : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Unpckh,
+ bc_v4i32, 0>, VEX_4V;
+ defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Unpckh,
+ bc_v2i64, 0>, VEX_4V;
+}
+
+let Predicates = [HasAVX2] in {
+ defm VPUNPCKLBW : sse2_unpack_y<0x60, "vpunpcklbw", v32i8, X86Unpckl,
+ bc_v32i8>, VEX_4V, VEX_L;
+ defm VPUNPCKLWD : sse2_unpack_y<0x61, "vpunpcklwd", v16i16, X86Unpckl,
+ bc_v16i16>, VEX_4V, VEX_L;
+ defm VPUNPCKLDQ : sse2_unpack_y<0x62, "vpunpckldq", v8i32, X86Unpckl,
+ bc_v8i32>, VEX_4V, VEX_L;
+ defm VPUNPCKLQDQ : sse2_unpack_y<0x6C, "vpunpcklqdq", v4i64, X86Unpckl,
+ bc_v4i64>, VEX_4V, VEX_L;
+
+ defm VPUNPCKHBW : sse2_unpack_y<0x68, "vpunpckhbw", v32i8, X86Unpckh,
+ bc_v32i8>, VEX_4V, VEX_L;
+ defm VPUNPCKHWD : sse2_unpack_y<0x69, "vpunpckhwd", v16i16, X86Unpckh,
+ bc_v16i16>, VEX_4V, VEX_L;
+ defm VPUNPCKHDQ : sse2_unpack_y<0x6A, "vpunpckhdq", v8i32, X86Unpckh,
+ bc_v8i32>, VEX_4V, VEX_L;
+ defm VPUNPCKHQDQ : sse2_unpack_y<0x6D, "vpunpckhqdq", v4i64, X86Unpckh,
+ bc_v4i64>, VEX_4V, VEX_L;
+}
+
+let Constraints = "$src1 = $dst" in {
+ defm PUNPCKLBW : sse2_unpack<0x60, "punpcklbw", v16i8, X86Unpckl,
+ bc_v16i8>;
+ defm PUNPCKLWD : sse2_unpack<0x61, "punpcklwd", v8i16, X86Unpckl,
+ bc_v8i16>;
+ defm PUNPCKLDQ : sse2_unpack<0x62, "punpckldq", v4i32, X86Unpckl,
+ bc_v4i32>;
+ defm PUNPCKLQDQ : sse2_unpack<0x6C, "punpcklqdq", v2i64, X86Unpckl,
+ bc_v2i64>;
+
+ defm PUNPCKHBW : sse2_unpack<0x68, "punpckhbw", v16i8, X86Unpckh,
+ bc_v16i8>;
+ defm PUNPCKHWD : sse2_unpack<0x69, "punpckhwd", v8i16, X86Unpckh,
+ bc_v8i16>;
+ defm PUNPCKHDQ : sse2_unpack<0x6A, "punpckhdq", v4i32, X86Unpckh,
+ bc_v4i32>;
+ defm PUNPCKHQDQ : sse2_unpack<0x6D, "punpckhqdq", v2i64, X86Unpckh,
+ bc_v2i64>;
+}
+} // ExeDomain = SSEPackedInt
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Packed Integer Extract and Insert
+//===---------------------------------------------------------------------===//
+
+let ExeDomain = SSEPackedInt in {
+multiclass sse2_pinsrw<bit Is2Addr = 1> {
+ def rri : Ii8<0xC4, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1,
+ GR32:$src2, i32i8imm:$src3),
+ !if(Is2Addr,
+ "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR128:$dst,
+ (X86pinsrw VR128:$src1, GR32:$src2, imm:$src3))], IIC_SSE_PINSRW>,
+ Sched<[WriteShuffle]>;
+ def rmi : Ii8<0xC4, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1,
+ i16mem:$src2, i32i8imm:$src3),
+ !if(Is2Addr,
+ "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR128:$dst,
+ (X86pinsrw VR128:$src1, (extloadi16 addr:$src2),
+ imm:$src3))], IIC_SSE_PINSRW>,
+ Sched<[WriteShuffleLd, ReadAfterLd]>;
+}
+
+// Extract
+let Predicates = [HasAVX] in
+def VPEXTRWri : Ii8<0xC5, MRMSrcReg,
+ (outs GR32:$dst), (ins VR128:$src1, i32i8imm:$src2),
+ "vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR32:$dst, (X86pextrw (v8i16 VR128:$src1),
+ imm:$src2))]>, TB, OpSize, VEX,
+ Sched<[WriteShuffle]>;
+def PEXTRWri : PDIi8<0xC5, MRMSrcReg,
+ (outs GR32:$dst), (ins VR128:$src1, i32i8imm:$src2),
+ "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR32:$dst, (X86pextrw (v8i16 VR128:$src1),
+ imm:$src2))], IIC_SSE_PEXTRW>,
+ Sched<[WriteShuffleLd, ReadAfterLd]>;
+
+// Insert
+let Predicates = [HasAVX] in {
+ defm VPINSRW : sse2_pinsrw<0>, TB, OpSize, VEX_4V;
+ def VPINSRWrr64i : Ii8<0xC4, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, GR64:$src2, i32i8imm:$src3),
+ "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ []>, TB, OpSize, VEX_4V, Sched<[WriteShuffle]>;
+}
+
+let Constraints = "$src1 = $dst" in
+ defm PINSRW : sse2_pinsrw, TB, OpSize, Requires<[UseSSE2]>;
+
+} // ExeDomain = SSEPackedInt
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Packed Mask Creation
+//===---------------------------------------------------------------------===//
+
+let ExeDomain = SSEPackedInt, SchedRW = [WriteVecLogic] in {
+
+def VPMOVMSKBrr : VPDI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src),
+ "pmovmskb\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))],
+ IIC_SSE_MOVMSK>, VEX;
+def VPMOVMSKBr64r : VPDI<0xD7, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src),
+ "pmovmskb\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVMSK>, VEX;
+
+let Predicates = [HasAVX2] in {
+def VPMOVMSKBYrr : VPDI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR256:$src),
+ "pmovmskb\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (int_x86_avx2_pmovmskb VR256:$src))]>, VEX, VEX_L;
+def VPMOVMSKBYr64r : VPDI<0xD7, MRMSrcReg, (outs GR64:$dst), (ins VR256:$src),
+ "pmovmskb\t{$src, $dst|$dst, $src}", []>, VEX, VEX_L;
+}
+
+def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src),
+ "pmovmskb\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))],
+ IIC_SSE_MOVMSK>;
+
+} // ExeDomain = SSEPackedInt
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Conditional Store
+//===---------------------------------------------------------------------===//
+
+let ExeDomain = SSEPackedInt, SchedRW = [WriteStore] in {
+
+let Uses = [EDI] in
+def VMASKMOVDQU : VPDI<0xF7, MRMSrcReg, (outs),
+ (ins VR128:$src, VR128:$mask),
+ "maskmovdqu\t{$mask, $src|$src, $mask}",
+ [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)],
+ IIC_SSE_MASKMOV>, VEX;
+let Uses = [RDI] in
+def VMASKMOVDQU64 : VPDI<0xF7, MRMSrcReg, (outs),
+ (ins VR128:$src, VR128:$mask),
+ "maskmovdqu\t{$mask, $src|$src, $mask}",
+ [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)],
+ IIC_SSE_MASKMOV>, VEX;
+
+let Uses = [EDI] in
+def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
+ "maskmovdqu\t{$mask, $src|$src, $mask}",
+ [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)],
+ IIC_SSE_MASKMOV>;
+let Uses = [RDI] in
+def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
+ "maskmovdqu\t{$mask, $src|$src, $mask}",
+ [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)],
+ IIC_SSE_MASKMOV>;
+
+} // ExeDomain = SSEPackedInt
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Move Doubleword
+//===---------------------------------------------------------------------===//
+
+//===---------------------------------------------------------------------===//
+// Move Int Doubleword to Packed Double Int
+//
+def VMOVDI2PDIrr : VPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>,
+ VEX, Sched<[WriteMove]>;
+def VMOVDI2PDIrm : VPDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v4i32 (scalar_to_vector (loadi32 addr:$src))))],
+ IIC_SSE_MOVDQ>,
+ VEX, Sched<[WriteLoad]>;
+def VMOV64toPQIrr : VRPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
+ "mov{d|q}\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v2i64 (scalar_to_vector GR64:$src)))],
+ IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
+def VMOV64toSDrr : VRPDI<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
+ "mov{d|q}\t{$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (bitconvert GR64:$src))],
+ IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
+
+def MOVDI2PDIrr : PDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>,
+ Sched<[WriteMove]>;
+def MOVDI2PDIrm : PDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v4i32 (scalar_to_vector (loadi32 addr:$src))))],
+ IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
+def MOV64toPQIrr : RPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
+ "mov{d|q}\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v2i64 (scalar_to_vector GR64:$src)))],
+ IIC_SSE_MOVDQ>, Sched<[WriteMove]>;
+def MOV64toSDrr : RPDI<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
+ "mov{d|q}\t{$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (bitconvert GR64:$src))],
+ IIC_SSE_MOVDQ>, Sched<[WriteMove]>;
+
+//===---------------------------------------------------------------------===//
+// Move Int Doubleword to Single Scalar
+//
+def VMOVDI2SSrr : VPDI<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (bitconvert GR32:$src))],
+ IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
+
+def VMOVDI2SSrm : VPDI<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))],
+ IIC_SSE_MOVDQ>,
+ VEX, Sched<[WriteLoad]>;
+def MOVDI2SSrr : PDI<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (bitconvert GR32:$src))],
+ IIC_SSE_MOVDQ>, Sched<[WriteMove]>;
+
+def MOVDI2SSrm : PDI<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))],
+ IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
+
+//===---------------------------------------------------------------------===//
+// Move Packed Doubleword Int to Packed Double Int
+//
+def VMOVPDI2DIrr : VPDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (vector_extract (v4i32 VR128:$src),
+ (iPTR 0)))], IIC_SSE_MOVD_ToGP>, VEX,
+ Sched<[WriteMove]>;
+def VMOVPDI2DImr : VPDI<0x7E, MRMDestMem, (outs),
+ (ins i32mem:$dst, VR128:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(store (i32 (vector_extract (v4i32 VR128:$src),
+ (iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>,
+ VEX, Sched<[WriteLoad]>;
+def MOVPDI2DIrr : PDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (vector_extract (v4i32 VR128:$src),
+ (iPTR 0)))], IIC_SSE_MOVD_ToGP>,
+ Sched<[WriteMove]>;
+def MOVPDI2DImr : PDI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(store (i32 (vector_extract (v4i32 VR128:$src),
+ (iPTR 0))), addr:$dst)],
+ IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
+
+//===---------------------------------------------------------------------===//
+// Move Packed Doubleword Int first element to Doubleword Int
+//
+let SchedRW = [WriteMove] in {
+def VMOVPQIto64rr : VRPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
+ "mov{d|q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (vector_extract (v2i64 VR128:$src),
+ (iPTR 0)))],
+ IIC_SSE_MOVD_ToGP>,
+ VEX;
+
+def MOVPQIto64rr : RPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
+ "mov{d|q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (vector_extract (v2i64 VR128:$src),
+ (iPTR 0)))],
+ IIC_SSE_MOVD_ToGP>;
+} //SchedRW
+
+//===---------------------------------------------------------------------===//
+// Bitcast FR64 <-> GR64
+//
+let Predicates = [HasAVX] in
+def VMOV64toSDrm : S2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
+ "vmovq\t{$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>,
+ VEX, Sched<[WriteLoad]>;
+def VMOVSDto64rr : VRPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
+ "mov{d|q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (bitconvert FR64:$src))],
+ IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
+def VMOVSDto64mr : VRPDI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(store (i64 (bitconvert FR64:$src)), addr:$dst)],
+ IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>;
+
+def MOV64toSDrm : S2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))],
+ IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
+def MOVSDto64rr : RPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
+ "mov{d|q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (bitconvert FR64:$src))],
+ IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>;
+def MOVSDto64mr : RPDI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(store (i64 (bitconvert FR64:$src)), addr:$dst)],
+ IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
+
+//===---------------------------------------------------------------------===//
+// Move Scalar Single to Double Int
+//
+def VMOVSS2DIrr : VPDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (bitconvert FR32:$src))],
+ IIC_SSE_MOVD_ToGP>, VEX, Sched<[WriteMove]>;
+def VMOVSS2DImr : VPDI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(store (i32 (bitconvert FR32:$src)), addr:$dst)],
+ IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>;
+def MOVSS2DIrr : PDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (bitconvert FR32:$src))],
+ IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>;
+def MOVSS2DImr : PDI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(store (i32 (bitconvert FR32:$src)), addr:$dst)],
+ IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
+
+//===---------------------------------------------------------------------===//
+// Patterns and instructions to describe movd/movq to XMM register zero-extends
+//
+let SchedRW = [WriteMove] in {
+let AddedComplexity = 15 in {
+def VMOVZDI2PDIrr : VPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (v4i32 (X86vzmovl
+ (v4i32 (scalar_to_vector GR32:$src)))))],
+ IIC_SSE_MOVDQ>, VEX;
+def VMOVZQI2PQIrr : VPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
+ "mov{d|q}\t{$src, $dst|$dst, $src}", // X86-64 only
+ [(set VR128:$dst, (v2i64 (X86vzmovl
+ (v2i64 (scalar_to_vector GR64:$src)))))],
+ IIC_SSE_MOVDQ>,
+ VEX, VEX_W;
+}
+let AddedComplexity = 15 in {
+def MOVZDI2PDIrr : PDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (v4i32 (X86vzmovl
+ (v4i32 (scalar_to_vector GR32:$src)))))],
+ IIC_SSE_MOVDQ>;
+def MOVZQI2PQIrr : RPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
+ "mov{d|q}\t{$src, $dst|$dst, $src}", // X86-64 only
+ [(set VR128:$dst, (v2i64 (X86vzmovl
+ (v2i64 (scalar_to_vector GR64:$src)))))],
+ IIC_SSE_MOVDQ>;
+}
+} // SchedRW
+
+let AddedComplexity = 20, SchedRW = [WriteLoad] in {
+def VMOVZDI2PDIrm : VPDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v4i32 (X86vzmovl (v4i32 (scalar_to_vector
+ (loadi32 addr:$src))))))],
+ IIC_SSE_MOVDQ>, VEX;
+def MOVZDI2PDIrm : PDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v4i32 (X86vzmovl (v4i32 (scalar_to_vector
+ (loadi32 addr:$src))))))],
+ IIC_SSE_MOVDQ>;
+} // AddedComplexity, SchedRW
+
+let Predicates = [HasAVX] in {
+ // AVX 128-bit movd/movq instruction write zeros in the high 128-bit part.
+ let AddedComplexity = 20 in {
+ def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
+ (VMOVZDI2PDIrm addr:$src)>;
+ def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
+ (VMOVZDI2PDIrm addr:$src)>;
+ }
+ // Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext.
+ def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
+ (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))),
+ (SUBREG_TO_REG (i32 0), (VMOVZDI2PDIrr GR32:$src), sub_xmm)>;
+ def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
+ (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
+ (SUBREG_TO_REG (i64 0), (VMOVZQI2PQIrr GR64:$src), sub_xmm)>;
+}
+
+let Predicates = [UseSSE2], AddedComplexity = 20 in {
+ def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
+ (MOVZDI2PDIrm addr:$src)>;
+ def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
+ (MOVZDI2PDIrm addr:$src)>;
+}
+
+// These are the correct encodings of the instructions so that we know how to
+// read correct assembly, even though we continue to emit the wrong ones for
+// compatibility with Darwin's buggy assembler.
+def : InstAlias<"movq\t{$src, $dst|$dst, $src}",
+ (MOV64toPQIrr VR128:$dst, GR64:$src), 0>;
+def : InstAlias<"movq\t{$src, $dst|$dst, $src}",
+ (MOV64toSDrr FR64:$dst, GR64:$src), 0>;
+def : InstAlias<"movq\t{$src, $dst|$dst, $src}",
+ (MOVPQIto64rr GR64:$dst, VR128:$src), 0>;
+def : InstAlias<"movq\t{$src, $dst|$dst, $src}",
+ (MOVSDto64rr GR64:$dst, FR64:$src), 0>;
+def : InstAlias<"movq\t{$src, $dst|$dst, $src}",
+ (VMOVZQI2PQIrr VR128:$dst, GR64:$src), 0>;
+def : InstAlias<"movq\t{$src, $dst|$dst, $src}",
+ (MOVZQI2PQIrr VR128:$dst, GR64:$src), 0>;
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Move Quadword
+//===---------------------------------------------------------------------===//
+
+//===---------------------------------------------------------------------===//
+// Move Quadword Int to Packed Quadword Int
+//
+
+let SchedRW = [WriteLoad] in {
+def VMOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+ "vmovq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, XS,
+ VEX, Requires<[HasAVX]>;
+def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v2i64 (scalar_to_vector (loadi64 addr:$src))))],
+ IIC_SSE_MOVDQ>, XS,
+ Requires<[UseSSE2]>; // SSE2 instruction with XS Prefix
+} // SchedRW
+
+//===---------------------------------------------------------------------===//
+// Move Packed Quadword Int to Quadword Int
+//
+let SchedRW = [WriteStore] in {
+def VMOVPQI2QImr : VPDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(store (i64 (vector_extract (v2i64 VR128:$src),
+ (iPTR 0))), addr:$dst)],
+ IIC_SSE_MOVDQ>, VEX;
+def MOVPQI2QImr : PDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(store (i64 (vector_extract (v2i64 VR128:$src),
+ (iPTR 0))), addr:$dst)],
+ IIC_SSE_MOVDQ>;
+} // SchedRW
+
+//===---------------------------------------------------------------------===//
+// Store / copy lower 64-bits of a XMM register.
+//
+def VMOVLQ128mr : VPDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(int_x86_sse2_storel_dq addr:$dst, VR128:$src)]>, VEX,
+ Sched<[WriteStore]>;
+def MOVLQ128mr : PDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(int_x86_sse2_storel_dq addr:$dst, VR128:$src)],
+ IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
+
+let AddedComplexity = 20 in
+def VMOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+ "vmovq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v2i64 (X86vzmovl (v2i64 (scalar_to_vector
+ (loadi64 addr:$src))))))],
+ IIC_SSE_MOVDQ>,
+ XS, VEX, Requires<[HasAVX]>, Sched<[WriteLoad]>;
+
+let AddedComplexity = 20 in
+def MOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v2i64 (X86vzmovl (v2i64 (scalar_to_vector
+ (loadi64 addr:$src))))))],
+ IIC_SSE_MOVDQ>,
+ XS, Requires<[UseSSE2]>, Sched<[WriteLoad]>;
+
+let Predicates = [HasAVX], AddedComplexity = 20 in {
+ def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
+ (VMOVZQI2PQIrm addr:$src)>;
+ def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))),
+ (VMOVZQI2PQIrm addr:$src)>;
+ def : Pat<(v2i64 (X86vzload addr:$src)),
+ (VMOVZQI2PQIrm addr:$src)>;
+}
+
+let Predicates = [UseSSE2], AddedComplexity = 20 in {
+ def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
+ (MOVZQI2PQIrm addr:$src)>;
+ def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))),
+ (MOVZQI2PQIrm addr:$src)>;
+ def : Pat<(v2i64 (X86vzload addr:$src)), (MOVZQI2PQIrm addr:$src)>;
+}
+
+let Predicates = [HasAVX] in {
+def : Pat<(v4i64 (alignedX86vzload addr:$src)),
+ (SUBREG_TO_REG (i32 0), (VMOVAPSrm addr:$src), sub_xmm)>;
+def : Pat<(v4i64 (X86vzload addr:$src)),
+ (SUBREG_TO_REG (i32 0), (VMOVUPSrm addr:$src), sub_xmm)>;
+}
+
+//===---------------------------------------------------------------------===//
+// Moving from XMM to XMM and clear upper 64 bits. Note, there is a bug in
+// IA32 document. movq xmm1, xmm2 does clear the high bits.
+//
+let SchedRW = [WriteVecLogic] in {
+let AddedComplexity = 15 in
+def VMOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "vmovq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))],
+ IIC_SSE_MOVQ_RR>,
+ XS, VEX, Requires<[HasAVX]>;
+let AddedComplexity = 15 in
+def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))],
+ IIC_SSE_MOVQ_RR>,
+ XS, Requires<[UseSSE2]>;
+} // SchedRW
+
+let SchedRW = [WriteVecLogicLd] in {
+let AddedComplexity = 20 in
+def VMOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+ "vmovq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (v2i64 (X86vzmovl
+ (loadv2i64 addr:$src))))],
+ IIC_SSE_MOVDQ>,
+ XS, VEX, Requires<[HasAVX]>;
+let AddedComplexity = 20 in {
+def MOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (v2i64 (X86vzmovl
+ (loadv2i64 addr:$src))))],
+ IIC_SSE_MOVDQ>,
+ XS, Requires<[UseSSE2]>;
+}
+} // SchedRW
+
+let AddedComplexity = 20 in {
+ let Predicates = [HasAVX] in {
+ def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
+ (VMOVZPQILo2PQIrm addr:$src)>;
+ def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
+ (VMOVZPQILo2PQIrr VR128:$src)>;
+ }
+ let Predicates = [UseSSE2] in {
+ def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
+ (MOVZPQILo2PQIrm addr:$src)>;
+ def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
+ (MOVZPQILo2PQIrr VR128:$src)>;
+ }
+}
+
+// Instructions to match in the assembler
+let SchedRW = [WriteMove] in {
+def VMOVQs64rr : VPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
+ "movq\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_MOVDQ>, VEX, VEX_W;
+def VMOVQd64rr : VPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
+ "movq\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_MOVDQ>, VEX, VEX_W;
+// Recognize "movd" with GR64 destination, but encode as a "movq"
+def VMOVQd64rr_alt : VPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
+ "movd\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_MOVDQ>, VEX, VEX_W;
+} // SchedRW
+
+// Instructions for the disassembler
+// xr = XMM register
+// xm = mem64
+
+let SchedRW = [WriteMove] in {
+let Predicates = [HasAVX] in
+def VMOVQxrxr: I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "vmovq\t{$src, $dst|$dst, $src}", []>, VEX, XS;
+def MOVQxrxr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "movq\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVQ_RR>, XS;
+} // SchedRW
+
+//===---------------------------------------------------------------------===//
+// SSE3 - Replicate Single FP - MOVSHDUP and MOVSLDUP
+//===---------------------------------------------------------------------===//
+multiclass sse3_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr,
+ ValueType vt, RegisterClass RC, PatFrag mem_frag,
+ X86MemOperand x86memop> {
+def rr : S3SI<op, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set RC:$dst, (vt (OpNode RC:$src)))],
+ IIC_SSE_MOV_LH>, Sched<[WriteShuffle]>;
+def rm : S3SI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set RC:$dst, (OpNode (mem_frag addr:$src)))],
+ IIC_SSE_MOV_LH>, Sched<[WriteShuffleLd]>;
+}
+
+let Predicates = [HasAVX] in {
+ defm VMOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
+ v4f32, VR128, memopv4f32, f128mem>, VEX;
+ defm VMOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
+ v4f32, VR128, memopv4f32, f128mem>, VEX;
+ defm VMOVSHDUPY : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
+ v8f32, VR256, memopv8f32, f256mem>, VEX, VEX_L;
+ defm VMOVSLDUPY : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
+ v8f32, VR256, memopv8f32, f256mem>, VEX, VEX_L;
+}
+defm MOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "movshdup", v4f32, VR128,
+ memopv4f32, f128mem>;
+defm MOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "movsldup", v4f32, VR128,
+ memopv4f32, f128mem>;
+
+let Predicates = [HasAVX] in {
+ def : Pat<(v4i32 (X86Movshdup VR128:$src)),
+ (VMOVSHDUPrr VR128:$src)>;
+ def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (memopv2i64 addr:$src)))),
+ (VMOVSHDUPrm addr:$src)>;
+ def : Pat<(v4i32 (X86Movsldup VR128:$src)),
+ (VMOVSLDUPrr VR128:$src)>;
+ def : Pat<(v4i32 (X86Movsldup (bc_v4i32 (memopv2i64 addr:$src)))),
+ (VMOVSLDUPrm addr:$src)>;
+ def : Pat<(v8i32 (X86Movshdup VR256:$src)),
+ (VMOVSHDUPYrr VR256:$src)>;
+ def : Pat<(v8i32 (X86Movshdup (bc_v8i32 (memopv4i64 addr:$src)))),
+ (VMOVSHDUPYrm addr:$src)>;
+ def : Pat<(v8i32 (X86Movsldup VR256:$src)),
+ (VMOVSLDUPYrr VR256:$src)>;
+ def : Pat<(v8i32 (X86Movsldup (bc_v8i32 (memopv4i64 addr:$src)))),
+ (VMOVSLDUPYrm addr:$src)>;
+}
+
+let Predicates = [UseSSE3] in {
+ def : Pat<(v4i32 (X86Movshdup VR128:$src)),
+ (MOVSHDUPrr VR128:$src)>;
+ def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (memopv2i64 addr:$src)))),
+ (MOVSHDUPrm addr:$src)>;
+ def : Pat<(v4i32 (X86Movsldup VR128:$src)),
+ (MOVSLDUPrr VR128:$src)>;
+ def : Pat<(v4i32 (X86Movsldup (bc_v4i32 (memopv2i64 addr:$src)))),
+ (MOVSLDUPrm addr:$src)>;
+}
+
+//===---------------------------------------------------------------------===//
+// SSE3 - Replicate Double FP - MOVDDUP
+//===---------------------------------------------------------------------===//
+
+multiclass sse3_replicate_dfp<string OpcodeStr> {
+let neverHasSideEffects = 1 in
+def rr : S3DI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [], IIC_SSE_MOV_LH>, Sched<[WriteShuffle]>;
+def rm : S3DI<0x12, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst,
+ (v2f64 (X86Movddup
+ (scalar_to_vector (loadf64 addr:$src)))))],
+ IIC_SSE_MOV_LH>, Sched<[WriteShuffleLd]>;
+}
+
+// FIXME: Merge with above classe when there're patterns for the ymm version
+multiclass sse3_replicate_dfp_y<string OpcodeStr> {
+def rr : S3DI<0x12, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR256:$dst, (v4f64 (X86Movddup VR256:$src)))]>,
+ Sched<[WriteShuffle]>;
+def rm : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR256:$dst,
+ (v4f64 (X86Movddup
+ (scalar_to_vector (loadf64 addr:$src)))))]>,
+ Sched<[WriteShuffleLd]>;
+}
+
+let Predicates = [HasAVX] in {
+ defm VMOVDDUP : sse3_replicate_dfp<"vmovddup">, VEX;
+ defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup">, VEX, VEX_L;
+}
+
+defm MOVDDUP : sse3_replicate_dfp<"movddup">;
+
+let Predicates = [HasAVX] in {
+ def : Pat<(X86Movddup (memopv2f64 addr:$src)),
+ (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
+ def : Pat<(X86Movddup (bc_v2f64 (memopv4f32 addr:$src))),
+ (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
+ def : Pat<(X86Movddup (bc_v2f64 (memopv2i64 addr:$src))),
+ (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
+ def : Pat<(X86Movddup (bc_v2f64
+ (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
+ (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
+
+ // 256-bit version
+ def : Pat<(X86Movddup (memopv4f64 addr:$src)),
+ (VMOVDDUPYrm addr:$src)>;
+ def : Pat<(X86Movddup (memopv4i64 addr:$src)),
+ (VMOVDDUPYrm addr:$src)>;
+ def : Pat<(X86Movddup (v4i64 (scalar_to_vector (loadi64 addr:$src)))),
+ (VMOVDDUPYrm addr:$src)>;
+ def : Pat<(X86Movddup (v4i64 VR256:$src)),
+ (VMOVDDUPYrr VR256:$src)>;
+}
+
+let Predicates = [UseSSE3] in {
+ def : Pat<(X86Movddup (memopv2f64 addr:$src)),
+ (MOVDDUPrm addr:$src)>;
+ def : Pat<(X86Movddup (bc_v2f64 (memopv4f32 addr:$src))),
+ (MOVDDUPrm addr:$src)>;
+ def : Pat<(X86Movddup (bc_v2f64 (memopv2i64 addr:$src))),
+ (MOVDDUPrm addr:$src)>;
+ def : Pat<(X86Movddup (bc_v2f64
+ (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
+ (MOVDDUPrm addr:$src)>;
+}
+
+//===---------------------------------------------------------------------===//
+// SSE3 - Move Unaligned Integer
+//===---------------------------------------------------------------------===//
+
+let SchedRW = [WriteLoad] in {
+let Predicates = [HasAVX] in {
+ def VLDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+ "vlddqu\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>, VEX;
+ def VLDDQUYrm : S3DI<0xF0, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
+ "vlddqu\t{$src, $dst|$dst, $src}",
+ [(set VR256:$dst, (int_x86_avx_ldu_dq_256 addr:$src))]>,
+ VEX, VEX_L;
+}
+def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+ "lddqu\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))],
+ IIC_SSE_LDDQU>;
+}
+
+//===---------------------------------------------------------------------===//
+// SSE3 - Arithmetic
+//===---------------------------------------------------------------------===//
+
+multiclass sse3_addsub<Intrinsic Int, string OpcodeStr, RegisterClass RC,
+ X86MemOperand x86memop, OpndItins itins,
+ bit Is2Addr = 1> {
+ def rr : I<0xD0, MRMSrcReg,
+ (outs RC:$dst), (ins RC:$src1, RC:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (Int RC:$src1, RC:$src2))], itins.rr>,
+ Sched<[itins.Sched]>;
+ def rm : I<0xD0, MRMSrcMem,
+ (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (Int RC:$src1, (memop addr:$src2)))], itins.rr>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+
+let Predicates = [HasAVX] in {
+ let ExeDomain = SSEPackedSingle in {
+ defm VADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "vaddsubps", VR128,
+ f128mem, SSE_ALU_F32P, 0>, TB, XD, VEX_4V;
+ defm VADDSUBPSY : sse3_addsub<int_x86_avx_addsub_ps_256, "vaddsubps", VR256,
+ f256mem, SSE_ALU_F32P, 0>, TB, XD, VEX_4V, VEX_L;
+ }
+ let ExeDomain = SSEPackedDouble in {
+ defm VADDSUBPD : sse3_addsub<int_x86_sse3_addsub_pd, "vaddsubpd", VR128,
+ f128mem, SSE_ALU_F64P, 0>, TB, OpSize, VEX_4V;
+ defm VADDSUBPDY : sse3_addsub<int_x86_avx_addsub_pd_256, "vaddsubpd", VR256,
+ f256mem, SSE_ALU_F64P, 0>, TB, OpSize, VEX_4V, VEX_L;
+ }
+}
+let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in {
+ let ExeDomain = SSEPackedSingle in
+ defm ADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "addsubps", VR128,
+ f128mem, SSE_ALU_F32P>, TB, XD;
+ let ExeDomain = SSEPackedDouble in
+ defm ADDSUBPD : sse3_addsub<int_x86_sse3_addsub_pd, "addsubpd", VR128,
+ f128mem, SSE_ALU_F64P>, TB, OpSize;
+}
+
+//===---------------------------------------------------------------------===//
+// SSE3 Instructions
+//===---------------------------------------------------------------------===//
+
+// Horizontal ops
+multiclass S3D_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
+ X86MemOperand x86memop, SDNode OpNode, bit Is2Addr = 1> {
+ def rr : S3DI<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], IIC_SSE_HADDSUB_RR>,
+ Sched<[WriteFAdd]>;
+
+ def rm : S3DI<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (vt (OpNode RC:$src1, (memop addr:$src2))))],
+ IIC_SSE_HADDSUB_RM>, Sched<[WriteFAddLd, ReadAfterLd]>;
+}
+multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
+ X86MemOperand x86memop, SDNode OpNode, bit Is2Addr = 1> {
+ def rr : S3I<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], IIC_SSE_HADDSUB_RR>,
+ Sched<[WriteFAdd]>;
+
+ def rm : S3I<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (vt (OpNode RC:$src1, (memop addr:$src2))))],
+ IIC_SSE_HADDSUB_RM>, Sched<[WriteFAddLd, ReadAfterLd]>;
+}
+
+let Predicates = [HasAVX] in {
+ let ExeDomain = SSEPackedSingle in {
+ defm VHADDPS : S3D_Int<0x7C, "vhaddps", v4f32, VR128, f128mem,
+ X86fhadd, 0>, VEX_4V;
+ defm VHSUBPS : S3D_Int<0x7D, "vhsubps", v4f32, VR128, f128mem,
+ X86fhsub, 0>, VEX_4V;
+ defm VHADDPSY : S3D_Int<0x7C, "vhaddps", v8f32, VR256, f256mem,
+ X86fhadd, 0>, VEX_4V, VEX_L;
+ defm VHSUBPSY : S3D_Int<0x7D, "vhsubps", v8f32, VR256, f256mem,
+ X86fhsub, 0>, VEX_4V, VEX_L;
+ }
+ let ExeDomain = SSEPackedDouble in {
+ defm VHADDPD : S3_Int <0x7C, "vhaddpd", v2f64, VR128, f128mem,
+ X86fhadd, 0>, VEX_4V;
+ defm VHSUBPD : S3_Int <0x7D, "vhsubpd", v2f64, VR128, f128mem,
+ X86fhsub, 0>, VEX_4V;
+ defm VHADDPDY : S3_Int <0x7C, "vhaddpd", v4f64, VR256, f256mem,
+ X86fhadd, 0>, VEX_4V, VEX_L;
+ defm VHSUBPDY : S3_Int <0x7D, "vhsubpd", v4f64, VR256, f256mem,
+ X86fhsub, 0>, VEX_4V, VEX_L;
+ }
+}
+
+let Constraints = "$src1 = $dst" in {
+ let ExeDomain = SSEPackedSingle in {
+ defm HADDPS : S3D_Int<0x7C, "haddps", v4f32, VR128, f128mem, X86fhadd>;
+ defm HSUBPS : S3D_Int<0x7D, "hsubps", v4f32, VR128, f128mem, X86fhsub>;
+ }
+ let ExeDomain = SSEPackedDouble in {
+ defm HADDPD : S3_Int<0x7C, "haddpd", v2f64, VR128, f128mem, X86fhadd>;
+ defm HSUBPD : S3_Int<0x7D, "hsubpd", v2f64, VR128, f128mem, X86fhsub>;
+ }
+}
+
+//===---------------------------------------------------------------------===//
+// SSSE3 - Packed Absolute Instructions
+//===---------------------------------------------------------------------===//
+
+
+/// SS3I_unop_rm_int - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
+multiclass SS3I_unop_rm_int<bits<8> opc, string OpcodeStr,
+ Intrinsic IntId128> {
+ def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (IntId128 VR128:$src))], IIC_SSE_PABS_RR>,
+ OpSize, Sched<[WriteVecALU]>;
+
+ def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins i128mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst,
+ (IntId128
+ (bitconvert (memopv2i64 addr:$src))))], IIC_SSE_PABS_RM>,
+ OpSize, Sched<[WriteVecALULd]>;
+}
+
+/// SS3I_unop_rm_int_y - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
+multiclass SS3I_unop_rm_int_y<bits<8> opc, string OpcodeStr,
+ Intrinsic IntId256> {
+ def rr256 : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
+ (ins VR256:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR256:$dst, (IntId256 VR256:$src))]>,
+ OpSize, Sched<[WriteVecALU]>;
+
+ def rm256 : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
+ (ins i256mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR256:$dst,
+ (IntId256
+ (bitconvert (memopv4i64 addr:$src))))]>, OpSize,
+ Sched<[WriteVecALULd]>;
+}
+
+// Helper fragments to match sext vXi1 to vXiY.
+def v16i1sextv16i8 : PatLeaf<(v16i8 (X86pcmpgt (bc_v16i8 (v4i32 immAllZerosV)),
+ VR128:$src))>;
+def v8i1sextv8i16 : PatLeaf<(v8i16 (X86vsrai VR128:$src, (i32 15)))>;
+def v4i1sextv4i32 : PatLeaf<(v4i32 (X86vsrai VR128:$src, (i32 31)))>;
+def v32i1sextv32i8 : PatLeaf<(v32i8 (X86pcmpgt (bc_v32i8 (v8i32 immAllZerosV)),
+ VR256:$src))>;
+def v16i1sextv16i16: PatLeaf<(v16i16 (X86vsrai VR256:$src, (i32 15)))>;
+def v8i1sextv8i32 : PatLeaf<(v8i32 (X86vsrai VR256:$src, (i32 31)))>;
+
+let Predicates = [HasAVX] in {
+ defm VPABSB : SS3I_unop_rm_int<0x1C, "vpabsb",
+ int_x86_ssse3_pabs_b_128>, VEX;
+ defm VPABSW : SS3I_unop_rm_int<0x1D, "vpabsw",
+ int_x86_ssse3_pabs_w_128>, VEX;
+ defm VPABSD : SS3I_unop_rm_int<0x1E, "vpabsd",
+ int_x86_ssse3_pabs_d_128>, VEX;
+
+ def : Pat<(xor
+ (bc_v2i64 (v16i1sextv16i8)),
+ (bc_v2i64 (add (v16i8 VR128:$src), (v16i1sextv16i8)))),
+ (VPABSBrr128 VR128:$src)>;
+ def : Pat<(xor
+ (bc_v2i64 (v8i1sextv8i16)),
+ (bc_v2i64 (add (v8i16 VR128:$src), (v8i1sextv8i16)))),
+ (VPABSWrr128 VR128:$src)>;
+ def : Pat<(xor
+ (bc_v2i64 (v4i1sextv4i32)),
+ (bc_v2i64 (add (v4i32 VR128:$src), (v4i1sextv4i32)))),
+ (VPABSDrr128 VR128:$src)>;
+}
+
+let Predicates = [HasAVX2] in {
+ defm VPABSB : SS3I_unop_rm_int_y<0x1C, "vpabsb",
+ int_x86_avx2_pabs_b>, VEX, VEX_L;
+ defm VPABSW : SS3I_unop_rm_int_y<0x1D, "vpabsw",
+ int_x86_avx2_pabs_w>, VEX, VEX_L;
+ defm VPABSD : SS3I_unop_rm_int_y<0x1E, "vpabsd",
+ int_x86_avx2_pabs_d>, VEX, VEX_L;
+
+ def : Pat<(xor
+ (bc_v4i64 (v32i1sextv32i8)),
+ (bc_v4i64 (add (v32i8 VR256:$src), (v32i1sextv32i8)))),
+ (VPABSBrr256 VR256:$src)>;
+ def : Pat<(xor
+ (bc_v4i64 (v16i1sextv16i16)),
+ (bc_v4i64 (add (v16i16 VR256:$src), (v16i1sextv16i16)))),
+ (VPABSWrr256 VR256:$src)>;
+ def : Pat<(xor
+ (bc_v4i64 (v8i1sextv8i32)),
+ (bc_v4i64 (add (v8i32 VR256:$src), (v8i1sextv8i32)))),
+ (VPABSDrr256 VR256:$src)>;
+}
+
+defm PABSB : SS3I_unop_rm_int<0x1C, "pabsb",
+ int_x86_ssse3_pabs_b_128>;
+defm PABSW : SS3I_unop_rm_int<0x1D, "pabsw",
+ int_x86_ssse3_pabs_w_128>;
+defm PABSD : SS3I_unop_rm_int<0x1E, "pabsd",
+ int_x86_ssse3_pabs_d_128>;
+
+let Predicates = [HasSSSE3] in {
+ def : Pat<(xor
+ (bc_v2i64 (v16i1sextv16i8)),
+ (bc_v2i64 (add (v16i8 VR128:$src), (v16i1sextv16i8)))),
+ (PABSBrr128 VR128:$src)>;
+ def : Pat<(xor
+ (bc_v2i64 (v8i1sextv8i16)),
+ (bc_v2i64 (add (v8i16 VR128:$src), (v8i1sextv8i16)))),
+ (PABSWrr128 VR128:$src)>;
+ def : Pat<(xor
+ (bc_v2i64 (v4i1sextv4i32)),
+ (bc_v2i64 (add (v4i32 VR128:$src), (v4i1sextv4i32)))),
+ (PABSDrr128 VR128:$src)>;
+}
+
+//===---------------------------------------------------------------------===//
+// SSSE3 - Packed Binary Operator Instructions
+//===---------------------------------------------------------------------===//
+
+let Sched = WriteVecALU in {
+def SSE_PHADDSUBD : OpndItins<
+ IIC_SSE_PHADDSUBD_RR, IIC_SSE_PHADDSUBD_RM
+>;
+def SSE_PHADDSUBSW : OpndItins<
+ IIC_SSE_PHADDSUBSW_RR, IIC_SSE_PHADDSUBSW_RM
+>;
+def SSE_PHADDSUBW : OpndItins<
+ IIC_SSE_PHADDSUBW_RR, IIC_SSE_PHADDSUBW_RM
+>;
+}
+let Sched = WriteShuffle in
+def SSE_PSHUFB : OpndItins<
+ IIC_SSE_PSHUFB_RR, IIC_SSE_PSHUFB_RM
+>;
+let Sched = WriteVecALU in
+def SSE_PSIGN : OpndItins<
+ IIC_SSE_PSIGN_RR, IIC_SSE_PSIGN_RM
+>;
+let Sched = WriteVecIMul in
+def SSE_PMULHRSW : OpndItins<
+ IIC_SSE_PMULHRSW, IIC_SSE_PMULHRSW
+>;
+
+/// SS3I_binop_rm - Simple SSSE3 bin op
+multiclass SS3I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
+ X86MemOperand x86memop, OpndItins itins,
+ bit Is2Addr = 1> {
+ let isCommutable = 1 in
+ def rr : SS38I<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))], itins.rr>,
+ OpSize, Sched<[itins.Sched]>;
+ def rm : SS38I<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst,
+ (OpVT (OpNode RC:$src1,
+ (bitconvert (memop_frag addr:$src2)))))], itins.rm>, OpSize,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+
+/// SS3I_binop_rm_int - Simple SSSE3 bin op whose type can be v*{i8,i16,i32}.
+multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr,
+ Intrinsic IntId128, OpndItins itins,
+ bit Is2Addr = 1> {
+ let isCommutable = 1 in
+ def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
+ OpSize, Sched<[itins.Sched]>;
+ def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set VR128:$dst,
+ (IntId128 VR128:$src1,
+ (bitconvert (memopv2i64 addr:$src2))))]>, OpSize,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+
+multiclass SS3I_binop_rm_int_y<bits<8> opc, string OpcodeStr,
+ Intrinsic IntId256> {
+ let isCommutable = 1 in
+ def rr256 : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst, (IntId256 VR256:$src1, VR256:$src2))]>,
+ OpSize;
+ def rm256 : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
+ (ins VR256:$src1, i256mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst,
+ (IntId256 VR256:$src1,
+ (bitconvert (memopv4i64 addr:$src2))))]>, OpSize;
+}
+
+let ImmT = NoImm, Predicates = [HasAVX] in {
+let isCommutable = 0 in {
+ defm VPHADDW : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v8i16, VR128,
+ memopv2i64, i128mem,
+ SSE_PHADDSUBW, 0>, VEX_4V;
+ defm VPHADDD : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v4i32, VR128,
+ memopv2i64, i128mem,
+ SSE_PHADDSUBD, 0>, VEX_4V;
+ defm VPHSUBW : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v8i16, VR128,
+ memopv2i64, i128mem,
+ SSE_PHADDSUBW, 0>, VEX_4V;
+ defm VPHSUBD : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v4i32, VR128,
+ memopv2i64, i128mem,
+ SSE_PHADDSUBD, 0>, VEX_4V;
+ defm VPSIGNB : SS3I_binop_rm<0x08, "vpsignb", X86psign, v16i8, VR128,
+ memopv2i64, i128mem,
+ SSE_PSIGN, 0>, VEX_4V;
+ defm VPSIGNW : SS3I_binop_rm<0x09, "vpsignw", X86psign, v8i16, VR128,
+ memopv2i64, i128mem,
+ SSE_PSIGN, 0>, VEX_4V;
+ defm VPSIGND : SS3I_binop_rm<0x0A, "vpsignd", X86psign, v4i32, VR128,
+ memopv2i64, i128mem,
+ SSE_PSIGN, 0>, VEX_4V;
+ defm VPSHUFB : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v16i8, VR128,
+ memopv2i64, i128mem,
+ SSE_PSHUFB, 0>, VEX_4V;
+ defm VPHADDSW : SS3I_binop_rm_int<0x03, "vphaddsw",
+ int_x86_ssse3_phadd_sw_128,
+ SSE_PHADDSUBSW, 0>, VEX_4V;
+ defm VPHSUBSW : SS3I_binop_rm_int<0x07, "vphsubsw",
+ int_x86_ssse3_phsub_sw_128,
+ SSE_PHADDSUBSW, 0>, VEX_4V;
+ defm VPMADDUBSW : SS3I_binop_rm_int<0x04, "vpmaddubsw",
+ int_x86_ssse3_pmadd_ub_sw_128,
+ SSE_PMADD, 0>, VEX_4V;
+}
+defm VPMULHRSW : SS3I_binop_rm_int<0x0B, "vpmulhrsw",
+ int_x86_ssse3_pmul_hr_sw_128,
+ SSE_PMULHRSW, 0>, VEX_4V;
+}
+
+let ImmT = NoImm, Predicates = [HasAVX2] in {
+let isCommutable = 0 in {
+ defm VPHADDWY : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v16i16, VR256,
+ memopv4i64, i256mem,
+ SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
+ defm VPHADDDY : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v8i32, VR256,
+ memopv4i64, i256mem,
+ SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
+ defm VPHSUBWY : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v16i16, VR256,
+ memopv4i64, i256mem,
+ SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
+ defm VPHSUBDY : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, VR256,
+ memopv4i64, i256mem,
+ SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
+ defm VPSIGNBY : SS3I_binop_rm<0x08, "vpsignb", X86psign, v32i8, VR256,
+ memopv4i64, i256mem,
+ SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
+ defm VPSIGNWY : SS3I_binop_rm<0x09, "vpsignw", X86psign, v16i16, VR256,
+ memopv4i64, i256mem,
+ SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
+ defm VPSIGNDY : SS3I_binop_rm<0x0A, "vpsignd", X86psign, v8i32, VR256,
+ memopv4i64, i256mem,
+ SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
+ defm VPSHUFBY : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v32i8, VR256,
+ memopv4i64, i256mem,
+ SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
+ defm VPHADDSW : SS3I_binop_rm_int_y<0x03, "vphaddsw",
+ int_x86_avx2_phadd_sw>, VEX_4V, VEX_L;
+ defm VPHSUBSW : SS3I_binop_rm_int_y<0x07, "vphsubsw",
+ int_x86_avx2_phsub_sw>, VEX_4V, VEX_L;
+ defm VPMADDUBSW : SS3I_binop_rm_int_y<0x04, "vpmaddubsw",
+ int_x86_avx2_pmadd_ub_sw>, VEX_4V, VEX_L;
+}
+defm VPMULHRSW : SS3I_binop_rm_int_y<0x0B, "vpmulhrsw",
+ int_x86_avx2_pmul_hr_sw>, VEX_4V, VEX_L;
+}
+
+// None of these have i8 immediate fields.
+let ImmT = NoImm, Constraints = "$src1 = $dst" in {
+let isCommutable = 0 in {
+ defm PHADDW : SS3I_binop_rm<0x01, "phaddw", X86hadd, v8i16, VR128,
+ memopv2i64, i128mem, SSE_PHADDSUBW>;
+ defm PHADDD : SS3I_binop_rm<0x02, "phaddd", X86hadd, v4i32, VR128,
+ memopv2i64, i128mem, SSE_PHADDSUBD>;
+ defm PHSUBW : SS3I_binop_rm<0x05, "phsubw", X86hsub, v8i16, VR128,
+ memopv2i64, i128mem, SSE_PHADDSUBW>;
+ defm PHSUBD : SS3I_binop_rm<0x06, "phsubd", X86hsub, v4i32, VR128,
+ memopv2i64, i128mem, SSE_PHADDSUBD>;
+ defm PSIGNB : SS3I_binop_rm<0x08, "psignb", X86psign, v16i8, VR128,
+ memopv2i64, i128mem, SSE_PSIGN>;
+ defm PSIGNW : SS3I_binop_rm<0x09, "psignw", X86psign, v8i16, VR128,
+ memopv2i64, i128mem, SSE_PSIGN>;
+ defm PSIGND : SS3I_binop_rm<0x0A, "psignd", X86psign, v4i32, VR128,
+ memopv2i64, i128mem, SSE_PSIGN>;
+ defm PSHUFB : SS3I_binop_rm<0x00, "pshufb", X86pshufb, v16i8, VR128,
+ memopv2i64, i128mem, SSE_PSHUFB>;
+ defm PHADDSW : SS3I_binop_rm_int<0x03, "phaddsw",
+ int_x86_ssse3_phadd_sw_128,
+ SSE_PHADDSUBSW>;
+ defm PHSUBSW : SS3I_binop_rm_int<0x07, "phsubsw",
+ int_x86_ssse3_phsub_sw_128,
+ SSE_PHADDSUBSW>;
+ defm PMADDUBSW : SS3I_binop_rm_int<0x04, "pmaddubsw",
+ int_x86_ssse3_pmadd_ub_sw_128, SSE_PMADD>;
+}
+defm PMULHRSW : SS3I_binop_rm_int<0x0B, "pmulhrsw",
+ int_x86_ssse3_pmul_hr_sw_128,
+ SSE_PMULHRSW>;
+}
+
+//===---------------------------------------------------------------------===//
+// SSSE3 - Packed Align Instruction Patterns
+//===---------------------------------------------------------------------===//
+
+multiclass ssse3_palignr<string asm, bit Is2Addr = 1> {
+ let neverHasSideEffects = 1 in {
+ def R128rr : SS3AI<0x0F, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, i8imm:$src3),
+ !if(Is2Addr,
+ !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !strconcat(asm,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ [], IIC_SSE_PALIGNR>, OpSize, Sched<[WriteShuffle]>;
+ let mayLoad = 1 in
+ def R128rm : SS3AI<0x0F, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
+ !if(Is2Addr,
+ !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !strconcat(asm,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ [], IIC_SSE_PALIGNR>, OpSize, Sched<[WriteShuffleLd, ReadAfterLd]>;
+ }
+}
+
+multiclass ssse3_palignr_y<string asm, bit Is2Addr = 1> {
+ let neverHasSideEffects = 1 in {
+ def R256rr : SS3AI<0x0F, MRMSrcReg, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2, i8imm:$src3),
+ !strconcat(asm,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ []>, OpSize, Sched<[WriteShuffle]>;
+ let mayLoad = 1 in
+ def R256rm : SS3AI<0x0F, MRMSrcMem, (outs VR256:$dst),
+ (ins VR256:$src1, i256mem:$src2, i8imm:$src3),
+ !strconcat(asm,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ []>, OpSize, Sched<[WriteShuffleLd, ReadAfterLd]>;
+ }
+}
+
+let Predicates = [HasAVX] in
+ defm VPALIGN : ssse3_palignr<"vpalignr", 0>, VEX_4V;
+let Predicates = [HasAVX2] in
+ defm VPALIGN : ssse3_palignr_y<"vpalignr", 0>, VEX_4V, VEX_L;
+let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in
+ defm PALIGN : ssse3_palignr<"palignr">;
+
+let Predicates = [HasAVX2] in {
+def : Pat<(v8i32 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+ (VPALIGNR256rr VR256:$src2, VR256:$src1, imm:$imm)>;
+def : Pat<(v8f32 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+ (VPALIGNR256rr VR256:$src2, VR256:$src1, imm:$imm)>;
+def : Pat<(v16i16 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+ (VPALIGNR256rr VR256:$src2, VR256:$src1, imm:$imm)>;
+def : Pat<(v32i8 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+ (VPALIGNR256rr VR256:$src2, VR256:$src1, imm:$imm)>;
+}
+
+let Predicates = [HasAVX] in {
+def : Pat<(v4i32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+ (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
+def : Pat<(v4f32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+ (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
+def : Pat<(v8i16 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+ (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
+def : Pat<(v16i8 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+ (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
+}
+
+let Predicates = [UseSSSE3] in {
+def : Pat<(v4i32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+ (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
+def : Pat<(v4f32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+ (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
+def : Pat<(v8i16 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+ (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
+def : Pat<(v16i8 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+ (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
+}
+
+//===---------------------------------------------------------------------===//
+// SSSE3 - Thread synchronization
+//===---------------------------------------------------------------------===//
+
+let SchedRW = [WriteSystem] in {
+let usesCustomInserter = 1 in {
+def MONITOR : PseudoI<(outs), (ins i32mem:$src1, GR32:$src2, GR32:$src3),
+ [(int_x86_sse3_monitor addr:$src1, GR32:$src2, GR32:$src3)]>,
+ Requires<[HasSSE3]>;
+}
+
+let Uses = [EAX, ECX, EDX] in
+def MONITORrrr : I<0x01, MRM_C8, (outs), (ins), "monitor", [], IIC_SSE_MONITOR>,
+ TB, Requires<[HasSSE3]>;
+let Uses = [ECX, EAX] in
+def MWAITrr : I<0x01, MRM_C9, (outs), (ins), "mwait",
+ [(int_x86_sse3_mwait ECX, EAX)], IIC_SSE_MWAIT>,
+ TB, Requires<[HasSSE3]>;
+} // SchedRW
+
+def : InstAlias<"mwait %eax, %ecx", (MWAITrr)>, Requires<[In32BitMode]>;
+def : InstAlias<"mwait %rax, %rcx", (MWAITrr)>, Requires<[In64BitMode]>;
+
+def : InstAlias<"monitor %eax, %ecx, %edx", (MONITORrrr)>,
+ Requires<[In32BitMode]>;
+def : InstAlias<"monitor %rax, %rcx, %rdx", (MONITORrrr)>,
+ Requires<[In64BitMode]>;
+
+//===----------------------------------------------------------------------===//
+// SSE4.1 - Packed Move with Sign/Zero Extend
+//===----------------------------------------------------------------------===//
+
+multiclass SS41I_binop_rm_int8<bits<8> opc, string OpcodeStr, Intrinsic IntId> {
+ def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (IntId VR128:$src))]>, OpSize;
+
+ def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst,
+ (IntId (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))]>,
+ OpSize;
+}
+
+multiclass SS41I_binop_rm_int16_y<bits<8> opc, string OpcodeStr,
+ Intrinsic IntId> {
+ def Yrr : SS48I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR256:$dst, (IntId VR128:$src))]>, OpSize;
+
+ def Yrm : SS48I<opc, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR256:$dst, (IntId (load addr:$src)))]>, OpSize;
+}
+
+let Predicates = [HasAVX] in {
+defm VPMOVSXBW : SS41I_binop_rm_int8<0x20, "vpmovsxbw", int_x86_sse41_pmovsxbw>,
+ VEX;
+defm VPMOVSXWD : SS41I_binop_rm_int8<0x23, "vpmovsxwd", int_x86_sse41_pmovsxwd>,
+ VEX;
+defm VPMOVSXDQ : SS41I_binop_rm_int8<0x25, "vpmovsxdq", int_x86_sse41_pmovsxdq>,
+ VEX;
+defm VPMOVZXBW : SS41I_binop_rm_int8<0x30, "vpmovzxbw", int_x86_sse41_pmovzxbw>,
+ VEX;
+defm VPMOVZXWD : SS41I_binop_rm_int8<0x33, "vpmovzxwd", int_x86_sse41_pmovzxwd>,
+ VEX;
+defm VPMOVZXDQ : SS41I_binop_rm_int8<0x35, "vpmovzxdq", int_x86_sse41_pmovzxdq>,
+ VEX;
+}
+
+let Predicates = [HasAVX2] in {
+defm VPMOVSXBW : SS41I_binop_rm_int16_y<0x20, "vpmovsxbw",
+ int_x86_avx2_pmovsxbw>, VEX, VEX_L;
+defm VPMOVSXWD : SS41I_binop_rm_int16_y<0x23, "vpmovsxwd",
+ int_x86_avx2_pmovsxwd>, VEX, VEX_L;
+defm VPMOVSXDQ : SS41I_binop_rm_int16_y<0x25, "vpmovsxdq",
+ int_x86_avx2_pmovsxdq>, VEX, VEX_L;
+defm VPMOVZXBW : SS41I_binop_rm_int16_y<0x30, "vpmovzxbw",
+ int_x86_avx2_pmovzxbw>, VEX, VEX_L;
+defm VPMOVZXWD : SS41I_binop_rm_int16_y<0x33, "vpmovzxwd",
+ int_x86_avx2_pmovzxwd>, VEX, VEX_L;
+defm VPMOVZXDQ : SS41I_binop_rm_int16_y<0x35, "vpmovzxdq",
+ int_x86_avx2_pmovzxdq>, VEX, VEX_L;
+}
+
+defm PMOVSXBW : SS41I_binop_rm_int8<0x20, "pmovsxbw", int_x86_sse41_pmovsxbw>;
+defm PMOVSXWD : SS41I_binop_rm_int8<0x23, "pmovsxwd", int_x86_sse41_pmovsxwd>;
+defm PMOVSXDQ : SS41I_binop_rm_int8<0x25, "pmovsxdq", int_x86_sse41_pmovsxdq>;
+defm PMOVZXBW : SS41I_binop_rm_int8<0x30, "pmovzxbw", int_x86_sse41_pmovzxbw>;
+defm PMOVZXWD : SS41I_binop_rm_int8<0x33, "pmovzxwd", int_x86_sse41_pmovzxwd>;
+defm PMOVZXDQ : SS41I_binop_rm_int8<0x35, "pmovzxdq", int_x86_sse41_pmovzxdq>;
+
+let Predicates = [HasAVX] in {
+ // Common patterns involving scalar load.
+ def : Pat<(int_x86_sse41_pmovsxbw (vzmovl_v2i64 addr:$src)),
+ (VPMOVSXBWrm addr:$src)>;
+ def : Pat<(int_x86_sse41_pmovsxbw (vzload_v2i64 addr:$src)),
+ (VPMOVSXBWrm addr:$src)>;
+ def : Pat<(int_x86_sse41_pmovsxbw (bc_v16i8 (loadv2i64 addr:$src))),
+ (VPMOVSXBWrm addr:$src)>;
+
+ def : Pat<(int_x86_sse41_pmovsxwd (vzmovl_v2i64 addr:$src)),
+ (VPMOVSXWDrm addr:$src)>;
+ def : Pat<(int_x86_sse41_pmovsxwd (vzload_v2i64 addr:$src)),
+ (VPMOVSXWDrm addr:$src)>;
+ def : Pat<(int_x86_sse41_pmovsxwd (bc_v8i16 (loadv2i64 addr:$src))),
+ (VPMOVSXWDrm addr:$src)>;
+
+ def : Pat<(int_x86_sse41_pmovsxdq (vzmovl_v2i64 addr:$src)),
+ (VPMOVSXDQrm addr:$src)>;
+ def : Pat<(int_x86_sse41_pmovsxdq (vzload_v2i64 addr:$src)),
+ (VPMOVSXDQrm addr:$src)>;
+ def : Pat<(int_x86_sse41_pmovsxdq (bc_v4i32 (loadv2i64 addr:$src))),
+ (VPMOVSXDQrm addr:$src)>;
+
+ def : Pat<(int_x86_sse41_pmovzxbw (vzmovl_v2i64 addr:$src)),
+ (VPMOVZXBWrm addr:$src)>;
+ def : Pat<(int_x86_sse41_pmovzxbw (vzload_v2i64 addr:$src)),
+ (VPMOVZXBWrm addr:$src)>;
+ def : Pat<(int_x86_sse41_pmovzxbw (bc_v16i8 (loadv2i64 addr:$src))),
+ (VPMOVZXBWrm addr:$src)>;
+
+ def : Pat<(int_x86_sse41_pmovzxwd (vzmovl_v2i64 addr:$src)),
+ (VPMOVZXWDrm addr:$src)>;
+ def : Pat<(int_x86_sse41_pmovzxwd (vzload_v2i64 addr:$src)),
+ (VPMOVZXWDrm addr:$src)>;
+ def : Pat<(int_x86_sse41_pmovzxwd (bc_v8i16 (loadv2i64 addr:$src))),
+ (VPMOVZXWDrm addr:$src)>;
+
+ def : Pat<(int_x86_sse41_pmovzxdq (vzmovl_v2i64 addr:$src)),
+ (VPMOVZXDQrm addr:$src)>;
+ def : Pat<(int_x86_sse41_pmovzxdq (vzload_v2i64 addr:$src)),
+ (VPMOVZXDQrm addr:$src)>;
+ def : Pat<(int_x86_sse41_pmovzxdq (bc_v4i32 (loadv2i64 addr:$src))),
+ (VPMOVZXDQrm addr:$src)>;
+}
+
+let Predicates = [UseSSE41] in {
+ // Common patterns involving scalar load.
+ def : Pat<(int_x86_sse41_pmovsxbw (vzmovl_v2i64 addr:$src)),
+ (PMOVSXBWrm addr:$src)>;
+ def : Pat<(int_x86_sse41_pmovsxbw (vzload_v2i64 addr:$src)),
+ (PMOVSXBWrm addr:$src)>;
+ def : Pat<(int_x86_sse41_pmovsxbw (bc_v16i8 (loadv2i64 addr:$src))),
+ (PMOVSXBWrm addr:$src)>;
+
+ def : Pat<(int_x86_sse41_pmovsxwd (vzmovl_v2i64 addr:$src)),
+ (PMOVSXWDrm addr:$src)>;
+ def : Pat<(int_x86_sse41_pmovsxwd (vzload_v2i64 addr:$src)),
+ (PMOVSXWDrm addr:$src)>;
+ def : Pat<(int_x86_sse41_pmovsxwd (bc_v8i16 (loadv2i64 addr:$src))),
+ (PMOVSXWDrm addr:$src)>;
+
+ def : Pat<(int_x86_sse41_pmovsxdq (vzmovl_v2i64 addr:$src)),
+ (PMOVSXDQrm addr:$src)>;
+ def : Pat<(int_x86_sse41_pmovsxdq (vzload_v2i64 addr:$src)),
+ (PMOVSXDQrm addr:$src)>;
+ def : Pat<(int_x86_sse41_pmovsxdq (bc_v4i32 (loadv2i64 addr:$src))),
+ (PMOVSXDQrm addr:$src)>;
+
+ def : Pat<(int_x86_sse41_pmovzxbw (vzmovl_v2i64 addr:$src)),
+ (PMOVZXBWrm addr:$src)>;
+ def : Pat<(int_x86_sse41_pmovzxbw (vzload_v2i64 addr:$src)),
+ (PMOVZXBWrm addr:$src)>;
+ def : Pat<(int_x86_sse41_pmovzxbw (bc_v16i8 (loadv2i64 addr:$src))),
+ (PMOVZXBWrm addr:$src)>;
+
+ def : Pat<(int_x86_sse41_pmovzxwd (vzmovl_v2i64 addr:$src)),
+ (PMOVZXWDrm addr:$src)>;
+ def : Pat<(int_x86_sse41_pmovzxwd (vzload_v2i64 addr:$src)),
+ (PMOVZXWDrm addr:$src)>;
+ def : Pat<(int_x86_sse41_pmovzxwd (bc_v8i16 (loadv2i64 addr:$src))),
+ (PMOVZXWDrm addr:$src)>;
+
+ def : Pat<(int_x86_sse41_pmovzxdq (vzmovl_v2i64 addr:$src)),
+ (PMOVZXDQrm addr:$src)>;
+ def : Pat<(int_x86_sse41_pmovzxdq (vzload_v2i64 addr:$src)),
+ (PMOVZXDQrm addr:$src)>;
+ def : Pat<(int_x86_sse41_pmovzxdq (bc_v4i32 (loadv2i64 addr:$src))),
+ (PMOVZXDQrm addr:$src)>;
+}
+
+let Predicates = [HasAVX2] in {
+ let AddedComplexity = 15 in {
+ def : Pat<(v4i64 (X86vzmovly (v4i32 VR128:$src))),
+ (VPMOVZXDQYrr VR128:$src)>;
+ def : Pat<(v8i32 (X86vzmovly (v8i16 VR128:$src))),
+ (VPMOVZXWDYrr VR128:$src)>;
+ }
+
+ def : Pat<(v4i64 (X86vsmovl (v4i32 VR128:$src))), (VPMOVSXDQYrr VR128:$src)>;
+ def : Pat<(v8i32 (X86vsmovl (v8i16 VR128:$src))), (VPMOVSXWDYrr VR128:$src)>;
+}
+
+let Predicates = [HasAVX] in {
+ def : Pat<(v2i64 (X86vsmovl (v4i32 VR128:$src))), (VPMOVSXDQrr VR128:$src)>;
+ def : Pat<(v4i32 (X86vsmovl (v8i16 VR128:$src))), (VPMOVSXWDrr VR128:$src)>;
+}
+
+let Predicates = [UseSSE41] in {
+ def : Pat<(v2i64 (X86vsmovl (v4i32 VR128:$src))), (PMOVSXDQrr VR128:$src)>;
+ def : Pat<(v4i32 (X86vsmovl (v8i16 VR128:$src))), (PMOVSXWDrr VR128:$src)>;
+}
+
+
+multiclass SS41I_binop_rm_int4<bits<8> opc, string OpcodeStr, Intrinsic IntId> {
+ def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (IntId VR128:$src))]>, OpSize;
+
+ def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst,
+ (IntId (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))]>,
+ OpSize;
+}
+
+multiclass SS41I_binop_rm_int8_y<bits<8> opc, string OpcodeStr,
+ Intrinsic IntId> {
+ def Yrr : SS48I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR256:$dst, (IntId VR128:$src))]>, OpSize;
+
+ def Yrm : SS48I<opc, MRMSrcMem, (outs VR256:$dst), (ins i32mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR256:$dst,
+ (IntId (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))]>,
+ OpSize;
+}
+
+let Predicates = [HasAVX] in {
+defm VPMOVSXBD : SS41I_binop_rm_int4<0x21, "vpmovsxbd", int_x86_sse41_pmovsxbd>,
+ VEX;
+defm VPMOVSXWQ : SS41I_binop_rm_int4<0x24, "vpmovsxwq", int_x86_sse41_pmovsxwq>,
+ VEX;
+defm VPMOVZXBD : SS41I_binop_rm_int4<0x31, "vpmovzxbd", int_x86_sse41_pmovzxbd>,
+ VEX;
+defm VPMOVZXWQ : SS41I_binop_rm_int4<0x34, "vpmovzxwq", int_x86_sse41_pmovzxwq>,
+ VEX;
+}
+
+let Predicates = [HasAVX2] in {
+defm VPMOVSXBD : SS41I_binop_rm_int8_y<0x21, "vpmovsxbd",
+ int_x86_avx2_pmovsxbd>, VEX, VEX_L;
+defm VPMOVSXWQ : SS41I_binop_rm_int8_y<0x24, "vpmovsxwq",
+ int_x86_avx2_pmovsxwq>, VEX, VEX_L;
+defm VPMOVZXBD : SS41I_binop_rm_int8_y<0x31, "vpmovzxbd",
+ int_x86_avx2_pmovzxbd>, VEX, VEX_L;
+defm VPMOVZXWQ : SS41I_binop_rm_int8_y<0x34, "vpmovzxwq",
+ int_x86_avx2_pmovzxwq>, VEX, VEX_L;
+}
+
+defm PMOVSXBD : SS41I_binop_rm_int4<0x21, "pmovsxbd", int_x86_sse41_pmovsxbd>;
+defm PMOVSXWQ : SS41I_binop_rm_int4<0x24, "pmovsxwq", int_x86_sse41_pmovsxwq>;
+defm PMOVZXBD : SS41I_binop_rm_int4<0x31, "pmovzxbd", int_x86_sse41_pmovzxbd>;
+defm PMOVZXWQ : SS41I_binop_rm_int4<0x34, "pmovzxwq", int_x86_sse41_pmovzxwq>;
+
+let Predicates = [HasAVX] in {
+ // Common patterns involving scalar load
+ def : Pat<(int_x86_sse41_pmovsxbd (vzmovl_v4i32 addr:$src)),
+ (VPMOVSXBDrm addr:$src)>;
+ def : Pat<(int_x86_sse41_pmovsxwq (vzmovl_v4i32 addr:$src)),
+ (VPMOVSXWQrm addr:$src)>;
+
+ def : Pat<(int_x86_sse41_pmovzxbd (vzmovl_v4i32 addr:$src)),
+ (VPMOVZXBDrm addr:$src)>;
+ def : Pat<(int_x86_sse41_pmovzxwq (vzmovl_v4i32 addr:$src)),
+ (VPMOVZXWQrm addr:$src)>;
+}
+
+let Predicates = [UseSSE41] in {
+ // Common patterns involving scalar load
+ def : Pat<(int_x86_sse41_pmovsxbd (vzmovl_v4i32 addr:$src)),
+ (PMOVSXBDrm addr:$src)>;
+ def : Pat<(int_x86_sse41_pmovsxwq (vzmovl_v4i32 addr:$src)),
+ (PMOVSXWQrm addr:$src)>;
+
+ def : Pat<(int_x86_sse41_pmovzxbd (vzmovl_v4i32 addr:$src)),
+ (PMOVZXBDrm addr:$src)>;
+ def : Pat<(int_x86_sse41_pmovzxwq (vzmovl_v4i32 addr:$src)),
+ (PMOVZXWQrm addr:$src)>;
+}
+
+multiclass SS41I_binop_rm_int2<bits<8> opc, string OpcodeStr, Intrinsic IntId> {
+ def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (IntId VR128:$src))]>, OpSize;
+
+ // Expecting a i16 load any extended to i32 value.
+ def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i16mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (IntId (bitconvert
+ (v4i32 (scalar_to_vector (loadi16_anyext addr:$src))))))]>,
+ OpSize;
+}
+
+multiclass SS41I_binop_rm_int4_y<bits<8> opc, string OpcodeStr,
+ Intrinsic IntId> {
+ def Yrr : SS48I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR256:$dst, (IntId VR128:$src))]>, OpSize;
+
+ // Expecting a i16 load any extended to i32 value.
+ def Yrm : SS48I<opc, MRMSrcMem, (outs VR256:$dst), (ins i16mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR256:$dst, (IntId (bitconvert
+ (v4i32 (scalar_to_vector (loadi32 addr:$src))))))]>,
+ OpSize;
+}
+
+let Predicates = [HasAVX] in {
+defm VPMOVSXBQ : SS41I_binop_rm_int2<0x22, "vpmovsxbq", int_x86_sse41_pmovsxbq>,
+ VEX;
+defm VPMOVZXBQ : SS41I_binop_rm_int2<0x32, "vpmovzxbq", int_x86_sse41_pmovzxbq>,
+ VEX;
+}
+let Predicates = [HasAVX2] in {
+defm VPMOVSXBQ : SS41I_binop_rm_int4_y<0x22, "vpmovsxbq",
+ int_x86_avx2_pmovsxbq>, VEX, VEX_L;
+defm VPMOVZXBQ : SS41I_binop_rm_int4_y<0x32, "vpmovzxbq",
+ int_x86_avx2_pmovzxbq>, VEX, VEX_L;
+}
+defm PMOVSXBQ : SS41I_binop_rm_int2<0x22, "pmovsxbq", int_x86_sse41_pmovsxbq>;
+defm PMOVZXBQ : SS41I_binop_rm_int2<0x32, "pmovzxbq", int_x86_sse41_pmovzxbq>;
+
+let Predicates = [HasAVX2] in {
+ def : Pat<(v16i16 (X86vsext (v16i8 VR128:$src))), (VPMOVSXBWYrr VR128:$src)>;
+ def : Pat<(v8i32 (X86vsext (v16i8 VR128:$src))), (VPMOVSXBDYrr VR128:$src)>;
+ def : Pat<(v4i64 (X86vsext (v16i8 VR128:$src))), (VPMOVSXBQYrr VR128:$src)>;
+
+ def : Pat<(v8i32 (X86vsext (v8i16 VR128:$src))), (VPMOVSXWDYrr VR128:$src)>;
+ def : Pat<(v4i64 (X86vsext (v8i16 VR128:$src))), (VPMOVSXWQYrr VR128:$src)>;
+
+ def : Pat<(v4i64 (X86vsext (v4i32 VR128:$src))), (VPMOVSXDQYrr VR128:$src)>;
+
+ def : Pat<(v16i16 (X86vsext (v32i8 VR256:$src))),
+ (VPMOVSXBWYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
+ def : Pat<(v8i32 (X86vsext (v32i8 VR256:$src))),
+ (VPMOVSXBDYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
+ def : Pat<(v4i64 (X86vsext (v32i8 VR256:$src))),
+ (VPMOVSXBQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
+
+ def : Pat<(v8i32 (X86vsext (v16i16 VR256:$src))),
+ (VPMOVSXWDYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
+ def : Pat<(v4i64 (X86vsext (v16i16 VR256:$src))),
+ (VPMOVSXWQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
+
+ def : Pat<(v4i64 (X86vsext (v8i32 VR256:$src))),
+ (VPMOVSXDQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
+
+ def : Pat<(v8i32 (X86vsmovl (v8i16 (bitconvert (v2i64 (load addr:$src)))))),
+ (VPMOVSXWDYrm addr:$src)>;
+ def : Pat<(v4i64 (X86vsmovl (v4i32 (bitconvert (v2i64 (load addr:$src)))))),
+ (VPMOVSXDQYrm addr:$src)>;
+
+ def : Pat<(v8i32 (X86vsext (v16i8 (bitconvert (v2i64
+ (scalar_to_vector (loadi64 addr:$src))))))),
+ (VPMOVSXBDYrm addr:$src)>;
+ def : Pat<(v8i32 (X86vsext (v16i8 (bitconvert (v2f64
+ (scalar_to_vector (loadf64 addr:$src))))))),
+ (VPMOVSXBDYrm addr:$src)>;
+
+ def : Pat<(v4i64 (X86vsext (v8i16 (bitconvert (v2i64
+ (scalar_to_vector (loadi64 addr:$src))))))),
+ (VPMOVSXWQYrm addr:$src)>;
+ def : Pat<(v4i64 (X86vsext (v8i16 (bitconvert (v2f64
+ (scalar_to_vector (loadf64 addr:$src))))))),
+ (VPMOVSXWQYrm addr:$src)>;
+
+ def : Pat<(v4i64 (X86vsext (v16i8 (bitconvert (v4i32
+ (scalar_to_vector (loadi32 addr:$src))))))),
+ (VPMOVSXBQYrm addr:$src)>;
+}
+
+let Predicates = [HasAVX] in {
+ // Common patterns involving scalar load
+ def : Pat<(int_x86_sse41_pmovsxbq
+ (bitconvert (v4i32 (X86vzmovl
+ (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
+ (VPMOVSXBQrm addr:$src)>;
+
+ def : Pat<(int_x86_sse41_pmovzxbq
+ (bitconvert (v4i32 (X86vzmovl
+ (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
+ (VPMOVZXBQrm addr:$src)>;
+}
+
+let Predicates = [UseSSE41] in {
+ def : Pat<(v8i16 (X86vsext (v16i8 VR128:$src))), (PMOVSXBWrr VR128:$src)>;
+ def : Pat<(v4i32 (X86vsext (v16i8 VR128:$src))), (PMOVSXBDrr VR128:$src)>;
+ def : Pat<(v2i64 (X86vsext (v16i8 VR128:$src))), (PMOVSXBQrr VR128:$src)>;
+
+ def : Pat<(v4i32 (X86vsext (v8i16 VR128:$src))), (PMOVSXWDrr VR128:$src)>;
+ def : Pat<(v2i64 (X86vsext (v8i16 VR128:$src))), (PMOVSXWQrr VR128:$src)>;
+
+ def : Pat<(v2i64 (X86vsext (v4i32 VR128:$src))), (PMOVSXDQrr VR128:$src)>;
+
+ // Common patterns involving scalar load
+ def : Pat<(int_x86_sse41_pmovsxbq
+ (bitconvert (v4i32 (X86vzmovl
+ (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
+ (PMOVSXBQrm addr:$src)>;
+
+ def : Pat<(int_x86_sse41_pmovzxbq
+ (bitconvert (v4i32 (X86vzmovl
+ (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
+ (PMOVZXBQrm addr:$src)>;
+
+ def : Pat<(v4i32 (X86vsext (v8i16 (bitconvert (v2i64
+ (scalar_to_vector (loadi64 addr:$src))))))),
+ (PMOVSXWDrm addr:$src)>;
+ def : Pat<(v4i32 (X86vsext (v8i16 (bitconvert (v2f64
+ (scalar_to_vector (loadf64 addr:$src))))))),
+ (PMOVSXWDrm addr:$src)>;
+ def : Pat<(v4i32 (X86vsext (v16i8 (bitconvert (v4i32
+ (scalar_to_vector (loadi32 addr:$src))))))),
+ (PMOVSXBDrm addr:$src)>;
+ def : Pat<(v2i64 (X86vsext (v8i16 (bitconvert (v4i32
+ (scalar_to_vector (loadi32 addr:$src))))))),
+ (PMOVSXWQrm addr:$src)>;
+ def : Pat<(v2i64 (X86vsext (v16i8 (bitconvert (v4i32
+ (scalar_to_vector (extloadi32i16 addr:$src))))))),
+ (PMOVSXBQrm addr:$src)>;
+ def : Pat<(v2i64 (X86vsext (v4i32 (bitconvert (v2i64
+ (scalar_to_vector (loadi64 addr:$src))))))),
+ (PMOVSXDQrm addr:$src)>;
+ def : Pat<(v2i64 (X86vsext (v4i32 (bitconvert (v2f64
+ (scalar_to_vector (loadf64 addr:$src))))))),
+ (PMOVSXDQrm addr:$src)>;
+ def : Pat<(v8i16 (X86vsext (v16i8 (bitconvert (v2i64
+ (scalar_to_vector (loadi64 addr:$src))))))),
+ (PMOVSXBWrm addr:$src)>;
+ def : Pat<(v8i16 (X86vsext (v16i8 (bitconvert (v2f64
+ (scalar_to_vector (loadf64 addr:$src))))))),
+ (PMOVSXBWrm addr:$src)>;
+}
+
+let Predicates = [HasAVX2] in {
+ def : Pat<(v16i16 (X86vzext (v16i8 VR128:$src))), (VPMOVZXBWYrr VR128:$src)>;
+ def : Pat<(v8i32 (X86vzext (v16i8 VR128:$src))), (VPMOVZXBDYrr VR128:$src)>;
+ def : Pat<(v4i64 (X86vzext (v16i8 VR128:$src))), (VPMOVZXBQYrr VR128:$src)>;
+
+ def : Pat<(v8i32 (X86vzext (v8i16 VR128:$src))), (VPMOVZXWDYrr VR128:$src)>;
+ def : Pat<(v4i64 (X86vzext (v8i16 VR128:$src))), (VPMOVZXWQYrr VR128:$src)>;
+
+ def : Pat<(v4i64 (X86vzext (v4i32 VR128:$src))), (VPMOVZXDQYrr VR128:$src)>;
+
+ def : Pat<(v16i16 (X86vzext (v32i8 VR256:$src))),
+ (VPMOVZXBWYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
+ def : Pat<(v8i32 (X86vzext (v32i8 VR256:$src))),
+ (VPMOVZXBDYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
+ def : Pat<(v4i64 (X86vzext (v32i8 VR256:$src))),
+ (VPMOVZXBQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
+
+ def : Pat<(v8i32 (X86vzext (v16i16 VR256:$src))),
+ (VPMOVZXWDYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
+ def : Pat<(v4i64 (X86vzext (v16i16 VR256:$src))),
+ (VPMOVZXWQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
+
+ def : Pat<(v4i64 (X86vzext (v8i32 VR256:$src))),
+ (VPMOVZXDQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
+}
+
+let Predicates = [HasAVX] in {
+ def : Pat<(v8i16 (X86vzext (v16i8 VR128:$src))), (VPMOVZXBWrr VR128:$src)>;
+ def : Pat<(v4i32 (X86vzext (v16i8 VR128:$src))), (VPMOVZXBDrr VR128:$src)>;
+ def : Pat<(v2i64 (X86vzext (v16i8 VR128:$src))), (VPMOVZXBQrr VR128:$src)>;
+
+ def : Pat<(v4i32 (X86vzext (v8i16 VR128:$src))), (VPMOVZXWDrr VR128:$src)>;
+ def : Pat<(v2i64 (X86vzext (v8i16 VR128:$src))), (VPMOVZXWQrr VR128:$src)>;
+
+ def : Pat<(v2i64 (X86vzext (v4i32 VR128:$src))), (VPMOVZXDQrr VR128:$src)>;
+
+ def : Pat<(v8i16 (X86vzext (v16i8 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
+ (VPMOVZXBWrm addr:$src)>;
+ def : Pat<(v8i16 (X86vzext (v16i8 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))),
+ (VPMOVZXBWrm addr:$src)>;
+ def : Pat<(v4i32 (X86vzext (v16i8 (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
+ (VPMOVZXBDrm addr:$src)>;
+ def : Pat<(v2i64 (X86vzext (v16i8 (bitconvert (v4i32 (scalar_to_vector (loadi16_anyext addr:$src))))))),
+ (VPMOVZXBQrm addr:$src)>;
+
+ def : Pat<(v4i32 (X86vzext (v8i16 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
+ (VPMOVZXWDrm addr:$src)>;
+ def : Pat<(v4i32 (X86vzext (v8i16 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))),
+ (VPMOVZXWDrm addr:$src)>;
+ def : Pat<(v2i64 (X86vzext (v8i16 (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
+ (VPMOVZXWQrm addr:$src)>;
+
+ def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
+ (VPMOVZXDQrm addr:$src)>;
+ def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))),
+ (VPMOVZXDQrm addr:$src)>;
+ def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2i64 (X86vzload addr:$src)))))),
+ (VPMOVZXDQrm addr:$src)>;
+
+ def : Pat<(v8i16 (X86vsext (v16i8 VR128:$src))), (VPMOVSXBWrr VR128:$src)>;
+ def : Pat<(v4i32 (X86vsext (v16i8 VR128:$src))), (VPMOVSXBDrr VR128:$src)>;
+ def : Pat<(v2i64 (X86vsext (v16i8 VR128:$src))), (VPMOVSXBQrr VR128:$src)>;
+
+ def : Pat<(v4i32 (X86vsext (v8i16 VR128:$src))), (VPMOVSXWDrr VR128:$src)>;
+ def : Pat<(v2i64 (X86vsext (v8i16 VR128:$src))), (VPMOVSXWQrr VR128:$src)>;
+
+ def : Pat<(v2i64 (X86vsext (v4i32 VR128:$src))), (VPMOVSXDQrr VR128:$src)>;
+
+ def : Pat<(v4i32 (X86vsext (v8i16 (bitconvert (v2i64
+ (scalar_to_vector (loadi64 addr:$src))))))),
+ (VPMOVSXWDrm addr:$src)>;
+ def : Pat<(v2i64 (X86vsext (v4i32 (bitconvert (v2i64
+ (scalar_to_vector (loadi64 addr:$src))))))),
+ (VPMOVSXDQrm addr:$src)>;
+ def : Pat<(v4i32 (X86vsext (v8i16 (bitconvert (v2f64
+ (scalar_to_vector (loadf64 addr:$src))))))),
+ (VPMOVSXWDrm addr:$src)>;
+ def : Pat<(v2i64 (X86vsext (v4i32 (bitconvert (v2f64
+ (scalar_to_vector (loadf64 addr:$src))))))),
+ (VPMOVSXDQrm addr:$src)>;
+ def : Pat<(v8i16 (X86vsext (v16i8 (bitconvert (v2i64
+ (scalar_to_vector (loadi64 addr:$src))))))),
+ (VPMOVSXBWrm addr:$src)>;
+ def : Pat<(v8i16 (X86vsext (v16i8 (bitconvert (v2f64
+ (scalar_to_vector (loadf64 addr:$src))))))),
+ (VPMOVSXBWrm addr:$src)>;
+
+ def : Pat<(v4i32 (X86vsext (v16i8 (bitconvert (v4i32
+ (scalar_to_vector (loadi32 addr:$src))))))),
+ (VPMOVSXBDrm addr:$src)>;
+ def : Pat<(v2i64 (X86vsext (v8i16 (bitconvert (v4i32
+ (scalar_to_vector (loadi32 addr:$src))))))),
+ (VPMOVSXWQrm addr:$src)>;
+ def : Pat<(v2i64 (X86vsext (v16i8 (bitconvert (v4i32
+ (scalar_to_vector (extloadi32i16 addr:$src))))))),
+ (VPMOVSXBQrm addr:$src)>;
+}
+
+let Predicates = [UseSSE41] in {
+ def : Pat<(v8i16 (X86vzext (v16i8 VR128:$src))), (PMOVZXBWrr VR128:$src)>;
+ def : Pat<(v4i32 (X86vzext (v16i8 VR128:$src))), (PMOVZXBDrr VR128:$src)>;
+ def : Pat<(v2i64 (X86vzext (v16i8 VR128:$src))), (PMOVZXBQrr VR128:$src)>;
+
+ def : Pat<(v4i32 (X86vzext (v8i16 VR128:$src))), (PMOVZXWDrr VR128:$src)>;
+ def : Pat<(v2i64 (X86vzext (v8i16 VR128:$src))), (PMOVZXWQrr VR128:$src)>;
+
+ def : Pat<(v2i64 (X86vzext (v4i32 VR128:$src))), (PMOVZXDQrr VR128:$src)>;
+
+ def : Pat<(v8i16 (X86vzext (v16i8 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
+ (PMOVZXBWrm addr:$src)>;
+ def : Pat<(v8i16 (X86vzext (v16i8 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))),
+ (PMOVZXBWrm addr:$src)>;
+ def : Pat<(v4i32 (X86vzext (v16i8 (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
+ (PMOVZXBDrm addr:$src)>;
+ def : Pat<(v2i64 (X86vzext (v16i8 (bitconvert (v4i32 (scalar_to_vector (loadi16_anyext addr:$src))))))),
+ (PMOVZXBQrm addr:$src)>;
+
+ def : Pat<(v4i32 (X86vzext (v8i16 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
+ (PMOVZXWDrm addr:$src)>;
+ def : Pat<(v4i32 (X86vzext (v8i16 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))),
+ (PMOVZXWDrm addr:$src)>;
+ def : Pat<(v2i64 (X86vzext (v8i16 (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
+ (PMOVZXWQrm addr:$src)>;
+
+ def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
+ (PMOVZXDQrm addr:$src)>;
+ def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))),
+ (PMOVZXDQrm addr:$src)>;
+ def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2i64 (X86vzload addr:$src)))))),
+ (PMOVZXDQrm addr:$src)>;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE4.1 - Extract Instructions
+//===----------------------------------------------------------------------===//
+
+/// SS41I_binop_ext8 - SSE 4.1 extract 8 bits to 32 bit reg or 8 bit mem
+multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> {
+ def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
+ (ins VR128:$src1, i32i8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set GR32:$dst, (X86pextrb (v16i8 VR128:$src1), imm:$src2))]>,
+ OpSize;
+ let neverHasSideEffects = 1, mayStore = 1 in
+ def mr : SS4AIi8<opc, MRMDestMem, (outs),
+ (ins i8mem:$dst, VR128:$src1, i32i8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, OpSize;
+// FIXME:
+// There's an AssertZext in the way of writing the store pattern
+// (store (i8 (trunc (X86pextrb (v16i8 VR128:$src1), imm:$src2))), addr:$dst)
+}
+
+let Predicates = [HasAVX] in {
+ defm VPEXTRB : SS41I_extract8<0x14, "vpextrb">, VEX;
+ def VPEXTRBrr64 : SS4AIi8<0x14, MRMDestReg, (outs GR64:$dst),
+ (ins VR128:$src1, i32i8imm:$src2),
+ "vpextrb\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, OpSize, VEX;
+}
+
+defm PEXTRB : SS41I_extract8<0x14, "pextrb">;
+
+
+/// SS41I_extract16 - SSE 4.1 extract 16 bits to memory destination
+multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> {
+ let neverHasSideEffects = 1, mayStore = 1 in
+ def mr : SS4AIi8<opc, MRMDestMem, (outs),
+ (ins i16mem:$dst, VR128:$src1, i32i8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, OpSize;
+// FIXME:
+// There's an AssertZext in the way of writing the store pattern
+// (store (i16 (trunc (X86pextrw (v16i8 VR128:$src1), imm:$src2))), addr:$dst)
+}
+
+let Predicates = [HasAVX] in
+ defm VPEXTRW : SS41I_extract16<0x15, "vpextrw">, VEX;
+
+defm PEXTRW : SS41I_extract16<0x15, "pextrw">;
+
+
+/// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination
+multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> {
+ def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
+ (ins VR128:$src1, i32i8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set GR32:$dst,
+ (extractelt (v4i32 VR128:$src1), imm:$src2))]>, OpSize;
+ def mr : SS4AIi8<opc, MRMDestMem, (outs),
+ (ins i32mem:$dst, VR128:$src1, i32i8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(store (extractelt (v4i32 VR128:$src1), imm:$src2),
+ addr:$dst)]>, OpSize;
+}
+
+let Predicates = [HasAVX] in
+ defm VPEXTRD : SS41I_extract32<0x16, "vpextrd">, VEX;
+
+defm PEXTRD : SS41I_extract32<0x16, "pextrd">;
+
+/// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination
+multiclass SS41I_extract64<bits<8> opc, string OpcodeStr> {
+ def rr : SS4AIi8<opc, MRMDestReg, (outs GR64:$dst),
+ (ins VR128:$src1, i32i8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set GR64:$dst,
+ (extractelt (v2i64 VR128:$src1), imm:$src2))]>, OpSize, REX_W;
+ def mr : SS4AIi8<opc, MRMDestMem, (outs),
+ (ins i64mem:$dst, VR128:$src1, i32i8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(store (extractelt (v2i64 VR128:$src1), imm:$src2),
+ addr:$dst)]>, OpSize, REX_W;
+}
+
+let Predicates = [HasAVX] in
+ defm VPEXTRQ : SS41I_extract64<0x16, "vpextrq">, VEX, VEX_W;
+
+defm PEXTRQ : SS41I_extract64<0x16, "pextrq">;
+
+/// SS41I_extractf32 - SSE 4.1 extract 32 bits fp value to int reg or memory
+/// destination
+multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr> {
+ def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
+ (ins VR128:$src1, i32i8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set GR32:$dst,
+ (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2))]>,
+ OpSize;
+ def mr : SS4AIi8<opc, MRMDestMem, (outs),
+ (ins f32mem:$dst, VR128:$src1, i32i8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(store (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2),
+ addr:$dst)]>, OpSize;
+}
+
+let ExeDomain = SSEPackedSingle in {
+ let Predicates = [HasAVX] in {
+ defm VEXTRACTPS : SS41I_extractf32<0x17, "vextractps">, VEX;
+ def VEXTRACTPSrr64 : SS4AIi8<0x17, MRMDestReg, (outs GR64:$dst),
+ (ins VR128:$src1, i32i8imm:$src2),
+ "vextractps \t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ []>, OpSize, VEX;
+ }
+ defm EXTRACTPS : SS41I_extractf32<0x17, "extractps">;
+}
+
+// Also match an EXTRACTPS store when the store is done as f32 instead of i32.
+def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)),
+ imm:$src2))),
+ addr:$dst),
+ (VEXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>,
+ Requires<[HasAVX]>;
+def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)),
+ imm:$src2))),
+ addr:$dst),
+ (EXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>,
+ Requires<[UseSSE41]>;
+
+//===----------------------------------------------------------------------===//
+// SSE4.1 - Insert Instructions
+//===----------------------------------------------------------------------===//
+
+multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> {
+ def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, GR32:$src2, i32i8imm:$src3),
+ !if(Is2Addr,
+ !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !strconcat(asm,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ [(set VR128:$dst,
+ (X86pinsrb VR128:$src1, GR32:$src2, imm:$src3))]>, OpSize;
+ def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i8mem:$src2, i32i8imm:$src3),
+ !if(Is2Addr,
+ !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !strconcat(asm,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ [(set VR128:$dst,
+ (X86pinsrb VR128:$src1, (extloadi8 addr:$src2),
+ imm:$src3))]>, OpSize;
+}
+
+let Predicates = [HasAVX] in
+ defm VPINSRB : SS41I_insert8<0x20, "vpinsrb", 0>, VEX_4V;
+let Constraints = "$src1 = $dst" in
+ defm PINSRB : SS41I_insert8<0x20, "pinsrb">;
+
+multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> {
+ def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, GR32:$src2, i32i8imm:$src3),
+ !if(Is2Addr,
+ !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !strconcat(asm,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ [(set VR128:$dst,
+ (v4i32 (insertelt VR128:$src1, GR32:$src2, imm:$src3)))]>,
+ OpSize;
+ def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i32mem:$src2, i32i8imm:$src3),
+ !if(Is2Addr,
+ !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !strconcat(asm,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ [(set VR128:$dst,
+ (v4i32 (insertelt VR128:$src1, (loadi32 addr:$src2),
+ imm:$src3)))]>, OpSize;
+}
+
+let Predicates = [HasAVX] in
+ defm VPINSRD : SS41I_insert32<0x22, "vpinsrd", 0>, VEX_4V;
+let Constraints = "$src1 = $dst" in
+ defm PINSRD : SS41I_insert32<0x22, "pinsrd">;
+
+multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> {
+ def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, GR64:$src2, i32i8imm:$src3),
+ !if(Is2Addr,
+ !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !strconcat(asm,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ [(set VR128:$dst,
+ (v2i64 (insertelt VR128:$src1, GR64:$src2, imm:$src3)))]>,
+ OpSize;
+ def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i64mem:$src2, i32i8imm:$src3),
+ !if(Is2Addr,
+ !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !strconcat(asm,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ [(set VR128:$dst,
+ (v2i64 (insertelt VR128:$src1, (loadi64 addr:$src2),
+ imm:$src3)))]>, OpSize;
+}
+
+let Predicates = [HasAVX] in
+ defm VPINSRQ : SS41I_insert64<0x22, "vpinsrq", 0>, VEX_4V, VEX_W;
+let Constraints = "$src1 = $dst" in
+ defm PINSRQ : SS41I_insert64<0x22, "pinsrq">, REX_W;
+
+// insertps has a few different modes, there's the first two here below which
+// are optimized inserts that won't zero arbitrary elements in the destination
+// vector. The next one matches the intrinsic and could zero arbitrary elements
+// in the target vector.
+multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1> {
+ def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, u32u8imm:$src3),
+ !if(Is2Addr,
+ !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !strconcat(asm,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ [(set VR128:$dst,
+ (X86insrtps VR128:$src1, VR128:$src2, imm:$src3))]>,
+ OpSize;
+ def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, f32mem:$src2, u32u8imm:$src3),
+ !if(Is2Addr,
+ !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !strconcat(asm,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ [(set VR128:$dst,
+ (X86insrtps VR128:$src1,
+ (v4f32 (scalar_to_vector (loadf32 addr:$src2))),
+ imm:$src3))]>, OpSize;
+}
+
+let ExeDomain = SSEPackedSingle in {
+ let Predicates = [HasAVX] in
+ defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>, VEX_4V;
+ let Constraints = "$src1 = $dst" in
+ defm INSERTPS : SS41I_insertf32<0x21, "insertps">;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE4.1 - Round Instructions
+//===----------------------------------------------------------------------===//
+
+multiclass sse41_fp_unop_rm<bits<8> opcps, bits<8> opcpd, string OpcodeStr,
+ X86MemOperand x86memop, RegisterClass RC,
+ PatFrag mem_frag32, PatFrag mem_frag64,
+ Intrinsic V4F32Int, Intrinsic V2F64Int> {
+let ExeDomain = SSEPackedSingle in {
+ // Intrinsic operation, reg.
+ // Vector intrinsic operation, reg
+ def PSr : SS4AIi8<opcps, MRMSrcReg,
+ (outs RC:$dst), (ins RC:$src1, i32i8imm:$src2),
+ !strconcat(OpcodeStr,
+ "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst, (V4F32Int RC:$src1, imm:$src2))]>,
+ OpSize;
+
+ // Vector intrinsic operation, mem
+ def PSm : SS4AIi8<opcps, MRMSrcMem,
+ (outs RC:$dst), (ins x86memop:$src1, i32i8imm:$src2),
+ !strconcat(OpcodeStr,
+ "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst,
+ (V4F32Int (mem_frag32 addr:$src1),imm:$src2))]>,
+ OpSize;
+} // ExeDomain = SSEPackedSingle
+
+let ExeDomain = SSEPackedDouble in {
+ // Vector intrinsic operation, reg
+ def PDr : SS4AIi8<opcpd, MRMSrcReg,
+ (outs RC:$dst), (ins RC:$src1, i32i8imm:$src2),
+ !strconcat(OpcodeStr,
+ "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst, (V2F64Int RC:$src1, imm:$src2))]>,
+ OpSize;
+
+ // Vector intrinsic operation, mem
+ def PDm : SS4AIi8<opcpd, MRMSrcMem,
+ (outs RC:$dst), (ins x86memop:$src1, i32i8imm:$src2),
+ !strconcat(OpcodeStr,
+ "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst,
+ (V2F64Int (mem_frag64 addr:$src1),imm:$src2))]>,
+ OpSize;
+} // ExeDomain = SSEPackedDouble
+}
+
+multiclass sse41_fp_binop_rm<bits<8> opcss, bits<8> opcsd,
+ string OpcodeStr,
+ Intrinsic F32Int,
+ Intrinsic F64Int, bit Is2Addr = 1> {
+let ExeDomain = GenericDomain in {
+ // Operation, reg.
+ let hasSideEffects = 0 in
+ def SSr : SS4AIi8<opcss, MRMSrcReg,
+ (outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32i8imm:$src3),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr,
+ "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !strconcat(OpcodeStr,
+ "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ []>, OpSize;
+
+ // Intrinsic operation, reg.
+ def SSr_Int : SS4AIi8<opcss, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32i8imm:$src3),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr,
+ "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !strconcat(OpcodeStr,
+ "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ [(set VR128:$dst, (F32Int VR128:$src1, VR128:$src2, imm:$src3))]>,
+ OpSize;
+
+ // Intrinsic operation, mem.
+ def SSm : SS4AIi8<opcss, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2, i32i8imm:$src3),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr,
+ "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !strconcat(OpcodeStr,
+ "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ [(set VR128:$dst,
+ (F32Int VR128:$src1, sse_load_f32:$src2, imm:$src3))]>,
+ OpSize;
+
+ // Operation, reg.
+ let hasSideEffects = 0 in
+ def SDr : SS4AIi8<opcsd, MRMSrcReg,
+ (outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32i8imm:$src3),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr,
+ "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !strconcat(OpcodeStr,
+ "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ []>, OpSize;
+
+ // Intrinsic operation, reg.
+ def SDr_Int : SS4AIi8<opcsd, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32i8imm:$src3),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr,
+ "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !strconcat(OpcodeStr,
+ "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ [(set VR128:$dst, (F64Int VR128:$src1, VR128:$src2, imm:$src3))]>,
+ OpSize;
+
+ // Intrinsic operation, mem.
+ def SDm : SS4AIi8<opcsd, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2, i32i8imm:$src3),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr,
+ "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !strconcat(OpcodeStr,
+ "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ [(set VR128:$dst,
+ (F64Int VR128:$src1, sse_load_f64:$src2, imm:$src3))]>,
+ OpSize;
+} // ExeDomain = GenericDomain
+}
+
+// FP round - roundss, roundps, roundsd, roundpd
+let Predicates = [HasAVX] in {
+ // Intrinsic form
+ defm VROUND : sse41_fp_unop_rm<0x08, 0x09, "vround", f128mem, VR128,
+ memopv4f32, memopv2f64,
+ int_x86_sse41_round_ps,
+ int_x86_sse41_round_pd>, VEX;
+ defm VROUNDY : sse41_fp_unop_rm<0x08, 0x09, "vround", f256mem, VR256,
+ memopv8f32, memopv4f64,
+ int_x86_avx_round_ps_256,
+ int_x86_avx_round_pd_256>, VEX, VEX_L;
+ defm VROUND : sse41_fp_binop_rm<0x0A, 0x0B, "vround",
+ int_x86_sse41_round_ss,
+ int_x86_sse41_round_sd, 0>, VEX_4V, VEX_LIG;
+
+ def : Pat<(ffloor FR32:$src),
+ (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x1))>;
+ def : Pat<(f64 (ffloor FR64:$src)),
+ (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x1))>;
+ def : Pat<(f32 (fnearbyint FR32:$src)),
+ (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>;
+ def : Pat<(f64 (fnearbyint FR64:$src)),
+ (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>;
+ def : Pat<(f32 (fceil FR32:$src)),
+ (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x2))>;
+ def : Pat<(f64 (fceil FR64:$src)),
+ (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x2))>;
+ def : Pat<(f32 (frint FR32:$src)),
+ (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>;
+ def : Pat<(f64 (frint FR64:$src)),
+ (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>;
+ def : Pat<(f32 (ftrunc FR32:$src)),
+ (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x3))>;
+ def : Pat<(f64 (ftrunc FR64:$src)),
+ (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x3))>;
+
+ def : Pat<(v4f32 (ffloor VR128:$src)),
+ (VROUNDPSr VR128:$src, (i32 0x1))>;
+ def : Pat<(v4f32 (fnearbyint VR128:$src)),
+ (VROUNDPSr VR128:$src, (i32 0xC))>;
+ def : Pat<(v4f32 (fceil VR128:$src)),
+ (VROUNDPSr VR128:$src, (i32 0x2))>;
+ def : Pat<(v4f32 (frint VR128:$src)),
+ (VROUNDPSr VR128:$src, (i32 0x4))>;
+ def : Pat<(v4f32 (ftrunc VR128:$src)),
+ (VROUNDPSr VR128:$src, (i32 0x3))>;
+
+ def : Pat<(v2f64 (ffloor VR128:$src)),
+ (VROUNDPDr VR128:$src, (i32 0x1))>;
+ def : Pat<(v2f64 (fnearbyint VR128:$src)),
+ (VROUNDPDr VR128:$src, (i32 0xC))>;
+ def : Pat<(v2f64 (fceil VR128:$src)),
+ (VROUNDPDr VR128:$src, (i32 0x2))>;
+ def : Pat<(v2f64 (frint VR128:$src)),
+ (VROUNDPDr VR128:$src, (i32 0x4))>;
+ def : Pat<(v2f64 (ftrunc VR128:$src)),
+ (VROUNDPDr VR128:$src, (i32 0x3))>;
+
+ def : Pat<(v8f32 (ffloor VR256:$src)),
+ (VROUNDYPSr VR256:$src, (i32 0x1))>;
+ def : Pat<(v8f32 (fnearbyint VR256:$src)),
+ (VROUNDYPSr VR256:$src, (i32 0xC))>;
+ def : Pat<(v8f32 (fceil VR256:$src)),
+ (VROUNDYPSr VR256:$src, (i32 0x2))>;
+ def : Pat<(v8f32 (frint VR256:$src)),
+ (VROUNDYPSr VR256:$src, (i32 0x4))>;
+ def : Pat<(v8f32 (ftrunc VR256:$src)),
+ (VROUNDYPSr VR256:$src, (i32 0x3))>;
+
+ def : Pat<(v4f64 (ffloor VR256:$src)),
+ (VROUNDYPDr VR256:$src, (i32 0x1))>;
+ def : Pat<(v4f64 (fnearbyint VR256:$src)),
+ (VROUNDYPDr VR256:$src, (i32 0xC))>;
+ def : Pat<(v4f64 (fceil VR256:$src)),
+ (VROUNDYPDr VR256:$src, (i32 0x2))>;
+ def : Pat<(v4f64 (frint VR256:$src)),
+ (VROUNDYPDr VR256:$src, (i32 0x4))>;
+ def : Pat<(v4f64 (ftrunc VR256:$src)),
+ (VROUNDYPDr VR256:$src, (i32 0x3))>;
+}
+
+defm ROUND : sse41_fp_unop_rm<0x08, 0x09, "round", f128mem, VR128,
+ memopv4f32, memopv2f64,
+ int_x86_sse41_round_ps, int_x86_sse41_round_pd>;
+let Constraints = "$src1 = $dst" in
+defm ROUND : sse41_fp_binop_rm<0x0A, 0x0B, "round",
+ int_x86_sse41_round_ss, int_x86_sse41_round_sd>;
+
+let Predicates = [UseSSE41] in {
+ def : Pat<(ffloor FR32:$src),
+ (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x1))>;
+ def : Pat<(f64 (ffloor FR64:$src)),
+ (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x1))>;
+ def : Pat<(f32 (fnearbyint FR32:$src)),
+ (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>;
+ def : Pat<(f64 (fnearbyint FR64:$src)),
+ (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>;
+ def : Pat<(f32 (fceil FR32:$src)),
+ (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x2))>;
+ def : Pat<(f64 (fceil FR64:$src)),
+ (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x2))>;
+ def : Pat<(f32 (frint FR32:$src)),
+ (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>;
+ def : Pat<(f64 (frint FR64:$src)),
+ (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>;
+ def : Pat<(f32 (ftrunc FR32:$src)),
+ (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x3))>;
+ def : Pat<(f64 (ftrunc FR64:$src)),
+ (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x3))>;
+
+ def : Pat<(v4f32 (ffloor VR128:$src)),
+ (ROUNDPSr VR128:$src, (i32 0x1))>;
+ def : Pat<(v4f32 (fnearbyint VR128:$src)),
+ (ROUNDPSr VR128:$src, (i32 0xC))>;
+ def : Pat<(v4f32 (fceil VR128:$src)),
+ (ROUNDPSr VR128:$src, (i32 0x2))>;
+ def : Pat<(v4f32 (frint VR128:$src)),
+ (ROUNDPSr VR128:$src, (i32 0x4))>;
+ def : Pat<(v4f32 (ftrunc VR128:$src)),
+ (ROUNDPSr VR128:$src, (i32 0x3))>;
+
+ def : Pat<(v2f64 (ffloor VR128:$src)),
+ (ROUNDPDr VR128:$src, (i32 0x1))>;
+ def : Pat<(v2f64 (fnearbyint VR128:$src)),
+ (ROUNDPDr VR128:$src, (i32 0xC))>;
+ def : Pat<(v2f64 (fceil VR128:$src)),
+ (ROUNDPDr VR128:$src, (i32 0x2))>;
+ def : Pat<(v2f64 (frint VR128:$src)),
+ (ROUNDPDr VR128:$src, (i32 0x4))>;
+ def : Pat<(v2f64 (ftrunc VR128:$src)),
+ (ROUNDPDr VR128:$src, (i32 0x3))>;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE4.1 - Packed Bit Test
+//===----------------------------------------------------------------------===//
+
+// ptest instruction we'll lower to this in X86ISelLowering primarily from
+// the intel intrinsic that corresponds to this.
+let Defs = [EFLAGS], Predicates = [HasAVX] in {
+def VPTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
+ "vptest\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
+ OpSize, VEX;
+def VPTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
+ "vptest\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS,(X86ptest VR128:$src1, (memopv2i64 addr:$src2)))]>,
+ OpSize, VEX;
+
+def VPTESTYrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR256:$src1, VR256:$src2),
+ "vptest\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86ptest VR256:$src1, (v4i64 VR256:$src2)))]>,
+ OpSize, VEX, VEX_L;
+def VPTESTYrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR256:$src1, i256mem:$src2),
+ "vptest\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS,(X86ptest VR256:$src1, (memopv4i64 addr:$src2)))]>,
+ OpSize, VEX, VEX_L;
+}
+
+let Defs = [EFLAGS] in {
+def PTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
+ "ptest\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
+ OpSize;
+def PTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
+ "ptest\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86ptest VR128:$src1, (memopv2i64 addr:$src2)))]>,
+ OpSize;
+}
+
+// The bit test instructions below are AVX only
+multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC,
+ X86MemOperand x86memop, PatFrag mem_frag, ValueType vt> {
+ def rr : SS48I<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
+ [(set EFLAGS, (X86testp RC:$src1, (vt RC:$src2)))]>, OpSize, VEX;
+ def rm : SS48I<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
+ [(set EFLAGS, (X86testp RC:$src1, (mem_frag addr:$src2)))]>,
+ OpSize, VEX;
+}
+
+let Defs = [EFLAGS], Predicates = [HasAVX] in {
+let ExeDomain = SSEPackedSingle in {
+defm VTESTPS : avx_bittest<0x0E, "vtestps", VR128, f128mem, memopv4f32, v4f32>;
+defm VTESTPSY : avx_bittest<0x0E, "vtestps", VR256, f256mem, memopv8f32, v8f32>,
+ VEX_L;
+}
+let ExeDomain = SSEPackedDouble in {
+defm VTESTPD : avx_bittest<0x0F, "vtestpd", VR128, f128mem, memopv2f64, v2f64>;
+defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, memopv4f64, v4f64>,
+ VEX_L;
+}
+}
+
+//===----------------------------------------------------------------------===//
+// SSE4.1 - Misc Instructions
+//===----------------------------------------------------------------------===//
+
+let Defs = [EFLAGS], Predicates = [HasPOPCNT] in {
+ def POPCNT16rr : I<0xB8, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+ "popcnt{w}\t{$src, $dst|$dst, $src}",
+ [(set GR16:$dst, (ctpop GR16:$src)), (implicit EFLAGS)]>,
+ OpSize, XS;
+ def POPCNT16rm : I<0xB8, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+ "popcnt{w}\t{$src, $dst|$dst, $src}",
+ [(set GR16:$dst, (ctpop (loadi16 addr:$src))),
+ (implicit EFLAGS)]>, OpSize, XS;
+
+ def POPCNT32rr : I<0xB8, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+ "popcnt{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (ctpop GR32:$src)), (implicit EFLAGS)]>,
+ XS;
+ def POPCNT32rm : I<0xB8, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+ "popcnt{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (ctpop (loadi32 addr:$src))),
+ (implicit EFLAGS)]>, XS;
+
+ def POPCNT64rr : RI<0xB8, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+ "popcnt{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (ctpop GR64:$src)), (implicit EFLAGS)]>,
+ XS;
+ def POPCNT64rm : RI<0xB8, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+ "popcnt{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (ctpop (loadi64 addr:$src))),
+ (implicit EFLAGS)]>, XS;
+}
+
+
+
+// SS41I_unop_rm_int_v16 - SSE 4.1 unary operator whose type is v8i16.
+multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr,
+ Intrinsic IntId128> {
+ def rr128 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (IntId128 VR128:$src))]>, OpSize;
+ def rm128 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins i128mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst,
+ (IntId128
+ (bitconvert (memopv2i64 addr:$src))))]>, OpSize;
+}
+
+let Predicates = [HasAVX] in
+defm VPHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "vphminposuw",
+ int_x86_sse41_phminposuw>, VEX;
+defm PHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "phminposuw",
+ int_x86_sse41_phminposuw>;
+
+/// SS41I_binop_rm_int - Simple SSE 4.1 binary operator
+multiclass SS41I_binop_rm_int<bits<8> opc, string OpcodeStr,
+ Intrinsic IntId128, bit Is2Addr = 1> {
+ let isCommutable = 1 in
+ def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>, OpSize;
+ def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set VR128:$dst,
+ (IntId128 VR128:$src1,
+ (bitconvert (memopv2i64 addr:$src2))))]>, OpSize;
+}
+
+/// SS41I_binop_rm_int_y - Simple SSE 4.1 binary operator
+multiclass SS41I_binop_rm_int_y<bits<8> opc, string OpcodeStr,
+ Intrinsic IntId256> {
+ let isCommutable = 1 in
+ def Yrr : SS48I<opc, MRMSrcReg, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst, (IntId256 VR256:$src1, VR256:$src2))]>, OpSize;
+ def Yrm : SS48I<opc, MRMSrcMem, (outs VR256:$dst),
+ (ins VR256:$src1, i256mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst,
+ (IntId256 VR256:$src1,
+ (bitconvert (memopv4i64 addr:$src2))))]>, OpSize;
+}
+
+
+/// SS48I_binop_rm - Simple SSE41 binary operator.
+multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
+ X86MemOperand x86memop, bit Is2Addr = 1> {
+ let isCommutable = 1 in
+ def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>, OpSize;
+ def rm : SS48I<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst,
+ (OpVT (OpNode RC:$src1,
+ (bitconvert (memop_frag addr:$src2)))))]>, OpSize;
+}
+
+let Predicates = [HasAVX] in {
+ let isCommutable = 0 in
+ defm VPACKUSDW : SS41I_binop_rm_int<0x2B, "vpackusdw", int_x86_sse41_packusdw,
+ 0>, VEX_4V;
+ defm VPMINSB : SS48I_binop_rm<0x38, "vpminsb", X86smin, v16i8, VR128,
+ memopv2i64, i128mem, 0>, VEX_4V;
+ defm VPMINSD : SS48I_binop_rm<0x39, "vpminsd", X86smin, v4i32, VR128,
+ memopv2i64, i128mem, 0>, VEX_4V;
+ defm VPMINUD : SS48I_binop_rm<0x3B, "vpminud", X86umin, v4i32, VR128,
+ memopv2i64, i128mem, 0>, VEX_4V;
+ defm VPMINUW : SS48I_binop_rm<0x3A, "vpminuw", X86umin, v8i16, VR128,
+ memopv2i64, i128mem, 0>, VEX_4V;
+ defm VPMAXSB : SS48I_binop_rm<0x3C, "vpmaxsb", X86smax, v16i8, VR128,
+ memopv2i64, i128mem, 0>, VEX_4V;
+ defm VPMAXSD : SS48I_binop_rm<0x3D, "vpmaxsd", X86smax, v4i32, VR128,
+ memopv2i64, i128mem, 0>, VEX_4V;
+ defm VPMAXUD : SS48I_binop_rm<0x3F, "vpmaxud", X86umax, v4i32, VR128,
+ memopv2i64, i128mem, 0>, VEX_4V;
+ defm VPMAXUW : SS48I_binop_rm<0x3E, "vpmaxuw", X86umax, v8i16, VR128,
+ memopv2i64, i128mem, 0>, VEX_4V;
+ defm VPMULDQ : SS41I_binop_rm_int<0x28, "vpmuldq", int_x86_sse41_pmuldq,
+ 0>, VEX_4V;
+}
+
+let Predicates = [HasAVX2] in {
+ let isCommutable = 0 in
+ defm VPACKUSDW : SS41I_binop_rm_int_y<0x2B, "vpackusdw",
+ int_x86_avx2_packusdw>, VEX_4V, VEX_L;
+ defm VPMINSBY : SS48I_binop_rm<0x38, "vpminsb", X86smin, v32i8, VR256,
+ memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
+ defm VPMINSDY : SS48I_binop_rm<0x39, "vpminsd", X86smin, v8i32, VR256,
+ memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
+ defm VPMINUDY : SS48I_binop_rm<0x3B, "vpminud", X86umin, v8i32, VR256,
+ memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
+ defm VPMINUWY : SS48I_binop_rm<0x3A, "vpminuw", X86umin, v16i16, VR256,
+ memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
+ defm VPMAXSBY : SS48I_binop_rm<0x3C, "vpmaxsb", X86smax, v32i8, VR256,
+ memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
+ defm VPMAXSDY : SS48I_binop_rm<0x3D, "vpmaxsd", X86smax, v8i32, VR256,
+ memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
+ defm VPMAXUDY : SS48I_binop_rm<0x3F, "vpmaxud", X86umax, v8i32, VR256,
+ memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
+ defm VPMAXUWY : SS48I_binop_rm<0x3E, "vpmaxuw", X86umax, v16i16, VR256,
+ memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
+ defm VPMULDQ : SS41I_binop_rm_int_y<0x28, "vpmuldq",
+ int_x86_avx2_pmul_dq>, VEX_4V, VEX_L;
+}
+
+let Constraints = "$src1 = $dst" in {
+ let isCommutable = 0 in
+ defm PACKUSDW : SS41I_binop_rm_int<0x2B, "packusdw", int_x86_sse41_packusdw>;
+ defm PMINSB : SS48I_binop_rm<0x38, "pminsb", X86smin, v16i8, VR128,
+ memopv2i64, i128mem>;
+ defm PMINSD : SS48I_binop_rm<0x39, "pminsd", X86smin, v4i32, VR128,
+ memopv2i64, i128mem>;
+ defm PMINUD : SS48I_binop_rm<0x3B, "pminud", X86umin, v4i32, VR128,
+ memopv2i64, i128mem>;
+ defm PMINUW : SS48I_binop_rm<0x3A, "pminuw", X86umin, v8i16, VR128,
+ memopv2i64, i128mem>;
+ defm PMAXSB : SS48I_binop_rm<0x3C, "pmaxsb", X86smax, v16i8, VR128,
+ memopv2i64, i128mem>;
+ defm PMAXSD : SS48I_binop_rm<0x3D, "pmaxsd", X86smax, v4i32, VR128,
+ memopv2i64, i128mem>;
+ defm PMAXUD : SS48I_binop_rm<0x3F, "pmaxud", X86umax, v4i32, VR128,
+ memopv2i64, i128mem>;
+ defm PMAXUW : SS48I_binop_rm<0x3E, "pmaxuw", X86umax, v8i16, VR128,
+ memopv2i64, i128mem>;
+ defm PMULDQ : SS41I_binop_rm_int<0x28, "pmuldq", int_x86_sse41_pmuldq>;
+}
+
+let Predicates = [HasAVX] in {
+ defm VPMULLD : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, VR128,
+ memopv2i64, i128mem, 0>, VEX_4V;
+ defm VPCMPEQQ : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v2i64, VR128,
+ memopv2i64, i128mem, 0>, VEX_4V;
+}
+let Predicates = [HasAVX2] in {
+ defm VPMULLDY : SS48I_binop_rm<0x40, "vpmulld", mul, v8i32, VR256,
+ memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
+ defm VPCMPEQQY : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v4i64, VR256,
+ memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
+}
+
+let Constraints = "$src1 = $dst" in {
+ defm PMULLD : SS48I_binop_rm<0x40, "pmulld", mul, v4i32, VR128,
+ memopv2i64, i128mem>;
+ defm PCMPEQQ : SS48I_binop_rm<0x29, "pcmpeqq", X86pcmpeq, v2i64, VR128,
+ memopv2i64, i128mem>;
+}
+
+/// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate
+multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
+ Intrinsic IntId, RegisterClass RC, PatFrag memop_frag,
+ X86MemOperand x86memop, bit Is2Addr = 1> {
+ let isCommutable = 1 in
+ def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, u32u8imm:$src3),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ [(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))]>,
+ OpSize;
+ def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop:$src2, u32u8imm:$src3),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ [(set RC:$dst,
+ (IntId RC:$src1,
+ (bitconvert (memop_frag addr:$src2)), imm:$src3))]>,
+ OpSize;
+}
+
+let Predicates = [HasAVX] in {
+ let isCommutable = 0 in {
+ let ExeDomain = SSEPackedSingle in {
+ defm VBLENDPS : SS41I_binop_rmi_int<0x0C, "vblendps", int_x86_sse41_blendps,
+ VR128, memopv4f32, f128mem, 0>, VEX_4V;
+ defm VBLENDPSY : SS41I_binop_rmi_int<0x0C, "vblendps",
+ int_x86_avx_blend_ps_256, VR256, memopv8f32,
+ f256mem, 0>, VEX_4V, VEX_L;
+ }
+ let ExeDomain = SSEPackedDouble in {
+ defm VBLENDPD : SS41I_binop_rmi_int<0x0D, "vblendpd", int_x86_sse41_blendpd,
+ VR128, memopv2f64, f128mem, 0>, VEX_4V;
+ defm VBLENDPDY : SS41I_binop_rmi_int<0x0D, "vblendpd",
+ int_x86_avx_blend_pd_256,VR256, memopv4f64,
+ f256mem, 0>, VEX_4V, VEX_L;
+ }
+ defm VPBLENDW : SS41I_binop_rmi_int<0x0E, "vpblendw", int_x86_sse41_pblendw,
+ VR128, memopv2i64, i128mem, 0>, VEX_4V;
+ defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw,
+ VR128, memopv2i64, i128mem, 0>, VEX_4V;
+ }
+ let ExeDomain = SSEPackedSingle in
+ defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps,
+ VR128, memopv4f32, f128mem, 0>, VEX_4V;
+ let ExeDomain = SSEPackedDouble in
+ defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd,
+ VR128, memopv2f64, f128mem, 0>, VEX_4V;
+ let ExeDomain = SSEPackedSingle in
+ defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256,
+ VR256, memopv8f32, i256mem, 0>, VEX_4V, VEX_L;
+}
+
+let Predicates = [HasAVX2] in {
+ let isCommutable = 0 in {
+ defm VPBLENDWY : SS41I_binop_rmi_int<0x0E, "vpblendw", int_x86_avx2_pblendw,
+ VR256, memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
+ defm VMPSADBWY : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_avx2_mpsadbw,
+ VR256, memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
+ }
+}
+
+let Constraints = "$src1 = $dst" in {
+ let isCommutable = 0 in {
+ let ExeDomain = SSEPackedSingle in
+ defm BLENDPS : SS41I_binop_rmi_int<0x0C, "blendps", int_x86_sse41_blendps,
+ VR128, memopv4f32, f128mem>;
+ let ExeDomain = SSEPackedDouble in
+ defm BLENDPD : SS41I_binop_rmi_int<0x0D, "blendpd", int_x86_sse41_blendpd,
+ VR128, memopv2f64, f128mem>;
+ defm PBLENDW : SS41I_binop_rmi_int<0x0E, "pblendw", int_x86_sse41_pblendw,
+ VR128, memopv2i64, i128mem>;
+ defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw,
+ VR128, memopv2i64, i128mem>;
+ }
+ let ExeDomain = SSEPackedSingle in
+ defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps,
+ VR128, memopv4f32, f128mem>;
+ let ExeDomain = SSEPackedDouble in
+ defm DPPD : SS41I_binop_rmi_int<0x41, "dppd", int_x86_sse41_dppd,
+ VR128, memopv2f64, f128mem>;
+}
+
+/// SS41I_quaternary_int_avx - AVX SSE 4.1 with 4 operators
+multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr,
+ RegisterClass RC, X86MemOperand x86memop,
+ PatFrag mem_frag, Intrinsic IntId> {
+ def rr : Ii8<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, RC:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set RC:$dst, (IntId RC:$src1, RC:$src2, RC:$src3))],
+ NoItinerary, SSEPackedInt>, OpSize, TA, VEX_4V, VEX_I8IMM;
+
+ def rm : Ii8<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop:$src2, RC:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set RC:$dst,
+ (IntId RC:$src1, (bitconvert (mem_frag addr:$src2)),
+ RC:$src3))],
+ NoItinerary, SSEPackedInt>, OpSize, TA, VEX_4V, VEX_I8IMM;
+}
+
+let Predicates = [HasAVX] in {
+let ExeDomain = SSEPackedDouble in {
+defm VBLENDVPD : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR128, f128mem,
+ memopv2f64, int_x86_sse41_blendvpd>;
+defm VBLENDVPDY : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR256, f256mem,
+ memopv4f64, int_x86_avx_blendv_pd_256>, VEX_L;
+} // ExeDomain = SSEPackedDouble
+let ExeDomain = SSEPackedSingle in {
+defm VBLENDVPS : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR128, f128mem,
+ memopv4f32, int_x86_sse41_blendvps>;
+defm VBLENDVPSY : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR256, f256mem,
+ memopv8f32, int_x86_avx_blendv_ps_256>, VEX_L;
+} // ExeDomain = SSEPackedSingle
+defm VPBLENDVB : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR128, i128mem,
+ memopv2i64, int_x86_sse41_pblendvb>;
+}
+
+let Predicates = [HasAVX2] in {
+defm VPBLENDVBY : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR256, i256mem,
+ memopv4i64, int_x86_avx2_pblendvb>, VEX_L;
+}
+
+let Predicates = [HasAVX] in {
+ def : Pat<(v16i8 (vselect (v16i8 VR128:$mask), (v16i8 VR128:$src1),
+ (v16i8 VR128:$src2))),
+ (VPBLENDVBrr VR128:$src2, VR128:$src1, VR128:$mask)>;
+ def : Pat<(v4i32 (vselect (v4i32 VR128:$mask), (v4i32 VR128:$src1),
+ (v4i32 VR128:$src2))),
+ (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>;
+ def : Pat<(v4f32 (vselect (v4i32 VR128:$mask), (v4f32 VR128:$src1),
+ (v4f32 VR128:$src2))),
+ (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>;
+ def : Pat<(v2i64 (vselect (v2i64 VR128:$mask), (v2i64 VR128:$src1),
+ (v2i64 VR128:$src2))),
+ (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>;
+ def : Pat<(v2f64 (vselect (v2i64 VR128:$mask), (v2f64 VR128:$src1),
+ (v2f64 VR128:$src2))),
+ (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>;
+ def : Pat<(v8i32 (vselect (v8i32 VR256:$mask), (v8i32 VR256:$src1),
+ (v8i32 VR256:$src2))),
+ (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
+ def : Pat<(v8f32 (vselect (v8i32 VR256:$mask), (v8f32 VR256:$src1),
+ (v8f32 VR256:$src2))),
+ (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
+ def : Pat<(v4i64 (vselect (v4i64 VR256:$mask), (v4i64 VR256:$src1),
+ (v4i64 VR256:$src2))),
+ (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
+ def : Pat<(v4f64 (vselect (v4i64 VR256:$mask), (v4f64 VR256:$src1),
+ (v4f64 VR256:$src2))),
+ (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
+
+ def : Pat<(v8f32 (X86Blendi (v8f32 VR256:$src1), (v8f32 VR256:$src2),
+ (imm:$mask))),
+ (VBLENDPSYrri VR256:$src1, VR256:$src2, imm:$mask)>;
+ def : Pat<(v4f64 (X86Blendi (v4f64 VR256:$src1), (v4f64 VR256:$src2),
+ (imm:$mask))),
+ (VBLENDPDYrri VR256:$src1, VR256:$src2, imm:$mask)>;
+
+ def : Pat<(v8i16 (X86Blendi (v8i16 VR128:$src1), (v8i16 VR128:$src2),
+ (imm:$mask))),
+ (VPBLENDWrri VR128:$src1, VR128:$src2, imm:$mask)>;
+ def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$src1), (v4f32 VR128:$src2),
+ (imm:$mask))),
+ (VBLENDPSrri VR128:$src1, VR128:$src2, imm:$mask)>;
+ def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$src1), (v2f64 VR128:$src2),
+ (imm:$mask))),
+ (VBLENDPDrri VR128:$src1, VR128:$src2, imm:$mask)>;
+}
+
+let Predicates = [HasAVX2] in {
+ def : Pat<(v32i8 (vselect (v32i8 VR256:$mask), (v32i8 VR256:$src1),
+ (v32i8 VR256:$src2))),
+ (VPBLENDVBYrr VR256:$src1, VR256:$src2, VR256:$mask)>;
+ def : Pat<(v16i16 (X86Blendi (v16i16 VR256:$src1), (v16i16 VR256:$src2),
+ (imm:$mask))),
+ (VPBLENDWYrri VR256:$src1, VR256:$src2, imm:$mask)>;
+}
+
+/// SS41I_ternary_int - SSE 4.1 ternary operator
+let Uses = [XMM0], Constraints = "$src1 = $dst" in {
+ multiclass SS41I_ternary_int<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
+ X86MemOperand x86memop, Intrinsic IntId> {
+ def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0))]>,
+ OpSize;
+
+ def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, x86memop:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst,
+ (IntId VR128:$src1,
+ (bitconvert (mem_frag addr:$src2)), XMM0))]>, OpSize;
+ }
+}
+
+let ExeDomain = SSEPackedDouble in
+defm BLENDVPD : SS41I_ternary_int<0x15, "blendvpd", memopv2f64, f128mem,
+ int_x86_sse41_blendvpd>;
+let ExeDomain = SSEPackedSingle in
+defm BLENDVPS : SS41I_ternary_int<0x14, "blendvps", memopv4f32, f128mem,
+ int_x86_sse41_blendvps>;
+defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", memopv2i64, i128mem,
+ int_x86_sse41_pblendvb>;
+
+// Aliases with the implicit xmm0 argument
+def : InstAlias<"blendvpd\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}",
+ (BLENDVPDrr0 VR128:$dst, VR128:$src2)>;
+def : InstAlias<"blendvpd\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}",
+ (BLENDVPDrm0 VR128:$dst, f128mem:$src2)>;
+def : InstAlias<"blendvps\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}",
+ (BLENDVPSrr0 VR128:$dst, VR128:$src2)>;
+def : InstAlias<"blendvps\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}",
+ (BLENDVPSrm0 VR128:$dst, f128mem:$src2)>;
+def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}",
+ (PBLENDVBrr0 VR128:$dst, VR128:$src2)>;
+def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}",
+ (PBLENDVBrm0 VR128:$dst, i128mem:$src2)>;
+
+let Predicates = [UseSSE41] in {
+ def : Pat<(v16i8 (vselect (v16i8 XMM0), (v16i8 VR128:$src1),
+ (v16i8 VR128:$src2))),
+ (PBLENDVBrr0 VR128:$src2, VR128:$src1)>;
+ def : Pat<(v4i32 (vselect (v4i32 XMM0), (v4i32 VR128:$src1),
+ (v4i32 VR128:$src2))),
+ (BLENDVPSrr0 VR128:$src2, VR128:$src1)>;
+ def : Pat<(v4f32 (vselect (v4i32 XMM0), (v4f32 VR128:$src1),
+ (v4f32 VR128:$src2))),
+ (BLENDVPSrr0 VR128:$src2, VR128:$src1)>;
+ def : Pat<(v2i64 (vselect (v2i64 XMM0), (v2i64 VR128:$src1),
+ (v2i64 VR128:$src2))),
+ (BLENDVPDrr0 VR128:$src2, VR128:$src1)>;
+ def : Pat<(v2f64 (vselect (v2i64 XMM0), (v2f64 VR128:$src1),
+ (v2f64 VR128:$src2))),
+ (BLENDVPDrr0 VR128:$src2, VR128:$src1)>;
+
+ def : Pat<(v8i16 (X86Blendi (v8i16 VR128:$src1), (v8i16 VR128:$src2),
+ (imm:$mask))),
+ (PBLENDWrri VR128:$src1, VR128:$src2, imm:$mask)>;
+ def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$src1), (v4f32 VR128:$src2),
+ (imm:$mask))),
+ (BLENDPSrri VR128:$src1, VR128:$src2, imm:$mask)>;
+ def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$src1), (v2f64 VR128:$src2),
+ (imm:$mask))),
+ (BLENDPDrri VR128:$src1, VR128:$src2, imm:$mask)>;
+
+}
+
+let Predicates = [HasAVX] in
+def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+ "vmovntdqa\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>,
+ OpSize, VEX;
+let Predicates = [HasAVX2] in
+def VMOVNTDQAYrm : SS48I<0x2A, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
+ "vmovntdqa\t{$src, $dst|$dst, $src}",
+ [(set VR256:$dst, (int_x86_avx2_movntdqa addr:$src))]>,
+ OpSize, VEX, VEX_L;
+def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+ "movntdqa\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>,
+ OpSize;
+
+//===----------------------------------------------------------------------===//
+// SSE4.2 - Compare Instructions
+//===----------------------------------------------------------------------===//
+
+/// SS42I_binop_rm - Simple SSE 4.2 binary operator
+multiclass SS42I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
+ X86MemOperand x86memop, bit Is2Addr = 1> {
+ def rr : SS428I<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
+ OpSize;
+ def rm : SS428I<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst,
+ (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>, OpSize;
+}
+
+let Predicates = [HasAVX] in
+ defm VPCMPGTQ : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v2i64, VR128,
+ memopv2i64, i128mem, 0>, VEX_4V;
+
+let Predicates = [HasAVX2] in
+ defm VPCMPGTQY : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v4i64, VR256,
+ memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
+
+let Constraints = "$src1 = $dst" in
+ defm PCMPGTQ : SS42I_binop_rm<0x37, "pcmpgtq", X86pcmpgt, v2i64, VR128,
+ memopv2i64, i128mem>;
+
+//===----------------------------------------------------------------------===//
+// SSE4.2 - String/text Processing Instructions
+//===----------------------------------------------------------------------===//
+
+// Packed Compare Implicit Length Strings, Return Mask
+multiclass pseudo_pcmpistrm<string asm> {
+ def REG : PseudoI<(outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, i8imm:$src3),
+ [(set VR128:$dst, (int_x86_sse42_pcmpistrm128 VR128:$src1, VR128:$src2,
+ imm:$src3))]>;
+ def MEM : PseudoI<(outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
+ [(set VR128:$dst, (int_x86_sse42_pcmpistrm128 VR128:$src1,
+ (bc_v16i8 (memopv2i64 addr:$src2)), imm:$src3))]>;
+}
+
+let Defs = [EFLAGS], usesCustomInserter = 1 in {
+ defm VPCMPISTRM128 : pseudo_pcmpistrm<"#VPCMPISTRM128">, Requires<[HasAVX]>;
+ defm PCMPISTRM128 : pseudo_pcmpistrm<"#PCMPISTRM128">, Requires<[UseSSE42]>;
+}
+
+multiclass pcmpistrm_SS42AI<string asm> {
+ def rr : SS42AI<0x62, MRMSrcReg, (outs),
+ (ins VR128:$src1, VR128:$src2, i8imm:$src3),
+ !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
+ []>, OpSize;
+ let mayLoad = 1 in
+ def rm :SS42AI<0x62, MRMSrcMem, (outs),
+ (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
+ !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
+ []>, OpSize;
+}
+
+let Defs = [XMM0, EFLAGS], neverHasSideEffects = 1 in {
+ let Predicates = [HasAVX] in
+ defm VPCMPISTRM128 : pcmpistrm_SS42AI<"vpcmpistrm">, VEX;
+ defm PCMPISTRM128 : pcmpistrm_SS42AI<"pcmpistrm"> ;
+}
+
+// Packed Compare Explicit Length Strings, Return Mask
+multiclass pseudo_pcmpestrm<string asm> {
+ def REG : PseudoI<(outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src3, i8imm:$src5),
+ [(set VR128:$dst, (int_x86_sse42_pcmpestrm128
+ VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5))]>;
+ def MEM : PseudoI<(outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src3, i8imm:$src5),
+ [(set VR128:$dst, (int_x86_sse42_pcmpestrm128 VR128:$src1, EAX,
+ (bc_v16i8 (memopv2i64 addr:$src3)), EDX, imm:$src5))]>;
+}
+
+let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in {
+ defm VPCMPESTRM128 : pseudo_pcmpestrm<"#VPCMPESTRM128">, Requires<[HasAVX]>;
+ defm PCMPESTRM128 : pseudo_pcmpestrm<"#PCMPESTRM128">, Requires<[UseSSE42]>;
+}
+
+multiclass SS42AI_pcmpestrm<string asm> {
+ def rr : SS42AI<0x60, MRMSrcReg, (outs),
+ (ins VR128:$src1, VR128:$src3, i8imm:$src5),
+ !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
+ []>, OpSize;
+ let mayLoad = 1 in
+ def rm : SS42AI<0x60, MRMSrcMem, (outs),
+ (ins VR128:$src1, i128mem:$src3, i8imm:$src5),
+ !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
+ []>, OpSize;
+}
+
+let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], neverHasSideEffects = 1 in {
+ let Predicates = [HasAVX] in
+ defm VPCMPESTRM128 : SS42AI_pcmpestrm<"vpcmpestrm">, VEX;
+ defm PCMPESTRM128 : SS42AI_pcmpestrm<"pcmpestrm">;
+}
+
+// Packed Compare Implicit Length Strings, Return Index
+multiclass pseudo_pcmpistri<string asm> {
+ def REG : PseudoI<(outs GR32:$dst),
+ (ins VR128:$src1, VR128:$src2, i8imm:$src3),
+ [(set GR32:$dst, EFLAGS,
+ (X86pcmpistri VR128:$src1, VR128:$src2, imm:$src3))]>;
+ def MEM : PseudoI<(outs GR32:$dst),
+ (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
+ [(set GR32:$dst, EFLAGS, (X86pcmpistri VR128:$src1,
+ (bc_v16i8 (memopv2i64 addr:$src2)), imm:$src3))]>;
+}
+
+let Defs = [EFLAGS], usesCustomInserter = 1 in {
+ defm VPCMPISTRI : pseudo_pcmpistri<"#VPCMPISTRI">, Requires<[HasAVX]>;
+ defm PCMPISTRI : pseudo_pcmpistri<"#PCMPISTRI">, Requires<[UseSSE42]>;
+}
+
+multiclass SS42AI_pcmpistri<string asm> {
+ def rr : SS42AI<0x63, MRMSrcReg, (outs),
+ (ins VR128:$src1, VR128:$src2, i8imm:$src3),
+ !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
+ []>, OpSize;
+ let mayLoad = 1 in
+ def rm : SS42AI<0x63, MRMSrcMem, (outs),
+ (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
+ !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
+ []>, OpSize;
+}
+
+let Defs = [ECX, EFLAGS], neverHasSideEffects = 1 in {
+ let Predicates = [HasAVX] in
+ defm VPCMPISTRI : SS42AI_pcmpistri<"vpcmpistri">, VEX;
+ defm PCMPISTRI : SS42AI_pcmpistri<"pcmpistri">;
+}
+
+// Packed Compare Explicit Length Strings, Return Index
+multiclass pseudo_pcmpestri<string asm> {
+ def REG : PseudoI<(outs GR32:$dst),
+ (ins VR128:$src1, VR128:$src3, i8imm:$src5),
+ [(set GR32:$dst, EFLAGS,
+ (X86pcmpestri VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5))]>;
+ def MEM : PseudoI<(outs GR32:$dst),
+ (ins VR128:$src1, i128mem:$src3, i8imm:$src5),
+ [(set GR32:$dst, EFLAGS,
+ (X86pcmpestri VR128:$src1, EAX, (bc_v16i8 (memopv2i64 addr:$src3)), EDX,
+ imm:$src5))]>;
+}
+
+let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in {
+ defm VPCMPESTRI : pseudo_pcmpestri<"#VPCMPESTRI">, Requires<[HasAVX]>;
+ defm PCMPESTRI : pseudo_pcmpestri<"#PCMPESTRI">, Requires<[UseSSE42]>;
+}
+
+multiclass SS42AI_pcmpestri<string asm> {
+ def rr : SS42AI<0x61, MRMSrcReg, (outs),
+ (ins VR128:$src1, VR128:$src3, i8imm:$src5),
+ !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
+ []>, OpSize;
+ let mayLoad = 1 in
+ def rm : SS42AI<0x61, MRMSrcMem, (outs),
+ (ins VR128:$src1, i128mem:$src3, i8imm:$src5),
+ !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
+ []>, OpSize;
+}
+
+let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], neverHasSideEffects = 1 in {
+ let Predicates = [HasAVX] in
+ defm VPCMPESTRI : SS42AI_pcmpestri<"vpcmpestri">, VEX;
+ defm PCMPESTRI : SS42AI_pcmpestri<"pcmpestri">;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE4.2 - CRC Instructions
+//===----------------------------------------------------------------------===//
+
+// No CRC instructions have AVX equivalents
+
+// crc intrinsic instruction
+// This set of instructions are only rm, the only difference is the size
+// of r and m.
+let Constraints = "$src1 = $dst" in {
+ def CRC32r32m8 : SS42FI<0xF0, MRMSrcMem, (outs GR32:$dst),
+ (ins GR32:$src1, i8mem:$src2),
+ "crc32{b} \t{$src2, $src1|$src1, $src2}",
+ [(set GR32:$dst,
+ (int_x86_sse42_crc32_32_8 GR32:$src1,
+ (load addr:$src2)))]>;
+ def CRC32r32r8 : SS42FI<0xF0, MRMSrcReg, (outs GR32:$dst),
+ (ins GR32:$src1, GR8:$src2),
+ "crc32{b} \t{$src2, $src1|$src1, $src2}",
+ [(set GR32:$dst,
+ (int_x86_sse42_crc32_32_8 GR32:$src1, GR8:$src2))]>;
+ def CRC32r32m16 : SS42FI<0xF1, MRMSrcMem, (outs GR32:$dst),
+ (ins GR32:$src1, i16mem:$src2),
+ "crc32{w} \t{$src2, $src1|$src1, $src2}",
+ [(set GR32:$dst,
+ (int_x86_sse42_crc32_32_16 GR32:$src1,
+ (load addr:$src2)))]>,
+ OpSize;
+ def CRC32r32r16 : SS42FI<0xF1, MRMSrcReg, (outs GR32:$dst),
+ (ins GR32:$src1, GR16:$src2),
+ "crc32{w} \t{$src2, $src1|$src1, $src2}",
+ [(set GR32:$dst,
+ (int_x86_sse42_crc32_32_16 GR32:$src1, GR16:$src2))]>,
+ OpSize;
+ def CRC32r32m32 : SS42FI<0xF1, MRMSrcMem, (outs GR32:$dst),
+ (ins GR32:$src1, i32mem:$src2),
+ "crc32{l} \t{$src2, $src1|$src1, $src2}",
+ [(set GR32:$dst,
+ (int_x86_sse42_crc32_32_32 GR32:$src1,
+ (load addr:$src2)))]>;
+ def CRC32r32r32 : SS42FI<0xF1, MRMSrcReg, (outs GR32:$dst),
+ (ins GR32:$src1, GR32:$src2),
+ "crc32{l} \t{$src2, $src1|$src1, $src2}",
+ [(set GR32:$dst,
+ (int_x86_sse42_crc32_32_32 GR32:$src1, GR32:$src2))]>;
+ def CRC32r64m8 : SS42FI<0xF0, MRMSrcMem, (outs GR64:$dst),
+ (ins GR64:$src1, i8mem:$src2),
+ "crc32{b} \t{$src2, $src1|$src1, $src2}",
+ [(set GR64:$dst,
+ (int_x86_sse42_crc32_64_8 GR64:$src1,
+ (load addr:$src2)))]>,
+ REX_W;
+ def CRC32r64r8 : SS42FI<0xF0, MRMSrcReg, (outs GR64:$dst),
+ (ins GR64:$src1, GR8:$src2),
+ "crc32{b} \t{$src2, $src1|$src1, $src2}",
+ [(set GR64:$dst,
+ (int_x86_sse42_crc32_64_8 GR64:$src1, GR8:$src2))]>,
+ REX_W;
+ def CRC32r64m64 : SS42FI<0xF1, MRMSrcMem, (outs GR64:$dst),
+ (ins GR64:$src1, i64mem:$src2),
+ "crc32{q} \t{$src2, $src1|$src1, $src2}",
+ [(set GR64:$dst,
+ (int_x86_sse42_crc32_64_64 GR64:$src1,
+ (load addr:$src2)))]>,
+ REX_W;
+ def CRC32r64r64 : SS42FI<0xF1, MRMSrcReg, (outs GR64:$dst),
+ (ins GR64:$src1, GR64:$src2),
+ "crc32{q} \t{$src2, $src1|$src1, $src2}",
+ [(set GR64:$dst,
+ (int_x86_sse42_crc32_64_64 GR64:$src1, GR64:$src2))]>,
+ REX_W;
+}
+
+//===----------------------------------------------------------------------===//
+// AES-NI Instructions
+//===----------------------------------------------------------------------===//
+
+multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr,
+ Intrinsic IntId128, bit Is2Addr = 1> {
+ def rr : AES8I<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
+ OpSize;
+ def rm : AES8I<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set VR128:$dst,
+ (IntId128 VR128:$src1, (memopv2i64 addr:$src2)))]>, OpSize;
+}
+
+// Perform One Round of an AES Encryption/Decryption Flow
+let Predicates = [HasAVX, HasAES] in {
+ defm VAESENC : AESI_binop_rm_int<0xDC, "vaesenc",
+ int_x86_aesni_aesenc, 0>, VEX_4V;
+ defm VAESENCLAST : AESI_binop_rm_int<0xDD, "vaesenclast",
+ int_x86_aesni_aesenclast, 0>, VEX_4V;
+ defm VAESDEC : AESI_binop_rm_int<0xDE, "vaesdec",
+ int_x86_aesni_aesdec, 0>, VEX_4V;
+ defm VAESDECLAST : AESI_binop_rm_int<0xDF, "vaesdeclast",
+ int_x86_aesni_aesdeclast, 0>, VEX_4V;
+}
+
+let Constraints = "$src1 = $dst" in {
+ defm AESENC : AESI_binop_rm_int<0xDC, "aesenc",
+ int_x86_aesni_aesenc>;
+ defm AESENCLAST : AESI_binop_rm_int<0xDD, "aesenclast",
+ int_x86_aesni_aesenclast>;
+ defm AESDEC : AESI_binop_rm_int<0xDE, "aesdec",
+ int_x86_aesni_aesdec>;
+ defm AESDECLAST : AESI_binop_rm_int<0xDF, "aesdeclast",
+ int_x86_aesni_aesdeclast>;
+}
+
+// Perform the AES InvMixColumn Transformation
+let Predicates = [HasAVX, HasAES] in {
+ def VAESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1),
+ "vaesimc\t{$src1, $dst|$dst, $src1}",
+ [(set VR128:$dst,
+ (int_x86_aesni_aesimc VR128:$src1))]>,
+ OpSize, VEX;
+ def VAESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
+ (ins i128mem:$src1),
+ "vaesimc\t{$src1, $dst|$dst, $src1}",
+ [(set VR128:$dst, (int_x86_aesni_aesimc (memopv2i64 addr:$src1)))]>,
+ OpSize, VEX;
+}
+def AESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1),
+ "aesimc\t{$src1, $dst|$dst, $src1}",
+ [(set VR128:$dst,
+ (int_x86_aesni_aesimc VR128:$src1))]>,
+ OpSize;
+def AESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
+ (ins i128mem:$src1),
+ "aesimc\t{$src1, $dst|$dst, $src1}",
+ [(set VR128:$dst, (int_x86_aesni_aesimc (memopv2i64 addr:$src1)))]>,
+ OpSize;
+
+// AES Round Key Generation Assist
+let Predicates = [HasAVX, HasAES] in {
+ def VAESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, i8imm:$src2),
+ "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst,
+ (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>,
+ OpSize, VEX;
+ def VAESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
+ (ins i128mem:$src1, i8imm:$src2),
+ "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst,
+ (int_x86_aesni_aeskeygenassist (memopv2i64 addr:$src1), imm:$src2))]>,
+ OpSize, VEX;
+}
+def AESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, i8imm:$src2),
+ "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst,
+ (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>,
+ OpSize;
+def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
+ (ins i128mem:$src1, i8imm:$src2),
+ "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst,
+ (int_x86_aesni_aeskeygenassist (memopv2i64 addr:$src1), imm:$src2))]>,
+ OpSize;
+
+//===----------------------------------------------------------------------===//
+// PCLMUL Instructions
+//===----------------------------------------------------------------------===//
+
+// AVX carry-less Multiplication instructions
+def VPCLMULQDQrr : AVXPCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, i8imm:$src3),
+ "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ [(set VR128:$dst,
+ (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))]>;
+
+def VPCLMULQDQrm : AVXPCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
+ "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ [(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1,
+ (memopv2i64 addr:$src2), imm:$src3))]>;
+
+// Carry-less Multiplication instructions
+let Constraints = "$src1 = $dst" in {
+def PCLMULQDQrr : PCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, i8imm:$src3),
+ "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set VR128:$dst,
+ (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))]>;
+
+def PCLMULQDQrm : PCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
+ "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1,
+ (memopv2i64 addr:$src2), imm:$src3))]>;
+} // Constraints = "$src1 = $dst"
+
+
+multiclass pclmul_alias<string asm, int immop> {
+ def : InstAlias<!strconcat("pclmul", asm, "dq {$src, $dst|$dst, $src}"),
+ (PCLMULQDQrr VR128:$dst, VR128:$src, immop)>;
+
+ def : InstAlias<!strconcat("pclmul", asm, "dq {$src, $dst|$dst, $src}"),
+ (PCLMULQDQrm VR128:$dst, i128mem:$src, immop)>;
+
+ def : InstAlias<!strconcat("vpclmul", asm,
+ "dq {$src2, $src1, $dst|$dst, $src1, $src2}"),
+ (VPCLMULQDQrr VR128:$dst, VR128:$src1, VR128:$src2, immop)>;
+
+ def : InstAlias<!strconcat("vpclmul", asm,
+ "dq {$src2, $src1, $dst|$dst, $src1, $src2}"),
+ (VPCLMULQDQrm VR128:$dst, VR128:$src1, i128mem:$src2, immop)>;
+}
+defm : pclmul_alias<"hqhq", 0x11>;
+defm : pclmul_alias<"hqlq", 0x01>;
+defm : pclmul_alias<"lqhq", 0x10>;
+defm : pclmul_alias<"lqlq", 0x00>;
+
+//===----------------------------------------------------------------------===//
+// SSE4A Instructions
+//===----------------------------------------------------------------------===//
+
+let Predicates = [HasSSE4A] in {
+
+let Constraints = "$src = $dst" in {
+def EXTRQI : Ii8<0x78, MRM0r, (outs VR128:$dst),
+ (ins VR128:$src, i8imm:$len, i8imm:$idx),
+ "extrq\t{$idx, $len, $src|$src, $len, $idx}",
+ [(set VR128:$dst, (int_x86_sse4a_extrqi VR128:$src, imm:$len,
+ imm:$idx))]>, TB, OpSize;
+def EXTRQ : I<0x79, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src, VR128:$mask),
+ "extrq\t{$mask, $src|$src, $mask}",
+ [(set VR128:$dst, (int_x86_sse4a_extrq VR128:$src,
+ VR128:$mask))]>, TB, OpSize;
+
+def INSERTQI : Ii8<0x78, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src, VR128:$src2, i8imm:$len, i8imm:$idx),
+ "insertq\t{$idx, $len, $src2, $src|$src, $src2, $len, $idx}",
+ [(set VR128:$dst, (int_x86_sse4a_insertqi VR128:$src,
+ VR128:$src2, imm:$len, imm:$idx))]>, XD;
+def INSERTQ : I<0x79, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src, VR128:$mask),
+ "insertq\t{$mask, $src|$src, $mask}",
+ [(set VR128:$dst, (int_x86_sse4a_insertq VR128:$src,
+ VR128:$mask))]>, XD;
+}
+
+def MOVNTSS : I<0x2B, MRMDestMem, (outs), (ins f32mem:$dst, VR128:$src),
+ "movntss\t{$src, $dst|$dst, $src}",
+ [(int_x86_sse4a_movnt_ss addr:$dst, VR128:$src)]>, XS;
+
+def MOVNTSD : I<0x2B, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+ "movntsd\t{$src, $dst|$dst, $src}",
+ [(int_x86_sse4a_movnt_sd addr:$dst, VR128:$src)]>, XD;
+}
+
+//===----------------------------------------------------------------------===//
+// AVX Instructions
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// VBROADCAST - Load from memory and broadcast to all elements of the
+// destination operand
+//
+class avx_broadcast<bits<8> opc, string OpcodeStr, RegisterClass RC,
+ X86MemOperand x86memop, Intrinsic Int> :
+ AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set RC:$dst, (Int addr:$src))]>, VEX;
+
+// AVX2 adds register forms
+class avx2_broadcast_reg<bits<8> opc, string OpcodeStr, RegisterClass RC,
+ Intrinsic Int> :
+ AVX28I<opc, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set RC:$dst, (Int VR128:$src))]>, VEX;
+
+let ExeDomain = SSEPackedSingle in {
+ def VBROADCASTSSrm : avx_broadcast<0x18, "vbroadcastss", VR128, f32mem,
+ int_x86_avx_vbroadcast_ss>;
+ def VBROADCASTSSYrm : avx_broadcast<0x18, "vbroadcastss", VR256, f32mem,
+ int_x86_avx_vbroadcast_ss_256>, VEX_L;
+}
+let ExeDomain = SSEPackedDouble in
+def VBROADCASTSDYrm : avx_broadcast<0x19, "vbroadcastsd", VR256, f64mem,
+ int_x86_avx_vbroadcast_sd_256>, VEX_L;
+def VBROADCASTF128 : avx_broadcast<0x1A, "vbroadcastf128", VR256, f128mem,
+ int_x86_avx_vbroadcastf128_pd_256>, VEX_L;
+
+let ExeDomain = SSEPackedSingle in {
+ def VBROADCASTSSrr : avx2_broadcast_reg<0x18, "vbroadcastss", VR128,
+ int_x86_avx2_vbroadcast_ss_ps>;
+ def VBROADCASTSSYrr : avx2_broadcast_reg<0x18, "vbroadcastss", VR256,
+ int_x86_avx2_vbroadcast_ss_ps_256>, VEX_L;
+}
+let ExeDomain = SSEPackedDouble in
+def VBROADCASTSDYrr : avx2_broadcast_reg<0x19, "vbroadcastsd", VR256,
+ int_x86_avx2_vbroadcast_sd_pd_256>, VEX_L;
+
+let Predicates = [HasAVX2] in
+def VBROADCASTI128 : avx_broadcast<0x5A, "vbroadcasti128", VR256, i128mem,
+ int_x86_avx2_vbroadcasti128>, VEX_L;
+
+let Predicates = [HasAVX] in
+def : Pat<(int_x86_avx_vbroadcastf128_ps_256 addr:$src),
+ (VBROADCASTF128 addr:$src)>;
+
+
+//===----------------------------------------------------------------------===//
+// VINSERTF128 - Insert packed floating-point values
+//
+let neverHasSideEffects = 1, ExeDomain = SSEPackedSingle in {
+def VINSERTF128rr : AVXAIi8<0x18, MRMSrcReg, (outs VR256:$dst),
+ (ins VR256:$src1, VR128:$src2, i8imm:$src3),
+ "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ []>, VEX_4V, VEX_L;
+let mayLoad = 1 in
+def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst),
+ (ins VR256:$src1, f128mem:$src2, i8imm:$src3),
+ "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ []>, VEX_4V, VEX_L;
+}
+
+let Predicates = [HasAVX] in {
+def : Pat<(vinsertf128_insert:$ins (v8f32 VR256:$src1), (v4f32 VR128:$src2),
+ (iPTR imm)),
+ (VINSERTF128rr VR256:$src1, VR128:$src2,
+ (INSERT_get_vinsertf128_imm VR256:$ins))>;
+def : Pat<(vinsertf128_insert:$ins (v4f64 VR256:$src1), (v2f64 VR128:$src2),
+ (iPTR imm)),
+ (VINSERTF128rr VR256:$src1, VR128:$src2,
+ (INSERT_get_vinsertf128_imm VR256:$ins))>;
+
+def : Pat<(vinsertf128_insert:$ins (v8f32 VR256:$src1), (memopv4f32 addr:$src2),
+ (iPTR imm)),
+ (VINSERTF128rm VR256:$src1, addr:$src2,
+ (INSERT_get_vinsertf128_imm VR256:$ins))>;
+def : Pat<(vinsertf128_insert:$ins (v4f64 VR256:$src1), (memopv2f64 addr:$src2),
+ (iPTR imm)),
+ (VINSERTF128rm VR256:$src1, addr:$src2,
+ (INSERT_get_vinsertf128_imm VR256:$ins))>;
+}
+
+let Predicates = [HasAVX1Only] in {
+def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2),
+ (iPTR imm)),
+ (VINSERTF128rr VR256:$src1, VR128:$src2,
+ (INSERT_get_vinsertf128_imm VR256:$ins))>;
+def : Pat<(vinsertf128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2),
+ (iPTR imm)),
+ (VINSERTF128rr VR256:$src1, VR128:$src2,
+ (INSERT_get_vinsertf128_imm VR256:$ins))>;
+def : Pat<(vinsertf128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2),
+ (iPTR imm)),
+ (VINSERTF128rr VR256:$src1, VR128:$src2,
+ (INSERT_get_vinsertf128_imm VR256:$ins))>;
+def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2),
+ (iPTR imm)),
+ (VINSERTF128rr VR256:$src1, VR128:$src2,
+ (INSERT_get_vinsertf128_imm VR256:$ins))>;
+
+def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (memopv2i64 addr:$src2),
+ (iPTR imm)),
+ (VINSERTF128rm VR256:$src1, addr:$src2,
+ (INSERT_get_vinsertf128_imm VR256:$ins))>;
+def : Pat<(vinsertf128_insert:$ins (v8i32 VR256:$src1),
+ (bc_v4i32 (memopv2i64 addr:$src2)),
+ (iPTR imm)),
+ (VINSERTF128rm VR256:$src1, addr:$src2,
+ (INSERT_get_vinsertf128_imm VR256:$ins))>;
+def : Pat<(vinsertf128_insert:$ins (v32i8 VR256:$src1),
+ (bc_v16i8 (memopv2i64 addr:$src2)),
+ (iPTR imm)),
+ (VINSERTF128rm VR256:$src1, addr:$src2,
+ (INSERT_get_vinsertf128_imm VR256:$ins))>;
+def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1),
+ (bc_v8i16 (memopv2i64 addr:$src2)),
+ (iPTR imm)),
+ (VINSERTF128rm VR256:$src1, addr:$src2,
+ (INSERT_get_vinsertf128_imm VR256:$ins))>;
+}
+
+//===----------------------------------------------------------------------===//
+// VEXTRACTF128 - Extract packed floating-point values
+//
+let neverHasSideEffects = 1, ExeDomain = SSEPackedSingle in {
+def VEXTRACTF128rr : AVXAIi8<0x19, MRMDestReg, (outs VR128:$dst),
+ (ins VR256:$src1, i8imm:$src2),
+ "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ []>, VEX, VEX_L;
+let mayStore = 1 in
+def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs),
+ (ins f128mem:$dst, VR256:$src1, i8imm:$src2),
+ "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ []>, VEX, VEX_L;
+}
+
+// AVX1 patterns
+let Predicates = [HasAVX] in {
+def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
+ (v4f32 (VEXTRACTF128rr
+ (v8f32 VR256:$src1),
+ (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
+def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
+ (v2f64 (VEXTRACTF128rr
+ (v4f64 VR256:$src1),
+ (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
+
+def : Pat<(alignedstore (v4f32 (vextractf128_extract:$ext (v8f32 VR256:$src1),
+ (iPTR imm))), addr:$dst),
+ (VEXTRACTF128mr addr:$dst, VR256:$src1,
+ (EXTRACT_get_vextractf128_imm VR128:$ext))>;
+def : Pat<(alignedstore (v2f64 (vextractf128_extract:$ext (v4f64 VR256:$src1),
+ (iPTR imm))), addr:$dst),
+ (VEXTRACTF128mr addr:$dst, VR256:$src1,
+ (EXTRACT_get_vextractf128_imm VR128:$ext))>;
+}
+
+let Predicates = [HasAVX1Only] in {
+def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
+ (v2i64 (VEXTRACTF128rr
+ (v4i64 VR256:$src1),
+ (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
+def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
+ (v4i32 (VEXTRACTF128rr
+ (v8i32 VR256:$src1),
+ (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
+def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
+ (v8i16 (VEXTRACTF128rr
+ (v16i16 VR256:$src1),
+ (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
+def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
+ (v16i8 (VEXTRACTF128rr
+ (v32i8 VR256:$src1),
+ (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
+
+def : Pat<(alignedstore (v2i64 (vextractf128_extract:$ext (v4i64 VR256:$src1),
+ (iPTR imm))), addr:$dst),
+ (VEXTRACTF128mr addr:$dst, VR256:$src1,
+ (EXTRACT_get_vextractf128_imm VR128:$ext))>;
+def : Pat<(alignedstore (v4i32 (vextractf128_extract:$ext (v8i32 VR256:$src1),
+ (iPTR imm))), addr:$dst),
+ (VEXTRACTF128mr addr:$dst, VR256:$src1,
+ (EXTRACT_get_vextractf128_imm VR128:$ext))>;
+def : Pat<(alignedstore (v8i16 (vextractf128_extract:$ext (v16i16 VR256:$src1),
+ (iPTR imm))), addr:$dst),
+ (VEXTRACTF128mr addr:$dst, VR256:$src1,
+ (EXTRACT_get_vextractf128_imm VR128:$ext))>;
+def : Pat<(alignedstore (v16i8 (vextractf128_extract:$ext (v32i8 VR256:$src1),
+ (iPTR imm))), addr:$dst),
+ (VEXTRACTF128mr addr:$dst, VR256:$src1,
+ (EXTRACT_get_vextractf128_imm VR128:$ext))>;
+}
+
+//===----------------------------------------------------------------------===//
+// VMASKMOV - Conditional SIMD Packed Loads and Stores
+//
+multiclass avx_movmask_rm<bits<8> opc_rm, bits<8> opc_mr, string OpcodeStr,
+ Intrinsic IntLd, Intrinsic IntLd256,
+ Intrinsic IntSt, Intrinsic IntSt256> {
+ def rm : AVX8I<opc_rm, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, f128mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst, (IntLd addr:$src2, VR128:$src1))]>,
+ VEX_4V;
+ def Yrm : AVX8I<opc_rm, MRMSrcMem, (outs VR256:$dst),
+ (ins VR256:$src1, f256mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
+ VEX_4V, VEX_L;
+ def mr : AVX8I<opc_mr, MRMDestMem, (outs),
+ (ins f128mem:$dst, VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(IntSt addr:$dst, VR128:$src1, VR128:$src2)]>, VEX_4V;
+ def Ymr : AVX8I<opc_mr, MRMDestMem, (outs),
+ (ins f256mem:$dst, VR256:$src1, VR256:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>, VEX_4V, VEX_L;
+}
+
+let ExeDomain = SSEPackedSingle in
+defm VMASKMOVPS : avx_movmask_rm<0x2C, 0x2E, "vmaskmovps",
+ int_x86_avx_maskload_ps,
+ int_x86_avx_maskload_ps_256,
+ int_x86_avx_maskstore_ps,
+ int_x86_avx_maskstore_ps_256>;
+let ExeDomain = SSEPackedDouble in
+defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd",
+ int_x86_avx_maskload_pd,
+ int_x86_avx_maskload_pd_256,
+ int_x86_avx_maskstore_pd,
+ int_x86_avx_maskstore_pd_256>;
+
+//===----------------------------------------------------------------------===//
+// VPERMIL - Permute Single and Double Floating-Point Values
+//
+multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr,
+ RegisterClass RC, X86MemOperand x86memop_f,
+ X86MemOperand x86memop_i, PatFrag i_frag,
+ Intrinsic IntVar, ValueType vt> {
+ def rr : AVX8I<opc_rm, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst, (IntVar RC:$src1, RC:$src2))]>, VEX_4V;
+ def rm : AVX8I<opc_rm, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop_i:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst, (IntVar RC:$src1,
+ (bitconvert (i_frag addr:$src2))))]>, VEX_4V;
+
+ def ri : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, i8imm:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst, (vt (X86VPermilp RC:$src1, (i8 imm:$src2))))]>, VEX;
+ def mi : AVXAIi8<opc_rmi, MRMSrcMem, (outs RC:$dst),
+ (ins x86memop_f:$src1, i8imm:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst,
+ (vt (X86VPermilp (memop addr:$src1), (i8 imm:$src2))))]>, VEX;
+}
+
+let ExeDomain = SSEPackedSingle in {
+ defm VPERMILPS : avx_permil<0x0C, 0x04, "vpermilps", VR128, f128mem, i128mem,
+ memopv2i64, int_x86_avx_vpermilvar_ps, v4f32>;
+ defm VPERMILPSY : avx_permil<0x0C, 0x04, "vpermilps", VR256, f256mem, i256mem,
+ memopv4i64, int_x86_avx_vpermilvar_ps_256, v8f32>, VEX_L;
+}
+let ExeDomain = SSEPackedDouble in {
+ defm VPERMILPD : avx_permil<0x0D, 0x05, "vpermilpd", VR128, f128mem, i128mem,
+ memopv2i64, int_x86_avx_vpermilvar_pd, v2f64>;
+ defm VPERMILPDY : avx_permil<0x0D, 0x05, "vpermilpd", VR256, f256mem, i256mem,
+ memopv4i64, int_x86_avx_vpermilvar_pd_256, v4f64>, VEX_L;
+}
+
+let Predicates = [HasAVX] in {
+def : Pat<(v8i32 (X86VPermilp VR256:$src1, (i8 imm:$imm))),
+ (VPERMILPSYri VR256:$src1, imm:$imm)>;
+def : Pat<(v4i64 (X86VPermilp VR256:$src1, (i8 imm:$imm))),
+ (VPERMILPDYri VR256:$src1, imm:$imm)>;
+def : Pat<(v8i32 (X86VPermilp (bc_v8i32 (memopv4i64 addr:$src1)),
+ (i8 imm:$imm))),
+ (VPERMILPSYmi addr:$src1, imm:$imm)>;
+def : Pat<(v4i64 (X86VPermilp (memopv4i64 addr:$src1), (i8 imm:$imm))),
+ (VPERMILPDYmi addr:$src1, imm:$imm)>;
+
+def : Pat<(v2i64 (X86VPermilp VR128:$src1, (i8 imm:$imm))),
+ (VPERMILPDri VR128:$src1, imm:$imm)>;
+def : Pat<(v2i64 (X86VPermilp (memopv2i64 addr:$src1), (i8 imm:$imm))),
+ (VPERMILPDmi addr:$src1, imm:$imm)>;
+}
+
+//===----------------------------------------------------------------------===//
+// VPERM2F128 - Permute Floating-Point Values in 128-bit chunks
+//
+let ExeDomain = SSEPackedSingle in {
+def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2, i8imm:$src3),
+ "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ [(set VR256:$dst, (v8f32 (X86VPerm2x128 VR256:$src1, VR256:$src2,
+ (i8 imm:$src3))))]>, VEX_4V, VEX_L;
+def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst),
+ (ins VR256:$src1, f256mem:$src2, i8imm:$src3),
+ "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (memopv8f32 addr:$src2),
+ (i8 imm:$src3)))]>, VEX_4V, VEX_L;
+}
+
+let Predicates = [HasAVX] in {
+def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+ (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
+def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1,
+ (memopv4f64 addr:$src2), (i8 imm:$imm))),
+ (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
+}
+
+let Predicates = [HasAVX1Only] in {
+def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+ (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
+def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+ (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
+def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+ (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
+def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+ (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
+
+def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1,
+ (bc_v8i32 (memopv4i64 addr:$src2)), (i8 imm:$imm))),
+ (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
+def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1,
+ (memopv4i64 addr:$src2), (i8 imm:$imm))),
+ (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
+def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1,
+ (bc_v32i8 (memopv4i64 addr:$src2)), (i8 imm:$imm))),
+ (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
+def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1,
+ (bc_v16i16 (memopv4i64 addr:$src2)), (i8 imm:$imm))),
+ (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
+}
+
+//===----------------------------------------------------------------------===//
+// VZERO - Zero YMM registers
+//
+let Defs = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7,
+ YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15] in {
+ // Zero All YMM registers
+ def VZEROALL : I<0x77, RawFrm, (outs), (ins), "vzeroall",
+ [(int_x86_avx_vzeroall)]>, TB, VEX, VEX_L, Requires<[HasAVX]>;
+
+ // Zero Upper bits of YMM registers
+ def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper",
+ [(int_x86_avx_vzeroupper)]>, TB, VEX, Requires<[HasAVX]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Half precision conversion instructions
+//===----------------------------------------------------------------------===//
+multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> {
+ def rr : I<0x13, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
+ "vcvtph2ps\t{$src, $dst|$dst, $src}",
+ [(set RC:$dst, (Int VR128:$src))]>,
+ T8, OpSize, VEX;
+ let neverHasSideEffects = 1, mayLoad = 1 in
+ def rm : I<0x13, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+ "vcvtph2ps\t{$src, $dst|$dst, $src}", []>, T8, OpSize, VEX;
+}
+
+multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> {
+ def rr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst),
+ (ins RC:$src1, i32i8imm:$src2),
+ "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst, (Int RC:$src1, imm:$src2))]>,
+ TA, OpSize, VEX;
+ let neverHasSideEffects = 1, mayStore = 1 in
+ def mr : Ii8<0x1D, MRMDestMem, (outs),
+ (ins x86memop:$dst, RC:$src1, i32i8imm:$src2),
+ "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
+ TA, OpSize, VEX;
+}
+
+let Predicates = [HasAVX, HasF16C] in {
+ defm VCVTPH2PS : f16c_ph2ps<VR128, f64mem, int_x86_vcvtph2ps_128>;
+ defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, int_x86_vcvtph2ps_256>, VEX_L;
+ defm VCVTPS2PH : f16c_ps2ph<VR128, f64mem, int_x86_vcvtps2ph_128>;
+ defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, int_x86_vcvtps2ph_256>, VEX_L;
+}
+
+//===----------------------------------------------------------------------===//
+// AVX2 Instructions
+//===----------------------------------------------------------------------===//
+
+/// AVX2_binop_rmi_int - AVX2 binary operator with 8-bit immediate
+multiclass AVX2_binop_rmi_int<bits<8> opc, string OpcodeStr,
+ Intrinsic IntId, RegisterClass RC, PatFrag memop_frag,
+ X86MemOperand x86memop> {
+ let isCommutable = 1 in
+ def rri : AVX2AIi8<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, u32u8imm:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))]>,
+ VEX_4V;
+ def rmi : AVX2AIi8<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop:$src2, u32u8imm:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set RC:$dst,
+ (IntId RC:$src1,
+ (bitconvert (memop_frag addr:$src2)), imm:$src3))]>,
+ VEX_4V;
+}
+
+let isCommutable = 0 in {
+defm VPBLENDD : AVX2_binop_rmi_int<0x02, "vpblendd", int_x86_avx2_pblendd_128,
+ VR128, memopv2i64, i128mem>;
+defm VPBLENDDY : AVX2_binop_rmi_int<0x02, "vpblendd", int_x86_avx2_pblendd_256,
+ VR256, memopv4i64, i256mem>, VEX_L;
+}
+
+def : Pat<(v4i32 (X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2),
+ imm:$mask)),
+ (VPBLENDDrri VR128:$src1, VR128:$src2, imm:$mask)>;
+def : Pat<(v8i32 (X86Blendi (v8i32 VR256:$src1), (v8i32 VR256:$src2),
+ imm:$mask)),
+ (VPBLENDDYrri VR256:$src1, VR256:$src2, imm:$mask)>;
+
+//===----------------------------------------------------------------------===//
+// VPBROADCAST - Load from memory and broadcast to all elements of the
+// destination operand
+//
+multiclass avx2_broadcast<bits<8> opc, string OpcodeStr,
+ X86MemOperand x86memop, PatFrag ld_frag,
+ Intrinsic Int128, Intrinsic Int256> {
+ def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (Int128 VR128:$src))]>, VEX;
+ def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst,
+ (Int128 (scalar_to_vector (ld_frag addr:$src))))]>, VEX;
+ def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR256:$dst, (Int256 VR128:$src))]>, VEX, VEX_L;
+ def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), (ins x86memop:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR256:$dst,
+ (Int256 (scalar_to_vector (ld_frag addr:$src))))]>,
+ VEX, VEX_L;
+}
+
+defm VPBROADCASTB : avx2_broadcast<0x78, "vpbroadcastb", i8mem, loadi8,
+ int_x86_avx2_pbroadcastb_128,
+ int_x86_avx2_pbroadcastb_256>;
+defm VPBROADCASTW : avx2_broadcast<0x79, "vpbroadcastw", i16mem, loadi16,
+ int_x86_avx2_pbroadcastw_128,
+ int_x86_avx2_pbroadcastw_256>;
+defm VPBROADCASTD : avx2_broadcast<0x58, "vpbroadcastd", i32mem, loadi32,
+ int_x86_avx2_pbroadcastd_128,
+ int_x86_avx2_pbroadcastd_256>;
+defm VPBROADCASTQ : avx2_broadcast<0x59, "vpbroadcastq", i64mem, loadi64,
+ int_x86_avx2_pbroadcastq_128,
+ int_x86_avx2_pbroadcastq_256>;
+
+let Predicates = [HasAVX2] in {
+ def : Pat<(v16i8 (X86VBroadcast (loadi8 addr:$src))),
+ (VPBROADCASTBrm addr:$src)>;
+ def : Pat<(v32i8 (X86VBroadcast (loadi8 addr:$src))),
+ (VPBROADCASTBYrm addr:$src)>;
+ def : Pat<(v8i16 (X86VBroadcast (loadi16 addr:$src))),
+ (VPBROADCASTWrm addr:$src)>;
+ def : Pat<(v16i16 (X86VBroadcast (loadi16 addr:$src))),
+ (VPBROADCASTWYrm addr:$src)>;
+ def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))),
+ (VPBROADCASTDrm addr:$src)>;
+ def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))),
+ (VPBROADCASTDYrm addr:$src)>;
+ def : Pat<(v2i64 (X86VBroadcast (loadi64 addr:$src))),
+ (VPBROADCASTQrm addr:$src)>;
+ def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))),
+ (VPBROADCASTQYrm addr:$src)>;
+
+ def : Pat<(v16i8 (X86VBroadcast (v16i8 VR128:$src))),
+ (VPBROADCASTBrr VR128:$src)>;
+ def : Pat<(v32i8 (X86VBroadcast (v16i8 VR128:$src))),
+ (VPBROADCASTBYrr VR128:$src)>;
+ def : Pat<(v8i16 (X86VBroadcast (v8i16 VR128:$src))),
+ (VPBROADCASTWrr VR128:$src)>;
+ def : Pat<(v16i16 (X86VBroadcast (v8i16 VR128:$src))),
+ (VPBROADCASTWYrr VR128:$src)>;
+ def : Pat<(v4i32 (X86VBroadcast (v4i32 VR128:$src))),
+ (VPBROADCASTDrr VR128:$src)>;
+ def : Pat<(v8i32 (X86VBroadcast (v4i32 VR128:$src))),
+ (VPBROADCASTDYrr VR128:$src)>;
+ def : Pat<(v2i64 (X86VBroadcast (v2i64 VR128:$src))),
+ (VPBROADCASTQrr VR128:$src)>;
+ def : Pat<(v4i64 (X86VBroadcast (v2i64 VR128:$src))),
+ (VPBROADCASTQYrr VR128:$src)>;
+ def : Pat<(v4f32 (X86VBroadcast (v4f32 VR128:$src))),
+ (VBROADCASTSSrr VR128:$src)>;
+ def : Pat<(v8f32 (X86VBroadcast (v4f32 VR128:$src))),
+ (VBROADCASTSSYrr VR128:$src)>;
+ def : Pat<(v2f64 (X86VBroadcast (v2f64 VR128:$src))),
+ (VPBROADCASTQrr VR128:$src)>;
+ def : Pat<(v4f64 (X86VBroadcast (v2f64 VR128:$src))),
+ (VBROADCASTSDYrr VR128:$src)>;
+
+ // Provide fallback in case the load node that is used in the patterns above
+ // is used by additional users, which prevents the pattern selection.
+ let AddedComplexity = 20 in {
+ def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
+ (VBROADCASTSSrr (COPY_TO_REGCLASS FR32:$src, VR128))>;
+ def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
+ (VBROADCASTSSYrr (COPY_TO_REGCLASS FR32:$src, VR128))>;
+ def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
+ (VBROADCASTSDYrr (COPY_TO_REGCLASS FR64:$src, VR128))>;
+
+ def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
+ (VBROADCASTSSrr (COPY_TO_REGCLASS GR32:$src, VR128))>;
+ def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
+ (VBROADCASTSSYrr (COPY_TO_REGCLASS GR32:$src, VR128))>;
+ def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
+ (VBROADCASTSDYrr (COPY_TO_REGCLASS GR64:$src, VR128))>;
+ }
+}
+
+// AVX1 broadcast patterns
+let Predicates = [HasAVX1Only] in {
+def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))),
+ (VBROADCASTSSYrm addr:$src)>;
+def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))),
+ (VBROADCASTSDYrm addr:$src)>;
+def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))),
+ (VBROADCASTSSrm addr:$src)>;
+}
+
+let Predicates = [HasAVX] in {
+def : Pat<(v8f32 (X86VBroadcast (loadf32 addr:$src))),
+ (VBROADCASTSSYrm addr:$src)>;
+def : Pat<(v4f64 (X86VBroadcast (loadf64 addr:$src))),
+ (VBROADCASTSDYrm addr:$src)>;
+def : Pat<(v4f32 (X86VBroadcast (loadf32 addr:$src))),
+ (VBROADCASTSSrm addr:$src)>;
+
+ // Provide fallback in case the load node that is used in the patterns above
+ // is used by additional users, which prevents the pattern selection.
+ let AddedComplexity = 20 in {
+ // 128bit broadcasts:
+ def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
+ (VPSHUFDri (COPY_TO_REGCLASS FR32:$src, VR128), 0)>;
+ def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
+ (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
+ (VPSHUFDri (COPY_TO_REGCLASS FR32:$src, VR128), 0), sub_xmm),
+ (VPSHUFDri (COPY_TO_REGCLASS FR32:$src, VR128), 0), 1)>;
+ def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
+ (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
+ (VPSHUFDri (COPY_TO_REGCLASS FR64:$src, VR128), 0x44), sub_xmm),
+ (VPSHUFDri (COPY_TO_REGCLASS FR64:$src, VR128), 0x44), 1)>;
+
+ def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
+ (VPSHUFDri (COPY_TO_REGCLASS GR32:$src, VR128), 0)>;
+ def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
+ (VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
+ (VPSHUFDri (COPY_TO_REGCLASS GR32:$src, VR128), 0), sub_xmm),
+ (VPSHUFDri (COPY_TO_REGCLASS GR32:$src, VR128), 0), 1)>;
+ def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
+ (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)),
+ (VPSHUFDri (COPY_TO_REGCLASS GR64:$src, VR128), 0x44), sub_xmm),
+ (VPSHUFDri (COPY_TO_REGCLASS GR64:$src, VR128), 0x44), 1)>;
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// VPERM - Permute instructions
+//
+
+multiclass avx2_perm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
+ ValueType OpVT> {
+ def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst,
+ (OpVT (X86VPermv VR256:$src1, VR256:$src2)))]>,
+ VEX_4V, VEX_L;
+ def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
+ (ins VR256:$src1, i256mem:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst,
+ (OpVT (X86VPermv VR256:$src1,
+ (bitconvert (mem_frag addr:$src2)))))]>,
+ VEX_4V, VEX_L;
+}
+
+defm VPERMD : avx2_perm<0x36, "vpermd", memopv4i64, v8i32>;
+let ExeDomain = SSEPackedSingle in
+defm VPERMPS : avx2_perm<0x16, "vpermps", memopv8f32, v8f32>;
+
+multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
+ ValueType OpVT> {
+ def Yri : AVX2AIi8<opc, MRMSrcReg, (outs VR256:$dst),
+ (ins VR256:$src1, i8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst,
+ (OpVT (X86VPermi VR256:$src1, (i8 imm:$src2))))]>,
+ VEX, VEX_L;
+ def Ymi : AVX2AIi8<opc, MRMSrcMem, (outs VR256:$dst),
+ (ins i256mem:$src1, i8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst,
+ (OpVT (X86VPermi (mem_frag addr:$src1),
+ (i8 imm:$src2))))]>, VEX, VEX_L;
+}
+
+defm VPERMQ : avx2_perm_imm<0x00, "vpermq", memopv4i64, v4i64>, VEX_W;
+let ExeDomain = SSEPackedDouble in
+defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", memopv4f64, v4f64>, VEX_W;
+
+//===----------------------------------------------------------------------===//
+// VPERM2I128 - Permute Floating-Point Values in 128-bit chunks
+//
+def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2, i8imm:$src3),
+ "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ [(set VR256:$dst, (v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2,
+ (i8 imm:$src3))))]>, VEX_4V, VEX_L;
+def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst),
+ (ins VR256:$src1, f256mem:$src2, i8imm:$src3),
+ "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (memopv4i64 addr:$src2),
+ (i8 imm:$src3)))]>, VEX_4V, VEX_L;
+
+let Predicates = [HasAVX2] in {
+def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+ (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>;
+def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+ (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>;
+def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+ (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>;
+
+def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, (bc_v32i8 (memopv4i64 addr:$src2)),
+ (i8 imm:$imm))),
+ (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>;
+def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1,
+ (bc_v16i16 (memopv4i64 addr:$src2)), (i8 imm:$imm))),
+ (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>;
+def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, (bc_v8i32 (memopv4i64 addr:$src2)),
+ (i8 imm:$imm))),
+ (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>;
+}
+
+
+//===----------------------------------------------------------------------===//
+// VINSERTI128 - Insert packed integer values
+//
+let neverHasSideEffects = 1 in {
+def VINSERTI128rr : AVX2AIi8<0x38, MRMSrcReg, (outs VR256:$dst),
+ (ins VR256:$src1, VR128:$src2, i8imm:$src3),
+ "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ []>, VEX_4V, VEX_L;
+let mayLoad = 1 in
+def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst),
+ (ins VR256:$src1, i128mem:$src2, i8imm:$src3),
+ "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ []>, VEX_4V, VEX_L;
+}
+
+let Predicates = [HasAVX2] in {
+def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2),
+ (iPTR imm)),
+ (VINSERTI128rr VR256:$src1, VR128:$src2,
+ (INSERT_get_vinsertf128_imm VR256:$ins))>;
+def : Pat<(vinsertf128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2),
+ (iPTR imm)),
+ (VINSERTI128rr VR256:$src1, VR128:$src2,
+ (INSERT_get_vinsertf128_imm VR256:$ins))>;
+def : Pat<(vinsertf128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2),
+ (iPTR imm)),
+ (VINSERTI128rr VR256:$src1, VR128:$src2,
+ (INSERT_get_vinsertf128_imm VR256:$ins))>;
+def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2),
+ (iPTR imm)),
+ (VINSERTI128rr VR256:$src1, VR128:$src2,
+ (INSERT_get_vinsertf128_imm VR256:$ins))>;
+
+def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (memopv2i64 addr:$src2),
+ (iPTR imm)),
+ (VINSERTI128rm VR256:$src1, addr:$src2,
+ (INSERT_get_vinsertf128_imm VR256:$ins))>;
+def : Pat<(vinsertf128_insert:$ins (v8i32 VR256:$src1),
+ (bc_v4i32 (memopv2i64 addr:$src2)),
+ (iPTR imm)),
+ (VINSERTI128rm VR256:$src1, addr:$src2,
+ (INSERT_get_vinsertf128_imm VR256:$ins))>;
+def : Pat<(vinsertf128_insert:$ins (v32i8 VR256:$src1),
+ (bc_v16i8 (memopv2i64 addr:$src2)),
+ (iPTR imm)),
+ (VINSERTI128rm VR256:$src1, addr:$src2,
+ (INSERT_get_vinsertf128_imm VR256:$ins))>;
+def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1),
+ (bc_v8i16 (memopv2i64 addr:$src2)),
+ (iPTR imm)),
+ (VINSERTI128rm VR256:$src1, addr:$src2,
+ (INSERT_get_vinsertf128_imm VR256:$ins))>;
+}
+
+//===----------------------------------------------------------------------===//
+// VEXTRACTI128 - Extract packed integer values
+//
+def VEXTRACTI128rr : AVX2AIi8<0x39, MRMDestReg, (outs VR128:$dst),
+ (ins VR256:$src1, i8imm:$src2),
+ "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst,
+ (int_x86_avx2_vextracti128 VR256:$src1, imm:$src2))]>,
+ VEX, VEX_L;
+let neverHasSideEffects = 1, mayStore = 1 in
+def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem, (outs),
+ (ins i128mem:$dst, VR256:$src1, i8imm:$src2),
+ "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
+ VEX, VEX_L;
+
+let Predicates = [HasAVX2] in {
+def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
+ (v2i64 (VEXTRACTI128rr
+ (v4i64 VR256:$src1),
+ (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
+def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
+ (v4i32 (VEXTRACTI128rr
+ (v8i32 VR256:$src1),
+ (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
+def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
+ (v8i16 (VEXTRACTI128rr
+ (v16i16 VR256:$src1),
+ (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
+def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
+ (v16i8 (VEXTRACTI128rr
+ (v32i8 VR256:$src1),
+ (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
+
+def : Pat<(alignedstore (v2i64 (vextractf128_extract:$ext (v4i64 VR256:$src1),
+ (iPTR imm))), addr:$dst),
+ (VEXTRACTI128mr addr:$dst, VR256:$src1,
+ (EXTRACT_get_vextractf128_imm VR128:$ext))>;
+def : Pat<(alignedstore (v4i32 (vextractf128_extract:$ext (v8i32 VR256:$src1),
+ (iPTR imm))), addr:$dst),
+ (VEXTRACTI128mr addr:$dst, VR256:$src1,
+ (EXTRACT_get_vextractf128_imm VR128:$ext))>;
+def : Pat<(alignedstore (v8i16 (vextractf128_extract:$ext (v16i16 VR256:$src1),
+ (iPTR imm))), addr:$dst),
+ (VEXTRACTI128mr addr:$dst, VR256:$src1,
+ (EXTRACT_get_vextractf128_imm VR128:$ext))>;
+def : Pat<(alignedstore (v16i8 (vextractf128_extract:$ext (v32i8 VR256:$src1),
+ (iPTR imm))), addr:$dst),
+ (VEXTRACTI128mr addr:$dst, VR256:$src1,
+ (EXTRACT_get_vextractf128_imm VR128:$ext))>;
+}
+
+//===----------------------------------------------------------------------===//
+// VPMASKMOV - Conditional SIMD Integer Packed Loads and Stores
+//
+multiclass avx2_pmovmask<string OpcodeStr,
+ Intrinsic IntLd128, Intrinsic IntLd256,
+ Intrinsic IntSt128, Intrinsic IntSt256> {
+ def rm : AVX28I<0x8c, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst, (IntLd128 addr:$src2, VR128:$src1))]>, VEX_4V;
+ def Yrm : AVX28I<0x8c, MRMSrcMem, (outs VR256:$dst),
+ (ins VR256:$src1, i256mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
+ VEX_4V, VEX_L;
+ def mr : AVX28I<0x8e, MRMDestMem, (outs),
+ (ins i128mem:$dst, VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(IntSt128 addr:$dst, VR128:$src1, VR128:$src2)]>, VEX_4V;
+ def Ymr : AVX28I<0x8e, MRMDestMem, (outs),
+ (ins i256mem:$dst, VR256:$src1, VR256:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>, VEX_4V, VEX_L;
+}
+
+defm VPMASKMOVD : avx2_pmovmask<"vpmaskmovd",
+ int_x86_avx2_maskload_d,
+ int_x86_avx2_maskload_d_256,
+ int_x86_avx2_maskstore_d,
+ int_x86_avx2_maskstore_d_256>;
+defm VPMASKMOVQ : avx2_pmovmask<"vpmaskmovq",
+ int_x86_avx2_maskload_q,
+ int_x86_avx2_maskload_q_256,
+ int_x86_avx2_maskstore_q,
+ int_x86_avx2_maskstore_q_256>, VEX_W;
+
+
+//===----------------------------------------------------------------------===//
+// Variable Bit Shifts
+//
+multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ ValueType vt128, ValueType vt256> {
+ def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst,
+ (vt128 (OpNode VR128:$src1, (vt128 VR128:$src2))))]>,
+ VEX_4V;
+ def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst,
+ (vt128 (OpNode VR128:$src1,
+ (vt128 (bitconvert (memopv2i64 addr:$src2))))))]>,
+ VEX_4V;
+ def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst,
+ (vt256 (OpNode VR256:$src1, (vt256 VR256:$src2))))]>,
+ VEX_4V, VEX_L;
+ def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
+ (ins VR256:$src1, i256mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst,
+ (vt256 (OpNode VR256:$src1,
+ (vt256 (bitconvert (memopv4i64 addr:$src2))))))]>,
+ VEX_4V, VEX_L;
+}
+
+defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", shl, v4i32, v8i32>;
+defm VPSLLVQ : avx2_var_shift<0x47, "vpsllvq", shl, v2i64, v4i64>, VEX_W;
+defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", srl, v4i32, v8i32>;
+defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", srl, v2i64, v4i64>, VEX_W;
+defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", sra, v4i32, v8i32>;
+
+//===----------------------------------------------------------------------===//
+// VGATHER - GATHER Operations
+multiclass avx2_gather<bits<8> opc, string OpcodeStr, RegisterClass RC256,
+ X86MemOperand memop128, X86MemOperand memop256> {
+ def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst, VR128:$mask_wb),
+ (ins VR128:$src1, memop128:$src2, VR128:$mask),
+ !strconcat(OpcodeStr,
+ "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
+ []>, VEX_4VOp3;
+ def Yrm : AVX28I<opc, MRMSrcMem, (outs RC256:$dst, RC256:$mask_wb),
+ (ins RC256:$src1, memop256:$src2, RC256:$mask),
+ !strconcat(OpcodeStr,
+ "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
+ []>, VEX_4VOp3, VEX_L;
+}
+
+let mayLoad = 1, Constraints = "$src1 = $dst, $mask = $mask_wb" in {
+ defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", VR256, vx64mem, vx64mem>, VEX_W;
+ defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", VR256, vx64mem, vy64mem>, VEX_W;
+ defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", VR256, vx32mem, vy32mem>;
+ defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", VR128, vx32mem, vy32mem>;
+ defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq", VR256, vx64mem, vx64mem>, VEX_W;
+ defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq", VR256, vx64mem, vy64mem>, VEX_W;
+ defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd", VR256, vx32mem, vy32mem>;
+ defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd", VR128, vx32mem, vy32mem>;
+}
diff --git a/contrib/llvm/lib/Target/X86/X86InstrSVM.td b/contrib/llvm/lib/Target/X86/X86InstrSVM.td
new file mode 100644
index 0000000..757dcd0
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrSVM.td
@@ -0,0 +1,62 @@
+//===-- X86InstrSVM.td - SVM Instruction Set Extension -----*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the instructions that make up the AMD SVM instruction
+// set.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// SVM instructions
+
+// 0F 01 D9
+def VMMCALL : I<0x01, MRM_D9, (outs), (ins), "vmmcall", []>, TB;
+
+// 0F 01 DC
+def STGI : I<0x01, MRM_DC, (outs), (ins), "stgi", []>, TB;
+
+// 0F 01 DD
+def CLGI : I<0x01, MRM_DD, (outs), (ins), "clgi", []>, TB;
+
+// 0F 01 DE
+let Uses = [EAX] in
+def SKINIT : I<0x01, MRM_DE, (outs), (ins), "skinit\t{%eax|EAX}", []>, TB;
+
+// 0F 01 D8
+let Uses = [EAX] in
+def VMRUN32 : I<0x01, MRM_D8, (outs), (ins),
+ "vmrun\t{%eax|EAX}", []>, TB, Requires<[In32BitMode]>;
+let Uses = [RAX] in
+def VMRUN64 : I<0x01, MRM_D8, (outs), (ins),
+ "vmrun\t{%rax|RAX}", []>, TB, Requires<[In64BitMode]>;
+
+// 0F 01 DA
+let Uses = [EAX] in
+def VMLOAD32 : I<0x01, MRM_DA, (outs), (ins),
+ "vmload\t{%eax|EAX}", []>, TB, Requires<[In32BitMode]>;
+let Uses = [RAX] in
+def VMLOAD64 : I<0x01, MRM_DA, (outs), (ins),
+ "vmload\t{%rax|RAX}", []>, TB, Requires<[In64BitMode]>;
+
+// 0F 01 DB
+let Uses = [EAX] in
+def VMSAVE32 : I<0x01, MRM_DB, (outs), (ins),
+ "vmsave\t{%eax|EAX}", []>, TB, Requires<[In32BitMode]>;
+let Uses = [RAX] in
+def VMSAVE64 : I<0x01, MRM_DB, (outs), (ins),
+ "vmsave\t{%rax|RAX}", []>, TB, Requires<[In64BitMode]>;
+
+// 0F 01 DF
+let Uses = [EAX, ECX] in
+def INVLPGA32 : I<0x01, MRM_DF, (outs), (ins),
+ "invlpga\t{%ecx, %eax|EAX, ECX}", []>, TB, Requires<[In32BitMode]>;
+let Uses = [RAX, ECX] in
+def INVLPGA64 : I<0x01, MRM_DF, (outs), (ins),
+ "invlpga\t{%ecx, %rax|RAX, ECX}", []>, TB, Requires<[In64BitMode]>;
+
diff --git a/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td b/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td
new file mode 100644
index 0000000..89c1a68
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td
@@ -0,0 +1,978 @@
+//===-- X86InstrShiftRotate.td - Shift and Rotate Instrs ---*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the shift and rotate instructions.
+//
+//===----------------------------------------------------------------------===//
+
+// FIXME: Someone needs to smear multipattern goodness all over this file.
+
+let Defs = [EFLAGS] in {
+
+let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in {
+let Uses = [CL] in {
+def SHL8rCL : I<0xD2, MRM4r, (outs GR8 :$dst), (ins GR8 :$src1),
+ "shl{b}\t{%cl, $dst|$dst, CL}",
+ [(set GR8:$dst, (shl GR8:$src1, CL))], IIC_SR>;
+def SHL16rCL : I<0xD3, MRM4r, (outs GR16:$dst), (ins GR16:$src1),
+ "shl{w}\t{%cl, $dst|$dst, CL}",
+ [(set GR16:$dst, (shl GR16:$src1, CL))], IIC_SR>, OpSize;
+def SHL32rCL : I<0xD3, MRM4r, (outs GR32:$dst), (ins GR32:$src1),
+ "shl{l}\t{%cl, $dst|$dst, CL}",
+ [(set GR32:$dst, (shl GR32:$src1, CL))], IIC_SR>;
+def SHL64rCL : RI<0xD3, MRM4r, (outs GR64:$dst), (ins GR64:$src1),
+ "shl{q}\t{%cl, $dst|$dst, CL}",
+ [(set GR64:$dst, (shl GR64:$src1, CL))], IIC_SR>;
+} // Uses = [CL]
+
+def SHL8ri : Ii8<0xC0, MRM4r, (outs GR8 :$dst), (ins GR8 :$src1, i8imm:$src2),
+ "shl{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (shl GR8:$src1, (i8 imm:$src2)))], IIC_SR>;
+
+let isConvertibleToThreeAddress = 1 in { // Can transform into LEA.
+def SHL16ri : Ii8<0xC1, MRM4r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2),
+ "shl{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (shl GR16:$src1, (i8 imm:$src2)))], IIC_SR>,
+ OpSize;
+def SHL32ri : Ii8<0xC1, MRM4r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2),
+ "shl{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (shl GR32:$src1, (i8 imm:$src2)))], IIC_SR>;
+def SHL64ri : RIi8<0xC1, MRM4r, (outs GR64:$dst),
+ (ins GR64:$src1, i8imm:$src2),
+ "shl{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (shl GR64:$src1, (i8 imm:$src2)))],
+ IIC_SR>;
+
+// NOTE: We don't include patterns for shifts of a register by one, because
+// 'add reg,reg' is cheaper (and we have a Pat pattern for shift-by-one).
+let hasSideEffects = 0 in {
+def SHL8r1 : I<0xD0, MRM4r, (outs GR8:$dst), (ins GR8:$src1),
+ "shl{b}\t$dst", [], IIC_SR>;
+def SHL16r1 : I<0xD1, MRM4r, (outs GR16:$dst), (ins GR16:$src1),
+ "shl{w}\t$dst", [], IIC_SR>, OpSize;
+def SHL32r1 : I<0xD1, MRM4r, (outs GR32:$dst), (ins GR32:$src1),
+ "shl{l}\t$dst", [], IIC_SR>;
+def SHL64r1 : RI<0xD1, MRM4r, (outs GR64:$dst), (ins GR64:$src1),
+ "shl{q}\t$dst", [], IIC_SR>;
+} // hasSideEffects = 0
+} // isConvertibleToThreeAddress = 1
+} // Constraints = "$src = $dst", SchedRW
+
+
+let SchedRW = [WriteShiftLd, WriteRMW] in {
+// FIXME: Why do we need an explicit "Uses = [CL]" when the instr has a pattern
+// using CL?
+let Uses = [CL] in {
+def SHL8mCL : I<0xD2, MRM4m, (outs), (ins i8mem :$dst),
+ "shl{b}\t{%cl, $dst|$dst, CL}",
+ [(store (shl (loadi8 addr:$dst), CL), addr:$dst)], IIC_SR>;
+def SHL16mCL : I<0xD3, MRM4m, (outs), (ins i16mem:$dst),
+ "shl{w}\t{%cl, $dst|$dst, CL}",
+ [(store (shl (loadi16 addr:$dst), CL), addr:$dst)], IIC_SR>,
+ OpSize;
+def SHL32mCL : I<0xD3, MRM4m, (outs), (ins i32mem:$dst),
+ "shl{l}\t{%cl, $dst|$dst, CL}",
+ [(store (shl (loadi32 addr:$dst), CL), addr:$dst)], IIC_SR>;
+def SHL64mCL : RI<0xD3, MRM4m, (outs), (ins i64mem:$dst),
+ "shl{q}\t{%cl, $dst|$dst, CL}",
+ [(store (shl (loadi64 addr:$dst), CL), addr:$dst)], IIC_SR>;
+}
+def SHL8mi : Ii8<0xC0, MRM4m, (outs), (ins i8mem :$dst, i8imm:$src),
+ "shl{b}\t{$src, $dst|$dst, $src}",
+ [(store (shl (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)],
+ IIC_SR>;
+def SHL16mi : Ii8<0xC1, MRM4m, (outs), (ins i16mem:$dst, i8imm:$src),
+ "shl{w}\t{$src, $dst|$dst, $src}",
+ [(store (shl (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)],
+ IIC_SR>,
+ OpSize;
+def SHL32mi : Ii8<0xC1, MRM4m, (outs), (ins i32mem:$dst, i8imm:$src),
+ "shl{l}\t{$src, $dst|$dst, $src}",
+ [(store (shl (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)],
+ IIC_SR>;
+def SHL64mi : RIi8<0xC1, MRM4m, (outs), (ins i64mem:$dst, i8imm:$src),
+ "shl{q}\t{$src, $dst|$dst, $src}",
+ [(store (shl (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)],
+ IIC_SR>;
+
+// Shift by 1
+def SHL8m1 : I<0xD0, MRM4m, (outs), (ins i8mem :$dst),
+ "shl{b}\t$dst",
+ [(store (shl (loadi8 addr:$dst), (i8 1)), addr:$dst)],
+ IIC_SR>;
+def SHL16m1 : I<0xD1, MRM4m, (outs), (ins i16mem:$dst),
+ "shl{w}\t$dst",
+ [(store (shl (loadi16 addr:$dst), (i8 1)), addr:$dst)],
+ IIC_SR>,
+ OpSize;
+def SHL32m1 : I<0xD1, MRM4m, (outs), (ins i32mem:$dst),
+ "shl{l}\t$dst",
+ [(store (shl (loadi32 addr:$dst), (i8 1)), addr:$dst)],
+ IIC_SR>;
+def SHL64m1 : RI<0xD1, MRM4m, (outs), (ins i64mem:$dst),
+ "shl{q}\t$dst",
+ [(store (shl (loadi64 addr:$dst), (i8 1)), addr:$dst)],
+ IIC_SR>;
+} // SchedRW
+
+let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in {
+let Uses = [CL] in {
+def SHR8rCL : I<0xD2, MRM5r, (outs GR8 :$dst), (ins GR8 :$src1),
+ "shr{b}\t{%cl, $dst|$dst, CL}",
+ [(set GR8:$dst, (srl GR8:$src1, CL))], IIC_SR>;
+def SHR16rCL : I<0xD3, MRM5r, (outs GR16:$dst), (ins GR16:$src1),
+ "shr{w}\t{%cl, $dst|$dst, CL}",
+ [(set GR16:$dst, (srl GR16:$src1, CL))], IIC_SR>, OpSize;
+def SHR32rCL : I<0xD3, MRM5r, (outs GR32:$dst), (ins GR32:$src1),
+ "shr{l}\t{%cl, $dst|$dst, CL}",
+ [(set GR32:$dst, (srl GR32:$src1, CL))], IIC_SR>;
+def SHR64rCL : RI<0xD3, MRM5r, (outs GR64:$dst), (ins GR64:$src1),
+ "shr{q}\t{%cl, $dst|$dst, CL}",
+ [(set GR64:$dst, (srl GR64:$src1, CL))], IIC_SR>;
+}
+
+def SHR8ri : Ii8<0xC0, MRM5r, (outs GR8:$dst), (ins GR8:$src1, i8imm:$src2),
+ "shr{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (srl GR8:$src1, (i8 imm:$src2)))], IIC_SR>;
+def SHR16ri : Ii8<0xC1, MRM5r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2),
+ "shr{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (srl GR16:$src1, (i8 imm:$src2)))],
+ IIC_SR>, OpSize;
+def SHR32ri : Ii8<0xC1, MRM5r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2),
+ "shr{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (srl GR32:$src1, (i8 imm:$src2)))],
+ IIC_SR>;
+def SHR64ri : RIi8<0xC1, MRM5r, (outs GR64:$dst), (ins GR64:$src1, i8imm:$src2),
+ "shr{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (srl GR64:$src1, (i8 imm:$src2)))], IIC_SR>;
+
+// Shift right by 1
+def SHR8r1 : I<0xD0, MRM5r, (outs GR8:$dst), (ins GR8:$src1),
+ "shr{b}\t$dst",
+ [(set GR8:$dst, (srl GR8:$src1, (i8 1)))], IIC_SR>;
+def SHR16r1 : I<0xD1, MRM5r, (outs GR16:$dst), (ins GR16:$src1),
+ "shr{w}\t$dst",
+ [(set GR16:$dst, (srl GR16:$src1, (i8 1)))], IIC_SR>, OpSize;
+def SHR32r1 : I<0xD1, MRM5r, (outs GR32:$dst), (ins GR32:$src1),
+ "shr{l}\t$dst",
+ [(set GR32:$dst, (srl GR32:$src1, (i8 1)))], IIC_SR>;
+def SHR64r1 : RI<0xD1, MRM5r, (outs GR64:$dst), (ins GR64:$src1),
+ "shr{q}\t$dst",
+ [(set GR64:$dst, (srl GR64:$src1, (i8 1)))], IIC_SR>;
+} // Constraints = "$src = $dst", SchedRW
+
+
+let SchedRW = [WriteShiftLd, WriteRMW] in {
+let Uses = [CL] in {
+def SHR8mCL : I<0xD2, MRM5m, (outs), (ins i8mem :$dst),
+ "shr{b}\t{%cl, $dst|$dst, CL}",
+ [(store (srl (loadi8 addr:$dst), CL), addr:$dst)], IIC_SR>;
+def SHR16mCL : I<0xD3, MRM5m, (outs), (ins i16mem:$dst),
+ "shr{w}\t{%cl, $dst|$dst, CL}",
+ [(store (srl (loadi16 addr:$dst), CL), addr:$dst)], IIC_SR>,
+ OpSize;
+def SHR32mCL : I<0xD3, MRM5m, (outs), (ins i32mem:$dst),
+ "shr{l}\t{%cl, $dst|$dst, CL}",
+ [(store (srl (loadi32 addr:$dst), CL), addr:$dst)], IIC_SR>;
+def SHR64mCL : RI<0xD3, MRM5m, (outs), (ins i64mem:$dst),
+ "shr{q}\t{%cl, $dst|$dst, CL}",
+ [(store (srl (loadi64 addr:$dst), CL), addr:$dst)], IIC_SR>;
+}
+def SHR8mi : Ii8<0xC0, MRM5m, (outs), (ins i8mem :$dst, i8imm:$src),
+ "shr{b}\t{$src, $dst|$dst, $src}",
+ [(store (srl (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)],
+ IIC_SR>;
+def SHR16mi : Ii8<0xC1, MRM5m, (outs), (ins i16mem:$dst, i8imm:$src),
+ "shr{w}\t{$src, $dst|$dst, $src}",
+ [(store (srl (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)],
+ IIC_SR>,
+ OpSize;
+def SHR32mi : Ii8<0xC1, MRM5m, (outs), (ins i32mem:$dst, i8imm:$src),
+ "shr{l}\t{$src, $dst|$dst, $src}",
+ [(store (srl (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)],
+ IIC_SR>;
+def SHR64mi : RIi8<0xC1, MRM5m, (outs), (ins i64mem:$dst, i8imm:$src),
+ "shr{q}\t{$src, $dst|$dst, $src}",
+ [(store (srl (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)],
+ IIC_SR>;
+
+// Shift by 1
+def SHR8m1 : I<0xD0, MRM5m, (outs), (ins i8mem :$dst),
+ "shr{b}\t$dst",
+ [(store (srl (loadi8 addr:$dst), (i8 1)), addr:$dst)],
+ IIC_SR>;
+def SHR16m1 : I<0xD1, MRM5m, (outs), (ins i16mem:$dst),
+ "shr{w}\t$dst",
+ [(store (srl (loadi16 addr:$dst), (i8 1)), addr:$dst)],
+ IIC_SR>,OpSize;
+def SHR32m1 : I<0xD1, MRM5m, (outs), (ins i32mem:$dst),
+ "shr{l}\t$dst",
+ [(store (srl (loadi32 addr:$dst), (i8 1)), addr:$dst)],
+ IIC_SR>;
+def SHR64m1 : RI<0xD1, MRM5m, (outs), (ins i64mem:$dst),
+ "shr{q}\t$dst",
+ [(store (srl (loadi64 addr:$dst), (i8 1)), addr:$dst)],
+ IIC_SR>;
+} // SchedRW
+
+let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in {
+let Uses = [CL] in {
+def SAR8rCL : I<0xD2, MRM7r, (outs GR8 :$dst), (ins GR8 :$src1),
+ "sar{b}\t{%cl, $dst|$dst, CL}",
+ [(set GR8:$dst, (sra GR8:$src1, CL))],
+ IIC_SR>;
+def SAR16rCL : I<0xD3, MRM7r, (outs GR16:$dst), (ins GR16:$src1),
+ "sar{w}\t{%cl, $dst|$dst, CL}",
+ [(set GR16:$dst, (sra GR16:$src1, CL))],
+ IIC_SR>, OpSize;
+def SAR32rCL : I<0xD3, MRM7r, (outs GR32:$dst), (ins GR32:$src1),
+ "sar{l}\t{%cl, $dst|$dst, CL}",
+ [(set GR32:$dst, (sra GR32:$src1, CL))],
+ IIC_SR>;
+def SAR64rCL : RI<0xD3, MRM7r, (outs GR64:$dst), (ins GR64:$src1),
+ "sar{q}\t{%cl, $dst|$dst, CL}",
+ [(set GR64:$dst, (sra GR64:$src1, CL))],
+ IIC_SR>;
+}
+
+def SAR8ri : Ii8<0xC0, MRM7r, (outs GR8 :$dst), (ins GR8 :$src1, i8imm:$src2),
+ "sar{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (sra GR8:$src1, (i8 imm:$src2)))],
+ IIC_SR>;
+def SAR16ri : Ii8<0xC1, MRM7r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2),
+ "sar{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (sra GR16:$src1, (i8 imm:$src2)))],
+ IIC_SR>,
+ OpSize;
+def SAR32ri : Ii8<0xC1, MRM7r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2),
+ "sar{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (sra GR32:$src1, (i8 imm:$src2)))],
+ IIC_SR>;
+def SAR64ri : RIi8<0xC1, MRM7r, (outs GR64:$dst),
+ (ins GR64:$src1, i8imm:$src2),
+ "sar{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (sra GR64:$src1, (i8 imm:$src2)))],
+ IIC_SR>;
+
+// Shift by 1
+def SAR8r1 : I<0xD0, MRM7r, (outs GR8 :$dst), (ins GR8 :$src1),
+ "sar{b}\t$dst",
+ [(set GR8:$dst, (sra GR8:$src1, (i8 1)))],
+ IIC_SR>;
+def SAR16r1 : I<0xD1, MRM7r, (outs GR16:$dst), (ins GR16:$src1),
+ "sar{w}\t$dst",
+ [(set GR16:$dst, (sra GR16:$src1, (i8 1)))],
+ IIC_SR>, OpSize;
+def SAR32r1 : I<0xD1, MRM7r, (outs GR32:$dst), (ins GR32:$src1),
+ "sar{l}\t$dst",
+ [(set GR32:$dst, (sra GR32:$src1, (i8 1)))],
+ IIC_SR>;
+def SAR64r1 : RI<0xD1, MRM7r, (outs GR64:$dst), (ins GR64:$src1),
+ "sar{q}\t$dst",
+ [(set GR64:$dst, (sra GR64:$src1, (i8 1)))],
+ IIC_SR>;
+} // Constraints = "$src = $dst", SchedRW
+
+
+let SchedRW = [WriteShiftLd, WriteRMW] in {
+let Uses = [CL] in {
+def SAR8mCL : I<0xD2, MRM7m, (outs), (ins i8mem :$dst),
+ "sar{b}\t{%cl, $dst|$dst, CL}",
+ [(store (sra (loadi8 addr:$dst), CL), addr:$dst)],
+ IIC_SR>;
+def SAR16mCL : I<0xD3, MRM7m, (outs), (ins i16mem:$dst),
+ "sar{w}\t{%cl, $dst|$dst, CL}",
+ [(store (sra (loadi16 addr:$dst), CL), addr:$dst)],
+ IIC_SR>, OpSize;
+def SAR32mCL : I<0xD3, MRM7m, (outs), (ins i32mem:$dst),
+ "sar{l}\t{%cl, $dst|$dst, CL}",
+ [(store (sra (loadi32 addr:$dst), CL), addr:$dst)],
+ IIC_SR>;
+def SAR64mCL : RI<0xD3, MRM7m, (outs), (ins i64mem:$dst),
+ "sar{q}\t{%cl, $dst|$dst, CL}",
+ [(store (sra (loadi64 addr:$dst), CL), addr:$dst)],
+ IIC_SR>;
+}
+def SAR8mi : Ii8<0xC0, MRM7m, (outs), (ins i8mem :$dst, i8imm:$src),
+ "sar{b}\t{$src, $dst|$dst, $src}",
+ [(store (sra (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)],
+ IIC_SR>;
+def SAR16mi : Ii8<0xC1, MRM7m, (outs), (ins i16mem:$dst, i8imm:$src),
+ "sar{w}\t{$src, $dst|$dst, $src}",
+ [(store (sra (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)],
+ IIC_SR>,
+ OpSize;
+def SAR32mi : Ii8<0xC1, MRM7m, (outs), (ins i32mem:$dst, i8imm:$src),
+ "sar{l}\t{$src, $dst|$dst, $src}",
+ [(store (sra (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)],
+ IIC_SR>;
+def SAR64mi : RIi8<0xC1, MRM7m, (outs), (ins i64mem:$dst, i8imm:$src),
+ "sar{q}\t{$src, $dst|$dst, $src}",
+ [(store (sra (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)],
+ IIC_SR>;
+
+// Shift by 1
+def SAR8m1 : I<0xD0, MRM7m, (outs), (ins i8mem :$dst),
+ "sar{b}\t$dst",
+ [(store (sra (loadi8 addr:$dst), (i8 1)), addr:$dst)],
+ IIC_SR>;
+def SAR16m1 : I<0xD1, MRM7m, (outs), (ins i16mem:$dst),
+ "sar{w}\t$dst",
+ [(store (sra (loadi16 addr:$dst), (i8 1)), addr:$dst)],
+ IIC_SR>,
+ OpSize;
+def SAR32m1 : I<0xD1, MRM7m, (outs), (ins i32mem:$dst),
+ "sar{l}\t$dst",
+ [(store (sra (loadi32 addr:$dst), (i8 1)), addr:$dst)],
+ IIC_SR>;
+def SAR64m1 : RI<0xD1, MRM7m, (outs), (ins i64mem:$dst),
+ "sar{q}\t$dst",
+ [(store (sra (loadi64 addr:$dst), (i8 1)), addr:$dst)],
+ IIC_SR>;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Rotate instructions
+//===----------------------------------------------------------------------===//
+
+let hasSideEffects = 0 in {
+let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in {
+def RCL8r1 : I<0xD0, MRM2r, (outs GR8:$dst), (ins GR8:$src1),
+ "rcl{b}\t$dst", [], IIC_SR>;
+def RCL8ri : Ii8<0xC0, MRM2r, (outs GR8:$dst), (ins GR8:$src1, i8imm:$cnt),
+ "rcl{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
+let Uses = [CL] in
+def RCL8rCL : I<0xD2, MRM2r, (outs GR8:$dst), (ins GR8:$src1),
+ "rcl{b}\t{%cl, $dst|$dst, CL}", [], IIC_SR>;
+
+def RCL16r1 : I<0xD1, MRM2r, (outs GR16:$dst), (ins GR16:$src1),
+ "rcl{w}\t$dst", [], IIC_SR>, OpSize;
+def RCL16ri : Ii8<0xC1, MRM2r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$cnt),
+ "rcl{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize;
+let Uses = [CL] in
+def RCL16rCL : I<0xD3, MRM2r, (outs GR16:$dst), (ins GR16:$src1),
+ "rcl{w}\t{%cl, $dst|$dst, CL}", [], IIC_SR>, OpSize;
+
+def RCL32r1 : I<0xD1, MRM2r, (outs GR32:$dst), (ins GR32:$src1),
+ "rcl{l}\t$dst", [], IIC_SR>;
+def RCL32ri : Ii8<0xC1, MRM2r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$cnt),
+ "rcl{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
+let Uses = [CL] in
+def RCL32rCL : I<0xD3, MRM2r, (outs GR32:$dst), (ins GR32:$src1),
+ "rcl{l}\t{%cl, $dst|$dst, CL}", [], IIC_SR>;
+
+
+def RCL64r1 : RI<0xD1, MRM2r, (outs GR64:$dst), (ins GR64:$src1),
+ "rcl{q}\t$dst", [], IIC_SR>;
+def RCL64ri : RIi8<0xC1, MRM2r, (outs GR64:$dst), (ins GR64:$src1, i8imm:$cnt),
+ "rcl{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
+let Uses = [CL] in
+def RCL64rCL : RI<0xD3, MRM2r, (outs GR64:$dst), (ins GR64:$src1),
+ "rcl{q}\t{%cl, $dst|$dst, CL}", [], IIC_SR>;
+
+
+def RCR8r1 : I<0xD0, MRM3r, (outs GR8:$dst), (ins GR8:$src1),
+ "rcr{b}\t$dst", [], IIC_SR>;
+def RCR8ri : Ii8<0xC0, MRM3r, (outs GR8:$dst), (ins GR8:$src1, i8imm:$cnt),
+ "rcr{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
+let Uses = [CL] in
+def RCR8rCL : I<0xD2, MRM3r, (outs GR8:$dst), (ins GR8:$src1),
+ "rcr{b}\t{%cl, $dst|$dst, CL}", [], IIC_SR>;
+
+def RCR16r1 : I<0xD1, MRM3r, (outs GR16:$dst), (ins GR16:$src1),
+ "rcr{w}\t$dst", [], IIC_SR>, OpSize;
+def RCR16ri : Ii8<0xC1, MRM3r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$cnt),
+ "rcr{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize;
+let Uses = [CL] in
+def RCR16rCL : I<0xD3, MRM3r, (outs GR16:$dst), (ins GR16:$src1),
+ "rcr{w}\t{%cl, $dst|$dst, CL}", [], IIC_SR>, OpSize;
+
+def RCR32r1 : I<0xD1, MRM3r, (outs GR32:$dst), (ins GR32:$src1),
+ "rcr{l}\t$dst", [], IIC_SR>;
+def RCR32ri : Ii8<0xC1, MRM3r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$cnt),
+ "rcr{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
+let Uses = [CL] in
+def RCR32rCL : I<0xD3, MRM3r, (outs GR32:$dst), (ins GR32:$src1),
+ "rcr{l}\t{%cl, $dst|$dst, CL}", [], IIC_SR>;
+
+def RCR64r1 : RI<0xD1, MRM3r, (outs GR64:$dst), (ins GR64:$src1),
+ "rcr{q}\t$dst", [], IIC_SR>;
+def RCR64ri : RIi8<0xC1, MRM3r, (outs GR64:$dst), (ins GR64:$src1, i8imm:$cnt),
+ "rcr{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
+let Uses = [CL] in
+def RCR64rCL : RI<0xD3, MRM3r, (outs GR64:$dst), (ins GR64:$src1),
+ "rcr{q}\t{%cl, $dst|$dst, CL}", [], IIC_SR>;
+
+} // Constraints = "$src = $dst"
+
+let SchedRW = [WriteShiftLd, WriteRMW] in {
+def RCL8m1 : I<0xD0, MRM2m, (outs), (ins i8mem:$dst),
+ "rcl{b}\t$dst", [], IIC_SR>;
+def RCL8mi : Ii8<0xC0, MRM2m, (outs), (ins i8mem:$dst, i8imm:$cnt),
+ "rcl{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
+def RCL16m1 : I<0xD1, MRM2m, (outs), (ins i16mem:$dst),
+ "rcl{w}\t$dst", [], IIC_SR>, OpSize;
+def RCL16mi : Ii8<0xC1, MRM2m, (outs), (ins i16mem:$dst, i8imm:$cnt),
+ "rcl{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize;
+def RCL32m1 : I<0xD1, MRM2m, (outs), (ins i32mem:$dst),
+ "rcl{l}\t$dst", [], IIC_SR>;
+def RCL32mi : Ii8<0xC1, MRM2m, (outs), (ins i32mem:$dst, i8imm:$cnt),
+ "rcl{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
+def RCL64m1 : RI<0xD1, MRM2m, (outs), (ins i64mem:$dst),
+ "rcl{q}\t$dst", [], IIC_SR>;
+def RCL64mi : RIi8<0xC1, MRM2m, (outs), (ins i64mem:$dst, i8imm:$cnt),
+ "rcl{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
+
+def RCR8m1 : I<0xD0, MRM3m, (outs), (ins i8mem:$dst),
+ "rcr{b}\t$dst", [], IIC_SR>;
+def RCR8mi : Ii8<0xC0, MRM3m, (outs), (ins i8mem:$dst, i8imm:$cnt),
+ "rcr{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
+def RCR16m1 : I<0xD1, MRM3m, (outs), (ins i16mem:$dst),
+ "rcr{w}\t$dst", [], IIC_SR>, OpSize;
+def RCR16mi : Ii8<0xC1, MRM3m, (outs), (ins i16mem:$dst, i8imm:$cnt),
+ "rcr{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize;
+def RCR32m1 : I<0xD1, MRM3m, (outs), (ins i32mem:$dst),
+ "rcr{l}\t$dst", [], IIC_SR>;
+def RCR32mi : Ii8<0xC1, MRM3m, (outs), (ins i32mem:$dst, i8imm:$cnt),
+ "rcr{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
+def RCR64m1 : RI<0xD1, MRM3m, (outs), (ins i64mem:$dst),
+ "rcr{q}\t$dst", [], IIC_SR>;
+def RCR64mi : RIi8<0xC1, MRM3m, (outs), (ins i64mem:$dst, i8imm:$cnt),
+ "rcr{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
+
+let Uses = [CL] in {
+def RCL8mCL : I<0xD2, MRM2m, (outs), (ins i8mem:$dst),
+ "rcl{b}\t{%cl, $dst|$dst, CL}", [], IIC_SR>;
+def RCL16mCL : I<0xD3, MRM2m, (outs), (ins i16mem:$dst),
+ "rcl{w}\t{%cl, $dst|$dst, CL}", [], IIC_SR>, OpSize;
+def RCL32mCL : I<0xD3, MRM2m, (outs), (ins i32mem:$dst),
+ "rcl{l}\t{%cl, $dst|$dst, CL}", [], IIC_SR>;
+def RCL64mCL : RI<0xD3, MRM2m, (outs), (ins i64mem:$dst),
+ "rcl{q}\t{%cl, $dst|$dst, CL}", [], IIC_SR>;
+
+def RCR8mCL : I<0xD2, MRM3m, (outs), (ins i8mem:$dst),
+ "rcr{b}\t{%cl, $dst|$dst, CL}", [], IIC_SR>;
+def RCR16mCL : I<0xD3, MRM3m, (outs), (ins i16mem:$dst),
+ "rcr{w}\t{%cl, $dst|$dst, CL}", [], IIC_SR>, OpSize;
+def RCR32mCL : I<0xD3, MRM3m, (outs), (ins i32mem:$dst),
+ "rcr{l}\t{%cl, $dst|$dst, CL}", [], IIC_SR>;
+def RCR64mCL : RI<0xD3, MRM3m, (outs), (ins i64mem:$dst),
+ "rcr{q}\t{%cl, $dst|$dst, CL}", [], IIC_SR>;
+}
+} // SchedRW
+} // hasSideEffects = 0
+
+let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in {
+// FIXME: provide shorter instructions when imm8 == 1
+let Uses = [CL] in {
+def ROL8rCL : I<0xD2, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1),
+ "rol{b}\t{%cl, $dst|$dst, CL}",
+ [(set GR8:$dst, (rotl GR8:$src1, CL))], IIC_SR>;
+def ROL16rCL : I<0xD3, MRM0r, (outs GR16:$dst), (ins GR16:$src1),
+ "rol{w}\t{%cl, $dst|$dst, CL}",
+ [(set GR16:$dst, (rotl GR16:$src1, CL))], IIC_SR>, OpSize;
+def ROL32rCL : I<0xD3, MRM0r, (outs GR32:$dst), (ins GR32:$src1),
+ "rol{l}\t{%cl, $dst|$dst, CL}",
+ [(set GR32:$dst, (rotl GR32:$src1, CL))], IIC_SR>;
+def ROL64rCL : RI<0xD3, MRM0r, (outs GR64:$dst), (ins GR64:$src1),
+ "rol{q}\t{%cl, $dst|$dst, CL}",
+ [(set GR64:$dst, (rotl GR64:$src1, CL))], IIC_SR>;
+}
+
+def ROL8ri : Ii8<0xC0, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1, i8imm:$src2),
+ "rol{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (rotl GR8:$src1, (i8 imm:$src2)))], IIC_SR>;
+def ROL16ri : Ii8<0xC1, MRM0r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2),
+ "rol{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (rotl GR16:$src1, (i8 imm:$src2)))],
+ IIC_SR>,
+ OpSize;
+def ROL32ri : Ii8<0xC1, MRM0r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2),
+ "rol{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (rotl GR32:$src1, (i8 imm:$src2)))],
+ IIC_SR>;
+def ROL64ri : RIi8<0xC1, MRM0r, (outs GR64:$dst),
+ (ins GR64:$src1, i8imm:$src2),
+ "rol{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (rotl GR64:$src1, (i8 imm:$src2)))],
+ IIC_SR>;
+
+// Rotate by 1
+def ROL8r1 : I<0xD0, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1),
+ "rol{b}\t$dst",
+ [(set GR8:$dst, (rotl GR8:$src1, (i8 1)))],
+ IIC_SR>;
+def ROL16r1 : I<0xD1, MRM0r, (outs GR16:$dst), (ins GR16:$src1),
+ "rol{w}\t$dst",
+ [(set GR16:$dst, (rotl GR16:$src1, (i8 1)))],
+ IIC_SR>, OpSize;
+def ROL32r1 : I<0xD1, MRM0r, (outs GR32:$dst), (ins GR32:$src1),
+ "rol{l}\t$dst",
+ [(set GR32:$dst, (rotl GR32:$src1, (i8 1)))],
+ IIC_SR>;
+def ROL64r1 : RI<0xD1, MRM0r, (outs GR64:$dst), (ins GR64:$src1),
+ "rol{q}\t$dst",
+ [(set GR64:$dst, (rotl GR64:$src1, (i8 1)))],
+ IIC_SR>;
+} // Constraints = "$src = $dst", SchedRW
+
+let SchedRW = [WriteShiftLd, WriteRMW] in {
+let Uses = [CL] in {
+def ROL8mCL : I<0xD2, MRM0m, (outs), (ins i8mem :$dst),
+ "rol{b}\t{%cl, $dst|$dst, CL}",
+ [(store (rotl (loadi8 addr:$dst), CL), addr:$dst)],
+ IIC_SR>;
+def ROL16mCL : I<0xD3, MRM0m, (outs), (ins i16mem:$dst),
+ "rol{w}\t{%cl, $dst|$dst, CL}",
+ [(store (rotl (loadi16 addr:$dst), CL), addr:$dst)],
+ IIC_SR>, OpSize;
+def ROL32mCL : I<0xD3, MRM0m, (outs), (ins i32mem:$dst),
+ "rol{l}\t{%cl, $dst|$dst, CL}",
+ [(store (rotl (loadi32 addr:$dst), CL), addr:$dst)],
+ IIC_SR>;
+def ROL64mCL : RI<0xD3, MRM0m, (outs), (ins i64mem:$dst),
+ "rol{q}\t{%cl, $dst|$dst, %cl}",
+ [(store (rotl (loadi64 addr:$dst), CL), addr:$dst)],
+ IIC_SR>;
+}
+def ROL8mi : Ii8<0xC0, MRM0m, (outs), (ins i8mem :$dst, i8imm:$src1),
+ "rol{b}\t{$src1, $dst|$dst, $src1}",
+ [(store (rotl (loadi8 addr:$dst), (i8 imm:$src1)), addr:$dst)],
+ IIC_SR>;
+def ROL16mi : Ii8<0xC1, MRM0m, (outs), (ins i16mem:$dst, i8imm:$src1),
+ "rol{w}\t{$src1, $dst|$dst, $src1}",
+ [(store (rotl (loadi16 addr:$dst), (i8 imm:$src1)), addr:$dst)],
+ IIC_SR>,
+ OpSize;
+def ROL32mi : Ii8<0xC1, MRM0m, (outs), (ins i32mem:$dst, i8imm:$src1),
+ "rol{l}\t{$src1, $dst|$dst, $src1}",
+ [(store (rotl (loadi32 addr:$dst), (i8 imm:$src1)), addr:$dst)],
+ IIC_SR>;
+def ROL64mi : RIi8<0xC1, MRM0m, (outs), (ins i64mem:$dst, i8imm:$src1),
+ "rol{q}\t{$src1, $dst|$dst, $src1}",
+ [(store (rotl (loadi64 addr:$dst), (i8 imm:$src1)), addr:$dst)],
+ IIC_SR>;
+
+// Rotate by 1
+def ROL8m1 : I<0xD0, MRM0m, (outs), (ins i8mem :$dst),
+ "rol{b}\t$dst",
+ [(store (rotl (loadi8 addr:$dst), (i8 1)), addr:$dst)],
+ IIC_SR>;
+def ROL16m1 : I<0xD1, MRM0m, (outs), (ins i16mem:$dst),
+ "rol{w}\t$dst",
+ [(store (rotl (loadi16 addr:$dst), (i8 1)), addr:$dst)],
+ IIC_SR>,
+ OpSize;
+def ROL32m1 : I<0xD1, MRM0m, (outs), (ins i32mem:$dst),
+ "rol{l}\t$dst",
+ [(store (rotl (loadi32 addr:$dst), (i8 1)), addr:$dst)],
+ IIC_SR>;
+def ROL64m1 : RI<0xD1, MRM0m, (outs), (ins i64mem:$dst),
+ "rol{q}\t$dst",
+ [(store (rotl (loadi64 addr:$dst), (i8 1)), addr:$dst)],
+ IIC_SR>;
+} // SchedRW
+
+let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in {
+let Uses = [CL] in {
+def ROR8rCL : I<0xD2, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1),
+ "ror{b}\t{%cl, $dst|$dst, CL}",
+ [(set GR8:$dst, (rotr GR8:$src1, CL))], IIC_SR>;
+def ROR16rCL : I<0xD3, MRM1r, (outs GR16:$dst), (ins GR16:$src1),
+ "ror{w}\t{%cl, $dst|$dst, CL}",
+ [(set GR16:$dst, (rotr GR16:$src1, CL))], IIC_SR>, OpSize;
+def ROR32rCL : I<0xD3, MRM1r, (outs GR32:$dst), (ins GR32:$src1),
+ "ror{l}\t{%cl, $dst|$dst, CL}",
+ [(set GR32:$dst, (rotr GR32:$src1, CL))], IIC_SR>;
+def ROR64rCL : RI<0xD3, MRM1r, (outs GR64:$dst), (ins GR64:$src1),
+ "ror{q}\t{%cl, $dst|$dst, CL}",
+ [(set GR64:$dst, (rotr GR64:$src1, CL))], IIC_SR>;
+}
+
+def ROR8ri : Ii8<0xC0, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1, i8imm:$src2),
+ "ror{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (rotr GR8:$src1, (i8 imm:$src2)))], IIC_SR>;
+def ROR16ri : Ii8<0xC1, MRM1r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2),
+ "ror{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (rotr GR16:$src1, (i8 imm:$src2)))],
+ IIC_SR>,
+ OpSize;
+def ROR32ri : Ii8<0xC1, MRM1r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2),
+ "ror{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (rotr GR32:$src1, (i8 imm:$src2)))],
+ IIC_SR>;
+def ROR64ri : RIi8<0xC1, MRM1r, (outs GR64:$dst),
+ (ins GR64:$src1, i8imm:$src2),
+ "ror{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (rotr GR64:$src1, (i8 imm:$src2)))],
+ IIC_SR>;
+
+// Rotate by 1
+def ROR8r1 : I<0xD0, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1),
+ "ror{b}\t$dst",
+ [(set GR8:$dst, (rotr GR8:$src1, (i8 1)))],
+ IIC_SR>;
+def ROR16r1 : I<0xD1, MRM1r, (outs GR16:$dst), (ins GR16:$src1),
+ "ror{w}\t$dst",
+ [(set GR16:$dst, (rotr GR16:$src1, (i8 1)))],
+ IIC_SR>, OpSize;
+def ROR32r1 : I<0xD1, MRM1r, (outs GR32:$dst), (ins GR32:$src1),
+ "ror{l}\t$dst",
+ [(set GR32:$dst, (rotr GR32:$src1, (i8 1)))],
+ IIC_SR>;
+def ROR64r1 : RI<0xD1, MRM1r, (outs GR64:$dst), (ins GR64:$src1),
+ "ror{q}\t$dst",
+ [(set GR64:$dst, (rotr GR64:$src1, (i8 1)))],
+ IIC_SR>;
+} // Constraints = "$src = $dst", SchedRW
+
+let SchedRW = [WriteShiftLd, WriteRMW] in {
+let Uses = [CL] in {
+def ROR8mCL : I<0xD2, MRM1m, (outs), (ins i8mem :$dst),
+ "ror{b}\t{%cl, $dst|$dst, CL}",
+ [(store (rotr (loadi8 addr:$dst), CL), addr:$dst)],
+ IIC_SR>;
+def ROR16mCL : I<0xD3, MRM1m, (outs), (ins i16mem:$dst),
+ "ror{w}\t{%cl, $dst|$dst, CL}",
+ [(store (rotr (loadi16 addr:$dst), CL), addr:$dst)],
+ IIC_SR>, OpSize;
+def ROR32mCL : I<0xD3, MRM1m, (outs), (ins i32mem:$dst),
+ "ror{l}\t{%cl, $dst|$dst, CL}",
+ [(store (rotr (loadi32 addr:$dst), CL), addr:$dst)],
+ IIC_SR>;
+def ROR64mCL : RI<0xD3, MRM1m, (outs), (ins i64mem:$dst),
+ "ror{q}\t{%cl, $dst|$dst, CL}",
+ [(store (rotr (loadi64 addr:$dst), CL), addr:$dst)],
+ IIC_SR>;
+}
+def ROR8mi : Ii8<0xC0, MRM1m, (outs), (ins i8mem :$dst, i8imm:$src),
+ "ror{b}\t{$src, $dst|$dst, $src}",
+ [(store (rotr (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)],
+ IIC_SR>;
+def ROR16mi : Ii8<0xC1, MRM1m, (outs), (ins i16mem:$dst, i8imm:$src),
+ "ror{w}\t{$src, $dst|$dst, $src}",
+ [(store (rotr (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)],
+ IIC_SR>,
+ OpSize;
+def ROR32mi : Ii8<0xC1, MRM1m, (outs), (ins i32mem:$dst, i8imm:$src),
+ "ror{l}\t{$src, $dst|$dst, $src}",
+ [(store (rotr (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)],
+ IIC_SR>;
+def ROR64mi : RIi8<0xC1, MRM1m, (outs), (ins i64mem:$dst, i8imm:$src),
+ "ror{q}\t{$src, $dst|$dst, $src}",
+ [(store (rotr (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)],
+ IIC_SR>;
+
+// Rotate by 1
+def ROR8m1 : I<0xD0, MRM1m, (outs), (ins i8mem :$dst),
+ "ror{b}\t$dst",
+ [(store (rotr (loadi8 addr:$dst), (i8 1)), addr:$dst)],
+ IIC_SR>;
+def ROR16m1 : I<0xD1, MRM1m, (outs), (ins i16mem:$dst),
+ "ror{w}\t$dst",
+ [(store (rotr (loadi16 addr:$dst), (i8 1)), addr:$dst)],
+ IIC_SR>,
+ OpSize;
+def ROR32m1 : I<0xD1, MRM1m, (outs), (ins i32mem:$dst),
+ "ror{l}\t$dst",
+ [(store (rotr (loadi32 addr:$dst), (i8 1)), addr:$dst)],
+ IIC_SR>;
+def ROR64m1 : RI<0xD1, MRM1m, (outs), (ins i64mem:$dst),
+ "ror{q}\t$dst",
+ [(store (rotr (loadi64 addr:$dst), (i8 1)), addr:$dst)],
+ IIC_SR>;
+} // SchedRW
+
+
+//===----------------------------------------------------------------------===//
+// Double shift instructions (generalizations of rotate)
+//===----------------------------------------------------------------------===//
+
+let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in {
+
+let Uses = [CL] in {
+def SHLD16rrCL : I<0xA5, MRMDestReg, (outs GR16:$dst),
+ (ins GR16:$src1, GR16:$src2),
+ "shld{w}\t{%cl, $src2, $dst|$dst, $src2, CL}",
+ [(set GR16:$dst, (X86shld GR16:$src1, GR16:$src2, CL))],
+ IIC_SHD16_REG_CL>,
+ TB, OpSize;
+def SHRD16rrCL : I<0xAD, MRMDestReg, (outs GR16:$dst),
+ (ins GR16:$src1, GR16:$src2),
+ "shrd{w}\t{%cl, $src2, $dst|$dst, $src2, CL}",
+ [(set GR16:$dst, (X86shrd GR16:$src1, GR16:$src2, CL))],
+ IIC_SHD16_REG_CL>,
+ TB, OpSize;
+def SHLD32rrCL : I<0xA5, MRMDestReg, (outs GR32:$dst),
+ (ins GR32:$src1, GR32:$src2),
+ "shld{l}\t{%cl, $src2, $dst|$dst, $src2, CL}",
+ [(set GR32:$dst, (X86shld GR32:$src1, GR32:$src2, CL))],
+ IIC_SHD32_REG_CL>, TB;
+def SHRD32rrCL : I<0xAD, MRMDestReg, (outs GR32:$dst),
+ (ins GR32:$src1, GR32:$src2),
+ "shrd{l}\t{%cl, $src2, $dst|$dst, $src2, CL}",
+ [(set GR32:$dst, (X86shrd GR32:$src1, GR32:$src2, CL))],
+ IIC_SHD32_REG_CL>, TB;
+def SHLD64rrCL : RI<0xA5, MRMDestReg, (outs GR64:$dst),
+ (ins GR64:$src1, GR64:$src2),
+ "shld{q}\t{%cl, $src2, $dst|$dst, $src2, CL}",
+ [(set GR64:$dst, (X86shld GR64:$src1, GR64:$src2, CL))],
+ IIC_SHD64_REG_CL>,
+ TB;
+def SHRD64rrCL : RI<0xAD, MRMDestReg, (outs GR64:$dst),
+ (ins GR64:$src1, GR64:$src2),
+ "shrd{q}\t{%cl, $src2, $dst|$dst, $src2, CL}",
+ [(set GR64:$dst, (X86shrd GR64:$src1, GR64:$src2, CL))],
+ IIC_SHD64_REG_CL>,
+ TB;
+}
+
+let isCommutable = 1 in { // These instructions commute to each other.
+def SHLD16rri8 : Ii8<0xA4, MRMDestReg,
+ (outs GR16:$dst),
+ (ins GR16:$src1, GR16:$src2, i8imm:$src3),
+ "shld{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set GR16:$dst, (X86shld GR16:$src1, GR16:$src2,
+ (i8 imm:$src3)))], IIC_SHD16_REG_IM>,
+ TB, OpSize;
+def SHRD16rri8 : Ii8<0xAC, MRMDestReg,
+ (outs GR16:$dst),
+ (ins GR16:$src1, GR16:$src2, i8imm:$src3),
+ "shrd{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set GR16:$dst, (X86shrd GR16:$src1, GR16:$src2,
+ (i8 imm:$src3)))], IIC_SHD16_REG_IM>,
+ TB, OpSize;
+def SHLD32rri8 : Ii8<0xA4, MRMDestReg,
+ (outs GR32:$dst),
+ (ins GR32:$src1, GR32:$src2, i8imm:$src3),
+ "shld{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set GR32:$dst, (X86shld GR32:$src1, GR32:$src2,
+ (i8 imm:$src3)))], IIC_SHD32_REG_IM>,
+ TB;
+def SHRD32rri8 : Ii8<0xAC, MRMDestReg,
+ (outs GR32:$dst),
+ (ins GR32:$src1, GR32:$src2, i8imm:$src3),
+ "shrd{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set GR32:$dst, (X86shrd GR32:$src1, GR32:$src2,
+ (i8 imm:$src3)))], IIC_SHD32_REG_IM>,
+ TB;
+def SHLD64rri8 : RIi8<0xA4, MRMDestReg,
+ (outs GR64:$dst),
+ (ins GR64:$src1, GR64:$src2, i8imm:$src3),
+ "shld{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set GR64:$dst, (X86shld GR64:$src1, GR64:$src2,
+ (i8 imm:$src3)))], IIC_SHD64_REG_IM>,
+ TB;
+def SHRD64rri8 : RIi8<0xAC, MRMDestReg,
+ (outs GR64:$dst),
+ (ins GR64:$src1, GR64:$src2, i8imm:$src3),
+ "shrd{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set GR64:$dst, (X86shrd GR64:$src1, GR64:$src2,
+ (i8 imm:$src3)))], IIC_SHD64_REG_IM>,
+ TB;
+}
+} // Constraints = "$src = $dst", SchedRW
+
+let SchedRW = [WriteShiftLd, WriteRMW] in {
+let Uses = [CL] in {
+def SHLD16mrCL : I<0xA5, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
+ "shld{w}\t{%cl, $src2, $dst|$dst, $src2, CL}",
+ [(store (X86shld (loadi16 addr:$dst), GR16:$src2, CL),
+ addr:$dst)], IIC_SHD16_MEM_CL>, TB, OpSize;
+def SHRD16mrCL : I<0xAD, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
+ "shrd{w}\t{%cl, $src2, $dst|$dst, $src2, CL}",
+ [(store (X86shrd (loadi16 addr:$dst), GR16:$src2, CL),
+ addr:$dst)], IIC_SHD16_MEM_CL>, TB, OpSize;
+
+def SHLD32mrCL : I<0xA5, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
+ "shld{l}\t{%cl, $src2, $dst|$dst, $src2, CL}",
+ [(store (X86shld (loadi32 addr:$dst), GR32:$src2, CL),
+ addr:$dst)], IIC_SHD32_MEM_CL>, TB;
+def SHRD32mrCL : I<0xAD, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
+ "shrd{l}\t{%cl, $src2, $dst|$dst, $src2, CL}",
+ [(store (X86shrd (loadi32 addr:$dst), GR32:$src2, CL),
+ addr:$dst)], IIC_SHD32_MEM_CL>, TB;
+
+def SHLD64mrCL : RI<0xA5, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
+ "shld{q}\t{%cl, $src2, $dst|$dst, $src2, CL}",
+ [(store (X86shld (loadi64 addr:$dst), GR64:$src2, CL),
+ addr:$dst)], IIC_SHD64_MEM_CL>, TB;
+def SHRD64mrCL : RI<0xAD, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
+ "shrd{q}\t{%cl, $src2, $dst|$dst, $src2, CL}",
+ [(store (X86shrd (loadi64 addr:$dst), GR64:$src2, CL),
+ addr:$dst)], IIC_SHD64_MEM_CL>, TB;
+}
+
+def SHLD16mri8 : Ii8<0xA4, MRMDestMem,
+ (outs), (ins i16mem:$dst, GR16:$src2, i8imm:$src3),
+ "shld{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(store (X86shld (loadi16 addr:$dst), GR16:$src2,
+ (i8 imm:$src3)), addr:$dst)],
+ IIC_SHD16_MEM_IM>,
+ TB, OpSize;
+def SHRD16mri8 : Ii8<0xAC, MRMDestMem,
+ (outs), (ins i16mem:$dst, GR16:$src2, i8imm:$src3),
+ "shrd{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(store (X86shrd (loadi16 addr:$dst), GR16:$src2,
+ (i8 imm:$src3)), addr:$dst)],
+ IIC_SHD16_MEM_IM>,
+ TB, OpSize;
+
+def SHLD32mri8 : Ii8<0xA4, MRMDestMem,
+ (outs), (ins i32mem:$dst, GR32:$src2, i8imm:$src3),
+ "shld{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(store (X86shld (loadi32 addr:$dst), GR32:$src2,
+ (i8 imm:$src3)), addr:$dst)],
+ IIC_SHD32_MEM_IM>,
+ TB;
+def SHRD32mri8 : Ii8<0xAC, MRMDestMem,
+ (outs), (ins i32mem:$dst, GR32:$src2, i8imm:$src3),
+ "shrd{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(store (X86shrd (loadi32 addr:$dst), GR32:$src2,
+ (i8 imm:$src3)), addr:$dst)],
+ IIC_SHD32_MEM_IM>,
+ TB;
+
+def SHLD64mri8 : RIi8<0xA4, MRMDestMem,
+ (outs), (ins i64mem:$dst, GR64:$src2, i8imm:$src3),
+ "shld{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(store (X86shld (loadi64 addr:$dst), GR64:$src2,
+ (i8 imm:$src3)), addr:$dst)],
+ IIC_SHD64_MEM_IM>,
+ TB;
+def SHRD64mri8 : RIi8<0xAC, MRMDestMem,
+ (outs), (ins i64mem:$dst, GR64:$src2, i8imm:$src3),
+ "shrd{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(store (X86shrd (loadi64 addr:$dst), GR64:$src2,
+ (i8 imm:$src3)), addr:$dst)],
+ IIC_SHD64_MEM_IM>,
+ TB;
+} // SchedRW
+
+} // Defs = [EFLAGS]
+
+def ROT32L2R_imm8 : SDNodeXForm<imm, [{
+ // Convert a ROTL shamt to a ROTR shamt on 32-bit integer.
+ return getI8Imm(32 - N->getZExtValue());
+}]>;
+
+def ROT64L2R_imm8 : SDNodeXForm<imm, [{
+ // Convert a ROTL shamt to a ROTR shamt on 64-bit integer.
+ return getI8Imm(64 - N->getZExtValue());
+}]>;
+
+multiclass bmi_rotate<string asm, RegisterClass RC, X86MemOperand x86memop> {
+let neverHasSideEffects = 1 in {
+ def ri : Ii8<0xF0, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, i8imm:$src2),
+ !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, TAXD, VEX, Sched<[WriteShift]>;
+ let mayLoad = 1 in
+ def mi : Ii8<0xF0, MRMSrcMem, (outs RC:$dst),
+ (ins x86memop:$src1, i8imm:$src2),
+ !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, TAXD, VEX, Sched<[WriteShiftLd]>;
+}
+}
+
+multiclass bmi_shift<string asm, RegisterClass RC, X86MemOperand x86memop> {
+let neverHasSideEffects = 1 in {
+ def rr : I<0xF7, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+ !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>,
+ VEX_4VOp3, Sched<[WriteShift]>;
+ let mayLoad = 1 in
+ def rm : I<0xF7, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src1, RC:$src2),
+ !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>,
+ VEX_4VOp3,
+ Sched<[WriteShiftLd,
+ // x86memop:$src1
+ ReadDefault, ReadDefault, ReadDefault, ReadDefault,
+ ReadDefault,
+ // RC:$src1
+ ReadAfterLd]>;
+}
+}
+
+let Predicates = [HasBMI2] in {
+ defm RORX32 : bmi_rotate<"rorx{l}", GR32, i32mem>;
+ defm RORX64 : bmi_rotate<"rorx{q}", GR64, i64mem>, VEX_W;
+ defm SARX32 : bmi_shift<"sarx{l}", GR32, i32mem>, T8XS;
+ defm SARX64 : bmi_shift<"sarx{q}", GR64, i64mem>, T8XS, VEX_W;
+ defm SHRX32 : bmi_shift<"shrx{l}", GR32, i32mem>, T8XD;
+ defm SHRX64 : bmi_shift<"shrx{q}", GR64, i64mem>, T8XD, VEX_W;
+ defm SHLX32 : bmi_shift<"shlx{l}", GR32, i32mem>, T8, OpSize;
+ defm SHLX64 : bmi_shift<"shlx{q}", GR64, i64mem>, T8, OpSize, VEX_W;
+
+ // Prefer RORX which is non-destructive and doesn't update EFLAGS.
+ let AddedComplexity = 10 in {
+ def : Pat<(rotl GR32:$src, (i8 imm:$shamt)),
+ (RORX32ri GR32:$src, (ROT32L2R_imm8 imm:$shamt))>;
+ def : Pat<(rotl GR64:$src, (i8 imm:$shamt)),
+ (RORX64ri GR64:$src, (ROT64L2R_imm8 imm:$shamt))>;
+ }
+
+ def : Pat<(rotl (loadi32 addr:$src), (i8 imm:$shamt)),
+ (RORX32mi addr:$src, (ROT32L2R_imm8 imm:$shamt))>;
+ def : Pat<(rotl (loadi64 addr:$src), (i8 imm:$shamt)),
+ (RORX64mi addr:$src, (ROT64L2R_imm8 imm:$shamt))>;
+
+ // Prefer SARX/SHRX/SHLX over SAR/SHR/SHL with variable shift BUT not
+ // immedidate shift, i.e. the following code is considered better
+ //
+ // mov %edi, %esi
+ // shl $imm, %esi
+ // ... %edi, ...
+ //
+ // than
+ //
+ // movb $imm, %sil
+ // shlx %sil, %edi, %esi
+ // ... %edi, ...
+ //
+ let AddedComplexity = 1 in {
+ def : Pat<(sra GR32:$src1, GR8:$src2),
+ (SARX32rr GR32:$src1,
+ (INSERT_SUBREG
+ (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+ def : Pat<(sra GR64:$src1, GR8:$src2),
+ (SARX64rr GR64:$src1,
+ (INSERT_SUBREG
+ (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+
+ def : Pat<(srl GR32:$src1, GR8:$src2),
+ (SHRX32rr GR32:$src1,
+ (INSERT_SUBREG
+ (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+ def : Pat<(srl GR64:$src1, GR8:$src2),
+ (SHRX64rr GR64:$src1,
+ (INSERT_SUBREG
+ (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+
+ def : Pat<(shl GR32:$src1, GR8:$src2),
+ (SHLX32rr GR32:$src1,
+ (INSERT_SUBREG
+ (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+ def : Pat<(shl GR64:$src1, GR8:$src2),
+ (SHLX64rr GR64:$src1,
+ (INSERT_SUBREG
+ (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+ }
+
+ // Patterns on SARXrm/SHRXrm/SHLXrm are explicitly omitted to favor
+ //
+ // mov (%ecx), %esi
+ // shl $imm, $esi
+ //
+ // over
+ //
+ // movb $imm %al
+ // shlx %al, (%ecx), %esi
+ //
+ // As SARXrr/SHRXrr/SHLXrr is favored on variable shift, the peephole
+ // optimization will fold them into SARXrm/SHRXrm/SHLXrm if possible.
+}
diff --git a/contrib/llvm/lib/Target/X86/X86InstrSystem.td b/contrib/llvm/lib/Target/X86/X86InstrSystem.td
new file mode 100644
index 0000000..bab3cdd
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrSystem.td
@@ -0,0 +1,529 @@
+//===-- X86InstrSystem.td - System Instructions ------------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 instructions that are generally used in
+// privileged modes. These are not typically used by the compiler, but are
+// supported for the assembler and disassembler.
+//
+//===----------------------------------------------------------------------===//
+
+let SchedRW = [WriteSystem] in {
+let Defs = [RAX, RDX] in
+ def RDTSC : I<0x31, RawFrm, (outs), (ins), "rdtsc", [(X86rdtsc)], IIC_RDTSC>,
+ TB;
+
+let Defs = [RAX, RCX, RDX] in
+ def RDTSCP : I<0x01, MRM_F9, (outs), (ins), "rdtscp", []>, TB;
+
+// CPU flow control instructions
+
+let isTerminator = 1, isBarrier = 1, hasCtrlDep = 1 in {
+ def TRAP : I<0x0B, RawFrm, (outs), (ins), "ud2", [(trap)]>, TB;
+ def UD2B : I<0xB9, RawFrm, (outs), (ins), "ud2b", []>, TB;
+}
+
+def HLT : I<0xF4, RawFrm, (outs), (ins), "hlt", [], IIC_HLT>;
+def RSM : I<0xAA, RawFrm, (outs), (ins), "rsm", [], IIC_RSM>, TB;
+
+// Interrupt and SysCall Instructions.
+let Uses = [EFLAGS] in
+ def INTO : I<0xce, RawFrm, (outs), (ins), "into", []>;
+def INT3 : I<0xcc, RawFrm, (outs), (ins), "int3",
+ [(int_x86_int (i8 3))], IIC_INT3>;
+} // SchedRW
+
+def : Pat<(debugtrap),
+ (INT3)>;
+
+// The long form of "int $3" turns into int3 as a size optimization.
+// FIXME: This doesn't work because InstAlias can't match immediate constants.
+//def : InstAlias<"int\t$3", (INT3)>;
+
+let SchedRW = [WriteSystem] in {
+
+def INT : Ii8<0xcd, RawFrm, (outs), (ins i8imm:$trap), "int\t$trap",
+ [(int_x86_int imm:$trap)], IIC_INT>;
+
+
+def SYSCALL : I<0x05, RawFrm, (outs), (ins), "syscall", [], IIC_SYSCALL>, TB;
+def SYSRET : I<0x07, RawFrm, (outs), (ins), "sysret{l}", [], IIC_SYSCALL>, TB;
+def SYSRET64 :RI<0x07, RawFrm, (outs), (ins), "sysret{q}", [], IIC_SYSCALL>, TB,
+ Requires<[In64BitMode]>;
+
+def SYSENTER : I<0x34, RawFrm, (outs), (ins), "sysenter", [],
+ IIC_SYS_ENTER_EXIT>, TB;
+
+def SYSEXIT : I<0x35, RawFrm, (outs), (ins), "sysexit{l}", [],
+ IIC_SYS_ENTER_EXIT>, TB;
+def SYSEXIT64 :RI<0x35, RawFrm, (outs), (ins), "sysexit{q}", []>, TB,
+ Requires<[In64BitMode]>;
+
+def IRET16 : I<0xcf, RawFrm, (outs), (ins), "iret{w}", [], IIC_IRET>, OpSize;
+def IRET32 : I<0xcf, RawFrm, (outs), (ins), "iret{l|d}", [], IIC_IRET>;
+def IRET64 : RI<0xcf, RawFrm, (outs), (ins), "iretq", [], IIC_IRET>,
+ Requires<[In64BitMode]>;
+} // SchedRW
+
+
+//===----------------------------------------------------------------------===//
+// Input/Output Instructions.
+//
+let SchedRW = [WriteSystem] in {
+let Defs = [AL], Uses = [DX] in
+def IN8rr : I<0xEC, RawFrm, (outs), (ins),
+ "in{b}\t{%dx, %al|AL, DX}", [], IIC_IN_RR>;
+let Defs = [AX], Uses = [DX] in
+def IN16rr : I<0xED, RawFrm, (outs), (ins),
+ "in{w}\t{%dx, %ax|AX, DX}", [], IIC_IN_RR>, OpSize;
+let Defs = [EAX], Uses = [DX] in
+def IN32rr : I<0xED, RawFrm, (outs), (ins),
+ "in{l}\t{%dx, %eax|EAX, DX}", [], IIC_IN_RR>;
+
+let Defs = [AL] in
+def IN8ri : Ii8<0xE4, RawFrm, (outs), (ins i8imm:$port),
+ "in{b}\t{$port, %al|AL, $port}", [], IIC_IN_RI>;
+let Defs = [AX] in
+def IN16ri : Ii8<0xE5, RawFrm, (outs), (ins i8imm:$port),
+ "in{w}\t{$port, %ax|AX, $port}", [], IIC_IN_RI>, OpSize;
+let Defs = [EAX] in
+def IN32ri : Ii8<0xE5, RawFrm, (outs), (ins i8imm:$port),
+ "in{l}\t{$port, %eax|EAX, $port}", [], IIC_IN_RI>;
+
+let Uses = [DX, AL] in
+def OUT8rr : I<0xEE, RawFrm, (outs), (ins),
+ "out{b}\t{%al, %dx|DX, AL}", [], IIC_OUT_RR>;
+let Uses = [DX, AX] in
+def OUT16rr : I<0xEF, RawFrm, (outs), (ins),
+ "out{w}\t{%ax, %dx|DX, AX}", [], IIC_OUT_RR>, OpSize;
+let Uses = [DX, EAX] in
+def OUT32rr : I<0xEF, RawFrm, (outs), (ins),
+ "out{l}\t{%eax, %dx|DX, EAX}", [], IIC_OUT_RR>;
+
+let Uses = [AL] in
+def OUT8ir : Ii8<0xE6, RawFrm, (outs), (ins i8imm:$port),
+ "out{b}\t{%al, $port|$port, AL}", [], IIC_OUT_IR>;
+let Uses = [AX] in
+def OUT16ir : Ii8<0xE7, RawFrm, (outs), (ins i8imm:$port),
+ "out{w}\t{%ax, $port|$port, AX}", [], IIC_OUT_IR>, OpSize;
+let Uses = [EAX] in
+def OUT32ir : Ii8<0xE7, RawFrm, (outs), (ins i8imm:$port),
+ "out{l}\t{%eax, $port|$port, EAX}", [], IIC_OUT_IR>;
+
+def IN8 : I<0x6C, RawFrm, (outs), (ins), "ins{b}", [], IIC_INS>;
+def IN16 : I<0x6D, RawFrm, (outs), (ins), "ins{w}", [], IIC_INS>, OpSize;
+def IN32 : I<0x6D, RawFrm, (outs), (ins), "ins{l}", [], IIC_INS>;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Moves to and from debug registers
+
+let SchedRW = [WriteSystem] in {
+def MOV32rd : I<0x21, MRMDestReg, (outs GR32:$dst), (ins DEBUG_REG:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_DR>, TB;
+def MOV64rd : I<0x21, MRMDestReg, (outs GR64:$dst), (ins DEBUG_REG:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_DR>, TB;
+
+def MOV32dr : I<0x23, MRMSrcReg, (outs DEBUG_REG:$dst), (ins GR32:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_DR_REG>, TB;
+def MOV64dr : I<0x23, MRMSrcReg, (outs DEBUG_REG:$dst), (ins GR64:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_DR_REG>, TB;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Moves to and from control registers
+
+let SchedRW = [WriteSystem] in {
+def MOV32rc : I<0x20, MRMDestReg, (outs GR32:$dst), (ins CONTROL_REG:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_CR>, TB;
+def MOV64rc : I<0x20, MRMDestReg, (outs GR64:$dst), (ins CONTROL_REG:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_CR>, TB;
+
+def MOV32cr : I<0x22, MRMSrcReg, (outs CONTROL_REG:$dst), (ins GR32:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_CR_REG>, TB;
+def MOV64cr : I<0x22, MRMSrcReg, (outs CONTROL_REG:$dst), (ins GR64:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_CR_REG>, TB;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Segment override instruction prefixes
+
+def CS_PREFIX : I<0x2E, RawFrm, (outs), (ins), "cs", []>;
+def SS_PREFIX : I<0x36, RawFrm, (outs), (ins), "ss", []>;
+def DS_PREFIX : I<0x3E, RawFrm, (outs), (ins), "ds", []>;
+def ES_PREFIX : I<0x26, RawFrm, (outs), (ins), "es", []>;
+def FS_PREFIX : I<0x64, RawFrm, (outs), (ins), "fs", []>;
+def GS_PREFIX : I<0x65, RawFrm, (outs), (ins), "gs", []>;
+
+
+//===----------------------------------------------------------------------===//
+// Moves to and from segment registers.
+//
+
+let SchedRW = [WriteMove] in {
+def MOV16rs : I<0x8C, MRMDestReg, (outs GR16:$dst), (ins SEGMENT_REG:$src),
+ "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_SR>, OpSize;
+def MOV32rs : I<0x8C, MRMDestReg, (outs GR32:$dst), (ins SEGMENT_REG:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_SR>;
+def MOV64rs : RI<0x8C, MRMDestReg, (outs GR64:$dst), (ins SEGMENT_REG:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_SR>;
+
+def MOV16ms : I<0x8C, MRMDestMem, (outs i16mem:$dst), (ins SEGMENT_REG:$src),
+ "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_MEM_SR>, OpSize;
+def MOV32ms : I<0x8C, MRMDestMem, (outs i32mem:$dst), (ins SEGMENT_REG:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_MEM_SR>;
+def MOV64ms : RI<0x8C, MRMDestMem, (outs i64mem:$dst), (ins SEGMENT_REG:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_MEM_SR>;
+
+def MOV16sr : I<0x8E, MRMSrcReg, (outs SEGMENT_REG:$dst), (ins GR16:$src),
+ "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_REG>, OpSize;
+def MOV32sr : I<0x8E, MRMSrcReg, (outs SEGMENT_REG:$dst), (ins GR32:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_REG>;
+def MOV64sr : RI<0x8E, MRMSrcReg, (outs SEGMENT_REG:$dst), (ins GR64:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_REG>;
+
+def MOV16sm : I<0x8E, MRMSrcMem, (outs SEGMENT_REG:$dst), (ins i16mem:$src),
+ "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_MEM>, OpSize;
+def MOV32sm : I<0x8E, MRMSrcMem, (outs SEGMENT_REG:$dst), (ins i32mem:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_MEM>;
+def MOV64sm : RI<0x8E, MRMSrcMem, (outs SEGMENT_REG:$dst), (ins i64mem:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_MEM>;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Segmentation support instructions.
+
+let SchedRW = [WriteSystem] in {
+def SWAPGS : I<0x01, MRM_F8, (outs), (ins), "swapgs", [], IIC_SWAPGS>, TB;
+
+def LAR16rm : I<0x02, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+ "lar{w}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RM>, TB, OpSize;
+def LAR16rr : I<0x02, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+ "lar{w}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RR>, TB, OpSize;
+
+// i16mem operand in LAR32rm and GR32 operand in LAR32rr is not a typo.
+def LAR32rm : I<0x02, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src),
+ "lar{l}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RM>, TB;
+def LAR32rr : I<0x02, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+ "lar{l}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RR>, TB;
+// i16mem operand in LAR64rm and GR32 operand in LAR32rr is not a typo.
+def LAR64rm : RI<0x02, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src),
+ "lar{q}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RM>, TB;
+def LAR64rr : RI<0x02, MRMSrcReg, (outs GR64:$dst), (ins GR32:$src),
+ "lar{q}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RR>, TB;
+
+def LSL16rm : I<0x03, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+ "lsl{w}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RM>, TB, OpSize;
+def LSL16rr : I<0x03, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+ "lsl{w}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RR>, TB, OpSize;
+def LSL32rm : I<0x03, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+ "lsl{l}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RM>, TB;
+def LSL32rr : I<0x03, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+ "lsl{l}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RR>, TB;
+def LSL64rm : RI<0x03, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+ "lsl{q}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RM>, TB;
+def LSL64rr : RI<0x03, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+ "lsl{q}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RR>, TB;
+
+def INVLPG : I<0x01, MRM7m, (outs), (ins i8mem:$addr), "invlpg\t$addr",
+ [], IIC_INVLPG>, TB;
+
+def STR16r : I<0x00, MRM1r, (outs GR16:$dst), (ins),
+ "str{w}\t$dst", [], IIC_STR>, TB, OpSize;
+def STR32r : I<0x00, MRM1r, (outs GR32:$dst), (ins),
+ "str{l}\t$dst", [], IIC_STR>, TB;
+def STR64r : RI<0x00, MRM1r, (outs GR64:$dst), (ins),
+ "str{q}\t$dst", [], IIC_STR>, TB;
+def STRm : I<0x00, MRM1m, (outs i16mem:$dst), (ins),
+ "str{w}\t$dst", [], IIC_STR>, TB;
+
+def LTRr : I<0x00, MRM3r, (outs), (ins GR16:$src),
+ "ltr{w}\t$src", [], IIC_LTR>, TB;
+def LTRm : I<0x00, MRM3m, (outs), (ins i16mem:$src),
+ "ltr{w}\t$src", [], IIC_LTR>, TB;
+
+def PUSHCS16 : I<0x0E, RawFrm, (outs), (ins),
+ "push{w}\t{%cs|CS}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>,
+ OpSize;
+def PUSHCS32 : I<0x0E, RawFrm, (outs), (ins),
+ "push{l}\t{%cs|CS}", [], IIC_PUSH_CS>, Requires<[In32BitMode]>;
+def PUSHSS16 : I<0x16, RawFrm, (outs), (ins),
+ "push{w}\t{%ss|SS}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>,
+ OpSize;
+def PUSHSS32 : I<0x16, RawFrm, (outs), (ins),
+ "push{l}\t{%ss|SS}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>;
+def PUSHDS16 : I<0x1E, RawFrm, (outs), (ins),
+ "push{w}\t{%ds|DS}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>,
+ OpSize;
+def PUSHDS32 : I<0x1E, RawFrm, (outs), (ins),
+ "push{l}\t{%ds|DS}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>;
+def PUSHES16 : I<0x06, RawFrm, (outs), (ins),
+ "push{w}\t{%es|ES}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>,
+ OpSize;
+def PUSHES32 : I<0x06, RawFrm, (outs), (ins),
+ "push{l}\t{%es|ES}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>;
+
+def PUSHFS16 : I<0xa0, RawFrm, (outs), (ins),
+ "push{w}\t{%fs|FS}", [], IIC_PUSH_SR>, OpSize, TB;
+def PUSHFS32 : I<0xa0, RawFrm, (outs), (ins),
+ "push{l}\t{%fs|FS}", [], IIC_PUSH_SR>, TB, Requires<[In32BitMode]>;
+def PUSHGS16 : I<0xa8, RawFrm, (outs), (ins),
+ "push{w}\t{%gs|GS}", [], IIC_PUSH_SR>, OpSize, TB;
+def PUSHGS32 : I<0xa8, RawFrm, (outs), (ins),
+ "push{l}\t{%gs|GS}", [], IIC_PUSH_SR>, TB, Requires<[In32BitMode]>;
+
+def PUSHFS64 : I<0xa0, RawFrm, (outs), (ins),
+ "push{q}\t{%fs|FS}", [], IIC_PUSH_SR>, TB;
+def PUSHGS64 : I<0xa8, RawFrm, (outs), (ins),
+ "push{q}\t{%gs|GS}", [], IIC_PUSH_SR>, TB;
+
+// No "pop cs" instruction.
+def POPSS16 : I<0x17, RawFrm, (outs), (ins),
+ "pop{w}\t{%ss|SS}", [], IIC_POP_SR_SS>,
+ OpSize, Requires<[In32BitMode]>;
+def POPSS32 : I<0x17, RawFrm, (outs), (ins),
+ "pop{l}\t{%ss|SS}", [], IIC_POP_SR_SS>,
+ Requires<[In32BitMode]>;
+
+def POPDS16 : I<0x1F, RawFrm, (outs), (ins),
+ "pop{w}\t{%ds|DS}", [], IIC_POP_SR>,
+ OpSize, Requires<[In32BitMode]>;
+def POPDS32 : I<0x1F, RawFrm, (outs), (ins),
+ "pop{l}\t{%ds|DS}", [], IIC_POP_SR>,
+ Requires<[In32BitMode]>;
+
+def POPES16 : I<0x07, RawFrm, (outs), (ins),
+ "pop{w}\t{%es|ES}", [], IIC_POP_SR>,
+ OpSize, Requires<[In32BitMode]>;
+def POPES32 : I<0x07, RawFrm, (outs), (ins),
+ "pop{l}\t{%es|ES}", [], IIC_POP_SR>,
+ Requires<[In32BitMode]>;
+
+def POPFS16 : I<0xa1, RawFrm, (outs), (ins),
+ "pop{w}\t{%fs|FS}", [], IIC_POP_SR>, OpSize, TB;
+def POPFS32 : I<0xa1, RawFrm, (outs), (ins),
+ "pop{l}\t{%fs|FS}", [], IIC_POP_SR>, TB, Requires<[In32BitMode]>;
+def POPFS64 : I<0xa1, RawFrm, (outs), (ins),
+ "pop{q}\t{%fs|FS}", [], IIC_POP_SR>, TB;
+
+def POPGS16 : I<0xa9, RawFrm, (outs), (ins),
+ "pop{w}\t{%gs|GS}", [], IIC_POP_SR>, OpSize, TB;
+def POPGS32 : I<0xa9, RawFrm, (outs), (ins),
+ "pop{l}\t{%gs|GS}", [], IIC_POP_SR>, TB, Requires<[In32BitMode]>;
+def POPGS64 : I<0xa9, RawFrm, (outs), (ins),
+ "pop{q}\t{%gs|GS}", [], IIC_POP_SR>, TB;
+
+
+def LDS16rm : I<0xc5, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src),
+ "lds{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, OpSize;
+def LDS32rm : I<0xc5, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src),
+ "lds{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>;
+
+def LSS16rm : I<0xb2, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src),
+ "lss{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize;
+def LSS32rm : I<0xb2, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src),
+ "lss{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB;
+def LSS64rm : RI<0xb2, MRMSrcMem, (outs GR64:$dst), (ins opaque80mem:$src),
+ "lss{q}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB;
+
+def LES16rm : I<0xc4, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src),
+ "les{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, OpSize;
+def LES32rm : I<0xc4, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src),
+ "les{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>;
+
+def LFS16rm : I<0xb4, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src),
+ "lfs{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize;
+def LFS32rm : I<0xb4, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src),
+ "lfs{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB;
+def LFS64rm : RI<0xb4, MRMSrcMem, (outs GR64:$dst), (ins opaque80mem:$src),
+ "lfs{q}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB;
+
+def LGS16rm : I<0xb5, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src),
+ "lgs{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize;
+def LGS32rm : I<0xb5, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src),
+ "lgs{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB;
+
+def LGS64rm : RI<0xb5, MRMSrcMem, (outs GR64:$dst), (ins opaque80mem:$src),
+ "lgs{q}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB;
+
+
+def VERRr : I<0x00, MRM4r, (outs), (ins GR16:$seg),
+ "verr\t$seg", [], IIC_VERR>, TB;
+def VERRm : I<0x00, MRM4m, (outs), (ins i16mem:$seg),
+ "verr\t$seg", [], IIC_VERR>, TB;
+def VERWr : I<0x00, MRM5r, (outs), (ins GR16:$seg),
+ "verw\t$seg", [], IIC_VERW_MEM>, TB;
+def VERWm : I<0x00, MRM5m, (outs), (ins i16mem:$seg),
+ "verw\t$seg", [], IIC_VERW_REG>, TB;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Descriptor-table support instructions
+
+let SchedRW = [WriteSystem] in {
+def SGDT16m : I<0x01, MRM0m, (outs opaque48mem:$dst), (ins),
+ "sgdt{w}\t$dst", [], IIC_SGDT>, TB, OpSize, Requires<[In32BitMode]>;
+def SGDTm : I<0x01, MRM0m, (outs opaque48mem:$dst), (ins),
+ "sgdt\t$dst", [], IIC_SGDT>, TB;
+def SIDT16m : I<0x01, MRM1m, (outs opaque48mem:$dst), (ins),
+ "sidt{w}\t$dst", [], IIC_SIDT>, TB, OpSize, Requires<[In32BitMode]>;
+def SIDTm : I<0x01, MRM1m, (outs opaque48mem:$dst), (ins),
+ "sidt\t$dst", []>, TB;
+def SLDT16r : I<0x00, MRM0r, (outs GR16:$dst), (ins),
+ "sldt{w}\t$dst", [], IIC_SLDT>, TB, OpSize;
+def SLDT16m : I<0x00, MRM0m, (outs i16mem:$dst), (ins),
+ "sldt{w}\t$dst", [], IIC_SLDT>, TB;
+def SLDT32r : I<0x00, MRM0r, (outs GR32:$dst), (ins),
+ "sldt{l}\t$dst", [], IIC_SLDT>, TB;
+
+// LLDT is not interpreted specially in 64-bit mode because there is no sign
+// extension.
+def SLDT64r : RI<0x00, MRM0r, (outs GR64:$dst), (ins),
+ "sldt{q}\t$dst", [], IIC_SLDT>, TB;
+def SLDT64m : RI<0x00, MRM0m, (outs i16mem:$dst), (ins),
+ "sldt{q}\t$dst", [], IIC_SLDT>, TB;
+
+def LGDT16m : I<0x01, MRM2m, (outs), (ins opaque48mem:$src),
+ "lgdt{w}\t$src", [], IIC_LGDT>, TB, OpSize, Requires<[In32BitMode]>;
+def LGDTm : I<0x01, MRM2m, (outs), (ins opaque48mem:$src),
+ "lgdt\t$src", [], IIC_LGDT>, TB;
+def LIDT16m : I<0x01, MRM3m, (outs), (ins opaque48mem:$src),
+ "lidt{w}\t$src", [], IIC_LIDT>, TB, OpSize, Requires<[In32BitMode]>;
+def LIDTm : I<0x01, MRM3m, (outs), (ins opaque48mem:$src),
+ "lidt\t$src", [], IIC_LIDT>, TB;
+def LLDT16r : I<0x00, MRM2r, (outs), (ins GR16:$src),
+ "lldt{w}\t$src", [], IIC_LLDT_REG>, TB;
+def LLDT16m : I<0x00, MRM2m, (outs), (ins i16mem:$src),
+ "lldt{w}\t$src", [], IIC_LLDT_MEM>, TB;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Specialized register support
+let SchedRW = [WriteSystem] in {
+def WRMSR : I<0x30, RawFrm, (outs), (ins), "wrmsr", [], IIC_WRMSR>, TB;
+def RDMSR : I<0x32, RawFrm, (outs), (ins), "rdmsr", [], IIC_RDMSR>, TB;
+def RDPMC : I<0x33, RawFrm, (outs), (ins), "rdpmc", [], IIC_RDPMC>, TB;
+
+def SMSW16r : I<0x01, MRM4r, (outs GR16:$dst), (ins),
+ "smsw{w}\t$dst", [], IIC_SMSW>, OpSize, TB;
+def SMSW32r : I<0x01, MRM4r, (outs GR32:$dst), (ins),
+ "smsw{l}\t$dst", [], IIC_SMSW>, TB;
+// no m form encodable; use SMSW16m
+def SMSW64r : RI<0x01, MRM4r, (outs GR64:$dst), (ins),
+ "smsw{q}\t$dst", [], IIC_SMSW>, TB;
+
+// For memory operands, there is only a 16-bit form
+def SMSW16m : I<0x01, MRM4m, (outs i16mem:$dst), (ins),
+ "smsw{w}\t$dst", [], IIC_SMSW>, TB;
+
+def LMSW16r : I<0x01, MRM6r, (outs), (ins GR16:$src),
+ "lmsw{w}\t$src", [], IIC_LMSW_MEM>, TB;
+def LMSW16m : I<0x01, MRM6m, (outs), (ins i16mem:$src),
+ "lmsw{w}\t$src", [], IIC_LMSW_REG>, TB;
+
+def CPUID : I<0xA2, RawFrm, (outs), (ins), "cpuid", [], IIC_CPUID>, TB;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Cache instructions
+let SchedRW = [WriteSystem] in {
+def INVD : I<0x08, RawFrm, (outs), (ins), "invd", [], IIC_INVD>, TB;
+def WBINVD : I<0x09, RawFrm, (outs), (ins), "wbinvd", [], IIC_INVD>, TB;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// XSAVE instructions
+let SchedRW = [WriteSystem] in {
+let Defs = [RDX, RAX], Uses = [RCX] in
+ def XGETBV : I<0x01, MRM_D0, (outs), (ins), "xgetbv", []>, TB;
+
+let Uses = [RDX, RAX, RCX] in
+ def XSETBV : I<0x01, MRM_D1, (outs), (ins), "xsetbv", []>, TB;
+
+let Uses = [RDX, RAX] in {
+ def XSAVE : I<0xAE, MRM4m, (outs opaque512mem:$dst), (ins),
+ "xsave\t$dst", []>, TB;
+ def XSAVE64 : I<0xAE, MRM4m, (outs opaque512mem:$dst), (ins),
+ "xsave{q|64}\t$dst", []>, TB, REX_W, Requires<[In64BitMode]>;
+ def XRSTOR : I<0xAE, MRM5m, (outs), (ins opaque512mem:$dst),
+ "xrstor\t$dst", []>, TB;
+ def XRSTOR64 : I<0xAE, MRM5m, (outs), (ins opaque512mem:$dst),
+ "xrstor{q|64}\t$dst", []>, TB, REX_W, Requires<[In64BitMode]>;
+ def XSAVEOPT : I<0xAE, MRM6m, (outs opaque512mem:$dst), (ins),
+ "xsaveopt\t$dst", []>, TB;
+ def XSAVEOPT64 : I<0xAE, MRM6m, (outs opaque512mem:$dst), (ins),
+ "xsaveopt{q|64}\t$dst", []>, TB, REX_W, Requires<[In64BitMode]>;
+}
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// VIA PadLock crypto instructions
+let Defs = [RAX, RDI], Uses = [RDX, RDI] in
+ def XSTORE : I<0xc0, RawFrm, (outs), (ins), "xstore", []>, A7;
+
+def : InstAlias<"xstorerng", (XSTORE)>;
+
+let Defs = [RSI, RDI], Uses = [RBX, RDX, RSI, RDI] in {
+ def XCRYPTECB : I<0xc8, RawFrm, (outs), (ins), "xcryptecb", []>, A7;
+ def XCRYPTCBC : I<0xd0, RawFrm, (outs), (ins), "xcryptcbc", []>, A7;
+ def XCRYPTCTR : I<0xd8, RawFrm, (outs), (ins), "xcryptctr", []>, A7;
+ def XCRYPTCFB : I<0xe0, RawFrm, (outs), (ins), "xcryptcfb", []>, A7;
+ def XCRYPTOFB : I<0xe8, RawFrm, (outs), (ins), "xcryptofb", []>, A7;
+}
+
+let Defs = [RAX, RSI, RDI], Uses = [RAX, RSI, RDI] in {
+ def XSHA1 : I<0xc8, RawFrm, (outs), (ins), "xsha1", []>, A6;
+ def XSHA256 : I<0xd0, RawFrm, (outs), (ins), "xsha256", []>, A6;
+}
+let Defs = [RAX, RDX, RSI], Uses = [RAX, RSI] in
+ def MONTMUL : I<0xc0, RawFrm, (outs), (ins), "montmul", []>, A6;
+
+//===----------------------------------------------------------------------===//
+// FS/GS Base Instructions
+let Predicates = [HasFSGSBase, In64BitMode] in {
+ def RDFSBASE : I<0xAE, MRM0r, (outs GR32:$dst), (ins),
+ "rdfsbase{l}\t$dst",
+ [(set GR32:$dst, (int_x86_rdfsbase_32))]>, TB, XS;
+ def RDFSBASE64 : RI<0xAE, MRM0r, (outs GR64:$dst), (ins),
+ "rdfsbase{q}\t$dst",
+ [(set GR64:$dst, (int_x86_rdfsbase_64))]>, TB, XS;
+ def RDGSBASE : I<0xAE, MRM1r, (outs GR32:$dst), (ins),
+ "rdgsbase{l}\t$dst",
+ [(set GR32:$dst, (int_x86_rdgsbase_32))]>, TB, XS;
+ def RDGSBASE64 : RI<0xAE, MRM1r, (outs GR64:$dst), (ins),
+ "rdgsbase{q}\t$dst",
+ [(set GR64:$dst, (int_x86_rdgsbase_64))]>, TB, XS;
+ def WRFSBASE : I<0xAE, MRM2r, (outs), (ins GR32:$src),
+ "wrfsbase{l}\t$src",
+ [(int_x86_wrfsbase_32 GR32:$src)]>, TB, XS;
+ def WRFSBASE64 : RI<0xAE, MRM2r, (outs), (ins GR64:$src),
+ "wrfsbase{q}\t$src",
+ [(int_x86_wrfsbase_64 GR64:$src)]>, TB, XS;
+ def WRGSBASE : I<0xAE, MRM3r, (outs), (ins GR32:$src),
+ "wrgsbase{l}\t$src",
+ [(int_x86_wrgsbase_32 GR32:$src)]>, TB, XS;
+ def WRGSBASE64 : RI<0xAE, MRM3r, (outs), (ins GR64:$src),
+ "wrgsbase{q}\t$src",
+ [(int_x86_wrgsbase_64 GR64:$src)]>, TB, XS;
+}
+
+//===----------------------------------------------------------------------===//
+// INVPCID Instruction
+def INVPCID32 : I<0x82, MRMSrcMem, (outs), (ins GR32:$src1, i128mem:$src2),
+ "invpcid\t{$src2, $src1|$src1, $src2}", []>, OpSize, T8,
+ Requires<[In32BitMode]>;
+def INVPCID64 : I<0x82, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2),
+ "invpcid\t{$src2, $src1|$src1, $src2}", []>, OpSize, T8,
+ Requires<[In64BitMode]>;
+
+//===----------------------------------------------------------------------===//
+// SMAP Instruction
+let Defs = [EFLAGS], Uses = [EFLAGS] in {
+ def CLAC : I<0x01, MRM_CA, (outs), (ins), "clac", []>, TB;
+ def STAC : I<0x01, MRM_CB, (outs), (ins), "stac", []>, TB;
+}
diff --git a/contrib/llvm/lib/Target/X86/X86InstrTSX.td b/contrib/llvm/lib/Target/X86/X86InstrTSX.td
new file mode 100644
index 0000000..363a190
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrTSX.td
@@ -0,0 +1,39 @@
+//===-- X86InstrVMX.td - TSX Instruction Set Extension -----*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the instructions that make up the Intel TSX instruction
+// set.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// TSX instructions
+
+def X86xtest: SDNode<"X86ISD::XTEST", SDTypeProfile<1, 0, [SDTCisVT<0, i32>]>,
+ [SDNPHasChain, SDNPSideEffect]>;
+
+let usesCustomInserter = 1 in
+def XBEGIN : I<0, Pseudo, (outs GR32:$dst), (ins),
+ "# XBEGIN", [(set GR32:$dst, (int_x86_xbegin))]>,
+ Requires<[HasRTM]>;
+
+let isBranch = 1, isTerminator = 1, Defs = [EAX] in
+def XBEGIN_4 : Ii32PCRel<0xc7, MRM_F8, (outs), (ins brtarget:$dst),
+ "xbegin\t$dst", []>, Requires<[HasRTM]>;
+
+def XEND : I<0x01, MRM_D5, (outs), (ins),
+ "xend", [(int_x86_xend)]>, TB, Requires<[HasRTM]>;
+
+let Defs = [EFLAGS] in
+def XTEST : I<0x01, MRM_D6, (outs), (ins),
+ "xtest", [(set EFLAGS, (X86xtest))]>, TB, Requires<[HasTSX]>;
+
+def XABORT : Ii8<0xc6, MRM_F8, (outs), (ins i8imm:$imm),
+ "xabort\t$imm",
+ [(int_x86_xabort imm:$imm)]>, Requires<[HasRTM]>;
diff --git a/contrib/llvm/lib/Target/X86/X86InstrVMX.td b/contrib/llvm/lib/Target/X86/X86InstrVMX.td
new file mode 100644
index 0000000..6d3548f
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrVMX.td
@@ -0,0 +1,66 @@
+//===-- X86InstrVMX.td - VMX Instruction Set Extension -----*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the instructions that make up the Intel VMX instruction
+// set.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// VMX instructions
+
+// 66 0F 38 80
+def INVEPT32 : I<0x80, MRMSrcMem, (outs), (ins GR32:$src1, i128mem:$src2),
+ "invept\t{$src2, $src1|$src1, $src2}", []>, OpSize, T8,
+ Requires<[In32BitMode]>;
+def INVEPT64 : I<0x80, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2),
+ "invept\t{$src2, $src1|$src1, $src2}", []>, OpSize, T8,
+ Requires<[In64BitMode]>;
+// 66 0F 38 81
+def INVVPID32 : I<0x81, MRMSrcMem, (outs), (ins GR32:$src1, i128mem:$src2),
+ "invvpid\t{$src2, $src1|$src1, $src2}", []>, OpSize, T8,
+ Requires<[In32BitMode]>;
+def INVVPID64 : I<0x81, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2),
+ "invvpid\t{$src2, $src1|$src1, $src2}", []>, OpSize, T8,
+ Requires<[In64BitMode]>;
+// 0F 01 C1
+def VMCALL : I<0x01, MRM_C1, (outs), (ins), "vmcall", []>, TB;
+def VMCLEARm : I<0xC7, MRM6m, (outs), (ins i64mem:$vmcs),
+ "vmclear\t$vmcs", []>, OpSize, TB;
+// OF 01 D4
+def VMFUNC : I<0x01, MRM_D4, (outs), (ins), "vmfunc", []>, TB;
+// 0F 01 C2
+def VMLAUNCH : I<0x01, MRM_C2, (outs), (ins), "vmlaunch", []>, TB;
+// 0F 01 C3
+def VMRESUME : I<0x01, MRM_C3, (outs), (ins), "vmresume", []>, TB;
+def VMPTRLDm : I<0xC7, MRM6m, (outs), (ins i64mem:$vmcs),
+ "vmptrld\t$vmcs", []>, TB;
+def VMPTRSTm : I<0xC7, MRM7m, (outs i64mem:$vmcs), (ins),
+ "vmptrst\t$vmcs", []>, TB;
+def VMREAD64rm : I<0x78, MRMDestMem, (outs i64mem:$dst), (ins GR64:$src),
+ "vmread{q}\t{$src, $dst|$dst, $src}", []>, TB, Requires<[In64BitMode]>;
+def VMREAD64rr : I<0x78, MRMDestReg, (outs GR64:$dst), (ins GR64:$src),
+ "vmread{q}\t{$src, $dst|$dst, $src}", []>, TB, Requires<[In64BitMode]>;
+def VMREAD32rm : I<0x78, MRMDestMem, (outs i32mem:$dst), (ins GR32:$src),
+ "vmread{l}\t{$src, $dst|$dst, $src}", []>, TB, Requires<[In32BitMode]>;
+def VMREAD32rr : I<0x78, MRMDestReg, (outs GR32:$dst), (ins GR32:$src),
+ "vmread{l}\t{$src, $dst|$dst, $src}", []>, TB, Requires<[In32BitMode]>;
+def VMWRITE64rm : I<0x79, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+ "vmwrite{q}\t{$src, $dst|$dst, $src}", []>, TB, Requires<[In64BitMode]>;
+def VMWRITE64rr : I<0x79, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+ "vmwrite{q}\t{$src, $dst|$dst, $src}", []>, TB, Requires<[In64BitMode]>;
+def VMWRITE32rm : I<0x79, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+ "vmwrite{l}\t{$src, $dst|$dst, $src}", []>, TB, Requires<[In32BitMode]>;
+def VMWRITE32rr : I<0x79, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+ "vmwrite{l}\t{$src, $dst|$dst, $src}", []>, TB, Requires<[In32BitMode]>;
+// 0F 01 C4
+def VMXOFF : I<0x01, MRM_C4, (outs), (ins), "vmxoff", []>, TB;
+def VMXON : I<0xC7, MRM6m, (outs), (ins i64mem:$vmxon),
+ "vmxon\t$vmxon", []>, XS;
+
diff --git a/contrib/llvm/lib/Target/X86/X86InstrXOP.td b/contrib/llvm/lib/Target/X86/X86InstrXOP.td
new file mode 100644
index 0000000..2aa08fa
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrXOP.td
@@ -0,0 +1,311 @@
+//===-- X86InstrXOP.td - XOP Instruction Set ---------------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes XOP (eXtended OPerations)
+//
+//===----------------------------------------------------------------------===//
+
+multiclass xop2op<bits<8> opc, string OpcodeStr, Intrinsic Int, PatFrag memop> {
+ def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (Int VR128:$src))]>, VEX;
+ def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (Int (bitconvert (memop addr:$src))))]>, VEX;
+}
+
+let isAsmParserOnly = 1 in {
+ defm VPHSUBWD : xop2op<0xE2, "vphsubwd", int_x86_xop_vphsubwd, memopv2i64>;
+ defm VPHSUBDQ : xop2op<0xE3, "vphsubdq", int_x86_xop_vphsubdq, memopv2i64>;
+ defm VPHSUBBW : xop2op<0xE1, "vphsubbw", int_x86_xop_vphsubbw, memopv2i64>;
+ defm VPHADDWQ : xop2op<0xC7, "vphaddwq", int_x86_xop_vphaddwq, memopv2i64>;
+ defm VPHADDWD : xop2op<0xC6, "vphaddwd", int_x86_xop_vphaddwd, memopv2i64>;
+ defm VPHADDUWQ : xop2op<0xD7, "vphadduwq", int_x86_xop_vphadduwq, memopv2i64>;
+ defm VPHADDUWD : xop2op<0xD6, "vphadduwd", int_x86_xop_vphadduwd, memopv2i64>;
+ defm VPHADDUDQ : xop2op<0xDB, "vphaddudq", int_x86_xop_vphaddudq, memopv2i64>;
+ defm VPHADDUBW : xop2op<0xD1, "vphaddubw", int_x86_xop_vphaddubw, memopv2i64>;
+ defm VPHADDUBQ : xop2op<0xD3, "vphaddubq", int_x86_xop_vphaddubq, memopv2i64>;
+ defm VPHADDUBD : xop2op<0xD2, "vphaddubd", int_x86_xop_vphaddubd, memopv2i64>;
+ defm VPHADDDQ : xop2op<0xCB, "vphadddq", int_x86_xop_vphadddq, memopv2i64>;
+ defm VPHADDBW : xop2op<0xC1, "vphaddbw", int_x86_xop_vphaddbw, memopv2i64>;
+ defm VPHADDBQ : xop2op<0xC3, "vphaddbq", int_x86_xop_vphaddbq, memopv2i64>;
+ defm VPHADDBD : xop2op<0xC2, "vphaddbd", int_x86_xop_vphaddbd, memopv2i64>;
+}
+
+// Scalar load 2 addr operand instructions
+multiclass xop2opsld<bits<8> opc, string OpcodeStr, Intrinsic Int,
+ Operand memop, ComplexPattern mem_cpat> {
+ def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (Int VR128:$src))]>, VEX;
+ def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins memop:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (Int (bitconvert mem_cpat:$src)))]>, VEX;
+}
+
+let isAsmParserOnly = 1 in {
+ defm VFRCZSS : xop2opsld<0x82, "vfrczss", int_x86_xop_vfrcz_ss,
+ ssmem, sse_load_f32>;
+ defm VFRCZSD : xop2opsld<0x83, "vfrczsd", int_x86_xop_vfrcz_sd,
+ sdmem, sse_load_f64>;
+}
+
+multiclass xop2op128<bits<8> opc, string OpcodeStr, Intrinsic Int,
+ PatFrag memop> {
+ def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (Int VR128:$src))]>, VEX;
+ def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (Int (bitconvert (memop addr:$src))))]>, VEX;
+}
+
+let isAsmParserOnly = 1 in {
+ defm VFRCZPS : xop2op128<0x80, "vfrczps", int_x86_xop_vfrcz_ps, memopv4f32>;
+ defm VFRCZPD : xop2op128<0x81, "vfrczpd", int_x86_xop_vfrcz_pd, memopv2f64>;
+}
+
+multiclass xop2op256<bits<8> opc, string OpcodeStr, Intrinsic Int,
+ PatFrag memop> {
+ def rrY : IXOP<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR256:$dst, (Int VR256:$src))]>, VEX, VEX_L;
+ def rmY : IXOP<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR256:$dst, (Int (bitconvert (memop addr:$src))))]>, VEX, VEX_L;
+}
+
+let isAsmParserOnly = 1 in {
+ defm VFRCZPS : xop2op256<0x80, "vfrczps", int_x86_xop_vfrcz_ps_256,
+ memopv8f32>;
+ defm VFRCZPD : xop2op256<0x81, "vfrczpd", int_x86_xop_vfrcz_pd_256,
+ memopv4f64>;
+}
+
+multiclass xop3op<bits<8> opc, string OpcodeStr, Intrinsic Int> {
+ def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst, (Int VR128:$src1, VR128:$src2))]>, VEX_4VOp3;
+ def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst,
+ (Int VR128:$src1, (bitconvert (memopv2i64 addr:$src2))))]>,
+ VEX_4V, VEX_W;
+ def mr : IXOP<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins i128mem:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst,
+ (Int (bitconvert (memopv2i64 addr:$src1)), VR128:$src2))]>,
+ VEX_4VOp3;
+}
+
+let isAsmParserOnly = 1 in {
+ defm VPSHLW : xop3op<0x95, "vpshlw", int_x86_xop_vpshlw>;
+ defm VPSHLQ : xop3op<0x97, "vpshlq", int_x86_xop_vpshlq>;
+ defm VPSHLD : xop3op<0x96, "vpshld", int_x86_xop_vpshld>;
+ defm VPSHLB : xop3op<0x94, "vpshlb", int_x86_xop_vpshlb>;
+ defm VPSHAW : xop3op<0x99, "vpshaw", int_x86_xop_vpshaw>;
+ defm VPSHAQ : xop3op<0x9B, "vpshaq", int_x86_xop_vpshaq>;
+ defm VPSHAD : xop3op<0x9A, "vpshad", int_x86_xop_vpshad>;
+ defm VPSHAB : xop3op<0x98, "vpshab", int_x86_xop_vpshab>;
+ defm VPROTW : xop3op<0x91, "vprotw", int_x86_xop_vprotw>;
+ defm VPROTQ : xop3op<0x93, "vprotq", int_x86_xop_vprotq>;
+ defm VPROTD : xop3op<0x92, "vprotd", int_x86_xop_vprotd>;
+ defm VPROTB : xop3op<0x90, "vprotb", int_x86_xop_vprotb>;
+}
+
+multiclass xop3opimm<bits<8> opc, string OpcodeStr, Intrinsic Int> {
+ def ri : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, i8imm:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst, (Int VR128:$src1, imm:$src2))]>, VEX;
+ def mi : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins i128mem:$src1, i8imm:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst,
+ (Int (bitconvert (memopv2i64 addr:$src1)), imm:$src2))]>, VEX;
+}
+
+let isAsmParserOnly = 1 in {
+ defm VPROTW : xop3opimm<0xC1, "vprotw", int_x86_xop_vprotwi>;
+ defm VPROTQ : xop3opimm<0xC3, "vprotq", int_x86_xop_vprotqi>;
+ defm VPROTD : xop3opimm<0xC2, "vprotd", int_x86_xop_vprotdi>;
+ defm VPROTB : xop3opimm<0xC0, "vprotb", int_x86_xop_vprotbi>;
+}
+
+// Instruction where second source can be memory, but third must be register
+multiclass xop4opm2<bits<8> opc, string OpcodeStr, Intrinsic Int> {
+ def rr : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, VR128:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR128:$dst,
+ (Int VR128:$src1, VR128:$src2, VR128:$src3))]>, VEX_4V, VEX_I8IMM;
+ def rm : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2, VR128:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR128:$dst,
+ (Int VR128:$src1, (bitconvert (memopv2i64 addr:$src2)),
+ VR128:$src3))]>, VEX_4V, VEX_I8IMM;
+}
+
+let isAsmParserOnly = 1 in {
+ defm VPMADCSWD : xop4opm2<0xB6, "vpmadcswd", int_x86_xop_vpmadcswd>;
+ defm VPMADCSSWD : xop4opm2<0xA6, "vpmadcsswd", int_x86_xop_vpmadcsswd>;
+ defm VPMACSWW : xop4opm2<0x95, "vpmacsww", int_x86_xop_vpmacsww>;
+ defm VPMACSWD : xop4opm2<0x96, "vpmacswd", int_x86_xop_vpmacswd>;
+ defm VPMACSSWW : xop4opm2<0x85, "vpmacssww", int_x86_xop_vpmacssww>;
+ defm VPMACSSWD : xop4opm2<0x86, "vpmacsswd", int_x86_xop_vpmacsswd>;
+ defm VPMACSSDQL : xop4opm2<0x87, "vpmacssdql", int_x86_xop_vpmacssdql>;
+ defm VPMACSSDQH : xop4opm2<0x8F, "vpmacssdqh", int_x86_xop_vpmacssdqh>;
+ defm VPMACSSDD : xop4opm2<0x8E, "vpmacssdd", int_x86_xop_vpmacssdd>;
+ defm VPMACSDQL : xop4opm2<0x97, "vpmacsdql", int_x86_xop_vpmacsdql>;
+ defm VPMACSDQH : xop4opm2<0x9F, "vpmacsdqh", int_x86_xop_vpmacsdqh>;
+ defm VPMACSDD : xop4opm2<0x9E, "vpmacsdd", int_x86_xop_vpmacsdd>;
+}
+
+// Instruction where second source can be memory, third must be imm8
+multiclass xop4opimm<bits<8> opc, string OpcodeStr, Intrinsic Int> {
+ def ri : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, i8imm:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR128:$dst, (Int VR128:$src1, VR128:$src2, imm:$src3))]>,
+ VEX_4V;
+ def mi : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR128:$dst,
+ (Int VR128:$src1, (bitconvert (memopv2i64 addr:$src2)),
+ imm:$src3))]>, VEX_4V;
+}
+
+let isAsmParserOnly = 1 in {
+ defm VPCOMB : xop4opimm<0xCC, "vpcomb", int_x86_xop_vpcomb>;
+ defm VPCOMW : xop4opimm<0xCD, "vpcomw", int_x86_xop_vpcomw>;
+ defm VPCOMD : xop4opimm<0xCE, "vpcomd", int_x86_xop_vpcomd>;
+ defm VPCOMQ : xop4opimm<0xCF, "vpcomq", int_x86_xop_vpcomq>;
+ defm VPCOMUB : xop4opimm<0xEC, "vpcomub", int_x86_xop_vpcomub>;
+ defm VPCOMUW : xop4opimm<0xED, "vpcomuw", int_x86_xop_vpcomuw>;
+ defm VPCOMUD : xop4opimm<0xEE, "vpcomud", int_x86_xop_vpcomud>;
+ defm VPCOMUQ : xop4opimm<0xEF, "vpcomuq", int_x86_xop_vpcomuq>;
+}
+
+// Instruction where either second or third source can be memory
+multiclass xop4op<bits<8> opc, string OpcodeStr, Intrinsic Int> {
+ def rr : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, VR128:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR128:$dst, (Int VR128:$src1, VR128:$src2, VR128:$src3))]>,
+ VEX_4V, VEX_I8IMM;
+ def rm : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, i128mem:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR128:$dst,
+ (Int VR128:$src1, VR128:$src2,
+ (bitconvert (memopv2i64 addr:$src3))))]>,
+ VEX_4V, VEX_I8IMM, VEX_W, MemOp4;
+ def mr : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2, VR128:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR128:$dst,
+ (Int VR128:$src1, (bitconvert (memopv2i64 addr:$src2)),
+ VR128:$src3))]>,
+ VEX_4V, VEX_I8IMM;
+}
+
+let isAsmParserOnly = 1 in {
+ defm VPPERM : xop4op<0xA3, "vpperm", int_x86_xop_vpperm>;
+ defm VPCMOV : xop4op<0xA2, "vpcmov", int_x86_xop_vpcmov>;
+}
+
+multiclass xop4op256<bits<8> opc, string OpcodeStr, Intrinsic Int> {
+ def rrY : IXOPi8<opc, MRMSrcReg, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2, VR256:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR256:$dst, (Int VR256:$src1, VR256:$src2, VR256:$src3))]>,
+ VEX_4V, VEX_I8IMM, VEX_L;
+ def rmY : IXOPi8<opc, MRMSrcMem, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2, i256mem:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR256:$dst,
+ (Int VR256:$src1, VR256:$src2,
+ (bitconvert (memopv4i64 addr:$src3))))]>,
+ VEX_4V, VEX_I8IMM, VEX_W, MemOp4, VEX_L;
+ def mrY : IXOPi8<opc, MRMSrcMem, (outs VR256:$dst),
+ (ins VR256:$src1, f256mem:$src2, VR256:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR256:$dst,
+ (Int VR256:$src1, (bitconvert (memopv4i64 addr:$src2)),
+ VR256:$src3))]>,
+ VEX_4V, VEX_I8IMM, VEX_L;
+}
+
+let isAsmParserOnly = 1 in {
+ defm VPCMOV : xop4op256<0xA2, "vpcmov", int_x86_xop_vpcmov_256>;
+}
+
+multiclass xop5op<bits<8> opc, string OpcodeStr, Intrinsic Int128,
+ Intrinsic Int256, PatFrag ld_128, PatFrag ld_256> {
+ def rr : IXOP5<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, VR128:$src3, i8imm:$src4),
+ !strconcat(OpcodeStr,
+ "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
+ [(set VR128:$dst,
+ (Int128 VR128:$src1, VR128:$src2, VR128:$src3, imm:$src4))]>;
+ def rm : IXOP5<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, f128mem:$src3, i8imm:$src4),
+ !strconcat(OpcodeStr,
+ "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
+ [(set VR128:$dst,
+ (Int128 VR128:$src1, VR128:$src2, (ld_128 addr:$src3), imm:$src4))]>,
+ VEX_W, MemOp4;
+ def mr : IXOP5<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, f128mem:$src2, VR128:$src3, i8imm:$src4),
+ !strconcat(OpcodeStr,
+ "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
+ [(set VR128:$dst,
+ (Int128 VR128:$src1, (ld_128 addr:$src2), VR128:$src3, imm:$src4))]>;
+ def rrY : IXOP5<opc, MRMSrcReg, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2, VR256:$src3, i8imm:$src4),
+ !strconcat(OpcodeStr,
+ "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
+ [(set VR256:$dst,
+ (Int256 VR256:$src1, VR256:$src2, VR256:$src3, imm:$src4))]>, VEX_L;
+ def rmY : IXOP5<opc, MRMSrcMem, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2, f256mem:$src3, i8imm:$src4),
+ !strconcat(OpcodeStr,
+ "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
+ [(set VR256:$dst,
+ (Int256 VR256:$src1, VR256:$src2, (ld_256 addr:$src3), imm:$src4))]>,
+ VEX_W, MemOp4, VEX_L;
+ def mrY : IXOP5<opc, MRMSrcMem, (outs VR256:$dst),
+ (ins VR256:$src1, f256mem:$src2, VR256:$src3, i8imm:$src4),
+ !strconcat(OpcodeStr,
+ "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
+ [(set VR256:$dst,
+ (Int256 VR256:$src1, (ld_256 addr:$src2), VR256:$src3, imm:$src4))]>,
+ VEX_L;
+}
+
+defm VPERMIL2PD : xop5op<0x49, "vpermil2pd", int_x86_xop_vpermil2pd,
+ int_x86_xop_vpermil2pd_256, memopv2f64, memopv4f64>;
+defm VPERMIL2PS : xop5op<0x48, "vpermil2ps", int_x86_xop_vpermil2ps,
+ int_x86_xop_vpermil2ps_256, memopv4f32, memopv8f32>;
+
diff --git a/contrib/llvm/lib/Target/X86/X86JITInfo.cpp b/contrib/llvm/lib/Target/X86/X86JITInfo.cpp
new file mode 100644
index 0000000..44d8cce
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86JITInfo.cpp
@@ -0,0 +1,581 @@
+//===-- X86JITInfo.cpp - Implement the JIT interfaces for the X86 target --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the JIT interfaces for the X86 target.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "jit"
+#include "X86JITInfo.h"
+#include "X86Relocations.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Valgrind.h"
+#include <cstdlib>
+#include <cstring>
+using namespace llvm;
+
+// Determine the platform we're running on
+#if defined (__x86_64__) || defined (_M_AMD64) || defined (_M_X64)
+# define X86_64_JIT
+#elif defined(__i386__) || defined(i386) || defined(_M_IX86)
+# define X86_32_JIT
+#endif
+
+void X86JITInfo::replaceMachineCodeForFunction(void *Old, void *New) {
+ unsigned char *OldByte = (unsigned char *)Old;
+ *OldByte++ = 0xE9; // Emit JMP opcode.
+ unsigned *OldWord = (unsigned *)OldByte;
+ unsigned NewAddr = (intptr_t)New;
+ unsigned OldAddr = (intptr_t)OldWord;
+ *OldWord = NewAddr - OldAddr - 4; // Emit PC-relative addr of New code.
+
+ // X86 doesn't need to invalidate the processor cache, so just invalidate
+ // Valgrind's cache directly.
+ sys::ValgrindDiscardTranslations(Old, 5);
+}
+
+
+/// JITCompilerFunction - This contains the address of the JIT function used to
+/// compile a function lazily.
+static TargetJITInfo::JITCompilerFn JITCompilerFunction;
+
+// Get the ASMPREFIX for the current host. This is often '_'.
+#ifndef __USER_LABEL_PREFIX__
+#define __USER_LABEL_PREFIX__
+#endif
+#define GETASMPREFIX2(X) #X
+#define GETASMPREFIX(X) GETASMPREFIX2(X)
+#define ASMPREFIX GETASMPREFIX(__USER_LABEL_PREFIX__)
+
+// For ELF targets, use a .size and .type directive, to let tools
+// know the extent of functions defined in assembler.
+#if defined(__ELF__)
+# define SIZE(sym) ".size " #sym ", . - " #sym "\n"
+# define TYPE_FUNCTION(sym) ".type " #sym ", @function\n"
+#else
+# define SIZE(sym)
+# define TYPE_FUNCTION(sym)
+#endif
+
+// Provide a convenient way for disabling usage of CFI directives.
+// This is needed for old/broken assemblers (for example, gas on
+// Darwin is pretty old and doesn't support these directives)
+#if defined(__APPLE__)
+# define CFI(x)
+#else
+// FIXME: Disable this until we really want to use it. Also, we will
+// need to add some workarounds for compilers, which support
+// only subset of these directives.
+# define CFI(x)
+#endif
+
+// Provide a wrapper for LLVMX86CompilationCallback2 that saves non-traditional
+// callee saved registers, for the fastcc calling convention.
+extern "C" {
+#if defined(X86_64_JIT)
+# ifndef _MSC_VER
+ // No need to save EAX/EDX for X86-64.
+ void X86CompilationCallback(void);
+ asm(
+ ".text\n"
+ ".align 8\n"
+ ".globl " ASMPREFIX "X86CompilationCallback\n"
+ TYPE_FUNCTION(X86CompilationCallback)
+ ASMPREFIX "X86CompilationCallback:\n"
+ CFI(".cfi_startproc\n")
+ // Save RBP
+ "pushq %rbp\n"
+ CFI(".cfi_def_cfa_offset 16\n")
+ CFI(".cfi_offset %rbp, -16\n")
+ // Save RSP
+ "movq %rsp, %rbp\n"
+ CFI(".cfi_def_cfa_register %rbp\n")
+ // Save all int arg registers
+ "pushq %rdi\n"
+ CFI(".cfi_rel_offset %rdi, 0\n")
+ "pushq %rsi\n"
+ CFI(".cfi_rel_offset %rsi, 8\n")
+ "pushq %rdx\n"
+ CFI(".cfi_rel_offset %rdx, 16\n")
+ "pushq %rcx\n"
+ CFI(".cfi_rel_offset %rcx, 24\n")
+ "pushq %r8\n"
+ CFI(".cfi_rel_offset %r8, 32\n")
+ "pushq %r9\n"
+ CFI(".cfi_rel_offset %r9, 40\n")
+ // Align stack on 16-byte boundary. ESP might not be properly aligned
+ // (8 byte) if this is called from an indirect stub.
+ "andq $-16, %rsp\n"
+ // Save all XMM arg registers
+ "subq $128, %rsp\n"
+ "movaps %xmm0, (%rsp)\n"
+ "movaps %xmm1, 16(%rsp)\n"
+ "movaps %xmm2, 32(%rsp)\n"
+ "movaps %xmm3, 48(%rsp)\n"
+ "movaps %xmm4, 64(%rsp)\n"
+ "movaps %xmm5, 80(%rsp)\n"
+ "movaps %xmm6, 96(%rsp)\n"
+ "movaps %xmm7, 112(%rsp)\n"
+ // JIT callee
+#ifdef _WIN64
+ "subq $32, %rsp\n"
+ "movq %rbp, %rcx\n" // Pass prev frame and return address
+ "movq 8(%rbp), %rdx\n"
+ "call " ASMPREFIX "LLVMX86CompilationCallback2\n"
+ "addq $32, %rsp\n"
+#else
+ "movq %rbp, %rdi\n" // Pass prev frame and return address
+ "movq 8(%rbp), %rsi\n"
+ "call " ASMPREFIX "LLVMX86CompilationCallback2\n"
+#endif
+ // Restore all XMM arg registers
+ "movaps 112(%rsp), %xmm7\n"
+ "movaps 96(%rsp), %xmm6\n"
+ "movaps 80(%rsp), %xmm5\n"
+ "movaps 64(%rsp), %xmm4\n"
+ "movaps 48(%rsp), %xmm3\n"
+ "movaps 32(%rsp), %xmm2\n"
+ "movaps 16(%rsp), %xmm1\n"
+ "movaps (%rsp), %xmm0\n"
+ // Restore RSP
+ "movq %rbp, %rsp\n"
+ CFI(".cfi_def_cfa_register %rsp\n")
+ // Restore all int arg registers
+ "subq $48, %rsp\n"
+ CFI(".cfi_adjust_cfa_offset 48\n")
+ "popq %r9\n"
+ CFI(".cfi_adjust_cfa_offset -8\n")
+ CFI(".cfi_restore %r9\n")
+ "popq %r8\n"
+ CFI(".cfi_adjust_cfa_offset -8\n")
+ CFI(".cfi_restore %r8\n")
+ "popq %rcx\n"
+ CFI(".cfi_adjust_cfa_offset -8\n")
+ CFI(".cfi_restore %rcx\n")
+ "popq %rdx\n"
+ CFI(".cfi_adjust_cfa_offset -8\n")
+ CFI(".cfi_restore %rdx\n")
+ "popq %rsi\n"
+ CFI(".cfi_adjust_cfa_offset -8\n")
+ CFI(".cfi_restore %rsi\n")
+ "popq %rdi\n"
+ CFI(".cfi_adjust_cfa_offset -8\n")
+ CFI(".cfi_restore %rdi\n")
+ // Restore RBP
+ "popq %rbp\n"
+ CFI(".cfi_adjust_cfa_offset -8\n")
+ CFI(".cfi_restore %rbp\n")
+ "ret\n"
+ CFI(".cfi_endproc\n")
+ SIZE(X86CompilationCallback)
+ );
+# else
+ // No inline assembler support on this platform. The routine is in external
+ // file.
+ void X86CompilationCallback();
+
+# endif
+#elif defined (X86_32_JIT)
+# ifndef _MSC_VER
+ void X86CompilationCallback(void);
+ asm(
+ ".text\n"
+ ".align 8\n"
+ ".globl " ASMPREFIX "X86CompilationCallback\n"
+ TYPE_FUNCTION(X86CompilationCallback)
+ ASMPREFIX "X86CompilationCallback:\n"
+ CFI(".cfi_startproc\n")
+ "pushl %ebp\n"
+ CFI(".cfi_def_cfa_offset 8\n")
+ CFI(".cfi_offset %ebp, -8\n")
+ "movl %esp, %ebp\n" // Standard prologue
+ CFI(".cfi_def_cfa_register %ebp\n")
+ "pushl %eax\n"
+ CFI(".cfi_rel_offset %eax, 0\n")
+ "pushl %edx\n" // Save EAX/EDX/ECX
+ CFI(".cfi_rel_offset %edx, 4\n")
+ "pushl %ecx\n"
+ CFI(".cfi_rel_offset %ecx, 8\n")
+# if defined(__APPLE__)
+ "andl $-16, %esp\n" // Align ESP on 16-byte boundary
+# endif
+ "subl $16, %esp\n"
+ "movl 4(%ebp), %eax\n" // Pass prev frame and return address
+ "movl %eax, 4(%esp)\n"
+ "movl %ebp, (%esp)\n"
+ "call " ASMPREFIX "LLVMX86CompilationCallback2\n"
+ "movl %ebp, %esp\n" // Restore ESP
+ CFI(".cfi_def_cfa_register %esp\n")
+ "subl $12, %esp\n"
+ CFI(".cfi_adjust_cfa_offset 12\n")
+ "popl %ecx\n"
+ CFI(".cfi_adjust_cfa_offset -4\n")
+ CFI(".cfi_restore %ecx\n")
+ "popl %edx\n"
+ CFI(".cfi_adjust_cfa_offset -4\n")
+ CFI(".cfi_restore %edx\n")
+ "popl %eax\n"
+ CFI(".cfi_adjust_cfa_offset -4\n")
+ CFI(".cfi_restore %eax\n")
+ "popl %ebp\n"
+ CFI(".cfi_adjust_cfa_offset -4\n")
+ CFI(".cfi_restore %ebp\n")
+ "ret\n"
+ CFI(".cfi_endproc\n")
+ SIZE(X86CompilationCallback)
+ );
+
+ // Same as X86CompilationCallback but also saves XMM argument registers.
+ void X86CompilationCallback_SSE(void);
+ asm(
+ ".text\n"
+ ".align 8\n"
+ ".globl " ASMPREFIX "X86CompilationCallback_SSE\n"
+ TYPE_FUNCTION(X86CompilationCallback_SSE)
+ ASMPREFIX "X86CompilationCallback_SSE:\n"
+ CFI(".cfi_startproc\n")
+ "pushl %ebp\n"
+ CFI(".cfi_def_cfa_offset 8\n")
+ CFI(".cfi_offset %ebp, -8\n")
+ "movl %esp, %ebp\n" // Standard prologue
+ CFI(".cfi_def_cfa_register %ebp\n")
+ "pushl %eax\n"
+ CFI(".cfi_rel_offset %eax, 0\n")
+ "pushl %edx\n" // Save EAX/EDX/ECX
+ CFI(".cfi_rel_offset %edx, 4\n")
+ "pushl %ecx\n"
+ CFI(".cfi_rel_offset %ecx, 8\n")
+ "andl $-16, %esp\n" // Align ESP on 16-byte boundary
+ // Save all XMM arg registers
+ "subl $64, %esp\n"
+ // FIXME: provide frame move information for xmm registers.
+ // This can be tricky, because CFA register is ebp (unaligned)
+ // and we need to produce offsets relative to it.
+ "movaps %xmm0, (%esp)\n"
+ "movaps %xmm1, 16(%esp)\n"
+ "movaps %xmm2, 32(%esp)\n"
+ "movaps %xmm3, 48(%esp)\n"
+ "subl $16, %esp\n"
+ "movl 4(%ebp), %eax\n" // Pass prev frame and return address
+ "movl %eax, 4(%esp)\n"
+ "movl %ebp, (%esp)\n"
+ "call " ASMPREFIX "LLVMX86CompilationCallback2\n"
+ "addl $16, %esp\n"
+ "movaps 48(%esp), %xmm3\n"
+ CFI(".cfi_restore %xmm3\n")
+ "movaps 32(%esp), %xmm2\n"
+ CFI(".cfi_restore %xmm2\n")
+ "movaps 16(%esp), %xmm1\n"
+ CFI(".cfi_restore %xmm1\n")
+ "movaps (%esp), %xmm0\n"
+ CFI(".cfi_restore %xmm0\n")
+ "movl %ebp, %esp\n" // Restore ESP
+ CFI(".cfi_def_cfa_register esp\n")
+ "subl $12, %esp\n"
+ CFI(".cfi_adjust_cfa_offset 12\n")
+ "popl %ecx\n"
+ CFI(".cfi_adjust_cfa_offset -4\n")
+ CFI(".cfi_restore %ecx\n")
+ "popl %edx\n"
+ CFI(".cfi_adjust_cfa_offset -4\n")
+ CFI(".cfi_restore %edx\n")
+ "popl %eax\n"
+ CFI(".cfi_adjust_cfa_offset -4\n")
+ CFI(".cfi_restore %eax\n")
+ "popl %ebp\n"
+ CFI(".cfi_adjust_cfa_offset -4\n")
+ CFI(".cfi_restore %ebp\n")
+ "ret\n"
+ CFI(".cfi_endproc\n")
+ SIZE(X86CompilationCallback_SSE)
+ );
+# else
+ void LLVMX86CompilationCallback2(intptr_t *StackPtr, intptr_t RetAddr);
+
+ _declspec(naked) void X86CompilationCallback(void) {
+ __asm {
+ push ebp
+ mov ebp, esp
+ push eax
+ push edx
+ push ecx
+ and esp, -16
+ sub esp, 16
+ mov eax, dword ptr [ebp+4]
+ mov dword ptr [esp+4], eax
+ mov dword ptr [esp], ebp
+ call LLVMX86CompilationCallback2
+ mov esp, ebp
+ sub esp, 12
+ pop ecx
+ pop edx
+ pop eax
+ pop ebp
+ ret
+ }
+ }
+
+# endif // _MSC_VER
+
+#else // Not an i386 host
+ void X86CompilationCallback() {
+ llvm_unreachable("Cannot call X86CompilationCallback() on a non-x86 arch!");
+ }
+#endif
+}
+
+/// This is the target-specific function invoked by the
+/// function stub when we did not know the real target of a call. This function
+/// must locate the start of the stub or call site and pass it into the JIT
+/// compiler function.
+extern "C" {
+LLVM_LIBRARY_VISIBILITY void LLVMX86CompilationCallback2(intptr_t *StackPtr,
+ intptr_t RetAddr) {
+ intptr_t *RetAddrLoc = &StackPtr[1];
+ // We are reading raw stack data here. Tell MemorySanitizer that it is
+ // sufficiently initialized.
+ __msan_unpoison(RetAddrLoc, sizeof(*RetAddrLoc));
+ assert(*RetAddrLoc == RetAddr &&
+ "Could not find return address on the stack!");
+
+ // It's a stub if there is an interrupt marker after the call.
+ bool isStub = ((unsigned char*)RetAddr)[0] == 0xCE;
+
+ // The call instruction should have pushed the return value onto the stack...
+#if defined (X86_64_JIT)
+ RetAddr--; // Backtrack to the reference itself...
+#else
+ RetAddr -= 4; // Backtrack to the reference itself...
+#endif
+
+#if 0
+ DEBUG(dbgs() << "In callback! Addr=" << (void*)RetAddr
+ << " ESP=" << (void*)StackPtr
+ << ": Resolving call to function: "
+ << TheVM->getFunctionReferencedName((void*)RetAddr) << "\n");
+#endif
+
+ // Sanity check to make sure this really is a call instruction.
+#if defined (X86_64_JIT)
+ assert(((unsigned char*)RetAddr)[-2] == 0x41 &&"Not a call instr!");
+ assert(((unsigned char*)RetAddr)[-1] == 0xFF &&"Not a call instr!");
+#else
+ assert(((unsigned char*)RetAddr)[-1] == 0xE8 &&"Not a call instr!");
+#endif
+
+ intptr_t NewVal = (intptr_t)JITCompilerFunction((void*)RetAddr);
+
+ // Rewrite the call target... so that we don't end up here every time we
+ // execute the call.
+#if defined (X86_64_JIT)
+ assert(isStub &&
+ "X86-64 doesn't support rewriting non-stub lazy compilation calls:"
+ " the call instruction varies too much.");
+#else
+ *(intptr_t *)RetAddr = (intptr_t)(NewVal-RetAddr-4);
+#endif
+
+ if (isStub) {
+ // If this is a stub, rewrite the call into an unconditional branch
+ // instruction so that two return addresses are not pushed onto the stack
+ // when the requested function finally gets called. This also makes the
+ // 0xCE byte (interrupt) dead, so the marker doesn't effect anything.
+#if defined (X86_64_JIT)
+ // If the target address is within 32-bit range of the stub, use a
+ // PC-relative branch instead of loading the actual address. (This is
+ // considerably shorter than the 64-bit immediate load already there.)
+ // We assume here intptr_t is 64 bits.
+ intptr_t diff = NewVal-RetAddr+7;
+ if (diff >= -2147483648LL && diff <= 2147483647LL) {
+ *(unsigned char*)(RetAddr-0xc) = 0xE9;
+ *(intptr_t *)(RetAddr-0xb) = diff & 0xffffffff;
+ } else {
+ *(intptr_t *)(RetAddr - 0xa) = NewVal;
+ ((unsigned char*)RetAddr)[0] = (2 | (4 << 3) | (3 << 6));
+ }
+ sys::ValgrindDiscardTranslations((void*)(RetAddr-0xc), 0xd);
+#else
+ ((unsigned char*)RetAddr)[-1] = 0xE9;
+ sys::ValgrindDiscardTranslations((void*)(RetAddr-1), 5);
+#endif
+ }
+
+ // Change the return address to reexecute the call instruction...
+#if defined (X86_64_JIT)
+ *RetAddrLoc -= 0xd;
+#else
+ *RetAddrLoc -= 5;
+#endif
+}
+}
+
+TargetJITInfo::LazyResolverFn
+X86JITInfo::getLazyResolverFunction(JITCompilerFn F) {
+ TsanIgnoreWritesBegin();
+ JITCompilerFunction = F;
+ TsanIgnoreWritesEnd();
+
+#if defined (X86_32_JIT) && !defined (_MSC_VER)
+ if (Subtarget->hasSSE1())
+ return X86CompilationCallback_SSE;
+#endif
+
+ return X86CompilationCallback;
+}
+
+X86JITInfo::X86JITInfo(X86TargetMachine &tm) : TM(tm) {
+ Subtarget = &TM.getSubtarget<X86Subtarget>();
+ useGOT = 0;
+ TLSOffset = 0;
+}
+
+void *X86JITInfo::emitGlobalValueIndirectSym(const GlobalValue* GV, void *ptr,
+ JITCodeEmitter &JCE) {
+#if defined (X86_64_JIT)
+ const unsigned Alignment = 8;
+ uint8_t Buffer[8];
+ uint8_t *Cur = Buffer;
+ MachineCodeEmitter::emitWordLEInto(Cur, (unsigned)(intptr_t)ptr);
+ MachineCodeEmitter::emitWordLEInto(Cur, (unsigned)(((intptr_t)ptr) >> 32));
+#else
+ const unsigned Alignment = 4;
+ uint8_t Buffer[4];
+ uint8_t *Cur = Buffer;
+ MachineCodeEmitter::emitWordLEInto(Cur, (intptr_t)ptr);
+#endif
+ return JCE.allocIndirectGV(GV, Buffer, sizeof(Buffer), Alignment);
+}
+
+TargetJITInfo::StubLayout X86JITInfo::getStubLayout() {
+ // The 64-bit stub contains:
+ // movabs r10 <- 8-byte-target-address # 10 bytes
+ // call|jmp *r10 # 3 bytes
+ // The 32-bit stub contains a 5-byte call|jmp.
+ // If the stub is a call to the compilation callback, an extra byte is added
+ // to mark it as a stub.
+ StubLayout Result = {14, 4};
+ return Result;
+}
+
+void *X86JITInfo::emitFunctionStub(const Function* F, void *Target,
+ JITCodeEmitter &JCE) {
+ // Note, we cast to intptr_t here to silence a -pedantic warning that
+ // complains about casting a function pointer to a normal pointer.
+#if defined (X86_32_JIT) && !defined (_MSC_VER)
+ bool NotCC = (Target != (void*)(intptr_t)X86CompilationCallback &&
+ Target != (void*)(intptr_t)X86CompilationCallback_SSE);
+#else
+ bool NotCC = Target != (void*)(intptr_t)X86CompilationCallback;
+#endif
+ JCE.emitAlignment(4);
+ void *Result = (void*)JCE.getCurrentPCValue();
+ if (NotCC) {
+#if defined (X86_64_JIT)
+ JCE.emitByte(0x49); // REX prefix
+ JCE.emitByte(0xB8+2); // movabsq r10
+ JCE.emitWordLE((unsigned)(intptr_t)Target);
+ JCE.emitWordLE((unsigned)(((intptr_t)Target) >> 32));
+ JCE.emitByte(0x41); // REX prefix
+ JCE.emitByte(0xFF); // jmpq *r10
+ JCE.emitByte(2 | (4 << 3) | (3 << 6));
+#else
+ JCE.emitByte(0xE9);
+ JCE.emitWordLE((intptr_t)Target-JCE.getCurrentPCValue()-4);
+#endif
+ return Result;
+ }
+
+#if defined (X86_64_JIT)
+ JCE.emitByte(0x49); // REX prefix
+ JCE.emitByte(0xB8+2); // movabsq r10
+ JCE.emitWordLE((unsigned)(intptr_t)Target);
+ JCE.emitWordLE((unsigned)(((intptr_t)Target) >> 32));
+ JCE.emitByte(0x41); // REX prefix
+ JCE.emitByte(0xFF); // callq *r10
+ JCE.emitByte(2 | (2 << 3) | (3 << 6));
+#else
+ JCE.emitByte(0xE8); // Call with 32 bit pc-rel destination...
+
+ JCE.emitWordLE((intptr_t)Target-JCE.getCurrentPCValue()-4);
+#endif
+
+ // This used to use 0xCD, but that value is used by JITMemoryManager to
+ // initialize the buffer with garbage, which means it may follow a
+ // noreturn function call, confusing LLVMX86CompilationCallback2. PR 4929.
+ JCE.emitByte(0xCE); // Interrupt - Just a marker identifying the stub!
+ return Result;
+}
+
+/// getPICJumpTableEntry - Returns the value of the jumptable entry for the
+/// specific basic block.
+uintptr_t X86JITInfo::getPICJumpTableEntry(uintptr_t BB, uintptr_t Entry) {
+#if defined(X86_64_JIT)
+ return BB - Entry;
+#else
+ return BB - PICBase;
+#endif
+}
+
+template<typename T> static void addUnaligned(void *Pos, T Delta) {
+ T Value;
+ std::memcpy(reinterpret_cast<char*>(&Value), reinterpret_cast<char*>(Pos),
+ sizeof(T));
+ Value += Delta;
+ std::memcpy(reinterpret_cast<char*>(Pos), reinterpret_cast<char*>(&Value),
+ sizeof(T));
+}
+
+/// relocate - Before the JIT can run a block of code that has been emitted,
+/// it must rewrite the code to contain the actual addresses of any
+/// referenced global symbols.
+void X86JITInfo::relocate(void *Function, MachineRelocation *MR,
+ unsigned NumRelocs, unsigned char* GOTBase) {
+ for (unsigned i = 0; i != NumRelocs; ++i, ++MR) {
+ void *RelocPos = (char*)Function + MR->getMachineCodeOffset();
+ intptr_t ResultPtr = (intptr_t)MR->getResultPointer();
+ switch ((X86::RelocationType)MR->getRelocationType()) {
+ case X86::reloc_pcrel_word: {
+ // PC relative relocation, add the relocated value to the value already in
+ // memory, after we adjust it for where the PC is.
+ ResultPtr = ResultPtr -(intptr_t)RelocPos - 4 - MR->getConstantVal();
+ addUnaligned<unsigned>(RelocPos, ResultPtr);
+ break;
+ }
+ case X86::reloc_picrel_word: {
+ // PIC base relative relocation, add the relocated value to the value
+ // already in memory, after we adjust it for where the PIC base is.
+ ResultPtr = ResultPtr - ((intptr_t)Function + MR->getConstantVal());
+ addUnaligned<unsigned>(RelocPos, ResultPtr);
+ break;
+ }
+ case X86::reloc_absolute_word:
+ case X86::reloc_absolute_word_sext:
+ // Absolute relocation, just add the relocated value to the value already
+ // in memory.
+ addUnaligned<unsigned>(RelocPos, ResultPtr);
+ break;
+ case X86::reloc_absolute_dword:
+ addUnaligned<intptr_t>(RelocPos, ResultPtr);
+ break;
+ }
+ }
+}
+
+char* X86JITInfo::allocateThreadLocalMemory(size_t size) {
+#if defined(X86_32_JIT) && !defined(__APPLE__) && !defined(_MSC_VER)
+ TLSOffset -= size;
+ return TLSOffset;
+#else
+ llvm_unreachable("Cannot allocate thread local storage on this arch!");
+#endif
+}
diff --git a/contrib/llvm/lib/Target/X86/X86JITInfo.h b/contrib/llvm/lib/Target/X86/X86JITInfo.h
new file mode 100644
index 0000000..f916327
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86JITInfo.h
@@ -0,0 +1,81 @@
+//===-- X86JITInfo.h - X86 implementation of the JIT interface --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 implementation of the TargetJITInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86JITINFO_H
+#define X86JITINFO_H
+
+#include "llvm/CodeGen/JITCodeEmitter.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Target/TargetJITInfo.h"
+
+namespace llvm {
+ class X86TargetMachine;
+ class X86Subtarget;
+
+ class X86JITInfo : public TargetJITInfo {
+ X86TargetMachine &TM;
+ const X86Subtarget *Subtarget;
+ uintptr_t PICBase;
+ char* TLSOffset;
+ public:
+ explicit X86JITInfo(X86TargetMachine &tm);
+
+ /// replaceMachineCodeForFunction - Make it so that calling the function
+ /// whose machine code is at OLD turns into a call to NEW, perhaps by
+ /// overwriting OLD with a branch to NEW. This is used for self-modifying
+ /// code.
+ ///
+ virtual void replaceMachineCodeForFunction(void *Old, void *New);
+
+ /// emitGlobalValueIndirectSym - Use the specified JITCodeEmitter object
+ /// to emit an indirect symbol which contains the address of the specified
+ /// ptr.
+ virtual void *emitGlobalValueIndirectSym(const GlobalValue* GV, void *ptr,
+ JITCodeEmitter &JCE);
+
+ // getStubLayout - Returns the size and alignment of the largest call stub
+ // on X86.
+ virtual StubLayout getStubLayout();
+
+ /// emitFunctionStub - Use the specified JITCodeEmitter object to emit a
+ /// small native function that simply calls the function at the specified
+ /// address.
+ virtual void *emitFunctionStub(const Function* F, void *Target,
+ JITCodeEmitter &JCE);
+
+ /// getPICJumpTableEntry - Returns the value of the jumptable entry for the
+ /// specific basic block.
+ virtual uintptr_t getPICJumpTableEntry(uintptr_t BB, uintptr_t JTBase);
+
+ /// getLazyResolverFunction - Expose the lazy resolver to the JIT.
+ virtual LazyResolverFn getLazyResolverFunction(JITCompilerFn);
+
+ /// relocate - Before the JIT can run a block of code that has been emitted,
+ /// it must rewrite the code to contain the actual addresses of any
+ /// referenced global symbols.
+ virtual void relocate(void *Function, MachineRelocation *MR,
+ unsigned NumRelocs, unsigned char* GOTBase);
+
+ /// allocateThreadLocalMemory - Each target has its own way of
+ /// handling thread local variables. This method returns a value only
+ /// meaningful to the target.
+ virtual char* allocateThreadLocalMemory(size_t size);
+
+ /// setPICBase / getPICBase - Getter / setter of PICBase, used to compute
+ /// PIC jumptable entry.
+ void setPICBase(uintptr_t Base) { PICBase = Base; }
+ uintptr_t getPICBase() const { return PICBase; }
+ };
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp b/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp
new file mode 100644
index 0000000..a8a9fd8
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp
@@ -0,0 +1,794 @@
+//===-- X86MCInstLower.cpp - Convert X86 MachineInstr to an MCInst --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains code to lower X86 MachineInstrs to their corresponding
+// MCInst records.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86AsmPrinter.h"
+#include "InstPrinter/X86ATTInstPrinter.h"
+#include "X86COFFMachineModuleInfo.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/IR/Type.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstBuilder.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Target/Mangler.h"
+using namespace llvm;
+
+namespace {
+
+/// X86MCInstLower - This class is used to lower an MachineInstr into an MCInst.
+class X86MCInstLower {
+ MCContext &Ctx;
+ Mangler *Mang;
+ const MachineFunction &MF;
+ const TargetMachine &TM;
+ const MCAsmInfo &MAI;
+ X86AsmPrinter &AsmPrinter;
+public:
+ X86MCInstLower(Mangler *mang, const MachineFunction &MF,
+ X86AsmPrinter &asmprinter);
+
+ void Lower(const MachineInstr *MI, MCInst &OutMI) const;
+
+ MCSymbol *GetSymbolFromOperand(const MachineOperand &MO) const;
+ MCOperand LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const;
+
+private:
+ MachineModuleInfoMachO &getMachOMMI() const;
+};
+
+} // end anonymous namespace
+
+X86MCInstLower::X86MCInstLower(Mangler *mang, const MachineFunction &mf,
+ X86AsmPrinter &asmprinter)
+: Ctx(mf.getContext()), Mang(mang), MF(mf), TM(mf.getTarget()),
+ MAI(*TM.getMCAsmInfo()), AsmPrinter(asmprinter) {}
+
+MachineModuleInfoMachO &X86MCInstLower::getMachOMMI() const {
+ return MF.getMMI().getObjFileInfo<MachineModuleInfoMachO>();
+}
+
+
+/// GetSymbolFromOperand - Lower an MO_GlobalAddress or MO_ExternalSymbol
+/// operand to an MCSymbol.
+MCSymbol *X86MCInstLower::
+GetSymbolFromOperand(const MachineOperand &MO) const {
+ assert((MO.isGlobal() || MO.isSymbol() || MO.isMBB()) && "Isn't a symbol reference");
+
+ SmallString<128> Name;
+
+ if (MO.isGlobal()) {
+ const GlobalValue *GV = MO.getGlobal();
+ bool isImplicitlyPrivate = false;
+ if (MO.getTargetFlags() == X86II::MO_DARWIN_STUB ||
+ MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY ||
+ MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY_PIC_BASE ||
+ MO.getTargetFlags() == X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE)
+ isImplicitlyPrivate = true;
+
+ Mang->getNameWithPrefix(Name, GV, isImplicitlyPrivate);
+ } else if (MO.isSymbol()) {
+ Name += MAI.getGlobalPrefix();
+ Name += MO.getSymbolName();
+ } else if (MO.isMBB()) {
+ Name += MO.getMBB()->getSymbol()->getName();
+ }
+
+ // If the target flags on the operand changes the name of the symbol, do that
+ // before we return the symbol.
+ switch (MO.getTargetFlags()) {
+ default: break;
+ case X86II::MO_DLLIMPORT: {
+ // Handle dllimport linkage.
+ const char *Prefix = "__imp_";
+ Name.insert(Name.begin(), Prefix, Prefix+strlen(Prefix));
+ break;
+ }
+ case X86II::MO_DARWIN_NONLAZY:
+ case X86II::MO_DARWIN_NONLAZY_PIC_BASE: {
+ Name += "$non_lazy_ptr";
+ MCSymbol *Sym = Ctx.GetOrCreateSymbol(Name.str());
+
+ MachineModuleInfoImpl::StubValueTy &StubSym =
+ getMachOMMI().getGVStubEntry(Sym);
+ if (StubSym.getPointer() == 0) {
+ assert(MO.isGlobal() && "Extern symbol not handled yet");
+ StubSym =
+ MachineModuleInfoImpl::
+ StubValueTy(Mang->getSymbol(MO.getGlobal()),
+ !MO.getGlobal()->hasInternalLinkage());
+ }
+ return Sym;
+ }
+ case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE: {
+ Name += "$non_lazy_ptr";
+ MCSymbol *Sym = Ctx.GetOrCreateSymbol(Name.str());
+ MachineModuleInfoImpl::StubValueTy &StubSym =
+ getMachOMMI().getHiddenGVStubEntry(Sym);
+ if (StubSym.getPointer() == 0) {
+ assert(MO.isGlobal() && "Extern symbol not handled yet");
+ StubSym =
+ MachineModuleInfoImpl::
+ StubValueTy(Mang->getSymbol(MO.getGlobal()),
+ !MO.getGlobal()->hasInternalLinkage());
+ }
+ return Sym;
+ }
+ case X86II::MO_DARWIN_STUB: {
+ Name += "$stub";
+ MCSymbol *Sym = Ctx.GetOrCreateSymbol(Name.str());
+ MachineModuleInfoImpl::StubValueTy &StubSym =
+ getMachOMMI().getFnStubEntry(Sym);
+ if (StubSym.getPointer())
+ return Sym;
+
+ if (MO.isGlobal()) {
+ StubSym =
+ MachineModuleInfoImpl::
+ StubValueTy(Mang->getSymbol(MO.getGlobal()),
+ !MO.getGlobal()->hasInternalLinkage());
+ } else {
+ Name.erase(Name.end()-5, Name.end());
+ StubSym =
+ MachineModuleInfoImpl::
+ StubValueTy(Ctx.GetOrCreateSymbol(Name.str()), false);
+ }
+ return Sym;
+ }
+ }
+
+ return Ctx.GetOrCreateSymbol(Name.str());
+}
+
+MCOperand X86MCInstLower::LowerSymbolOperand(const MachineOperand &MO,
+ MCSymbol *Sym) const {
+ // FIXME: We would like an efficient form for this, so we don't have to do a
+ // lot of extra uniquing.
+ const MCExpr *Expr = 0;
+ MCSymbolRefExpr::VariantKind RefKind = MCSymbolRefExpr::VK_None;
+
+ switch (MO.getTargetFlags()) {
+ default: llvm_unreachable("Unknown target flag on GV operand");
+ case X86II::MO_NO_FLAG: // No flag.
+ // These affect the name of the symbol, not any suffix.
+ case X86II::MO_DARWIN_NONLAZY:
+ case X86II::MO_DLLIMPORT:
+ case X86II::MO_DARWIN_STUB:
+ break;
+
+ case X86II::MO_TLVP: RefKind = MCSymbolRefExpr::VK_TLVP; break;
+ case X86II::MO_TLVP_PIC_BASE:
+ Expr = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_TLVP, Ctx);
+ // Subtract the pic base.
+ Expr = MCBinaryExpr::CreateSub(Expr,
+ MCSymbolRefExpr::Create(MF.getPICBaseSymbol(),
+ Ctx),
+ Ctx);
+ break;
+ case X86II::MO_SECREL: RefKind = MCSymbolRefExpr::VK_SECREL; break;
+ case X86II::MO_TLSGD: RefKind = MCSymbolRefExpr::VK_TLSGD; break;
+ case X86II::MO_TLSLD: RefKind = MCSymbolRefExpr::VK_TLSLD; break;
+ case X86II::MO_TLSLDM: RefKind = MCSymbolRefExpr::VK_TLSLDM; break;
+ case X86II::MO_GOTTPOFF: RefKind = MCSymbolRefExpr::VK_GOTTPOFF; break;
+ case X86II::MO_INDNTPOFF: RefKind = MCSymbolRefExpr::VK_INDNTPOFF; break;
+ case X86II::MO_TPOFF: RefKind = MCSymbolRefExpr::VK_TPOFF; break;
+ case X86II::MO_DTPOFF: RefKind = MCSymbolRefExpr::VK_DTPOFF; break;
+ case X86II::MO_NTPOFF: RefKind = MCSymbolRefExpr::VK_NTPOFF; break;
+ case X86II::MO_GOTNTPOFF: RefKind = MCSymbolRefExpr::VK_GOTNTPOFF; break;
+ case X86II::MO_GOTPCREL: RefKind = MCSymbolRefExpr::VK_GOTPCREL; break;
+ case X86II::MO_GOT: RefKind = MCSymbolRefExpr::VK_GOT; break;
+ case X86II::MO_GOTOFF: RefKind = MCSymbolRefExpr::VK_GOTOFF; break;
+ case X86II::MO_PLT: RefKind = MCSymbolRefExpr::VK_PLT; break;
+ case X86II::MO_PIC_BASE_OFFSET:
+ case X86II::MO_DARWIN_NONLAZY_PIC_BASE:
+ case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE:
+ Expr = MCSymbolRefExpr::Create(Sym, Ctx);
+ // Subtract the pic base.
+ Expr = MCBinaryExpr::CreateSub(Expr,
+ MCSymbolRefExpr::Create(MF.getPICBaseSymbol(), Ctx),
+ Ctx);
+ if (MO.isJTI() && MAI.hasSetDirective()) {
+ // If .set directive is supported, use it to reduce the number of
+ // relocations the assembler will generate for differences between
+ // local labels. This is only safe when the symbols are in the same
+ // section so we are restricting it to jumptable references.
+ MCSymbol *Label = Ctx.CreateTempSymbol();
+ AsmPrinter.OutStreamer.EmitAssignment(Label, Expr);
+ Expr = MCSymbolRefExpr::Create(Label, Ctx);
+ }
+ break;
+ }
+
+ if (Expr == 0)
+ Expr = MCSymbolRefExpr::Create(Sym, RefKind, Ctx);
+
+ if (!MO.isJTI() && !MO.isMBB() && MO.getOffset())
+ Expr = MCBinaryExpr::CreateAdd(Expr,
+ MCConstantExpr::Create(MO.getOffset(), Ctx),
+ Ctx);
+ return MCOperand::CreateExpr(Expr);
+}
+
+
+
+static void lower_subreg32(MCInst *MI, unsigned OpNo) {
+ // Convert registers in the addr mode according to subreg32.
+ unsigned Reg = MI->getOperand(OpNo).getReg();
+ if (Reg != 0)
+ MI->getOperand(OpNo).setReg(getX86SubSuperRegister(Reg, MVT::i32));
+}
+
+static void lower_lea64_32mem(MCInst *MI, unsigned OpNo) {
+ // Convert registers in the addr mode according to subreg64.
+ for (unsigned i = 0; i != 4; ++i) {
+ if (!MI->getOperand(OpNo+i).isReg()) continue;
+
+ unsigned Reg = MI->getOperand(OpNo+i).getReg();
+ // LEAs can use RIP-relative addressing, and RIP has no sub/super register.
+ if (Reg == 0 || Reg == X86::RIP) continue;
+
+ MI->getOperand(OpNo+i).setReg(getX86SubSuperRegister(Reg, MVT::i64));
+ }
+}
+
+/// LowerSubReg32_Op0 - Things like MOVZX16rr8 -> MOVZX32rr8.
+static void LowerSubReg32_Op0(MCInst &OutMI, unsigned NewOpc) {
+ OutMI.setOpcode(NewOpc);
+ lower_subreg32(&OutMI, 0);
+}
+/// LowerUnaryToTwoAddr - R = setb -> R = sbb R, R
+static void LowerUnaryToTwoAddr(MCInst &OutMI, unsigned NewOpc) {
+ OutMI.setOpcode(NewOpc);
+ OutMI.addOperand(OutMI.getOperand(0));
+ OutMI.addOperand(OutMI.getOperand(0));
+}
+
+/// \brief Simplify FOO $imm, %{al,ax,eax,rax} to FOO $imm, for instruction with
+/// a short fixed-register form.
+static void SimplifyShortImmForm(MCInst &Inst, unsigned Opcode) {
+ unsigned ImmOp = Inst.getNumOperands() - 1;
+ assert(Inst.getOperand(0).isReg() &&
+ (Inst.getOperand(ImmOp).isImm() || Inst.getOperand(ImmOp).isExpr()) &&
+ ((Inst.getNumOperands() == 3 && Inst.getOperand(1).isReg() &&
+ Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg()) ||
+ Inst.getNumOperands() == 2) && "Unexpected instruction!");
+
+ // Check whether the destination register can be fixed.
+ unsigned Reg = Inst.getOperand(0).getReg();
+ if (Reg != X86::AL && Reg != X86::AX && Reg != X86::EAX && Reg != X86::RAX)
+ return;
+
+ // If so, rewrite the instruction.
+ MCOperand Saved = Inst.getOperand(ImmOp);
+ Inst = MCInst();
+ Inst.setOpcode(Opcode);
+ Inst.addOperand(Saved);
+}
+
+/// \brief Simplify things like MOV32rm to MOV32o32a.
+static void SimplifyShortMoveForm(X86AsmPrinter &Printer, MCInst &Inst,
+ unsigned Opcode) {
+ // Don't make these simplifications in 64-bit mode; other assemblers don't
+ // perform them because they make the code larger.
+ if (Printer.getSubtarget().is64Bit())
+ return;
+
+ bool IsStore = Inst.getOperand(0).isReg() && Inst.getOperand(1).isReg();
+ unsigned AddrBase = IsStore;
+ unsigned RegOp = IsStore ? 0 : 5;
+ unsigned AddrOp = AddrBase + 3;
+ assert(Inst.getNumOperands() == 6 && Inst.getOperand(RegOp).isReg() &&
+ Inst.getOperand(AddrBase + 0).isReg() && // base
+ Inst.getOperand(AddrBase + 1).isImm() && // scale
+ Inst.getOperand(AddrBase + 2).isReg() && // index register
+ (Inst.getOperand(AddrOp).isExpr() || // address
+ Inst.getOperand(AddrOp).isImm())&&
+ Inst.getOperand(AddrBase + 4).isReg() && // segment
+ "Unexpected instruction!");
+
+ // Check whether the destination register can be fixed.
+ unsigned Reg = Inst.getOperand(RegOp).getReg();
+ if (Reg != X86::AL && Reg != X86::AX && Reg != X86::EAX && Reg != X86::RAX)
+ return;
+
+ // Check whether this is an absolute address.
+ // FIXME: We know TLVP symbol refs aren't, but there should be a better way
+ // to do this here.
+ bool Absolute = true;
+ if (Inst.getOperand(AddrOp).isExpr()) {
+ const MCExpr *MCE = Inst.getOperand(AddrOp).getExpr();
+ if (const MCSymbolRefExpr *SRE = dyn_cast<MCSymbolRefExpr>(MCE))
+ if (SRE->getKind() == MCSymbolRefExpr::VK_TLVP)
+ Absolute = false;
+ }
+
+ if (Absolute &&
+ (Inst.getOperand(AddrBase + 0).getReg() != 0 ||
+ Inst.getOperand(AddrBase + 2).getReg() != 0 ||
+ Inst.getOperand(AddrBase + 4).getReg() != 0 ||
+ Inst.getOperand(AddrBase + 1).getImm() != 1))
+ return;
+
+ // If so, rewrite the instruction.
+ MCOperand Saved = Inst.getOperand(AddrOp);
+ Inst = MCInst();
+ Inst.setOpcode(Opcode);
+ Inst.addOperand(Saved);
+}
+
+void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
+ OutMI.setOpcode(MI->getOpcode());
+
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+
+ MCOperand MCOp;
+ switch (MO.getType()) {
+ default:
+ MI->dump();
+ llvm_unreachable("unknown operand type");
+ case MachineOperand::MO_Register:
+ // Ignore all implicit register operands.
+ if (MO.isImplicit()) continue;
+ MCOp = MCOperand::CreateReg(MO.getReg());
+ break;
+ case MachineOperand::MO_Immediate:
+ MCOp = MCOperand::CreateImm(MO.getImm());
+ break;
+ case MachineOperand::MO_MachineBasicBlock:
+ case MachineOperand::MO_GlobalAddress:
+ case MachineOperand::MO_ExternalSymbol:
+ MCOp = LowerSymbolOperand(MO, GetSymbolFromOperand(MO));
+ break;
+ case MachineOperand::MO_JumpTableIndex:
+ MCOp = LowerSymbolOperand(MO, AsmPrinter.GetJTISymbol(MO.getIndex()));
+ break;
+ case MachineOperand::MO_ConstantPoolIndex:
+ MCOp = LowerSymbolOperand(MO, AsmPrinter.GetCPISymbol(MO.getIndex()));
+ break;
+ case MachineOperand::MO_BlockAddress:
+ MCOp = LowerSymbolOperand(MO,
+ AsmPrinter.GetBlockAddressSymbol(MO.getBlockAddress()));
+ break;
+ case MachineOperand::MO_RegisterMask:
+ // Ignore call clobbers.
+ continue;
+ }
+
+ OutMI.addOperand(MCOp);
+ }
+
+ // Handle a few special cases to eliminate operand modifiers.
+ReSimplify:
+ switch (OutMI.getOpcode()) {
+ case X86::LEA64_32r: // Handle 'subreg rewriting' for the lea64_32mem operand.
+ lower_lea64_32mem(&OutMI, 1);
+ // FALL THROUGH.
+ case X86::LEA64r:
+ case X86::LEA16r:
+ case X86::LEA32r:
+ // LEA should have a segment register, but it must be empty.
+ assert(OutMI.getNumOperands() == 1+X86::AddrNumOperands &&
+ "Unexpected # of LEA operands");
+ assert(OutMI.getOperand(1+X86::AddrSegmentReg).getReg() == 0 &&
+ "LEA has segment specified!");
+ break;
+ case X86::MOVZX64rr32: LowerSubReg32_Op0(OutMI, X86::MOV32rr); break;
+ case X86::MOVZX64rm32: LowerSubReg32_Op0(OutMI, X86::MOV32rm); break;
+ case X86::MOV64ri64i32: LowerSubReg32_Op0(OutMI, X86::MOV32ri); break;
+ case X86::MOVZX64rr8: LowerSubReg32_Op0(OutMI, X86::MOVZX32rr8); break;
+ case X86::MOVZX64rm8: LowerSubReg32_Op0(OutMI, X86::MOVZX32rm8); break;
+ case X86::MOVZX64rr16: LowerSubReg32_Op0(OutMI, X86::MOVZX32rr16); break;
+ case X86::MOVZX64rm16: LowerSubReg32_Op0(OutMI, X86::MOVZX32rm16); break;
+ case X86::MOV8r0: LowerUnaryToTwoAddr(OutMI, X86::XOR8rr); break;
+ case X86::MOV32r0: LowerUnaryToTwoAddr(OutMI, X86::XOR32rr); break;
+
+ case X86::MOV16r0:
+ LowerSubReg32_Op0(OutMI, X86::MOV32r0); // MOV16r0 -> MOV32r0
+ LowerUnaryToTwoAddr(OutMI, X86::XOR32rr); // MOV32r0 -> XOR32rr
+ break;
+ case X86::MOV64r0:
+ LowerSubReg32_Op0(OutMI, X86::MOV32r0); // MOV64r0 -> MOV32r0
+ LowerUnaryToTwoAddr(OutMI, X86::XOR32rr); // MOV32r0 -> XOR32rr
+ break;
+
+ // Commute operands to get a smaller encoding by using VEX.R instead of VEX.B
+ // if one of the registers is extended, but other isn't.
+ case X86::VMOVAPDrr:
+ case X86::VMOVAPDYrr:
+ case X86::VMOVAPSrr:
+ case X86::VMOVAPSYrr:
+ case X86::VMOVDQArr:
+ case X86::VMOVDQAYrr:
+ case X86::VMOVDQUrr:
+ case X86::VMOVDQUYrr:
+ case X86::VMOVUPDrr:
+ case X86::VMOVUPDYrr:
+ case X86::VMOVUPSrr:
+ case X86::VMOVUPSYrr: {
+ if (!X86II::isX86_64ExtendedReg(OutMI.getOperand(0).getReg()) &&
+ X86II::isX86_64ExtendedReg(OutMI.getOperand(1).getReg())) {
+ unsigned NewOpc;
+ switch (OutMI.getOpcode()) {
+ default: llvm_unreachable("Invalid opcode");
+ case X86::VMOVAPDrr: NewOpc = X86::VMOVAPDrr_REV; break;
+ case X86::VMOVAPDYrr: NewOpc = X86::VMOVAPDYrr_REV; break;
+ case X86::VMOVAPSrr: NewOpc = X86::VMOVAPSrr_REV; break;
+ case X86::VMOVAPSYrr: NewOpc = X86::VMOVAPSYrr_REV; break;
+ case X86::VMOVDQArr: NewOpc = X86::VMOVDQArr_REV; break;
+ case X86::VMOVDQAYrr: NewOpc = X86::VMOVDQAYrr_REV; break;
+ case X86::VMOVDQUrr: NewOpc = X86::VMOVDQUrr_REV; break;
+ case X86::VMOVDQUYrr: NewOpc = X86::VMOVDQUYrr_REV; break;
+ case X86::VMOVUPDrr: NewOpc = X86::VMOVUPDrr_REV; break;
+ case X86::VMOVUPDYrr: NewOpc = X86::VMOVUPDYrr_REV; break;
+ case X86::VMOVUPSrr: NewOpc = X86::VMOVUPSrr_REV; break;
+ case X86::VMOVUPSYrr: NewOpc = X86::VMOVUPSYrr_REV; break;
+ }
+ OutMI.setOpcode(NewOpc);
+ }
+ break;
+ }
+ case X86::VMOVSDrr:
+ case X86::VMOVSSrr: {
+ if (!X86II::isX86_64ExtendedReg(OutMI.getOperand(0).getReg()) &&
+ X86II::isX86_64ExtendedReg(OutMI.getOperand(2).getReg())) {
+ unsigned NewOpc;
+ switch (OutMI.getOpcode()) {
+ default: llvm_unreachable("Invalid opcode");
+ case X86::VMOVSDrr: NewOpc = X86::VMOVSDrr_REV; break;
+ case X86::VMOVSSrr: NewOpc = X86::VMOVSSrr_REV; break;
+ }
+ OutMI.setOpcode(NewOpc);
+ }
+ break;
+ }
+
+ // TAILJMPr64, CALL64r, CALL64pcrel32 - These instructions have register
+ // inputs modeled as normal uses instead of implicit uses. As such, truncate
+ // off all but the first operand (the callee). FIXME: Change isel.
+ case X86::TAILJMPr64:
+ case X86::CALL64r:
+ case X86::CALL64pcrel32: {
+ unsigned Opcode = OutMI.getOpcode();
+ MCOperand Saved = OutMI.getOperand(0);
+ OutMI = MCInst();
+ OutMI.setOpcode(Opcode);
+ OutMI.addOperand(Saved);
+ break;
+ }
+
+ case X86::EH_RETURN:
+ case X86::EH_RETURN64: {
+ OutMI = MCInst();
+ OutMI.setOpcode(X86::RET);
+ break;
+ }
+
+ // TAILJMPd, TAILJMPd64 - Lower to the correct jump instructions.
+ case X86::TAILJMPr:
+ case X86::TAILJMPd:
+ case X86::TAILJMPd64: {
+ unsigned Opcode;
+ switch (OutMI.getOpcode()) {
+ default: llvm_unreachable("Invalid opcode");
+ case X86::TAILJMPr: Opcode = X86::JMP32r; break;
+ case X86::TAILJMPd:
+ case X86::TAILJMPd64: Opcode = X86::JMP_1; break;
+ }
+
+ MCOperand Saved = OutMI.getOperand(0);
+ OutMI = MCInst();
+ OutMI.setOpcode(Opcode);
+ OutMI.addOperand(Saved);
+ break;
+ }
+
+ // These are pseudo-ops for OR to help with the OR->ADD transformation. We do
+ // this with an ugly goto in case the resultant OR uses EAX and needs the
+ // short form.
+ case X86::ADD16rr_DB: OutMI.setOpcode(X86::OR16rr); goto ReSimplify;
+ case X86::ADD32rr_DB: OutMI.setOpcode(X86::OR32rr); goto ReSimplify;
+ case X86::ADD64rr_DB: OutMI.setOpcode(X86::OR64rr); goto ReSimplify;
+ case X86::ADD16ri_DB: OutMI.setOpcode(X86::OR16ri); goto ReSimplify;
+ case X86::ADD32ri_DB: OutMI.setOpcode(X86::OR32ri); goto ReSimplify;
+ case X86::ADD64ri32_DB: OutMI.setOpcode(X86::OR64ri32); goto ReSimplify;
+ case X86::ADD16ri8_DB: OutMI.setOpcode(X86::OR16ri8); goto ReSimplify;
+ case X86::ADD32ri8_DB: OutMI.setOpcode(X86::OR32ri8); goto ReSimplify;
+ case X86::ADD64ri8_DB: OutMI.setOpcode(X86::OR64ri8); goto ReSimplify;
+
+ // The assembler backend wants to see branches in their small form and relax
+ // them to their large form. The JIT can only handle the large form because
+ // it does not do relaxation. For now, translate the large form to the
+ // small one here.
+ case X86::JMP_4: OutMI.setOpcode(X86::JMP_1); break;
+ case X86::JO_4: OutMI.setOpcode(X86::JO_1); break;
+ case X86::JNO_4: OutMI.setOpcode(X86::JNO_1); break;
+ case X86::JB_4: OutMI.setOpcode(X86::JB_1); break;
+ case X86::JAE_4: OutMI.setOpcode(X86::JAE_1); break;
+ case X86::JE_4: OutMI.setOpcode(X86::JE_1); break;
+ case X86::JNE_4: OutMI.setOpcode(X86::JNE_1); break;
+ case X86::JBE_4: OutMI.setOpcode(X86::JBE_1); break;
+ case X86::JA_4: OutMI.setOpcode(X86::JA_1); break;
+ case X86::JS_4: OutMI.setOpcode(X86::JS_1); break;
+ case X86::JNS_4: OutMI.setOpcode(X86::JNS_1); break;
+ case X86::JP_4: OutMI.setOpcode(X86::JP_1); break;
+ case X86::JNP_4: OutMI.setOpcode(X86::JNP_1); break;
+ case X86::JL_4: OutMI.setOpcode(X86::JL_1); break;
+ case X86::JGE_4: OutMI.setOpcode(X86::JGE_1); break;
+ case X86::JLE_4: OutMI.setOpcode(X86::JLE_1); break;
+ case X86::JG_4: OutMI.setOpcode(X86::JG_1); break;
+
+ // Atomic load and store require a separate pseudo-inst because Acquire
+ // implies mayStore and Release implies mayLoad; fix these to regular MOV
+ // instructions here
+ case X86::ACQUIRE_MOV8rm: OutMI.setOpcode(X86::MOV8rm); goto ReSimplify;
+ case X86::ACQUIRE_MOV16rm: OutMI.setOpcode(X86::MOV16rm); goto ReSimplify;
+ case X86::ACQUIRE_MOV32rm: OutMI.setOpcode(X86::MOV32rm); goto ReSimplify;
+ case X86::ACQUIRE_MOV64rm: OutMI.setOpcode(X86::MOV64rm); goto ReSimplify;
+ case X86::RELEASE_MOV8mr: OutMI.setOpcode(X86::MOV8mr); goto ReSimplify;
+ case X86::RELEASE_MOV16mr: OutMI.setOpcode(X86::MOV16mr); goto ReSimplify;
+ case X86::RELEASE_MOV32mr: OutMI.setOpcode(X86::MOV32mr); goto ReSimplify;
+ case X86::RELEASE_MOV64mr: OutMI.setOpcode(X86::MOV64mr); goto ReSimplify;
+
+ // We don't currently select the correct instruction form for instructions
+ // which have a short %eax, etc. form. Handle this by custom lowering, for
+ // now.
+ //
+ // Note, we are currently not handling the following instructions:
+ // MOV64ao8, MOV64o8a
+ // XCHG16ar, XCHG32ar, XCHG64ar
+ case X86::MOV8mr_NOREX:
+ case X86::MOV8mr: SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV8ao8); break;
+ case X86::MOV8rm_NOREX:
+ case X86::MOV8rm: SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV8o8a); break;
+ case X86::MOV16mr: SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV16ao16); break;
+ case X86::MOV16rm: SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV16o16a); break;
+ case X86::MOV32mr: SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV32ao32); break;
+ case X86::MOV32rm: SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV32o32a); break;
+
+ case X86::ADC8ri: SimplifyShortImmForm(OutMI, X86::ADC8i8); break;
+ case X86::ADC16ri: SimplifyShortImmForm(OutMI, X86::ADC16i16); break;
+ case X86::ADC32ri: SimplifyShortImmForm(OutMI, X86::ADC32i32); break;
+ case X86::ADC64ri32: SimplifyShortImmForm(OutMI, X86::ADC64i32); break;
+ case X86::ADD8ri: SimplifyShortImmForm(OutMI, X86::ADD8i8); break;
+ case X86::ADD16ri: SimplifyShortImmForm(OutMI, X86::ADD16i16); break;
+ case X86::ADD32ri: SimplifyShortImmForm(OutMI, X86::ADD32i32); break;
+ case X86::ADD64ri32: SimplifyShortImmForm(OutMI, X86::ADD64i32); break;
+ case X86::AND8ri: SimplifyShortImmForm(OutMI, X86::AND8i8); break;
+ case X86::AND16ri: SimplifyShortImmForm(OutMI, X86::AND16i16); break;
+ case X86::AND32ri: SimplifyShortImmForm(OutMI, X86::AND32i32); break;
+ case X86::AND64ri32: SimplifyShortImmForm(OutMI, X86::AND64i32); break;
+ case X86::CMP8ri: SimplifyShortImmForm(OutMI, X86::CMP8i8); break;
+ case X86::CMP16ri: SimplifyShortImmForm(OutMI, X86::CMP16i16); break;
+ case X86::CMP32ri: SimplifyShortImmForm(OutMI, X86::CMP32i32); break;
+ case X86::CMP64ri32: SimplifyShortImmForm(OutMI, X86::CMP64i32); break;
+ case X86::OR8ri: SimplifyShortImmForm(OutMI, X86::OR8i8); break;
+ case X86::OR16ri: SimplifyShortImmForm(OutMI, X86::OR16i16); break;
+ case X86::OR32ri: SimplifyShortImmForm(OutMI, X86::OR32i32); break;
+ case X86::OR64ri32: SimplifyShortImmForm(OutMI, X86::OR64i32); break;
+ case X86::SBB8ri: SimplifyShortImmForm(OutMI, X86::SBB8i8); break;
+ case X86::SBB16ri: SimplifyShortImmForm(OutMI, X86::SBB16i16); break;
+ case X86::SBB32ri: SimplifyShortImmForm(OutMI, X86::SBB32i32); break;
+ case X86::SBB64ri32: SimplifyShortImmForm(OutMI, X86::SBB64i32); break;
+ case X86::SUB8ri: SimplifyShortImmForm(OutMI, X86::SUB8i8); break;
+ case X86::SUB16ri: SimplifyShortImmForm(OutMI, X86::SUB16i16); break;
+ case X86::SUB32ri: SimplifyShortImmForm(OutMI, X86::SUB32i32); break;
+ case X86::SUB64ri32: SimplifyShortImmForm(OutMI, X86::SUB64i32); break;
+ case X86::TEST8ri: SimplifyShortImmForm(OutMI, X86::TEST8i8); break;
+ case X86::TEST16ri: SimplifyShortImmForm(OutMI, X86::TEST16i16); break;
+ case X86::TEST32ri: SimplifyShortImmForm(OutMI, X86::TEST32i32); break;
+ case X86::TEST64ri32: SimplifyShortImmForm(OutMI, X86::TEST64i32); break;
+ case X86::XOR8ri: SimplifyShortImmForm(OutMI, X86::XOR8i8); break;
+ case X86::XOR16ri: SimplifyShortImmForm(OutMI, X86::XOR16i16); break;
+ case X86::XOR32ri: SimplifyShortImmForm(OutMI, X86::XOR32i32); break;
+ case X86::XOR64ri32: SimplifyShortImmForm(OutMI, X86::XOR64i32); break;
+
+ case X86::MORESTACK_RET:
+ OutMI.setOpcode(X86::RET);
+ break;
+
+ case X86::MORESTACK_RET_RESTORE_R10:
+ OutMI.setOpcode(X86::MOV64rr);
+ OutMI.addOperand(MCOperand::CreateReg(X86::R10));
+ OutMI.addOperand(MCOperand::CreateReg(X86::RAX));
+
+ AsmPrinter.OutStreamer.EmitInstruction(MCInstBuilder(X86::RET));
+ break;
+ }
+}
+
+static void LowerTlsAddr(MCStreamer &OutStreamer,
+ X86MCInstLower &MCInstLowering,
+ const MachineInstr &MI) {
+
+ bool is64Bits = MI.getOpcode() == X86::TLS_addr64 ||
+ MI.getOpcode() == X86::TLS_base_addr64;
+
+ bool needsPadding = MI.getOpcode() == X86::TLS_addr64;
+
+ MCContext &context = OutStreamer.getContext();
+
+ if (needsPadding)
+ OutStreamer.EmitInstruction(MCInstBuilder(X86::DATA16_PREFIX));
+
+ MCSymbolRefExpr::VariantKind SRVK;
+ switch (MI.getOpcode()) {
+ case X86::TLS_addr32:
+ case X86::TLS_addr64:
+ SRVK = MCSymbolRefExpr::VK_TLSGD;
+ break;
+ case X86::TLS_base_addr32:
+ SRVK = MCSymbolRefExpr::VK_TLSLDM;
+ break;
+ case X86::TLS_base_addr64:
+ SRVK = MCSymbolRefExpr::VK_TLSLD;
+ break;
+ default:
+ llvm_unreachable("unexpected opcode");
+ }
+
+ MCSymbol *sym = MCInstLowering.GetSymbolFromOperand(MI.getOperand(3));
+ const MCSymbolRefExpr *symRef = MCSymbolRefExpr::Create(sym, SRVK, context);
+
+ MCInst LEA;
+ if (is64Bits) {
+ LEA.setOpcode(X86::LEA64r);
+ LEA.addOperand(MCOperand::CreateReg(X86::RDI)); // dest
+ LEA.addOperand(MCOperand::CreateReg(X86::RIP)); // base
+ LEA.addOperand(MCOperand::CreateImm(1)); // scale
+ LEA.addOperand(MCOperand::CreateReg(0)); // index
+ LEA.addOperand(MCOperand::CreateExpr(symRef)); // disp
+ LEA.addOperand(MCOperand::CreateReg(0)); // seg
+ } else if (SRVK == MCSymbolRefExpr::VK_TLSLDM) {
+ LEA.setOpcode(X86::LEA32r);
+ LEA.addOperand(MCOperand::CreateReg(X86::EAX)); // dest
+ LEA.addOperand(MCOperand::CreateReg(X86::EBX)); // base
+ LEA.addOperand(MCOperand::CreateImm(1)); // scale
+ LEA.addOperand(MCOperand::CreateReg(0)); // index
+ LEA.addOperand(MCOperand::CreateExpr(symRef)); // disp
+ LEA.addOperand(MCOperand::CreateReg(0)); // seg
+ } else {
+ LEA.setOpcode(X86::LEA32r);
+ LEA.addOperand(MCOperand::CreateReg(X86::EAX)); // dest
+ LEA.addOperand(MCOperand::CreateReg(0)); // base
+ LEA.addOperand(MCOperand::CreateImm(1)); // scale
+ LEA.addOperand(MCOperand::CreateReg(X86::EBX)); // index
+ LEA.addOperand(MCOperand::CreateExpr(symRef)); // disp
+ LEA.addOperand(MCOperand::CreateReg(0)); // seg
+ }
+ OutStreamer.EmitInstruction(LEA);
+
+ if (needsPadding) {
+ OutStreamer.EmitInstruction(MCInstBuilder(X86::DATA16_PREFIX));
+ OutStreamer.EmitInstruction(MCInstBuilder(X86::DATA16_PREFIX));
+ OutStreamer.EmitInstruction(MCInstBuilder(X86::REX64_PREFIX));
+ }
+
+ StringRef name = is64Bits ? "__tls_get_addr" : "___tls_get_addr";
+ MCSymbol *tlsGetAddr = context.GetOrCreateSymbol(name);
+ const MCSymbolRefExpr *tlsRef =
+ MCSymbolRefExpr::Create(tlsGetAddr,
+ MCSymbolRefExpr::VK_PLT,
+ context);
+
+ OutStreamer.EmitInstruction(MCInstBuilder(is64Bits ? X86::CALL64pcrel32
+ : X86::CALLpcrel32)
+ .addExpr(tlsRef));
+}
+
+void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
+ X86MCInstLower MCInstLowering(Mang, *MF, *this);
+ switch (MI->getOpcode()) {
+ case TargetOpcode::DBG_VALUE:
+ if (isVerbose() && OutStreamer.hasRawTextSupport()) {
+ std::string TmpStr;
+ raw_string_ostream OS(TmpStr);
+ PrintDebugValueComment(MI, OS);
+ OutStreamer.EmitRawText(StringRef(OS.str()));
+ }
+ return;
+
+ // Emit nothing here but a comment if we can.
+ case X86::Int_MemBarrier:
+ if (OutStreamer.hasRawTextSupport())
+ OutStreamer.EmitRawText(StringRef("\t#MEMBARRIER"));
+ return;
+
+
+ case X86::EH_RETURN:
+ case X86::EH_RETURN64: {
+ // Lower these as normal, but add some comments.
+ unsigned Reg = MI->getOperand(0).getReg();
+ OutStreamer.AddComment(StringRef("eh_return, addr: %") +
+ X86ATTInstPrinter::getRegisterName(Reg));
+ break;
+ }
+ case X86::TAILJMPr:
+ case X86::TAILJMPd:
+ case X86::TAILJMPd64:
+ // Lower these as normal, but add some comments.
+ OutStreamer.AddComment("TAILCALL");
+ break;
+
+ case X86::TLS_addr32:
+ case X86::TLS_addr64:
+ case X86::TLS_base_addr32:
+ case X86::TLS_base_addr64:
+ return LowerTlsAddr(OutStreamer, MCInstLowering, *MI);
+
+ case X86::MOVPC32r: {
+ // This is a pseudo op for a two instruction sequence with a label, which
+ // looks like:
+ // call "L1$pb"
+ // "L1$pb":
+ // popl %esi
+
+ // Emit the call.
+ MCSymbol *PICBase = MF->getPICBaseSymbol();
+ // FIXME: We would like an efficient form for this, so we don't have to do a
+ // lot of extra uniquing.
+ OutStreamer.EmitInstruction(MCInstBuilder(X86::CALLpcrel32)
+ .addExpr(MCSymbolRefExpr::Create(PICBase, OutContext)));
+
+ // Emit the label.
+ OutStreamer.EmitLabel(PICBase);
+
+ // popl $reg
+ OutStreamer.EmitInstruction(MCInstBuilder(X86::POP32r)
+ .addReg(MI->getOperand(0).getReg()));
+ return;
+ }
+
+ case X86::ADD32ri: {
+ // Lower the MO_GOT_ABSOLUTE_ADDRESS form of ADD32ri.
+ if (MI->getOperand(2).getTargetFlags() != X86II::MO_GOT_ABSOLUTE_ADDRESS)
+ break;
+
+ // Okay, we have something like:
+ // EAX = ADD32ri EAX, MO_GOT_ABSOLUTE_ADDRESS(@MYGLOBAL)
+
+ // For this, we want to print something like:
+ // MYGLOBAL + (. - PICBASE)
+ // However, we can't generate a ".", so just emit a new label here and refer
+ // to it.
+ MCSymbol *DotSym = OutContext.CreateTempSymbol();
+ OutStreamer.EmitLabel(DotSym);
+
+ // Now that we have emitted the label, lower the complex operand expression.
+ MCSymbol *OpSym = MCInstLowering.GetSymbolFromOperand(MI->getOperand(2));
+
+ const MCExpr *DotExpr = MCSymbolRefExpr::Create(DotSym, OutContext);
+ const MCExpr *PICBase =
+ MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), OutContext);
+ DotExpr = MCBinaryExpr::CreateSub(DotExpr, PICBase, OutContext);
+
+ DotExpr = MCBinaryExpr::CreateAdd(MCSymbolRefExpr::Create(OpSym,OutContext),
+ DotExpr, OutContext);
+
+ OutStreamer.EmitInstruction(MCInstBuilder(X86::ADD32ri)
+ .addReg(MI->getOperand(0).getReg())
+ .addReg(MI->getOperand(1).getReg())
+ .addExpr(DotExpr));
+ return;
+ }
+ }
+
+ MCInst TmpInst;
+ MCInstLowering.Lower(MI, TmpInst);
+ OutStreamer.EmitInstruction(TmpInst);
+}
diff --git a/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp b/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp
new file mode 100644
index 0000000..568dc22
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp
@@ -0,0 +1,14 @@
+//===-- X86MachineFuctionInfo.cpp - X86 machine function info -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86MachineFunctionInfo.h"
+
+using namespace llvm;
+
+void X86MachineFunctionInfo::anchor() { }
diff --git a/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h b/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h
new file mode 100644
index 0000000..78d20ce
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h
@@ -0,0 +1,145 @@
+//===-- X86MachineFuctionInfo.h - X86 machine function info -----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares X86-specific per-machine-function information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86MACHINEFUNCTIONINFO_H
+#define X86MACHINEFUNCTIONINFO_H
+
+#include "llvm/CodeGen/MachineFunction.h"
+
+namespace llvm {
+
+/// X86MachineFunctionInfo - This class is derived from MachineFunction and
+/// contains private X86 target-specific information for each MachineFunction.
+class X86MachineFunctionInfo : public MachineFunctionInfo {
+ virtual void anchor();
+
+ /// ForceFramePointer - True if the function is required to use of frame
+ /// pointer for reasons other than it containing dynamic allocation or
+ /// that FP eliminatation is turned off. For example, Cygwin main function
+ /// contains stack pointer re-alignment code which requires FP.
+ bool ForceFramePointer;
+
+ /// CalleeSavedFrameSize - Size of the callee-saved register portion of the
+ /// stack frame in bytes.
+ unsigned CalleeSavedFrameSize;
+
+ /// BytesToPopOnReturn - Number of bytes function pops on return (in addition
+ /// to the space used by the return address).
+ /// Used on windows platform for stdcall & fastcall name decoration
+ unsigned BytesToPopOnReturn;
+
+ /// ReturnAddrIndex - FrameIndex for return slot.
+ int ReturnAddrIndex;
+
+ /// TailCallReturnAddrDelta - The number of bytes by which return address
+ /// stack slot is moved as the result of tail call optimization.
+ int TailCallReturnAddrDelta;
+
+ /// SRetReturnReg - Some subtargets require that sret lowering includes
+ /// returning the value of the returned struct in a register. This field
+ /// holds the virtual register into which the sret argument is passed.
+ unsigned SRetReturnReg;
+
+ /// GlobalBaseReg - keeps track of the virtual register initialized for
+ /// use as the global base register. This is used for PIC in some PIC
+ /// relocation models.
+ unsigned GlobalBaseReg;
+
+ /// VarArgsFrameIndex - FrameIndex for start of varargs area.
+ int VarArgsFrameIndex;
+ /// RegSaveFrameIndex - X86-64 vararg func register save area.
+ int RegSaveFrameIndex;
+ /// VarArgsGPOffset - X86-64 vararg func int reg offset.
+ unsigned VarArgsGPOffset;
+ /// VarArgsFPOffset - X86-64 vararg func fp reg offset.
+ unsigned VarArgsFPOffset;
+ /// ArgumentStackSize - The number of bytes on stack consumed by the arguments
+ /// being passed on the stack.
+ unsigned ArgumentStackSize;
+ /// NumLocalDynamics - Number of local-dynamic TLS accesses.
+ unsigned NumLocalDynamics;
+
+public:
+ X86MachineFunctionInfo() : ForceFramePointer(false),
+ CalleeSavedFrameSize(0),
+ BytesToPopOnReturn(0),
+ ReturnAddrIndex(0),
+ TailCallReturnAddrDelta(0),
+ SRetReturnReg(0),
+ GlobalBaseReg(0),
+ VarArgsFrameIndex(0),
+ RegSaveFrameIndex(0),
+ VarArgsGPOffset(0),
+ VarArgsFPOffset(0),
+ ArgumentStackSize(0),
+ NumLocalDynamics(0) {}
+
+ explicit X86MachineFunctionInfo(MachineFunction &MF)
+ : ForceFramePointer(false),
+ CalleeSavedFrameSize(0),
+ BytesToPopOnReturn(0),
+ ReturnAddrIndex(0),
+ TailCallReturnAddrDelta(0),
+ SRetReturnReg(0),
+ GlobalBaseReg(0),
+ VarArgsFrameIndex(0),
+ RegSaveFrameIndex(0),
+ VarArgsGPOffset(0),
+ VarArgsFPOffset(0),
+ ArgumentStackSize(0),
+ NumLocalDynamics(0) {}
+
+ bool getForceFramePointer() const { return ForceFramePointer;}
+ void setForceFramePointer(bool forceFP) { ForceFramePointer = forceFP; }
+
+ unsigned getCalleeSavedFrameSize() const { return CalleeSavedFrameSize; }
+ void setCalleeSavedFrameSize(unsigned bytes) { CalleeSavedFrameSize = bytes; }
+
+ unsigned getBytesToPopOnReturn() const { return BytesToPopOnReturn; }
+ void setBytesToPopOnReturn (unsigned bytes) { BytesToPopOnReturn = bytes;}
+
+ int getRAIndex() const { return ReturnAddrIndex; }
+ void setRAIndex(int Index) { ReturnAddrIndex = Index; }
+
+ int getTCReturnAddrDelta() const { return TailCallReturnAddrDelta; }
+ void setTCReturnAddrDelta(int delta) {TailCallReturnAddrDelta = delta;}
+
+ unsigned getSRetReturnReg() const { return SRetReturnReg; }
+ void setSRetReturnReg(unsigned Reg) { SRetReturnReg = Reg; }
+
+ unsigned getGlobalBaseReg() const { return GlobalBaseReg; }
+ void setGlobalBaseReg(unsigned Reg) { GlobalBaseReg = Reg; }
+
+ int getVarArgsFrameIndex() const { return VarArgsFrameIndex; }
+ void setVarArgsFrameIndex(int Idx) { VarArgsFrameIndex = Idx; }
+
+ int getRegSaveFrameIndex() const { return RegSaveFrameIndex; }
+ void setRegSaveFrameIndex(int Idx) { RegSaveFrameIndex = Idx; }
+
+ unsigned getVarArgsGPOffset() const { return VarArgsGPOffset; }
+ void setVarArgsGPOffset(unsigned Offset) { VarArgsGPOffset = Offset; }
+
+ unsigned getVarArgsFPOffset() const { return VarArgsFPOffset; }
+ void setVarArgsFPOffset(unsigned Offset) { VarArgsFPOffset = Offset; }
+
+ unsigned getArgumentStackSize() const { return ArgumentStackSize; }
+ void setArgumentStackSize(unsigned size) { ArgumentStackSize = size; }
+
+ unsigned getNumLocalDynamicTLSAccesses() const { return NumLocalDynamics; }
+ void incNumLocalDynamicTLSAccesses() { ++NumLocalDynamics; }
+
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86PadShortFunction.cpp b/contrib/llvm/lib/Target/X86/X86PadShortFunction.cpp
new file mode 100644
index 0000000..83e75ea
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86PadShortFunction.cpp
@@ -0,0 +1,212 @@
+//===-------- X86PadShortFunction.cpp - pad short functions -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the pass which will pad short functions to prevent
+// a stall if a function returns before the return address is ready. This
+// is needed for some Intel Atom processors.
+//
+//===----------------------------------------------------------------------===//
+
+#include <algorithm>
+
+#define DEBUG_TYPE "x86-pad-short-functions"
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+STATISTIC(NumBBsPadded, "Number of basic blocks padded");
+
+namespace {
+ struct VisitedBBInfo {
+ // HasReturn - Whether the BB contains a return instruction
+ bool HasReturn;
+
+ // Cycles - Number of cycles until return if HasReturn is true, otherwise
+ // number of cycles until end of the BB
+ unsigned int Cycles;
+
+ VisitedBBInfo() : HasReturn(false), Cycles(0) {}
+ VisitedBBInfo(bool HasReturn, unsigned int Cycles)
+ : HasReturn(HasReturn), Cycles(Cycles) {}
+ };
+
+ struct PadShortFunc : public MachineFunctionPass {
+ static char ID;
+ PadShortFunc() : MachineFunctionPass(ID)
+ , Threshold(4), TM(0), TII(0) {}
+
+ virtual bool runOnMachineFunction(MachineFunction &MF);
+
+ virtual const char *getPassName() const {
+ return "X86 Atom pad short functions";
+ }
+
+ private:
+ void findReturns(MachineBasicBlock *MBB,
+ unsigned int Cycles = 0);
+
+ bool cyclesUntilReturn(MachineBasicBlock *MBB,
+ unsigned int &Cycles);
+
+ void addPadding(MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator &MBBI,
+ unsigned int NOOPsToAdd);
+
+ const unsigned int Threshold;
+
+ // ReturnBBs - Maps basic blocks that return to the minimum number of
+ // cycles until the return, starting from the entry block.
+ DenseMap<MachineBasicBlock*, unsigned int> ReturnBBs;
+
+ // VisitedBBs - Cache of previously visited BBs.
+ DenseMap<MachineBasicBlock*, VisitedBBInfo> VisitedBBs;
+
+ const TargetMachine *TM;
+ const TargetInstrInfo *TII;
+ };
+
+ char PadShortFunc::ID = 0;
+}
+
+FunctionPass *llvm::createX86PadShortFunctions() {
+ return new PadShortFunc();
+}
+
+/// runOnMachineFunction - Loop over all of the basic blocks, inserting
+/// NOOP instructions before early exits.
+bool PadShortFunc::runOnMachineFunction(MachineFunction &MF) {
+ const AttributeSet &FnAttrs = MF.getFunction()->getAttributes();
+ if (FnAttrs.hasAttribute(AttributeSet::FunctionIndex,
+ Attribute::OptimizeForSize) ||
+ FnAttrs.hasAttribute(AttributeSet::FunctionIndex,
+ Attribute::MinSize)) {
+ return false;
+ }
+
+ TM = &MF.getTarget();
+ TII = TM->getInstrInfo();
+
+ // Search through basic blocks and mark the ones that have early returns
+ ReturnBBs.clear();
+ VisitedBBs.clear();
+ findReturns(MF.begin());
+
+ bool MadeChange = false;
+
+ MachineBasicBlock *MBB;
+ unsigned int Cycles = 0;
+
+ // Pad the identified basic blocks with NOOPs
+ for (DenseMap<MachineBasicBlock*, unsigned int>::iterator I = ReturnBBs.begin();
+ I != ReturnBBs.end(); ++I) {
+ MBB = I->first;
+ Cycles = I->second;
+
+ if (Cycles < Threshold) {
+ // BB ends in a return. Skip over any DBG_VALUE instructions
+ // trailing the terminator.
+ assert(MBB->size() > 0 &&
+ "Basic block should contain at least a RET but is empty");
+ MachineBasicBlock::iterator ReturnLoc = --MBB->end();
+
+ while (ReturnLoc->isDebugValue())
+ --ReturnLoc;
+ assert(ReturnLoc->isReturn() && !ReturnLoc->isCall() &&
+ "Basic block does not end with RET");
+
+ addPadding(MBB, ReturnLoc, Threshold - Cycles);
+ NumBBsPadded++;
+ MadeChange = true;
+ }
+ }
+
+ return MadeChange;
+}
+
+/// findReturn - Starting at MBB, follow control flow and add all
+/// basic blocks that contain a return to ReturnBBs.
+void PadShortFunc::findReturns(MachineBasicBlock *MBB, unsigned int Cycles) {
+ // If this BB has a return, note how many cycles it takes to get there.
+ bool hasReturn = cyclesUntilReturn(MBB, Cycles);
+ if (Cycles >= Threshold)
+ return;
+
+ if (hasReturn) {
+ ReturnBBs[MBB] = std::max(ReturnBBs[MBB], Cycles);
+ return;
+ }
+
+ // Follow branches in BB and look for returns
+ for (MachineBasicBlock::succ_iterator I = MBB->succ_begin();
+ I != MBB->succ_end(); ++I) {
+ if (*I == MBB)
+ continue;
+ findReturns(*I, Cycles);
+ }
+}
+
+/// cyclesUntilReturn - return true if the MBB has a return instruction,
+/// and return false otherwise.
+/// Cycles will be incremented by the number of cycles taken to reach the
+/// return or the end of the BB, whichever occurs first.
+bool PadShortFunc::cyclesUntilReturn(MachineBasicBlock *MBB,
+ unsigned int &Cycles) {
+ // Return cached result if BB was previously visited
+ DenseMap<MachineBasicBlock*, VisitedBBInfo>::iterator it
+ = VisitedBBs.find(MBB);
+ if (it != VisitedBBs.end()) {
+ VisitedBBInfo BBInfo = it->second;
+ Cycles += BBInfo.Cycles;
+ return BBInfo.HasReturn;
+ }
+
+ unsigned int CyclesToEnd = 0;
+
+ for (MachineBasicBlock::iterator MBBI = MBB->begin();
+ MBBI != MBB->end(); ++MBBI) {
+ MachineInstr *MI = MBBI;
+ // Mark basic blocks with a return instruction. Calls to other
+ // functions do not count because the called function will be padded,
+ // if necessary.
+ if (MI->isReturn() && !MI->isCall()) {
+ VisitedBBs[MBB] = VisitedBBInfo(true, CyclesToEnd);
+ Cycles += CyclesToEnd;
+ return true;
+ }
+
+ CyclesToEnd += TII->getInstrLatency(TM->getInstrItineraryData(), MI);
+ }
+
+ VisitedBBs[MBB] = VisitedBBInfo(false, CyclesToEnd);
+ Cycles += CyclesToEnd;
+ return false;
+}
+
+/// addPadding - Add the given number of NOOP instructions to the function
+/// just prior to the return at MBBI
+void PadShortFunc::addPadding(MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator &MBBI,
+ unsigned int NOOPsToAdd) {
+ DebugLoc DL = MBBI->getDebugLoc();
+
+ while (NOOPsToAdd-- > 0) {
+ BuildMI(*MBB, MBBI, DL, TII->get(X86::NOOP));
+ BuildMI(*MBB, MBBI, DL, TII->get(X86::NOOP));
+ }
+}
diff --git a/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp b/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp
new file mode 100644
index 0000000..16886e4
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp
@@ -0,0 +1,691 @@
+//===-- X86RegisterInfo.cpp - X86 Register Information --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 implementation of the TargetRegisterInfo class.
+// This file is responsible for the frame pointer elimination optimization
+// on X86.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86RegisterInfo.h"
+#include "X86.h"
+#include "X86InstrBuilder.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Type.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+
+#define GET_REGINFO_TARGET_DESC
+#include "X86GenRegisterInfo.inc"
+
+using namespace llvm;
+
+cl::opt<bool>
+ForceStackAlign("force-align-stack",
+ cl::desc("Force align the stack to the minimum alignment"
+ " needed for the function."),
+ cl::init(false), cl::Hidden);
+
+static cl::opt<bool>
+EnableBasePointer("x86-use-base-pointer", cl::Hidden, cl::init(true),
+ cl::desc("Enable use of a base pointer for complex stack frames"));
+
+X86RegisterInfo::X86RegisterInfo(X86TargetMachine &tm,
+ const TargetInstrInfo &tii)
+ : X86GenRegisterInfo((tm.getSubtarget<X86Subtarget>().is64Bit()
+ ? X86::RIP : X86::EIP),
+ X86_MC::getDwarfRegFlavour(tm.getTargetTriple(), false),
+ X86_MC::getDwarfRegFlavour(tm.getTargetTriple(), true),
+ (tm.getSubtarget<X86Subtarget>().is64Bit()
+ ? X86::RIP : X86::EIP)),
+ TM(tm), TII(tii) {
+ X86_MC::InitLLVM2SEHRegisterMapping(this);
+
+ // Cache some information.
+ const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
+ Is64Bit = Subtarget->is64Bit();
+ IsWin64 = Subtarget->isTargetWin64();
+
+ if (Is64Bit) {
+ SlotSize = 8;
+ StackPtr = X86::RSP;
+ FramePtr = X86::RBP;
+ } else {
+ SlotSize = 4;
+ StackPtr = X86::ESP;
+ FramePtr = X86::EBP;
+ }
+ // Use a callee-saved register as the base pointer. These registers must
+ // not conflict with any ABI requirements. For example, in 32-bit mode PIC
+ // requires GOT in the EBX register before function calls via PLT GOT pointer.
+ BasePtr = Is64Bit ? X86::RBX : X86::ESI;
+}
+
+/// getCompactUnwindRegNum - This function maps the register to the number for
+/// compact unwind encoding. Return -1 if the register isn't valid.
+int X86RegisterInfo::getCompactUnwindRegNum(unsigned RegNum, bool isEH) const {
+ switch (getLLVMRegNum(RegNum, isEH)) {
+ case X86::EBX: case X86::RBX: return 1;
+ case X86::ECX: case X86::R12: return 2;
+ case X86::EDX: case X86::R13: return 3;
+ case X86::EDI: case X86::R14: return 4;
+ case X86::ESI: case X86::R15: return 5;
+ case X86::EBP: case X86::RBP: return 6;
+ }
+
+ return -1;
+}
+
+bool
+X86RegisterInfo::trackLivenessAfterRegAlloc(const MachineFunction &MF) const {
+ // Only enable when post-RA scheduling is enabled and this is needed.
+ return TM.getSubtargetImpl()->postRAScheduler();
+}
+
+int
+X86RegisterInfo::getSEHRegNum(unsigned i) const {
+ return getEncodingValue(i);
+}
+
+const TargetRegisterClass *
+X86RegisterInfo::getSubClassWithSubReg(const TargetRegisterClass *RC,
+ unsigned Idx) const {
+ // The sub_8bit sub-register index is more constrained in 32-bit mode.
+ // It behaves just like the sub_8bit_hi index.
+ if (!Is64Bit && Idx == X86::sub_8bit)
+ Idx = X86::sub_8bit_hi;
+
+ // Forward to TableGen's default version.
+ return X86GenRegisterInfo::getSubClassWithSubReg(RC, Idx);
+}
+
+const TargetRegisterClass *
+X86RegisterInfo::getMatchingSuperRegClass(const TargetRegisterClass *A,
+ const TargetRegisterClass *B,
+ unsigned SubIdx) const {
+ // The sub_8bit sub-register index is more constrained in 32-bit mode.
+ if (!Is64Bit && SubIdx == X86::sub_8bit) {
+ A = X86GenRegisterInfo::getSubClassWithSubReg(A, X86::sub_8bit_hi);
+ if (!A)
+ return 0;
+ }
+ return X86GenRegisterInfo::getMatchingSuperRegClass(A, B, SubIdx);
+}
+
+const TargetRegisterClass*
+X86RegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC) const{
+ // Don't allow super-classes of GR8_NOREX. This class is only used after
+ // extrating sub_8bit_hi sub-registers. The H sub-registers cannot be copied
+ // to the full GR8 register class in 64-bit mode, so we cannot allow the
+ // reigster class inflation.
+ //
+ // The GR8_NOREX class is always used in a way that won't be constrained to a
+ // sub-class, so sub-classes like GR8_ABCD_L are allowed to expand to the
+ // full GR8 class.
+ if (RC == &X86::GR8_NOREXRegClass)
+ return RC;
+
+ const TargetRegisterClass *Super = RC;
+ TargetRegisterClass::sc_iterator I = RC->getSuperClasses();
+ do {
+ switch (Super->getID()) {
+ case X86::GR8RegClassID:
+ case X86::GR16RegClassID:
+ case X86::GR32RegClassID:
+ case X86::GR64RegClassID:
+ case X86::FR32RegClassID:
+ case X86::FR64RegClassID:
+ case X86::RFP32RegClassID:
+ case X86::RFP64RegClassID:
+ case X86::RFP80RegClassID:
+ case X86::VR128RegClassID:
+ case X86::VR256RegClassID:
+ // Don't return a super-class that would shrink the spill size.
+ // That can happen with the vector and float classes.
+ if (Super->getSize() == RC->getSize())
+ return Super;
+ }
+ Super = *I++;
+ } while (Super);
+ return RC;
+}
+
+const TargetRegisterClass *
+X86RegisterInfo::getPointerRegClass(const MachineFunction &MF, unsigned Kind)
+ const {
+ const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+ switch (Kind) {
+ default: llvm_unreachable("Unexpected Kind in getPointerRegClass!");
+ case 0: // Normal GPRs.
+ if (Subtarget.isTarget64BitLP64())
+ return &X86::GR64RegClass;
+ return &X86::GR32RegClass;
+ case 1: // Normal GPRs except the stack pointer (for encoding reasons).
+ if (Subtarget.isTarget64BitLP64())
+ return &X86::GR64_NOSPRegClass;
+ return &X86::GR32_NOSPRegClass;
+ case 2: // Available for tailcall (not callee-saved GPRs).
+ if (Subtarget.isTargetWin64())
+ return &X86::GR64_TCW64RegClass;
+ else if (Subtarget.is64Bit())
+ return &X86::GR64_TCRegClass;
+
+ const Function *F = MF.getFunction();
+ bool hasHipeCC = (F ? F->getCallingConv() == CallingConv::HiPE : false);
+ if (hasHipeCC)
+ return &X86::GR32RegClass;
+ return &X86::GR32_TCRegClass;
+ }
+}
+
+const TargetRegisterClass *
+X86RegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const {
+ if (RC == &X86::CCRRegClass) {
+ if (Is64Bit)
+ return &X86::GR64RegClass;
+ else
+ return &X86::GR32RegClass;
+ }
+ return RC;
+}
+
+unsigned
+X86RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
+ MachineFunction &MF) const {
+ const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+
+ unsigned FPDiff = TFI->hasFP(MF) ? 1 : 0;
+ switch (RC->getID()) {
+ default:
+ return 0;
+ case X86::GR32RegClassID:
+ return 4 - FPDiff;
+ case X86::GR64RegClassID:
+ return 12 - FPDiff;
+ case X86::VR128RegClassID:
+ return TM.getSubtarget<X86Subtarget>().is64Bit() ? 10 : 4;
+ case X86::VR64RegClassID:
+ return 4;
+ }
+}
+
+const uint16_t *
+X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+ switch (MF->getFunction()->getCallingConv()) {
+ case CallingConv::GHC:
+ case CallingConv::HiPE:
+ return CSR_NoRegs_SaveList;
+
+ case CallingConv::Intel_OCL_BI: {
+ bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX();
+ if (HasAVX && IsWin64)
+ return CSR_Win64_Intel_OCL_BI_AVX_SaveList;
+ if (HasAVX && Is64Bit)
+ return CSR_64_Intel_OCL_BI_AVX_SaveList;
+ if (!HasAVX && !IsWin64 && Is64Bit)
+ return CSR_64_Intel_OCL_BI_SaveList;
+ break;
+ }
+
+ case CallingConv::Cold:
+ if (Is64Bit)
+ return CSR_MostRegs_64_SaveList;
+ break;
+
+ default:
+ break;
+ }
+
+ bool CallsEHReturn = MF->getMMI().callsEHReturn();
+ if (Is64Bit) {
+ if (IsWin64)
+ return CSR_Win64_SaveList;
+ if (CallsEHReturn)
+ return CSR_64EHRet_SaveList;
+ return CSR_64_SaveList;
+ }
+ if (CallsEHReturn)
+ return CSR_32EHRet_SaveList;
+ return CSR_32_SaveList;
+}
+
+const uint32_t*
+X86RegisterInfo::getCallPreservedMask(CallingConv::ID CC) const {
+ bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX();
+
+ if (CC == CallingConv::Intel_OCL_BI) {
+ if (IsWin64 && HasAVX)
+ return CSR_Win64_Intel_OCL_BI_AVX_RegMask;
+ if (Is64Bit && HasAVX)
+ return CSR_64_Intel_OCL_BI_AVX_RegMask;
+ if (!HasAVX && !IsWin64 && Is64Bit)
+ return CSR_64_Intel_OCL_BI_RegMask;
+ }
+ if (CC == CallingConv::GHC || CC == CallingConv::HiPE)
+ return CSR_NoRegs_RegMask;
+ if (!Is64Bit)
+ return CSR_32_RegMask;
+ if (CC == CallingConv::Cold)
+ return CSR_MostRegs_64_RegMask;
+ if (IsWin64)
+ return CSR_Win64_RegMask;
+ return CSR_64_RegMask;
+}
+
+const uint32_t*
+X86RegisterInfo::getNoPreservedMask() const {
+ return CSR_NoRegs_RegMask;
+}
+
+BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+ BitVector Reserved(getNumRegs());
+ const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+
+ // Set the stack-pointer register and its aliases as reserved.
+ Reserved.set(X86::RSP);
+ for (MCSubRegIterator I(X86::RSP, this); I.isValid(); ++I)
+ Reserved.set(*I);
+
+ // Set the instruction pointer register and its aliases as reserved.
+ Reserved.set(X86::RIP);
+ for (MCSubRegIterator I(X86::RIP, this); I.isValid(); ++I)
+ Reserved.set(*I);
+
+ // Set the frame-pointer register and its aliases as reserved if needed.
+ if (TFI->hasFP(MF)) {
+ Reserved.set(X86::RBP);
+ for (MCSubRegIterator I(X86::RBP, this); I.isValid(); ++I)
+ Reserved.set(*I);
+ }
+
+ // Set the base-pointer register and its aliases as reserved if needed.
+ if (hasBasePointer(MF)) {
+ CallingConv::ID CC = MF.getFunction()->getCallingConv();
+ const uint32_t* RegMask = getCallPreservedMask(CC);
+ if (MachineOperand::clobbersPhysReg(RegMask, getBaseRegister()))
+ report_fatal_error(
+ "Stack realignment in presence of dynamic allocas is not supported with"
+ "this calling convention.");
+
+ Reserved.set(getBaseRegister());
+ for (MCSubRegIterator I(getBaseRegister(), this); I.isValid(); ++I)
+ Reserved.set(*I);
+ }
+
+ // Mark the segment registers as reserved.
+ Reserved.set(X86::CS);
+ Reserved.set(X86::SS);
+ Reserved.set(X86::DS);
+ Reserved.set(X86::ES);
+ Reserved.set(X86::FS);
+ Reserved.set(X86::GS);
+
+ // Mark the floating point stack registers as reserved.
+ Reserved.set(X86::ST0);
+ Reserved.set(X86::ST1);
+ Reserved.set(X86::ST2);
+ Reserved.set(X86::ST3);
+ Reserved.set(X86::ST4);
+ Reserved.set(X86::ST5);
+ Reserved.set(X86::ST6);
+ Reserved.set(X86::ST7);
+
+ // Reserve the registers that only exist in 64-bit mode.
+ if (!Is64Bit) {
+ // These 8-bit registers are part of the x86-64 extension even though their
+ // super-registers are old 32-bits.
+ Reserved.set(X86::SIL);
+ Reserved.set(X86::DIL);
+ Reserved.set(X86::BPL);
+ Reserved.set(X86::SPL);
+
+ for (unsigned n = 0; n != 8; ++n) {
+ // R8, R9, ...
+ static const uint16_t GPR64[] = {
+ X86::R8, X86::R9, X86::R10, X86::R11,
+ X86::R12, X86::R13, X86::R14, X86::R15
+ };
+ for (MCRegAliasIterator AI(GPR64[n], this, true); AI.isValid(); ++AI)
+ Reserved.set(*AI);
+
+ // XMM8, XMM9, ...
+ assert(X86::XMM15 == X86::XMM8+7);
+ for (MCRegAliasIterator AI(X86::XMM8 + n, this, true); AI.isValid(); ++AI)
+ Reserved.set(*AI);
+ }
+ }
+
+ return Reserved;
+}
+
+//===----------------------------------------------------------------------===//
+// Stack Frame Processing methods
+//===----------------------------------------------------------------------===//
+
+bool X86RegisterInfo::hasBasePointer(const MachineFunction &MF) const {
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+
+ if (!EnableBasePointer)
+ return false;
+
+ // When we need stack realignment and there are dynamic allocas, we can't
+ // reference off of the stack pointer, so we reserve a base pointer.
+ //
+ // This is also true if the function contain MS-style inline assembly. We
+ // do this because if any stack changes occur in the inline assembly, e.g.,
+ // "pusha", then any C local variable or C argument references in the
+ // inline assembly will be wrong because the SP is not properly tracked.
+ if ((needsStackRealignment(MF) && MFI->hasVarSizedObjects()) ||
+ MF.hasMSInlineAsm())
+ return true;
+
+ return false;
+}
+
+bool X86RegisterInfo::canRealignStack(const MachineFunction &MF) const {
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ const MachineRegisterInfo *MRI = &MF.getRegInfo();
+ if (!MF.getTarget().Options.RealignStack)
+ return false;
+
+ // Stack realignment requires a frame pointer. If we already started
+ // register allocation with frame pointer elimination, it is too late now.
+ if (!MRI->canReserveReg(FramePtr))
+ return false;
+
+ // If a base pointer is necessary. Check that it isn't too late to reserve
+ // it.
+ if (MFI->hasVarSizedObjects())
+ return MRI->canReserveReg(BasePtr);
+ return true;
+}
+
+bool X86RegisterInfo::needsStackRealignment(const MachineFunction &MF) const {
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ const Function *F = MF.getFunction();
+ unsigned StackAlign = TM.getFrameLowering()->getStackAlignment();
+ bool requiresRealignment =
+ ((MFI->getMaxAlignment() > StackAlign) ||
+ F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
+ Attribute::StackAlignment));
+
+ // If we've requested that we force align the stack do so now.
+ if (ForceStackAlign)
+ return canRealignStack(MF);
+
+ return requiresRealignment && canRealignStack(MF);
+}
+
+bool X86RegisterInfo::hasReservedSpillSlot(const MachineFunction &MF,
+ unsigned Reg, int &FrameIdx) const {
+ const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+
+ if (Reg == FramePtr && TFI->hasFP(MF)) {
+ FrameIdx = MF.getFrameInfo()->getObjectIndexBegin();
+ return true;
+ }
+ return false;
+}
+
+void
+X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+ int SPAdj, unsigned FIOperandNum,
+ RegScavenger *RS) const {
+ assert(SPAdj == 0 && "Unexpected");
+
+ MachineInstr &MI = *II;
+ MachineFunction &MF = *MI.getParent()->getParent();
+ const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+ int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
+ unsigned BasePtr;
+
+ unsigned Opc = MI.getOpcode();
+ bool AfterFPPop = Opc == X86::TAILJMPm64 || Opc == X86::TAILJMPm;
+ if (hasBasePointer(MF))
+ BasePtr = (FrameIndex < 0 ? FramePtr : getBaseRegister());
+ else if (needsStackRealignment(MF))
+ BasePtr = (FrameIndex < 0 ? FramePtr : StackPtr);
+ else if (AfterFPPop)
+ BasePtr = StackPtr;
+ else
+ BasePtr = (TFI->hasFP(MF) ? FramePtr : StackPtr);
+
+ // This must be part of a four operand memory reference. Replace the
+ // FrameIndex with base register with EBP. Add an offset to the offset.
+ MI.getOperand(FIOperandNum).ChangeToRegister(BasePtr, false);
+
+ // Now add the frame object offset to the offset from EBP.
+ int FIOffset;
+ if (AfterFPPop) {
+ // Tail call jmp happens after FP is popped.
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ FIOffset = MFI->getObjectOffset(FrameIndex) - TFI->getOffsetOfLocalArea();
+ } else
+ FIOffset = TFI->getFrameIndexOffset(MF, FrameIndex);
+
+ if (MI.getOperand(FIOperandNum+3).isImm()) {
+ // Offset is a 32-bit integer.
+ int Imm = (int)(MI.getOperand(FIOperandNum + 3).getImm());
+ int Offset = FIOffset + Imm;
+ assert((!Is64Bit || isInt<32>((long long)FIOffset + Imm)) &&
+ "Requesting 64-bit offset in 32-bit immediate!");
+ MI.getOperand(FIOperandNum + 3).ChangeToImmediate(Offset);
+ } else {
+ // Offset is symbolic. This is extremely rare.
+ uint64_t Offset = FIOffset +
+ (uint64_t)MI.getOperand(FIOperandNum+3).getOffset();
+ MI.getOperand(FIOperandNum + 3).setOffset(Offset);
+ }
+}
+
+unsigned X86RegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+ const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+ return TFI->hasFP(MF) ? FramePtr : StackPtr;
+}
+
+unsigned X86RegisterInfo::getEHExceptionRegister() const {
+ llvm_unreachable("What is the exception register");
+}
+
+unsigned X86RegisterInfo::getEHHandlerRegister() const {
+ llvm_unreachable("What is the exception handler register");
+}
+
+namespace llvm {
+unsigned getX86SubSuperRegister(unsigned Reg, MVT::SimpleValueType VT,
+ bool High) {
+ switch (VT) {
+ default: llvm_unreachable("Unexpected VT");
+ case MVT::i8:
+ if (High) {
+ switch (Reg) {
+ default: return getX86SubSuperRegister(Reg, MVT::i64);
+ case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
+ return X86::SI;
+ case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
+ return X86::DI;
+ case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
+ return X86::BP;
+ case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
+ return X86::SP;
+ case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
+ return X86::AH;
+ case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
+ return X86::DH;
+ case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
+ return X86::CH;
+ case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
+ return X86::BH;
+ }
+ } else {
+ switch (Reg) {
+ default: llvm_unreachable("Unexpected register");
+ case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
+ return X86::AL;
+ case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
+ return X86::DL;
+ case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
+ return X86::CL;
+ case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
+ return X86::BL;
+ case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
+ return X86::SIL;
+ case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
+ return X86::DIL;
+ case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
+ return X86::BPL;
+ case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
+ return X86::SPL;
+ case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8:
+ return X86::R8B;
+ case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9:
+ return X86::R9B;
+ case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10:
+ return X86::R10B;
+ case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11:
+ return X86::R11B;
+ case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12:
+ return X86::R12B;
+ case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13:
+ return X86::R13B;
+ case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14:
+ return X86::R14B;
+ case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15:
+ return X86::R15B;
+ }
+ }
+ case MVT::i16:
+ switch (Reg) {
+ default: llvm_unreachable("Unexpected register");
+ case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
+ return X86::AX;
+ case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
+ return X86::DX;
+ case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
+ return X86::CX;
+ case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
+ return X86::BX;
+ case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
+ return X86::SI;
+ case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
+ return X86::DI;
+ case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
+ return X86::BP;
+ case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
+ return X86::SP;
+ case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8:
+ return X86::R8W;
+ case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9:
+ return X86::R9W;
+ case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10:
+ return X86::R10W;
+ case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11:
+ return X86::R11W;
+ case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12:
+ return X86::R12W;
+ case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13:
+ return X86::R13W;
+ case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14:
+ return X86::R14W;
+ case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15:
+ return X86::R15W;
+ }
+ case MVT::i32:
+ switch (Reg) {
+ default: llvm_unreachable("Unexpected register");
+ case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
+ return X86::EAX;
+ case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
+ return X86::EDX;
+ case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
+ return X86::ECX;
+ case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
+ return X86::EBX;
+ case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
+ return X86::ESI;
+ case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
+ return X86::EDI;
+ case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
+ return X86::EBP;
+ case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
+ return X86::ESP;
+ case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8:
+ return X86::R8D;
+ case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9:
+ return X86::R9D;
+ case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10:
+ return X86::R10D;
+ case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11:
+ return X86::R11D;
+ case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12:
+ return X86::R12D;
+ case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13:
+ return X86::R13D;
+ case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14:
+ return X86::R14D;
+ case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15:
+ return X86::R15D;
+ }
+ case MVT::i64:
+ switch (Reg) {
+ default: llvm_unreachable("Unexpected register");
+ case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
+ return X86::RAX;
+ case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
+ return X86::RDX;
+ case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
+ return X86::RCX;
+ case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
+ return X86::RBX;
+ case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
+ return X86::RSI;
+ case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
+ return X86::RDI;
+ case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
+ return X86::RBP;
+ case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
+ return X86::RSP;
+ case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8:
+ return X86::R8;
+ case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9:
+ return X86::R9;
+ case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10:
+ return X86::R10;
+ case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11:
+ return X86::R11;
+ case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12:
+ return X86::R12;
+ case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13:
+ return X86::R13;
+ case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14:
+ return X86::R14;
+ case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15:
+ return X86::R15;
+ }
+ }
+}
+}
diff --git a/contrib/llvm/lib/Target/X86/X86RegisterInfo.h b/contrib/llvm/lib/Target/X86/X86RegisterInfo.h
new file mode 100644
index 0000000..b9d7b8c
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86RegisterInfo.h
@@ -0,0 +1,143 @@
+//===-- X86RegisterInfo.h - X86 Register Information Impl -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86REGISTERINFO_H
+#define X86REGISTERINFO_H
+
+#include "llvm/Target/TargetRegisterInfo.h"
+
+#define GET_REGINFO_HEADER
+#include "X86GenRegisterInfo.inc"
+
+namespace llvm {
+ class Type;
+ class TargetInstrInfo;
+ class X86TargetMachine;
+
+class X86RegisterInfo : public X86GenRegisterInfo {
+public:
+ X86TargetMachine &TM;
+ const TargetInstrInfo &TII;
+
+private:
+ /// Is64Bit - Is the target 64-bits.
+ ///
+ bool Is64Bit;
+
+ /// IsWin64 - Is the target on of win64 flavours
+ ///
+ bool IsWin64;
+
+ /// SlotSize - Stack slot size in bytes.
+ ///
+ unsigned SlotSize;
+
+ /// StackPtr - X86 physical register used as stack ptr.
+ ///
+ unsigned StackPtr;
+
+ /// FramePtr - X86 physical register used as frame ptr.
+ ///
+ unsigned FramePtr;
+
+ /// BasePtr - X86 physical register used as a base ptr in complex stack
+ /// frames. I.e., when we need a 3rd base, not just SP and FP, due to
+ /// variable size stack objects.
+ unsigned BasePtr;
+
+public:
+ X86RegisterInfo(X86TargetMachine &tm, const TargetInstrInfo &tii);
+
+ // FIXME: This should be tablegen'd like getDwarfRegNum is
+ int getSEHRegNum(unsigned i) const;
+
+ /// getCompactUnwindRegNum - This function maps the register to the number for
+ /// compact unwind encoding. Return -1 if the register isn't valid.
+ int getCompactUnwindRegNum(unsigned RegNum, bool isEH) const;
+
+ /// Code Generation virtual methods...
+ ///
+ virtual bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const;
+
+ /// getMatchingSuperRegClass - Return a subclass of the specified register
+ /// class A so that each register in it has a sub-register of the
+ /// specified sub-register index which is in the specified register class B.
+ virtual const TargetRegisterClass *
+ getMatchingSuperRegClass(const TargetRegisterClass *A,
+ const TargetRegisterClass *B, unsigned Idx) const;
+
+ virtual const TargetRegisterClass *
+ getSubClassWithSubReg(const TargetRegisterClass *RC, unsigned Idx) const;
+
+ const TargetRegisterClass*
+ getLargestLegalSuperClass(const TargetRegisterClass *RC) const;
+
+ /// getPointerRegClass - Returns a TargetRegisterClass used for pointer
+ /// values.
+ const TargetRegisterClass *
+ getPointerRegClass(const MachineFunction &MF, unsigned Kind = 0) const;
+
+ /// getCrossCopyRegClass - Returns a legal register class to copy a register
+ /// in the specified class to or from. Returns NULL if it is possible to copy
+ /// between a two registers of the specified class.
+ const TargetRegisterClass *
+ getCrossCopyRegClass(const TargetRegisterClass *RC) const;
+
+ unsigned getRegPressureLimit(const TargetRegisterClass *RC,
+ MachineFunction &MF) const;
+
+ /// getCalleeSavedRegs - Return a null-terminated list of all of the
+ /// callee-save registers on this target.
+ const uint16_t *getCalleeSavedRegs(const MachineFunction* MF = 0) const;
+ const uint32_t *getCallPreservedMask(CallingConv::ID) const;
+ const uint32_t *getNoPreservedMask() const;
+
+ /// getReservedRegs - Returns a bitset indexed by physical register number
+ /// indicating if a register is a special register that has particular uses and
+ /// should be considered unavailable at all times, e.g. SP, RA. This is used by
+ /// register scavenger to determine what registers are free.
+ BitVector getReservedRegs(const MachineFunction &MF) const;
+
+ bool hasBasePointer(const MachineFunction &MF) const;
+
+ bool canRealignStack(const MachineFunction &MF) const;
+
+ bool needsStackRealignment(const MachineFunction &MF) const;
+
+ bool hasReservedSpillSlot(const MachineFunction &MF, unsigned Reg,
+ int &FrameIdx) const;
+
+ void eliminateFrameIndex(MachineBasicBlock::iterator MI,
+ int SPAdj, unsigned FIOperandNum,
+ RegScavenger *RS = NULL) const;
+
+ // Debug information queries.
+ unsigned getFrameRegister(const MachineFunction &MF) const;
+ unsigned getStackRegister() const { return StackPtr; }
+ unsigned getBaseRegister() const { return BasePtr; }
+ // FIXME: Move to FrameInfok
+ unsigned getSlotSize() const { return SlotSize; }
+
+ // Exception handling queries.
+ unsigned getEHExceptionRegister() const;
+ unsigned getEHHandlerRegister() const;
+};
+
+// getX86SubSuperRegister - X86 utility function. It returns the sub or super
+// register of a specific X86 register.
+// e.g. getX86SubSuperRegister(X86::EAX, MVT::i16) return X86:AX
+unsigned getX86SubSuperRegister(unsigned, MVT::SimpleValueType, bool High=false);
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86RegisterInfo.td b/contrib/llvm/lib/Target/X86/X86RegisterInfo.td
new file mode 100644
index 0000000..be6282a
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86RegisterInfo.td
@@ -0,0 +1,423 @@
+//===- X86RegisterInfo.td - Describe the X86 Register File --*- tablegen -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 Register file, defining the registers themselves,
+// aliases between the registers, and the register classes built out of the
+// registers.
+//
+//===----------------------------------------------------------------------===//
+
+class X86Reg<string n, bits<16> Enc, list<Register> subregs = []> : Register<n> {
+ let Namespace = "X86";
+ let HWEncoding = Enc;
+ let SubRegs = subregs;
+}
+
+// Subregister indices.
+let Namespace = "X86" in {
+ def sub_8bit : SubRegIndex;
+ def sub_8bit_hi : SubRegIndex;
+ def sub_16bit : SubRegIndex;
+ def sub_32bit : SubRegIndex;
+ def sub_xmm : SubRegIndex;
+}
+
+//===----------------------------------------------------------------------===//
+// Register definitions...
+//
+
+// In the register alias definitions below, we define which registers alias
+// which others. We only specify which registers the small registers alias,
+// because the register file generator is smart enough to figure out that
+// AL aliases AX if we tell it that AX aliased AL (for example).
+
+// Dwarf numbering is different for 32-bit and 64-bit, and there are
+// variations by target as well. Currently the first entry is for X86-64,
+// second - for EH on X86-32/Darwin and third is 'generic' one (X86-32/Linux
+// and debug information on X86-32/Darwin)
+
+// 8-bit registers
+// Low registers
+def AL : X86Reg<"al", 0>;
+def DL : X86Reg<"dl", 2>;
+def CL : X86Reg<"cl", 1>;
+def BL : X86Reg<"bl", 3>;
+
+// High registers. On x86-64, these cannot be used in any instruction
+// with a REX prefix.
+def AH : X86Reg<"ah", 4>;
+def DH : X86Reg<"dh", 6>;
+def CH : X86Reg<"ch", 5>;
+def BH : X86Reg<"bh", 7>;
+
+// X86-64 only, requires REX.
+let CostPerUse = 1 in {
+def SIL : X86Reg<"sil", 6>;
+def DIL : X86Reg<"dil", 7>;
+def BPL : X86Reg<"bpl", 5>;
+def SPL : X86Reg<"spl", 4>;
+def R8B : X86Reg<"r8b", 8>;
+def R9B : X86Reg<"r9b", 9>;
+def R10B : X86Reg<"r10b", 10>;
+def R11B : X86Reg<"r11b", 11>;
+def R12B : X86Reg<"r12b", 12>;
+def R13B : X86Reg<"r13b", 13>;
+def R14B : X86Reg<"r14b", 14>;
+def R15B : X86Reg<"r15b", 15>;
+}
+
+// 16-bit registers
+let SubRegIndices = [sub_8bit, sub_8bit_hi], CoveredBySubRegs = 1 in {
+def AX : X86Reg<"ax", 0, [AL,AH]>;
+def DX : X86Reg<"dx", 2, [DL,DH]>;
+def CX : X86Reg<"cx", 1, [CL,CH]>;
+def BX : X86Reg<"bx", 3, [BL,BH]>;
+}
+let SubRegIndices = [sub_8bit] in {
+def SI : X86Reg<"si", 6, [SIL]>;
+def DI : X86Reg<"di", 7, [DIL]>;
+def BP : X86Reg<"bp", 5, [BPL]>;
+def SP : X86Reg<"sp", 4, [SPL]>;
+}
+def IP : X86Reg<"ip", 0>;
+
+// X86-64 only, requires REX.
+let SubRegIndices = [sub_8bit], CostPerUse = 1 in {
+def R8W : X86Reg<"r8w", 8, [R8B]>;
+def R9W : X86Reg<"r9w", 9, [R9B]>;
+def R10W : X86Reg<"r10w", 10, [R10B]>;
+def R11W : X86Reg<"r11w", 11, [R11B]>;
+def R12W : X86Reg<"r12w", 12, [R12B]>;
+def R13W : X86Reg<"r13w", 13, [R13B]>;
+def R14W : X86Reg<"r14w", 14, [R14B]>;
+def R15W : X86Reg<"r15w", 15, [R15B]>;
+}
+
+// 32-bit registers
+let SubRegIndices = [sub_16bit] in {
+def EAX : X86Reg<"eax", 0, [AX]>, DwarfRegNum<[-2, 0, 0]>;
+def EDX : X86Reg<"edx", 2, [DX]>, DwarfRegNum<[-2, 2, 2]>;
+def ECX : X86Reg<"ecx", 1, [CX]>, DwarfRegNum<[-2, 1, 1]>;
+def EBX : X86Reg<"ebx", 3, [BX]>, DwarfRegNum<[-2, 3, 3]>;
+def ESI : X86Reg<"esi", 6, [SI]>, DwarfRegNum<[-2, 6, 6]>;
+def EDI : X86Reg<"edi", 7, [DI]>, DwarfRegNum<[-2, 7, 7]>;
+def EBP : X86Reg<"ebp", 5, [BP]>, DwarfRegNum<[-2, 4, 5]>;
+def ESP : X86Reg<"esp", 4, [SP]>, DwarfRegNum<[-2, 5, 4]>;
+def EIP : X86Reg<"eip", 0, [IP]>, DwarfRegNum<[-2, 8, 8]>;
+
+// X86-64 only, requires REX
+let CostPerUse = 1 in {
+def R8D : X86Reg<"r8d", 8, [R8W]>;
+def R9D : X86Reg<"r9d", 9, [R9W]>;
+def R10D : X86Reg<"r10d", 10, [R10W]>;
+def R11D : X86Reg<"r11d", 11, [R11W]>;
+def R12D : X86Reg<"r12d", 12, [R12W]>;
+def R13D : X86Reg<"r13d", 13, [R13W]>;
+def R14D : X86Reg<"r14d", 14, [R14W]>;
+def R15D : X86Reg<"r15d", 15, [R15W]>;
+}}
+
+// 64-bit registers, X86-64 only
+let SubRegIndices = [sub_32bit] in {
+def RAX : X86Reg<"rax", 0, [EAX]>, DwarfRegNum<[0, -2, -2]>;
+def RDX : X86Reg<"rdx", 2, [EDX]>, DwarfRegNum<[1, -2, -2]>;
+def RCX : X86Reg<"rcx", 1, [ECX]>, DwarfRegNum<[2, -2, -2]>;
+def RBX : X86Reg<"rbx", 3, [EBX]>, DwarfRegNum<[3, -2, -2]>;
+def RSI : X86Reg<"rsi", 6, [ESI]>, DwarfRegNum<[4, -2, -2]>;
+def RDI : X86Reg<"rdi", 7, [EDI]>, DwarfRegNum<[5, -2, -2]>;
+def RBP : X86Reg<"rbp", 5, [EBP]>, DwarfRegNum<[6, -2, -2]>;
+def RSP : X86Reg<"rsp", 4, [ESP]>, DwarfRegNum<[7, -2, -2]>;
+
+// These also require REX.
+let CostPerUse = 1 in {
+def R8 : X86Reg<"r8", 8, [R8D]>, DwarfRegNum<[ 8, -2, -2]>;
+def R9 : X86Reg<"r9", 9, [R9D]>, DwarfRegNum<[ 9, -2, -2]>;
+def R10 : X86Reg<"r10", 10, [R10D]>, DwarfRegNum<[10, -2, -2]>;
+def R11 : X86Reg<"r11", 11, [R11D]>, DwarfRegNum<[11, -2, -2]>;
+def R12 : X86Reg<"r12", 12, [R12D]>, DwarfRegNum<[12, -2, -2]>;
+def R13 : X86Reg<"r13", 13, [R13D]>, DwarfRegNum<[13, -2, -2]>;
+def R14 : X86Reg<"r14", 14, [R14D]>, DwarfRegNum<[14, -2, -2]>;
+def R15 : X86Reg<"r15", 15, [R15D]>, DwarfRegNum<[15, -2, -2]>;
+def RIP : X86Reg<"rip", 0, [EIP]>, DwarfRegNum<[16, -2, -2]>;
+}}
+
+// MMX Registers. These are actually aliased to ST0 .. ST7
+def MM0 : X86Reg<"mm0", 0>, DwarfRegNum<[41, 29, 29]>;
+def MM1 : X86Reg<"mm1", 1>, DwarfRegNum<[42, 30, 30]>;
+def MM2 : X86Reg<"mm2", 2>, DwarfRegNum<[43, 31, 31]>;
+def MM3 : X86Reg<"mm3", 3>, DwarfRegNum<[44, 32, 32]>;
+def MM4 : X86Reg<"mm4", 4>, DwarfRegNum<[45, 33, 33]>;
+def MM5 : X86Reg<"mm5", 5>, DwarfRegNum<[46, 34, 34]>;
+def MM6 : X86Reg<"mm6", 6>, DwarfRegNum<[47, 35, 35]>;
+def MM7 : X86Reg<"mm7", 7>, DwarfRegNum<[48, 36, 36]>;
+
+// Pseudo Floating Point registers
+def FP0 : X86Reg<"fp0", 0>;
+def FP1 : X86Reg<"fp1", 0>;
+def FP2 : X86Reg<"fp2", 0>;
+def FP3 : X86Reg<"fp3", 0>;
+def FP4 : X86Reg<"fp4", 0>;
+def FP5 : X86Reg<"fp5", 0>;
+def FP6 : X86Reg<"fp6", 0>;
+
+// XMM Registers, used by the various SSE instruction set extensions.
+def XMM0: X86Reg<"xmm0", 0>, DwarfRegNum<[17, 21, 21]>;
+def XMM1: X86Reg<"xmm1", 1>, DwarfRegNum<[18, 22, 22]>;
+def XMM2: X86Reg<"xmm2", 2>, DwarfRegNum<[19, 23, 23]>;
+def XMM3: X86Reg<"xmm3", 3>, DwarfRegNum<[20, 24, 24]>;
+def XMM4: X86Reg<"xmm4", 4>, DwarfRegNum<[21, 25, 25]>;
+def XMM5: X86Reg<"xmm5", 5>, DwarfRegNum<[22, 26, 26]>;
+def XMM6: X86Reg<"xmm6", 6>, DwarfRegNum<[23, 27, 27]>;
+def XMM7: X86Reg<"xmm7", 7>, DwarfRegNum<[24, 28, 28]>;
+
+// X86-64 only
+let CostPerUse = 1 in {
+def XMM8: X86Reg<"xmm8", 8>, DwarfRegNum<[25, -2, -2]>;
+def XMM9: X86Reg<"xmm9", 9>, DwarfRegNum<[26, -2, -2]>;
+def XMM10: X86Reg<"xmm10", 10>, DwarfRegNum<[27, -2, -2]>;
+def XMM11: X86Reg<"xmm11", 11>, DwarfRegNum<[28, -2, -2]>;
+def XMM12: X86Reg<"xmm12", 12>, DwarfRegNum<[29, -2, -2]>;
+def XMM13: X86Reg<"xmm13", 13>, DwarfRegNum<[30, -2, -2]>;
+def XMM14: X86Reg<"xmm14", 14>, DwarfRegNum<[31, -2, -2]>;
+def XMM15: X86Reg<"xmm15", 15>, DwarfRegNum<[32, -2, -2]>;
+} // CostPerUse
+
+// YMM Registers, used by AVX instructions
+let SubRegIndices = [sub_xmm] in {
+def YMM0: X86Reg<"ymm0", 0, [XMM0]>, DwarfRegAlias<XMM0>;
+def YMM1: X86Reg<"ymm1", 1, [XMM1]>, DwarfRegAlias<XMM1>;
+def YMM2: X86Reg<"ymm2", 2, [XMM2]>, DwarfRegAlias<XMM2>;
+def YMM3: X86Reg<"ymm3", 3, [XMM3]>, DwarfRegAlias<XMM3>;
+def YMM4: X86Reg<"ymm4", 4, [XMM4]>, DwarfRegAlias<XMM4>;
+def YMM5: X86Reg<"ymm5", 5, [XMM5]>, DwarfRegAlias<XMM5>;
+def YMM6: X86Reg<"ymm6", 6, [XMM6]>, DwarfRegAlias<XMM6>;
+def YMM7: X86Reg<"ymm7", 7, [XMM7]>, DwarfRegAlias<XMM7>;
+def YMM8: X86Reg<"ymm8", 8, [XMM8]>, DwarfRegAlias<XMM8>;
+def YMM9: X86Reg<"ymm9", 9, [XMM9]>, DwarfRegAlias<XMM9>;
+def YMM10: X86Reg<"ymm10", 10, [XMM10]>, DwarfRegAlias<XMM10>;
+def YMM11: X86Reg<"ymm11", 11, [XMM11]>, DwarfRegAlias<XMM11>;
+def YMM12: X86Reg<"ymm12", 12, [XMM12]>, DwarfRegAlias<XMM12>;
+def YMM13: X86Reg<"ymm13", 13, [XMM13]>, DwarfRegAlias<XMM13>;
+def YMM14: X86Reg<"ymm14", 14, [XMM14]>, DwarfRegAlias<XMM14>;
+def YMM15: X86Reg<"ymm15", 15, [XMM15]>, DwarfRegAlias<XMM15>;
+}
+
+class STRegister<string n, bits<16> Enc, list<Register> A> : X86Reg<n, Enc> {
+ let Aliases = A;
+}
+
+// Floating point stack registers. These don't map one-to-one to the FP
+// pseudo registers, but we still mark them as aliasing FP registers. That
+// way both kinds can be live without exceeding the stack depth. ST registers
+// are only live around inline assembly.
+def ST0 : STRegister<"st(0)", 0, []>, DwarfRegNum<[33, 12, 11]>;
+def ST1 : STRegister<"st(1)", 1, [FP6]>, DwarfRegNum<[34, 13, 12]>;
+def ST2 : STRegister<"st(2)", 2, [FP5]>, DwarfRegNum<[35, 14, 13]>;
+def ST3 : STRegister<"st(3)", 3, [FP4]>, DwarfRegNum<[36, 15, 14]>;
+def ST4 : STRegister<"st(4)", 4, [FP3]>, DwarfRegNum<[37, 16, 15]>;
+def ST5 : STRegister<"st(5)", 5, [FP2]>, DwarfRegNum<[38, 17, 16]>;
+def ST6 : STRegister<"st(6)", 6, [FP1]>, DwarfRegNum<[39, 18, 17]>;
+def ST7 : STRegister<"st(7)", 7, [FP0]>, DwarfRegNum<[40, 19, 18]>;
+
+// Floating-point status word
+def FPSW : X86Reg<"fpsw", 0>;
+
+// Status flags register
+def EFLAGS : X86Reg<"flags", 0>;
+
+// Segment registers
+def CS : X86Reg<"cs", 1>;
+def DS : X86Reg<"ds", 3>;
+def SS : X86Reg<"ss", 2>;
+def ES : X86Reg<"es", 0>;
+def FS : X86Reg<"fs", 4>;
+def GS : X86Reg<"gs", 5>;
+
+// Debug registers
+def DR0 : X86Reg<"dr0", 0>;
+def DR1 : X86Reg<"dr1", 1>;
+def DR2 : X86Reg<"dr2", 2>;
+def DR3 : X86Reg<"dr3", 3>;
+def DR4 : X86Reg<"dr4", 4>;
+def DR5 : X86Reg<"dr5", 5>;
+def DR6 : X86Reg<"dr6", 6>;
+def DR7 : X86Reg<"dr7", 7>;
+
+// Control registers
+def CR0 : X86Reg<"cr0", 0>;
+def CR1 : X86Reg<"cr1", 1>;
+def CR2 : X86Reg<"cr2", 2>;
+def CR3 : X86Reg<"cr3", 3>;
+def CR4 : X86Reg<"cr4", 4>;
+def CR5 : X86Reg<"cr5", 5>;
+def CR6 : X86Reg<"cr6", 6>;
+def CR7 : X86Reg<"cr7", 7>;
+def CR8 : X86Reg<"cr8", 8>;
+def CR9 : X86Reg<"cr9", 9>;
+def CR10 : X86Reg<"cr10", 10>;
+def CR11 : X86Reg<"cr11", 11>;
+def CR12 : X86Reg<"cr12", 12>;
+def CR13 : X86Reg<"cr13", 13>;
+def CR14 : X86Reg<"cr14", 14>;
+def CR15 : X86Reg<"cr15", 15>;
+
+// Pseudo index registers
+def EIZ : X86Reg<"eiz", 4>;
+def RIZ : X86Reg<"riz", 4>;
+
+
+//===----------------------------------------------------------------------===//
+// Register Class Definitions... now that we have all of the pieces, define the
+// top-level register classes. The order specified in the register list is
+// implicitly defined to be the register allocation order.
+//
+
+// List call-clobbered registers before callee-save registers. RBX, RBP, (and
+// R12, R13, R14, and R15 for X86-64) are callee-save registers.
+// In 64-mode, there are 12 additional i8 registers, SIL, DIL, BPL, SPL, and
+// R8B, ... R15B.
+// Allocate R12 and R13 last, as these require an extra byte when
+// encoded in x86_64 instructions.
+// FIXME: Allow AH, CH, DH, BH to be used as general-purpose registers in
+// 64-bit mode. The main complication is that they cannot be encoded in an
+// instruction requiring a REX prefix, while SIL, DIL, BPL, R8D, etc.
+// require a REX prefix. For example, "addb %ah, %dil" and "movzbl %ah, %r8d"
+// cannot be encoded.
+def GR8 : RegisterClass<"X86", [i8], 8,
+ (add AL, CL, DL, AH, CH, DH, BL, BH, SIL, DIL, BPL, SPL,
+ R8B, R9B, R10B, R11B, R14B, R15B, R12B, R13B)> {
+ let AltOrders = [(sub GR8, AH, BH, CH, DH)];
+ let AltOrderSelect = [{
+ return MF.getTarget().getSubtarget<X86Subtarget>().is64Bit();
+ }];
+}
+
+def GR16 : RegisterClass<"X86", [i16], 16,
+ (add AX, CX, DX, SI, DI, BX, BP, SP,
+ R8W, R9W, R10W, R11W, R14W, R15W, R12W, R13W)>;
+
+def GR32 : RegisterClass<"X86", [i32], 32,
+ (add EAX, ECX, EDX, ESI, EDI, EBX, EBP, ESP,
+ R8D, R9D, R10D, R11D, R14D, R15D, R12D, R13D)>;
+
+// GR64 - 64-bit GPRs. This oddly includes RIP, which isn't accurate, since
+// RIP isn't really a register and it can't be used anywhere except in an
+// address, but it doesn't cause trouble.
+def GR64 : RegisterClass<"X86", [i64], 64,
+ (add RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11,
+ RBX, R14, R15, R12, R13, RBP, RSP, RIP)>;
+
+// Segment registers for use by MOV instructions (and others) that have a
+// segment register as one operand. Always contain a 16-bit segment
+// descriptor.
+def SEGMENT_REG : RegisterClass<"X86", [i16], 16, (add CS, DS, SS, ES, FS, GS)>;
+
+// Debug registers.
+def DEBUG_REG : RegisterClass<"X86", [i32], 32, (sequence "DR%u", 0, 7)>;
+
+// Control registers.
+def CONTROL_REG : RegisterClass<"X86", [i64], 64, (sequence "CR%u", 0, 15)>;
+
+// GR8_ABCD_L, GR8_ABCD_H, GR16_ABCD, GR32_ABCD, GR64_ABCD - Subclasses of
+// GR8, GR16, GR32, and GR64 which contain just the "a" "b", "c", and "d"
+// registers. On x86-32, GR16_ABCD and GR32_ABCD are classes for registers
+// that support 8-bit subreg operations. On x86-64, GR16_ABCD, GR32_ABCD,
+// and GR64_ABCD are classes for registers that support 8-bit h-register
+// operations.
+def GR8_ABCD_L : RegisterClass<"X86", [i8], 8, (add AL, CL, DL, BL)>;
+def GR8_ABCD_H : RegisterClass<"X86", [i8], 8, (add AH, CH, DH, BH)>;
+def GR16_ABCD : RegisterClass<"X86", [i16], 16, (add AX, CX, DX, BX)>;
+def GR32_ABCD : RegisterClass<"X86", [i32], 32, (add EAX, ECX, EDX, EBX)>;
+def GR64_ABCD : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, RBX)>;
+def GR32_TC : RegisterClass<"X86", [i32], 32, (add EAX, ECX, EDX)>;
+def GR64_TC : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, RSI, RDI,
+ R8, R9, R11, RIP)>;
+def GR64_TCW64 : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX,
+ R8, R9, R11)>;
+
+// GR8_NOREX - GR8 registers which do not require a REX prefix.
+def GR8_NOREX : RegisterClass<"X86", [i8], 8,
+ (add AL, CL, DL, AH, CH, DH, BL, BH)> {
+ let AltOrders = [(sub GR8_NOREX, AH, BH, CH, DH)];
+ let AltOrderSelect = [{
+ return MF.getTarget().getSubtarget<X86Subtarget>().is64Bit();
+ }];
+}
+// GR16_NOREX - GR16 registers which do not require a REX prefix.
+def GR16_NOREX : RegisterClass<"X86", [i16], 16,
+ (add AX, CX, DX, SI, DI, BX, BP, SP)>;
+// GR32_NOREX - GR32 registers which do not require a REX prefix.
+def GR32_NOREX : RegisterClass<"X86", [i32], 32,
+ (add EAX, ECX, EDX, ESI, EDI, EBX, EBP, ESP)>;
+// GR64_NOREX - GR64 registers which do not require a REX prefix.
+def GR64_NOREX : RegisterClass<"X86", [i64], 64,
+ (add RAX, RCX, RDX, RSI, RDI, RBX, RBP, RSP, RIP)>;
+
+// GR32_NOAX - GR32 registers except EAX. Used by AddRegFrm of XCHG32 in 64-bit
+// mode to prevent encoding using the 0x90 NOP encoding. xchg %eax, %eax needs
+// to clear upper 32-bits of RAX so is not a NOP.
+def GR32_NOAX : RegisterClass<"X86", [i32], 32, (sub GR32, EAX)>;
+
+// GR32_NOSP - GR32 registers except ESP.
+def GR32_NOSP : RegisterClass<"X86", [i32], 32, (sub GR32, ESP)>;
+
+// GR64_NOSP - GR64 registers except RSP (and RIP).
+def GR64_NOSP : RegisterClass<"X86", [i64], 64, (sub GR64, RSP, RIP)>;
+
+// GR32_NOREX_NOSP - GR32 registers which do not require a REX prefix except
+// ESP.
+def GR32_NOREX_NOSP : RegisterClass<"X86", [i32], 32,
+ (and GR32_NOREX, GR32_NOSP)>;
+
+// GR64_NOREX_NOSP - GR64_NOREX registers except RSP.
+def GR64_NOREX_NOSP : RegisterClass<"X86", [i64], 64,
+ (and GR64_NOREX, GR64_NOSP)>;
+
+// A class to support the 'A' assembler constraint: EAX then EDX.
+def GR32_AD : RegisterClass<"X86", [i32], 32, (add EAX, EDX)>;
+
+// Scalar SSE2 floating point registers.
+def FR32 : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 15)>;
+
+def FR64 : RegisterClass<"X86", [f64], 64, (add FR32)>;
+
+
+// FIXME: This sets up the floating point register files as though they are f64
+// values, though they really are f80 values. This will cause us to spill
+// values as 64-bit quantities instead of 80-bit quantities, which is much much
+// faster on common hardware. In reality, this should be controlled by a
+// command line option or something.
+
+def RFP32 : RegisterClass<"X86",[f32], 32, (sequence "FP%u", 0, 6)>;
+def RFP64 : RegisterClass<"X86",[f64], 32, (add RFP32)>;
+def RFP80 : RegisterClass<"X86",[f80], 32, (add RFP32)>;
+
+// Floating point stack registers (these are not allocatable by the
+// register allocator - the floating point stackifier is responsible
+// for transforming FPn allocations to STn registers)
+def RST : RegisterClass<"X86", [f80, f64, f32], 32, (sequence "ST%u", 0, 7)> {
+ let isAllocatable = 0;
+}
+
+// Generic vector registers: VR64 and VR128.
+def VR64: RegisterClass<"X86", [x86mmx], 64, (sequence "MM%u", 0, 7)>;
+def VR128 : RegisterClass<"X86", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ 128, (add FR32)>;
+def VR256 : RegisterClass<"X86", [v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+ 256, (sequence "YMM%u", 0, 15)>;
+
+// Status flags registers.
+def CCR : RegisterClass<"X86", [i32], 32, (add EFLAGS)> {
+ let CopyCost = -1; // Don't allow copying of status registers.
+ let isAllocatable = 0;
+}
+def FPCCR : RegisterClass<"X86", [i16], 16, (add FPSW)> {
+ let CopyCost = -1; // Don't allow copying of status registers.
+ let isAllocatable = 0;
+}
diff --git a/contrib/llvm/lib/Target/X86/X86Relocations.h b/contrib/llvm/lib/Target/X86/X86Relocations.h
new file mode 100644
index 0000000..0333056
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86Relocations.h
@@ -0,0 +1,52 @@
+//===-- X86Relocations.h - X86 Code Relocations -----------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the X86 target-specific relocation types.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86RELOCATIONS_H
+#define X86RELOCATIONS_H
+
+#include "llvm/CodeGen/MachineRelocation.h"
+
+namespace llvm {
+ namespace X86 {
+ /// RelocationType - An enum for the x86 relocation codes. Note that
+ /// the terminology here doesn't follow x86 convention - word means
+ /// 32-bit and dword means 64-bit. The relocations will be treated
+ /// by JIT or ObjectCode emitters, this is transparent to the x86 code
+ /// emitter but JIT and ObjectCode will treat them differently
+ enum RelocationType {
+ /// reloc_pcrel_word - PC relative relocation, add the relocated value to
+ /// the value already in memory, after we adjust it for where the PC is.
+ reloc_pcrel_word = 0,
+
+ /// reloc_picrel_word - PIC base relative relocation, add the relocated
+ /// value to the value already in memory, after we adjust it for where the
+ /// PIC base is.
+ reloc_picrel_word = 1,
+
+ /// reloc_absolute_word - absolute relocation, just add the relocated
+ /// value to the value already in memory.
+ reloc_absolute_word = 2,
+
+ /// reloc_absolute_word_sext - absolute relocation, just add the relocated
+ /// value to the value already in memory. In object files, it represents a
+ /// value which must be sign-extended when resolving the relocation.
+ reloc_absolute_word_sext = 3,
+
+ /// reloc_absolute_dword - absolute relocation, just add the relocated
+ /// value to the value already in memory.
+ reloc_absolute_dword = 4
+ };
+ }
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86SchedHaswell.td b/contrib/llvm/lib/Target/X86/X86SchedHaswell.td
new file mode 100644
index 0000000..84c9203
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86SchedHaswell.td
@@ -0,0 +1,126 @@
+//=- X86SchedHaswell.td - X86 Haswell Scheduling -------------*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for Haswell to support instruction
+// scheduling and other instruction cost heuristics.
+//
+//===----------------------------------------------------------------------===//
+
+def HaswellModel : SchedMachineModel {
+ // All x86 instructions are modeled as a single micro-op, and HW can decode 4
+ // instructions per cycle.
+ let IssueWidth = 4;
+ let MinLatency = 0; // 0 = Out-of-order execution.
+ let LoadLatency = 4;
+ let ILPWindow = 30;
+ let MispredictPenalty = 16;
+}
+
+let SchedModel = HaswellModel in {
+
+// Haswell can issue micro-ops to 8 different ports in one cycle.
+
+// Ports 0, 1, 5, 6 and 7 handle all computation.
+// Port 4 gets the data half of stores. Store data can be available later than
+// the store address, but since we don't model the latency of stores, we can
+// ignore that.
+// Ports 2 and 3 are identical. They handle loads and the address half of
+// stores. Port 7 can handle address calculations.
+def HWPort0 : ProcResource<1>;
+def HWPort1 : ProcResource<1>;
+def HWPort2 : ProcResource<1>;
+def HWPort3 : ProcResource<1>;
+def HWPort4 : ProcResource<1>;
+def HWPort5 : ProcResource<1>;
+def HWPort6 : ProcResource<1>;
+def HWPort7 : ProcResource<1>;
+
+// Many micro-ops are capable of issuing on multiple ports.
+def HWPort23 : ProcResGroup<[HWPort2, HWPort3]>;
+def HWPort237 : ProcResGroup<[HWPort2, HWPort3, HWPort7]>;
+def HWPort05 : ProcResGroup<[HWPort0, HWPort5]>;
+def HWPort056 : ProcResGroup<[HWPort0, HWPort5, HWPort6]>;
+def HWPort15 : ProcResGroup<[HWPort1, HWPort5]>;
+def HWPort015 : ProcResGroup<[HWPort0, HWPort1, HWPort5]>;
+def HWPort0156: ProcResGroup<[HWPort0, HWPort1, HWPort5, HWPort6]>;
+
+// Integer division issued on port 0.
+def HWDivider : ProcResource<1>;
+
+// Loads are 4 cycles, so ReadAfterLd registers needn't be available until 4
+// cycles after the memory operand.
+def : ReadAdvance<ReadAfterLd, 4>;
+
+// Many SchedWrites are defined in pairs with and without a folded load.
+// Instructions with folded loads are usually micro-fused, so they only appear
+// as two micro-ops when queued in the reservation station.
+// This multiclass defines the resource usage for variants with and without
+// folded loads.
+multiclass HWWriteResPair<X86FoldableSchedWrite SchedRW,
+ ProcResourceKind ExePort,
+ int Lat> {
+ // Register variant is using a single cycle on ExePort.
+ def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; }
+
+ // Memory variant also uses a cycle on port 2/3 and adds 4 cycles to the
+ // latency.
+ def : WriteRes<SchedRW.Folded, [HWPort23, ExePort]> {
+ let Latency = !add(Lat, 4);
+ }
+}
+
+// A folded store needs a cycle on port 4 for the store data, but it does not
+// need an extra port 2/3 cycle to recompute the address.
+def : WriteRes<WriteRMW, [HWPort4]>;
+
+def : WriteRes<WriteStore, [HWPort237, HWPort4]>;
+def : WriteRes<WriteLoad, [HWPort23]> { let Latency = 4; }
+def : WriteRes<WriteMove, [HWPort0156]>;
+def : WriteRes<WriteZero, []>;
+
+defm : HWWriteResPair<WriteALU, HWPort0156, 1>;
+defm : HWWriteResPair<WriteIMul, HWPort1, 3>;
+defm : HWWriteResPair<WriteShift, HWPort056, 1>;
+defm : HWWriteResPair<WriteJump, HWPort5, 1>;
+
+// This is for simple LEAs with one or two input operands.
+// The complex ones can only execute on port 1, and they require two cycles on
+// the port to read all inputs. We don't model that.
+def : WriteRes<WriteLEA, [HWPort15]>;
+
+// This is quite rough, latency depends on the dividend.
+def : WriteRes<WriteIDiv, [HWPort0, HWDivider]> {
+ let Latency = 25;
+ let ResourceCycles = [1, 10];
+}
+def : WriteRes<WriteIDivLd, [HWPort23, HWPort0, HWDivider]> {
+ let Latency = 29;
+ let ResourceCycles = [1, 1, 10];
+}
+
+// Scalar and vector floating point.
+defm : HWWriteResPair<WriteFAdd, HWPort1, 3>;
+defm : HWWriteResPair<WriteFMul, HWPort0, 5>;
+defm : HWWriteResPair<WriteFDiv, HWPort0, 12>; // 10-14 cycles.
+defm : HWWriteResPair<WriteFRcp, HWPort0, 5>;
+defm : HWWriteResPair<WriteFSqrt, HWPort0, 15>;
+defm : HWWriteResPair<WriteCvtF2I, HWPort1, 3>;
+defm : HWWriteResPair<WriteCvtI2F, HWPort1, 4>;
+defm : HWWriteResPair<WriteCvtF2F, HWPort1, 3>;
+
+// Vector integer operations.
+defm : HWWriteResPair<WriteVecShift, HWPort05, 1>;
+defm : HWWriteResPair<WriteVecLogic, HWPort015, 1>;
+defm : HWWriteResPair<WriteVecALU, HWPort15, 1>;
+defm : HWWriteResPair<WriteVecIMul, HWPort0, 5>;
+defm : HWWriteResPair<WriteShuffle, HWPort15, 1>;
+
+def : WriteRes<WriteSystem, [HWPort0156]> { let Latency = 100; }
+def : WriteRes<WriteMicrocoded, [HWPort0156]> { let Latency = 100; }
+} // SchedModel
diff --git a/contrib/llvm/lib/Target/X86/X86SchedSandyBridge.td b/contrib/llvm/lib/Target/X86/X86SchedSandyBridge.td
new file mode 100644
index 0000000..b36b3ad
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86SchedSandyBridge.td
@@ -0,0 +1,122 @@
+//=- X86SchedSandyBridge.td - X86 Sandy Bridge Scheduling ----*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for Sandy Bridge to support instruction
+// scheduling and other instruction cost heuristics.
+//
+//===----------------------------------------------------------------------===//
+
+def SandyBridgeModel : SchedMachineModel {
+ // All x86 instructions are modeled as a single micro-op, and SB can decode 4
+ // instructions per cycle.
+ // FIXME: Identify instructions that aren't a single fused micro-op.
+ let IssueWidth = 4;
+ let MinLatency = 0; // 0 = Out-of-order execution.
+ let LoadLatency = 4;
+ let ILPWindow = 20;
+ let MispredictPenalty = 16;
+}
+
+let SchedModel = SandyBridgeModel in {
+
+// Sandy Bridge can issue micro-ops to 6 different ports in one cycle.
+
+// Ports 0, 1, and 5 handle all computation.
+def SBPort0 : ProcResource<1>;
+def SBPort1 : ProcResource<1>;
+def SBPort5 : ProcResource<1>;
+
+// Ports 2 and 3 are identical. They handle loads and the address half of
+// stores.
+def SBPort23 : ProcResource<2>;
+
+// Port 4 gets the data half of stores. Store data can be available later than
+// the store address, but since we don't model the latency of stores, we can
+// ignore that.
+def SBPort4 : ProcResource<1>;
+
+// Many micro-ops are capable of issuing on multiple ports.
+def SBPort05 : ProcResGroup<[SBPort0, SBPort5]>;
+def SBPort15 : ProcResGroup<[SBPort1, SBPort5]>;
+def SBPort015 : ProcResGroup<[SBPort0, SBPort1, SBPort5]>;
+
+// Integer division issued on port 0.
+def SBDivider : ProcResource<1>;
+
+// Loads are 4 cycles, so ReadAfterLd registers needn't be available until 4
+// cycles after the memory operand.
+def : ReadAdvance<ReadAfterLd, 4>;
+
+// Many SchedWrites are defined in pairs with and without a folded load.
+// Instructions with folded loads are usually micro-fused, so they only appear
+// as two micro-ops when queued in the reservation station.
+// This multiclass defines the resource usage for variants with and without
+// folded loads.
+multiclass SBWriteResPair<X86FoldableSchedWrite SchedRW,
+ ProcResourceKind ExePort,
+ int Lat> {
+ // Register variant is using a single cycle on ExePort.
+ def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; }
+
+ // Memory variant also uses a cycle on port 2/3 and adds 4 cycles to the
+ // latency.
+ def : WriteRes<SchedRW.Folded, [SBPort23, ExePort]> {
+ let Latency = !add(Lat, 4);
+ }
+}
+
+// A folded store needs a cycle on port 4 for the store data, but it does not
+// need an extra port 2/3 cycle to recompute the address.
+def : WriteRes<WriteRMW, [SBPort4]>;
+
+def : WriteRes<WriteStore, [SBPort23, SBPort4]>;
+def : WriteRes<WriteLoad, [SBPort23]> { let Latency = 4; }
+def : WriteRes<WriteMove, [SBPort015]>;
+def : WriteRes<WriteZero, []>;
+
+defm : SBWriteResPair<WriteALU, SBPort015, 1>;
+defm : SBWriteResPair<WriteIMul, SBPort1, 3>;
+defm : SBWriteResPair<WriteShift, SBPort05, 1>;
+defm : SBWriteResPair<WriteJump, SBPort5, 1>;
+
+// This is for simple LEAs with one or two input operands.
+// The complex ones can only execute on port 1, and they require two cycles on
+// the port to read all inputs. We don't model that.
+def : WriteRes<WriteLEA, [SBPort15]>;
+
+// This is quite rough, latency depends on the dividend.
+def : WriteRes<WriteIDiv, [SBPort0, SBDivider]> {
+ let Latency = 25;
+ let ResourceCycles = [1, 10];
+}
+def : WriteRes<WriteIDivLd, [SBPort23, SBPort0, SBDivider]> {
+ let Latency = 29;
+ let ResourceCycles = [1, 1, 10];
+}
+
+// Scalar and vector floating point.
+defm : SBWriteResPair<WriteFAdd, SBPort1, 3>;
+defm : SBWriteResPair<WriteFMul, SBPort0, 5>;
+defm : SBWriteResPair<WriteFDiv, SBPort0, 12>; // 10-14 cycles.
+defm : SBWriteResPair<WriteFRcp, SBPort0, 5>;
+defm : SBWriteResPair<WriteFSqrt, SBPort0, 15>;
+defm : SBWriteResPair<WriteCvtF2I, SBPort1, 3>;
+defm : SBWriteResPair<WriteCvtI2F, SBPort1, 4>;
+defm : SBWriteResPair<WriteCvtF2F, SBPort1, 3>;
+
+// Vector integer operations.
+defm : SBWriteResPair<WriteVecShift, SBPort05, 1>;
+defm : SBWriteResPair<WriteVecLogic, SBPort015, 1>;
+defm : SBWriteResPair<WriteVecALU, SBPort15, 1>;
+defm : SBWriteResPair<WriteVecIMul, SBPort0, 5>;
+defm : SBWriteResPair<WriteShuffle, SBPort15, 1>;
+
+def : WriteRes<WriteSystem, [SBPort015]> { let Latency = 100; }
+def : WriteRes<WriteMicrocoded, [SBPort015]> { let Latency = 100; }
+} // SchedModel
diff --git a/contrib/llvm/lib/Target/X86/X86Schedule.td b/contrib/llvm/lib/Target/X86/X86Schedule.td
new file mode 100644
index 0000000..9fbde88
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86Schedule.td
@@ -0,0 +1,573 @@
+//===-- X86Schedule.td - X86 Scheduling Definitions --------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// InstrSchedModel annotations for out-of-order CPUs.
+//
+// These annotations are independent of the itinerary classes defined below.
+
+// Instructions with folded loads need to read the memory operand immediately,
+// but other register operands don't have to be read until the load is ready.
+// These operands are marked with ReadAfterLd.
+def ReadAfterLd : SchedRead;
+
+// Instructions with both a load and a store folded are modeled as a folded
+// load + WriteRMW.
+def WriteRMW : SchedWrite;
+
+// Most instructions can fold loads, so almost every SchedWrite comes in two
+// variants: With and without a folded load.
+// An X86FoldableSchedWrite holds a reference to the corresponding SchedWrite
+// with a folded load.
+class X86FoldableSchedWrite : SchedWrite {
+ // The SchedWrite to use when a load is folded into the instruction.
+ SchedWrite Folded;
+}
+
+// Multiclass that produces a linked pair of SchedWrites.
+multiclass X86SchedWritePair {
+ // Register-Memory operation.
+ def Ld : SchedWrite;
+ // Register-Register operation.
+ def NAME : X86FoldableSchedWrite {
+ let Folded = !cast<SchedWrite>(NAME#"Ld");
+ }
+}
+
+// Arithmetic.
+defm WriteALU : X86SchedWritePair; // Simple integer ALU op.
+defm WriteIMul : X86SchedWritePair; // Integer multiplication.
+defm WriteIDiv : X86SchedWritePair; // Integer division.
+def WriteLEA : SchedWrite; // LEA instructions can't fold loads.
+
+// Integer shifts and rotates.
+defm WriteShift : X86SchedWritePair;
+
+// Loads, stores, and moves, not folded with other operations.
+def WriteLoad : SchedWrite;
+def WriteStore : SchedWrite;
+def WriteMove : SchedWrite;
+
+// Idioms that clear a register, like xorps %xmm0, %xmm0.
+// These can often bypass execution ports completely.
+def WriteZero : SchedWrite;
+
+// Branches don't produce values, so they have no latency, but they still
+// consume resources. Indirect branches can fold loads.
+defm WriteJump : X86SchedWritePair;
+
+// Floating point. This covers both scalar and vector operations.
+defm WriteFAdd : X86SchedWritePair; // Floating point add/sub/compare.
+defm WriteFMul : X86SchedWritePair; // Floating point multiplication.
+defm WriteFDiv : X86SchedWritePair; // Floating point division.
+defm WriteFSqrt : X86SchedWritePair; // Floating point square root.
+defm WriteFRcp : X86SchedWritePair; // Floating point reciprocal.
+defm WriteFMA : X86SchedWritePair; // Fused Multiply Add.
+
+// FMA Scheduling helper class.
+class FMASC { X86FoldableSchedWrite Sched = WriteFAdd; }
+
+// Vector integer operations.
+defm WriteVecALU : X86SchedWritePair; // Vector integer ALU op, no logicals.
+defm WriteVecShift : X86SchedWritePair; // Vector integer shifts.
+defm WriteVecIMul : X86SchedWritePair; // Vector integer multiply.
+
+// Vector bitwise operations.
+// These are often used on both floating point and integer vectors.
+defm WriteVecLogic : X86SchedWritePair; // Vector and/or/xor.
+defm WriteShuffle : X86SchedWritePair; // Vector shuffles and blends.
+
+// Conversion between integer and float.
+defm WriteCvtF2I : X86SchedWritePair; // Float -> Integer.
+defm WriteCvtI2F : X86SchedWritePair; // Integer -> Float.
+defm WriteCvtF2F : X86SchedWritePair; // Float -> Float size conversion.
+
+// Catch-all for expensive system instructions.
+def WriteSystem : SchedWrite;
+
+// Old microcoded instructions that nobody use.
+def WriteMicrocoded : SchedWrite;
+
+//===----------------------------------------------------------------------===//
+// Instruction Itinerary classes used for X86
+def IIC_ALU_MEM : InstrItinClass;
+def IIC_ALU_NONMEM : InstrItinClass;
+def IIC_LEA : InstrItinClass;
+def IIC_LEA_16 : InstrItinClass;
+def IIC_MUL8 : InstrItinClass;
+def IIC_MUL16_MEM : InstrItinClass;
+def IIC_MUL16_REG : InstrItinClass;
+def IIC_MUL32_MEM : InstrItinClass;
+def IIC_MUL32_REG : InstrItinClass;
+def IIC_MUL64 : InstrItinClass;
+// imul by al, ax, eax, tax
+def IIC_IMUL8 : InstrItinClass;
+def IIC_IMUL16_MEM : InstrItinClass;
+def IIC_IMUL16_REG : InstrItinClass;
+def IIC_IMUL32_MEM : InstrItinClass;
+def IIC_IMUL32_REG : InstrItinClass;
+def IIC_IMUL64 : InstrItinClass;
+// imul reg by reg|mem
+def IIC_IMUL16_RM : InstrItinClass;
+def IIC_IMUL16_RR : InstrItinClass;
+def IIC_IMUL32_RM : InstrItinClass;
+def IIC_IMUL32_RR : InstrItinClass;
+def IIC_IMUL64_RM : InstrItinClass;
+def IIC_IMUL64_RR : InstrItinClass;
+// imul reg = reg/mem * imm
+def IIC_IMUL16_RMI : InstrItinClass;
+def IIC_IMUL16_RRI : InstrItinClass;
+def IIC_IMUL32_RMI : InstrItinClass;
+def IIC_IMUL32_RRI : InstrItinClass;
+def IIC_IMUL64_RMI : InstrItinClass;
+def IIC_IMUL64_RRI : InstrItinClass;
+// div
+def IIC_DIV8_MEM : InstrItinClass;
+def IIC_DIV8_REG : InstrItinClass;
+def IIC_DIV16 : InstrItinClass;
+def IIC_DIV32 : InstrItinClass;
+def IIC_DIV64 : InstrItinClass;
+// idiv
+def IIC_IDIV8 : InstrItinClass;
+def IIC_IDIV16 : InstrItinClass;
+def IIC_IDIV32 : InstrItinClass;
+def IIC_IDIV64 : InstrItinClass;
+// neg/not/inc/dec
+def IIC_UNARY_REG : InstrItinClass;
+def IIC_UNARY_MEM : InstrItinClass;
+// add/sub/and/or/xor/adc/sbc/cmp/test
+def IIC_BIN_MEM : InstrItinClass;
+def IIC_BIN_NONMEM : InstrItinClass;
+// shift/rotate
+def IIC_SR : InstrItinClass;
+// shift double
+def IIC_SHD16_REG_IM : InstrItinClass;
+def IIC_SHD16_REG_CL : InstrItinClass;
+def IIC_SHD16_MEM_IM : InstrItinClass;
+def IIC_SHD16_MEM_CL : InstrItinClass;
+def IIC_SHD32_REG_IM : InstrItinClass;
+def IIC_SHD32_REG_CL : InstrItinClass;
+def IIC_SHD32_MEM_IM : InstrItinClass;
+def IIC_SHD32_MEM_CL : InstrItinClass;
+def IIC_SHD64_REG_IM : InstrItinClass;
+def IIC_SHD64_REG_CL : InstrItinClass;
+def IIC_SHD64_MEM_IM : InstrItinClass;
+def IIC_SHD64_MEM_CL : InstrItinClass;
+// cmov
+def IIC_CMOV16_RM : InstrItinClass;
+def IIC_CMOV16_RR : InstrItinClass;
+def IIC_CMOV32_RM : InstrItinClass;
+def IIC_CMOV32_RR : InstrItinClass;
+def IIC_CMOV64_RM : InstrItinClass;
+def IIC_CMOV64_RR : InstrItinClass;
+// set
+def IIC_SET_R : InstrItinClass;
+def IIC_SET_M : InstrItinClass;
+// jmp/jcc/jcxz
+def IIC_Jcc : InstrItinClass;
+def IIC_JCXZ : InstrItinClass;
+def IIC_JMP_REL : InstrItinClass;
+def IIC_JMP_REG : InstrItinClass;
+def IIC_JMP_MEM : InstrItinClass;
+def IIC_JMP_FAR_MEM : InstrItinClass;
+def IIC_JMP_FAR_PTR : InstrItinClass;
+// loop
+def IIC_LOOP : InstrItinClass;
+def IIC_LOOPE : InstrItinClass;
+def IIC_LOOPNE : InstrItinClass;
+// call
+def IIC_CALL_RI : InstrItinClass;
+def IIC_CALL_MEM : InstrItinClass;
+def IIC_CALL_FAR_MEM : InstrItinClass;
+def IIC_CALL_FAR_PTR : InstrItinClass;
+// ret
+def IIC_RET : InstrItinClass;
+def IIC_RET_IMM : InstrItinClass;
+//sign extension movs
+def IIC_MOVSX : InstrItinClass;
+def IIC_MOVSX_R16_R8 : InstrItinClass;
+def IIC_MOVSX_R16_M8 : InstrItinClass;
+def IIC_MOVSX_R16_R16 : InstrItinClass;
+def IIC_MOVSX_R32_R32 : InstrItinClass;
+//zero extension movs
+def IIC_MOVZX : InstrItinClass;
+def IIC_MOVZX_R16_R8 : InstrItinClass;
+def IIC_MOVZX_R16_M8 : InstrItinClass;
+
+def IIC_REP_MOVS : InstrItinClass;
+def IIC_REP_STOS : InstrItinClass;
+
+// SSE scalar/parallel binary operations
+def IIC_SSE_ALU_F32S_RR : InstrItinClass;
+def IIC_SSE_ALU_F32S_RM : InstrItinClass;
+def IIC_SSE_ALU_F64S_RR : InstrItinClass;
+def IIC_SSE_ALU_F64S_RM : InstrItinClass;
+def IIC_SSE_MUL_F32S_RR : InstrItinClass;
+def IIC_SSE_MUL_F32S_RM : InstrItinClass;
+def IIC_SSE_MUL_F64S_RR : InstrItinClass;
+def IIC_SSE_MUL_F64S_RM : InstrItinClass;
+def IIC_SSE_DIV_F32S_RR : InstrItinClass;
+def IIC_SSE_DIV_F32S_RM : InstrItinClass;
+def IIC_SSE_DIV_F64S_RR : InstrItinClass;
+def IIC_SSE_DIV_F64S_RM : InstrItinClass;
+def IIC_SSE_ALU_F32P_RR : InstrItinClass;
+def IIC_SSE_ALU_F32P_RM : InstrItinClass;
+def IIC_SSE_ALU_F64P_RR : InstrItinClass;
+def IIC_SSE_ALU_F64P_RM : InstrItinClass;
+def IIC_SSE_MUL_F32P_RR : InstrItinClass;
+def IIC_SSE_MUL_F32P_RM : InstrItinClass;
+def IIC_SSE_MUL_F64P_RR : InstrItinClass;
+def IIC_SSE_MUL_F64P_RM : InstrItinClass;
+def IIC_SSE_DIV_F32P_RR : InstrItinClass;
+def IIC_SSE_DIV_F32P_RM : InstrItinClass;
+def IIC_SSE_DIV_F64P_RR : InstrItinClass;
+def IIC_SSE_DIV_F64P_RM : InstrItinClass;
+
+def IIC_SSE_COMIS_RR : InstrItinClass;
+def IIC_SSE_COMIS_RM : InstrItinClass;
+
+def IIC_SSE_HADDSUB_RR : InstrItinClass;
+def IIC_SSE_HADDSUB_RM : InstrItinClass;
+
+def IIC_SSE_BIT_P_RR : InstrItinClass;
+def IIC_SSE_BIT_P_RM : InstrItinClass;
+
+def IIC_SSE_INTALU_P_RR : InstrItinClass;
+def IIC_SSE_INTALU_P_RM : InstrItinClass;
+def IIC_SSE_INTALUQ_P_RR : InstrItinClass;
+def IIC_SSE_INTALUQ_P_RM : InstrItinClass;
+
+def IIC_SSE_INTMUL_P_RR : InstrItinClass;
+def IIC_SSE_INTMUL_P_RM : InstrItinClass;
+
+def IIC_SSE_INTSH_P_RR : InstrItinClass;
+def IIC_SSE_INTSH_P_RM : InstrItinClass;
+def IIC_SSE_INTSH_P_RI : InstrItinClass;
+
+def IIC_SSE_CMPP_RR : InstrItinClass;
+def IIC_SSE_CMPP_RM : InstrItinClass;
+
+def IIC_SSE_SHUFP : InstrItinClass;
+def IIC_SSE_PSHUF : InstrItinClass;
+
+def IIC_SSE_UNPCK : InstrItinClass;
+
+def IIC_SSE_MOVMSK : InstrItinClass;
+def IIC_SSE_MASKMOV : InstrItinClass;
+
+def IIC_SSE_PEXTRW : InstrItinClass;
+def IIC_SSE_PINSRW : InstrItinClass;
+
+def IIC_SSE_PABS_RR : InstrItinClass;
+def IIC_SSE_PABS_RM : InstrItinClass;
+
+def IIC_SSE_SQRTP_RR : InstrItinClass;
+def IIC_SSE_SQRTP_RM : InstrItinClass;
+def IIC_SSE_SQRTS_RR : InstrItinClass;
+def IIC_SSE_SQRTS_RM : InstrItinClass;
+
+def IIC_SSE_RCPP_RR : InstrItinClass;
+def IIC_SSE_RCPP_RM : InstrItinClass;
+def IIC_SSE_RCPS_RR : InstrItinClass;
+def IIC_SSE_RCPS_RM : InstrItinClass;
+
+def IIC_SSE_MOV_S_RR : InstrItinClass;
+def IIC_SSE_MOV_S_RM : InstrItinClass;
+def IIC_SSE_MOV_S_MR : InstrItinClass;
+
+def IIC_SSE_MOVA_P_RR : InstrItinClass;
+def IIC_SSE_MOVA_P_RM : InstrItinClass;
+def IIC_SSE_MOVA_P_MR : InstrItinClass;
+
+def IIC_SSE_MOVU_P_RR : InstrItinClass;
+def IIC_SSE_MOVU_P_RM : InstrItinClass;
+def IIC_SSE_MOVU_P_MR : InstrItinClass;
+
+def IIC_SSE_MOVDQ : InstrItinClass;
+def IIC_SSE_MOVD_ToGP : InstrItinClass;
+def IIC_SSE_MOVQ_RR : InstrItinClass;
+
+def IIC_SSE_MOV_LH : InstrItinClass;
+
+def IIC_SSE_LDDQU : InstrItinClass;
+
+def IIC_SSE_MOVNT : InstrItinClass;
+
+def IIC_SSE_PHADDSUBD_RR : InstrItinClass;
+def IIC_SSE_PHADDSUBD_RM : InstrItinClass;
+def IIC_SSE_PHADDSUBSW_RR : InstrItinClass;
+def IIC_SSE_PHADDSUBSW_RM : InstrItinClass;
+def IIC_SSE_PHADDSUBW_RR : InstrItinClass;
+def IIC_SSE_PHADDSUBW_RM : InstrItinClass;
+def IIC_SSE_PSHUFB_RR : InstrItinClass;
+def IIC_SSE_PSHUFB_RM : InstrItinClass;
+def IIC_SSE_PSIGN_RR : InstrItinClass;
+def IIC_SSE_PSIGN_RM : InstrItinClass;
+
+def IIC_SSE_PMADD : InstrItinClass;
+def IIC_SSE_PMULHRSW : InstrItinClass;
+def IIC_SSE_PALIGNR : InstrItinClass;
+def IIC_SSE_MWAIT : InstrItinClass;
+def IIC_SSE_MONITOR : InstrItinClass;
+
+def IIC_SSE_PREFETCH : InstrItinClass;
+def IIC_SSE_PAUSE : InstrItinClass;
+def IIC_SSE_LFENCE : InstrItinClass;
+def IIC_SSE_MFENCE : InstrItinClass;
+def IIC_SSE_SFENCE : InstrItinClass;
+def IIC_SSE_LDMXCSR : InstrItinClass;
+def IIC_SSE_STMXCSR : InstrItinClass;
+
+def IIC_SSE_CVT_PD_RR : InstrItinClass;
+def IIC_SSE_CVT_PD_RM : InstrItinClass;
+def IIC_SSE_CVT_PS_RR : InstrItinClass;
+def IIC_SSE_CVT_PS_RM : InstrItinClass;
+def IIC_SSE_CVT_PI2PS_RR : InstrItinClass;
+def IIC_SSE_CVT_PI2PS_RM : InstrItinClass;
+def IIC_SSE_CVT_Scalar_RR : InstrItinClass;
+def IIC_SSE_CVT_Scalar_RM : InstrItinClass;
+def IIC_SSE_CVT_SS2SI32_RM : InstrItinClass;
+def IIC_SSE_CVT_SS2SI32_RR : InstrItinClass;
+def IIC_SSE_CVT_SS2SI64_RM : InstrItinClass;
+def IIC_SSE_CVT_SS2SI64_RR : InstrItinClass;
+def IIC_SSE_CVT_SD2SI_RM : InstrItinClass;
+def IIC_SSE_CVT_SD2SI_RR : InstrItinClass;
+
+// MMX
+def IIC_MMX_MOV_MM_RM : InstrItinClass;
+def IIC_MMX_MOV_REG_MM : InstrItinClass;
+def IIC_MMX_MOVQ_RM : InstrItinClass;
+def IIC_MMX_MOVQ_RR : InstrItinClass;
+
+def IIC_MMX_ALU_RM : InstrItinClass;
+def IIC_MMX_ALU_RR : InstrItinClass;
+def IIC_MMX_ALUQ_RM : InstrItinClass;
+def IIC_MMX_ALUQ_RR : InstrItinClass;
+def IIC_MMX_PHADDSUBW_RM : InstrItinClass;
+def IIC_MMX_PHADDSUBW_RR : InstrItinClass;
+def IIC_MMX_PHADDSUBD_RM : InstrItinClass;
+def IIC_MMX_PHADDSUBD_RR : InstrItinClass;
+def IIC_MMX_PMUL : InstrItinClass;
+def IIC_MMX_MISC_FUNC_MEM : InstrItinClass;
+def IIC_MMX_MISC_FUNC_REG : InstrItinClass;
+def IIC_MMX_PSADBW : InstrItinClass;
+def IIC_MMX_SHIFT_RI : InstrItinClass;
+def IIC_MMX_SHIFT_RM : InstrItinClass;
+def IIC_MMX_SHIFT_RR : InstrItinClass;
+def IIC_MMX_UNPCK_H_RM : InstrItinClass;
+def IIC_MMX_UNPCK_H_RR : InstrItinClass;
+def IIC_MMX_UNPCK_L : InstrItinClass;
+def IIC_MMX_PCK_RM : InstrItinClass;
+def IIC_MMX_PCK_RR : InstrItinClass;
+def IIC_MMX_PSHUF : InstrItinClass;
+def IIC_MMX_PEXTR : InstrItinClass;
+def IIC_MMX_PINSRW : InstrItinClass;
+def IIC_MMX_MASKMOV : InstrItinClass;
+
+def IIC_MMX_CVT_PD_RR : InstrItinClass;
+def IIC_MMX_CVT_PD_RM : InstrItinClass;
+def IIC_MMX_CVT_PS_RR : InstrItinClass;
+def IIC_MMX_CVT_PS_RM : InstrItinClass;
+
+def IIC_CMPX_LOCK : InstrItinClass;
+def IIC_CMPX_LOCK_8 : InstrItinClass;
+def IIC_CMPX_LOCK_8B : InstrItinClass;
+def IIC_CMPX_LOCK_16B : InstrItinClass;
+
+def IIC_XADD_LOCK_MEM : InstrItinClass;
+def IIC_XADD_LOCK_MEM8 : InstrItinClass;
+
+def IIC_FILD : InstrItinClass;
+def IIC_FLD : InstrItinClass;
+def IIC_FLD80 : InstrItinClass;
+def IIC_FST : InstrItinClass;
+def IIC_FST80 : InstrItinClass;
+def IIC_FIST : InstrItinClass;
+def IIC_FLDZ : InstrItinClass;
+def IIC_FUCOM : InstrItinClass;
+def IIC_FUCOMI : InstrItinClass;
+def IIC_FCOMI : InstrItinClass;
+def IIC_FNSTSW : InstrItinClass;
+def IIC_FNSTCW : InstrItinClass;
+def IIC_FLDCW : InstrItinClass;
+def IIC_FNINIT : InstrItinClass;
+def IIC_FFREE : InstrItinClass;
+def IIC_FNCLEX : InstrItinClass;
+def IIC_WAIT : InstrItinClass;
+def IIC_FXAM : InstrItinClass;
+def IIC_FNOP : InstrItinClass;
+def IIC_FLDL : InstrItinClass;
+def IIC_F2XM1 : InstrItinClass;
+def IIC_FYL2X : InstrItinClass;
+def IIC_FPTAN : InstrItinClass;
+def IIC_FPATAN : InstrItinClass;
+def IIC_FXTRACT : InstrItinClass;
+def IIC_FPREM1 : InstrItinClass;
+def IIC_FPSTP : InstrItinClass;
+def IIC_FPREM : InstrItinClass;
+def IIC_FYL2XP1 : InstrItinClass;
+def IIC_FSINCOS : InstrItinClass;
+def IIC_FRNDINT : InstrItinClass;
+def IIC_FSCALE : InstrItinClass;
+def IIC_FCOMPP : InstrItinClass;
+def IIC_FXSAVE : InstrItinClass;
+def IIC_FXRSTOR : InstrItinClass;
+
+def IIC_FXCH : InstrItinClass;
+
+// System instructions
+def IIC_CPUID : InstrItinClass;
+def IIC_INT : InstrItinClass;
+def IIC_INT3 : InstrItinClass;
+def IIC_INVD : InstrItinClass;
+def IIC_INVLPG : InstrItinClass;
+def IIC_IRET : InstrItinClass;
+def IIC_HLT : InstrItinClass;
+def IIC_LXS : InstrItinClass;
+def IIC_LTR : InstrItinClass;
+def IIC_RDTSC : InstrItinClass;
+def IIC_RSM : InstrItinClass;
+def IIC_SIDT : InstrItinClass;
+def IIC_SGDT : InstrItinClass;
+def IIC_SLDT : InstrItinClass;
+def IIC_STR : InstrItinClass;
+def IIC_SWAPGS : InstrItinClass;
+def IIC_SYSCALL : InstrItinClass;
+def IIC_SYS_ENTER_EXIT : InstrItinClass;
+def IIC_IN_RR : InstrItinClass;
+def IIC_IN_RI : InstrItinClass;
+def IIC_OUT_RR : InstrItinClass;
+def IIC_OUT_IR : InstrItinClass;
+def IIC_INS : InstrItinClass;
+def IIC_MOV_REG_DR : InstrItinClass;
+def IIC_MOV_DR_REG : InstrItinClass;
+def IIC_MOV_REG_CR : InstrItinClass;
+def IIC_MOV_CR_REG : InstrItinClass;
+def IIC_MOV_REG_SR : InstrItinClass;
+def IIC_MOV_MEM_SR : InstrItinClass;
+def IIC_MOV_SR_REG : InstrItinClass;
+def IIC_MOV_SR_MEM : InstrItinClass;
+def IIC_LAR_RM : InstrItinClass;
+def IIC_LAR_RR : InstrItinClass;
+def IIC_LSL_RM : InstrItinClass;
+def IIC_LSL_RR : InstrItinClass;
+def IIC_LGDT : InstrItinClass;
+def IIC_LIDT : InstrItinClass;
+def IIC_LLDT_REG : InstrItinClass;
+def IIC_LLDT_MEM : InstrItinClass;
+def IIC_PUSH_CS : InstrItinClass;
+def IIC_PUSH_SR : InstrItinClass;
+def IIC_POP_SR : InstrItinClass;
+def IIC_POP_SR_SS : InstrItinClass;
+def IIC_VERR : InstrItinClass;
+def IIC_VERW_REG : InstrItinClass;
+def IIC_VERW_MEM : InstrItinClass;
+def IIC_WRMSR : InstrItinClass;
+def IIC_RDMSR : InstrItinClass;
+def IIC_RDPMC : InstrItinClass;
+def IIC_SMSW : InstrItinClass;
+def IIC_LMSW_REG : InstrItinClass;
+def IIC_LMSW_MEM : InstrItinClass;
+def IIC_ENTER : InstrItinClass;
+def IIC_LEAVE : InstrItinClass;
+def IIC_POP_MEM : InstrItinClass;
+def IIC_POP_REG16 : InstrItinClass;
+def IIC_POP_REG : InstrItinClass;
+def IIC_POP_F : InstrItinClass;
+def IIC_POP_FD : InstrItinClass;
+def IIC_POP_A : InstrItinClass;
+def IIC_PUSH_IMM : InstrItinClass;
+def IIC_PUSH_MEM : InstrItinClass;
+def IIC_PUSH_REG : InstrItinClass;
+def IIC_PUSH_F : InstrItinClass;
+def IIC_PUSH_A : InstrItinClass;
+def IIC_BSWAP : InstrItinClass;
+def IIC_BSF : InstrItinClass;
+def IIC_BSR : InstrItinClass;
+def IIC_MOVS : InstrItinClass;
+def IIC_STOS : InstrItinClass;
+def IIC_SCAS : InstrItinClass;
+def IIC_CMPS : InstrItinClass;
+def IIC_MOV : InstrItinClass;
+def IIC_MOV_MEM : InstrItinClass;
+def IIC_AHF : InstrItinClass;
+def IIC_BT_MI : InstrItinClass;
+def IIC_BT_MR : InstrItinClass;
+def IIC_BT_RI : InstrItinClass;
+def IIC_BT_RR : InstrItinClass;
+def IIC_BTX_MI : InstrItinClass;
+def IIC_BTX_MR : InstrItinClass;
+def IIC_BTX_RI : InstrItinClass;
+def IIC_BTX_RR : InstrItinClass;
+def IIC_XCHG_REG : InstrItinClass;
+def IIC_XCHG_MEM : InstrItinClass;
+def IIC_XADD_REG : InstrItinClass;
+def IIC_XADD_MEM : InstrItinClass;
+def IIC_CMPXCHG_MEM : InstrItinClass;
+def IIC_CMPXCHG_REG : InstrItinClass;
+def IIC_CMPXCHG_MEM8 : InstrItinClass;
+def IIC_CMPXCHG_REG8 : InstrItinClass;
+def IIC_CMPXCHG_8B : InstrItinClass;
+def IIC_CMPXCHG_16B : InstrItinClass;
+def IIC_LODS : InstrItinClass;
+def IIC_OUTS : InstrItinClass;
+def IIC_CLC : InstrItinClass;
+def IIC_CLD : InstrItinClass;
+def IIC_CLI : InstrItinClass;
+def IIC_CMC : InstrItinClass;
+def IIC_CLTS : InstrItinClass;
+def IIC_STC : InstrItinClass;
+def IIC_STI : InstrItinClass;
+def IIC_STD : InstrItinClass;
+def IIC_XLAT : InstrItinClass;
+def IIC_AAA : InstrItinClass;
+def IIC_AAD : InstrItinClass;
+def IIC_AAM : InstrItinClass;
+def IIC_AAS : InstrItinClass;
+def IIC_DAA : InstrItinClass;
+def IIC_DAS : InstrItinClass;
+def IIC_BOUND : InstrItinClass;
+def IIC_ARPL_REG : InstrItinClass;
+def IIC_ARPL_MEM : InstrItinClass;
+def IIC_MOVBE : InstrItinClass;
+
+def IIC_NOP : InstrItinClass;
+
+//===----------------------------------------------------------------------===//
+// Processor instruction itineraries.
+
+// IssueWidth is analagous to the number of decode units. Core and its
+// descendents, including Nehalem and SandyBridge have 4 decoders.
+// Resources beyond the decoder operate on micro-ops and are bufferred
+// so adjacent micro-ops don't directly compete.
+//
+// MinLatency=0 indicates that RAW dependencies can be decoded in the
+// same cycle.
+//
+// HighLatency=10 is optimistic. X86InstrInfo::isHighLatencyDef
+// indicates high latency opcodes. Alternatively, InstrItinData
+// entries may be included here to define specific operand
+// latencies. Since these latencies are not used for pipeline hazards,
+// they do not need to be exact.
+//
+// ILPWindow=10 is an arbitrary threshold that approximates cycles of
+// latency hidden by instruction buffers. The actual value is not very
+// important but should be zero for inorder and nonzero for OOO processors.
+//
+// The GenericModel contains no instruciton itineraries.
+def GenericModel : SchedMachineModel {
+ let IssueWidth = 4;
+ let MinLatency = 0;
+ let LoadLatency = 4;
+ let HighLatency = 10;
+ let ILPWindow = 10;
+}
+
+include "X86ScheduleAtom.td"
+include "X86SchedSandyBridge.td"
+include "X86SchedHaswell.td"
diff --git a/contrib/llvm/lib/Target/X86/X86ScheduleAtom.td b/contrib/llvm/lib/Target/X86/X86ScheduleAtom.td
new file mode 100644
index 0000000..cce8f1b
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86ScheduleAtom.td
@@ -0,0 +1,530 @@
+//===- X86ScheduleAtom.td - X86 Atom Scheduling Definitions -*- tablegen -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the Intel Atom (Bonnell)
+// processors.
+//
+//===----------------------------------------------------------------------===//
+
+//
+// Scheduling information derived from the "Intel 64 and IA32 Architectures
+// Optimization Reference Manual", Chapter 13, Section 4.
+// Functional Units
+// Port 0
+def Port0 : FuncUnit; // ALU: ALU0, shift/rotate, load/store
+ // SIMD/FP: SIMD ALU, Shuffle,SIMD/FP multiply, divide
+def Port1 : FuncUnit; // ALU: ALU1, bit processing, jump, and LEA
+ // SIMD/FP: SIMD ALU, FP Adder
+
+def AtomItineraries : ProcessorItineraries<
+ [ Port0, Port1 ],
+ [], [
+ // P0 only
+ // InstrItinData<class, [InstrStage<N, [P0]>] >,
+ // P0 or P1
+ // InstrItinData<class, [InstrStage<N, [P0, P1]>] >,
+ // P0 and P1
+ // InstrItinData<class, [InstrStage<N, [P0], 0>, InstrStage<N, [P1]>] >,
+ //
+ // Default is 1 cycle, port0 or port1
+ InstrItinData<IIC_ALU_MEM, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_ALU_NONMEM, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_LEA, [InstrStage<1, [Port1]>] >,
+ InstrItinData<IIC_LEA_16, [InstrStage<2, [Port0, Port1]>] >,
+ // mul
+ InstrItinData<IIC_MUL8, [InstrStage<7, [Port0, Port1]>] >,
+ InstrItinData<IIC_MUL16_MEM, [InstrStage<8, [Port0, Port1]>] >,
+ InstrItinData<IIC_MUL16_REG, [InstrStage<7, [Port0, Port1]>] >,
+ InstrItinData<IIC_MUL32_MEM, [InstrStage<7, [Port0, Port1]>] >,
+ InstrItinData<IIC_MUL32_REG, [InstrStage<6, [Port0, Port1]>] >,
+ InstrItinData<IIC_MUL64, [InstrStage<12, [Port0, Port1]>] >,
+ // imul by al, ax, eax, rax
+ InstrItinData<IIC_IMUL8, [InstrStage<7, [Port0, Port1]>] >,
+ InstrItinData<IIC_IMUL16_MEM, [InstrStage<8, [Port0, Port1]>] >,
+ InstrItinData<IIC_IMUL16_REG, [InstrStage<7, [Port0, Port1]>] >,
+ InstrItinData<IIC_IMUL32_MEM, [InstrStage<7, [Port0, Port1]>] >,
+ InstrItinData<IIC_IMUL32_REG, [InstrStage<6, [Port0, Port1]>] >,
+ InstrItinData<IIC_IMUL64, [InstrStage<12, [Port0, Port1]>] >,
+ // imul reg by reg|mem
+ InstrItinData<IIC_IMUL16_RM, [InstrStage<7, [Port0, Port1]>] >,
+ InstrItinData<IIC_IMUL16_RR, [InstrStage<6, [Port0, Port1]>] >,
+ InstrItinData<IIC_IMUL32_RM, [InstrStage<5, [Port0]>] >,
+ InstrItinData<IIC_IMUL32_RR, [InstrStage<5, [Port0]>] >,
+ InstrItinData<IIC_IMUL64_RM, [InstrStage<12, [Port0, Port1]>] >,
+ InstrItinData<IIC_IMUL64_RR, [InstrStage<12, [Port0, Port1]>] >,
+ // imul reg = reg/mem * imm
+ InstrItinData<IIC_IMUL16_RRI, [InstrStage<6, [Port0, Port1]>] >,
+ InstrItinData<IIC_IMUL32_RRI, [InstrStage<5, [Port0]>] >,
+ InstrItinData<IIC_IMUL64_RRI, [InstrStage<14, [Port0, Port1]>] >,
+ InstrItinData<IIC_IMUL16_RMI, [InstrStage<7, [Port0, Port1]>] >,
+ InstrItinData<IIC_IMUL32_RMI, [InstrStage<5, [Port0]>] >,
+ InstrItinData<IIC_IMUL64_RMI, [InstrStage<14, [Port0, Port1]>] >,
+ // idiv
+ InstrItinData<IIC_IDIV8, [InstrStage<62, [Port0, Port1]>] >,
+ InstrItinData<IIC_IDIV16, [InstrStage<62, [Port0, Port1]>] >,
+ InstrItinData<IIC_IDIV32, [InstrStage<62, [Port0, Port1]>] >,
+ InstrItinData<IIC_IDIV64, [InstrStage<130, [Port0, Port1]>] >,
+ // div
+ InstrItinData<IIC_DIV8_REG, [InstrStage<50, [Port0, Port1]>] >,
+ InstrItinData<IIC_DIV8_MEM, [InstrStage<68, [Port0, Port1]>] >,
+ InstrItinData<IIC_DIV16, [InstrStage<50, [Port0, Port1]>] >,
+ InstrItinData<IIC_DIV32, [InstrStage<50, [Port0, Port1]>] >,
+ InstrItinData<IIC_DIV64, [InstrStage<130, [Port0, Port1]>] >,
+ // neg/not/inc/dec
+ InstrItinData<IIC_UNARY_REG, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_UNARY_MEM, [InstrStage<1, [Port0]>] >,
+ // add/sub/and/or/xor/adc/sbc/cmp/test
+ InstrItinData<IIC_BIN_NONMEM, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_BIN_MEM, [InstrStage<1, [Port0]>] >,
+ // shift/rotate
+ InstrItinData<IIC_SR, [InstrStage<1, [Port0]>] >,
+ // shift double
+ InstrItinData<IIC_SHD16_REG_IM, [InstrStage<6, [Port0, Port1]>] >,
+ InstrItinData<IIC_SHD16_REG_CL, [InstrStage<6, [Port0, Port1]>] >,
+ InstrItinData<IIC_SHD16_MEM_IM, [InstrStage<6, [Port0, Port1]>] >,
+ InstrItinData<IIC_SHD16_MEM_CL, [InstrStage<6, [Port0, Port1]>] >,
+ InstrItinData<IIC_SHD32_REG_IM, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_SHD32_REG_CL, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_SHD32_MEM_IM, [InstrStage<4, [Port0, Port1]>] >,
+ InstrItinData<IIC_SHD32_MEM_CL, [InstrStage<4, [Port0, Port1]>] >,
+ InstrItinData<IIC_SHD64_REG_IM, [InstrStage<9, [Port0, Port1]>] >,
+ InstrItinData<IIC_SHD64_REG_CL, [InstrStage<8, [Port0, Port1]>] >,
+ InstrItinData<IIC_SHD64_MEM_IM, [InstrStage<9, [Port0, Port1]>] >,
+ InstrItinData<IIC_SHD64_MEM_CL, [InstrStage<9, [Port0, Port1]>] >,
+ // cmov
+ InstrItinData<IIC_CMOV16_RM, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_CMOV16_RR, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_CMOV32_RM, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_CMOV32_RR, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_CMOV64_RM, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_CMOV64_RR, [InstrStage<1, [Port0, Port1]>] >,
+ // set
+ InstrItinData<IIC_SET_M, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_SET_R, [InstrStage<1, [Port0, Port1]>] >,
+ // jcc
+ InstrItinData<IIC_Jcc, [InstrStage<1, [Port1]>] >,
+ // jcxz/jecxz/jrcxz
+ InstrItinData<IIC_JCXZ, [InstrStage<4, [Port0, Port1]>] >,
+ // jmp rel
+ InstrItinData<IIC_JMP_REL, [InstrStage<1, [Port1]>] >,
+ // jmp indirect
+ InstrItinData<IIC_JMP_REG, [InstrStage<1, [Port1]>] >,
+ InstrItinData<IIC_JMP_MEM, [InstrStage<2, [Port0, Port1]>] >,
+ // jmp far
+ InstrItinData<IIC_JMP_FAR_MEM, [InstrStage<32, [Port0, Port1]>] >,
+ InstrItinData<IIC_JMP_FAR_PTR, [InstrStage<31, [Port0, Port1]>] >,
+ // loop/loope/loopne
+ InstrItinData<IIC_LOOP, [InstrStage<18, [Port0, Port1]>] >,
+ InstrItinData<IIC_LOOPE, [InstrStage<8, [Port0, Port1]>] >,
+ InstrItinData<IIC_LOOPNE, [InstrStage<17, [Port0, Port1]>] >,
+ // call - all but reg/imm
+ InstrItinData<IIC_CALL_RI, [InstrStage<1, [Port0], 0>,
+ InstrStage<1, [Port1]>] >,
+ InstrItinData<IIC_CALL_MEM, [InstrStage<15, [Port0, Port1]>] >,
+ InstrItinData<IIC_CALL_FAR_MEM, [InstrStage<40, [Port0, Port1]>] >,
+ InstrItinData<IIC_CALL_FAR_PTR, [InstrStage<39, [Port0, Port1]>] >,
+ //ret
+ InstrItinData<IIC_RET, [InstrStage<79, [Port0, Port1]>] >,
+ InstrItinData<IIC_RET_IMM, [InstrStage<1, [Port0], 0>, InstrStage<1, [Port1]>] >,
+ //sign extension movs
+ InstrItinData<IIC_MOVSX,[InstrStage<1, [Port0] >] >,
+ InstrItinData<IIC_MOVSX_R16_R8, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_MOVSX_R16_M8, [InstrStage<3, [Port0, Port1]>] >,
+ InstrItinData<IIC_MOVSX_R16_R16, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_MOVSX_R32_R32, [InstrStage<1, [Port0, Port1]>] >,
+ //zero extension movs
+ InstrItinData<IIC_MOVZX,[InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_MOVZX_R16_R8, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_MOVZX_R16_M8, [InstrStage<3, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_REP_MOVS, [InstrStage<75, [Port0, Port1]>] >,
+ InstrItinData<IIC_REP_STOS, [InstrStage<74, [Port0, Port1]>] >,
+
+ // SSE binary operations
+ // arithmetic fp scalar
+ InstrItinData<IIC_SSE_ALU_F32S_RR, [InstrStage<5, [Port1]>] >,
+ InstrItinData<IIC_SSE_ALU_F32S_RM, [InstrStage<5, [Port0], 0>,
+ InstrStage<5, [Port1]>] >,
+ InstrItinData<IIC_SSE_ALU_F64S_RR, [InstrStage<5, [Port1]>] >,
+ InstrItinData<IIC_SSE_ALU_F64S_RM, [InstrStage<5, [Port0], 0>,
+ InstrStage<5, [Port1]>] >,
+ InstrItinData<IIC_SSE_MUL_F32S_RR, [InstrStage<4, [Port0]>] >,
+ InstrItinData<IIC_SSE_MUL_F32S_RM, [InstrStage<4, [Port0]>] >,
+ InstrItinData<IIC_SSE_MUL_F64S_RR, [InstrStage<5, [Port0]>] >,
+ InstrItinData<IIC_SSE_MUL_F64S_RM, [InstrStage<5, [Port0]>] >,
+ InstrItinData<IIC_SSE_DIV_F32S_RR, [InstrStage<34, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_DIV_F32S_RM, [InstrStage<34, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_DIV_F64S_RR, [InstrStage<62, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_DIV_F64S_RM, [InstrStage<62, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_SSE_COMIS_RR, [InstrStage<9, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_COMIS_RM, [InstrStage<10, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_SSE_HADDSUB_RR, [InstrStage<8, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_HADDSUB_RM, [InstrStage<9, [Port0, Port1]>] >,
+
+ // arithmetic fp parallel
+ InstrItinData<IIC_SSE_ALU_F32P_RR, [InstrStage<5, [Port1]>] >,
+ InstrItinData<IIC_SSE_ALU_F32P_RM, [InstrStage<5, [Port0], 0>,
+ InstrStage<5, [Port1]>] >,
+ InstrItinData<IIC_SSE_ALU_F64P_RR, [InstrStage<6, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_ALU_F64P_RM, [InstrStage<7, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_MUL_F32P_RR, [InstrStage<5, [Port0]>] >,
+ InstrItinData<IIC_SSE_MUL_F32P_RM, [InstrStage<5, [Port0]>] >,
+ InstrItinData<IIC_SSE_MUL_F64P_RR, [InstrStage<9, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_MUL_F64P_RM, [InstrStage<10, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_DIV_F32P_RR, [InstrStage<70, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_DIV_F32P_RM, [InstrStage<70, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_DIV_F64P_RR, [InstrStage<125, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_DIV_F64P_RM, [InstrStage<125, [Port0, Port1]>] >,
+
+ // bitwise parallel
+ InstrItinData<IIC_SSE_BIT_P_RR, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_BIT_P_RM, [InstrStage<1, [Port0]>] >,
+
+ // arithmetic int parallel
+ InstrItinData<IIC_SSE_INTALU_P_RR, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_INTALU_P_RM, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_SSE_INTALUQ_P_RR, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_INTALUQ_P_RM, [InstrStage<3, [Port0, Port1]>] >,
+
+ // multiply int parallel
+ InstrItinData<IIC_SSE_INTMUL_P_RR, [InstrStage<5, [Port0]>] >,
+ InstrItinData<IIC_SSE_INTMUL_P_RM, [InstrStage<5, [Port0]>] >,
+
+ // shift parallel
+ InstrItinData<IIC_SSE_INTSH_P_RR, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_INTSH_P_RM, [InstrStage<3, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_INTSH_P_RI, [InstrStage<1, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_SSE_CMPP_RR, [InstrStage<6, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_CMPP_RM, [InstrStage<7, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_SSE_SHUFP, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_SSE_PSHUF, [InstrStage<1, [Port0]>] >,
+
+ InstrItinData<IIC_SSE_UNPCK, [InstrStage<1, [Port0]>] >,
+
+ InstrItinData<IIC_SSE_SQRTP_RR, [InstrStage<13, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_SQRTP_RM, [InstrStage<14, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_SQRTS_RR, [InstrStage<11, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_SQRTS_RM, [InstrStage<12, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_SSE_RCPP_RR, [InstrStage<9, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_RCPP_RM, [InstrStage<10, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_RCPS_RR, [InstrStage<4, [Port0]>] >,
+ InstrItinData<IIC_SSE_RCPS_RM, [InstrStage<4, [Port0]>] >,
+
+ InstrItinData<IIC_SSE_MOVMSK, [InstrStage<3, [Port0]>] >,
+ InstrItinData<IIC_SSE_MASKMOV, [InstrStage<2, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_SSE_PEXTRW, [InstrStage<4, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_PINSRW, [InstrStage<1, [Port0]>] >,
+
+ InstrItinData<IIC_SSE_PABS_RR, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_PABS_RM, [InstrStage<1, [Port0]>] >,
+
+ InstrItinData<IIC_SSE_MOV_S_RR, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_MOV_S_RM, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_SSE_MOV_S_MR, [InstrStage<1, [Port0]>] >,
+
+ InstrItinData<IIC_SSE_MOVA_P_RR, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_MOVA_P_RM, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_SSE_MOVA_P_MR, [InstrStage<1, [Port0]>] >,
+
+ InstrItinData<IIC_SSE_MOVU_P_RR, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_MOVU_P_RM, [InstrStage<3, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_MOVU_P_MR, [InstrStage<2, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_SSE_MOV_LH, [InstrStage<1, [Port0]>] >,
+
+ InstrItinData<IIC_SSE_LDDQU, [InstrStage<3, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_SSE_MOVDQ, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_SSE_MOVD_ToGP, [InstrStage<3, [Port0]>] >,
+ InstrItinData<IIC_SSE_MOVQ_RR, [InstrStage<1, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_SSE_MOVNT, [InstrStage<1, [Port0]>] >,
+
+ InstrItinData<IIC_SSE_PREFETCH, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_SSE_PAUSE, [InstrStage<17, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_LFENCE, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_MFENCE, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_SSE_SFENCE, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_SSE_LDMXCSR, [InstrStage<5, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_STMXCSR, [InstrStage<15, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_SSE_PHADDSUBD_RR, [InstrStage<3, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_PHADDSUBD_RM, [InstrStage<4, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_PHADDSUBSW_RR, [InstrStage<7, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_PHADDSUBSW_RM, [InstrStage<8, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_PHADDSUBW_RR, [InstrStage<7, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_PHADDSUBW_RM, [InstrStage<8, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_PSHUFB_RR, [InstrStage<4, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_PSHUFB_RM, [InstrStage<5, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_PSIGN_RR, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_PSIGN_RM, [InstrStage<1, [Port0]>] >,
+
+ InstrItinData<IIC_SSE_PMADD, [InstrStage<5, [Port0]>] >,
+ InstrItinData<IIC_SSE_PMULHRSW, [InstrStage<5, [Port0]>] >,
+ InstrItinData<IIC_SSE_PALIGNR, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_SSE_MWAIT, [InstrStage<46, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_MONITOR, [InstrStage<45, [Port0, Port1]>] >,
+
+ // conversions
+ // to/from PD ...
+ InstrItinData<IIC_SSE_CVT_PD_RR, [InstrStage<7, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_CVT_PD_RM, [InstrStage<8, [Port0, Port1]>] >,
+ // to/from PS except to/from PD and PS2PI
+ InstrItinData<IIC_SSE_CVT_PS_RR, [InstrStage<6, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_CVT_PS_RM, [InstrStage<7, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_CVT_Scalar_RR, [InstrStage<6, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_CVT_Scalar_RM, [InstrStage<7, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_CVT_SS2SI32_RR, [InstrStage<8, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_CVT_SS2SI32_RM, [InstrStage<9, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_CVT_SS2SI64_RR, [InstrStage<9, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_CVT_SS2SI64_RM, [InstrStage<10, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_CVT_SD2SI_RR, [InstrStage<8, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_CVT_SD2SI_RM, [InstrStage<9, [Port0, Port1]>] >,
+
+ // MMX MOVs
+ InstrItinData<IIC_MMX_MOV_MM_RM, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_MMX_MOV_REG_MM, [InstrStage<3, [Port0]>] >,
+ InstrItinData<IIC_MMX_MOVQ_RM, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_MMX_MOVQ_RR, [InstrStage<1, [Port0, Port1]>] >,
+ // other MMX
+ InstrItinData<IIC_MMX_ALU_RM, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_MMX_ALU_RR, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_MMX_ALUQ_RM, [InstrStage<3, [Port0, Port1]>] >,
+ InstrItinData<IIC_MMX_ALUQ_RR, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_MMX_PHADDSUBW_RM, [InstrStage<6, [Port0, Port1]>] >,
+ InstrItinData<IIC_MMX_PHADDSUBW_RR, [InstrStage<5, [Port0, Port1]>] >,
+ InstrItinData<IIC_MMX_PHADDSUBD_RM, [InstrStage<4, [Port0, Port1]>] >,
+ InstrItinData<IIC_MMX_PHADDSUBD_RR, [InstrStage<3, [Port0, Port1]>] >,
+ InstrItinData<IIC_MMX_PMUL, [InstrStage<4, [Port0]>] >,
+ InstrItinData<IIC_MMX_MISC_FUNC_MEM, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_MMX_MISC_FUNC_REG, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_MMX_PSADBW, [InstrStage<4, [Port0, Port1]>] >,
+ InstrItinData<IIC_MMX_SHIFT_RI, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_MMX_SHIFT_RM, [InstrStage<3, [Port0, Port1]>] >,
+ InstrItinData<IIC_MMX_SHIFT_RR, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_MMX_UNPCK_H_RM, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_MMX_UNPCK_H_RR, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_MMX_UNPCK_L, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_MMX_PCK_RM, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_MMX_PCK_RR, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_MMX_PSHUF, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_MMX_PEXTR, [InstrStage<4, [Port0, Port1]>] >,
+ InstrItinData<IIC_MMX_PINSRW, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_MMX_MASKMOV, [InstrStage<1, [Port0]>] >,
+ // conversions
+ // from/to PD
+ InstrItinData<IIC_MMX_CVT_PD_RR, [InstrStage<7, [Port0, Port1]>] >,
+ InstrItinData<IIC_MMX_CVT_PD_RM, [InstrStage<8, [Port0, Port1]>] >,
+ // from/to PI
+ InstrItinData<IIC_MMX_CVT_PS_RR, [InstrStage<5, [Port1]>] >,
+ InstrItinData<IIC_MMX_CVT_PS_RM, [InstrStage<5, [Port0], 0>,
+ InstrStage<5, [Port1]>]>,
+
+ InstrItinData<IIC_CMPX_LOCK, [InstrStage<14, [Port0, Port1]>] >,
+ InstrItinData<IIC_CMPX_LOCK_8, [InstrStage<6, [Port0, Port1]>] >,
+ InstrItinData<IIC_CMPX_LOCK_8B, [InstrStage<18, [Port0, Port1]>] >,
+ InstrItinData<IIC_CMPX_LOCK_16B, [InstrStage<22, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_XADD_LOCK_MEM, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_XADD_LOCK_MEM, [InstrStage<3, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_FILD, [InstrStage<5, [Port0], 0>, InstrStage<5, [Port1]>] >,
+ InstrItinData<IIC_FLD, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_FLD80, [InstrStage<4, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_FST, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_FST80, [InstrStage<5, [Port0, Port1]>] >,
+ InstrItinData<IIC_FIST, [InstrStage<6, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_FLDZ, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_FUCOM, [InstrStage<1, [Port1]>] >,
+ InstrItinData<IIC_FUCOMI, [InstrStage<9, [Port0, Port1]>] >,
+ InstrItinData<IIC_FCOMI, [InstrStage<9, [Port0, Port1]>] >,
+ InstrItinData<IIC_FNSTSW, [InstrStage<10, [Port0, Port1]>] >,
+ InstrItinData<IIC_FNSTCW, [InstrStage<8, [Port0, Port1]>] >,
+ InstrItinData<IIC_FLDCW, [InstrStage<5, [Port0, Port1]>] >,
+ InstrItinData<IIC_FNINIT, [InstrStage<63, [Port0, Port1]>] >,
+ InstrItinData<IIC_FFREE, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_FNCLEX, [InstrStage<25, [Port0, Port1]>] >,
+ InstrItinData<IIC_WAIT, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_FXAM, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_FNOP, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_FLDL, [InstrStage<10, [Port0, Port1]>] >,
+ InstrItinData<IIC_F2XM1, [InstrStage<99, [Port0, Port1]>] >,
+ InstrItinData<IIC_FYL2X, [InstrStage<146, [Port0, Port1]>] >,
+ InstrItinData<IIC_FPTAN, [InstrStage<168, [Port0, Port1]>] >,
+ InstrItinData<IIC_FPATAN, [InstrStage<183, [Port0, Port1]>] >,
+ InstrItinData<IIC_FXTRACT, [InstrStage<25, [Port0, Port1]>] >,
+ InstrItinData<IIC_FPREM1, [InstrStage<71, [Port0, Port1]>] >,
+ InstrItinData<IIC_FPSTP, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_FPREM, [InstrStage<55, [Port0, Port1]>] >,
+ InstrItinData<IIC_FYL2XP1, [InstrStage<147, [Port0, Port1]>] >,
+ InstrItinData<IIC_FSINCOS, [InstrStage<174, [Port0, Port1]>] >,
+ InstrItinData<IIC_FRNDINT, [InstrStage<46, [Port0, Port1]>] >,
+ InstrItinData<IIC_FSCALE, [InstrStage<77, [Port0, Port1]>] >,
+ InstrItinData<IIC_FCOMPP, [InstrStage<1, [Port1]>] >,
+ InstrItinData<IIC_FXSAVE, [InstrStage<140, [Port0, Port1]>] >,
+ InstrItinData<IIC_FXRSTOR, [InstrStage<141, [Port0, Port1]>] >,
+ InstrItinData<IIC_FXCH, [InstrStage<1, [Port0], 0>, InstrStage<1, [Port1]>] >,
+
+ // System instructions
+ InstrItinData<IIC_CPUID, [InstrStage<121, [Port0, Port1]>] >,
+ InstrItinData<IIC_INT, [InstrStage<127, [Port0, Port1]>] >,
+ InstrItinData<IIC_INT3, [InstrStage<130, [Port0, Port1]>] >,
+ InstrItinData<IIC_INVD, [InstrStage<1003, [Port0, Port1]>] >,
+ InstrItinData<IIC_INVLPG, [InstrStage<71, [Port0, Port1]>] >,
+ InstrItinData<IIC_IRET, [InstrStage<109, [Port0, Port1]>] >,
+ InstrItinData<IIC_HLT, [InstrStage<121, [Port0, Port1]>] >,
+ InstrItinData<IIC_LXS, [InstrStage<10, [Port0, Port1]>] >,
+ InstrItinData<IIC_LTR, [InstrStage<83, [Port0, Port1]>] >,
+ InstrItinData<IIC_RDTSC, [InstrStage<30, [Port0, Port1]>] >,
+ InstrItinData<IIC_RSM, [InstrStage<741, [Port0, Port1]>] >,
+ InstrItinData<IIC_SIDT, [InstrStage<4, [Port0, Port1]>] >,
+ InstrItinData<IIC_SGDT, [InstrStage<4, [Port0, Port1]>] >,
+ InstrItinData<IIC_SLDT, [InstrStage<3, [Port0, Port1]>] >,
+ InstrItinData<IIC_STR, [InstrStage<3, [Port0, Port1]>] >,
+ InstrItinData<IIC_SWAPGS, [InstrStage<22, [Port0, Port1]>] >,
+ InstrItinData<IIC_SYSCALL, [InstrStage<96, [Port0, Port1]>] >,
+ InstrItinData<IIC_SYS_ENTER_EXIT, [InstrStage<88, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_IN_RR, [InstrStage<94, [Port0, Port1]>] >,
+ InstrItinData<IIC_IN_RI, [InstrStage<92, [Port0, Port1]>] >,
+ InstrItinData<IIC_OUT_RR, [InstrStage<68, [Port0, Port1]>] >,
+ InstrItinData<IIC_OUT_IR, [InstrStage<72, [Port0, Port1]>] >,
+ InstrItinData<IIC_INS, [InstrStage<59, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_MOV_REG_DR, [InstrStage<88, [Port0, Port1]>] >,
+ InstrItinData<IIC_MOV_DR_REG, [InstrStage<123, [Port0, Port1]>] >,
+ // worst case for mov REG_CRx
+ InstrItinData<IIC_MOV_REG_CR, [InstrStage<12, [Port0, Port1]>] >,
+ InstrItinData<IIC_MOV_CR_REG, [InstrStage<136, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_MOV_REG_SR, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_MOV_MEM_SR, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_MOV_SR_REG, [InstrStage<21, [Port0, Port1]>] >,
+ InstrItinData<IIC_MOV_SR_MEM, [InstrStage<26, [Port0, Port1]>] >,
+ // LAR
+ InstrItinData<IIC_LAR_RM, [InstrStage<50, [Port0, Port1]>] >,
+ InstrItinData<IIC_LAR_RR, [InstrStage<54, [Port0, Port1]>] >,
+ // LSL
+ InstrItinData<IIC_LSL_RM, [InstrStage<46, [Port0, Port1]>] >,
+ InstrItinData<IIC_LSL_RR, [InstrStage<49, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_LGDT, [InstrStage<44, [Port0, Port1]>] >,
+ InstrItinData<IIC_LIDT, [InstrStage<44, [Port0, Port1]>] >,
+ InstrItinData<IIC_LLDT_REG, [InstrStage<60, [Port0, Port1]>] >,
+ InstrItinData<IIC_LLDT_MEM, [InstrStage<64, [Port0, Port1]>] >,
+ // push control register, segment registers
+ InstrItinData<IIC_PUSH_CS, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_PUSH_SR, [InstrStage<2, [Port0, Port1]>] >,
+ // pop control register, segment registers
+ InstrItinData<IIC_POP_SR, [InstrStage<29, [Port0, Port1]>] >,
+ InstrItinData<IIC_POP_SR_SS, [InstrStage<48, [Port0, Port1]>] >,
+ // VERR, VERW
+ InstrItinData<IIC_VERR, [InstrStage<41, [Port0, Port1]>] >,
+ InstrItinData<IIC_VERW_REG, [InstrStage<51, [Port0, Port1]>] >,
+ InstrItinData<IIC_VERW_MEM, [InstrStage<50, [Port0, Port1]>] >,
+ // WRMSR, RDMSR
+ InstrItinData<IIC_WRMSR, [InstrStage<202, [Port0, Port1]>] >,
+ InstrItinData<IIC_RDMSR, [InstrStage<78, [Port0, Port1]>] >,
+ InstrItinData<IIC_RDPMC, [InstrStage<46, [Port0, Port1]>] >,
+ // SMSW, LMSW
+ InstrItinData<IIC_SMSW, [InstrStage<9, [Port0, Port1]>] >,
+ InstrItinData<IIC_LMSW_REG, [InstrStage<69, [Port0, Port1]>] >,
+ InstrItinData<IIC_LMSW_MEM, [InstrStage<67, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_ENTER, [InstrStage<32, [Port0, Port1]>] >,
+ InstrItinData<IIC_LEAVE, [InstrStage<2, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_POP_MEM, [InstrStage<3, [Port0, Port1]>] >,
+ InstrItinData<IIC_POP_REG16, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_POP_REG, [InstrStage<1, [Port0], 0>,
+ InstrStage<1, [Port1]>] >,
+ InstrItinData<IIC_POP_F, [InstrStage<32, [Port0, Port1]>] >,
+ InstrItinData<IIC_POP_FD, [InstrStage<26, [Port0, Port1]>] >,
+ InstrItinData<IIC_POP_A, [InstrStage<9, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_PUSH_IMM, [InstrStage<1, [Port0], 0>,
+ InstrStage<1, [Port1]>] >,
+ InstrItinData<IIC_PUSH_MEM, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_PUSH_REG, [InstrStage<1, [Port0], 0>,
+ InstrStage<1, [Port1]>] >,
+ InstrItinData<IIC_PUSH_F, [InstrStage<9, [Port0, Port1]>] >,
+ InstrItinData<IIC_PUSH_A, [InstrStage<8, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_BSWAP, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_BSF, [InstrStage<16, [Port0, Port1]>] >,
+ InstrItinData<IIC_BSR, [InstrStage<16, [Port0, Port1]>] >,
+ InstrItinData<IIC_MOVS, [InstrStage<3, [Port0, Port1]>] >,
+ InstrItinData<IIC_STOS, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_SCAS, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_CMPS, [InstrStage<3, [Port0, Port1]>] >,
+ InstrItinData<IIC_MOV, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_MOV_MEM, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_AHF, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_BT_MI, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_BT_MR, [InstrStage<9, [Port0, Port1]>] >,
+ InstrItinData<IIC_BT_RI, [InstrStage<1, [Port1]>] >,
+ InstrItinData<IIC_BT_RR, [InstrStage<1, [Port1]>] >,
+ InstrItinData<IIC_BTX_MI, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_BTX_MR, [InstrStage<11, [Port0, Port1]>] >,
+ InstrItinData<IIC_BTX_RI, [InstrStage<1, [Port1]>] >,
+ InstrItinData<IIC_BTX_RR, [InstrStage<1, [Port1]>] >,
+ InstrItinData<IIC_XCHG_REG, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_XCHG_MEM, [InstrStage<3, [Port0, Port1]>] >,
+ InstrItinData<IIC_XADD_REG, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_XADD_MEM, [InstrStage<3, [Port0, Port1]>] >,
+ InstrItinData<IIC_CMPXCHG_MEM, [InstrStage<14, [Port0, Port1]>] >,
+ InstrItinData<IIC_CMPXCHG_REG, [InstrStage<15, [Port0, Port1]>] >,
+ InstrItinData<IIC_CMPXCHG_MEM8, [InstrStage<6, [Port0, Port1]>] >,
+ InstrItinData<IIC_CMPXCHG_REG8, [InstrStage<9, [Port0, Port1]>] >,
+ InstrItinData<IIC_CMPXCHG_8B, [InstrStage<18, [Port0, Port1]>] >,
+ InstrItinData<IIC_CMPXCHG_16B, [InstrStage<22, [Port0, Port1]>] >,
+ InstrItinData<IIC_LODS, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_OUTS, [InstrStage<74, [Port0, Port1]>] >,
+ InstrItinData<IIC_CLC, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_CLD, [InstrStage<3, [Port0, Port1]>] >,
+ InstrItinData<IIC_CLI, [InstrStage<14, [Port0, Port1]>] >,
+ InstrItinData<IIC_CMC, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_CLTS, [InstrStage<33, [Port0, Port1]>] >,
+ InstrItinData<IIC_STC, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_STI, [InstrStage<17, [Port0, Port1]>] >,
+ InstrItinData<IIC_STD, [InstrStage<21, [Port0, Port1]>] >,
+ InstrItinData<IIC_XLAT, [InstrStage<6, [Port0, Port1]>] >,
+ InstrItinData<IIC_AAA, [InstrStage<13, [Port0, Port1]>] >,
+ InstrItinData<IIC_AAD, [InstrStage<7, [Port0, Port1]>] >,
+ InstrItinData<IIC_AAM, [InstrStage<21, [Port0, Port1]>] >,
+ InstrItinData<IIC_AAS, [InstrStage<13, [Port0, Port1]>] >,
+ InstrItinData<IIC_DAA, [InstrStage<18, [Port0, Port1]>] >,
+ InstrItinData<IIC_DAS, [InstrStage<20, [Port0, Port1]>] >,
+ InstrItinData<IIC_BOUND, [InstrStage<11, [Port0, Port1]>] >,
+ InstrItinData<IIC_ARPL_REG, [InstrStage<24, [Port0, Port1]>] >,
+ InstrItinData<IIC_ARPL_MEM, [InstrStage<23, [Port0, Port1]>] >,
+ InstrItinData<IIC_MOVBE, [InstrStage<1, [Port0]>] >,
+
+ InstrItinData<IIC_NOP, [InstrStage<1, [Port0, Port1]>] >
+ ]>;
+
+// Atom machine model.
+def AtomModel : SchedMachineModel {
+ let IssueWidth = 2; // Allows 2 instructions per scheduling group.
+ let MinLatency = 1; // InstrStage cycles overrides MinLatency.
+ // OperandCycles may be used for expected latency.
+ let LoadLatency = 3; // Expected cycles, may be overriden by OperandCycles.
+ let HighLatency = 30;// Expected, may be overriden by OperandCycles.
+ let ILPWindow = 0; // Always try to hide expected latency.
+
+ let Itineraries = AtomItineraries;
+}
diff --git a/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp b/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
new file mode 100644
index 0000000..f934fdd
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
@@ -0,0 +1,270 @@
+//===-- X86SelectionDAGInfo.cpp - X86 SelectionDAG Info -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the X86SelectionDAGInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "x86-selectiondag-info"
+#include "X86TargetMachine.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/IR/DerivedTypes.h"
+using namespace llvm;
+
+X86SelectionDAGInfo::X86SelectionDAGInfo(const X86TargetMachine &TM) :
+ TargetSelectionDAGInfo(TM),
+ Subtarget(&TM.getSubtarget<X86Subtarget>()),
+ TLI(*TM.getTargetLowering()) {
+}
+
+X86SelectionDAGInfo::~X86SelectionDAGInfo() {
+}
+
+SDValue
+X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl,
+ SDValue Chain,
+ SDValue Dst, SDValue Src,
+ SDValue Size, unsigned Align,
+ bool isVolatile,
+ MachinePointerInfo DstPtrInfo) const {
+ ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+
+ // If to a segment-relative address space, use the default lowering.
+ if (DstPtrInfo.getAddrSpace() >= 256)
+ return SDValue();
+
+ // If not DWORD aligned or size is more than the threshold, call the library.
+ // The libc version is likely to be faster for these cases. It can use the
+ // address value and run time information about the CPU.
+ if ((Align & 3) != 0 ||
+ !ConstantSize ||
+ ConstantSize->getZExtValue() >
+ Subtarget->getMaxInlineSizeThreshold()) {
+ SDValue InFlag(0, 0);
+
+ // Check to see if there is a specialized entry-point for memory zeroing.
+ ConstantSDNode *V = dyn_cast<ConstantSDNode>(Src);
+
+ if (const char *bzeroEntry = V &&
+ V->isNullValue() ? Subtarget->getBZeroEntry() : 0) {
+ EVT IntPtr = TLI.getPointerTy();
+ Type *IntPtrTy = getDataLayout()->getIntPtrType(*DAG.getContext());
+ TargetLowering::ArgListTy Args;
+ TargetLowering::ArgListEntry Entry;
+ Entry.Node = Dst;
+ Entry.Ty = IntPtrTy;
+ Args.push_back(Entry);
+ Entry.Node = Size;
+ Args.push_back(Entry);
+ TargetLowering::
+ CallLoweringInfo CLI(Chain, Type::getVoidTy(*DAG.getContext()),
+ false, false, false, false,
+ 0, CallingConv::C, /*isTailCall=*/false,
+ /*doesNotRet=*/false, /*isReturnValueUsed=*/false,
+ DAG.getExternalSymbol(bzeroEntry, IntPtr), Args,
+ DAG, dl);
+ std::pair<SDValue,SDValue> CallResult =
+ TLI.LowerCallTo(CLI);
+ return CallResult.second;
+ }
+
+ // Otherwise have the target-independent code call memset.
+ return SDValue();
+ }
+
+ uint64_t SizeVal = ConstantSize->getZExtValue();
+ SDValue InFlag(0, 0);
+ EVT AVT;
+ SDValue Count;
+ ConstantSDNode *ValC = dyn_cast<ConstantSDNode>(Src);
+ unsigned BytesLeft = 0;
+ bool TwoRepStos = false;
+ if (ValC) {
+ unsigned ValReg;
+ uint64_t Val = ValC->getZExtValue() & 255;
+
+ // If the value is a constant, then we can potentially use larger sets.
+ switch (Align & 3) {
+ case 2: // WORD aligned
+ AVT = MVT::i16;
+ ValReg = X86::AX;
+ Val = (Val << 8) | Val;
+ break;
+ case 0: // DWORD aligned
+ AVT = MVT::i32;
+ ValReg = X86::EAX;
+ Val = (Val << 8) | Val;
+ Val = (Val << 16) | Val;
+ if (Subtarget->is64Bit() && ((Align & 0x7) == 0)) { // QWORD aligned
+ AVT = MVT::i64;
+ ValReg = X86::RAX;
+ Val = (Val << 32) | Val;
+ }
+ break;
+ default: // Byte aligned
+ AVT = MVT::i8;
+ ValReg = X86::AL;
+ Count = DAG.getIntPtrConstant(SizeVal);
+ break;
+ }
+
+ if (AVT.bitsGT(MVT::i8)) {
+ unsigned UBytes = AVT.getSizeInBits() / 8;
+ Count = DAG.getIntPtrConstant(SizeVal / UBytes);
+ BytesLeft = SizeVal % UBytes;
+ }
+
+ Chain = DAG.getCopyToReg(Chain, dl, ValReg, DAG.getConstant(Val, AVT),
+ InFlag);
+ InFlag = Chain.getValue(1);
+ } else {
+ AVT = MVT::i8;
+ Count = DAG.getIntPtrConstant(SizeVal);
+ Chain = DAG.getCopyToReg(Chain, dl, X86::AL, Src, InFlag);
+ InFlag = Chain.getValue(1);
+ }
+
+ Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RCX :
+ X86::ECX,
+ Count, InFlag);
+ InFlag = Chain.getValue(1);
+ Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RDI :
+ X86::EDI,
+ Dst, InFlag);
+ InFlag = Chain.getValue(1);
+
+ SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue Ops[] = { Chain, DAG.getValueType(AVT), InFlag };
+ Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops, array_lengthof(Ops));
+
+ if (TwoRepStos) {
+ InFlag = Chain.getValue(1);
+ Count = Size;
+ EVT CVT = Count.getValueType();
+ SDValue Left = DAG.getNode(ISD::AND, dl, CVT, Count,
+ DAG.getConstant((AVT == MVT::i64) ? 7 : 3, CVT));
+ Chain = DAG.getCopyToReg(Chain, dl, (CVT == MVT::i64) ? X86::RCX :
+ X86::ECX,
+ Left, InFlag);
+ InFlag = Chain.getValue(1);
+ Tys = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue Ops[] = { Chain, DAG.getValueType(MVT::i8), InFlag };
+ Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops, array_lengthof(Ops));
+ } else if (BytesLeft) {
+ // Handle the last 1 - 7 bytes.
+ unsigned Offset = SizeVal - BytesLeft;
+ EVT AddrVT = Dst.getValueType();
+ EVT SizeVT = Size.getValueType();
+
+ Chain = DAG.getMemset(Chain, dl,
+ DAG.getNode(ISD::ADD, dl, AddrVT, Dst,
+ DAG.getConstant(Offset, AddrVT)),
+ Src,
+ DAG.getConstant(BytesLeft, SizeVT),
+ Align, isVolatile, DstPtrInfo.getWithOffset(Offset));
+ }
+
+ // TODO: Use a Tokenfactor, as in memcpy, instead of a single chain.
+ return Chain;
+}
+
+SDValue
+X86SelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl,
+ SDValue Chain, SDValue Dst, SDValue Src,
+ SDValue Size, unsigned Align,
+ bool isVolatile, bool AlwaysInline,
+ MachinePointerInfo DstPtrInfo,
+ MachinePointerInfo SrcPtrInfo) const {
+ // This requires the copy size to be a constant, preferably
+ // within a subtarget-specific limit.
+ ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+ if (!ConstantSize)
+ return SDValue();
+ uint64_t SizeVal = ConstantSize->getZExtValue();
+ if (!AlwaysInline && SizeVal > Subtarget->getMaxInlineSizeThreshold())
+ return SDValue();
+
+ /// If not DWORD aligned, it is more efficient to call the library. However
+ /// if calling the library is not allowed (AlwaysInline), then soldier on as
+ /// the code generated here is better than the long load-store sequence we
+ /// would otherwise get.
+ if (!AlwaysInline && (Align & 3) != 0)
+ return SDValue();
+
+ // If to a segment-relative address space, use the default lowering.
+ if (DstPtrInfo.getAddrSpace() >= 256 ||
+ SrcPtrInfo.getAddrSpace() >= 256)
+ return SDValue();
+
+ // ESI might be used as a base pointer, in that case we can't simply overwrite
+ // the register. Fall back to generic code.
+ const X86RegisterInfo *TRI =
+ static_cast<const X86RegisterInfo *>(DAG.getTarget().getRegisterInfo());
+ if (TRI->hasBasePointer(DAG.getMachineFunction()) &&
+ TRI->getBaseRegister() == X86::ESI)
+ return SDValue();
+
+ MVT AVT;
+ if (Align & 1)
+ AVT = MVT::i8;
+ else if (Align & 2)
+ AVT = MVT::i16;
+ else if (Align & 4)
+ // DWORD aligned
+ AVT = MVT::i32;
+ else
+ // QWORD aligned
+ AVT = Subtarget->is64Bit() ? MVT::i64 : MVT::i32;
+
+ unsigned UBytes = AVT.getSizeInBits() / 8;
+ unsigned CountVal = SizeVal / UBytes;
+ SDValue Count = DAG.getIntPtrConstant(CountVal);
+ unsigned BytesLeft = SizeVal % UBytes;
+
+ SDValue InFlag(0, 0);
+ Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RCX :
+ X86::ECX,
+ Count, InFlag);
+ InFlag = Chain.getValue(1);
+ Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RDI :
+ X86::EDI,
+ Dst, InFlag);
+ InFlag = Chain.getValue(1);
+ Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RSI :
+ X86::ESI,
+ Src, InFlag);
+ InFlag = Chain.getValue(1);
+
+ SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue Ops[] = { Chain, DAG.getValueType(AVT), InFlag };
+ SDValue RepMovs = DAG.getNode(X86ISD::REP_MOVS, dl, Tys, Ops,
+ array_lengthof(Ops));
+
+ SmallVector<SDValue, 4> Results;
+ Results.push_back(RepMovs);
+ if (BytesLeft) {
+ // Handle the last 1 - 7 bytes.
+ unsigned Offset = SizeVal - BytesLeft;
+ EVT DstVT = Dst.getValueType();
+ EVT SrcVT = Src.getValueType();
+ EVT SizeVT = Size.getValueType();
+ Results.push_back(DAG.getMemcpy(Chain, dl,
+ DAG.getNode(ISD::ADD, dl, DstVT, Dst,
+ DAG.getConstant(Offset, DstVT)),
+ DAG.getNode(ISD::ADD, dl, SrcVT, Src,
+ DAG.getConstant(Offset, SrcVT)),
+ DAG.getConstant(BytesLeft, SizeVT),
+ Align, isVolatile, AlwaysInline,
+ DstPtrInfo.getWithOffset(Offset),
+ SrcPtrInfo.getWithOffset(Offset)));
+ }
+
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ &Results[0], Results.size());
+}
diff --git a/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.h b/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.h
new file mode 100644
index 0000000..d1d66fe
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.h
@@ -0,0 +1,56 @@
+//===-- X86SelectionDAGInfo.h - X86 SelectionDAG Info -----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the X86 subclass for TargetSelectionDAGInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86SELECTIONDAGINFO_H
+#define X86SELECTIONDAGINFO_H
+
+#include "llvm/Target/TargetSelectionDAGInfo.h"
+
+namespace llvm {
+
+class X86TargetLowering;
+class X86TargetMachine;
+class X86Subtarget;
+
+class X86SelectionDAGInfo : public TargetSelectionDAGInfo {
+ /// Subtarget - Keep a pointer to the X86Subtarget around so that we can
+ /// make the right decision when generating code for different targets.
+ const X86Subtarget *Subtarget;
+
+ const X86TargetLowering &TLI;
+
+public:
+ explicit X86SelectionDAGInfo(const X86TargetMachine &TM);
+ ~X86SelectionDAGInfo();
+
+ virtual
+ SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl,
+ SDValue Chain,
+ SDValue Dst, SDValue Src,
+ SDValue Size, unsigned Align,
+ bool isVolatile,
+ MachinePointerInfo DstPtrInfo) const;
+
+ virtual
+ SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl,
+ SDValue Chain,
+ SDValue Dst, SDValue Src,
+ SDValue Size, unsigned Align,
+ bool isVolatile, bool AlwaysInline,
+ MachinePointerInfo DstPtrInfo,
+ MachinePointerInfo SrcPtrInfo) const;
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86Subtarget.cpp b/contrib/llvm/lib/Target/X86/X86Subtarget.cpp
new file mode 100644
index 0000000..74da2a9
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86Subtarget.cpp
@@ -0,0 +1,518 @@
+//===-- X86Subtarget.cpp - X86 Subtarget Information ----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the X86 specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "subtarget"
+#include "X86Subtarget.h"
+#include "X86InstrInfo.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Host.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+
+#define GET_SUBTARGETINFO_TARGET_DESC
+#define GET_SUBTARGETINFO_CTOR
+#include "X86GenSubtargetInfo.inc"
+
+using namespace llvm;
+
+#if defined(_MSC_VER)
+#include <intrin.h>
+#endif
+
+/// ClassifyBlockAddressReference - Classify a blockaddress reference for the
+/// current subtarget according to how we should reference it in a non-pcrel
+/// context.
+unsigned char X86Subtarget::ClassifyBlockAddressReference() const {
+ if (isPICStyleGOT()) // 32-bit ELF targets.
+ return X86II::MO_GOTOFF;
+
+ if (isPICStyleStubPIC()) // Darwin/32 in PIC mode.
+ return X86II::MO_PIC_BASE_OFFSET;
+
+ // Direct static reference to label.
+ return X86II::MO_NO_FLAG;
+}
+
+/// ClassifyGlobalReference - Classify a global variable reference for the
+/// current subtarget according to how we should reference it in a non-pcrel
+/// context.
+unsigned char X86Subtarget::
+ClassifyGlobalReference(const GlobalValue *GV, const TargetMachine &TM) const {
+ // DLLImport only exists on windows, it is implemented as a load from a
+ // DLLIMPORT stub.
+ if (GV->hasDLLImportLinkage())
+ return X86II::MO_DLLIMPORT;
+
+ // Determine whether this is a reference to a definition or a declaration.
+ // Materializable GVs (in JIT lazy compilation mode) do not require an extra
+ // load from stub.
+ bool isDecl = GV->hasAvailableExternallyLinkage();
+ if (GV->isDeclaration() && !GV->isMaterializable())
+ isDecl = true;
+
+ // X86-64 in PIC mode.
+ if (isPICStyleRIPRel()) {
+ // Large model never uses stubs.
+ if (TM.getCodeModel() == CodeModel::Large)
+ return X86II::MO_NO_FLAG;
+
+ if (isTargetDarwin()) {
+ // If symbol visibility is hidden, the extra load is not needed if
+ // target is x86-64 or the symbol is definitely defined in the current
+ // translation unit.
+ if (GV->hasDefaultVisibility() &&
+ (isDecl || GV->isWeakForLinker()))
+ return X86II::MO_GOTPCREL;
+ } else if (!isTargetWin64()) {
+ assert(isTargetELF() && "Unknown rip-relative target");
+
+ // Extra load is needed for all externally visible.
+ if (!GV->hasLocalLinkage() && GV->hasDefaultVisibility())
+ return X86II::MO_GOTPCREL;
+ }
+
+ return X86II::MO_NO_FLAG;
+ }
+
+ if (isPICStyleGOT()) { // 32-bit ELF targets.
+ // Extra load is needed for all externally visible.
+ if (GV->hasLocalLinkage() || GV->hasHiddenVisibility())
+ return X86II::MO_GOTOFF;
+ return X86II::MO_GOT;
+ }
+
+ if (isPICStyleStubPIC()) { // Darwin/32 in PIC mode.
+ // Determine whether we have a stub reference and/or whether the reference
+ // is relative to the PIC base or not.
+
+ // If this is a strong reference to a definition, it is definitely not
+ // through a stub.
+ if (!isDecl && !GV->isWeakForLinker())
+ return X86II::MO_PIC_BASE_OFFSET;
+
+ // Unless we have a symbol with hidden visibility, we have to go through a
+ // normal $non_lazy_ptr stub because this symbol might be resolved late.
+ if (!GV->hasHiddenVisibility()) // Non-hidden $non_lazy_ptr reference.
+ return X86II::MO_DARWIN_NONLAZY_PIC_BASE;
+
+ // If symbol visibility is hidden, we have a stub for common symbol
+ // references and external declarations.
+ if (isDecl || GV->hasCommonLinkage()) {
+ // Hidden $non_lazy_ptr reference.
+ return X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE;
+ }
+
+ // Otherwise, no stub.
+ return X86II::MO_PIC_BASE_OFFSET;
+ }
+
+ if (isPICStyleStubNoDynamic()) { // Darwin/32 in -mdynamic-no-pic mode.
+ // Determine whether we have a stub reference.
+
+ // If this is a strong reference to a definition, it is definitely not
+ // through a stub.
+ if (!isDecl && !GV->isWeakForLinker())
+ return X86II::MO_NO_FLAG;
+
+ // Unless we have a symbol with hidden visibility, we have to go through a
+ // normal $non_lazy_ptr stub because this symbol might be resolved late.
+ if (!GV->hasHiddenVisibility()) // Non-hidden $non_lazy_ptr reference.
+ return X86II::MO_DARWIN_NONLAZY;
+
+ // Otherwise, no stub.
+ return X86II::MO_NO_FLAG;
+ }
+
+ // Direct static reference to global.
+ return X86II::MO_NO_FLAG;
+}
+
+
+/// getBZeroEntry - This function returns the name of a function which has an
+/// interface like the non-standard bzero function, if such a function exists on
+/// the current subtarget and it is considered prefereable over memset with zero
+/// passed as the second argument. Otherwise it returns null.
+const char *X86Subtarget::getBZeroEntry() const {
+ // Darwin 10 has a __bzero entry point for this purpose.
+ if (getTargetTriple().isMacOSX() &&
+ !getTargetTriple().isMacOSXVersionLT(10, 6))
+ return "__bzero";
+
+ return 0;
+}
+
+bool X86Subtarget::hasSinCos() const {
+ return getTargetTriple().isMacOSX() &&
+ !getTargetTriple().isMacOSXVersionLT(10, 9) &&
+ is64Bit();
+}
+
+/// IsLegalToCallImmediateAddr - Return true if the subtarget allows calls
+/// to immediate address.
+bool X86Subtarget::IsLegalToCallImmediateAddr(const TargetMachine &TM) const {
+ if (In64BitMode)
+ return false;
+ return isTargetELF() || TM.getRelocationModel() == Reloc::Static;
+}
+
+static bool OSHasAVXSupport() {
+#if defined(i386) || defined(__i386__) || defined(__x86__) || defined(_M_IX86)\
+ || defined(__x86_64__) || defined(_M_AMD64) || defined (_M_X64)
+#if defined(__GNUC__)
+ // Check xgetbv; this uses a .byte sequence instead of the instruction
+ // directly because older assemblers do not include support for xgetbv and
+ // there is no easy way to conditionally compile based on the assembler used.
+ int rEAX, rEDX;
+ __asm__ (".byte 0x0f, 0x01, 0xd0" : "=a" (rEAX), "=d" (rEDX) : "c" (0));
+#elif defined(_MSC_FULL_VER) && defined(_XCR_XFEATURE_ENABLED_MASK)
+ unsigned long long rEAX = _xgetbv(_XCR_XFEATURE_ENABLED_MASK);
+#else
+ int rEAX = 0; // Ensures we return false
+#endif
+ return (rEAX & 6) == 6;
+#else
+ return false;
+#endif
+}
+
+void X86Subtarget::AutoDetectSubtargetFeatures() {
+ unsigned EAX = 0, EBX = 0, ECX = 0, EDX = 0;
+ unsigned MaxLevel;
+ union {
+ unsigned u[3];
+ char c[12];
+ } text;
+
+ if (X86_MC::GetCpuIDAndInfo(0, &MaxLevel, text.u+0, text.u+2, text.u+1) ||
+ MaxLevel < 1)
+ return;
+
+ X86_MC::GetCpuIDAndInfo(0x1, &EAX, &EBX, &ECX, &EDX);
+
+ if ((EDX >> 15) & 1) { HasCMov = true; ToggleFeature(X86::FeatureCMOV); }
+ if ((EDX >> 23) & 1) { X86SSELevel = MMX; ToggleFeature(X86::FeatureMMX); }
+ if ((EDX >> 25) & 1) { X86SSELevel = SSE1; ToggleFeature(X86::FeatureSSE1); }
+ if ((EDX >> 26) & 1) { X86SSELevel = SSE2; ToggleFeature(X86::FeatureSSE2); }
+ if (ECX & 0x1) { X86SSELevel = SSE3; ToggleFeature(X86::FeatureSSE3); }
+ if ((ECX >> 9) & 1) { X86SSELevel = SSSE3; ToggleFeature(X86::FeatureSSSE3);}
+ if ((ECX >> 19) & 1) { X86SSELevel = SSE41; ToggleFeature(X86::FeatureSSE41);}
+ if ((ECX >> 20) & 1) { X86SSELevel = SSE42; ToggleFeature(X86::FeatureSSE42);}
+ if (((ECX >> 27) & 1) && ((ECX >> 28) & 1) && OSHasAVXSupport()) {
+ X86SSELevel = AVX; ToggleFeature(X86::FeatureAVX);
+ }
+
+ bool IsIntel = memcmp(text.c, "GenuineIntel", 12) == 0;
+ bool IsAMD = !IsIntel && memcmp(text.c, "AuthenticAMD", 12) == 0;
+
+ if ((ECX >> 1) & 0x1) {
+ HasPCLMUL = true;
+ ToggleFeature(X86::FeaturePCLMUL);
+ }
+ if ((ECX >> 12) & 0x1) {
+ HasFMA = true;
+ ToggleFeature(X86::FeatureFMA);
+ }
+ if (IsIntel && ((ECX >> 22) & 0x1)) {
+ HasMOVBE = true;
+ ToggleFeature(X86::FeatureMOVBE);
+ }
+ if ((ECX >> 23) & 0x1) {
+ HasPOPCNT = true;
+ ToggleFeature(X86::FeaturePOPCNT);
+ }
+ if ((ECX >> 25) & 0x1) {
+ HasAES = true;
+ ToggleFeature(X86::FeatureAES);
+ }
+ if ((ECX >> 29) & 0x1) {
+ HasF16C = true;
+ ToggleFeature(X86::FeatureF16C);
+ }
+ if (IsIntel && ((ECX >> 30) & 0x1)) {
+ HasRDRAND = true;
+ ToggleFeature(X86::FeatureRDRAND);
+ }
+
+ if ((ECX >> 13) & 0x1) {
+ HasCmpxchg16b = true;
+ ToggleFeature(X86::FeatureCMPXCHG16B);
+ }
+
+ if (IsIntel || IsAMD) {
+ // Determine if bit test memory instructions are slow.
+ unsigned Family = 0;
+ unsigned Model = 0;
+ X86_MC::DetectFamilyModel(EAX, Family, Model);
+ if (IsAMD || (Family == 6 && Model >= 13)) {
+ IsBTMemSlow = true;
+ ToggleFeature(X86::FeatureSlowBTMem);
+ }
+
+ // If it's an Intel chip since Nehalem and not an Atom chip, unaligned
+ // memory access is fast. We hard code model numbers here because they
+ // aren't strictly increasing for Intel chips it seems.
+ if (IsIntel &&
+ ((Family == 6 && Model == 0x1E) || // Nehalem: Clarksfield, Lynnfield,
+ // Jasper Froest
+ (Family == 6 && Model == 0x1A) || // Nehalem: Bloomfield, Nehalem-EP
+ (Family == 6 && Model == 0x2E) || // Nehalem: Nehalem-EX
+ (Family == 6 && Model == 0x25) || // Westmere: Arrandale, Clarksdale
+ (Family == 6 && Model == 0x2C) || // Westmere: Gulftown, Westmere-EP
+ (Family == 6 && Model == 0x2F) || // Westmere: Westmere-EX
+ (Family == 6 && Model == 0x2A) || // SandyBridge
+ (Family == 6 && Model == 0x2D) || // SandyBridge: SandyBridge-E*
+ (Family == 6 && Model == 0x3A))) {// IvyBridge
+ IsUAMemFast = true;
+ ToggleFeature(X86::FeatureFastUAMem);
+ }
+
+ // Set processor type. Currently only Atom is detected.
+ if (Family == 6 &&
+ (Model == 28 || Model == 38 || Model == 39
+ || Model == 53 || Model == 54)) {
+ X86ProcFamily = IntelAtom;
+
+ UseLeaForSP = true;
+ ToggleFeature(X86::FeatureLeaForSP);
+ }
+
+ unsigned MaxExtLevel;
+ X86_MC::GetCpuIDAndInfo(0x80000000, &MaxExtLevel, &EBX, &ECX, &EDX);
+
+ if (MaxExtLevel >= 0x80000001) {
+ X86_MC::GetCpuIDAndInfo(0x80000001, &EAX, &EBX, &ECX, &EDX);
+ if ((EDX >> 29) & 0x1) {
+ HasX86_64 = true;
+ ToggleFeature(X86::Feature64Bit);
+ }
+ if ((ECX >> 5) & 0x1) {
+ HasLZCNT = true;
+ ToggleFeature(X86::FeatureLZCNT);
+ }
+ if (IsIntel && ((ECX >> 8) & 0x1)) {
+ HasPRFCHW = true;
+ ToggleFeature(X86::FeaturePRFCHW);
+ }
+ if (IsAMD) {
+ if ((ECX >> 6) & 0x1) {
+ HasSSE4A = true;
+ ToggleFeature(X86::FeatureSSE4A);
+ }
+ if ((ECX >> 11) & 0x1) {
+ HasXOP = true;
+ ToggleFeature(X86::FeatureXOP);
+ }
+ if ((ECX >> 16) & 0x1) {
+ HasFMA4 = true;
+ ToggleFeature(X86::FeatureFMA4);
+ }
+ }
+ }
+ }
+
+ if (MaxLevel >= 7) {
+ if (!X86_MC::GetCpuIDAndInfoEx(0x7, 0x0, &EAX, &EBX, &ECX, &EDX)) {
+ if (IsIntel && (EBX & 0x1)) {
+ HasFSGSBase = true;
+ ToggleFeature(X86::FeatureFSGSBase);
+ }
+ if ((EBX >> 3) & 0x1) {
+ HasBMI = true;
+ ToggleFeature(X86::FeatureBMI);
+ }
+ if ((EBX >> 4) & 0x1) {
+ HasHLE = true;
+ ToggleFeature(X86::FeatureHLE);
+ }
+ if (IsIntel && ((EBX >> 5) & 0x1)) {
+ X86SSELevel = AVX2;
+ ToggleFeature(X86::FeatureAVX2);
+ }
+ if (IsIntel && ((EBX >> 8) & 0x1)) {
+ HasBMI2 = true;
+ ToggleFeature(X86::FeatureBMI2);
+ }
+ if (IsIntel && ((EBX >> 11) & 0x1)) {
+ HasRTM = true;
+ ToggleFeature(X86::FeatureRTM);
+ }
+ if (IsIntel && ((EBX >> 19) & 0x1)) {
+ HasADX = true;
+ ToggleFeature(X86::FeatureADX);
+ }
+ if (IsIntel && ((EBX >> 18) & 0x1)) {
+ HasRDSEED = true;
+ ToggleFeature(X86::FeatureRDSEED);
+ }
+ }
+ }
+}
+
+void X86Subtarget::resetSubtargetFeatures(const MachineFunction *MF) {
+ AttributeSet FnAttrs = MF->getFunction()->getAttributes();
+ Attribute CPUAttr = FnAttrs.getAttribute(AttributeSet::FunctionIndex,
+ "target-cpu");
+ Attribute FSAttr = FnAttrs.getAttribute(AttributeSet::FunctionIndex,
+ "target-features");
+ std::string CPU =
+ !CPUAttr.hasAttribute(Attribute::None) ?CPUAttr.getValueAsString() : "";
+ std::string FS =
+ !FSAttr.hasAttribute(Attribute::None) ? FSAttr.getValueAsString() : "";
+ if (!FS.empty()) {
+ initializeEnvironment();
+ resetSubtargetFeatures(CPU, FS);
+ }
+}
+
+void X86Subtarget::resetSubtargetFeatures(StringRef CPU, StringRef FS) {
+ std::string CPUName = CPU;
+ if (!FS.empty() || !CPU.empty()) {
+ if (CPUName.empty()) {
+#if defined(i386) || defined(__i386__) || defined(__x86__) || defined(_M_IX86)\
+ || defined(__x86_64__) || defined(_M_AMD64) || defined (_M_X64)
+ CPUName = sys::getHostCPUName();
+#else
+ CPUName = "generic";
+#endif
+ }
+
+ // Make sure 64-bit features are available in 64-bit mode. (But make sure
+ // SSE2 can be turned off explicitly.)
+ std::string FullFS = FS;
+ if (In64BitMode) {
+ if (!FullFS.empty())
+ FullFS = "+64bit,+sse2," + FullFS;
+ else
+ FullFS = "+64bit,+sse2";
+ }
+
+ // If feature string is not empty, parse features string.
+ ParseSubtargetFeatures(CPUName, FullFS);
+ } else {
+ if (CPUName.empty()) {
+#if defined (__x86_64__) || defined(__i386__)
+ CPUName = sys::getHostCPUName();
+#else
+ CPUName = "generic";
+#endif
+ }
+ // Otherwise, use CPUID to auto-detect feature set.
+ AutoDetectSubtargetFeatures();
+
+ // Make sure 64-bit features are available in 64-bit mode.
+ if (In64BitMode) {
+ HasX86_64 = true; ToggleFeature(X86::Feature64Bit);
+ HasCMov = true; ToggleFeature(X86::FeatureCMOV);
+
+ if (X86SSELevel < SSE2) {
+ X86SSELevel = SSE2;
+ ToggleFeature(X86::FeatureSSE1);
+ ToggleFeature(X86::FeatureSSE2);
+ }
+ }
+ }
+
+ // CPUName may have been set by the CPU detection code. Make sure the
+ // new MCSchedModel is used.
+ InitMCProcessorInfo(CPUName, FS);
+
+ if (X86ProcFamily == IntelAtom)
+ PostRAScheduler = true;
+
+ InstrItins = getInstrItineraryForCPU(CPUName);
+
+ // It's important to keep the MCSubtargetInfo feature bits in sync with
+ // target data structure which is shared with MC code emitter, etc.
+ if (In64BitMode)
+ ToggleFeature(X86::Mode64Bit);
+
+ DEBUG(dbgs() << "Subtarget features: SSELevel " << X86SSELevel
+ << ", 3DNowLevel " << X863DNowLevel
+ << ", 64bit " << HasX86_64 << "\n");
+ assert((!In64BitMode || HasX86_64) &&
+ "64-bit code requested on a subtarget that doesn't support it!");
+
+ // Stack alignment is 16 bytes on Darwin, Linux and Solaris (both
+ // 32 and 64 bit) and for all 64-bit targets.
+ if (StackAlignOverride)
+ stackAlignment = StackAlignOverride;
+ else if (isTargetDarwin() || isTargetLinux() || isTargetSolaris() ||
+ In64BitMode)
+ stackAlignment = 16;
+}
+
+void X86Subtarget::initializeEnvironment() {
+ X86SSELevel = NoMMXSSE;
+ X863DNowLevel = NoThreeDNow;
+ HasCMov = false;
+ HasX86_64 = false;
+ HasPOPCNT = false;
+ HasSSE4A = false;
+ HasAES = false;
+ HasPCLMUL = false;
+ HasFMA = false;
+ HasFMA4 = false;
+ HasXOP = false;
+ HasMOVBE = false;
+ HasRDRAND = false;
+ HasF16C = false;
+ HasFSGSBase = false;
+ HasLZCNT = false;
+ HasBMI = false;
+ HasBMI2 = false;
+ HasRTM = false;
+ HasHLE = false;
+ HasADX = false;
+ HasPRFCHW = false;
+ HasRDSEED = false;
+ IsBTMemSlow = false;
+ IsUAMemFast = false;
+ HasVectorUAMem = false;
+ HasCmpxchg16b = false;
+ UseLeaForSP = false;
+ HasSlowDivide = false;
+ PostRAScheduler = false;
+ PadShortFunctions = false;
+ CallRegIndirect = false;
+ LEAUsesAG = false;
+ stackAlignment = 4;
+ // FIXME: this is a known good value for Yonah. How about others?
+ MaxInlineSizeThreshold = 128;
+}
+
+X86Subtarget::X86Subtarget(const std::string &TT, const std::string &CPU,
+ const std::string &FS,
+ unsigned StackAlignOverride, bool is64Bit)
+ : X86GenSubtargetInfo(TT, CPU, FS)
+ , X86ProcFamily(Others)
+ , PICStyle(PICStyles::None)
+ , TargetTriple(TT)
+ , StackAlignOverride(StackAlignOverride)
+ , In64BitMode(is64Bit) {
+ initializeEnvironment();
+ resetSubtargetFeatures(CPU, FS);
+}
+
+bool X86Subtarget::enablePostRAScheduler(
+ CodeGenOpt::Level OptLevel,
+ TargetSubtargetInfo::AntiDepBreakMode& Mode,
+ RegClassVector& CriticalPathRCs) const {
+ Mode = TargetSubtargetInfo::ANTIDEP_CRITICAL;
+ CriticalPathRCs.clear();
+ return PostRAScheduler && OptLevel >= CodeGenOpt::Default;
+}
diff --git a/contrib/llvm/lib/Target/X86/X86Subtarget.h b/contrib/llvm/lib/Target/X86/X86Subtarget.h
new file mode 100644
index 0000000..66832b9
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86Subtarget.h
@@ -0,0 +1,383 @@
+//===-- X86Subtarget.h - Define Subtarget for the X86 ----------*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the X86 specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86SUBTARGET_H
+#define X86SUBTARGET_H
+
+#include "llvm/ADT/Triple.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <string>
+
+#define GET_SUBTARGETINFO_HEADER
+#include "X86GenSubtargetInfo.inc"
+
+namespace llvm {
+class GlobalValue;
+class StringRef;
+class TargetMachine;
+
+/// PICStyles - The X86 backend supports a number of different styles of PIC.
+///
+namespace PICStyles {
+enum Style {
+ StubPIC, // Used on i386-darwin in -fPIC mode.
+ StubDynamicNoPIC, // Used on i386-darwin in -mdynamic-no-pic mode.
+ GOT, // Used on many 32-bit unices in -fPIC mode.
+ RIPRel, // Used on X86-64 when not in -static mode.
+ None // Set when in -static mode (not PIC or DynamicNoPIC mode).
+};
+}
+
+class X86Subtarget : public X86GenSubtargetInfo {
+protected:
+ enum X86SSEEnum {
+ NoMMXSSE, MMX, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2
+ };
+
+ enum X863DNowEnum {
+ NoThreeDNow, ThreeDNow, ThreeDNowA
+ };
+
+ enum X86ProcFamilyEnum {
+ Others, IntelAtom
+ };
+
+ /// X86ProcFamily - X86 processor family: Intel Atom, and others
+ X86ProcFamilyEnum X86ProcFamily;
+
+ /// PICStyle - Which PIC style to use
+ ///
+ PICStyles::Style PICStyle;
+
+ /// X86SSELevel - MMX, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, or
+ /// none supported.
+ X86SSEEnum X86SSELevel;
+
+ /// X863DNowLevel - 3DNow or 3DNow Athlon, or none supported.
+ ///
+ X863DNowEnum X863DNowLevel;
+
+ /// HasCMov - True if this processor has conditional move instructions
+ /// (generally pentium pro+).
+ bool HasCMov;
+
+ /// HasX86_64 - True if the processor supports X86-64 instructions.
+ ///
+ bool HasX86_64;
+
+ /// HasPOPCNT - True if the processor supports POPCNT.
+ bool HasPOPCNT;
+
+ /// HasSSE4A - True if the processor supports SSE4A instructions.
+ bool HasSSE4A;
+
+ /// HasAES - Target has AES instructions
+ bool HasAES;
+
+ /// HasPCLMUL - Target has carry-less multiplication
+ bool HasPCLMUL;
+
+ /// HasFMA - Target has 3-operand fused multiply-add
+ bool HasFMA;
+
+ /// HasFMA4 - Target has 4-operand fused multiply-add
+ bool HasFMA4;
+
+ /// HasXOP - Target has XOP instructions
+ bool HasXOP;
+
+ /// HasMOVBE - True if the processor has the MOVBE instruction.
+ bool HasMOVBE;
+
+ /// HasRDRAND - True if the processor has the RDRAND instruction.
+ bool HasRDRAND;
+
+ /// HasF16C - Processor has 16-bit floating point conversion instructions.
+ bool HasF16C;
+
+ /// HasFSGSBase - Processor has FS/GS base insturctions.
+ bool HasFSGSBase;
+
+ /// HasLZCNT - Processor has LZCNT instruction.
+ bool HasLZCNT;
+
+ /// HasBMI - Processor has BMI1 instructions.
+ bool HasBMI;
+
+ /// HasBMI2 - Processor has BMI2 instructions.
+ bool HasBMI2;
+
+ /// HasRTM - Processor has RTM instructions.
+ bool HasRTM;
+
+ /// HasHLE - Processor has HLE.
+ bool HasHLE;
+
+ /// HasADX - Processor has ADX instructions.
+ bool HasADX;
+
+ /// HasPRFCHW - Processor has PRFCHW instructions.
+ bool HasPRFCHW;
+
+ /// HasRDSEED - Processor has RDSEED instructions.
+ bool HasRDSEED;
+
+ /// IsBTMemSlow - True if BT (bit test) of memory instructions are slow.
+ bool IsBTMemSlow;
+
+ /// IsUAMemFast - True if unaligned memory access is fast.
+ bool IsUAMemFast;
+
+ /// HasVectorUAMem - True if SIMD operations can have unaligned memory
+ /// operands. This may require setting a feature bit in the processor.
+ bool HasVectorUAMem;
+
+ /// HasCmpxchg16b - True if this processor has the CMPXCHG16B instruction;
+ /// this is true for most x86-64 chips, but not the first AMD chips.
+ bool HasCmpxchg16b;
+
+ /// UseLeaForSP - True if the LEA instruction should be used for adjusting
+ /// the stack pointer. This is an optimization for Intel Atom processors.
+ bool UseLeaForSP;
+
+ /// HasSlowDivide - True if smaller divides are significantly faster than
+ /// full divides and should be used when possible.
+ bool HasSlowDivide;
+
+ /// PostRAScheduler - True if using post-register-allocation scheduler.
+ bool PostRAScheduler;
+
+ /// PadShortFunctions - True if the short functions should be padded to prevent
+ /// a stall when returning too early.
+ bool PadShortFunctions;
+
+ /// CallRegIndirect - True if the Calls with memory reference should be converted
+ /// to a register-based indirect call.
+ bool CallRegIndirect;
+ /// LEAUsesAG - True if the LEA instruction inputs have to be ready at
+ /// address generation (AG) time.
+ bool LEAUsesAG;
+
+ /// stackAlignment - The minimum alignment known to hold of the stack frame on
+ /// entry to the function and which must be maintained by every function.
+ unsigned stackAlignment;
+
+ /// Max. memset / memcpy size that is turned into rep/movs, rep/stos ops.
+ ///
+ unsigned MaxInlineSizeThreshold;
+
+ /// TargetTriple - What processor and OS we're targeting.
+ Triple TargetTriple;
+
+ /// Instruction itineraries for scheduling
+ InstrItineraryData InstrItins;
+
+private:
+ /// StackAlignOverride - Override the stack alignment.
+ unsigned StackAlignOverride;
+
+ /// In64BitMode - True if compiling for 64-bit, false for 32-bit.
+ bool In64BitMode;
+
+public:
+ /// This constructor initializes the data members to match that
+ /// of the specified triple.
+ ///
+ X86Subtarget(const std::string &TT, const std::string &CPU,
+ const std::string &FS,
+ unsigned StackAlignOverride, bool is64Bit);
+
+ /// getStackAlignment - Returns the minimum alignment known to hold of the
+ /// stack frame on entry to the function and which must be maintained by every
+ /// function for this subtarget.
+ unsigned getStackAlignment() const { return stackAlignment; }
+
+ /// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size
+ /// that still makes it profitable to inline the call.
+ unsigned getMaxInlineSizeThreshold() const { return MaxInlineSizeThreshold; }
+
+ /// ParseSubtargetFeatures - Parses features string setting specified
+ /// subtarget options. Definition of function is auto generated by tblgen.
+ void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+
+ /// AutoDetectSubtargetFeatures - Auto-detect CPU features using CPUID
+ /// instruction.
+ void AutoDetectSubtargetFeatures();
+
+ /// \brief Reset the features for the X86 target.
+ virtual void resetSubtargetFeatures(const MachineFunction *MF);
+private:
+ void initializeEnvironment();
+ void resetSubtargetFeatures(StringRef CPU, StringRef FS);
+public:
+ /// Is this x86_64? (disregarding specific ABI / programming model)
+ bool is64Bit() const {
+ return In64BitMode;
+ }
+
+ /// Is this x86_64 with the ILP32 programming model (x32 ABI)?
+ bool isTarget64BitILP32() const {
+ return In64BitMode && (TargetTriple.getEnvironment() == Triple::GNUX32);
+ }
+
+ /// Is this x86_64 with the LP64 programming model (standard AMD64, no x32)?
+ bool isTarget64BitLP64() const {
+ return In64BitMode && (TargetTriple.getEnvironment() != Triple::GNUX32);
+ }
+
+ PICStyles::Style getPICStyle() const { return PICStyle; }
+ void setPICStyle(PICStyles::Style Style) { PICStyle = Style; }
+
+ bool hasCMov() const { return HasCMov; }
+ bool hasMMX() const { return X86SSELevel >= MMX; }
+ bool hasSSE1() const { return X86SSELevel >= SSE1; }
+ bool hasSSE2() const { return X86SSELevel >= SSE2; }
+ bool hasSSE3() const { return X86SSELevel >= SSE3; }
+ bool hasSSSE3() const { return X86SSELevel >= SSSE3; }
+ bool hasSSE41() const { return X86SSELevel >= SSE41; }
+ bool hasSSE42() const { return X86SSELevel >= SSE42; }
+ bool hasAVX() const { return X86SSELevel >= AVX; }
+ bool hasAVX2() const { return X86SSELevel >= AVX2; }
+ bool hasFp256() const { return hasAVX(); }
+ bool hasInt256() const { return hasAVX2(); }
+ bool hasSSE4A() const { return HasSSE4A; }
+ bool has3DNow() const { return X863DNowLevel >= ThreeDNow; }
+ bool has3DNowA() const { return X863DNowLevel >= ThreeDNowA; }
+ bool hasPOPCNT() const { return HasPOPCNT; }
+ bool hasAES() const { return HasAES; }
+ bool hasPCLMUL() const { return HasPCLMUL; }
+ bool hasFMA() const { return HasFMA; }
+ // FIXME: Favor FMA when both are enabled. Is this the right thing to do?
+ bool hasFMA4() const { return HasFMA4 && !HasFMA; }
+ bool hasXOP() const { return HasXOP; }
+ bool hasMOVBE() const { return HasMOVBE; }
+ bool hasRDRAND() const { return HasRDRAND; }
+ bool hasF16C() const { return HasF16C; }
+ bool hasFSGSBase() const { return HasFSGSBase; }
+ bool hasLZCNT() const { return HasLZCNT; }
+ bool hasBMI() const { return HasBMI; }
+ bool hasBMI2() const { return HasBMI2; }
+ bool hasRTM() const { return HasRTM; }
+ bool hasHLE() const { return HasHLE; }
+ bool hasADX() const { return HasADX; }
+ bool hasPRFCHW() const { return HasPRFCHW; }
+ bool hasRDSEED() const { return HasRDSEED; }
+ bool isBTMemSlow() const { return IsBTMemSlow; }
+ bool isUnalignedMemAccessFast() const { return IsUAMemFast; }
+ bool hasVectorUAMem() const { return HasVectorUAMem; }
+ bool hasCmpxchg16b() const { return HasCmpxchg16b; }
+ bool useLeaForSP() const { return UseLeaForSP; }
+ bool hasSlowDivide() const { return HasSlowDivide; }
+ bool padShortFunctions() const { return PadShortFunctions; }
+ bool callRegIndirect() const { return CallRegIndirect; }
+ bool LEAusesAG() const { return LEAUsesAG; }
+
+ bool isAtom() const { return X86ProcFamily == IntelAtom; }
+
+ const Triple &getTargetTriple() const { return TargetTriple; }
+
+ bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); }
+ bool isTargetFreeBSD() const {
+ return TargetTriple.getOS() == Triple::FreeBSD;
+ }
+ bool isTargetSolaris() const {
+ return TargetTriple.getOS() == Triple::Solaris;
+ }
+ bool isTargetELF() const {
+ return (TargetTriple.getEnvironment() == Triple::ELF ||
+ TargetTriple.isOSBinFormatELF());
+ }
+ bool isTargetLinux() const { return TargetTriple.getOS() == Triple::Linux; }
+ bool isTargetNaCl() const {
+ return TargetTriple.getOS() == Triple::NaCl;
+ }
+ bool isTargetNaCl32() const { return isTargetNaCl() && !is64Bit(); }
+ bool isTargetNaCl64() const { return isTargetNaCl() && is64Bit(); }
+ bool isTargetWindows() const { return TargetTriple.getOS() == Triple::Win32; }
+ bool isTargetMingw() const { return TargetTriple.getOS() == Triple::MinGW32; }
+ bool isTargetCygwin() const { return TargetTriple.getOS() == Triple::Cygwin; }
+ bool isTargetCygMing() const { return TargetTriple.isOSCygMing(); }
+ bool isTargetCOFF() const {
+ return (TargetTriple.getEnvironment() != Triple::ELF &&
+ TargetTriple.isOSBinFormatCOFF());
+ }
+ bool isTargetEnvMacho() const { return TargetTriple.isEnvironmentMachO(); }
+
+ bool isTargetWin64() const {
+ // FIXME: x86_64-cygwin has not been released yet.
+ return In64BitMode && TargetTriple.isOSWindows();
+ }
+
+ bool isTargetWin32() const {
+ // FIXME: Cygwin is included for isTargetWin64 -- should it be included
+ // here too?
+ return !In64BitMode && (isTargetMingw() || isTargetWindows());
+ }
+
+ bool isPICStyleSet() const { return PICStyle != PICStyles::None; }
+ bool isPICStyleGOT() const { return PICStyle == PICStyles::GOT; }
+ bool isPICStyleRIPRel() const { return PICStyle == PICStyles::RIPRel; }
+
+ bool isPICStyleStubPIC() const {
+ return PICStyle == PICStyles::StubPIC;
+ }
+
+ bool isPICStyleStubNoDynamic() const {
+ return PICStyle == PICStyles::StubDynamicNoPIC;
+ }
+ bool isPICStyleStubAny() const {
+ return PICStyle == PICStyles::StubDynamicNoPIC ||
+ PICStyle == PICStyles::StubPIC; }
+
+ /// ClassifyGlobalReference - Classify a global variable reference for the
+ /// current subtarget according to how we should reference it in a non-pcrel
+ /// context.
+ unsigned char ClassifyGlobalReference(const GlobalValue *GV,
+ const TargetMachine &TM)const;
+
+ /// ClassifyBlockAddressReference - Classify a blockaddress reference for the
+ /// current subtarget according to how we should reference it in a non-pcrel
+ /// context.
+ unsigned char ClassifyBlockAddressReference() const;
+
+ /// IsLegalToCallImmediateAddr - Return true if the subtarget allows calls
+ /// to immediate address.
+ bool IsLegalToCallImmediateAddr(const TargetMachine &TM) const;
+
+ /// This function returns the name of a function which has an interface
+ /// like the non-standard bzero function, if such a function exists on
+ /// the current subtarget and it is considered prefereable over
+ /// memset with zero passed as the second argument. Otherwise it
+ /// returns null.
+ const char *getBZeroEntry() const;
+
+ /// This function returns true if the target has sincos() routine in its
+ /// compiler runtime or math libraries.
+ bool hasSinCos() const;
+
+ /// enablePostRAScheduler - run for Atom optimization.
+ bool enablePostRAScheduler(CodeGenOpt::Level OptLevel,
+ TargetSubtargetInfo::AntiDepBreakMode& Mode,
+ RegClassVector& CriticalPathRCs) const;
+
+ bool postRAScheduler() const { return PostRAScheduler; }
+
+ /// getInstrItins = Return the instruction itineraries based on the
+ /// subtarget selection.
+ const InstrItineraryData &getInstrItineraryData() const { return InstrItins; }
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp b/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp
new file mode 100644
index 0000000..00fa47f
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -0,0 +1,232 @@
+//===-- X86TargetMachine.cpp - Define TargetMachine for the X86 -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the X86 specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86TargetMachine.h"
+#include "X86.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/PassManager.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+extern "C" void LLVMInitializeX86Target() {
+ // Register the target.
+ RegisterTargetMachine<X86_32TargetMachine> X(TheX86_32Target);
+ RegisterTargetMachine<X86_64TargetMachine> Y(TheX86_64Target);
+}
+
+void X86_32TargetMachine::anchor() { }
+
+X86_32TargetMachine::X86_32TargetMachine(const Target &T, StringRef TT,
+ StringRef CPU, StringRef FS,
+ const TargetOptions &Options,
+ Reloc::Model RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL)
+ : X86TargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false),
+ DL(getSubtargetImpl()->isTargetDarwin() ?
+ "e-p:32:32-f64:32:64-i64:32:64-f80:128:128-f128:128:128-"
+ "n8:16:32-S128" :
+ (getSubtargetImpl()->isTargetCygMing() ||
+ getSubtargetImpl()->isTargetWindows()) ?
+ "e-p:32:32-f64:64:64-i64:64:64-f80:32:32-f128:128:128-"
+ "n8:16:32-S32" :
+ "e-p:32:32-f64:32:64-i64:32:64-f80:32:32-f128:128:128-"
+ "n8:16:32-S128"),
+ InstrInfo(*this),
+ TLInfo(*this),
+ TSInfo(*this),
+ JITInfo(*this) {
+}
+
+void X86_64TargetMachine::anchor() { }
+
+X86_64TargetMachine::X86_64TargetMachine(const Target &T, StringRef TT,
+ StringRef CPU, StringRef FS,
+ const TargetOptions &Options,
+ Reloc::Model RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL)
+ : X86TargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true),
+ // The x32 ABI dictates the ILP32 programming model for x64.
+ DL(getSubtargetImpl()->isTarget64BitILP32() ?
+ "e-p:32:32-s:64-f64:64:64-i64:64:64-f80:128:128-f128:128:128-"
+ "n8:16:32:64-S128" :
+ "e-p:64:64-s:64-f64:64:64-i64:64:64-f80:128:128-f128:128:128-"
+ "n8:16:32:64-S128"),
+ InstrInfo(*this),
+ TLInfo(*this),
+ TSInfo(*this),
+ JITInfo(*this) {
+}
+
+/// X86TargetMachine ctor - Create an X86 target.
+///
+X86TargetMachine::X86TargetMachine(const Target &T, StringRef TT,
+ StringRef CPU, StringRef FS,
+ const TargetOptions &Options,
+ Reloc::Model RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL,
+ bool is64Bit)
+ : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
+ Subtarget(TT, CPU, FS, Options.StackAlignmentOverride, is64Bit),
+ FrameLowering(*this, Subtarget),
+ InstrItins(Subtarget.getInstrItineraryData()){
+ // Determine the PICStyle based on the target selected.
+ if (getRelocationModel() == Reloc::Static) {
+ // Unless we're in PIC or DynamicNoPIC mode, set the PIC style to None.
+ Subtarget.setPICStyle(PICStyles::None);
+ } else if (Subtarget.is64Bit()) {
+ // PIC in 64 bit mode is always rip-rel.
+ Subtarget.setPICStyle(PICStyles::RIPRel);
+ } else if (Subtarget.isTargetCygMing()) {
+ Subtarget.setPICStyle(PICStyles::None);
+ } else if (Subtarget.isTargetDarwin()) {
+ if (getRelocationModel() == Reloc::PIC_)
+ Subtarget.setPICStyle(PICStyles::StubPIC);
+ else {
+ assert(getRelocationModel() == Reloc::DynamicNoPIC);
+ Subtarget.setPICStyle(PICStyles::StubDynamicNoPIC);
+ }
+ } else if (Subtarget.isTargetELF()) {
+ Subtarget.setPICStyle(PICStyles::GOT);
+ }
+
+ // default to hard float ABI
+ if (Options.FloatABIType == FloatABI::Default)
+ this->Options.FloatABIType = FloatABI::Hard;
+}
+
+//===----------------------------------------------------------------------===//
+// Command line options for x86
+//===----------------------------------------------------------------------===//
+static cl::opt<bool>
+UseVZeroUpper("x86-use-vzeroupper",
+ cl::desc("Minimize AVX to SSE transition penalty"),
+ cl::init(true));
+
+// Temporary option to control early if-conversion for x86 while adding machine
+// models.
+static cl::opt<bool>
+X86EarlyIfConv("x86-early-ifcvt",
+ cl::desc("Enable early if-conversion on X86"));
+
+//===----------------------------------------------------------------------===//
+// X86 Analysis Pass Setup
+//===----------------------------------------------------------------------===//
+
+void X86TargetMachine::addAnalysisPasses(PassManagerBase &PM) {
+ // Add first the target-independent BasicTTI pass, then our X86 pass. This
+ // allows the X86 pass to delegate to the target independent layer when
+ // appropriate.
+ PM.add(createBasicTargetTransformInfoPass(getTargetLowering()));
+ PM.add(createX86TargetTransformInfoPass(this));
+}
+
+
+//===----------------------------------------------------------------------===//
+// Pass Pipeline Configuration
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// X86 Code Generator Pass Configuration Options.
+class X86PassConfig : public TargetPassConfig {
+public:
+ X86PassConfig(X86TargetMachine *TM, PassManagerBase &PM)
+ : TargetPassConfig(TM, PM) {}
+
+ X86TargetMachine &getX86TargetMachine() const {
+ return getTM<X86TargetMachine>();
+ }
+
+ const X86Subtarget &getX86Subtarget() const {
+ return *getX86TargetMachine().getSubtargetImpl();
+ }
+
+ virtual bool addInstSelector();
+ virtual bool addILPOpts();
+ virtual bool addPreRegAlloc();
+ virtual bool addPostRegAlloc();
+ virtual bool addPreEmitPass();
+};
+} // namespace
+
+TargetPassConfig *X86TargetMachine::createPassConfig(PassManagerBase &PM) {
+ return new X86PassConfig(this, PM);
+}
+
+bool X86PassConfig::addInstSelector() {
+ // Install an instruction selector.
+ addPass(createX86ISelDag(getX86TargetMachine(), getOptLevel()));
+
+ // For ELF, cleanup any local-dynamic TLS accesses.
+ if (getX86Subtarget().isTargetELF() && getOptLevel() != CodeGenOpt::None)
+ addPass(createCleanupLocalDynamicTLSPass());
+
+ // For 32-bit, prepend instructions to set the "global base reg" for PIC.
+ if (!getX86Subtarget().is64Bit())
+ addPass(createGlobalBaseRegPass());
+
+ return false;
+}
+
+bool X86PassConfig::addILPOpts() {
+ if (X86EarlyIfConv && getX86Subtarget().hasCMov()) {
+ addPass(&EarlyIfConverterID);
+ return true;
+ }
+ return false;
+}
+
+bool X86PassConfig::addPreRegAlloc() {
+ return false; // -print-machineinstr shouldn't print after this.
+}
+
+bool X86PassConfig::addPostRegAlloc() {
+ addPass(createX86FloatingPointStackifierPass());
+ return true; // -print-machineinstr should print after this.
+}
+
+bool X86PassConfig::addPreEmitPass() {
+ bool ShouldPrint = false;
+ if (getOptLevel() != CodeGenOpt::None && getX86Subtarget().hasSSE2()) {
+ addPass(createExecutionDependencyFixPass(&X86::VR128RegClass));
+ ShouldPrint = true;
+ }
+
+ if (getX86Subtarget().hasAVX() && UseVZeroUpper) {
+ addPass(createX86IssueVZeroUpperPass());
+ ShouldPrint = true;
+ }
+
+ if (getOptLevel() != CodeGenOpt::None &&
+ getX86Subtarget().padShortFunctions()) {
+ addPass(createX86PadShortFunctions());
+ ShouldPrint = true;
+ }
+ if (getOptLevel() != CodeGenOpt::None &&
+ getX86Subtarget().LEAusesAG()){
+ addPass(createX86FixupLEAs());
+ ShouldPrint = true;
+ }
+
+ return ShouldPrint;
+}
+
+bool X86TargetMachine::addCodeEmitter(PassManagerBase &PM,
+ JITCodeEmitter &JCE) {
+ PM.add(createX86JITCodeEmitterPass(*this, JCE));
+
+ return false;
+}
diff --git a/contrib/llvm/lib/Target/X86/X86TargetMachine.h b/contrib/llvm/lib/Target/X86/X86TargetMachine.h
new file mode 100644
index 0000000..174d391
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86TargetMachine.h
@@ -0,0 +1,137 @@
+//===-- X86TargetMachine.h - Define TargetMachine for the X86 ---*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the X86 specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86TARGETMACHINE_H
+#define X86TARGETMACHINE_H
+
+#include "X86.h"
+#include "X86FrameLowering.h"
+#include "X86ISelLowering.h"
+#include "X86InstrInfo.h"
+#include "X86JITInfo.h"
+#include "X86SelectionDAGInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+class StringRef;
+
+class X86TargetMachine : public LLVMTargetMachine {
+ X86Subtarget Subtarget;
+ X86FrameLowering FrameLowering;
+ InstrItineraryData InstrItins;
+
+public:
+ X86TargetMachine(const Target &T, StringRef TT,
+ StringRef CPU, StringRef FS, const TargetOptions &Options,
+ Reloc::Model RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL,
+ bool is64Bit);
+
+ virtual const X86InstrInfo *getInstrInfo() const {
+ llvm_unreachable("getInstrInfo not implemented");
+ }
+ virtual const TargetFrameLowering *getFrameLowering() const {
+ return &FrameLowering;
+ }
+ virtual X86JITInfo *getJITInfo() {
+ llvm_unreachable("getJITInfo not implemented");
+ }
+ virtual const X86Subtarget *getSubtargetImpl() const{ return &Subtarget; }
+ virtual const X86TargetLowering *getTargetLowering() const {
+ llvm_unreachable("getTargetLowering not implemented");
+ }
+ virtual const X86SelectionDAGInfo *getSelectionDAGInfo() const {
+ llvm_unreachable("getSelectionDAGInfo not implemented");
+ }
+ virtual const X86RegisterInfo *getRegisterInfo() const {
+ return &getInstrInfo()->getRegisterInfo();
+ }
+ virtual const InstrItineraryData *getInstrItineraryData() const {
+ return &InstrItins;
+ }
+
+ /// \brief Register X86 analysis passes with a pass manager.
+ virtual void addAnalysisPasses(PassManagerBase &PM);
+
+ // Set up the pass pipeline.
+ virtual TargetPassConfig *createPassConfig(PassManagerBase &PM);
+
+ virtual bool addCodeEmitter(PassManagerBase &PM,
+ JITCodeEmitter &JCE);
+};
+
+/// X86_32TargetMachine - X86 32-bit target machine.
+///
+class X86_32TargetMachine : public X86TargetMachine {
+ virtual void anchor();
+ const DataLayout DL; // Calculates type size & alignment
+ X86InstrInfo InstrInfo;
+ X86TargetLowering TLInfo;
+ X86SelectionDAGInfo TSInfo;
+ X86JITInfo JITInfo;
+public:
+ X86_32TargetMachine(const Target &T, StringRef TT,
+ StringRef CPU, StringRef FS, const TargetOptions &Options,
+ Reloc::Model RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL);
+ virtual const DataLayout *getDataLayout() const { return &DL; }
+ virtual const X86TargetLowering *getTargetLowering() const {
+ return &TLInfo;
+ }
+ virtual const X86SelectionDAGInfo *getSelectionDAGInfo() const {
+ return &TSInfo;
+ }
+ virtual const X86InstrInfo *getInstrInfo() const {
+ return &InstrInfo;
+ }
+ virtual X86JITInfo *getJITInfo() {
+ return &JITInfo;
+ }
+};
+
+/// X86_64TargetMachine - X86 64-bit target machine.
+///
+class X86_64TargetMachine : public X86TargetMachine {
+ virtual void anchor();
+ const DataLayout DL; // Calculates type size & alignment
+ X86InstrInfo InstrInfo;
+ X86TargetLowering TLInfo;
+ X86SelectionDAGInfo TSInfo;
+ X86JITInfo JITInfo;
+public:
+ X86_64TargetMachine(const Target &T, StringRef TT,
+ StringRef CPU, StringRef FS, const TargetOptions &Options,
+ Reloc::Model RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL);
+ virtual const DataLayout *getDataLayout() const { return &DL; }
+ virtual const X86TargetLowering *getTargetLowering() const {
+ return &TLInfo;
+ }
+ virtual const X86SelectionDAGInfo *getSelectionDAGInfo() const {
+ return &TSInfo;
+ }
+ virtual const X86InstrInfo *getInstrInfo() const {
+ return &InstrInfo;
+ }
+ virtual X86JITInfo *getJITInfo() {
+ return &JITInfo;
+ }
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp
new file mode 100644
index 0000000..871dacd
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp
@@ -0,0 +1,49 @@
+//===-- X86TargetObjectFile.cpp - X86 Object Info -------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86TargetObjectFile.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Target/Mangler.h"
+
+using namespace llvm;
+using namespace dwarf;
+
+const MCExpr *X86_64MachoTargetObjectFile::
+getTTypeGlobalReference(const GlobalValue *GV, Mangler *Mang,
+ MachineModuleInfo *MMI, unsigned Encoding,
+ MCStreamer &Streamer) const {
+
+ // On Darwin/X86-64, we can reference dwarf symbols with foo@GOTPCREL+4, which
+ // is an indirect pc-relative reference.
+ if (Encoding & (DW_EH_PE_indirect | DW_EH_PE_pcrel)) {
+ const MCSymbol *Sym = Mang->getSymbol(GV);
+ const MCExpr *Res =
+ MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOTPCREL, getContext());
+ const MCExpr *Four = MCConstantExpr::Create(4, getContext());
+ return MCBinaryExpr::CreateAdd(Res, Four, getContext());
+ }
+
+ return TargetLoweringObjectFileMachO::
+ getTTypeGlobalReference(GV, Mang, MMI, Encoding, Streamer);
+}
+
+MCSymbol *X86_64MachoTargetObjectFile::
+getCFIPersonalitySymbol(const GlobalValue *GV, Mangler *Mang,
+ MachineModuleInfo *MMI) const {
+ return Mang->getSymbol(GV);
+}
+
+void
+X86LinuxTargetObjectFile::Initialize(MCContext &Ctx, const TargetMachine &TM) {
+ TargetLoweringObjectFileELF::Initialize(Ctx, TM);
+ InitializeELF(TM.Options.UseInitArray);
+}
diff --git a/contrib/llvm/lib/Target/X86/X86TargetObjectFile.h b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.h
new file mode 100644
index 0000000..9d26d38
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.h
@@ -0,0 +1,43 @@
+//===-- X86TargetObjectFile.h - X86 Object Info -----------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_X86_TARGETOBJECTFILE_H
+#define LLVM_TARGET_X86_TARGETOBJECTFILE_H
+
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+ /// X86_64MachoTargetObjectFile - This TLOF implementation is used for Darwin
+ /// x86-64.
+ class X86_64MachoTargetObjectFile : public TargetLoweringObjectFileMachO {
+ public:
+ virtual const MCExpr *
+ getTTypeGlobalReference(const GlobalValue *GV, Mangler *Mang,
+ MachineModuleInfo *MMI, unsigned Encoding,
+ MCStreamer &Streamer) const;
+
+ // getCFIPersonalitySymbol - The symbol that gets passed to
+ // .cfi_personality.
+ virtual MCSymbol *
+ getCFIPersonalitySymbol(const GlobalValue *GV, Mangler *Mang,
+ MachineModuleInfo *MMI) const;
+ };
+
+ /// X86LinuxTargetObjectFile - This implementation is used for linux x86
+ /// and x86-64.
+ class X86LinuxTargetObjectFile : public TargetLoweringObjectFileELF {
+ virtual void Initialize(MCContext &Ctx, const TargetMachine &TM);
+ };
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
new file mode 100644
index 0000000..eba9d78
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -0,0 +1,530 @@
+//===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements a TargetTransformInfo analysis pass specific to the
+/// X86 target machine. It uses the target's detailed information to provide
+/// more precise answers to certain TTI queries, while letting the target
+/// independent and default TTI implementations handle the rest.
+///
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "x86tti"
+#include "X86.h"
+#include "X86TargetMachine.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/CostTable.h"
+using namespace llvm;
+
+// Declare the pass initialization routine locally as target-specific passes
+// don't havve a target-wide initialization entry point, and so we rely on the
+// pass constructor initialization.
+namespace llvm {
+void initializeX86TTIPass(PassRegistry &);
+}
+
+namespace {
+
+class X86TTI : public ImmutablePass, public TargetTransformInfo {
+ const X86TargetMachine *TM;
+ const X86Subtarget *ST;
+ const X86TargetLowering *TLI;
+
+ /// Estimate the overhead of scalarizing an instruction. Insert and Extract
+ /// are set if the result needs to be inserted and/or extracted from vectors.
+ unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;
+
+public:
+ X86TTI() : ImmutablePass(ID), TM(0), ST(0), TLI(0) {
+ llvm_unreachable("This pass cannot be directly constructed");
+ }
+
+ X86TTI(const X86TargetMachine *TM)
+ : ImmutablePass(ID), TM(TM), ST(TM->getSubtargetImpl()),
+ TLI(TM->getTargetLowering()) {
+ initializeX86TTIPass(*PassRegistry::getPassRegistry());
+ }
+
+ virtual void initializePass() {
+ pushTTIStack(this);
+ }
+
+ virtual void finalizePass() {
+ popTTIStack();
+ }
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ TargetTransformInfo::getAnalysisUsage(AU);
+ }
+
+ /// Pass identification.
+ static char ID;
+
+ /// Provide necessary pointer adjustments for the two base classes.
+ virtual void *getAdjustedAnalysisPointer(const void *ID) {
+ if (ID == &TargetTransformInfo::ID)
+ return (TargetTransformInfo*)this;
+ return this;
+ }
+
+ /// \name Scalar TTI Implementations
+ /// @{
+ virtual PopcntSupportKind getPopcntSupport(unsigned TyWidth) const;
+
+ /// @}
+
+ /// \name Vector TTI Implementations
+ /// @{
+
+ virtual unsigned getNumberOfRegisters(bool Vector) const;
+ virtual unsigned getRegisterBitWidth(bool Vector) const;
+ virtual unsigned getMaximumUnrollFactor() const;
+ virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty,
+ OperandValueKind,
+ OperandValueKind) const;
+ virtual unsigned getShuffleCost(ShuffleKind Kind, Type *Tp,
+ int Index, Type *SubTp) const;
+ virtual unsigned getCastInstrCost(unsigned Opcode, Type *Dst,
+ Type *Src) const;
+ virtual unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
+ Type *CondTy) const;
+ virtual unsigned getVectorInstrCost(unsigned Opcode, Type *Val,
+ unsigned Index) const;
+ virtual unsigned getMemoryOpCost(unsigned Opcode, Type *Src,
+ unsigned Alignment,
+ unsigned AddressSpace) const;
+
+ /// @}
+};
+
+} // end anonymous namespace
+
+INITIALIZE_AG_PASS(X86TTI, TargetTransformInfo, "x86tti",
+ "X86 Target Transform Info", true, true, false)
+char X86TTI::ID = 0;
+
+ImmutablePass *
+llvm::createX86TargetTransformInfoPass(const X86TargetMachine *TM) {
+ return new X86TTI(TM);
+}
+
+
+//===----------------------------------------------------------------------===//
+//
+// X86 cost model.
+//
+//===----------------------------------------------------------------------===//
+
+X86TTI::PopcntSupportKind X86TTI::getPopcntSupport(unsigned TyWidth) const {
+ assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
+ // TODO: Currently the __builtin_popcount() implementation using SSE3
+ // instructions is inefficient. Once the problem is fixed, we should
+ // call ST->hasSSE3() instead of ST->hasSSE4().
+ return ST->hasSSE41() ? PSK_FastHardware : PSK_Software;
+}
+
+unsigned X86TTI::getNumberOfRegisters(bool Vector) const {
+ if (Vector && !ST->hasSSE1())
+ return 0;
+
+ if (ST->is64Bit())
+ return 16;
+ return 8;
+}
+
+unsigned X86TTI::getRegisterBitWidth(bool Vector) const {
+ if (Vector) {
+ if (ST->hasAVX()) return 256;
+ if (ST->hasSSE1()) return 128;
+ return 0;
+ }
+
+ if (ST->is64Bit())
+ return 64;
+ return 32;
+
+}
+
+unsigned X86TTI::getMaximumUnrollFactor() const {
+ if (ST->isAtom())
+ return 1;
+
+ // Sandybridge and Haswell have multiple execution ports and pipelined
+ // vector units.
+ if (ST->hasAVX())
+ return 4;
+
+ return 2;
+}
+
+unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
+ OperandValueKind Op1Info,
+ OperandValueKind Op2Info) const {
+ // Legalize the type.
+ std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty);
+
+ int ISD = TLI->InstructionOpcodeToISD(Opcode);
+ assert(ISD && "Invalid opcode");
+
+ static const CostTblEntry<MVT> AVX2CostTable[] = {
+ // Shifts on v4i64/v8i32 on AVX2 is legal even though we declare to
+ // customize them to detect the cases where shift amount is a scalar one.
+ { ISD::SHL, MVT::v4i32, 1 },
+ { ISD::SRL, MVT::v4i32, 1 },
+ { ISD::SRA, MVT::v4i32, 1 },
+ { ISD::SHL, MVT::v8i32, 1 },
+ { ISD::SRL, MVT::v8i32, 1 },
+ { ISD::SRA, MVT::v8i32, 1 },
+ { ISD::SHL, MVT::v2i64, 1 },
+ { ISD::SRL, MVT::v2i64, 1 },
+ { ISD::SHL, MVT::v4i64, 1 },
+ { ISD::SRL, MVT::v4i64, 1 },
+
+ { ISD::SHL, MVT::v32i8, 42 }, // cmpeqb sequence.
+ { ISD::SHL, MVT::v16i16, 16*10 }, // Scalarized.
+
+ { ISD::SRL, MVT::v32i8, 32*10 }, // Scalarized.
+ { ISD::SRL, MVT::v16i16, 8*10 }, // Scalarized.
+
+ { ISD::SRA, MVT::v32i8, 32*10 }, // Scalarized.
+ { ISD::SRA, MVT::v16i16, 16*10 }, // Scalarized.
+ { ISD::SRA, MVT::v4i64, 4*10 }, // Scalarized.
+ };
+
+ // Look for AVX2 lowering tricks.
+ if (ST->hasAVX2()) {
+ int Idx = CostTableLookup<MVT>(AVX2CostTable, array_lengthof(AVX2CostTable),
+ ISD, LT.second);
+ if (Idx != -1)
+ return LT.first * AVX2CostTable[Idx].Cost;
+ }
+
+ static const CostTblEntry<MVT> SSE2UniformConstCostTable[] = {
+ // We don't correctly identify costs of casts because they are marked as
+ // custom.
+ // Constant splats are cheaper for the following instructions.
+ { ISD::SHL, MVT::v16i8, 1 }, // psllw.
+ { ISD::SHL, MVT::v8i16, 1 }, // psllw.
+ { ISD::SHL, MVT::v4i32, 1 }, // pslld
+ { ISD::SHL, MVT::v2i64, 1 }, // psllq.
+
+ { ISD::SRL, MVT::v16i8, 1 }, // psrlw.
+ { ISD::SRL, MVT::v8i16, 1 }, // psrlw.
+ { ISD::SRL, MVT::v4i32, 1 }, // psrld.
+ { ISD::SRL, MVT::v2i64, 1 }, // psrlq.
+
+ { ISD::SRA, MVT::v16i8, 4 }, // psrlw, pand, pxor, psubb.
+ { ISD::SRA, MVT::v8i16, 1 }, // psraw.
+ { ISD::SRA, MVT::v4i32, 1 }, // psrad.
+ };
+
+ if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
+ ST->hasSSE2()) {
+ int Idx = CostTableLookup<MVT>(SSE2UniformConstCostTable,
+ array_lengthof(SSE2UniformConstCostTable),
+ ISD, LT.second);
+ if (Idx != -1)
+ return LT.first * SSE2UniformConstCostTable[Idx].Cost;
+ }
+
+
+ static const CostTblEntry<MVT> SSE2CostTable[] = {
+ // We don't correctly identify costs of casts because they are marked as
+ // custom.
+ // For some cases, where the shift amount is a scalar we would be able
+ // to generate better code. Unfortunately, when this is the case the value
+ // (the splat) will get hoisted out of the loop, thereby making it invisible
+ // to ISel. The cost model must return worst case assumptions because it is
+ // used for vectorization and we don't want to make vectorized code worse
+ // than scalar code.
+ { ISD::SHL, MVT::v16i8, 30 }, // cmpeqb sequence.
+ { ISD::SHL, MVT::v8i16, 8*10 }, // Scalarized.
+ { ISD::SHL, MVT::v4i32, 2*5 }, // We optimized this using mul.
+ { ISD::SHL, MVT::v2i64, 2*10 }, // Scalarized.
+
+ { ISD::SRL, MVT::v16i8, 16*10 }, // Scalarized.
+ { ISD::SRL, MVT::v8i16, 8*10 }, // Scalarized.
+ { ISD::SRL, MVT::v4i32, 4*10 }, // Scalarized.
+ { ISD::SRL, MVT::v2i64, 2*10 }, // Scalarized.
+
+ { ISD::SRA, MVT::v16i8, 16*10 }, // Scalarized.
+ { ISD::SRA, MVT::v8i16, 8*10 }, // Scalarized.
+ { ISD::SRA, MVT::v4i32, 4*10 }, // Scalarized.
+ { ISD::SRA, MVT::v2i64, 2*10 }, // Scalarized.
+ };
+
+ if (ST->hasSSE2()) {
+ int Idx = CostTableLookup<MVT>(SSE2CostTable, array_lengthof(SSE2CostTable),
+ ISD, LT.second);
+ if (Idx != -1)
+ return LT.first * SSE2CostTable[Idx].Cost;
+ }
+
+ static const CostTblEntry<MVT> AVX1CostTable[] = {
+ // We don't have to scalarize unsupported ops. We can issue two half-sized
+ // operations and we only need to extract the upper YMM half.
+ // Two ops + 1 extract + 1 insert = 4.
+ { ISD::MUL, MVT::v8i32, 4 },
+ { ISD::SUB, MVT::v8i32, 4 },
+ { ISD::ADD, MVT::v8i32, 4 },
+ { ISD::SUB, MVT::v4i64, 4 },
+ { ISD::ADD, MVT::v4i64, 4 },
+ // A v4i64 multiply is custom lowered as two split v2i64 vectors that then
+ // are lowered as a series of long multiplies(3), shifts(4) and adds(2)
+ // Because we believe v4i64 to be a legal type, we must also include the
+ // split factor of two in the cost table. Therefore, the cost here is 18
+ // instead of 9.
+ { ISD::MUL, MVT::v4i64, 18 },
+ };
+
+ // Look for AVX1 lowering tricks.
+ if (ST->hasAVX() && !ST->hasAVX2()) {
+ int Idx = CostTableLookup<MVT>(AVX1CostTable, array_lengthof(AVX1CostTable),
+ ISD, LT.second);
+ if (Idx != -1)
+ return LT.first * AVX1CostTable[Idx].Cost;
+ }
+
+ // Custom lowering of vectors.
+ static const CostTblEntry<MVT> CustomLowered[] = {
+ // A v2i64/v4i64 and multiply is custom lowered as a series of long
+ // multiplies(3), shifts(4) and adds(2).
+ { ISD::MUL, MVT::v2i64, 9 },
+ { ISD::MUL, MVT::v4i64, 9 },
+ };
+ int Idx = CostTableLookup<MVT>(CustomLowered, array_lengthof(CustomLowered),
+ ISD, LT.second);
+ if (Idx != -1)
+ return LT.first * CustomLowered[Idx].Cost;
+
+ // Special lowering of v4i32 mul on sse2, sse3: Lower v4i32 mul as 2x shuffle,
+ // 2x pmuludq, 2x shuffle.
+ if (ISD == ISD::MUL && LT.second == MVT::v4i32 && ST->hasSSE2() &&
+ !ST->hasSSE41())
+ return 6;
+
+ // Fallback to the default implementation.
+ return TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty, Op1Info,
+ Op2Info);
+}
+
+unsigned X86TTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
+ Type *SubTp) const {
+ // We only estimate the cost of reverse shuffles.
+ if (Kind != SK_Reverse)
+ return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
+
+ std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
+ unsigned Cost = 1;
+ if (LT.second.getSizeInBits() > 128)
+ Cost = 3; // Extract + insert + copy.
+
+ // Multiple by the number of parts.
+ return Cost * LT.first;
+}
+
+unsigned X86TTI::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const {
+ int ISD = TLI->InstructionOpcodeToISD(Opcode);
+ assert(ISD && "Invalid opcode");
+
+ std::pair<unsigned, MVT> LTSrc = TLI->getTypeLegalizationCost(Src);
+ std::pair<unsigned, MVT> LTDest = TLI->getTypeLegalizationCost(Dst);
+
+ static const TypeConversionCostTblEntry<MVT> SSE2ConvTbl[] = {
+ // These are somewhat magic numbers justified by looking at the output of
+ // Intel's IACA, running some kernels and making sure when we take
+ // legalization into account the throughput will be overestimated.
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
+ { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 },
+ { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 },
+ { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 },
+ { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
+ // There are faster sequences for float conversions.
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 15 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 },
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 },
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 15 },
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 },
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 },
+ };
+
+ if (ST->hasSSE2() && !ST->hasAVX()) {
+ int Idx = ConvertCostTableLookup<MVT>(SSE2ConvTbl,
+ array_lengthof(SSE2ConvTbl),
+ ISD, LTDest.second, LTSrc.second);
+ if (Idx != -1)
+ return LTSrc.first * SSE2ConvTbl[Idx].Cost;
+ }
+
+ EVT SrcTy = TLI->getValueType(Src);
+ EVT DstTy = TLI->getValueType(Dst);
+
+ // The function getSimpleVT only handles simple value types.
+ if (!SrcTy.isSimple() || !DstTy.isSimple())
+ return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);
+
+ static const TypeConversionCostTblEntry<MVT> AVXConversionTbl[] = {
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
+ { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 },
+ { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 1 },
+
+ { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i1, 8 },
+ { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 8 },
+ { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 5 },
+ { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 },
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 3 },
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
+ { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i1, 3 },
+ { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i8, 3 },
+ { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i16, 3 },
+ { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 },
+
+ { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i1, 6 },
+ { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 5 },
+ { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 5 },
+ { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 9 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 7 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 2 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 6 },
+ { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i1, 7 },
+ { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i8, 2 },
+ { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i16, 2 },
+ { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 6 },
+
+ { ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f32, 1 },
+ { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 1 },
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 6 },
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 9 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 8 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 6 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 6 },
+ { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 3 },
+ };
+
+ if (ST->hasAVX()) {
+ int Idx = ConvertCostTableLookup<MVT>(AVXConversionTbl,
+ array_lengthof(AVXConversionTbl),
+ ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT());
+ if (Idx != -1)
+ return AVXConversionTbl[Idx].Cost;
+ }
+
+ return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);
+}
+
+unsigned X86TTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
+ Type *CondTy) const {
+ // Legalize the type.
+ std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(ValTy);
+
+ MVT MTy = LT.second;
+
+ int ISD = TLI->InstructionOpcodeToISD(Opcode);
+ assert(ISD && "Invalid opcode");
+
+ static const CostTblEntry<MVT> SSE42CostTbl[] = {
+ { ISD::SETCC, MVT::v2f64, 1 },
+ { ISD::SETCC, MVT::v4f32, 1 },
+ { ISD::SETCC, MVT::v2i64, 1 },
+ { ISD::SETCC, MVT::v4i32, 1 },
+ { ISD::SETCC, MVT::v8i16, 1 },
+ { ISD::SETCC, MVT::v16i8, 1 },
+ };
+
+ static const CostTblEntry<MVT> AVX1CostTbl[] = {
+ { ISD::SETCC, MVT::v4f64, 1 },
+ { ISD::SETCC, MVT::v8f32, 1 },
+ // AVX1 does not support 8-wide integer compare.
+ { ISD::SETCC, MVT::v4i64, 4 },
+ { ISD::SETCC, MVT::v8i32, 4 },
+ { ISD::SETCC, MVT::v16i16, 4 },
+ { ISD::SETCC, MVT::v32i8, 4 },
+ };
+
+ static const CostTblEntry<MVT> AVX2CostTbl[] = {
+ { ISD::SETCC, MVT::v4i64, 1 },
+ { ISD::SETCC, MVT::v8i32, 1 },
+ { ISD::SETCC, MVT::v16i16, 1 },
+ { ISD::SETCC, MVT::v32i8, 1 },
+ };
+
+ if (ST->hasAVX2()) {
+ int Idx = CostTableLookup<MVT>(AVX2CostTbl, array_lengthof(AVX2CostTbl), ISD, MTy);
+ if (Idx != -1)
+ return LT.first * AVX2CostTbl[Idx].Cost;
+ }
+
+ if (ST->hasAVX()) {
+ int Idx = CostTableLookup<MVT>(AVX1CostTbl, array_lengthof(AVX1CostTbl), ISD, MTy);
+ if (Idx != -1)
+ return LT.first * AVX1CostTbl[Idx].Cost;
+ }
+
+ if (ST->hasSSE42()) {
+ int Idx = CostTableLookup<MVT>(SSE42CostTbl, array_lengthof(SSE42CostTbl), ISD, MTy);
+ if (Idx != -1)
+ return LT.first * SSE42CostTbl[Idx].Cost;
+ }
+
+ return TargetTransformInfo::getCmpSelInstrCost(Opcode, ValTy, CondTy);
+}
+
+unsigned X86TTI::getVectorInstrCost(unsigned Opcode, Type *Val,
+ unsigned Index) const {
+ assert(Val->isVectorTy() && "This must be a vector type");
+
+ if (Index != -1U) {
+ // Legalize the type.
+ std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Val);
+
+ // This type is legalized to a scalar type.
+ if (!LT.second.isVector())
+ return 0;
+
+ // The type may be split. Normalize the index to the new type.
+ unsigned Width = LT.second.getVectorNumElements();
+ Index = Index % Width;
+
+ // Floating point scalars are already located in index #0.
+ if (Val->getScalarType()->isFloatingPointTy() && Index == 0)
+ return 0;
+ }
+
+ return TargetTransformInfo::getVectorInstrCost(Opcode, Val, Index);
+}
+
+unsigned X86TTI::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+ unsigned AddressSpace) const {
+ // Legalize the type.
+ std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src);
+ assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
+ "Invalid Opcode");
+
+ // Each load/store unit costs 1.
+ unsigned Cost = LT.first * 1;
+
+ // On Sandybridge 256bit load/stores are double pumped
+ // (but not on Haswell).
+ if (LT.second.getSizeInBits() > 128 && !ST->hasAVX2())
+ Cost*=2;
+
+ return Cost;
+}
diff --git a/contrib/llvm/lib/Target/X86/X86VZeroUpper.cpp b/contrib/llvm/lib/Target/X86/X86VZeroUpper.cpp
new file mode 100644
index 0000000..0f77948
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86VZeroUpper.cpp
@@ -0,0 +1,293 @@
+//===-- X86VZeroUpper.cpp - AVX vzeroupper instruction inserter -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the pass which inserts x86 AVX vzeroupper instructions
+// before calls to SSE encoded functions. This avoids transition latency
+// penalty when tranfering control between AVX encoded instructions and old
+// SSE encoding mode.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "x86-vzeroupper"
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+using namespace llvm;
+
+STATISTIC(NumVZU, "Number of vzeroupper instructions inserted");
+
+namespace {
+ struct VZeroUpperInserter : public MachineFunctionPass {
+ static char ID;
+ VZeroUpperInserter() : MachineFunctionPass(ID) {}
+
+ virtual bool runOnMachineFunction(MachineFunction &MF);
+
+ bool processBasicBlock(MachineFunction &MF, MachineBasicBlock &MBB);
+
+ virtual const char *getPassName() const { return "X86 vzeroupper inserter";}
+
+ private:
+ const TargetInstrInfo *TII; // Machine instruction info.
+
+ // Any YMM register live-in to this function?
+ bool FnHasLiveInYmm;
+
+ // BBState - Contains the state of each MBB: unknown, clean, dirty
+ SmallVector<uint8_t, 8> BBState;
+
+ // BBSolved - Keep track of all MBB which had been already analyzed
+ // and there is no further processing required.
+ BitVector BBSolved;
+
+ // Machine Basic Blocks are classified according this pass:
+ //
+ // ST_UNKNOWN - The MBB state is unknown, meaning from the entry state
+ // until the MBB exit there isn't a instruction using YMM to change
+ // the state to dirty, or one of the incoming predecessors is unknown
+ // and there's not a dirty predecessor between them.
+ //
+ // ST_CLEAN - No YMM usage in the end of the MBB. A MBB could have
+ // instructions using YMM and be marked ST_CLEAN, as long as the state
+ // is cleaned by a vzeroupper before any call.
+ //
+ // ST_DIRTY - Any MBB ending with a YMM usage not cleaned up by a
+ // vzeroupper instruction.
+ //
+ // ST_INIT - Placeholder for an empty state set
+ //
+ enum {
+ ST_UNKNOWN = 0,
+ ST_CLEAN = 1,
+ ST_DIRTY = 2,
+ ST_INIT = 3
+ };
+
+ // computeState - Given two states, compute the resulting state, in
+ // the following way
+ //
+ // 1) One dirty state yields another dirty state
+ // 2) All states must be clean for the result to be clean
+ // 3) If none above and one unknown, the result state is also unknown
+ //
+ static unsigned computeState(unsigned PrevState, unsigned CurState) {
+ if (PrevState == ST_INIT)
+ return CurState;
+
+ if (PrevState == ST_DIRTY || CurState == ST_DIRTY)
+ return ST_DIRTY;
+
+ if (PrevState == ST_CLEAN && CurState == ST_CLEAN)
+ return ST_CLEAN;
+
+ return ST_UNKNOWN;
+ }
+
+ };
+ char VZeroUpperInserter::ID = 0;
+}
+
+FunctionPass *llvm::createX86IssueVZeroUpperPass() {
+ return new VZeroUpperInserter();
+}
+
+static bool isYmmReg(unsigned Reg) {
+ if (Reg >= X86::YMM0 && Reg <= X86::YMM15)
+ return true;
+
+ return false;
+}
+
+static bool checkFnHasLiveInYmm(MachineRegisterInfo &MRI) {
+ for (MachineRegisterInfo::livein_iterator I = MRI.livein_begin(),
+ E = MRI.livein_end(); I != E; ++I)
+ if (isYmmReg(I->first))
+ return true;
+
+ return false;
+}
+
+static bool clobbersAllYmmRegs(const MachineOperand &MO) {
+ for (unsigned reg = X86::YMM0; reg < X86::YMM15; ++reg) {
+ if (!MO.clobbersPhysReg(reg))
+ return false;
+ }
+ return true;
+}
+
+static bool hasYmmReg(MachineInstr *MI) {
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+ if (MI->isCall() && MO.isRegMask() && !clobbersAllYmmRegs(MO))
+ return true;
+ if (!MO.isReg())
+ continue;
+ if (MO.isDebug())
+ continue;
+ if (isYmmReg(MO.getReg()))
+ return true;
+ }
+ return false;
+}
+
+/// runOnMachineFunction - Loop over all of the basic blocks, inserting
+/// vzero upper instructions before function calls.
+bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) {
+ TII = MF.getTarget().getInstrInfo();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ bool EverMadeChange = false;
+
+ // Fast check: if the function doesn't use any ymm registers, we don't need
+ // to insert any VZEROUPPER instructions. This is constant-time, so it is
+ // cheap in the common case of no ymm use.
+ bool YMMUsed = false;
+ const TargetRegisterClass *RC = &X86::VR256RegClass;
+ for (TargetRegisterClass::iterator i = RC->begin(), e = RC->end();
+ i != e; i++) {
+ if (!MRI.reg_nodbg_empty(*i)) {
+ YMMUsed = true;
+ break;
+ }
+ }
+ if (!YMMUsed)
+ return EverMadeChange;
+
+ // Pre-compute the existence of any live-in YMM registers to this function
+ FnHasLiveInYmm = checkFnHasLiveInYmm(MRI);
+
+ assert(BBState.empty());
+ BBState.resize(MF.getNumBlockIDs(), 0);
+ BBSolved.resize(MF.getNumBlockIDs(), 0);
+
+ // Each BB state depends on all predecessors, loop over until everything
+ // converges. (Once we converge, we can implicitly mark everything that is
+ // still ST_UNKNOWN as ST_CLEAN.)
+ while (1) {
+ bool MadeChange = false;
+
+ // Process all basic blocks.
+ for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I)
+ MadeChange |= processBasicBlock(MF, *I);
+
+ // If this iteration over the code changed anything, keep iterating.
+ if (!MadeChange) break;
+ EverMadeChange = true;
+ }
+
+ BBState.clear();
+ BBSolved.clear();
+ return EverMadeChange;
+}
+
+/// processBasicBlock - Loop over all of the instructions in the basic block,
+/// inserting vzero upper instructions before function calls.
+bool VZeroUpperInserter::processBasicBlock(MachineFunction &MF,
+ MachineBasicBlock &BB) {
+ bool Changed = false;
+ unsigned BBNum = BB.getNumber();
+
+ // Don't process already solved BBs
+ if (BBSolved[BBNum])
+ return false; // No changes
+
+ // Check the state of all predecessors
+ unsigned EntryState = ST_INIT;
+ for (MachineBasicBlock::const_pred_iterator PI = BB.pred_begin(),
+ PE = BB.pred_end(); PI != PE; ++PI) {
+ EntryState = computeState(EntryState, BBState[(*PI)->getNumber()]);
+ if (EntryState == ST_DIRTY)
+ break;
+ }
+
+
+ // The entry MBB for the function may set the initial state to dirty if
+ // the function receives any YMM incoming arguments
+ if (&BB == MF.begin()) {
+ EntryState = ST_CLEAN;
+ if (FnHasLiveInYmm)
+ EntryState = ST_DIRTY;
+ }
+
+ // The current state is initialized according to the predecessors
+ unsigned CurState = EntryState;
+ bool BBHasCall = false;
+
+ for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I) {
+ MachineInstr *MI = I;
+ DebugLoc dl = I->getDebugLoc();
+ bool isControlFlow = MI->isCall() || MI->isReturn();
+
+ // Shortcut: don't need to check regular instructions in dirty state.
+ if (!isControlFlow && CurState == ST_DIRTY)
+ continue;
+
+ if (hasYmmReg(MI)) {
+ // We found a ymm-using instruction; this could be an AVX instruction,
+ // or it could be control flow.
+ CurState = ST_DIRTY;
+ continue;
+ }
+
+ // Check for control-flow out of the current function (which might
+ // indirectly execute SSE instructions).
+ if (!isControlFlow)
+ continue;
+
+ BBHasCall = true;
+
+ // The VZEROUPPER instruction resets the upper 128 bits of all Intel AVX
+ // registers. This instruction has zero latency. In addition, the processor
+ // changes back to Clean state, after which execution of Intel SSE
+ // instructions or Intel AVX instructions has no transition penalty. Add
+ // the VZEROUPPER instruction before any function call/return that might
+ // execute SSE code.
+ // FIXME: In some cases, we may want to move the VZEROUPPER into a
+ // predecessor block.
+ if (CurState == ST_DIRTY) {
+ // Only insert the VZEROUPPER in case the entry state isn't unknown.
+ // When unknown, only compute the information within the block to have
+ // it available in the exit if possible, but don't change the block.
+ if (EntryState != ST_UNKNOWN) {
+ BuildMI(BB, I, dl, TII->get(X86::VZEROUPPER));
+ ++NumVZU;
+ }
+
+ // After the inserted VZEROUPPER the state becomes clean again, but
+ // other YMM may appear before other subsequent calls or even before
+ // the end of the BB.
+ CurState = ST_CLEAN;
+ }
+ }
+
+ DEBUG(dbgs() << "MBB #" << BBNum
+ << ", current state: " << CurState << '\n');
+
+ // A BB can only be considered solved when we both have done all the
+ // necessary transformations, and have computed the exit state. This happens
+ // in two cases:
+ // 1) We know the entry state: this immediately implies the exit state and
+ // all the necessary transformations.
+ // 2) There are no calls, and and a non-call instruction marks this block:
+ // no transformations are necessary, and we know the exit state.
+ if (EntryState != ST_UNKNOWN || (!BBHasCall && CurState != ST_UNKNOWN))
+ BBSolved[BBNum] = true;
+
+ if (CurState != BBState[BBNum])
+ Changed = true;
+
+ BBState[BBNum] = CurState;
+ return Changed;
+}
OpenPOWER on IntegriCloud